In [3]:
import pandas as pd
import numpy as np
import hashlib

def load_data(file_path):
    """Read a CSV file into a pandas DataFrame.

    The first line of the file is used as the column header row.

    Args:
        file_path (str): Location of the CSV file to read.

    Returns:
        pandas.DataFrame: The parsed contents of the file.
    """
    return pd.read_csv(file_path, header=0)

def create_function_id(df):
    """Attach a composite 'function_id' column to the DataFrame.

    The identifier is the string form of HashOwner, HashApp, HashFunction and
    Trigger, joined in that order with underscores. The column is written
    onto ``df`` itself, and that same object is returned.

    Args:
        df (pandas.DataFrame): Frame containing the four source columns.

    Returns:
        pandas.DataFrame: ``df`` with the 'function_id' column added.
    """
    id_parts = [
        df[column].astype(str)
        for column in ('HashOwner', 'HashApp', 'HashFunction', 'Trigger')
    ]

    # Chain the parts left to right, inserting an underscore between each.
    combined = id_parts[0]
    for part in id_parts[1:]:
        combined = combined + '_' + part

    df['function_id'] = combined
    return df

def exclude_uncommon_rows_separate(df1, df2, hash_cols):
    """
    Keeps only the rows of df1 and df2 whose key columns occur in both frames.

    A row's key is the tuple of the string forms of its values in
    ``hash_cols``. Comparing tuples (rather than one concatenated string)
    keeps column boundaries intact, so ('ab', 'c') and ('a', 'bc') are
    different keys. Neither input DataFrame is modified.

    Args:
        df1: The first DataFrame.
        df2: The second DataFrame.
        hash_cols: A list of column names that together identify a row.
            Every name must be a column of both DataFrames.

    Returns:
        A tuple containing two new DataFrames, each with a fresh 0..n-1 index:
            - The rows of df1 whose key also appears in df2.
            - The rows of df2 whose key also appears in df1.
        Original row order and columns are preserved.

    Raises:
        KeyError: If a name in ``hash_cols`` is missing from either DataFrame.
    """
    key_cols = list(hash_cols)

    def row_keys(df):
        # One tuple of stringified key values per row, in row order.
        if not key_cols:
            # With no key columns every row shares the same (empty) key.
            return [()] * len(df)
        columns = [[str(value) for value in df[col].tolist()] for col in key_cols]
        return list(zip(*columns))

    keys1 = row_keys(df1)
    keys2 = row_keys(df2)
    common_keys = set(keys1) & set(keys2)

    def keep_common(df, keys):
        # A NumPy boolean mask selects rows positionally, which also works
        # for zero-row frames and for frames with duplicate index labels.
        mask = np.fromiter(
            (key in common_keys for key in keys), dtype=bool, count=len(keys)
        )
        return df.loc[mask].reset_index(drop=True)

    return keep_common(df1, keys1), keep_common(df2, keys2)

def dummy_prediction(df, function_ids, rate=10):
    """Performs a dummy prediction of a static invocation rate for specified function IDs.

    Every row of ``df`` whose 'function_id' is in ``function_ids`` receives
    the same constant prediction.

    Args:
        df (pandas.DataFrame): The input DataFrame; must have a 'function_id' column.
        function_ids (list): A list of function IDs to apply the predictions to.
        rate: The constant value to predict for each selected row. Defaults
            to 10 (invocations per minute), the original fixed value.

    Returns:
        pandas.Series: One prediction per selected row, indexed by that row's
        label in ``df``.
    """
    selected = df['function_id'].isin(function_ids)
    # Only the index of the matching rows is needed, not a filtered copy.
    selected_index = df.index[selected.to_numpy()]
    return pd.Series([rate] * len(selected_index), index=selected_index)

def evaluate_predictions(true_values, predicted_values):
    """Evaluates the predicted output and calculates MAE, RMSE, and MAPE.

    MAPE is undefined where the true value is zero. Those entries, and any
    other entry whose percentage error is not finite, are left out of the
    MAPE average instead of turning the whole metric into inf/NaN. MAE and
    RMSE are computed over all entries.

    Args:
        true_values (pandas.Series): A series of true values.
        predicted_values (pandas.Series): A series of predicted values.

    Returns:
        tuple: A tuple containing MAE, RMSE, and MAPE (in percent). MAPE is
        NaN when no entry has a finite percentage error.
    """
    # Compute the error vector once; for Series this aligns on the index.
    errors = true_values - predicted_values

    # Calculate MAE
    mae = np.mean(np.abs(errors))

    # Calculate RMSE
    rmse = np.sqrt(np.mean(errors ** 2))

    # Calculate MAPE, silencing NumPy's divide-by-zero warnings because the
    # resulting inf/NaN entries are filtered out just below.
    with np.errstate(divide='ignore', invalid='ignore'):
        pct_errors = np.abs(errors / true_values)
    usable = np.isfinite(pct_errors)
    if np.any(usable):
        mape = np.mean(pct_errors[usable]) * 100
    else:
        mape = np.nan

    return mae, rmse, mape

# Read the two trace files and give every row its composite function_id.
df_train = create_function_id(load_data('invocations_per_function_md.anon.d01.csv'))
df_test = create_function_id(load_data('invocations_per_function_md.anon.d08.csv'))

# Keep only the rows whose identifying columns occur in both frames.
hash_cols = ['HashOwner', 'HashApp', 'HashFunction', 'Trigger']
df_train, df_test = exclude_uncommon_rows_separate(df_train, df_test, hash_cols)

# Every function still present in the test frame gets the dummy prediction.
function_ids = df_test['function_id'].unique()
predictions = dummy_prediction(df_test, function_ids)

# Work on a copy so the numeric clean-up below does not touch df_test.
filtered_df = df_test.copy()

# Columns named '1' .. '1440' (presumably one per minute of the day -- TODO
# confirm) are coerced to numbers; anything unparseable counts as 0.
time_columns = list(map(str, range(1, 1441)))
filtered_df[time_columns] = (
    filtered_df[time_columns]
    .apply(pd.to_numeric, errors='coerce')
    .fillna(0)
)

# NOTE(review): true_values is each row's total over all 1440 columns, while
# dummy_prediction is described as a per-minute rate. Confirm that comparing
# the two directly is intended before relying on the metrics below.
true_values = filtered_df[time_columns].sum(axis=1)

# Put the predictions in the same row order as the true values.
predictions = predictions.reindex(filtered_df.index)

mae, rmse, mape = evaluate_predictions(true_values, predictions)

print("MAE:", mae)
print("RMSE:", rmse)
print("MAPE:", mape)

MAE: 21047.483047647813
RMSE: 825483.1147243218
MAPE: 249.50858428779762
