In [59]:
import pandas as pd
import numpy as np

In [60]:
def calculate_mahalanobis_similarity(vector1, vector2, inv_covariance_matrix):
    """Turn the Mahalanobis distance between two vectors into a similarity.

    The distance is ``sqrt(d^T . inv_covariance_matrix . d)`` with
    ``d = vector1 - vector2``; the similarity is ``exp(-distance)``, so
    identical vectors score 1.0 and the score decays towards 0 as the
    distance grows.

    Parameters
    ----------
    vector1, vector2 : array-like
        1-D numeric vectors of equal length.
    inv_covariance_matrix : array-like
        Square matrix used as the metric (the inverse covariance).

    Returns
    -------
    numpy.float64
        Similarity ``exp(-distance)``.
    """
    diff = np.asarray(vector1, dtype=float) - np.asarray(vector2, dtype=float)
    metric = np.asarray(inv_covariance_matrix, dtype=float)
    squared_distance = diff.dot(metric).dot(diff)
    # For a valid inverse covariance the quadratic form is non-negative, but
    # floating-point round-off can leave it a hair below zero. Without the
    # clamp np.sqrt would return NaN, and np.argsort places NaN last, i.e.
    # it would be ranked as the *most* similar row. NaN inputs still
    # propagate because np.maximum preserves NaN.
    distance = np.sqrt(np.maximum(squared_distance, 0.0))
    return np.exp(-distance)

def get_recommendations(user_profile, train_data, k=5):
    """Find the ``k`` training rows most similar to a user profile.

    Similarity is computed with ``calculate_mahalanobis_similarity`` using
    the inverse covariance of the columns of ``train_data``.

    Parameters
    ----------
    user_profile : pandas.DataFrame
        Frame whose first row is the profile to compare; its columns must
        line up positionally with those of ``train_data``.
    train_data : pandas.DataFrame
        Numeric matrix with one candidate neighbour per row.
    k : int, optional
        Number of neighbours to return (default 5). If ``k`` exceeds the
        number of rows, all rows are returned; ``k=0`` returns none.

    Returns
    -------
    tuple
        ``(neighbours, similarities)``: the selected rows of ``train_data``
        and their similarity scores, both ordered by ascending similarity
        (the most similar row is last).

    Raises
    ------
    ValueError
        If ``k`` is negative.
    """
    if k < 0:
        raise ValueError(f"k must be non-negative, got {k}")

    train_values = train_data.values
    # np.cov returns a 0-d array for a single feature; keep it 2-D so it can
    # be inverted like any other covariance matrix.
    covariance_matrix = np.atleast_2d(np.cov(train_values.T))
    try:
        inv_covariance_matrix = np.linalg.inv(covariance_matrix)
    except np.linalg.LinAlgError:
        # Exactly singular covariance (e.g. a constant or duplicated column):
        # fall back to the Moore-Penrose pseudo-inverse instead of crashing.
        inv_covariance_matrix = np.linalg.pinv(covariance_matrix)

    # The profile vector does not change between candidates, so extract it once.
    user_vector = user_profile.iloc[0].values
    similarities = np.array([
        calculate_mahalanobis_similarity(user_vector, candidate, inv_covariance_matrix)
        for candidate in train_values
    ])

    order = np.argsort(similarities)
    # Take the tail of the ascending order. An explicit start index is used
    # because order[-k:] with k == 0 would return every row.
    similar_indices = order[max(len(order) - k, 0):]
    return train_data.iloc[similar_indices], similarities[similar_indices]

def fill_test_values(test_data, train_data):
    """Predict every row of ``test_data`` from its nearest training neighbours.

    For each row, ``get_recommendations`` (with its default ``k``) selects the
    most similar rows of ``train_data``; the prediction is their
    similarity-weighted average, rounded with ``np.round``.

    Parameters
    ----------
    test_data : pandas.DataFrame
        Rows to predict. It is not modified; a copy is filled and returned.
    train_data : pandas.DataFrame
        Neighbour pool. Its columns must line up positionally with those of
        ``test_data``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``test_data`` whose rows are replaced by rounded predictions.
    """
    filled_predictions = test_data.copy()

    # NOTE: get_recommendations re-derives the covariance of train_data on
    # every call, so the cost of this loop grows with both frames.
    for idx in range(len(test_data)):
        test_user = test_data.iloc[[idx]]
        similar_users, similarities = get_recommendations(test_user, train_data)

        # Normalize similarities to weights. When every neighbour's similarity
        # has underflowed to 0 the normalisation would be 0/0 and poison the
        # row with NaN, so fall back to a plain (uniform) average instead.
        total_similarity = np.sum(similarities)
        if total_similarity == 0:
            weights = np.full(len(similarities), 1.0 / len(similarities))
        else:
            weights = similarities / total_similarity
        weighted_predictions = np.average(similar_users, weights=weights, axis=0)
        # np.round rounds halves to the nearest even value, so an average of
        # exactly 0.5 becomes 0.
        filled_predictions.iloc[idx] = np.round(weighted_predictions)

    return filled_predictions

In [61]:
from sklearn.metrics import precision_score, recall_score, f1_score, mean_absolute_error, mean_squared_error

def calculate_accuracy_metrics(evaluation, threshold=0.5):
    """Score the predictions as a binary classification problem.

    Both the ``'score'`` and ``'prediction'`` columns of ``evaluation`` are
    binarised with ``>= threshold`` and then compared.

    Returns
    -------
    tuple
        ``(precision, recall, f1)``.
    """
    actual_positive = evaluation['score'] >= threshold
    predicted_positive = evaluation['prediction'] >= threshold

    # Same label pair for every metric; order defines the returned tuple.
    return tuple(
        metric(actual_positive, predicted_positive)
        for metric in (precision_score, recall_score, f1_score)
    )

def calculate_error_metrics(evaluation):
    """Measure how far the ``'prediction'`` column is from the ``'score'`` column.

    Returns
    -------
    tuple
        ``(mae, rmse)``: mean absolute error and root mean squared error.
    """
    actual, estimated = evaluation['score'], evaluation['prediction']
    return (
        mean_absolute_error(actual, estimated),
        # RMSE is the square root of the mean squared error.
        np.sqrt(mean_squared_error(actual, estimated)),
    )

def calculate_ranking_metrics(evaluation, k=None):
    """Compute average precision and NDCG for the predictions.

    All rows of ``evaluation`` are ranked together by descending
    ``'prediction'`` (one global ranking, not one ranking per user) and the
    ``'score'`` column is used as each row's relevance.

    Parameters
    ----------
    evaluation : pandas.DataFrame
        Must provide the columns ``'score'`` and ``'prediction'``.
    k : int or None, optional
        Cut-off for NDCG. ``None`` (the default) scores the full ranking;
        an integer restricts DCG and the ideal DCG to the top ``k`` rows.
        The average precision always uses the full ranking.

    Returns
    -------
    tuple
        ``(map_score, ndcg_score)``. A metric whose denominator is zero
        (no relevance at all, or an empty table) is reported as 0.0 instead
        of NaN.

    Raises
    ------
    ValueError
        If ``k`` is negative.
    """
    if k is not None and k < 0:
        raise ValueError(f"k must be non-negative or None, got {k}")

    def discounted_gain(relevance):
        # Exponential gain with a log2 position discount (positions are 1-based).
        positions = np.arange(1, len(relevance) + 1)
        return np.sum((2**relevance - 1) / np.log2(positions + 1))

    def average_precision(y_true, y_pred):
        sorted_indices = np.argsort(y_pred)[::-1]
        y_true_sorted = y_true[sorted_indices]
        total_relevance = np.sum(y_true_sorted)
        if total_relevance == 0:
            # Nothing relevant to retrieve: avoid 0/0.
            return 0.0
        cumsum = np.cumsum(y_true_sorted)
        precision_at_k = cumsum / (np.arange(len(y_true_sorted)) + 1)
        return np.sum(precision_at_k * y_true_sorted) / total_relevance

    def ndcg(y_true, y_pred, k=None):
        # Slicing with None keeps the whole ranking.
        y_true_sorted = y_true[np.argsort(y_pred)[::-1]][:k]
        y_true_ideal_sorted = y_true[np.argsort(y_true)[::-1]][:k]
        idcg = discounted_gain(y_true_ideal_sorted)
        if idcg == 0:
            # Ideal ranking carries no gain: avoid 0/0.
            return 0.0
        return discounted_gain(y_true_sorted) / idcg

    y_true = np.asarray(evaluation['score'], dtype=float)
    y_pred = np.asarray(evaluation['prediction'], dtype=float)

    map_score = average_precision(y_true, y_pred)
    ndcg_score = ndcg(y_true, y_pred, k)

    return map_score, ndcg_score

## Load

In [62]:
# Load the training interactions (path is relative to the notebook's working
# directory) and preview the first rows.
train = pd.read_csv("../../data/train/user_item.csv")
train.head()

Unnamed: 0,profile_id,offer,score,customer_type
0,406b1422299944039e05c12a48dba84a,discount-web-email-mobile,1.0,1
1,3f62dc31f11b453a9909809e20852450,bogo-email-mobile-social,1.0,2
2,665b6493546141518af2f3a0bf316800,discount-web-email-mobile,1.0,1
3,35c863d477084f7fb46e4b309cf3ea5d,discount-web-email-mobile,1.0,1
4,0a947767586e4587b06b8ca3efc3c8e7,bogo-web-email-mobile-social,0.0,1


In [63]:
# Load the held-out interactions (path is relative to the notebook's working
# directory) and preview the first rows.
test = pd.read_csv("../../data/test/user_item.csv")
test.head()

Unnamed: 0,profile_id,offer,score,customer_type
0,fcbcd28beee1457f8b3672658ea0a1e3,informational-email-mobile-social,1.0,4
1,1698291a4a474d84b7d7fc2e24ab684a,informational-email-mobile-social,1.0,1
2,639314daa82a46558c17020fd84d03f6,bogo-email-mobile-social,1.0,1
3,f626cb1552414edab2afdbf0c32c8476,bogo-web-email-mobile,1.0,3
4,61c9306f27f9423d9630b95cf66c266d,discount-web-email-mobile-social,1.0,0


## Transform

In [64]:
# Pivot the training interactions into a profile_id x offer matrix: keep the
# highest score per (profile_id, offer) pair and use 0 where a pair is absent.
train_user_item = train.loc[:, ['profile_id', 'offer', 'score']]
train_data_df = (
    train_user_item
    .groupby(['profile_id', 'offer'])['score']
    .max()
    .unstack()
    .fillna(0)
)

In [65]:
# Pivot the test interactions into a profile_id x offer matrix: keep the
# highest score per (profile_id, offer) pair and use 0 where a pair is absent.
test_user_item = test.loc[:, ['profile_id', 'offer', 'score']]
test_data_df = (
    test_user_item
    .groupby(['profile_id', 'offer'])['score']
    .max()
    .unstack()
    .fillna(0)
)

## Predict

In [66]:
# Keep only the users that appear in both matrices: test_users holds the
# profile_ids shared by the train and test indexes, and filtered_train is the
# training matrix restricted to those users.
test_users = train_data_df.index.intersection(test_data_df.index)
filtered_train = train_data_df.loc[test_users]

In [67]:
# Make predictions: each shared user's offer vector is rebuilt from the
# similarity-weighted average of its nearest rows in the full training matrix.
# NOTE(review): every row of filtered_train is itself a row of train_data_df,
# so each user is at distance 0 from their own training row and presumably
# ends up among their own neighbours — confirm this is intended.
predicted = fill_test_values(filtered_train, train_data_df)
predicted.head()

offer,bogo-email-mobile-social,bogo-web-email-mobile,bogo-web-email-mobile-social,discount-web-email,discount-web-email-mobile,discount-web-email-mobile-social,informational-email-mobile-social,informational-web-email-mobile
profile_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0009655768c64bdeb2e877511632db8f,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0
0011e0d4e6b944f998e987f904e8c1e5,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0
004c5799adbf42868b9cff0396190900,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
005500a7188546ff8a767329a2f7c76a,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0056df74b63b4298809f0b375a304cf4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0


In [68]:
# Reshape the wide profile_id x offer prediction matrix into long format:
# one row per (profile_id, offer) pair with its value in 'prediction'.
predicted = pd.melt(
    predicted.reset_index(),
    id_vars=['profile_id'],
    var_name='offer',
    value_name='prediction',
)

predicted.head()

Unnamed: 0,profile_id,offer,prediction
0,0009655768c64bdeb2e877511632db8f,bogo-email-mobile-social,0.0
1,0011e0d4e6b944f998e987f904e8c1e5,bogo-email-mobile-social,0.0
2,004c5799adbf42868b9cff0396190900,bogo-email-mobile-social,0.0
3,005500a7188546ff8a767329a2f7c76a,bogo-email-mobile-social,0.0
4,0056df74b63b4298809f0b375a304cf4,bogo-email-mobile-social,0.0


## Evaluate

In [69]:
# Pair every prediction with the observed test score. The inner join keeps
# only (profile_id, offer) pairs present in both tables; customer_type is not
# needed for scoring and is dropped.
evaluation = (
    predicted
    .merge(test, how="inner", on=["profile_id", "offer"])
    .drop(columns=["customer_type"])
)

evaluation.head()

Unnamed: 0,profile_id,offer,prediction,score
0,005500a7188546ff8a767329a2f7c76a,bogo-email-mobile-social,0.0,1.0
1,00ae03011f9f49b8a4b3e6d416678b0b,bogo-email-mobile-social,0.0,0.5
2,00b3400e4ff64ee68ce9ada1d0c222f0,bogo-email-mobile-social,0.0,0.0
3,00c5a385c71a4d3db5e9b4e31e430943,bogo-email-mobile-social,0.0,1.0
4,00cf471ed1aa42a8bdde5561d67da2b1,bogo-email-mobile-social,0.0,1.0


In [70]:
# Persist the prediction/score table; index=False keeps the row index out of
# the file.
evaluation.to_csv("../../data/predictions/cf_ms.csv", index=False)

In [71]:
# Score the table exactly as it was written to disk.
evaluation = pd.read_csv('../../data/predictions/cf_ms.csv')

# Accuracy metrics (scores and predictions are thresholded first)
precision, recall, f1 = calculate_accuracy_metrics(evaluation)
print(
    f'Precision: {precision}',
    f'Recall: {recall}',
    f'F1-Score: {f1}',
    sep='\n',
)

# Error metrics on the raw values
mae, rmse = calculate_error_metrics(evaluation)
print(
    f'Mean Absolute Error (MAE): {mae}',
    f'Root Mean Squared Error (RMSE): {rmse}',
    sep='\n',
)

# Ranking metrics
map_score, ndcg_score = calculate_ranking_metrics(evaluation)
print(
    f'Mean Average Precision (MAP): {map_score}',
    f'Normalized Discounted Cumulative Gain (NDCG): {ndcg_score}',
    sep='\n',
)

Precision: 0.9665629860031104
Recall: 0.13082833385959372
F1-Score: 0.23046259386298323
Mean Absolute Error (MAE): 0.6623985560350677
Root Mean Squared Error (RMSE): 0.808610202792579
Mean Average Precision (MAP): 0.8108887343201155
Normalized Discounted Cumulative Gain (NDCG): 0.9755457340548795
