In [18]:
import numpy as np
import pandas as pd

In [19]:
def FunkSVD(ratings_mat, latent_features=12, learning_rate=0.0001, iters=100):
    '''
    This function performs matrix factorization using a basic form of FunkSVD with no regularization

    INPUT:
    ratings_mat - (numpy array) a matrix with users as rows, offers as columns, and ratings as values
    latent_features - (int) the number of latent features used
    learning_rate - (float) the learning rate
    iters - (int) the number of iterations

    OUTPUT:
    user_mat - (numpy array) a user by latent feature matrix
    offer_mat - (numpy array) a latent feature by offer matrix
    '''

    # Set up useful values to be used through the rest of the function
    n_users = ratings_mat.shape[0]
    n_offers = ratings_mat.shape[1]
    num_ratings = np.count_nonzero(~np.isnan(ratings_mat))

    # initialize the user and offer matrices with random values
    user_mat = np.random.rand(n_users, latent_features)
    offer_mat = np.random.rand(latent_features, n_offers)

    # initialize sse at 0 for first iteration
    sse_accum = 0

    # keep track of iteration and MSE
    print("Optimizaiton Statistics")
    print("Iterations | Mean Squared Error ")

    # for each iteration
    for iteration in range(iters):

        # update our sse
        old_sse = sse_accum
        sse_accum = 0

        # For each user-offer pair
        for i in range(n_users):
            for j in range(n_offers):

                # if the rating exists
                if ratings_mat[i, j] > 0:

                    # compute the error as the actual minus the dot product of the user and offer latent features
                    diff = ratings_mat[i, j] - np.dot(user_mat[i, :], offer_mat[:, j])

                    # Keep track of the sum of squared errors for the matrix
                    sse_accum += diff**2

                    # update the values in each matrix in the direction of the gradient
                    for k in range(latent_features):
                        user_mat[i, k] += learning_rate * (2*diff*offer_mat[k, j])
                        offer_mat[k, j] += learning_rate * (2*diff*user_mat[i, k])

        # print results
        print("%d \t\t %f" % (iteration+1, sse_accum / num_ratings))

    return user_mat, offer_mat

In [20]:
def predict_scores(user_matrix, offer_matrix, train_data_df):
    '''
    INPUT:
    user_matrix - user by latent factor matrix
    offer_matrix - latent factor by offer matrix
    train_data_df - user-item matrix with users as rows and offers as columns

    OUTPUT:
    pred_matrix - the predicted rating matrix for all user_id-offer_id combinations
    '''
    # Create series of users and offers in the right order
    user_ids_series = np.array(train_data_df.index)
    offer_ids_series = np.array(train_data_df.columns)

    # Initialize prediction matrix
    pred_matrix = np.zeros((len(user_ids_series), len(offer_ids_series)))

    # Iterate over all user and offer combinations
    for i, user_id in enumerate(user_ids_series):
        for j, offer_id in enumerate(offer_ids_series):
            # Take dot product of that row and column in U and V to make prediction
            pred_matrix[i, j] = np.dot(user_matrix[i, :], offer_matrix[:, j])

    return pred_matrix

In [21]:
from sklearn.metrics import precision_score, recall_score, f1_score, mean_absolute_error, mean_squared_error

def calculate_accuracy_metrics(evaluation, threshold=0.5):
    y_true = evaluation['score'] >= threshold
    y_pred = evaluation['prediction'] >= threshold
    
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    
    return precision, recall, f1

def calculate_error_metrics(evaluation):
    y_true = evaluation['score']
    y_pred = evaluation['prediction']
    
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    
    return mae, rmse

def calculate_ranking_metrics(evaluation):
    def average_precision(y_true, y_pred):
        sorted_indices = np.argsort(y_pred)[::-1]
        y_true_sorted = np.array(y_true)[sorted_indices]
        cumsum = np.cumsum(y_true_sorted)
        precision_at_k = cumsum / (np.arange(len(y_true_sorted)) + 1)
        return np.sum(precision_at_k * y_true_sorted) / np.sum(y_true_sorted)
    
    def ndcg(y_true, y_pred, k=10):
        sorted_indices = np.argsort(y_pred)[::-1]
        y_true_sorted = np.array(y_true)[sorted_indices]
        dcg = np.sum((2**y_true_sorted - 1) / np.log2(np.arange(1, len(y_true_sorted) + 1) + 1))
        ideal_sorted_indices = np.argsort(y_true)[::-1]
        y_true_ideal_sorted = np.array(y_true)[ideal_sorted_indices]
        idcg = np.sum((2**y_true_ideal_sorted - 1) / np.log2(np.arange(1, len(y_true_ideal_sorted) + 1) + 1))
        return dcg / idcg
    
    y_true = evaluation['score']
    y_pred = evaluation['prediction']
    
    map_score = average_precision(y_true, y_pred)
    ndcg_score = ndcg(y_true, y_pred)
    
    return map_score, ndcg_score

## Load

In [22]:
train = pd.read_csv('../../data/train/user_item.csv')
train.head()

Unnamed: 0,profile_id,offer,score,customer_type
0,406b1422299944039e05c12a48dba84a,discount-web-email-mobile,1.0,1
1,3f62dc31f11b453a9909809e20852450,bogo-email-mobile-social,1.0,2
2,665b6493546141518af2f3a0bf316800,discount-web-email-mobile,1.0,1
3,35c863d477084f7fb46e4b309cf3ea5d,discount-web-email-mobile,1.0,1
4,0a947767586e4587b06b8ca3efc3c8e7,bogo-web-email-mobile-social,0.0,1


In [23]:
test = pd.read_csv('../../data/test/user_item.csv')
test.head()

Unnamed: 0,profile_id,offer,score,customer_type
0,fcbcd28beee1457f8b3672658ea0a1e3,informational-email-mobile-social,1.0,4
1,1698291a4a474d84b7d7fc2e24ab684a,informational-email-mobile-social,1.0,1
2,639314daa82a46558c17020fd84d03f6,bogo-email-mobile-social,1.0,1
3,f626cb1552414edab2afdbf0c32c8476,bogo-web-email-mobile,1.0,3
4,61c9306f27f9423d9630b95cf66c266d,discount-web-email-mobile-social,1.0,0


## Transform

In [24]:
# Create user-by-item matrix - nothing to do here
train_user_item = train[['profile_id', 'offer', 'score']]
train_data_df = train_user_item.groupby(['profile_id', 'offer'])['score'].max().unstack()
train_data_np = np.array(train_data_df)

## Train

In [25]:
user_matrix, offer_matrix = FunkSVD(train_data_np,
                              latent_features=4,
                              learning_rate=0.005,
                              iters=300)

Optimizaiton Statistics
Iterations | Mean Squared Error 
1 		 0.067173
2 		 0.061368
3 		 0.059000
4 		 0.056748
5 		 0.054596
6 		 0.052540
7 		 0.050575
8 		 0.048697
9 		 0.046903
10 		 0.045188
11 		 0.043550
12 		 0.041984
13 		 0.040488
14 		 0.039058
15 		 0.037692
16 		 0.036386
17 		 0.035137
18 		 0.033944
19 		 0.032802
20 		 0.031711
21 		 0.030668
22 		 0.029670
23 		 0.028716
24 		 0.027803
25 		 0.026929
26 		 0.026093
27 		 0.025293
28 		 0.024527
29 		 0.023794
30 		 0.023092
31 		 0.022420
32 		 0.021776
33 		 0.021159
34 		 0.020568
35 		 0.020001
36 		 0.019458
37 		 0.018938
38 		 0.018439
39 		 0.017960
40 		 0.017501
41 		 0.017060
42 		 0.016638
43 		 0.016232
44 		 0.015842
45 		 0.015468
46 		 0.015109
47 		 0.014764
48 		 0.014433
49 		 0.014114
50 		 0.013808
51 		 0.013514
52 		 0.013231
53 		 0.012959
54 		 0.012697
55 		 0.012446
56 		 0.012203
57 		 0.011970
58 		 0.011746
59 		 0.011530
60 		 0.011322
61 		 0.011121
62 		 0.010928
63 		 0.010742
64 		 0

## Predict

In [26]:
# Call the predict_ratings function
predicted_ratings = predict_scores(user_matrix, offer_matrix, train_data_df)

# Print the predicted ratings matrix
predicted = pd.DataFrame(predicted_ratings, columns = train_data_df.columns, index = train_data_df.index)
predicted.head()

offer,bogo-email-mobile-social,bogo-web-email-mobile,bogo-web-email-mobile-social,discount-web-email,discount-web-email-mobile,discount-web-email-mobile-social,informational-email-mobile-social,informational-web-email-mobile
profile_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0009655768c64bdeb2e877511632db8f,0.999927,0.941953,1.020562,0.95962,0.965251,1.025648,1.002572,0.975281
00116118485d4dfda04fdbaba9a87b5c,0.955921,0.919643,0.973735,0.92048,0.915144,0.974836,0.940907,0.936044
0011e0d4e6b944f998e987f904e8c1e5,1.026797,0.989759,1.045782,0.98413,0.969539,1.047878,1.005039,1.003488
0020c2b971eb4e9188eac86d93036a77,0.818678,0.785952,0.837312,0.780754,0.786537,0.84106,0.814262,0.802253
0020ccbbb6d84e358d3414a3ff76cffd,1.004034,0.944244,1.022387,0.967511,0.964118,1.026256,1.00143,0.977516


In [27]:
predicted = predicted.reset_index().melt(
    id_vars=['profile_id'], 
    var_name='offer', 
    value_name='score'
).rename(columns={'score': 'prediction'})

predicted.head()

Unnamed: 0,profile_id,offer,prediction
0,0009655768c64bdeb2e877511632db8f,bogo-email-mobile-social,0.999927
1,00116118485d4dfda04fdbaba9a87b5c,bogo-email-mobile-social,0.955921
2,0011e0d4e6b944f998e987f904e8c1e5,bogo-email-mobile-social,1.026797
3,0020c2b971eb4e9188eac86d93036a77,bogo-email-mobile-social,0.818678
4,0020ccbbb6d84e358d3414a3ff76cffd,bogo-email-mobile-social,1.004034


## Evaluate

In [28]:
evaluation = pd.merge(
    predicted, 
    test, 
    on=["profile_id", "offer"], 
    how="inner"
).drop(columns=["customer_type"])

evaluation.head()

Unnamed: 0,profile_id,offer,prediction,score
0,005500a7188546ff8a767329a2f7c76a,bogo-email-mobile-social,0.80746,1.0
1,00ae03011f9f49b8a4b3e6d416678b0b,bogo-email-mobile-social,1.030298,0.5
2,00b3400e4ff64ee68ce9ada1d0c222f0,bogo-email-mobile-social,0.914968,0.0
3,00c5a385c71a4d3db5e9b4e31e430943,bogo-email-mobile-social,0.988472,1.0
4,00cf471ed1aa42a8bdde5561d67da2b1,bogo-email-mobile-social,0.957907,1.0


In [29]:
evaluation.to_csv("../../data/predicted/funkSVD.csv", index=False)

In [30]:
evaluation = pd.read_csv('../../data/predicted/funkSVD.csv')

# Calculate accuracy metrics
precision, recall, f1 = calculate_accuracy_metrics(evaluation)
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1-Score: {f1}')

# Calculate error metrics
mae, rmse = calculate_error_metrics(evaluation)
print(f'Mean Absolute Error (MAE): {mae}')
print(f'Root Mean Squared Error (RMSE): {rmse}')

# Calculate ranking metrics
map_score, ndcg_score = calculate_ranking_metrics(evaluation)
print(f'Mean Average Precision (MAP): {map_score}')
print(f'Normalized Discounted Cumulative Gain (NDCG): {ndcg_score}')

Precision: 0.7746802230239422
Recall: 0.9944216398273866
F1-Score: 0.8709038115868554
Mean Absolute Error (MAE): 0.26560995393393005
Root Mean Squared Error (RMSE): 0.46353163478869763
Mean Average Precision (MAP): 0.8843025214690233
Normalized Discounted Cumulative Gain (NDCG): 0.9844160683366693
