In [1]:
import jax 
import jax.numpy as jnp
import numpy as np 
import pandas as pd
from cluster import BasicClusteringRecommender, KMeansRecommender

In [2]:
# Read the preprocessed train/test splits; the two frames are bound in the same order as the paths.
train_df, test_df = map(
    pd.read_csv,
    (
        '../preprocessed_dataset/train_dataset.csv',
        '../preprocessed_dataset/test_dataset.csv',
    ),
)

In [3]:
# Preview the first five rows of the training split to check the loaded columns.
train_df.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,userID,review_count,songID,rating,user_id_idx,item_id_idx
0,0,966383,5226,10,82304,5,1115,12236
1,1,753438,23119,10,61708,5,4880,9184
2,2,1156841,7835,10,122957,5,1667,18346
3,3,503221,16895,10,110494,5,3568,16467
4,4,415592,15695,10,57161,5,3321,8548


In [4]:
# Summary statistics (count, mean, std, quartiles, min/max) for the training split.
train_df.describe()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,userID,review_count,songID,rating,user_id_idx,item_id_idx
count,40000.0,40000.0,40000.0,40000.0,40000.0,40000.0,40000.0,40000.0
mean,19999.5,665742.2,11804.231675,10.0,68053.0381,4.549575,2497.230375,10150.07885
std,11547.14972,350443.2,6830.278556,0.0,39515.208908,0.696171,1442.006203,5875.358351
min,0.0,0.0,0.0,10.0,2.0,3.0,0.0,0.0
25%,9999.75,380163.5,5835.0,10.0,33848.75,4.0,1251.0,5088.75
50%,19999.5,734468.5,11837.0,10.0,67838.5,5.0,2495.0,10079.5
75%,29999.25,924752.2,17732.0,10.0,102227.0,5.0,3746.0,15246.0
max,39999.0,1315059.0,23680.0,10.0,136729.0,5.0,4999.0,20360.0


In [5]:
# Summary statistics (count, mean, std, quartiles, min/max) for the test split.
test_df.describe()

Unnamed: 0.1,Unnamed: 0,userID,review_count,songID,rating,user_id_idx,item_id_idx
count,2466.0,2466.0,2466.0,2466.0,2466.0,2466.0,2466.0
mean,3307.095702,11858.433496,10.0,67045.714517,4.5588,2508.562855,9999.652068
std,1912.952314,6858.378833,0.0,39385.201394,0.685987,1447.535411,5855.695673
min,0.0,90.0,10.0,60.0,3.0,17.0,10.0
25%,1608.0,6005.0,10.0,33557.75,4.0,1284.5,5045.0
50%,3355.5,11871.5,10.0,66685.0,5.0,2504.0,9901.0
75%,4928.25,17832.75,10.0,99568.75,5.0,3764.25,14847.25
max,6603.0,23668.0,10.0,136507.0,5.0,4997.0,20337.0


In [6]:
# Keep only the rating and the two index columns in both splits.
train_df, test_df = (
    frame[['rating', 'user_id_idx', 'item_id_idx']]
    for frame in (train_df, test_df)
)

In [7]:
# Rename the index columns to `userID` / `songID`, the column names eval_model reads.
# Reassigning the result (instead of inplace=True) avoids mutating a frame that was
# produced by a column selection, which can trigger pandas' SettingWithCopyWarning.
# NOTE(review): presumably the recommenders' fit() expects these names too — confirm in cluster.py.
train_df = train_df.rename(columns={'user_id_idx': 'userID', 'item_id_idx': 'songID'})
test_df = test_df.rename(columns={'user_id_idx': 'userID', 'item_id_idx': 'songID'})

In [8]:
# (rows, columns) of the test split after column selection and renaming.
test_df.shape

(2466, 3)

In [9]:
# Number of distinct users that appear in the test split.
test_df.userID.nunique()

736

In [10]:
def eval_model(model, test_df, num_recommendation=3):
    """Score a recommender's top-N recommendations against held-out interactions.

    For every distinct user in ``test_df`` the model is asked for
    ``num_recommendation`` songs. A test row counts as a hit when it belongs
    to that user and its ``songID`` is among the user's recommendations.

    Args:
        model: Object exposing ``predict(user, num_recommendation)`` whose
            result can be passed to ``np.isin`` as the set of recommended
            song IDs.
        test_df: DataFrame with ``userID`` and ``songID`` columns, one row per
            held-out interaction.
        num_recommendation: Number of songs requested per user (default 3).

    Returns:
        Tuple ``(precision, recall, f1)``. Precision is hits divided by
        ``num_recommendation * number_of_users``, recall is hits divided by
        ``len(test_df)``, and f1 is their harmonic mean (0 when there are no
        hits). All three are 0 when ``test_df`` is empty or
        ``num_recommendation`` is not positive.
    """
    num_test_songs = len(test_df)
    if num_test_songs == 0 or num_recommendation <= 0:
        # Nothing to score; avoid dividing by zero below.
        return 0.0, 0.0, 0.0

    # Pull the columns out once instead of re-reading them for every user.
    user_ids = test_df.userID.to_numpy()
    song_ids = test_df.songID.to_numpy()
    users = test_df.userID.unique()
    total_num_recommendations = num_recommendation * len(users)

    tp = 0
    for user in users:
        recommendations = model.predict(user, num_recommendation)
        # The comparison must be parenthesised: `&` binds tighter than `==`,
        # so `a == user & b` would evaluate as `a == (user & b)`.
        hits = (user_ids == user) & np.isin(song_ids, recommendations)
        tp += int(hits.sum())

    if tp == 0:
        return 0.0, 0.0, 0.0

    precision = tp / total_num_recommendations
    recall = tp / num_test_songs
    f1 = 2 / (1 / recall + 1 / precision)

    return precision, recall, f1

**Evaluate Basic Clustering Recommender**

In [21]:
# Fit the basic clustering recommender on the training split, then score it on the test split.
base_recommender = BasicClusteringRecommender(n_neighbor=50)

base_recommender.fit(train_df)

eval_model(base_recommender, test_df) # this may take quite a long time
# Metrics recorded from a previous run (eval_model returns (precision, recall, f1)):
# precision = 0.0005
# recall = 0.0004
# f1 = 0.0004
# NOTE(review): the same three figures are recorded for every recommender in this notebook — TODO re-run and confirm.

**Evaluate K-Means Recommender with Minibatch**

In [14]:
# Fit the K-Means recommender with use_minibatch=True on the training split, then score it on the test split.
minibatch_kmeans_recommender = KMeansRecommender(n_neighbor=50, use_minibatch=True)

minibatch_kmeans_recommender.fit(train_df)

eval_model(minibatch_kmeans_recommender, test_df) # this may take quite a long time
# Metrics recorded from a previous run (eval_model returns (precision, recall, f1)):
# precision = 0.0005
# recall = 0.0004
# f1 = 0.0004
# NOTE(review): the same three figures are recorded for every recommender in this notebook — TODO re-run and confirm.

**Evaluate K-Means Recommender without Minibatch**

In [17]:
# Fit the K-Means recommender with use_minibatch=False on the training split, then score it on the test split.
regular_kmeans_recommender = KMeansRecommender(n_neighbor=50, use_minibatch=False)

regular_kmeans_recommender.fit(train_df)

eval_model(regular_kmeans_recommender, test_df) # this may take quite a long time
# Metrics recorded from a previous run (eval_model returns (precision, recall, f1)):
# precision = 0.0005
# recall = 0.0004
# f1 = 0.0004
# NOTE(review): the same three figures are recorded for every recommender in this notebook — TODO re-run and confirm.