In [25]:
import jax 
import jax.numpy as jnp
import numpy as np 
import pandas as pd
from cluster import BasicClusteringRecommender, KMeansRecommender

In [26]:
# Read the preprocessed train and test splits (relative paths) into DataFrames.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../preprocessed_dataset/train_dataset.csv',
        '../preprocessed_dataset/test_dataset.csv',
    )
)

In [27]:
# Peek at the first rows of the training split to see which columns were loaded.
train_df.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,userID,review_count,songID,rating,user_id_idx,item_id_idx
0,0,1179720,815,10,115603,5,186,17212
1,1,1037453,620,10,65510,5,138,9767
2,2,103387,11415,10,74460,5,2402,11085
3,3,895146,4255,10,7294,5,910,1125
4,4,305030,14171,10,128621,5,2988,19123


In [28]:
# Summary statistics (count, mean, std, quartiles, min/max) for the training split.
train_df.describe()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,userID,review_count,songID,rating,user_id_idx,item_id_idx
count,40000.0,40000.0,40000.0,40000.0,40000.0,40000.0,40000.0,40000.0
mean,19999.5,662928.3,11816.062275,10.0,67869.195175,4.548675,2499.701775,10121.7178
std,11547.14972,350189.0,6833.546813,0.0,39498.025905,0.695265,1442.664511,5859.528774
min,0.0,0.0,0.0,10.0,2.0,3.0,0.0,0.0
25%,9999.75,377334.0,5806.0,10.0,33787.25,4.0,1247.0,5084.75
50%,19999.5,734041.5,11853.0,10.0,67459.5,5.0,2501.0,10064.5
75%,29999.25,918107.2,17743.0,10.0,101899.0,5.0,3748.0,15193.0
max,39999.0,1315057.0,23680.0,10.0,136729.0,5.0,4999.0,20308.0


In [29]:
# Same summary for the test split, for comparison against the training split.
test_df.describe()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,userID,review_count,songID,rating,user_id_idx,item_id_idx
count,6551.0,6551.0,6551.0,6551.0,6551.0,6551.0,6551.0,6551.0
mean,5003.284689,672751.6,11839.409098,10.0,68064.745535,4.565715,2504.632117,10150.780034
std,2887.366461,349748.1,6833.804936,0.0,39461.786362,0.685672,1442.70945,5853.983402
min,0.0,33.0,17.0,10.0,69.0,3.0,1.0,9.0
25%,2498.0,389804.0,5896.0,10.0,34241.0,4.0,1265.0,5143.0
50%,5012.0,735416.0,11909.0,10.0,67818.0,5.0,2514.0,10118.0
75%,7490.5,928712.5,17723.5,10.0,102331.0,5.0,3743.5,15254.5
max,9999.0,1315058.0,23680.0,10.0,136719.0,5.0,4999.0,20307.0


In [30]:
# Keep only the rating and the two index columns in both splits;
# every other loaded column is dropped from here on.
train_df, test_df = (
    split[['rating', 'user_id_idx', 'item_id_idx']]
    for split in (train_df, test_df)
)

In [31]:
# Expose the index columns under the userID / songID names that eval_model reads.
# Reassign the result instead of renaming in place: train_df and test_df are column
# selections of the loaded frames, and mutating such a selection in place can raise
# pandas' SettingWithCopyWarning.
train_df = train_df.rename(columns={'user_id_idx': 'userID', 'item_id_idx': 'songID'})
test_df = test_df.rename(columns={'user_id_idx': 'userID', 'item_id_idx': 'songID'})

In [32]:
from sklearn.metrics import precision_score, recall_score, f1_score

def eval_model(model, test_df, average='macro'):
    """Score a fitted recommender against held-out ratings.

    Every (userID, songID) pair in ``test_df`` is passed to
    ``model.predict_score``; the returned values are then compared with the
    ``rating`` column as classification labels.

    Parameters
    ----------
    model : object
        Anything exposing ``predict_score(user_id, song_id)``.
    test_df : pandas.DataFrame
        Must contain ``userID``, ``songID`` and ``rating`` columns.
    average : str, default 'macro'
        Averaging mode forwarded unchanged to ``precision_score``,
        ``recall_score`` and ``f1_score``.

    Returns
    -------
    tuple
        ``(precision, recall, f1)``.
    """
    # Apply row-wise over the two ID columns only: a row-wise apply coerces each row
    # to a single common dtype, so carrying a non-integer ``rating`` column along
    # would hand float IDs to the model.
    id_pairs = test_df[['userID', 'songID']]
    predicted_rating = id_pairs.apply(
        lambda row: model.predict_score(row.userID, row.songID), axis=1
    )
    actual_rating = test_df.rating

    precision, recall, f1 = (
        metric(actual_rating, predicted_rating, average=average)
        for metric in (precision_score, recall_score, f1_score)
    )
    return precision, recall, f1

**Evaluate Basic Clustering Recommender**

In [33]:
# Baseline model. NOTE(review): the meaning of n_neighbor=200 is defined by the
# `cluster` module — confirm there before tuning.
recommender = BasicClusteringRecommender(n_neighbor=200)

In [34]:
# Train the baseline recommender on the training split.
recommender.fit(train_df)

In [35]:
# Score only the first 100 test rows: evaluating every test row may take ~2 hours.
# The result is the (precision, recall, f1) tuple returned by eval_model.
eval_model(recommender, test_df.iloc[:100])

(0.3511011011011011, 0.34150326797385616, 0.19122234826999432)

**Evaluate K-Means Recommender with Minibatch**

In [36]:
# K-Means variant with use_minibatch=True. Rebinds `recommender`, so the
# previously fitted model is no longer reachable under this name.
recommender = KMeansRecommender(n_neighbor=200, use_minibatch=True)

In [37]:
# Train the mini-batch K-Means recommender on the training split.
recommender.fit(train_df)

In [38]:
# Score the first 100 test rows only (same subset as the baseline evaluation);
# returns (precision, recall, f1).
eval_model(recommender, test_df.iloc[:100])

(0.35964912280701755, 0.33660130718954245, 0.18452380952380953)

**Evaluate K-Means Recommender without Minibatch**

In [39]:
# K-Means variant with use_minibatch=False. Rebinds `recommender`, so the
# previously fitted model is no longer reachable under this name.
recommender = KMeansRecommender(n_neighbor=200, use_minibatch=False)

In [40]:
# Train the non-mini-batch K-Means recommender on the training split.
recommender.fit(train_df)

In [41]:
# Score the first 100 test rows only (same subset as the other evaluations);
# returns (precision, recall, f1).
eval_model(recommender, test_df.iloc[:100])

(0.3189102564102564, 0.4011437908496733, 0.17153931339977854)