In [90]:
import jax 
import jax.numpy as jnp
import numpy as np 
import pandas as pd
from cluster import BasicClusteringRecommender, KMeansRecommender

In [91]:
train_df = pd.read_csv('../preprocessed_dataset/train_dataset.csv')
test_df = pd.read_csv('../preprocessed_dataset/test_dataset.csv')

In [92]:
train_df.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,userID,review_count,songID,rating,user_id_idx,item_id_idx
0,0,966383,5226,10,82304,5,1115,12236
1,1,753438,23119,10,61708,5,4880,9184
2,2,1156841,7835,10,122957,5,1667,18346
3,3,503221,16895,10,110494,5,3568,16467
4,4,415592,15695,10,57161,5,3321,8548


In [93]:
train_df.describe()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,userID,review_count,songID,rating,user_id_idx,item_id_idx
count,40000.0,40000.0,40000.0,40000.0,40000.0,40000.0,40000.0,40000.0
mean,19999.5,665742.2,11804.231675,10.0,68053.0381,4.549575,2497.230375,10150.07885
std,11547.14972,350443.2,6830.278556,0.0,39515.208908,0.696171,1442.006203,5875.358351
min,0.0,0.0,0.0,10.0,2.0,3.0,0.0,0.0
25%,9999.75,380163.5,5835.0,10.0,33848.75,4.0,1251.0,5088.75
50%,19999.5,734468.5,11837.0,10.0,67838.5,5.0,2495.0,10079.5
75%,29999.25,924752.2,17732.0,10.0,102227.0,5.0,3746.0,15246.0
max,39999.0,1315059.0,23680.0,10.0,136729.0,5.0,4999.0,20360.0


In [94]:
test_df.describe()

Unnamed: 0.1,Unnamed: 0,userID,review_count,songID,rating,user_id_idx,item_id_idx
count,2466.0,2466.0,2466.0,2466.0,2466.0,2466.0,2466.0
mean,3307.095702,11858.433496,10.0,67045.714517,4.5588,2508.562855,9999.652068
std,1912.952314,6858.378833,0.0,39385.201394,0.685987,1447.535411,5855.695673
min,0.0,90.0,10.0,60.0,3.0,17.0,10.0
25%,1608.0,6005.0,10.0,33557.75,4.0,1284.5,5045.0
50%,3355.5,11871.5,10.0,66685.0,5.0,2504.0,9901.0
75%,4928.25,17832.75,10.0,99568.75,5.0,3764.25,14847.25
max,6603.0,23668.0,10.0,136507.0,5.0,4997.0,20337.0


In [95]:
train_df = train_df[['rating', 'user_id_idx', 'item_id_idx']]
test_df = test_df[['rating', 'user_id_idx', 'item_id_idx']]

In [None]:
train_df.rename(columns={'user_id_idx': 'userID', 'item_id_idx': 'songID'}, inplace=True)
test_df.rename(columns={'user_id_idx': 'userID', 'item_id_idx': 'songID'}, inplace=True)

In [None]:
def eval_model(model, test_df):
    users = test_df.userID.unique()
    recall_scores = []
    precision_scores = []
    f1_scores = []
    n_recommendation = 3

    for user in users:
        recommendations = model.predict(user, n_recommendation)
        tp = len(test_df[test_df.userID == user & np.isin(test_df.songID, recommendations)])

        recall = tp / len(test_df[test_df.userID == user])
        precision = tp / n_recommendation
        f1 = 0 if (recall == 0 or precision == 0) else 2 / (1/recall + 1/precision)

        recall_scores.append(recall)
        precision_scores.append(precision)
        f1_scores.append(f1)

    return np.mean(recall_scores), np.mean(precision_scores), np.mean(f1_scores)

**Evaluate Basic Clustering Recommender**

In [None]:
recommender = BasicClusteringRecommender(n_neighbor=50)

In [None]:
recommender.fit(train_df)

In [None]:
eval_model(recommender, test_df)

ValueError: Cannot take a larger sample than population when 'replace=False'

**Evaluate K-Means Recommdender with Minibatch**

In [None]:
recommender = KMeansRecommender(n_neighbor=50, use_minibatch=True)

In [None]:
recommender.fit(train_df)

ValueError: setting an array element with a sequence.

In [None]:
eval_model(recommender)

(0.35964912280701755, 0.33660130718954245, 0.18452380952380953)

**Evaluate K-Means Recommender without Minibatch**

In [None]:
recommender = KMeansRecommender(n_neighbor=50, use_minibatch=False)

In [None]:
recommender.fit(train_df)

In [None]:
eval_model(recommender, test_df)

(0.3189102564102564, 0.4011437908496733, 0.17153931339977854)