In [2]:
!pip install numpy
!pip install scikit-surprise



In [15]:
import pandas as pd
import pickle

# Load dataset

In [27]:
# Read the pickled scraped user-song table and report how many rows it holds.
load_df = pd.read_pickle("../data/raw/scraped/usersong")
len(load_df)

499860

In [43]:
# Add my own listening history to the scraped data.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat is the
# supported replacement. Row labels are kept unchanged, exactly as append did.
load_rzhan97_df = pd.read_pickle("../data/processed/rzhan97")
load_df = pd.concat([load_df, load_rzhan97_df])

In [44]:
# Make sure every play count has a numeric dtype (to_numeric raises on unparsable values).
load_df["count"] = pd.to_numeric(load_df["count"])

In [45]:
# Restrict the frame to the three columns used downstream, in this order.
# A listed column that is absent from the frame would come back filled with NaN.
cols = ['user', 'song', 'count']
load_df = load_df.reindex(columns=cols)

In [46]:
# Preview the first rows of the combined data
load_df.head()

Unnamed: 0,user,song,count
0,raedrexlre,Rex Orange County-Corduroy Dreams,9
1,raedrexlre,Keshi-blue,8
2,raedrexlre,Zeph-Lucky,8
3,raedrexlre,Pink Sweat$-17,7
4,raedrexlre,Childish Gambino-Feels Like Summer,6


In [47]:
# Preview the last rows, which should be the rows just added for rzhan97
load_df.tail()

Unnamed: 0,user,song,count
45,rzhan97,88Rising-History,5
46,rzhan97,Calinn-Relationship Obsession,5
47,rzhan97,Joji-ATTENTION,5
48,rzhan97,Yaeji-Year to Year,5
49,rzhan97,告五人-愛在夏天,5


In [66]:
# Normalise each user's play counts onto a 3-5 rating scale.
# The floor is 3 rather than 1: a song a user has played several times is one they at
# least mildly like, so even their least-played song should not get the lowest rating.
RATING_MIN, RATING_MAX = 3.0, 5.0

# Reset the index first: the appended rows reuse row labels already present in the
# scraped data, and unique labels make the column assignment below unambiguous.
load_df = load_df.reset_index(drop=True)

# Work on the numeric 'count' column only. Transforming the whole group would also
# push the 'song' strings through the arithmetic.
user_counts = load_df.groupby('user')['count']
count_max = user_counts.transform('max')
count_range = count_max - user_counts.transform('min')

# Linear map per user: max count -> RATING_MAX, min count -> RATING_MIN.
scaled = ((RATING_MAX - RATING_MIN) / count_range) * (load_df['count'] - count_max) + RATING_MAX

# A user whose counts are all equal has range 0, which turns the formula into 0/0 (NaN).
# Every song is then that user's most-played song, so it is given RATING_MAX.
flat_user = count_range.eq(0) & load_df['count'].notna()
load_df['new_rating'] = scaled.mask(flat_user, RATING_MAX)

In [81]:
# Rows that received the top rating of 5 (the normalisation maps each user's highest count to 5)
load_df[load_df.new_rating == 5]

Unnamed: 0,user,song,count,new_rating
0,raedrexlre,Rex Orange County-Corduroy Dreams,9,5.0
50,eluviahn,Doja Cat-Addiction,154,5.0
100,thamirws,globoplay.globo.com-TV Globo Ao Vivo | Assista...,144,5.0
150,lordsubseven,Taylor Swift-Enchanted,1003,5.0
200,pghpenguins71,$uicideboy$-KILL YOURSELF (Part III),334,5.0
...,...,...,...,...
499660,jo_alvaradoo,Joji-Sanctuary,560,5.0
499661,jo_alvaradoo,Mayer Hawthorne-A Strange Arrangement,560,5.0
499710,fiyafuadina,BTS / BANGTAN BOYS (방탄소년단)-We On,171,5.0
499760,Bad_Doc,TWICE-I CAN'T STOP ME,482,5.0


In [82]:
# Check the normalised ratings on the last five rows
load_df.tail(5)

Unnamed: 0,user,song,count,new_rating
499905,rzhan97,88Rising-History,5,3.0
499906,rzhan97,Calinn-Relationship Obsession,5,3.0
499907,rzhan97,Joji-ATTENTION,5,3.0
499908,rzhan97,Yaeji-Year to Year,5,3.0
499909,rzhan97,告五人-愛在夏天,5,3.0


In [69]:
# Wrap the (user, song, rating) triples in a Surprise dataset.
from surprise import Dataset, Reader

reader = Reader()  # constructed with the library's default settings
data = Dataset.load_from_df(load_df.loc[:, ['user', 'song', 'new_rating']], reader)

In [13]:
# Benchmark the Surprise algorithms with 3-fold cross-validation and rank them by RMSE.
# NOTE: the recorded run of this cell ended in KeyboardInterrupt, so expect it to be slow
# on the full dataset.
from surprise import (
    SVD,
    SVDpp,
    NMF,
    NormalPredictor,
    KNNBaseline,
    KNNBasic,
    KNNWithMeans,
    KNNWithZScore,
    BaselineOnly,
    CoClustering,
)
from surprise.model_selection import cross_validate

benchmark = []

# Iterate over all algorithms
for algorithm in [SVD(), SVDpp(), NMF(), NormalPredictor(), KNNBaseline(), KNNBasic(),
                  KNNWithMeans(), KNNWithZScore(), BaselineOnly(), CoClustering()]:
    # Perform cross validation
    results = cross_validate(algorithm, data, measures=['RMSE'], cv=3, verbose=False)

    # Average every reported metric over the folds and label the row with the class name.
    # Rows are collected as plain dicts: Series.append was removed in pandas 2.0, and
    # mixing the name string into a numeric Series forced every column to object dtype.
    row = pd.DataFrame.from_dict(results).mean(axis=0).to_dict()
    row['Algorithm'] = type(algorithm).__name__
    benchmark.append(row)

pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')

KeyboardInterrupt: 

In [70]:
# SVD came out ahead in the benchmark (low RMSE and low fit time), so tune it
# with a 3-fold grid search.
from surprise import SVD, Dataset
from surprise.model_selection import GridSearchCV

param_grid = dict(
    n_epochs=[10, 20],
    lr_all=[0.002, 0.005, 0.1],
    reg_all=[0.1, 0.5],
)

gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)
gs.fit(data)

# Best RMSE found by the search
print(gs.best_score['rmse'])
# Parameter combination that produced that RMSE
print(gs.best_params['rmse'])

# Estimator configured with the best-RMSE parameters
algo = gs.best_estimator['rmse']

0.4503521003213648
{'n_epochs': 20, 'lr_all': 0.1, 'reg_all': 0.1}


In [71]:
# Build a trainset from the whole of `data` (no hold-out split) for the final model fit
trainset = data.build_full_trainset()

In [72]:
# Fit the tuned estimator on the full trainset
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f98b7934910>

In [73]:
# Persist the fitted model to disk.
# The context manager closes the file even if pickling fails; the bare open() call
# used before left the handle unclosed.
model_path = "../model/final_model.pkl"
with open(model_path, 'wb') as model_file:
    pickle.dump(algo, model_file)

In [74]:
# Reload the saved model from disk.
# NOTE: pickle.load can execute arbitrary code, so only load model files you produced yourself.
model_path = "../model/final_model.pkl"
with open(model_path, 'rb') as model_file:
    load_model = pickle.load(model_file)

In [84]:
uid = "rzhan97"  # raw user id, as it appears in the 'user' column (a string)
iid = "Joji-Sanctuary"  # raw item id, as it appears in the 'song' column (a string)

# Predict this user's rating for this song with the reloaded model; r_ui is passed as the reference rating and verbose=True prints the prediction.
pred = load_model.predict(uid, iid, r_ui=3, verbose=True)

user: rzhan97    item: Joji-Sanctuary r_ui = 3.00   est = 3.21   {'was_impossible': False}


In [22]:
def get_top_n(predictions, n=10):
    """Return the top-N recommendations for each user from a set of predictions.

    Args:
        predictions (iterable of Prediction objects): The predictions, as
            returned by the test method of an algorithm. Each item must unpack
            into five fields: (uid, iid, true rating, estimate, details).
        n (int): The number of recommendations to output for each user.
            Default is 10.

    Returns:
        A defaultdict(list) where keys are user (raw) ids and values are lists
        of at most n tuples [(raw item id, rating estimation), ...], ordered by
        estimation from highest to lowest. Items with equal estimations keep
        the order in which they appeared in `predictions`.
    """
    # Imported locally: no visible notebook cell imports defaultdict, so the
    # function previously raised NameError on its first call.
    from collections import defaultdict

    # First group the (item, estimate) pairs by user.
    top_n = defaultdict(list)
    for uid, iid, _true_r, est, _details in predictions:
        top_n[uid].append((iid, est))

    # Then sort each user's pairs by estimate and keep the n highest.
    # Only existing keys are reassigned, so iterating over items() is safe.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda pair: pair[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

In [None]:
# Then predict ratings for all (user, item) pairs that are NOT in the training set.
# NOTE(review): the anti-testset presumably grows with users x items, so this may be very expensive on the full data - confirm before running.
testset = trainset.build_anti_testset()
predictions = algo.test(testset)

top_n = get_top_n(predictions, n=10)
# Print the recommended items for each user
for uid, user_ratings in top_n.items():
    print(uid, [iid for (iid, _) in user_ratings])

In [87]:
# Divide the dataset into train (75%) and test (25%) sets and evaluate the tuned SVD.
# train_test_split is imported here because no visible cell imports it, so the call
# raised NameError on a fresh kernel.
from surprise.model_selection import train_test_split

trainset, testset = train_test_split(data, test_size=0.25)
# Reads the same gs.best_estimator['rmse'] entry that the grid-search cell assigned to
# `algo`; fitting it here retrains it on the 75% split.
algo = gs.best_estimator['rmse']
predictions = algo.fit(trainset).test(testset)

In [88]:
# RMSE of the held-out predictions (rmse prints the value and also returns it)
from surprise import accuracy
accuracy.rmse(predictions)

RMSE: 0.4459


0.44587830834283626

In [89]:
def get_Iu(uid):
    """Count the items rated by a user in the current `trainset`.

    args:
      uid: the raw id of the user
    returns:
      the number of items rated by the user, or 0 when the user
      is not part of the trainset
    """
    try:
        inner_uid = trainset.to_inner_uid(uid)
    except ValueError:  # unknown user: not part of the trainset
        return 0
    return len(trainset.ur[inner_uid])
    
def get_Ui(iid):
    """Count the users who rated an item in the current `trainset`.

    args:
      iid: the raw id of the item
    returns:
      the number of users that have rated the item, or 0 when the item
      is not part of the trainset
    """
    try:
        inner_iid = trainset.to_inner_iid(iid)
    except ValueError:  # unknown item: not part of the trainset
        return 0
    return len(trainset.ir[inner_iid])
    
# Tabulate the held-out predictions together with how many ratings back each one:
# Iu = items rated by the user, Ui = users who rated the item, err = absolute error.
df = pd.DataFrame(predictions, columns=['uid', 'iid', 'rui', 'est', 'details'])
df['Iu'] = df['uid'].apply(get_Iu)
df['Ui'] = df['iid'].apply(get_Ui)
df['err'] = (df['est'] - df['rui']).abs()

# The ten smallest and the ten largest absolute errors
best_predictions = df.sort_values(by='err').head(10)
worst_predictions = df.sort_values(by='err').tail(10)

In [92]:
# Held-out predictions for my own account
df[df.uid == 'rzhan97']

Unnamed: 0,uid,iid,rui,est,details,Iu,Ui,err
1553,rzhan97,88Rising-Midsummer Madness,3.09009,3.328115,{'was_impossible': False},33,64,0.238025
15327,rzhan97,柳爽-漠河舞厅,3.792793,3.291157,{'was_impossible': False},33,0,0.501636
24072,rzhan97,"Scenery-Baby, It's Been a Long Time",3.054054,3.291157,{'was_impossible': False},33,0,0.237103
33156,rzhan97,莫宰羊-健康快樂,3.09009,3.291157,{'was_impossible': False},33,0,0.201067
33496,rzhan97,Jiafeng-Cuisine Lullaby 報菜名,3.108108,3.291157,{'was_impossible': False},33,0,0.183049
37726,rzhan97,Leslie Odom Jr.-Non-Stop,3.234234,3.151874,{'was_impossible': False},33,10,0.082361
37916,rzhan97,于贞-飞奔向你 - Piano Version,3.054054,3.291157,{'was_impossible': False},33,0,0.237103
56682,rzhan97,Jesy Chiang-Sunlit Grassland,3.072072,3.291157,{'was_impossible': False},33,0,0.219085
59808,rzhan97,Rocketman-Lost Film,3.072072,3.291157,{'was_impossible': False},33,0,0.219085
72432,rzhan97,Juno Roome-how deep is your love,3.216216,3.291157,{'was_impossible': False},33,0,0.074941


In [90]:
# Show the ten predictions with the smallest absolute error.
# (The empty subscript `best_predictions[]` was a SyntaxError.)
best_predictions

Unnamed: 0,uid,iid,rui,est,details,Iu,Ui,err
70496,audreydn,Marcin Przybyłowicz-The Temple Of Lilvani,5.0,5.0,{'was_impossible': False},35,3,0.0
98829,markbaek,Cadmium-Feel It Too,5.0,5.0,{'was_impossible': False},76,3,0.0
112819,hamboyisstoopid,Kanye West-POWER,3.900585,3.900588,{'was_impossible': False},70,121,3e-06
85928,seulgiz,MOMOLAND-Freeze,3.834862,3.834857,{'was_impossible': False},84,1,5e-06
39076,Heloisa-vi,Kid Cudi-Cudi Zone,3.275229,3.275235,{'was_impossible': False},72,64,5e-06
19390,dylansbeaudry,TWICE-Feel Special,4.00885,4.008844,{'was_impossible': False},34,511,5e-06
122278,edbd111,MOMOLAND-BBoom BBoom,3.016216,3.016207,{'was_impossible': False},35,93,9e-06
72697,Raymartlight,OH MY GIRL-Dolphin,3.168297,3.168287,{'was_impossible': False},320,79,1.1e-05
102314,yungwykk,TWICE-CRY FOR ME,3.772727,3.772741,{'was_impossible': False},33,424,1.4e-05
20501,olipopp,Metro Boomin-10 Freaky Girls (with 21 Savage),3.290323,3.290338,{'was_impossible': False},38,58,1.6e-05


In [91]:
# Show the ten predictions with the largest absolute error
worst_predictions

Unnamed: 0,uid,iid,rui,est,details,Iu,Ui,err
115889,Justinkook,fromis_9-Talk & Talk,5.0,2.943377,{'was_impossible': False},70,34,2.056623
112007,lizdevaux,Pixies-All I Think About Now,5.0,2.942572,{'was_impossible': False},40,2,2.057428
119036,NajeebR,"Cocteau Twins-Sea, Swallow Me",5.0,2.918299,{'was_impossible': False},38,16,2.081701
64579,Tounihh,My Bloody Valentine-Off Your Face,5.0,2.904627,{'was_impossible': False},109,2,2.095373
3771,fatimahazzahra,Abra-Unlock It (feat. Playboi Carti),5.0,2.893646,{'was_impossible': False},36,9,2.106354
34868,VebiMonica,Phoebe Bridgers-Smoke Signals (Reprise),5.0,2.863368,{'was_impossible': False},38,1,2.136632
41684,lovergirl222,7!!-オレンジ,5.0,2.853781,{'was_impossible': False},38,4,2.146219
26030,kevn2,dubdogz-Infinity - DubDogz & Bhaskar Edit,5.0,2.841707,{'was_impossible': False},34,1,2.158293
69186,Emilia_Porcu,Stray Kids-YOU.,5.0,2.770702,{'was_impossible': False},119,2,2.229298
103829,cominhome,Alexisonfire-Mailbox Arson,5.0,2.744666,{'was_impossible': False},37,1,2.255334
