In [91]:
import surprise
import pandas as pd
import numpy as np
import datetime
from surprise.model_selection import GridSearchCV
from scipy import sparse

In [211]:
#loading data
data = pd.read_csv('data/ml-latest-small/ratings.csv')

In [110]:
data.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [111]:
data.shape

(100004, 4)

In [212]:
# Timestamp -> year/month conversion.
# The timestamps are epoch seconds. Decode them in UTC with one vectorised
# call: datetime.fromtimestamp() converts to the machine's *local* time zone,
# so the derived year/month could differ between machines for ratings made
# close to a month or year boundary, and it also needs a Python call per row.
rating_time = pd.to_datetime(data['timestamp'], unit='s', utc=True)
data['year'] = rating_time.dt.year
data['month'] = rating_time.dt.month

In [213]:
data.head()

Unnamed: 0,userId,movieId,rating,timestamp,year,month
0,1,31,2.5,1260759144,2009,12
1,1,1029,3.0,1260759179,2009,12
2,1,1061,3.0,1260759182,2009,12
3,1,1129,2.0,1260759185,2009,12
4,1,1172,4.0,1260759205,2009,12


In [214]:
data.userId.nunique()

671

In [215]:
data.movieId.nunique()

9066

In [216]:
# main dataframe to be used
df = data.loc[:, ['userId', 'movieId', 'rating']]

Continuous ID mapping

In [117]:
# continuous IDs
def proc_col(col):
    """Encode the values of a column as contiguous zero-based integer codes.

    Codes are handed out in the order in which ``col.unique()`` lists the
    distinct values.

    Returns a 3-tuple:
        * dict mapping each code back to the original value,
        * NumPy array holding the code of every entry of ``col``, in order,
        * the number of distinct values.
    """
    distinct = col.unique()
    # original value -> code
    code_of = dict(zip(distinct, range(len(distinct))))
    # code -> original value (dict iteration preserves insertion order)
    value_of = dict(enumerate(code_of))
    codes = np.array([code_of[value] for value in col])
    return value_of, codes, len(distinct)

In [119]:
def encode_data(df):
    """Return a copy of ``df`` whose user and movie IDs are contiguous indices.

    The ``userId`` and ``movieId`` columns are each passed through
    ``proc_col`` and replaced by the resulting zero-based codes. The input
    frame itself is not modified.

    Parameters:
        df: DataFrame with ``userId`` and ``movieId`` columns.

    Returns:
        (encoded_df, idx2user, idx2movie, num_users, num_movies) where the
        ``idx2*`` dicts map a code back to the original ID and the ``num_*``
        values are the counts of distinct users / movies.
    """
    idx2user, user_col, num_users = proc_col(df.userId)
    idx2movie, movie_col, num_movies = proc_col(df.movieId)
    # assign() builds a new frame, so the caller's DataFrame keeps its raw IDs.
    # Overwriting the columns in place meant that calling this twice on the
    # same frame encoded already-encoded IDs and lost the mapping back to the
    # original ones.
    encoded = df.assign(userId=user_col, movieId=movie_col)
    return encoded, idx2user, idx2movie, num_users, num_movies

In [120]:
df, idx2user, idx2movie, num_users, num_movies = encode_data(df)

Creating embeddings

In [122]:
def create_embedings(n, num_factors):
    """Create a random ``(n, num_factors)`` embedding matrix.

    Entries are uniform draws from [0, 1) multiplied by ``6 / num_factors``,
    taken from NumPy's global random state.
    """
    uniform_draws = np.random.random(size=(n, num_factors))
    return 6 * uniform_draws / num_factors

In [123]:
def df2matrix(df, nrows, ncols, column_name="rating"):
    """Build a sparse users-by-movies matrix (CSC) from a ratings frame.

    Row indices come from ``df['userId']``, column indices from
    ``df['movieId']`` and the stored values from ``df[column_name]``.
    The result has shape ``(nrows, ncols)``.
    """
    row_idx = df['userId'].values
    col_idx = df['movieId'].values
    entries = df[column_name].values
    coordinates = (row_idx, col_idx)
    return sparse.csc_matrix((entries, coordinates), shape=(nrows, ncols))

In [124]:
Y = df2matrix(df, num_users, num_movies)

In [129]:
def sparse_multiply(df, emb_user, emb_movie):
    """Predict a rating for every (user, movie) pair listed in ``df``.

    Each prediction is the dot product of the user's and the movie's
    embedding rows. Side effect: the predictions are written to
    ``df["prediction"]``. The return value is the sparse matrix that
    ``df2matrix`` builds from that column, shaped
    ``(emb_user.shape[0], emb_movie.shape[0])``.
    """
    user_rows = emb_user[df["userId"].values]
    movie_rows = emb_movie[df["movieId"].values]
    df["prediction"] = np.sum(user_rows * movie_rows, axis=1)
    n_users, n_movies = emb_user.shape[0], emb_movie.shape[0]
    return df2matrix(df, n_users, n_movies, column_name="prediction")

In [130]:
def cost(df, emb_user, emb_movie):
    """Mean squared error between predicted and actual ratings in ``df``.

    A prediction is the dot product of the user's and the movie's embedding
    rows. Side effect: the predictions are stored in ``df["prediction"]``.
    """
    user_rows = emb_user[df["userId"].values]
    movie_rows = emb_movie[df["movieId"].values]
    df["prediction"] = np.sum(user_rows * movie_rows, axis=1)
    residual = df.prediction - df.rating
    return np.mean(np.square(residual))

In [133]:
def gradient(df, Y, emb_user, emb_movie):
    """Gradients of the mean squared error w.r.t. both embedding matrices.

    ``Y`` is the sparse rating matrix. The error term is Y multiplied
    elementwise by its own sign pattern, minus the predictions returned by
    ``sparse_multiply``; for non-negative ratings the first factor is just Y.
    Both gradients are divided by the number of stored entries in ``Y``.

    Note: all three matrices are densified here, and ``sparse_multiply``
    writes a ``prediction`` column onto ``df`` as a side effect.

    Returns ``(d_emb_user, d_emb_movie)``.
    """
    n_observed = len(Y.data)
    observed_mask = Y.sign().todense()
    targets = np.multiply(Y.todense(), observed_mask)
    predictions = sparse_multiply(df, emb_user, emb_movie).todense()
    delta = targets - predictions
    d_emb_user = -2 * np.dot(delta, emb_movie) / n_observed
    d_emb_movie = -2 * np.dot(delta.T, emb_user) / n_observed
    return d_emb_user, d_emb_movie

In [138]:
def gradient_descent(df,
                     emb_user,
                     emb_movie,
                     iterations=100,
                     learning_rate=0.01,
                     df_val=None):
    """Fit user and movie embeddings by gradient descent with momentum.

    Parameters:
        df: training ratings frame (``userId``, ``movieId``, ``rating``).
        emb_user, emb_movie: initial embedding matrices.
        iterations: total number of update steps. Zero or a negative value
            performs no update at all.
        learning_rate: step size applied to the momentum-averaged gradient.
        df_val: optional validation frame; when given, its cost is printed
            alongside the training cost.

    Returns:
        ``(emb_user, emb_movie)`` as NumPy arrays.

    The first step uses the raw gradient, which also seeds the momentum
    terms. Every later step blends 90% of the previous momentum with 10% of
    the fresh gradient. Costs are printed on loop iterations 0, 50, 100, ...
    """
    # The first update used to run unconditionally, so iterations=0 (or a
    # negative count) still moved the embeddings once. Return them untouched.
    if iterations <= 0:
        return np.array(emb_user), np.array(emb_movie)

    Y = df2matrix(df, emb_user.shape[0], emb_movie.shape[0])

    # Step 1: plain gradient step; its gradients become the initial momentum.
    grad_u_moment, grad_m_moment = gradient(df, Y, emb_user, emb_movie)
    emb_user = np.array(np.subtract(emb_user, learning_rate * grad_u_moment))
    emb_movie = np.array(np.subtract(emb_movie, learning_rate * grad_m_moment))

    # Remaining steps: exponential moving average of the gradients.
    for i in range(iterations - 1):
        grad_user, grad_movie = gradient(df, Y, emb_user, emb_movie)
        grad_u_moment = .9 * grad_u_moment + .1 * grad_user
        grad_m_moment = .9 * grad_m_moment + .1 * grad_movie
        emb_user = np.array(
            np.subtract(emb_user, learning_rate * grad_u_moment))
        emb_movie = np.array(
            np.subtract(emb_movie, learning_rate * grad_m_moment))
        if i % 50 == 0:
            print("Training cost:", cost(df, emb_user, emb_movie))
            if df_val is not None:
                print("Validation cost:", cost(df_val, emb_user, emb_movie))
    return emb_user, emb_movie

In [147]:
K = 50
emb_user = create_embedings(num_users, K)
emb_movie = create_embedings(num_movies, K)

In [148]:
# Random split: each row independently goes to `train` when its uniform draw
# is below 0.8, otherwise to `val`. Both halves are copied so that columns
# added to them later do not touch `df`.
msk = np.random.rand(df.shape[0]) < 0.8
train, val = df.loc[msk].copy(), df.loc[~msk].copy()

In [149]:
train.shape

(79975, 4)

In [150]:
val.shape

(20029, 4)

In [151]:
emb_user, emb_movie = gradient_descent(train, emb_user, emb_movie, iterations=500, learning_rate=1,df_val=val)

Training cost: 12.327841399250467
Validation cost: 12.346553586992771
Training cost: 9.863873836475703
Validation cost: 9.894877773490808
Training cost: 7.10636303744941
Validation cost: 7.185340992337435
Training cost: 5.15209404292156
Validation cost: 5.275329074662304
Training cost: 4.0288353664395125
Validation cost: 4.17586501559257
Training cost: 3.306673837393524
Validation cost: 3.4662901948384612
Training cost: 2.8052537950947953
Validation cost: 2.9745943450452867
Training cost: 2.441673330622611
Validation cost: 2.619949679562914
Training cost: 2.168911309585991
Validation cost: 2.3554852957576777
Training cost: 1.9584002757142958
Validation cost: 2.1525752779353025


In [152]:
emb_movie.shape

(9066, 50)

In [153]:
emb_user.shape

(671, 50)

In [155]:
val.head()

Unnamed: 0,userId,movieId,rating,prediction
5,0,5,2.0,0.954744
6,0,6,2.0,0.919814
7,0,7,2.0,0.974646
9,0,9,2.0,0.838881
10,0,10,2.5,0.688386


In [156]:
movieId = 5

In [157]:
# Look the embedding up through the `movieId` variable set in the previous
# cell instead of repeating the literal 5, so the selected movie only has to
# be changed in one place.
movie_emb = emb_movie[movieId]

In [159]:
movie_emb.shape

(50,)

In [168]:
user = pd.DataFrame(emb_user)

In [169]:
user['idx'] = user.index

In [170]:
user.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,41,42,43,44,45,46,47,48,49,idx
0,0.075774,0.08719,0.143355,0.075602,0.072874,0.100537,0.169856,0.055601,0.067458,0.049219,...,0.072918,0.127727,0.090135,0.141335,0.108283,0.12046,0.163365,0.154001,0.141434,0
1,0.324426,0.344349,0.298928,0.248014,0.267309,0.236502,0.283352,0.247145,0.216981,0.261925,...,0.227566,0.235831,0.297773,0.225322,0.285864,0.263241,0.243186,0.259415,0.318395,1
2,0.283232,0.211811,0.188078,0.198078,0.224124,0.227436,0.273774,0.190159,0.276533,0.224431,...,0.167486,0.239141,0.224663,0.25375,0.263012,0.218164,0.247686,0.273554,0.283557,2
3,0.476498,0.424242,0.496214,0.465258,0.46612,0.542126,0.473251,0.480762,0.486591,0.498327,...,0.464328,0.49248,0.468006,0.525331,0.521093,0.518539,0.452737,0.452803,0.448321,3
4,0.413721,0.363192,0.330802,0.380994,0.372192,0.353819,0.277184,0.317804,0.309241,0.310072,...,0.313701,0.308145,0.296386,0.299122,0.356641,0.324782,0.293738,0.348399,0.330234,4


In [171]:
user['userid'] = user.idx.apply(lambda x: idx2user[x])

In [175]:
# Score every user against the chosen movie with one matrix-vector product
# instead of a Python call per row. The embedding values occupy the leading
# columns of `user` ('idx' and 'userid' were appended after them), so the
# slice width is taken from `movie_emb` rather than a hard-coded 50.
user['score'] = np.dot(user.iloc[:, :movie_emb.shape[0]].values, movie_emb)

In [177]:
user.sort_values(by = 'score',ascending=False)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,43,44,45,46,47,48,49,idx,userid,score
298,0.542951,0.534376,0.576110,0.531818,0.602705,0.539814,0.528999,0.602673,0.520030,0.602078,...,0.620353,0.511752,0.546375,0.623657,0.631192,0.522197,0.522804,298,299,5.356599
651,0.534810,0.640972,0.553720,0.545373,0.558429,0.541880,0.531075,0.528594,0.586832,0.506193,...,0.586316,0.534387,0.580944,0.614849,0.486281,0.561432,0.502913,651,652,5.212927
286,0.531698,0.496631,0.509895,0.475379,0.515521,0.597858,0.514784,0.524066,0.528765,0.521697,...,0.484548,0.574872,0.487943,0.466497,0.502496,0.518065,0.554744,286,287,5.030118
563,0.455467,0.512256,0.474841,0.487855,0.448161,0.515322,0.589571,0.479124,0.550869,0.599542,...,0.646398,0.573668,0.609889,0.624429,0.584418,0.508584,0.445220,563,564,5.027698
241,0.604619,0.576207,0.564450,0.551569,0.499320,0.529696,0.514883,0.526240,0.540737,0.490828,...,0.516837,0.547369,0.522387,0.547842,0.501971,0.503697,0.494431,241,242,4.945577
546,0.547251,0.417759,0.520556,0.577611,0.574033,0.507473,0.571418,0.555195,0.574088,0.641306,...,0.511129,0.492738,0.585409,0.472148,0.474528,0.585951,0.473464,546,547,4.870940
557,0.520664,0.552205,0.558891,0.468017,0.562223,0.521901,0.556958,0.462780,0.487071,0.548060,...,0.525072,0.485988,0.570186,0.571072,0.501202,0.570128,0.562956,557,558,4.865219
94,0.452870,0.557964,0.459374,0.473871,0.565923,0.573097,0.537372,0.555315,0.470960,0.518560,...,0.482120,0.516739,0.503479,0.588621,0.507916,0.483621,0.441414,94,95,4.820253
426,0.431350,0.512708,0.475721,0.474049,0.555673,0.537387,0.529922,0.498237,0.476224,0.478523,...,0.509230,0.531754,0.541889,0.518389,0.472093,0.560843,0.445512,426,427,4.802888
29,0.475472,0.494658,0.554785,0.552867,0.582831,0.531629,0.529357,0.450177,0.499195,0.559978,...,0.506430,0.491463,0.527406,0.548763,0.516094,0.513670,0.469271,29,30,4.801448


Using surprise: SVD and NMF

In [227]:
from surprise import SVDpp
from surprise import SVD,NMF
from surprise import Dataset,Reader
from surprise import accuracy
from surprise.model_selection import train_test_split
from surprise.model_selection import GridSearchCV
from surprise.model_selection import cross_validate

In [217]:
df = data.loc[:, ['userId', 'movieId', 'rating','timestamp']]

In [218]:
df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [219]:
reader = Reader(rating_scale=(0.5, 5.0))

In [225]:
# The columns must correspond to user id, item id and ratings (in that order).
data = Dataset.load_from_df(df[['userId', 'movieId', 'rating']], reader)

In [226]:
data = data.build_full_trainset()

In [205]:
# train_test_split() expects a surprise Dataset, but the cell above rebinds
# `data` to the Trainset returned by build_full_trainset(). Build the Dataset
# again here so this cell also works when the notebook is run top to bottom.
ratings_dataset = Dataset.load_from_df(df[['userId', 'movieId', 'rating']], reader)
train, test = train_test_split(ratings_dataset, test_size=0.01)

In [222]:
# param_grid = {'n_factors': [110, 120, 140, 160], 'n_epochs': [90, 100, 110], 'lr_all': [0.001, 0.003, 0.005, 0.008],
#               'reg_all': [0.08, 0.1, 0.15]}
# gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)
# gs.fit(data)
# algo = gs.best_estimator['rmse']
# print(gs.best_score['rmse'])
# print(gs.best_params['rmse'])
# cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

# Fit SVD with fixed hyperparameters (the grid search above is commented out).
# NOTE(review): the model is fitted on `data`, which the earlier cell rebinds
# to the result of build_full_trainset() -- not on the `train` split. Confirm
# that fitting on all ratings is intended.
algo = SVD(n_factors=160, n_epochs=100, lr_all=0.005, reg_all=0.1)
algo.fit(data)
# test_pred = algo.test(test)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1166ffa20>

In [224]:
algo.pu

array([[ 0.05793621, -0.06761844, -0.15600489, ..., -0.03391789,
        -0.03101116, -0.0531752 ],
       [ 0.13941621,  0.10012024, -0.07491198, ..., -0.01462149,
        -0.02027692,  0.08650187],
       [-0.01491837, -0.02872689,  0.01620339, ...,  0.03429133,
        -0.04094031, -0.00210094],
       ...,
       [-0.02636237, -0.01790011, -0.08826658, ..., -0.04968326,
         0.03362109, -0.0120746 ],
       [ 0.02458236, -0.11687885,  0.02645557, ...,  0.06891265,
         0.09785065,  0.04425158],
       [ 0.09232802,  0.00841886, -0.07332009, ..., -0.03233327,
         0.05790043, -0.01323758]])

In [232]:
# param_grid = {'n_factors': [110, 120, 140, 160], 'n_epochs': [90, 100, 110], 'lr_all': [0.001, 0.003, 0.005, 0.008],
#               'reg_all': [0.08, 0.1, 0.15]}
# gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)
# gs.fit(data)
# algo = gs.best_estimator['rmse']
# print(gs.best_score['rmse'])
# print(gs.best_params['rmse'])
# cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

# Fit NMF with its default hyperparameters. The commented-out grid search
# above is written for SVD and does not tune this model.
# This rebinds `algo`, so the SVD model fitted earlier is no longer reachable
# under that name.
# NOTE(review): fitted on `data`, not on the `train` split -- confirm intent.
algo = NMF()
algo.fit(data)
# test_pred = algo.test(test)

<surprise.prediction_algorithms.matrix_factorization.NMF at 0x1168c7278>

In [238]:
algo.pu

array([[0.51941725, 0.52493431, 0.09810816, ..., 0.61321427, 0.83428332,
        0.31614544],
       [0.7331803 , 0.51780602, 0.40529173, ..., 0.20202588, 0.50589223,
        0.67314813],
       [0.35898583, 0.49557728, 0.36242313, ..., 0.34145478, 0.43219474,
        0.55558954],
       ...,
       [0.25954847, 0.60547448, 0.72055633, ..., 0.65208159, 0.65836988,
        0.32775851],
       [0.23349387, 0.53933165, 0.53925977, ..., 0.35180689, 0.24806165,
        0.09125572],
       [0.78238722, 0.51520383, 0.61014149, ..., 0.00867874, 0.77454668,
        0.12002469]])