In [1]:
import json
import pandas as pd
import numpy as np
import scipy.sparse as sp
import scipy.linalg as la
from time import perf_counter as pc
from sklearn.decomposition import NMF

# Load Data

In [2]:
df = pd.read_csv("../data/user_movie_40k.csv", header=None)
df = df.rename(columns={0: 'user_id', 1: 'movie_title', 2: 'year', 3: 'movie_id'})

movies = pd.read_csv("../data/movies_40k.csv")

In [3]:
df.shape

(42033, 4)

In [4]:
 def matrix_data(df):
    dfr = df.pivot_table(index='user_id', columns='movie_id').fillna(0)
    X = dfr.to_numpy().astype(int)
    return X

In [5]:
X = matrix_data(df)
X_sparse = sp.coo_matrix(X)

In [41]:
X.shape

(3286, 4948)

# Scikit-learn NMF
Non-negative matrix factorization. https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.NMF.html

In [70]:
def prediction(X_sparse, U, V):
    X_mask = (X_sparse!=0).multiply(np.ones(X_sparse.shape)).toarray()
    X_mask = np.ones(X_sparse.shape) - X_mask
    X_pred = X_mask * (U @ V)
    return X_pred

def recommend(X_pred, user_id, N=10):
    X_user = X_pred[user_id,:]
    ids = np.argpartition(X_user, -N)[-N:]
    ids = ids[np.argsort(X_user[ids])][::-1]
    scores = X_user[ids]
    return ids, scores

In [62]:
model = NMF(init='random', random_state=0, verbose=True)

start_time = pc()
W = model.fit_transform(X_sparse)
H = model.components_
end_time = pc()

print('Used (secs): ', end_time - start_time)
print(model.reconstruction_err_)
print(model.n_iter_)

violation: 1.0
violation: 0.14462126351626187
violation: 0.08494279570935867
violation: 0.03807603029699105
violation: 0.018192296749403748
violation: 0.009474872395966063
violation: 0.005327998977052398
violation: 0.0033021225769176397
violation: 0.0021987584586655477
violation: 0.0015352410555469433
violation: 0.0011097853619402875
violation: 0.0008228180656657295
violation: 0.0006247112184840441
violation: 0.00048471992677806655
violation: 0.0003854554847995767
violation: 0.00031183826062510287
violation: 0.0002570658621648219
violation: 0.0002155651056971088
violation: 0.00018202879184183124
violation: 0.00015551303729543795
violation: 0.0001334125588016438
violation: 0.00011238324549857223
violation: 9.51965851377968e-05
Converged at iteration 24
Used (secs):  4725.864551000001
3078.700401406589
23


In [63]:
with open('nmf_w.npy', 'wb') as f:
    np.save(f, W)

with open('nmf_h.npy', 'wb') as f:
    np.save(f, H)

In [50]:
W = np.load('nmf_w.npy')
H = np.load('nmf_h.npy')

In [61]:
X_pred = prediction(X_sparse, W, H)

In [66]:
X_csr = X_sparse.tocsr()

In [72]:
userid = 1
ids, scores = recommend(X_pred, userid, N=20)
pd.DataFrame({"movies": movies.loc[ids]['movieid'], "score": scores, "already_liked": np.in1d(ids, X_csr[userid].indices)})

Unnamed: 0,movies,score,already_liked
3524,the+bonfire+of+the+vanities+1990,6.8778,False
2802,fellinis+casanova+1976,2.558316,False
4710,night+falls+on+manhattan+1996,2.337484,False
2020,southland+tales+2006,2.31219,False
134,the+wizard+of+oz+1939,2.075056,False
930,the+english+patient+1996,1.888421,False
3463,all+about+my+mother+1999,1.851044,False
3119,orange+county+2002,1.806287,False
3603,dawn+of+the+planet+of+the+apes+2014,1.68823,False
4191,a+walk+to+remember+2002,1.681692,False


In [88]:
userid=123
ids, scores = recommend(X_pred, userid, N=20)
pd.DataFrame({"movies": movies.loc[ids]['movieid'], "score": scores, "already_liked": np.in1d(ids, X_csr[userid].indices)})

Unnamed: 0,movies,score,already_liked
1574,sleeper+1973,5.871484,False
1615,rules+of+engagement+2000,1.442161,False
1076,arsenic+and+old+lace+1944,1.026592,False
4011,im+all+right+jack+1959,1.008805,False
3882,kung+fu+panda+holiday+2010,0.836257,False
3182,major+league+back+to+the+minors+1998,0.756551,False
3133,things+to+do+in+denver+when+youre+dead+1995,0.343658,False
2507,man+of+the+house+1995,0.288278,False
4639,dreamscape+1984,0.247385,False
2514,pollyanna+1960,0.224254,False


# LigtFM
https://github.com/lyst/lightfm

## Using sample data
MovieLens dataset

In [4]:
from lightfm import LightFM
from lightfm.datasets import fetch_movielens
from lightfm.evaluation import precision_at_k

In [92]:
# Load the MovieLens 100k dataset. Only five
# star ratings are treated as positive.
data = fetch_movielens(min_rating=5.0)

# Instantiate and train the model
model_lfm = LightFM(loss='warp')
model_lfm.fit(data['train'], epochs=30, num_threads=2)

# Evaluate the trained model
test_precision = precision_at_k(model_lfm, data['test'], k=5).mean()

In [6]:
data

{'train': <943x1682 sparse matrix of type '<class 'numpy.float32'>'
 	with 19048 stored elements in COOrdinate format>,
 'test': <943x1682 sparse matrix of type '<class 'numpy.int32'>'
 	with 2153 stored elements in COOrdinate format>,
 'item_features': <1682x1682 sparse matrix of type '<class 'numpy.float32'>'
 	with 1682 stored elements in Compressed Sparse Row format>,
 'item_feature_labels': array(['Toy Story (1995)', 'GoldenEye (1995)', 'Four Rooms (1995)', ...,
        'Sliding Doors (1998)', 'You So Crazy (1994)',
        'Scream of Stone (Schrei aus Stein) (1991)'], dtype=object),
 'item_labels': array(['Toy Story (1995)', 'GoldenEye (1995)', 'Four Rooms (1995)', ...,
        'Sliding Doors (1998)', 'You So Crazy (1994)',
        'Scream of Stone (Schrei aus Stein) (1991)'], dtype=object)}

In [93]:
model_lfm

<lightfm.lightfm.LightFM at 0x7f86762ebac0>

In [8]:
test_precision

0.052311756

In [101]:
len(model_lfm.get_user_representations()[0])

943

In [102]:
len(model_lfm.get_item_representations()[0])

1682

In [103]:
def recommend_lfm(model, user_id, N=10):
    X_user = model.predict(user_id, np.arange(data['item_features'].shape[0]))
    ids = np.argpartition(X_user, -N)[-N:]
    ids = ids[np.argsort(X_user[ids])][::-1]
    scores = X_user[ids]
    return ids, scores

In [107]:
userid = 1
ids, scores = recommend_lfm(model_lfm, userid)

In [110]:
pd.DataFrame({"movies": data['item_feature_labels'][ids], "score": scores})

Unnamed: 0,movies,score
0,"English Patient, The (1996)",2.164846
1,"Full Monty, The (1997)",1.530353
2,Fargo (1996),1.386078
3,Leaving Las Vegas (1995),1.327225
4,L.A. Confidential (1997),1.278503
5,"Postino, Il (1994)",1.27234
6,Secrets & Lies (1996),1.269994
7,Contact (1997),1.249399
8,Chasing Amy (1997),1.14908
9,Titanic (1997),1.148281


## Using our data

In [111]:
model_mylfm = LightFM(loss='warp')
model_mylfm.fit(X_sparse, epochs=30, num_threads=2)

<lightfm.lightfm.LightFM at 0x7f86301e65e0>

In [113]:
userid = 1
ids, scores = recommend_lfm(model_mylfm, userid, N=20)
pd.DataFrame({"movies": movies.loc[ids]['movieid'], "score": scores, "already_liked": np.in1d(ids, X_csr[userid].indices)})

Unnamed: 0,movies,score,already_liked
263,blade+runner+1982,1.727342,False
1287,st.+elmos+fire+1985,1.451966,False
269,fantasia+2000+1999,1.351043,False
1615,rules+of+engagement+2000,1.195428,False
1545,my+son+the+fanatic+1997,1.140657,False
644,the+bridge+on+the+river+kwai+1957,1.095225,False
1149,universal+soldier+1992,1.050794,False
364,mrs+brown+1997,1.035769,False
1109,12+2007,0.981934,False
1320,a+fistful+of+dollars+1964,0.952321,False


In [114]:
userid = 123
ids, scores = recommend_lfm(model_mylfm, userid, N=20)
pd.DataFrame({"movies": movies.loc[ids]['movieid'], "score": scores, "already_liked": np.in1d(ids, X_csr[userid].indices)})

Unnamed: 0,movies,score,already_liked
679,johnny+mnemonic+1995,2.073541,False
1615,rules+of+engagement+2000,1.880858,False
1287,st.+elmos+fire+1985,1.880659,True
1210,bound+by+honor+1993,1.868272,False
1655,21+jump+street+2012,1.684915,False
987,the+bridges+of+madison+county+1995,1.610018,True
1574,sleeper+1973,1.538605,False
1101,a+little+princess+1995,1.503409,False
486,united+93+2006,1.447861,False
1303,the+killers+1946,1.363573,False


# Implicit
https://github.com/benfred/implicit

## Using sample data
LastFM dataset

In [6]:
from implicit.datasets.lastfm import get_lastfm

artists, users, artist_user_plays = get_lastfm()

In [7]:
artists.shape

(292385,)

In [25]:
users.shape

(358868,)

In [26]:
artist_user_plays.shape

(292385, 358868)

In [9]:
artist_user_plays

<292385x358868 sparse matrix of type '<class 'numpy.float32'>'
	with 17535606 stored elements in Compressed Sparse Row format>

In [10]:
artist_user_plays.tocsr().sum()

3773558300.0

In [14]:
from implicit.nearest_neighbours import bm25_weight

# weight the matrix, both to reduce impact of users that have played the same artist thousands of times
# and to reduce the weight given to popular items
artist_user_plays = bm25_weight(artist_user_plays, K1=100, B=0.8)

# get the transpose since the most of the functions in implicit expect (user, item) sparse matrices instead of (item, user)
user_plays = artist_user_plays.T.tocsr()

In [15]:
from implicit.als import AlternatingLeastSquares

model = AlternatingLeastSquares(factors=64, regularization=0.05, alpha=2.0)
model.fit(user_plays)



  0%|          | 0/15 [00:00<?, ?it/s]

In [16]:
# Get recommendations for the a single user
userid = 12345
ids, scores = model.recommend(userid, user_plays[userid], N=10, filter_already_liked_items=False)

In [20]:
# Use pandas to display the output in a table, pandas isn't a dependency of implicit otherwise
pd.DataFrame({"artist": artists[ids], "score": scores, "already_liked": np.in1d(ids, user_plays[userid].indices)})

Unnamed: 0,artist,score,already_liked
0,spiritual front,1.040421,False
1,puissance,1.021218,True
2,karjalan sissit,0.989997,False
3,rome,0.988649,True
4,d-a-d,0.987983,True
5,ordo rosarius equilibrio & spiritual front,0.981284,False
6,arditi,0.981159,True
7,the coffinshakers,0.980529,True
8,ordo rosarius equilibrio,0.980149,False
9,blood axis,0.97307,False


In [21]:
# get related items for the beatles (itemid = 25512)
ids, scores= model.similar_items(252512)

# display the results using pandas for nicer formatting
pd.DataFrame({"artist": artists[ids], "score": scores})

Unnamed: 0,artist,score
0,the beatles,1.0
1,the beach boys,0.99345
2,the rolling stones,0.993244
3,john lennon,0.992751
4,bob dylan,0.992225
5,the who,0.992147
6,david bowie,0.991132
7,simon & garfunkel,0.991071
8,led zeppelin,0.990473
9,the kinks,0.989937


## Using our data

In [42]:
X_imp = bm25_weight(X_sparse, K1=100, B=0.8)
X_imp = X_imp.tocsr()

model_imp = AlternatingLeastSquares(factors=64, regularization=0.05, alpha=2.0)
model_imp.fit(X_imp)

  0%|          | 0/15 [00:00<?, ?it/s]

In [73]:
userid = 1
ids, scores = model_imp.recommend(userid, X_imp[userid], N=20, filter_already_liked_items=False)
pd.DataFrame({"movies": movies.loc[ids]['movieid'], "score": scores, "already_liked": np.in1d(ids, X_imp[userid].indices)})

Unnamed: 0,movies,score,already_liked
3806,childs+play+1988,0.998216,True
3385,the+devil+and+daniel+johnston+2005,0.985364,True
11,kate++leopold+2001,0.982185,True
2350,otello+1986,0.981872,True
3893,rollerball+2002,0.978048,True
453,night+on+earth+1991,0.977427,True
2885,the+exorcism+of+emily+rose+2005,0.976597,True
4380,death+ship+1980,0.970494,False
1573,the+manchurian+candidate+1962,0.814356,False
4385,the+jungle+book+2+2003,0.729844,False


In [89]:
userid = 123
ids, scores = model_imp.recommend(userid, X_imp[userid], N=20, filter_already_liked_items=False)
pd.DataFrame({"movies": movies.loc[ids]['movieid'], "score": scores, "already_liked": np.in1d(ids, X_imp[userid].indices)})

Unnamed: 0,movies,score,already_liked
4325,moonlight+mile+2002,1.150219,False
4873,greenfingers+2001,1.091308,False
2411,baraka+1992,1.054747,False
3429,rocky+ii+1979,1.013041,True
3375,the+object+of+my+affection+1998,1.012211,True
987,the+bridges+of+madison+county+1995,1.007364,True
2981,allegro+non+troppo+1976,1.000979,True
2160,diner+1982,0.99794,True
3840,up+in+the+air+2009,0.997795,True
3867,the+hangover+part+iii+2013,0.996103,True


# Surprise

In [2]:
from surprise import accuracy, Dataset, SVD
from surprise.model_selection import train_test_split

# Load the movielens-100k dataset (download it if needed),
data = Dataset.load_builtin("ml-100k")

# sample random trainset and testset
# test set is made of 25% of the ratings.
trainset, testset = train_test_split(data, test_size=0.25)

# We'll use the famous SVD algorithm.
algo = SVD()

# Train the algorithm on the trainset, and predict ratings for the testset
algo.fit(trainset)
predictions = algo.test(testset)

# Then compute RMSE
accuracy.rmse(predictions)

Dataset ml-100k could not be found. Do you want to download it? [Y/n] Y
Trying to download dataset from https://files.grouplens.org/datasets/movielens/ml-100k.zip...
Done! Dataset ml-100k has been saved to /Users/venny/.surprise_data/ml-100k
RMSE: 0.9390


0.9389823939019692

In [3]:
data

<surprise.dataset.DatasetAutoFolds at 0x7f8348054850>

In [4]:
import os

from surprise import BaselineOnly, Dataset, Reader
from surprise.model_selection import cross_validate

# path to dataset file
file_path = os.path.expanduser("~/Documents/Courses/17-645/Misc/ml-100k/u.data")

# As we're loading a custom dataset, we need to define a reader. In the
# movielens-100k dataset, each line has the following format:
# 'user item rating timestamp', separated by '\t' characters.
reader = Reader(line_format="user item rating timestamp", sep="\t")

data = Dataset.load_from_file(file_path, reader=reader)

# We can now use this dataset as we please, e.g. calling cross_validate
cross_validate(BaselineOnly(), data, verbose=True)

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Evaluating RMSE, MAE of algorithm BaselineOnly on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9423  0.9465  0.9486  0.9462  0.9398  0.9447  0.0032  
MAE (testset)     0.7465  0.7501  0.7528  0.7501  0.7458  0.7491  0.0026  
Fit time          0.05    0.05    0.05    0.05    0.05    0.05    0.00    
Test time         0.14    0.05    0.09    0.05    0.10    0.09    0.03    


{'test_rmse': array([0.94231388, 0.94651465, 0.94855059, 0.94616341, 0.93978089]),
 'test_mae': array([0.74654601, 0.75009708, 0.75277926, 0.75012542, 0.7457989 ]),
 'fit_time': (0.05452394485473633,
  0.05255484580993652,
  0.047972917556762695,
  0.05120205879211426,
  0.04806804656982422),
 'test_time': (0.14230108261108398,
  0.04979133605957031,
  0.09296512603759766,
  0.049386024475097656,
  0.09631490707397461)}

In [5]:
reader

<surprise.reader.Reader at 0x7f83895892e0>