In [1]:
import numpy as np
import pandas as pd
import os
from IPython.core.display import display, HTML, clear_output
# Inject a CSS rule that widens the notebook container to 80% of the page.
display(HTML("<style>.container { width:80% !important; }</style>"))
from scipy.sparse import csc_matrix
from sparsesvd import sparsesvd

# Import ratings data (including user data)

In [3]:
# Build the path to the combined ratings file relative to the working
# directory, then load it into a DataFrame.
cwd = os.getcwd()
ratings_csv = os.path.join(cwd, "..", "data", "ratings_combined_adam.csv")
ratings = pd.read_csv(ratings_csv)

In [4]:
# Preview the first rows of the freshly loaded ratings table.
ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,2,3.5
1,1,29,3.5
2,1,32,3.5
3,1,47,3.5
4,1,50,3.5


# Create Compressed Sparse Column matrix

In [7]:
# Dimensions of the ratings matrix: N1 distinct users, N2 distinct movies.
N1, N2 = (ratings[column].nunique() for column in ('userId', 'movieId'))

In [8]:
# Distinct raw ids, in order of first appearance in the ratings table.
user_column, movie_column = ratings['userId'], ratings['movieId']
uids_raw = user_column.unique()
iids_raw = movie_column.unique()

In [9]:
# Contiguous 0-based "inner" ids, used as row/column indices of the matrix.
uids_inner, iids_inner = np.arange(N1), np.arange(N2)

In [10]:
# Lookup tables between inner (0-based) ids and the raw ids found in the data,
# one pair for users and one pair for movies.
uid_maptoraw = {inner: raw for inner, raw in zip(uids_inner, uids_raw)}
uid_maptoinner = {raw: inner for raw, inner in zip(uids_raw, uids_inner)}
iid_maptoraw = {inner: raw for inner, raw in zip(iids_inner, iids_raw)}
iid_maptoinner = {raw: inner for raw, inner in zip(iids_raw, iids_inner)}

In [11]:
# Mark the id columns as "raw" so they are not confused with the inner ids
# added to the frame afterwards.
raw_id_columns = {'userId': 'uid_raw', 'movieId': 'iid_raw'}
ratings.rename(columns=raw_id_columns, inplace=True)

In [12]:
# Check that the id columns now carry the *_raw names.
ratings.head()

Unnamed: 0,uid_raw,iid_raw,rating
0,1,2,3.5
1,1,29,3.5
2,1,32,3.5
3,1,47,3.5
4,1,50,3.5


In [13]:
# Translate raw user ids to inner ids with a vectorised dictionary lookup.
# (A row-wise DataFrame.apply calls a Python lambda once per rating, which is
# needlessly slow for a plain key lookup.)
ratings['uid_inner'] = ratings['uid_raw'].map(uid_maptoinner)
# Series.map yields NaN for unknown keys instead of raising, so fail loudly here.
assert ratings['uid_inner'].notnull().all(), 'raw user id missing from uid_maptoinner'

In [14]:
# Translate raw movie ids to inner ids with a vectorised dictionary lookup.
# (A row-wise DataFrame.apply calls a Python lambda once per rating, which is
# needlessly slow for a plain key lookup.)
ratings['iid_inner'] = ratings['iid_raw'].map(iid_maptoinner)
# Series.map yields NaN for unknown keys instead of raising, so fail loudly here.
assert ratings['iid_inner'].notnull().all(), 'raw movie id missing from iid_maptoinner'

In [15]:
# Sparse ratings matrix: rows are users and columns are movies (both addressed
# by inner id); the stored values are the ratings.
row_idx = ratings['uid_inner']
col_idx = ratings['iid_inner']
compressed_matrix = csc_matrix(
    (ratings['rating'], (row_idx, col_idx)),
    shape=(N1, N2),
)

# Perform SVD Matrix Factorization

In [40]:
# Factorise the sparse ratings matrix, requesting 50 factors.
n_factors = 50
ut, s, vt = sparsesvd(compressed_matrix, n_factors)

In [41]:
# Shapes of the three arrays returned by sparsesvd.
ut.shape, s.shape, vt.shape

((50L, 138494L), (50L,), (50L, 26744L))

# Make user predictions

In [102]:
# Pick the user with the largest raw id as the prediction target.
user_uid_raw = ratings['uid_raw'].max()
print('user raw user id is: {}'.format(user_uid_raw))

# Look the inner id up through `user_uid_raw` (assigned just above) so that the
# cell runs on a fresh kernel; `my_uid_raw` is not defined by any cell here.
user_uid_inner = uid_maptoinner[user_uid_raw]
print('user inner user id is: {}'.format(user_uid_inner))

user raw user id is: 138494
user inner user id is: 138493


### Dot product

In [103]:
# Predicted ratings for the target user: scale the user's factor vector by the
# singular values, then project onto the item factors.
# Element-wise scaling is equivalent to multiplying by diag(s) and leaves `s`
# unchanged, so this cell can be re-run safely. Rebinding `s = np.diag(s)`
# would flip `s` between a vector and a matrix on every execution.
user_factors = ut[:, user_uid_inner] * s
preds = user_factors.dot(vt)
preds.shape

(26744L,)

In [104]:
# Put the predictions in a DataFrame; the positional index is the inner movie
# id, so expose it as a column.
preds = (
    pd.DataFrame(preds, columns=['predicted_rating'])
    .reset_index()
    .rename(columns={'index': 'iid_inner'})
)
# Vectorised lookup of the raw movie id (replaces a per-row apply).
preds['iid_raw'] = preds['iid_inner'].map(iid_maptoraw)

In [105]:
# Summary statistics of the predicted ratings.
preds['predicted_rating'].describe()

count    26744.000000
mean         0.098547
std          0.429143
min         -2.021876
25%         -0.000548
50%          0.003115
75%          0.022188
max          7.512083
Name: predicted_rating, dtype: float64

# Join movie titles and genres

In [106]:
# Build the path to the movie metadata file relative to the working directory,
# then load it.
cwd = os.getcwd()
movies_csv = os.path.join(cwd, "..", "data", "movies.csv")
movies = pd.read_csv(movies_csv)

In [107]:
# Attach title and genres to every prediction (default inner join on the raw
# movie id), then rank by predicted rating, best first.
movie_info = movies[['movieId', 'title', 'genres']]
preds = pd.merge(
    preds,
    movie_info,
    left_on='iid_raw',
    right_on='movieId',
).sort_values('predicted_rating', ascending=False)

In [108]:
# Show the 20 highest predicted ratings.
preds.head(20)

Unnamed: 0,iid_inner,predicted_rating,iid_raw,movieId,title,genres
2087,2087,7.512083,79132,79132,Inception (2010),Action|Crime|Drama|Mystery|Sci-Fi|Thriller|IMAX
574,574,6.502743,3578,3578,Gladiator (2000),Action|Adventure|Drama
1822,1822,6.340306,72998,72998,Avatar (2009),Action|Adventure|Sci-Fi|IMAX
1854,1854,6.117074,89745,89745,"Avengers, The (2012)",Action|Adventure|Sci-Fi|IMAX
1001,1001,5.99244,59315,59315,Iron Man (2008),Action|Adventure|Sci-Fi
1856,1856,5.990524,91529,91529,"Dark Knight Rises, The (2012)",Action|Adventure|Crime|IMAX
331,331,5.923412,2571,2571,"Matrix, The (1999)",Action|Sci-Fi|Thriller
998,998,5.592862,58559,58559,"Dark Knight, The (2008)",Action|Crime|Drama|IMAX
1638,1638,5.531325,5418,5418,"Bourne Identity, The (2002)",Action|Mystery|Thriller
1823,1823,5.369418,73017,73017,Sherlock Holmes (2009),Action|Crime|Mystery|Thriller


# Filter out movies that the user already rated

In [109]:
# Load the user's profile; the file's first column is used as the index.
profile_csv = 'adam_profile.csv'
user_profile = pd.read_csv(profile_csv, index_col=0)

In [110]:
# Left join keeps every prediction; columns coming from the profile are NaN
# for movies that do not appear in it.
preds = pd.merge(
    left=preds,
    right=user_profile,
    how='left',
    on='movieId',
)

In [111]:
# True where the joined `rating` column is missing.
joined_rating = preds['rating']
havent_seen_it_mask = joined_rating.isnull()

In [114]:
# Keep only the masked rows and the report columns, then write the first 2000
# of them (in the frame's current order) to CSV without the index.
recommendation_columns = ['title', 'genres', 'predicted_rating']
unseen_preds = preds.loc[havent_seen_it_mask, recommendation_columns]
unseen_preds.iloc[:2000].to_csv('movie_recommendations_adam_SVD.csv', index=False)