In [261]:
import pandas as pd
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import numpy as np

# pass in column names for each CSV as the column name is not given in the file and read them using pandas.
# You can check the column names from the readme file

#Reading users file:
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('ml-100k/u.user', sep='|', names=u_cols,encoding='latin-1')

#Reading ratings file:
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv('ml-100k/u.data', sep='\t', names=r_cols,encoding='latin-1')

#Reading items file:
i_cols = ['movie id', 'movie title' ,'release date','video release date', 'IMDb URL', 'unknown', 'Action', 'Adventure',
'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
items = pd.read_csv('ml-100k/u.item', sep='|', names=i_cols,
encoding='latin-1')

In [262]:
print (users.head())
print (users.shape)

   user_id  age sex  occupation zip_code
0        1   24   M  technician    85711
1        2   53   F       other    94043
2        3   23   M      writer    32067
3        4   24   M  technician    43537
4        5   33   F       other    15213
(943, 5)


In [263]:
print (ratings.head())
print (ratings.shape)
print (ratings.user_id.nunique())
print (ratings.user_id.value_counts().min())
print (ratings.user_id.value_counts().max())
print (ratings.rating.value_counts())

   user_id  movie_id  rating  unix_timestamp
0      196       242       3       881250949
1      186       302       3       891717742
2       22       377       1       878887116
3      244        51       2       880606923
4      166       346       1       886397596
(100000, 4)
943
20
737
4    34174
3    27145
5    21201
2    11370
1     6110
Name: rating, dtype: int64


In [264]:
print (items.head())
print (items.shape)
print (items.columns)
print (items['movie id'].max())

   movie id        movie title release date  video release date  \
0         1   Toy Story (1995)  01-Jan-1995                 NaN   
1         2   GoldenEye (1995)  01-Jan-1995                 NaN   
2         3  Four Rooms (1995)  01-Jan-1995                 NaN   
3         4  Get Shorty (1995)  01-Jan-1995                 NaN   
4         5     Copycat (1995)  01-Jan-1995                 NaN   

                                            IMDb URL  unknown  Action  \
0  http://us.imdb.com/M/title-exact?Toy%20Story%2...        0       0   
1  http://us.imdb.com/M/title-exact?GoldenEye%20(...        0       1   
2  http://us.imdb.com/M/title-exact?Four%20Rooms%...        0       0   
3  http://us.imdb.com/M/title-exact?Get%20Shorty%...        0       1   
4  http://us.imdb.com/M/title-exact?Copycat%20(1995)        0       0   

   Adventure  Animation  Children's  ...  Fantasy  Film-Noir  Horror  Musical  \
0          0          1           1  ...        0          0       0        0

In [265]:
#First we will content based filtering for recommending top 10 movies to a user.
#Our function will take user_id as input and output 
#For this we will use the simplified cosine similarity where we will create a item vector
#for each movie and a profile vector for user based on their previous likes and dislikes
#We will then take cosine similarity between the profile and the item vector and recommend
#the top 10 movies. Let's get started

#Creating a profile vector for each user
profile_df = pd.merge(ratings, items, left_on='movie_id', right_on='movie id', how='left')
print (profile_df.shape)
print (profile_df.head())


(100000, 28)
   user_id  movie_id  rating  unix_timestamp  movie id  \
0      196       242       3       881250949       242   
1      186       302       3       891717742       302   
2       22       377       1       878887116       377   
3      244        51       2       880606923        51   
4      166       346       1       886397596       346   

                  movie title release date  video release date  \
0                Kolya (1996)  24-Jan-1997                 NaN   
1    L.A. Confidential (1997)  01-Jan-1997                 NaN   
2         Heavyweights (1994)  01-Jan-1994                 NaN   
3  Legends of the Fall (1994)  01-Jan-1994                 NaN   
4         Jackie Brown (1997)  01-Jan-1997                 NaN   

                                            IMDb URL  unknown  ...  Fantasy  \
0    http://us.imdb.com/M/title-exact?Kolya%20(1996)        0  ...        0   
1  http://us.imdb.com/M/title-exact?L%2EA%2E+Conf...        0  ...        0   
2  h

In [266]:
        
def genre_multiply(x):
    return x*x['rating']
        
profile_df_temp = profile_df[['Action', 'Adventure', 'Animation', "Children's",
       'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir',
       'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War','rating']].apply(genre_multiply, axis=1)

print (profile_df_temp.head())


   Action  Adventure  Animation  Children's  Comedy  Crime  Documentary  \
0       0          0          0           0       3      0            0   
1       0          0          0           0       0      3            0   
2       0          0          0           1       1      0            0   
3       0          0          0           0       0      0            0   
4       0          0          0           0       0      1            0   

   Drama  Fantasy  Film-Noir  Horror  Musical  Mystery  Romance  Sci-Fi  \
0      0        0          0       0        0        0        0       0   
1      0        0          3       0        0        3        0       0   
2      0        0          0       0        0        0        0       0   
3      2        0          0       0        0        0        2       0   
4      1        0          0       0        0        0        0       0   

   Thriller  War  rating  
0         0    0       9  
1         3    0       9  
2         0    0 

In [267]:
concat_df = pd.concat([profile_df[['user_id','movie_id']], profile_df_temp], axis=1)

In [268]:
print (concat_df[concat_df['user_id']==196])

       user_id  movie_id  Action  Adventure  Animation  Children's  Comedy  \
0          196       242       0          0          0           0       3   
940        196       393       0          0          0           0       4   
1133       196       381       0          0          0           0       4   
1812       196       251       0          0          0           0       3   
1896       196       655       0          5          0           0       5   
2374       196        67       0          0          0           0       5   
6910       196       306       0          0          0           0       0   
7517       196       238       0          0          0           0       4   
7842       196       663       0          0          0           0       5   
10017      196       111       0          0          0           0       4   
10254      196       580       0          0          0           0       2   
10981      196        25       0          0          0          

In [269]:
#Now creating the profile vector by taking average of the ratings by user_id
user_profile =concat_df[['Action', 'Adventure', 'Animation', "Children's",
       'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir',
       'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War','user_id']].groupby(by='user_id',as_index=False).mean()

In [270]:
from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity


a = np.random.random((4, 10))
b = np.random.random((3, 10))



sim_sparse = cosine_similarity(a, b, dense_output=False)
print(sim_sparse)

[[0.88143097 0.75613938 0.80502353]
 [0.69735798 0.7424253  0.65170814]
 [0.81410923 0.73979398 0.71295149]
 [0.86654709 0.72334754 0.78980281]]


In [271]:
#Now creating user profile

user_vector=user_profile[['Action', 'Adventure', 'Animation', "Children's",
       'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir',
       'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War']]

item_vector=items[['Action', 'Adventure', 'Animation', "Children's",
       'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir',
       'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War']]

In [272]:
sim_sparse = cosine_similarity(user_vector, item_vector, dense_output=False)
print(sim_sparse)

[[0.34478094 0.47061341 0.27316176 ... 0.60925872 0.45914424 0.61025501]
 [0.25052874 0.30584028 0.24232371 ... 0.79697089 0.34376155 0.75514832]
 [0.15766132 0.53909999 0.46687429 ... 0.50453826 0.27307741 0.56377272]
 ...
 [0.40201513 0.71898859 0.37493649 ... 0.24618298 0.38832708 0.28120237]
 [0.4047511  0.50532561 0.33990277 ... 0.63692102 0.38239062 0.58208349]
 [0.26659034 0.63954752 0.31249613 ... 0.52768508 0.38712207 0.50139304]]


In [273]:
top_10_user= sim_sparse.argsort()[:,::-1][:,:10]

In [274]:
print (top_10_user[0].tolist())

[73, 3, 1137, 336, 1114, 777, 935, 516, 774, 511]


In [275]:
print (items.loc[top_10_user[160].tolist()]['movie title'])

691     American President, The (1995)
774     Something to Talk About (1995)
1114              Twelfth Night (1996)
169             Cinema Paradiso (1988)
935                 Brassed Off (1996)
511             Wings of Desire (1987)
516                   Manhattan (1979)
730            Corrina, Corrina (1994)
777            Don Juan DeMarco (1995)
1099       What Happened Was... (1994)
Name: movie title, dtype: object


In [276]:
print (concat_df[concat_df['user_id']==4])

       user_id  movie_id  Action  Adventure  Animation  Children's  Comedy  \
1250         4       264       0          0          0           0       0   
1329         4       303       0          0          0           0       0   
2204         4       361       0          0          0           0       0   
2526         4       357       0          0          0           0       0   
3277         4       260       4          0          0           0       0   
5960         4       356       0          0          0           0       0   
12151        4       294       0          0          0           0       5   
13893        4       288       0          0          0           0       0   
16305        4        50       5          5          0           0       0   
18930        4       354       0          0          0           0       5   
20082        4       271       4          4          0           0       0   
20383        4       300       5          0          0          

In [277]:
#Before that trying to find similar movies based on users liking it.
#Here the idea will be that we will find cosine similarity between the movies with
#each movie as a row and the columns as the user ratings
#Let's see how this experiment work
ratings.reset_index(inplace=True)
print (ratings.head())
u_user_id = ratings.user_id.unique()
u_movie_id = ratings.movie_id.unique()

ratings.set_index(["user_id", "movie_id"], inplace=True)
ratings = ratings.reindex(
    pd.MultiIndex.from_product(
        (u_user_id, u_movie_id),
        names=["user_id", "movie_id"]
    )
)
ratings.reset_index(inplace=True)
print (ratings.head())
print (ratings.shape)
#promo_2017_train = df_2017.set_index(
#    ["store_nbr", "item_nbr", "date"])[["onpromotion"]].unstack(
#        level=-1).fillna(False)

   index  user_id  movie_id  rating  unix_timestamp
0      0      196       242       3       881250949
1      1      186       302       3       891717742
2      2       22       377       1       878887116
3      3      244        51       2       880606923
4      4      166       346       1       886397596
   user_id  movie_id  index  rating  unix_timestamp
0      196       242    0.0     3.0     881250949.0
1      196       302    NaN     NaN             NaN
2      196       377    NaN     NaN             NaN
3      196        51    NaN     NaN             NaN
4      196       346    NaN     NaN             NaN
(1586126, 5)


In [278]:
ratings.loc[:, "rating"].fillna(0, inplace=True)

In [279]:
print (ratings.head())

   user_id  movie_id  index  rating  unix_timestamp
0      196       242    0.0     3.0     881250949.0
1      196       302    NaN     0.0             NaN
2      196       377    NaN     0.0             NaN
3      196        51    NaN     0.0             NaN
4      196       346    NaN     0.0             NaN


In [280]:
pivot = pd.pivot_table(ratings, values='rating', index=['movie_id'],
                  columns=['user_id'])

In [281]:
print (pivot.shape)

(1682, 943)


In [282]:
print (pivot.head())

user_id   1    2    3    4    5    6    7    8    9    10   ...  934  935  \
movie_id                                                    ...             
1         5.0  4.0  0.0  0.0  4.0  4.0  0.0  0.0  0.0  4.0  ...  2.0  3.0   
2         3.0  0.0  0.0  0.0  3.0  0.0  0.0  0.0  0.0  0.0  ...  4.0  0.0   
3         4.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0   
4         3.0  0.0  0.0  0.0  0.0  0.0  5.0  0.0  0.0  4.0  ...  5.0  0.0   
5         3.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0   

user_id   936  937  938  939  940  941  942  943  
movie_id                                          
1         4.0  0.0  4.0  0.0  0.0  5.0  0.0  0.0  
2         0.0  0.0  0.0  0.0  0.0  0.0  0.0  5.0  
3         4.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  
4         0.0  0.0  0.0  0.0  2.0  0.0  0.0  0.0  
5         0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  

[5 rows x 943 columns]


In [283]:
top_10_similarities= cosine_similarity(pivot).argsort()[:,::-1][:,:10]

In [284]:
print (top_10_similarities)

[[   0   49  180 ...  221   99  236]
 [   1  232  160 ...  225  230  549]
 [   2  409  762 ... 1046  824  474]
 ...
 [1679 1678 1677 ... 1293  360 1242]
 [1680 1671 1350 ... 1213 1521  797]
 [1681  766 1334 ... 1112 1427  959]]


In [285]:
print (items.loc[top_10_similarities[380].tolist()]['movie title'])

380                              Muriel's Wedding (1994)
69                    Four Weddings and a Funeral (1994)
85                        Remains of the Day, The (1993)
215                       When Harry Met Sally... (1989)
51                    Madness of King George, The (1994)
82                         Much Ado About Nothing (1993)
237                               Raising Arizona (1987)
450                                        Grease (1978)
381    Adventures of Priscilla, Queen of the Desert, ...
654                                   Stand by Me (1986)
Name: movie title, dtype: object


In [286]:
#Now we will implement collaborative filtering using matrix factorization
#First we need to create the matrix where users are in rows and each movie in column
#so in this case we should have 943,1682 dimensions

matrix_fac_df = pd.pivot_table(ratings, values='rating', index=['user_id'],
                              columns=['movie_id'])

print (matrix_fac_df.head())

movie_id  1     2     3     4     5     6     7     8     9     10    ...  \
user_id                                                               ...   
1          5.0   3.0   4.0   3.0   3.0   5.0   4.0   1.0   5.0   3.0  ...   
2          4.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   2.0  ...   
3          0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
4          0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
5          4.0   3.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   

movie_id  1673  1674  1675  1676  1677  1678  1679  1680  1681  1682  
user_id                                                               
1          0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  
2          0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  
3          0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  
4          0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  
5          0.0   0.0   0.0   0.0  

In [287]:
users_items_pivot_matrix = matrix_fac_df.as_matrix()

  """Entry point for launching an IPython kernel.


In [288]:
users_items_pivot_matrix.shape

(943, 1682)

In [289]:
from scipy.sparse.linalg import svds
#The number of factors to factor the user-item matrix.
NUMBER_OF_FACTORS_MF = 15
#Performs matrix factorization of the original user item matrix
U, sigma, Vt = svds(users_items_pivot_matrix, k = NUMBER_OF_FACTORS_MF)

In [290]:
U.shape

(943, 15)

In [291]:
Vt.shape

(15, 1682)

In [292]:
sigma = np.diag(sigma)
sigma.shape

(15, 15)

In [293]:
all_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt) 
all_user_predicted_ratings

array([[ 4.43878479e+00,  2.11917532e+00,  1.31158919e+00, ...,
        -7.08613101e-03,  1.83308395e-02,  5.86533316e-02],
       [ 1.88027236e+00, -9.29994696e-02,  8.46207206e-02, ...,
         1.10191650e-02, -9.33579482e-03, -2.46128025e-02],
       [-7.51577750e-02,  6.83639310e-03,  1.29765194e-01, ...,
         1.93662429e-02, -5.49764246e-04, -5.15644896e-03],
       ...,
       [ 2.03089125e+00, -3.34299627e-03,  3.81843117e-01, ...,
        -2.22309410e-03, -1.59373330e-05,  2.63556532e-03],
       [ 1.25292328e+00,  1.52255645e-01, -4.33783399e-01, ...,
         1.55998214e-02,  4.81491255e-03, -2.81527532e-02],
       [ 1.74525355e+00,  2.09840483e+00,  1.09220298e+00, ...,
        -1.32441482e-02,  2.46114779e-02,  1.81903557e-02]])

In [294]:
#Converting the reconstructed matrix back to a Pandas dataframe

cf_preds_df = pd.DataFrame(all_user_predicted_ratings, columns = np.sort(u_movie_id), index=np.sort(u_user_id)).transpose()
cf_preds_df.head(10)


Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
1,4.438785,1.880272,-0.075158,0.582163,3.433327,3.240909,2.296333,1.050592,0.720506,1.628652,...,2.137697,2.33387,4.33399,1.786325,4.824664,2.547746,1.625894,2.030891,1.252923,1.745254
2,2.119175,-0.092999,0.006836,-0.120593,1.188078,0.072258,1.792962,1.598009,-0.048787,0.161714,...,0.94377,-0.081196,-0.018865,-0.029511,-0.233627,-0.02377,0.204595,-0.003343,0.152256,2.098405
3,1.311589,0.084621,0.129765,0.054378,0.471378,0.236292,0.753333,-0.071837,0.056645,0.009896,...,-0.076791,0.241885,1.229879,-0.067284,0.862746,0.241172,-0.144215,0.381843,-0.433783,1.092203
4,3.630156,0.280862,-0.053725,-0.16329,1.567449,1.674458,2.050389,1.12634,-0.092533,2.155349,...,2.211937,-0.006075,0.153428,0.064105,-0.422291,0.102573,1.49876,-0.221058,0.05344,2.703602
5,0.471221,-0.075563,0.000853,0.091547,0.359651,-0.693095,1.658826,0.16623,-0.005007,0.993618,...,-0.28788,0.150874,-0.04827,-0.027831,0.28409,0.269265,0.406425,-0.143911,-0.208841,1.370913
6,0.527748,0.385659,-0.009953,-0.047049,-0.064461,0.601268,0.357452,0.065826,0.091491,0.577239,...,0.150117,-0.005046,0.61331,0.385466,0.012373,0.056625,0.079245,0.044575,-0.092638,0.034962
7,5.088592,1.357602,0.123264,0.58129,2.609303,1.963651,3.712571,1.742954,0.609932,2.508345,...,0.709139,1.462394,4.056174,1.302575,3.845677,1.640721,1.548859,1.670637,-0.134214,2.264234
8,2.870084,0.28629,0.165041,-0.052797,1.05087,3.541965,2.065801,0.166024,0.175047,2.990021,...,1.755035,-0.109255,0.722314,0.664989,-0.231734,0.262023,1.422298,0.174952,1.499368,0.840201
9,3.303949,2.219235,-0.019711,0.151728,-0.754088,3.506893,3.826287,0.360503,0.641718,3.836299,...,0.174824,1.57993,4.092786,2.198797,2.577942,1.827623,0.671305,0.66427,-0.16416,2.130653
10,1.856559,0.555074,-0.017263,-0.058342,0.323414,1.286637,2.256572,0.408801,0.198937,1.032273,...,0.642325,0.06349,1.364872,0.838015,0.466907,0.292742,0.06763,0.303469,-0.131647,-0.425562


In [295]:
#Now let's use this to predict the top rated movie for a user
cf_preds_df.T.shape

(943, 1682)

In [296]:
top_10_user_mat_fac= all_user_predicted_ratings.T.argsort()[:,::-1][:,:10]

In [297]:
top_10_user_mat_fac.shape

(1682, 10)

In [298]:
print (top_10_user_mat_fac[1].tolist())

[392, 278, 12, 471, 129, 275, 726, 879, 496, 86]


In [299]:
print (top_10_user[1].tolist())

[516, 1114, 777, 730, 774, 691, 169, 935, 511, 1099]


In [300]:
#Finding movies similar
Vt.T.shape

(1682, 15)

In [301]:
top_10_similarities_mat_fac= cosine_similarity(Vt.T).argsort()[:,::-1][:,:10]

In [302]:
print (top_10_similarities_mat_fac.shape)

(1682, 10)


In [305]:
print (items.loc[top_10_similarities_mat_fac[10].tolist()]['movie title'])

10            Seven (Se7en) (1995)
11      Usual Suspects, The (1995)
54        Professional, The (1994)
155          Reservoir Dogs (1992)
91             True Romance (1993)
692                  Casino (1995)
788    Swimming with Sharks (1995)
938     Murder in the First (1995)
468              Short Cuts (1993)
30             Crimson Tide (1995)
Name: movie title, dtype: object


In [306]:
print (items.loc[top_10_similarities[10].tolist()]['movie title'])

10                  Seven (Se7en) (1995)
55                   Pulp Fiction (1994)
21                     Braveheart (1995)
11            Usual Suspects, The (1995)
78                  Fugitive, The (1993)
194               Terminator, The (1984)
97      Silence of the Lambs, The (1991)
95     Terminator 2: Judgment Day (1991)
173       Raiders of the Lost Ark (1981)
63      Shawshank Redemption, The (1994)
Name: movie title, dtype: object
