In [36]:
import pandas as pd
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import numpy as np

# pass in column names for each CSV as the column name is not given in the file and read them using pandas.
# You can check the column names from the readme file

#Reading users file:
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('ml-100k/u.user', sep='|', names=u_cols,encoding='latin-1')

#Reading ratings file:
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv('ml-100k/u.data', sep='\t', names=r_cols,encoding='latin-1')

#Reading items file:
i_cols = ['movie id', 'movie title' ,'release date','video release date', 'IMDb URL', 'unknown', 'Action', 'Adventure',
'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
items = pd.read_csv('ml-100k/u.item', sep='|', names=i_cols,
encoding='latin-1')

In [37]:
print (users.head())
print (users.shape)

   user_id  age sex  occupation zip_code
0        1   24   M  technician    85711
1        2   53   F       other    94043
2        3   23   M      writer    32067
3        4   24   M  technician    43537
4        5   33   F       other    15213
(943, 5)


In [38]:
print (ratings.head())
print (ratings.shape)
print (ratings.user_id.nunique())
print (ratings.user_id.value_counts().min())
print (ratings.user_id.value_counts().max())

   user_id  movie_id  rating  unix_timestamp
0      196       242       3       881250949
1      186       302       3       891717742
2       22       377       1       878887116
3      244        51       2       880606923
4      166       346       1       886397596
(100000, 4)
943
20
737


In [39]:
print (items.head())
print (items.shape)
print (items.columns)

   movie id        movie title release date  video release date  \
0         1   Toy Story (1995)  01-Jan-1995                 NaN   
1         2   GoldenEye (1995)  01-Jan-1995                 NaN   
2         3  Four Rooms (1995)  01-Jan-1995                 NaN   
3         4  Get Shorty (1995)  01-Jan-1995                 NaN   
4         5     Copycat (1995)  01-Jan-1995                 NaN   

                                            IMDb URL  unknown  Action  \
0  http://us.imdb.com/M/title-exact?Toy%20Story%2...        0       0   
1  http://us.imdb.com/M/title-exact?GoldenEye%20(...        0       1   
2  http://us.imdb.com/M/title-exact?Four%20Rooms%...        0       0   
3  http://us.imdb.com/M/title-exact?Get%20Shorty%...        0       1   
4  http://us.imdb.com/M/title-exact?Copycat%20(1995)        0       0   

   Adventure  Animation  Children's  ...  Fantasy  Film-Noir  Horror  Musical  \
0          0          1           1  ...        0          0       0        0

In [40]:
#First we will content based filtering for recommending top 10 movies to a user.
#Our function will take user_id as input and output 
#For this we will use the simplified cosine similarity where we will create a item vector
#for each movie and a profile vector for user based on their previous likes and dislikes
#We will then take cosine similarity between the profile and the item vector and recommend
#the top 10 movies. Let's get started

#Creating a profile vector for each user
profile_df = pd.merge(ratings, items, left_on='movie_id', right_on='movie id', how='left')
print (profile_df.shape)
print (profile_df.head())

(100000, 28)
   user_id  movie_id  rating  unix_timestamp  movie id  \
0      196       242       3       881250949       242   
1      186       302       3       891717742       302   
2       22       377       1       878887116       377   
3      244        51       2       880606923        51   
4      166       346       1       886397596       346   

                  movie title release date  video release date  \
0                Kolya (1996)  24-Jan-1997                 NaN   
1    L.A. Confidential (1997)  01-Jan-1997                 NaN   
2         Heavyweights (1994)  01-Jan-1994                 NaN   
3  Legends of the Fall (1994)  01-Jan-1994                 NaN   
4         Jackie Brown (1997)  01-Jan-1997                 NaN   

                                            IMDb URL  unknown  ...  Fantasy  \
0    http://us.imdb.com/M/title-exact?Kolya%20(1996)        0  ...        0   
1  http://us.imdb.com/M/title-exact?L%2EA%2E+Conf...        0  ...        0   
2  h

In [41]:
        
def genre_multiply(x):
    return x*x['rating']
        
profile_df_temp = profile_df[['Action', 'Adventure', 'Animation', "Children's",
       'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir',
       'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War','rating']].apply(genre_multiply, axis=1)

print (profile_df_temp.head())


   Action  Adventure  Animation  Children's  Comedy  Crime  Documentary  \
0       0          0          0           0       3      0            0   
1       0          0          0           0       0      3            0   
2       0          0          0           1       1      0            0   
3       0          0          0           0       0      0            0   
4       0          0          0           0       0      1            0   

   Drama  Fantasy  Film-Noir  Horror  Musical  Mystery  Romance  Sci-Fi  \
0      0        0          0       0        0        0        0       0   
1      0        0          3       0        0        3        0       0   
2      0        0          0       0        0        0        0       0   
3      2        0          0       0        0        0        2       0   
4      1        0          0       0        0        0        0       0   

   Thriller  War  rating  
0         0    0       9  
1         3    0       9  
2         0    0 

In [44]:
print (pd.concat([profile_df, profile_df_temp], axis=1).columns)

Index(['user_id', 'movie_id', 'rating', 'unix_timestamp', 'movie id',
       'movie title', 'release date', 'video release date', 'IMDb URL',
       'unknown', 'Action', 'Adventure', 'Animation', 'Children's', 'Comedy',
       'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
       'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
       'Action', 'Adventure', 'Animation', 'Children's', 'Comedy', 'Crime',
       'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical',
       'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'rating'],
      dtype='object')
