In [1]:
import pandas as pd
from math import sqrt
import numpy as np
import matplotlib.pyplot as plt

In [2]:
#Storing the movie information into a pandas dataframe
movies_df = pd.read_csv('movies.csv')
#Storing the user information into a pandas dataframe
ratings_df = pd.read_csv('ratings.csv')
#Head is a function that gets the first N rows of a dataframe. N's default is 5.
print(movies_df.head())
print(ratings_df.head())

   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  
   userId  movieId  rating   timestamp
0       1      296     5.0  1147880044
1       1      306     3.5  1147868817
2       1      307     5.0  1147868828
3       1      665     5.0  1147878820
4       1      899     3.5  1147868510


In [3]:
#Using regular expressions to find a year stored between parentheses
#We specify the parantheses so we don't conflict with movies that have years in their titles
movies_df['year'] = movies_df.title.str.extract('(\(\d\d\d\d\))',expand=False)
#Removing the parentheses
movies_df['year'] = movies_df.year.str.extract('(\d\d\d\d)',expand=False)
#Removing the years from the 'title' column
movies_df['title'] = movies_df.title.str.replace('(\(\d\d\d\d\))', '')
#Applying the strip function to get rid of any ending whitespace characters that may have appeared
movies_df['title'] = movies_df['title'].apply(lambda x: x.strip())
print(movies_df.head())

   movieId                        title  \
0        1                    Toy Story   
1        2                      Jumanji   
2        3             Grumpier Old Men   
3        4            Waiting to Exhale   
4        5  Father of the Bride Part II   

                                        genres  year  
0  Adventure|Animation|Children|Comedy|Fantasy  1995  
1                   Adventure|Children|Fantasy  1995  
2                               Comedy|Romance  1995  
3                         Comedy|Drama|Romance  1995  
4                                       Comedy  1995  


In [4]:
#Copying the movie dataframe into a new one since we won't need to use the genre information in our first case.
moviesWithGenres_df = movies_df.copy()

#For every row in the dataframe, iterate through the list of genres and place a 1 into the corresponding column
for index, row in movies_df.iterrows():
    for genre in row['genres']:
        moviesWithGenres_df.at[index, genre] = 1
#Filling in the NaN values with 0 to show that a movie doesn't have that column's genre
moviesWithGenres_df = moviesWithGenres_df.fillna(0)
print(moviesWithGenres_df.head())

   movieId                        title  \
0        1                    Toy Story   
1        2                      Jumanji   
2        3             Grumpier Old Men   
3        4            Waiting to Exhale   
4        5  Father of the Bride Part II   

                                        genres  year    A    d    v    e    n  \
0  Adventure|Animation|Children|Comedy|Fantasy  1995  1.0  1.0  1.0  1.0  1.0   
1                   Adventure|Children|Fantasy  1995  1.0  1.0  1.0  1.0  1.0   
2                               Comedy|Romance  1995  0.0  1.0  0.0  1.0  1.0   
3                         Comedy|Drama|Romance  1995  0.0  1.0  0.0  1.0  1.0   
4                                       Comedy  1995  0.0  1.0  0.0  1.0  0.0   

     t  ...    S    -    I    X    W    N    (         g    )  
0  1.0  ...  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  
1  1.0  ...  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  
2  0.0  ...  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0 

In [5]:
print(ratings_df.head())

   userId  movieId  rating   timestamp
0       1      296     5.0  1147880044
1       1      306     3.5  1147868817
2       1      307     5.0  1147868828
3       1      665     5.0  1147878820
4       1      899     3.5  1147868510


In [6]:
userInput = [
            {'title':'Dumb & Dumber', 'rating':5.0},
            {'title':'Dirty Rotten Scoundrels, The', 'rating':5.0},
            {'title':'Godfather, The', 'rating':4.8},
            {'title':"12 Angry Men", 'rating':4.7},
            {'title':'Pulp Fiction', 'rating':4.1}
         ]
inputMovies = pd.DataFrame(userInput)
print(inputMovies)

   rating                         title
0     5.0                 Dumb & Dumber
1     5.0  Dirty Rotten Scoundrels, The
2     4.8                Godfather, The
3     4.7                  12 Angry Men
4     4.1                  Pulp Fiction


In [7]:
#Filtering out the movies by title
inputId = movies_df[movies_df['title'].isin(inputMovies['title'].tolist())]
#Then merging it so we can get the movieId. It's implicitly merging it by title.
inputMovies = pd.merge(inputId, inputMovies)
#Dropping information we won't use from the input dataframe
inputMovies = inputMovies.drop('genres', 1).drop('year', 1)
#Final input dataframe
#If a movie you added in above isn't here, then it might not be in the original
#dataframe or it might spelled differently, please check capitalisation.
print(inputMovies)

   movieId           title  rating
0      296    Pulp Fiction     4.1
1      858  Godfather, The     4.8
2     1203    12 Angry Men     4.7
3    77846    12 Angry Men     4.7


In [8]:
#Filtering out the movies from the input
userMovies = moviesWithGenres_df[moviesWithGenres_df['movieId'].isin(inputMovies['movieId'].tolist())]
print(userMovies)

       movieId           title                       genres  year    A    d  \
292        296    Pulp Fiction  Comedy|Crime|Drama|Thriller  1994  0.0  1.0   
840        858  Godfather, The                  Crime|Drama  1972  0.0  0.0   
1173      1203    12 Angry Men                        Drama  1957  0.0  0.0   
14675    77846    12 Angry Men                  Crime|Drama  1997  0.0  0.0   

         v    e    n    t  ...    S    -    I    X    W    N    (         g  \
292    0.0  1.0  0.0  0.0  ...  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0   
840    0.0  1.0  0.0  0.0  ...  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0   
1173   0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0   
14675  0.0  1.0  0.0  0.0  ...  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0   

         )  
292    0.0  
840    0.0  
1173   0.0  
14675  0.0  

[4 rows x 39 columns]


In [9]:
#Resetting the index to avoid future issues
userMovies = userMovies.reset_index(drop=True)
#Dropping unnecessary issues due to save memory and to avoid issues
userGenreTable = userMovies.drop('movieId', 1).drop('title', 1).drop('genres', 1).drop('year', 1)
print(userGenreTable)

     A    d    v    e    n    t    u    r    |    i  ...    S    -    I    X  \
0  0.0  1.0  0.0  1.0  0.0  0.0  0.0  1.0  1.0  1.0  ...  0.0  0.0  0.0  0.0   
1  0.0  0.0  0.0  1.0  0.0  0.0  0.0  1.0  1.0  1.0  ...  0.0  0.0  0.0  0.0   
2  0.0  0.0  0.0  0.0  0.0  0.0  0.0  1.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
3  0.0  0.0  0.0  1.0  0.0  0.0  0.0  1.0  1.0  1.0  ...  0.0  0.0  0.0  0.0   

     W    N    (         g    )  
0  0.0  0.0  0.0  0.0  0.0  0.0  
1  0.0  0.0  0.0  0.0  0.0  0.0  
2  0.0  0.0  0.0  0.0  0.0  0.0  
3  0.0  0.0  0.0  0.0  0.0  0.0  

[4 rows x 35 columns]


In [10]:
print(inputMovies['rating'])

0    4.1
1    4.8
2    4.7
3    4.7
Name: rating, dtype: float64


In [11]:
#Dot produt to get weights
userProfile = userGenreTable.transpose().dot(inputMovies['rating'])
#The user profile
print(userProfile)

A     0.0
d     4.1
v     0.0
e    13.6
n     0.0
t     0.0
u     0.0
r    18.3
|    13.6
i    13.6
m    18.3
a    18.3
o     4.1
C    13.6
h     4.1
l     4.1
y     4.1
F     0.0
s     0.0
R     0.0
c     0.0
D    18.3
T     4.1
H     0.0
M     0.0
S     0.0
-     0.0
I     0.0
X     0.0
W     0.0
N     0.0
(     0.0
      0.0
g     0.0
)     0.0
dtype: float64


In [12]:
#Now let's get the genres of every movie in our original dataframe
genreTable = moviesWithGenres_df.set_index(moviesWithGenres_df['movieId'])
#And drop the unnecessary information
genreTable = genreTable.drop('movieId', 1).drop('title', 1).drop('genres', 1).drop('year', 1)
print(genreTable.head())

           A    d    v    e    n    t    u    r    |    i  ...    S    -    I  \
movieId                                                    ...                  
1        1.0  1.0  1.0  1.0  1.0  1.0  1.0  1.0  1.0  1.0  ...  0.0  0.0  0.0   
2        1.0  1.0  1.0  1.0  1.0  1.0  1.0  1.0  1.0  1.0  ...  0.0  0.0  0.0   
3        0.0  1.0  0.0  1.0  1.0  0.0  0.0  0.0  1.0  0.0  ...  0.0  0.0  0.0   
4        0.0  1.0  0.0  1.0  1.0  0.0  0.0  1.0  1.0  0.0  ...  0.0  0.0  0.0   
5        0.0  1.0  0.0  1.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   

           X    W    N    (         g    )  
movieId                                     
1        0.0  0.0  0.0  0.0  0.0  0.0  0.0  
2        0.0  0.0  0.0  0.0  0.0  0.0  0.0  
3        0.0  0.0  0.0  0.0  0.0  0.0  0.0  
4        0.0  0.0  0.0  0.0  0.0  0.0  0.0  
5        0.0  0.0  0.0  0.0  0.0  0.0  0.0  

[5 rows x 35 columns]


In [13]:
#Multiply the genres by the weights and then take the weighted average
recommendationTable_df = ((genreTable*userProfile).sum(axis=1))/(userProfile.sum())
print(recommendationTable_df.head())

movieId
1    0.852825
2    0.705650
3    0.589356
4    0.829829
5    0.379763
dtype: float64


In [14]:
#Sort our recommendations in descending order
recommendationTable_df = recommendationTable_df.sort_values(ascending=False)
#Just a peek at the values
print(recommendationTable_df.head())

movieId
151755    1.0
156276    1.0
608       1.0
141418    1.0
1912      1.0
dtype: float64


In [15]:
#The final recommendation table
a = movies_df.loc[movies_df['movieId'].isin(recommendationTable_df.head(10).keys())]
print(a['title'])

600                                                  Fargo
1823                                          Out of Sight
12390    Honor Among Thieves (Adieu l'ami) (Farewell, F...
12412                                       Protector, The
17795           Girls on the Road (a.k.a. Hot Summer Week)
25008                                         The 39 Steps
27088                                          Inseparable
32648                                     Sexy Evil Genius
37040                                              Orleans
39038                                The Lone Wolf Strikes
Name: title, dtype: object
