## Import Libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Import Dataset

In [13]:
df = pd.read_csv("movies.csv")
df

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
34203,151697,Grand Slam (1967),Thriller
34204,151701,Bloodmoney (2010),(no genres listed)
34205,151703,The Butterfly Circus (2009),Drama
34206,151709,Zero (2015),Drama|Sci-Fi


In [72]:
df[df.title.str.startswith("Mecha")]

Unnamed: 0,movieId,title,genres,year
5594,5692,"Mechanic, The","[Action, Thriller]",1972
16856,85020,"Mechanic, The","[Action, Drama, Thriller]",2011


## Data Cleansing

In [14]:
df["year"] = df.title.str.extract("(\(\d\d\d\d\))", expand= False)
df.year = df.year.str.extract("(\d\d\d\d)", expand= False)
df.title = df.title.str.replace("(\(\d\d\d\d\))", "", regex= True)
df.title = df.title.apply(lambda x: x.strip())
df

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji,Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men,Comedy|Romance,1995
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II,Comedy,1995
...,...,...,...,...
34203,151697,Grand Slam,Thriller,1967
34204,151701,Bloodmoney,(no genres listed),2010
34205,151703,The Butterfly Circus,Drama,2009
34206,151709,Zero,Drama|Sci-Fi,2015


In [15]:
df.genres = df.genres.str.split("|")
df

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995
2,3,Grumpier Old Men,"[Comedy, Romance]",1995
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995
4,5,Father of the Bride Part II,[Comedy],1995
...,...,...,...,...
34203,151697,Grand Slam,[Thriller],1967
34204,151701,Bloodmoney,[(no genres listed)],2010
34205,151703,The Butterfly Circus,[Drama],2009
34206,151709,Zero,"[Drama, Sci-Fi]",2015


In [50]:
movies_df = df.copy()

for index, row in movies_df.iterrows():
    for genre in row.genres:
        movies_df.at[index, genre] = 1
movies_df.fillna(0, inplace= True)
movies_df

Unnamed: 0,movieId,title,genres,year,Adventure,Animation,Children,Comedy,Fantasy,Romance,...,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995,1.0,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men,"[Comedy, Romance]",1995,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,Father of the Bride Part II,[Comedy],1995,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34203,151697,Grand Slam,[Thriller],1967,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
34204,151701,Bloodmoney,[(no genres listed)],2010,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
34205,151703,The Butterfly Circus,[Drama],2009,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
34206,151709,Zero,"[Drama, Sci-Fi]",2015,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Identify User Input as Sample

In [73]:
userInput = [
            {'title':'Jumanji', 'rating':5},
            {'title':'Spider-Man 2', 'rating':3.5},
            {'title':'Steve Jobs', 'rating':5},
            {'title':"Mechanic, The", 'rating':5},
            {'title':'Akira', 'rating':4.5}
         ] 
inputMovies = pd.DataFrame(userInput)
inputMovies

Unnamed: 0,title,rating
0,Jumanji,5.0
1,Spider-Man 2,3.5
2,Steve Jobs,5.0
3,"Mechanic, The",5.0
4,Akira,4.5


In [74]:
filtered_movie = movies_df[movies_df.title.isin(inputMovies.title.tolist())]
inputMovies = pd.merge(filtered_movie, inputMovies)
inputMovies

Unnamed: 0,movieId,title,genres,year,Adventure,Animation,Children,Comedy,Fantasy,Romance,...,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed),rating
0,2,Jumanji,"[Adventure, Children, Fantasy]",1995,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
1,1274,Akira,"[Action, Adventure, Animation, Sci-Fi]",1988,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.5
2,5692,"Mechanic, The","[Action, Thriller]",1972,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
3,85020,"Mechanic, The","[Action, Drama, Thriller]",2011,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
4,8636,Spider-Man 2,"[Action, Adventure, Sci-Fi, IMAX]",2004,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5
5,136562,Steve Jobs,[Drama],2015,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0


In [75]:
inputMovies = inputMovies.drop("genres", 1).drop("year", 1)
inputMovies

  inputMovies = inputMovies.drop("genres", 1).drop("year", 1)
  inputMovies = inputMovies.drop("genres", 1).drop("year", 1)


Unnamed: 0,movieId,title,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,...,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed),rating
0,2,Jumanji,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
1,1274,Akira,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.5
2,5692,"Mechanic, The",0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
3,85020,"Mechanic, The",0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
4,8636,Spider-Man 2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5
5,136562,Steve Jobs,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0


In [76]:
userMovies = inputMovies.drop("movieId", axis= 1).drop("title", axis= 1).drop("rating", axis= 1)
userMovies

Unnamed: 0,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [77]:
userProfile = userMovies.transpose().dot(inputMovies.rating)
userProfile

Adventure             13.0
Animation              4.5
Children               5.0
Comedy                 0.0
Fantasy                5.0
Romance                0.0
Drama                 10.0
Action                18.0
Crime                  0.0
Thriller              10.0
Horror                 0.0
Mystery                0.0
Sci-Fi                 8.0
IMAX                   3.5
Documentary            0.0
War                    0.0
Musical                0.0
Western                0.0
Film-Noir              0.0
(no genres listed)     0.0
dtype: float64

In [78]:
genre_table = movies_df.set_index(movies_df.movieId, inplace= True)
genre_table = movies_df.drop("movieId", axis= 1).drop("title", axis= 1).drop("genres", axis= 1).drop("year", axis= 1)
genre_table

Unnamed: 0_level_0,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
151697,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
151701,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
151703,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
151709,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [79]:
recommendationTable = (userProfile * genre_table).sum(axis= 1) / userProfile.sum()
recommendationTable

movieId
1         0.357143
2         0.298701
3         0.000000
4         0.129870
5         0.000000
            ...   
151697    0.129870
151701    0.000000
151703    0.129870
151709    0.233766
151711    0.000000
Length: 34208, dtype: float64

In [80]:
recommendationTable = recommendationTable.sort_values(ascending= False)
recommendationTable

movieId
71999     0.831169
91500     0.766234
48774     0.766234
58025     0.766234
27618     0.766234
            ...   
6453      0.000000
90823     0.000000
6451      0.000000
90821     0.000000
151711    0.000000
Length: 34208, dtype: float64

In [81]:
df.loc[df.movieId.isin(recommendationTable.head(20).keys())]

Unnamed: 0,movieId,title,genres,year
2345,2429,Mighty Joe Young,"[Action, Adventure, Drama, Fantasy, Thriller]",1998
4686,4781,Megiddo: The Omega Code 2,"[Action, Adventure, Fantasy, Sci-Fi, Thriller]",2001
6252,6350,Laputa: Castle in the Sky (Tenkû no shiro Rapy...,"[Action, Adventure, Animation, Children, Fanta...",1986
7763,8361,"Day After Tomorrow, The","[Action, Adventure, Drama, Sci-Fi, Thriller]",2004
9403,27618,"Sound of Thunder, A","[Action, Adventure, Drama, Sci-Fi, Thriller]",2005
10654,41569,King Kong,"[Action, Adventure, Drama, Fantasy, Thriller]",2005
11410,48774,Children of Men,"[Action, Adventure, Drama, Sci-Fi, Thriller]",2006
11497,49593,She,"[Action, Adventure, Drama, Fantasy, Horror, Ro...",1965
12464,58025,Jumper,"[Action, Adventure, Drama, Sci-Fi, Thriller]",2008
14397,71999,Aelita: The Queen of Mars (Aelita),"[Action, Adventure, Drama, Fantasy, Romance, S...",1924
