# Movie Recommendation System

Text

In [1]:
# Import right packages

import surprise
from surprise.prediction_algorithms import *
from surprise import Reader, Dataset
import pandas as pd
import numpy as np
import datetime as dt
from sklearn.preprocessing import MultiLabelBinarizer

In [2]:
# Read in data

df_links = pd.read_csv('data/links.csv')
df_movies = pd.read_csv('data/movies.csv')
df_ratings = pd.read_csv('data/ratings.csv')
df_tags = pd.read_csv('data/tags.csv')

In [3]:
print(df_links.shape)
print(df_movies.shape)
print(df_ratings.shape)
print(df_tags.shape)

(9742, 3)
(9742, 3)
(100836, 4)
(3683, 4)


In [4]:
df_links.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   movieId  9742 non-null   int64  
 1   imdbId   9742 non-null   int64  
 2   tmdbId   9734 non-null   float64
dtypes: float64(1), int64(2)
memory usage: 228.5 KB


In [5]:
df_movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB


In [6]:
df_ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [7]:
df_tags.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3683 entries, 0 to 3682
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   userId     3683 non-null   int64 
 1   movieId    3683 non-null   int64 
 2   tag        3683 non-null   object
 3   timestamp  3683 non-null   int64 
dtypes: int64(3), object(1)
memory usage: 115.2+ KB


In [8]:
# Confirm there's no null values
df_links.dropna(inplace=True)   # Only this file has several null values
df_movies.dropna(inplace=True)  # No null value
df_ratings.dropna(inplace=True) # No null value
df_tags.dropna(inplace=True)    # No null value

print(df_links.shape)
print(df_movies.shape)
print(df_ratings.shape)
print(df_tags.shape)

(9734, 3)
(9742, 3)
(100836, 4)
(3683, 4)


In [9]:
df_tags

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200
...,...,...,...,...
3678,606,7382,for katie,1171234019
3679,606,7936,austere,1173392334
3680,610,3265,gun fu,1493843984
3681,610,3265,heroic bloodshed,1493843978


In [10]:
df_movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [11]:
df_ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [12]:
# Merging the Movies and Ratings csv files for collaborative filtering / recommendation system
df_merged = pd.merge(df_ratings, df_movies[['movieId','title','genres']], on='movieId', how='left')

In [13]:
df_merged

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance
2,1,6,4.0,964982224,Heat (1995),Action|Crime|Thriller
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,50,5.0,964982931,"Usual Suspects, The (1995)",Crime|Mystery|Thriller
...,...,...,...,...,...,...
100831,610,166534,4.0,1493848402,Split (2017),Drama|Horror|Thriller
100832,610,168248,5.0,1493850091,John Wick: Chapter Two (2017),Action|Crime|Thriller
100833,610,168250,5.0,1494273047,Get Out (2017),Horror
100834,610,168252,5.0,1493846352,Logan (2017),Action|Sci-Fi


In [14]:
df_merged.info()  # No null values

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100836 entries, 0 to 100835
Data columns (total 6 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
 4   title      100836 non-null  object 
 5   genres     100836 non-null  object 
dtypes: float64(1), int64(3), object(2)
memory usage: 5.4+ MB


In [15]:
# Unique user id count
df_merged['userId'].nunique()

610

In [16]:
# Unique movie id count
df_merged['movieId'].nunique()

9724

In [17]:
# Unique title count. There is variance in the movie_id and title count. However, since the variance is very small\
# we will ignore the difference
df_merged['title'].nunique()

9719

In [18]:
# Unique genre count
df_merged['genres'].nunique()

951

In [19]:
# Converting timestamp to read-able format
df_merged['timestamp'] = pd.to_datetime(df_merged['timestamp'], unit='s').dt.strftime('%Y-%m-%d')

In [98]:
df_merged.head(50)

Unnamed: 0,userId,movieId,rating,timestamp,title,Action,Adventure,Animation,Children,Comedy,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,1,4.0,2000-07-30,Toy Story (1995),0,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,1,3,4.0,2000-07-30,Grumpier Old Men (1995),0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
2,1,6,4.0,2000-07-30,Heat (1995),1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,1,47,5.0,2000-07-30,Seven (a.k.a. Se7en) (1995),0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
4,1,50,5.0,2000-07-30,"Usual Suspects, The (1995)",0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
5,1,70,3.0,2000-07-30,From Dusk Till Dawn (1996),1,0,0,0,1,...,0,1,0,0,0,0,0,1,0,0
6,1,101,5.0,2000-07-30,Bottle Rocket (1996),0,1,0,0,1,...,0,0,0,0,0,1,0,0,0,0
7,1,110,4.0,2000-07-30,Braveheart (1995),1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
8,1,151,5.0,2000-07-30,Rob Roy (1995),1,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
9,1,157,5.0,2000-07-30,Canadian Bacon (1995),0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0


In [21]:
# To separate out the genres
df_merged_genres = df_merged['genres']

mlb = MultiLabelBinarizer()
df_merged_genres_mlb = pd.DataFrame(mlb.fit_transform(df_merged_genres.str.split('|')), columns=mlb.classes_)
df_merged_genres_mlb

Unnamed: 0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
2,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0
4,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100831,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0
100832,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0
100833,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
100834,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0


In [22]:
df_merged = df_merged.join(df_merged_genres_mlb)

In [23]:
df_merged['(no genres listed)'].value_counts()

0    100789
1        47
Name: (no genres listed), dtype: int64

In [24]:
# Drop rows with no genres
df_merged = df_merged[df_merged['(no genres listed)'] == 0]
df_merged

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,(no genres listed),Action,Adventure,Animation,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,1,4.0,2000-07-30,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,1,3,4.0,2000-07-30,Grumpier Old Men (1995),Comedy|Romance,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,1,6,4.0,2000-07-30,Heat (1995),Action|Crime|Thriller,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
3,1,47,5.0,2000-07-30,Seven (a.k.a. Se7en) (1995),Mystery|Thriller,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
4,1,50,5.0,2000-07-30,"Usual Suspects, The (1995)",Crime|Mystery|Thriller,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100831,610,166534,4.0,2017-05-03,Split (2017),Drama|Horror|Thriller,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0
100832,610,168248,5.0,2017-05-03,John Wick: Chapter Two (2017),Action|Crime|Thriller,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
100833,610,168250,5.0,2017-05-08,Get Out (2017),Horror,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
100834,610,168252,5.0,2017-05-03,Logan (2017),Action|Sci-Fi,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0


In [25]:
# Drop genres and no genres listed columns
df_merged.drop(columns = ['genres','(no genres listed)'], inplace=True)

In [49]:
df_merged.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,Action,Adventure,Animation,Children,Comedy,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,1,4.0,2000-07-30,Toy Story (1995),0,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,1,3,4.0,2000-07-30,Grumpier Old Men (1995),0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
2,1,6,4.0,2000-07-30,Heat (1995),1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,1,47,5.0,2000-07-30,Seven (a.k.a. Se7en) (1995),0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
4,1,50,5.0,2000-07-30,"Usual Suspects, The (1995)",0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0


In [27]:
df_merged['rating'].value_counts().sort_index()

0.5     1368
1.0     2809
1.5     1791
2.0     7549
2.5     5544
3.0    20041
3.5    13130
4.0    26810
4.5     8543
5.0    13204
Name: rating, dtype: int64

In [28]:
reader = Reader(rating_scale=(.5,5))
data = Dataset.load_from_df(df_merged[['userId', 'movieId','rating']], reader)

In [29]:
train, test = surprise.model_selection.train_test_split(data, random_state=42)

In [30]:
model = KNNBasic().fit(train)

Computing the msd similarity matrix...
Done computing similarity matrix.


In [31]:
model2 = SVD().fit(train)

In [32]:
model3 = NMF().fit(train)

In [33]:
surprise.accuracy.rmse(model.test(test))

RMSE: 0.9456


0.9456323468882797

In [34]:
surprise.accuracy.rmse(model2.test(test))

RMSE: 0.8692


0.869213041657177

In [35]:
surprise.accuracy.rmse(model3.test(test))

RMSE: 0.9221


0.9221167645215308

In [36]:
test_outcome = model2.test(test)

In [37]:
test_outcome

[Prediction(uid=434, iid=356, r_ui=4.5, est=4.068821216107378, details={'was_impossible': False}),
 Prediction(uid=474, iid=5329, r_ui=3.5, est=3.229619003545127, details={'was_impossible': False}),
 Prediction(uid=304, iid=1242, r_ui=5.0, est=4.205068507713115, details={'was_impossible': False}),
 Prediction(uid=298, iid=5299, r_ui=0.5, est=2.079053236546889, details={'was_impossible': False}),
 Prediction(uid=131, iid=1196, r_ui=2.5, est=3.7892255329043816, details={'was_impossible': False}),
 Prediction(uid=288, iid=6377, r_ui=3.5, est=3.722480634906757, details={'was_impossible': False}),
 Prediction(uid=448, iid=99005, r_ui=1.5, est=2.83070082551466, details={'was_impossible': False}),
 Prediction(uid=285, iid=1193, r_ui=5.0, est=4.086302349687117, details={'was_impossible': False}),
 Prediction(uid=331, iid=61323, r_ui=5.0, est=3.6087521541759564, details={'was_impossible': False}),
 Prediction(uid=325, iid=2699, r_ui=4.0, est=2.710220891852779, details={'was_impossible': False})

In [66]:
def top_five_rec (user_id):
    user_id = user_id
    user_items = train.ur[train.to_inner_uid(user_id)]
    user_unrated_items = [item for item in train.all_items() if item not in user_items]
    
    prediction = [model2.predict(user_id,item) for item in user_unrated_items]
    prediction_sorted = sorted(prediction, key=lambda x: x.est, reverse=True)
    
    top_five_rec = prediction_sorted[:5]
    
    return top_five_rec

In [67]:
top_five_rec(1)

[Prediction(uid=1, iid=50, r_ui=None, est=5, details={'was_impossible': False}),
 Prediction(uid=1, iid=608, r_ui=None, est=5, details={'was_impossible': False}),
 Prediction(uid=1, iid=913, r_ui=None, est=5, details={'was_impossible': False}),
 Prediction(uid=1, iid=928, r_ui=None, est=5, details={'was_impossible': False}),
 Prediction(uid=1, iid=1089, r_ui=None, est=5, details={'was_impossible': False})]