# Movie Recommendation System

Text

In [1]:
# Import right packages

import surprise
from surprise.prediction_algorithms import *
from surprise import Reader, Dataset
import pandas as pd
import numpy as np
import datetime as dt
from sklearn.preprocessing import MultiLabelBinarizer

In [2]:
# Read in data

df_links = pd.read_csv('data/links.csv')
df_movies = pd.read_csv('data/movies.csv')
df_ratings = pd.read_csv('data/ratings.csv')
df_tags = pd.read_csv('data/tags.csv')

In [3]:
print(df_links.shape)
print(df_movies.shape)
print(df_ratings.shape)
print(df_tags.shape)

(9742, 3)
(9742, 3)
(100836, 4)
(3683, 4)


In [4]:
df_links.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   movieId  9742 non-null   int64  
 1   imdbId   9742 non-null   int64  
 2   tmdbId   9734 non-null   float64
dtypes: float64(1), int64(2)
memory usage: 228.5 KB


In [5]:
df_movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB


In [6]:
df_ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [7]:
df_tags.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3683 entries, 0 to 3682
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   userId     3683 non-null   int64 
 1   movieId    3683 non-null   int64 
 2   tag        3683 non-null   object
 3   timestamp  3683 non-null   int64 
dtypes: int64(3), object(1)
memory usage: 115.2+ KB


In [8]:
# Confirm there's no null values
df_links.dropna(inplace=True)   # Only this file has several null values
df_movies.dropna(inplace=True)  # No null value
df_ratings.dropna(inplace=True) # No null value
df_tags.dropna(inplace=True)    # No null value

print(df_links.shape)
print(df_movies.shape)
print(df_ratings.shape)
print(df_tags.shape)

(9734, 3)
(9742, 3)
(100836, 4)
(3683, 4)


In [9]:
df_tags

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200
...,...,...,...,...
3678,606,7382,for katie,1171234019
3679,606,7936,austere,1173392334
3680,610,3265,gun fu,1493843984
3681,610,3265,heroic bloodshed,1493843978


In [10]:
df_movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [11]:
df_ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [12]:
# Merging the Movies and Ratings csv files for collaborative filtering / recommendation system
df_merged = pd.merge(df_ratings, df_movies[['movieId','title','genres']], on='movieId', how='left')

In [13]:
df_merged

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance
2,1,6,4.0,964982224,Heat (1995),Action|Crime|Thriller
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,50,5.0,964982931,"Usual Suspects, The (1995)",Crime|Mystery|Thriller
...,...,...,...,...,...,...
100831,610,166534,4.0,1493848402,Split (2017),Drama|Horror|Thriller
100832,610,168248,5.0,1493850091,John Wick: Chapter Two (2017),Action|Crime|Thriller
100833,610,168250,5.0,1494273047,Get Out (2017),Horror
100834,610,168252,5.0,1493846352,Logan (2017),Action|Sci-Fi


In [14]:
df_merged.info()  # No null values

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100836 entries, 0 to 100835
Data columns (total 6 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
 4   title      100836 non-null  object 
 5   genres     100836 non-null  object 
dtypes: float64(1), int64(3), object(2)
memory usage: 5.4+ MB


In [15]:
# Unique user id count
df_merged['userId'].nunique()

610

In [16]:
# Unique movie id count
df_merged['movieId'].nunique()

9724

In [17]:
# Unique title count. There is variance in the movie_id and title count. However, since the variance is very small\
# we will ignore the difference
df_merged['title'].nunique()

9719

In [18]:
# Unique genre count
df_merged['genres'].nunique()

951

In [19]:
# Converting timestamp to read-able format
df_merged['timestamp'] = pd.to_datetime(df_merged['timestamp'], unit='s').dt.strftime('%Y-%m-%d')

In [20]:
df_merged.head(5)

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,2000-07-30,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,3,4.0,2000-07-30,Grumpier Old Men (1995),Comedy|Romance
2,1,6,4.0,2000-07-30,Heat (1995),Action|Crime|Thriller
3,1,47,5.0,2000-07-30,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,50,5.0,2000-07-30,"Usual Suspects, The (1995)",Crime|Mystery|Thriller


In [21]:
# To separate out the genres
df_merged_genres = df_merged['genres']

mlb = MultiLabelBinarizer()
df_merged_genres_mlb = pd.DataFrame(mlb.fit_transform(df_merged_genres.str.split('|')), columns=mlb.classes_)
df_merged_genres_mlb

Unnamed: 0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
2,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0
4,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100831,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0
100832,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0
100833,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
100834,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0


In [22]:
df_merged = df_merged.join(df_merged_genres_mlb)

In [23]:
df_merged['(no genres listed)'].value_counts()

0    100789
1        47
Name: (no genres listed), dtype: int64

In [24]:
# Drop rows with no genres
df_merged = df_merged[df_merged['(no genres listed)'] == 0]
df_merged

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,(no genres listed),Action,Adventure,Animation,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,1,4.0,2000-07-30,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,1,3,4.0,2000-07-30,Grumpier Old Men (1995),Comedy|Romance,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,1,6,4.0,2000-07-30,Heat (1995),Action|Crime|Thriller,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
3,1,47,5.0,2000-07-30,Seven (a.k.a. Se7en) (1995),Mystery|Thriller,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
4,1,50,5.0,2000-07-30,"Usual Suspects, The (1995)",Crime|Mystery|Thriller,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100831,610,166534,4.0,2017-05-03,Split (2017),Drama|Horror|Thriller,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0
100832,610,168248,5.0,2017-05-03,John Wick: Chapter Two (2017),Action|Crime|Thriller,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
100833,610,168250,5.0,2017-05-08,Get Out (2017),Horror,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
100834,610,168252,5.0,2017-05-03,Logan (2017),Action|Sci-Fi,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0


In [25]:
# Drop genres and no genres listed columns
df_merged.drop(columns = ['genres','(no genres listed)'], inplace=True)

In [26]:
df_merged.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,Action,Adventure,Animation,Children,Comedy,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,1,4.0,2000-07-30,Toy Story (1995),0,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,1,3,4.0,2000-07-30,Grumpier Old Men (1995),0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
2,1,6,4.0,2000-07-30,Heat (1995),1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,1,47,5.0,2000-07-30,Seven (a.k.a. Se7en) (1995),0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
4,1,50,5.0,2000-07-30,"Usual Suspects, The (1995)",0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0


In [27]:
df_merged['rating'].value_counts().sort_index()

0.5     1368
1.0     2809
1.5     1791
2.0     7549
2.5     5544
3.0    20041
3.5    13130
4.0    26810
4.5     8543
5.0    13204
Name: rating, dtype: int64

In [28]:
reader = Reader(rating_scale=(.5,5))
data = Dataset.load_from_df(df_merged[['userId', 'movieId','rating']], reader)

In [29]:
from collections import defaultdict

def get_top_n(predictions, n=5):
    """Return the top-N recommendation for each user from a set of predictions.

    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        n(int): The number of recommendation to output for each user. Default
            is 10.

    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size n.
    """

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n


# First train an SVD algorithm on the movielens dataset.
trainset = data.build_full_trainset()

model1 = SVD()
model1.fit(trainset)

model2 = KNNBasic()
model2.fit(trainset)

model3 = NMF()
model3.fit(trainset)

Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.matrix_factorization.NMF at 0x7fb8885bac70>

In [30]:
# Than predict ratings for all pairs (u, i) that are NOT in the training set.
testset = trainset.build_anti_testset()

print(surprise.accuracy.rmse(model1.test(testset)))
print(surprise.accuracy.rmse(model2.test(testset)))
print(surprise.accuracy.rmse(model3.test(testset)))

RMSE: 0.4850
0.4850277066182615
RMSE: 0.9015
0.901512522756518
RMSE: 0.9976
0.9975788992580644


In [31]:
print(surprise.accuracy.mae(model1.test(testset)))

MAE:  0.3766
0.3766026071631154


In [32]:
best_model = model1

predictions = best_model.test(testset)

top_n = get_top_n(predictions, n=5)

# Print the recommended items for each user
#for uid, user_ratings in top_n.items():
    #print(uid, [iid for (iid, _) in user_ratings])

In [54]:
def top_five_rec1(userId):
    
    top_five_list = []
    for x in range(1):
        top_five_list.append(top_n[userId])
    return top_five_list

In [55]:
top_five_rec1(500)

[[(1136, 4.3424141694752105),
  (4011, 4.168168274949603),
  (1197, 4.117338841994113),
  (3275, 4.1169214294002785),
  (3578, 4.0552038135386965)]]

In [44]:
def top_five_rec(userId):
    
    top_five_list = []
    for x in range(5):
        top_five_list.append(top_n[userId][x][0])
    
    for x in top_five_list:
        print(df_merged[df_merged['movieId'] == x]['title'].head(1))

In [45]:
top_five_rec(500)

67    Monty Python and the Holy Grail (1975)
Name: title, dtype: object
1384    Snatch (2000)
Name: title, dtype: object
69    Princess Bride, The (1987)
Name: title, dtype: object
1951    Boondock Saints, The (2000)
Name: title, dtype: object
219    Gladiator (2000)
Name: title, dtype: object


In [53]:
# Doesn't like StarWars Series
df_merged[df_merged['userId']==500]

Unnamed: 0,userId,movieId,rating,timestamp,title,Action,Adventure,Animation,Children,Comedy,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
79907,500,1,4.0,2001-11-12,Toy Story (1995),0,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
79908,500,11,1.0,2001-11-12,"American President, The (1995)",0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
79909,500,39,1.0,2001-11-12,Clueless (1995),0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
79910,500,101,1.0,2001-11-12,Bottle Rocket (1996),0,1,0,0,1,...,0,0,0,0,0,1,0,0,0,0
79911,500,104,4.0,2001-11-12,Happy Gilmore (1996),0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79988,500,3566,4.0,2001-11-12,"Big Kahuna, The (2000)",0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
79989,500,3751,4.0,2001-11-12,Chicken Run (2000),0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
79990,500,4161,4.0,2001-11-12,"Mexican, The (2001)",1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
79991,500,4306,5.0,2001-11-12,Shrek (2001),0,1,1,1,1,...,0,0,0,0,0,1,0,0,0,0


In [36]:
# What is the most watched/popular movie? (most rating)
# What is the highest rated movie?