# Movie Recommendation System

Text

In [1]:
import surprise
from surprise.prediction_algorithms import *
from surprise import accuracy
from surprise import Reader, Dataset
import pandas as pd
import numpy as np
import datetime as dt
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer
from sklearn.model_selection import train_test_split

In [2]:
links_df = pd.read_csv('data/links.csv')
movies_df = pd.read_csv('data/movies.csv')
ratings_df = pd.read_csv('data/ratings.csv')
tags_df = pd.read_csv('data/tags.csv')

In [3]:
movies_df

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [4]:
movies_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB


In [5]:
ratings_df

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [6]:
ratings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [7]:
print(movies_df['movieId'].value_counts())


86014     1
1282      1
3347      1
1298      1
25870     1
         ..
60072     1
4775      1
50601     1
131749    1
83969     1
Name: movieId, Length: 9742, dtype: int64


In [8]:
print(ratings_df['movieId'].value_counts())


356       329
318       317
296       307
593       279
2571      278
         ... 
5986        1
100304      1
34800       1
83976       1
8196        1
Name: movieId, Length: 9724, dtype: int64


In [9]:
movie_ratings_df = pd.merge(movies_df, ratings_df, on='movieId')

movie_ratings_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100836 entries, 0 to 100835
Data columns (total 6 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   movieId    100836 non-null  int64  
 1   title      100836 non-null  object 
 2   genres     100836 non-null  object 
 3   userId     100836 non-null  int64  
 4   rating     100836 non-null  float64
 5   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3), object(2)
memory usage: 5.4+ MB


In [10]:
movie_ratings_df

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,964982703
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,847434962
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5,1106635946
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.5,1510577970
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.5,1305696483
...,...,...,...,...,...,...
100831,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,184,4.0,1537109082
100832,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,184,3.5,1537109545
100833,193585,Flint (2017),Drama,184,3.5,1537109805
100834,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,184,3.5,1537110021


In [11]:
movie_ratings_df.columns

Index(['movieId', 'title', 'genres', 'userId', 'rating', 'timestamp'], dtype='object')

In [12]:
print(movie_ratings_df['userId'].value_counts())

414    2698
599    2478
474    2108
448    1864
274    1346
       ... 
406      20
595      20
569      20
431      20
442      20
Name: userId, Length: 610, dtype: int64


In [13]:
print(movie_ratings_df['movieId'].value_counts())

356       329
318       317
296       307
593       279
2571      278
         ... 
5922        1
153386      1
57147       1
5986        1
8196        1
Name: movieId, Length: 9724, dtype: int64


In [14]:
print(movie_ratings_df['movieId'].value_counts().sort_values())


8196        1
143410      1
96283       1
168358      1
6583        1
         ... 
2571      278
593       279
296       307
318       317
356       329
Name: movieId, Length: 9724, dtype: int64


In [15]:
print(movie_ratings_df['title'].value_counts().sort_values())

It Came from Outer Space (1953)                       1
High Anxiety (1977)                                   1
Blueberry (2004)                                      1
Hit by Lightning (2014)                               1
New Adventures of Pippi Longstocking, The (1988)      1
                                                   ... 
Matrix, The (1999)                                  278
Silence of the Lambs, The (1991)                    279
Pulp Fiction (1994)                                 307
Shawshank Redemption, The (1994)                    317
Forrest Gump (1994)                                 329
Name: title, Length: 9719, dtype: int64


In [16]:
genres_df = movie_ratings_df['genres']

mlb = MultiLabelBinarizer()
genres_df_mlb = pd.DataFrame(mlb.fit_transform(genres_df.str.split('|')), columns=mlb.classes_)
genres_df_mlb

Unnamed: 0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3,0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
4,0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100831,0,1,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
100832,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
100833,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
100834,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [17]:
user_encoder = LabelEncoder()
movie_encoder = LabelEncoder()

movie_ratings_df['userId'] = user_encoder.fit_transform(movie_ratings_df['userId'])
movie_ratings_df['movieId'] = movie_encoder.fit_transform(movie_ratings_df['movieId'])


In [18]:
df_merged = movie_ratings_df.join(genres_df_mlb)
df_merged.drop(columns = ['genres','(no genres listed)'], inplace=True)
df_merged

Unnamed: 0,movieId,title,userId,rating,timestamp,Action,Adventure,Animation,Children,Comedy,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0,Toy Story (1995),0,4.0,964982703,0,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,0,Toy Story (1995),4,4.0,847434962,0,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
2,0,Toy Story (1995),6,4.5,1106635946,0,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
3,0,Toy Story (1995),14,2.5,1510577970,0,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
4,0,Toy Story (1995),16,4.5,1305696483,0,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100831,9719,Black Butler: Book of the Atlantic (2017),183,4.0,1537109082,1,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
100832,9720,No Game No Life: Zero (2017),183,3.5,1537109545,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
100833,9721,Flint (2017),183,3.5,1537109805,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
100834,9722,Bungo Stray Dogs: Dead Apple (2018),183,3.5,1537110021,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
train_df, test_df = train_test_split(df_merged, test_size = 0.2)
train_df

Unnamed: 0,movieId,title,userId,rating,timestamp,Action,Adventure,Animation,Children,Comedy,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
76824,5265,"Miracle of Marcelino, The (Marcelino pan y vin...",605,3.5,1171730449,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
74052,4793,Calendar Girls (2003),139,3.0,1117730641,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
9343,297,While You Were Sleeping (1995),30,4.0,850467366,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
33306,1193,Spawn (1997),361,4.0,1530638157,1,1,0,0,0,...,0,0,0,0,0,0,1,1,0,0
62510,3328,Johnny Be Good (1988),413,2.0,1026225630,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63548,3518,Don't Say a Word (2001),598,2.5,1498515727,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
41757,1710,"Producers, The (1968)",507,1.5,1268297524,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
78423,5651,Battlestar Galactica (2003),28,5.0,1362016752,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
26482,919,Psycho (1960),201,4.0,974924662,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [21]:
reader = Reader(rating_scale=(0.5, 5))
#This line creates a reader. The reader class is a parser that reads the file containing our ratings

data = Dataset.load_from_df(df_merged[['userId', 'movieId', 'rating']], reader)

trainset = data.build_full_trainset()

In [None]:
#KNN_model = KNNBasic().fit(trainset)
#KNN_predictions = KNN_model.test(trainset.build_anti_testset())

#KNN_RMSE = accuracy.rmse(KNN_predictions)
#KNN_MAE = accuracy.mae(KNN_predictions)

#KNN_RMSE
#KNN_MAE

#In the context of a recommendation system, the training set consists of items that users have already rated. 
#The “anti-testset”, on the other hand, is made up of items that users have not yet rated. 
#This is useful for making predictions about potential ratings, which in turn can be used to recommend new items to users.
#This method is particularly useful when you want to recommend the top-N items to a user, as it allows the model to 
#predict ratings for items the user has not yet interacted with.

In [22]:
SVD_model = SVD().fit(trainset)
SVD_predictions = SVD_model.test(trainset.build_anti_testset())

SVD_RMSE = accuracy.rmse(SVD_predictions)
SVD_MAE = accuracy.mae(SVD_predictions)

SVD_RMSE
SVD_MAE

RMSE: 0.4861
MAE:  0.3771


0.37711354925314394

In [None]:
#NMF_model = NMF().fit(trainset)
#NMF_predictions = NMF_model.test(trainset.build_anti_testset())

#NMF_RMSE = accuracy.rmse(NMF_predictions)
#NMF_MAE = accuracy.mae(NMF_predictions)

#NMF_RMSE
#NMF_MAE

In [23]:
def get_top_n_recommendations(user_id, n=5):
    user_movies = df_merged[df_merged['userId'] == user_id]['movieId'].unique()
    all_movies = df_merged['movieId'].unique()
    movies_to_predict = list(set(all_movies) - set(user_movies))
    user_movie_pairs = [(user_id, movie_id, 0) for movie_id in movies_to_predict]
    predictions_cf = SVD_model.test(user_movie_pairs)
    top_n_recommendations = sorted(predictions_cf, key = lambda x: x.est, reverse = True )[:n]
    
    for pred in top_n_recommendations:
        predicted_rating = pred.est
        print(predicted_rating)
    
    top_n_movie_ids = [int(pred.iid) for pred in top_n_recommendations]
    
    top_n_movies = movie_encoder.inverse_transform(top_n_movie_ids)
    
    return top_n_movies

In [24]:
user_id = 500
recommendations = get_top_n_recommendations(user_id)
top_n_movies_titles = movies_df[movies_df['movieId'].isin(recommendations)]['title'].tolist()
print(f"Top 5 Recommendations for User {user_id}:")
for i, title in enumerate(top_n_movies_titles, 1):
  print(f"{i}.{title}")

4.233810753359288
4.1761333488843535
4.174997945017577
4.14558771239662
4.13898116908444
Top 5 Recommendations for User 500:
1.In the Name of the Father (1993)
2.Great Escape, The (1963)
3.Young Frankenstein (1974)
4.Grave of the Fireflies (Hotaru no haka) (1988)
5.City of God (Cidade de Deus) (2002)
