# setup packages and load data

In [1]:
import numpy as np
import pandas as pd

In [2]:
location = "D:/py_movie_recommendation_system/data/"
movies_df = pd.read_csv(location+"movies.csv")
ratings_df = pd.read_csv(location+"ratings.csv")
links_df = pd.read_csv(location+"links.csv")
tags_df = pd.read_csv(location+"tags.csv")

# data preprocessing

In [3]:
# drop useless column
movie_ratings=ratings_df.drop('timestamp', axis=1)

In [4]:
movie_ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [5]:
# convert data type
user_movie = movie_ratings[["userId", "movieId"]].astype(int)
rating = movie_ratings["rating"].astype(float)
movie_ratings = pd.concat([user_movie, rating], axis=1)

In [6]:
movie_ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


# train test split

In [7]:
from surprise import Reader, Dataset
from surprise.model_selection import train_test_split
# put data into surprise dataset which is acceptable by surprise model
reader = Reader()
data = Dataset.load_from_df(movie_ratings[["userId", "movieId", "rating"]], reader)

In [8]:
train, test = train_test_split(data, test_size=0.2)

# define and tune model

In [9]:
from surprise import SVD
from surprise.model_selection import GridSearchCV
# build a grid of parameters
param_grid = {'n_factors': [25, 50, 100],'n_epochs': [10, 20, 30], 'lr_all': [0.001, 0.005, 0.01],'reg_all':[0.005, 0.001, 0.05]}
# build a 5-fold cv
gscv = GridSearchCV(SVD, param_grid, measures=["rmse"], cv=5)
gscv.fit(data)

In [10]:
# get the best model parameters and build the best model
params = gscv.best_params['rmse']
svd = SVD(n_factors=params['n_factors'], n_epochs=params['n_epochs'],lr_all=params['lr_all'], reg_all=params['reg_all'])

# test best model

In [11]:
# get the rmse score
from surprise import accuracy
predictions = svd.fit(train).test(test)
accuracy.rmse(predictions, verbose=True)

RMSE: 0.8570


0.8570449302705279

In [12]:
# save model
from surprise import dump
dump.dump("D:/py_movie_recommendation_system/EDA&Model/svd", algo=svd)

# qualitative check

In [13]:
# get recommendation for user
def get_top_n(best_model, uid, n=10, location="D:/py_movie_recommendation_system/data/"):
    movies_df = pd.read_csv(location+"movies.csv")
    ratings_df = pd.read_csv(location+"ratings.csv").drop("timestamp", axis=1)
    users = list(ratings_df.userId.unique())
    if uid not in users:
        print(f"user {uid} is not in the database.")
        print("Here are the top reviewed movies:")
        top = ratings_df[["movieId", "rating"]].groupby("movieId").count().reset_index().sort_values("rating", ascending=False).reset_index(drop=True).drop("rating", axis=1)
        all_top = pd.merge(top, movies_df, left_on="movieId", right_on="movieId").drop("movieId", axis=1)
        recommendation = all_top.head(n).reset_index(drop=True)
        print(recommendation)
        return recommendation
    movies_df["estimate_rating"] = movies_df["movieId"].apply(lambda x: best_model.predict(uid, x).est)
    user_recommendation = movies_df.drop("movieId", axis=1)
    user_recommendation = user_recommendation.sort_values("estimate_rating", ascending=False)
    recommendation = user_recommendation.head(n).reset_index(drop=True)
    print(recommendation)
    return recommendation

In [14]:
# for known user
get_top_n(svd, 1, 10)

                                               title  \
0                      Day of the Doctor, The (2013)   
1        Wallace & Gromit: The Wrong Trousers (1993)   
2  Wallace & Gromit: The Best of Aardman Animatio...   
3                              Never Cry Wolf (1983)   
4                                    Superbad (2007)   
5    Grand Day Out with Wallace and Gromit, A (1989)   
6                           Captain Fantastic (2016)   
7  Neon Genesis Evangelion: The End of Evangelion...   
8                     Godfather: Part II, The (1974)   
9                            Funny Games U.S. (2007)   

                                       genres  estimate_rating  
0                      Adventure|Drama|Sci-Fi              5.0  
1             Animation|Children|Comedy|Crime              5.0  
2                  Adventure|Animation|Comedy              5.0  
3                             Adventure|Drama              5.0  
4                                      Comedy             

Unnamed: 0,title,genres,estimate_rating
0,"Day of the Doctor, The (2013)",Adventure|Drama|Sci-Fi,5.0
1,Wallace & Gromit: The Wrong Trousers (1993),Animation|Children|Comedy|Crime,5.0
2,Wallace & Gromit: The Best of Aardman Animatio...,Adventure|Animation|Comedy,5.0
3,Never Cry Wolf (1983),Adventure|Drama,5.0
4,Superbad (2007),Comedy,5.0
5,"Grand Day Out with Wallace and Gromit, A (1989)",Adventure|Animation|Children|Comedy|Sci-Fi,5.0
6,Captain Fantastic (2016),Drama,5.0
7,Neon Genesis Evangelion: The End of Evangelion...,Action|Animation|Drama|Fantasy|Sci-Fi,5.0
8,"Godfather: Part II, The (1974)",Crime|Drama,5.0
9,Funny Games U.S. (2007),Drama|Thriller,5.0


In [15]:
# for unknown user
get_top_n(svd, 1111111, 10)

user 1111111 is not in the database.
Here are the top reviewed movies:
                                       title                            genres
0                        Forrest Gump (1994)          Comedy|Drama|Romance|War
1           Shawshank Redemption, The (1994)                       Crime|Drama
2                        Pulp Fiction (1994)       Comedy|Crime|Drama|Thriller
3           Silence of the Lambs, The (1991)             Crime|Horror|Thriller
4                         Matrix, The (1999)            Action|Sci-Fi|Thriller
5  Star Wars: Episode IV - A New Hope (1977)           Action|Adventure|Sci-Fi
6                       Jurassic Park (1993)  Action|Adventure|Sci-Fi|Thriller
7                          Braveheart (1995)                  Action|Drama|War
8          Terminator 2: Judgment Day (1991)                     Action|Sci-Fi
9                    Schindler's List (1993)                         Drama|War


Unnamed: 0,title,genres
0,Forrest Gump (1994),Comedy|Drama|Romance|War
1,"Shawshank Redemption, The (1994)",Crime|Drama
2,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller
3,"Silence of the Lambs, The (1991)",Crime|Horror|Thriller
4,"Matrix, The (1999)",Action|Sci-Fi|Thriller
5,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi
6,Jurassic Park (1993),Action|Adventure|Sci-Fi|Thriller
7,Braveheart (1995),Action|Drama|War
8,Terminator 2: Judgment Day (1991),Action|Sci-Fi
9,Schindler's List (1993),Drama|War


In [16]:
# for movie
def recommend(mid, n = 10, location="D:/py_movie_recommendation_system/data/"):
    movies_df = pd.read_csv(location+"movies.csv")
    ratings_df = pd.read_csv(location+"ratings.csv").drop("timestamp", axis=1)
    movies_list = list(movies_df["movieId"])
    if mid not in movies_list:
        print("This is a new movie and we cannot find similar movie only based on it id.")
        return
    print(f"Top {n} movies recommended based on cosine similarity")
    movies_df_encoding = pd.concat([movies_df[["movieId", "title"]], movies_df["genres"].str.get_dummies(sep='|').astype(np.int64)], axis=1)
    target_movie = movies_df_encoding[movies_df_encoding["movieId"]==mid].reset_index(drop=True)
    other_movie = movies_df_encoding[movies_df_encoding["movieId"]!=mid].reset_index(drop=True)
    from scipy import spatial
    cosine_result = []
    for x in range(other_movie.shape[0]):
        other_movie_list = list(other_movie.iloc[x, 2:].apply(int))
        target_movie_list = list(target_movie.iloc[0:, 2:].apply(int))
        cosine = spatial.distance.cosine(other_movie_list, target_movie_list)
        cosine_result.append(cosine)
    result_df = pd.DataFrame(cosine_result)
    result_df.columns = ["Cosine"]
    similar_df = pd.concat([other_movie, result_df], axis=1)
    similar = pd.merge(similar_df[["movieId", "title", "Cosine"]], movies_df[["movieId", "genres"]], left_on="movieId", right_on="movieId").sort_values("Cosine", ascending=False).drop("Cosine", axis=1).head(n).reset_index(drop=True)
    print(similar)
    return similar

In [17]:
# for known movie
recommend(858, 10)

Top 10 movies recommended based on Pearsons'R correlation
   movieId                                              title  \
0        1                                   Toy Story (1995)   
1    27604              Suicide Club (Jisatsu saakuru) (2001)   
2    27773                                     Old Boy (2003)   
3    27772                           Ju-on: The Grudge (2002)   
4    27762                       Comic Book: The Movie (2004)   
5    27746                     Ginger Snaps: Unleashed (2004)   
6    27731         Cat Returns, The (Neko no ongaeshi) (2002)   
7    27706  Lemony Snicket's A Series of Unfortunate Event...   
8    27685                           Bring It On Again (2004)   
9    27683                Tremors 4: The Legend Begins (2004)   

                                         genres  
0   Adventure|Animation|Children|Comedy|Fantasy  
1                       Horror|Mystery|Thriller  
2                              Mystery|Thriller  
3                         

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,27604,Suicide Club (Jisatsu saakuru) (2001),Horror|Mystery|Thriller
2,27773,Old Boy (2003),Mystery|Thriller
3,27772,Ju-on: The Grudge (2002),Horror
4,27762,Comic Book: The Movie (2004),Comedy
5,27746,Ginger Snaps: Unleashed (2004),Horror|Thriller
6,27731,"Cat Returns, The (Neko no ongaeshi) (2002)",Adventure|Animation|Children|Fantasy
7,27706,Lemony Snicket's A Series of Unfortunate Event...,Adventure|Children|Comedy|Fantasy
8,27685,Bring It On Again (2004),Comedy
9,27683,Tremors 4: The Legend Begins (2004),Action|Comedy|Horror|Sci-Fi|Thriller|Western


In [18]:
# for unknown movie
recommend(111111111, 10)

This is a new movie and we cannot find similar movie only based on it id.
