# Instruction to download datasets:

Please download datasets from the link https://www.kaggle.com/datasets/netflix-inc/netflix-prize-data

Place the following files in the same folder as this notebook:

1) combined_data_1.txt

2) combined_data_2.txt

3) combined_data_3.txt

4) combined_data_4.txt

5) movie_titles.csv

In [1]:
import pandas as pd
import numpy as np
import random
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate

## Data Preprocessing

Movie Title

In [2]:
# Load the movie catalogue. Every line has the form "<id>,<year>,<title>" and the
# title itself may contain commas, so each raw line is split on its first two
# commas only instead of being handed to a CSV tokenizer (which would also react
# to ';' or quote characters inside a title).
# The path is relative to the working directory so it resolves on every OS.
with open("movie_titles.csv", encoding="latin_1") as title_file:
    title_rows = [
        line.rstrip("\r\n").split(",", 2)
        for line in title_file
        if line.strip()  # ignore blank lines such as a trailing newline
    ]

# Year_Release is deliberately left as text; only the ID is converted to int.
# NOTE(review): the raw file reportedly marks unknown years as "NULL" - confirm
# before casting that column to a numeric dtype.
df_title = (
    pd.DataFrame(title_rows, columns=["Movie_ID", "Year_Release", "Title"])
    .astype({"Movie_ID": "int"})
    .set_index("Movie_ID")
)

df_title

Unnamed: 0_level_0,Year_Release,Title
Movie_ID,Unnamed: 1_level_1,Unnamed: 2_level_1
1,2003,Dinosaur Planet
2,2004,Isle of Man TT 2004 Review
3,1997,Character
4,1994,Paula Abdul's Get Up & Dance
5,2004,The Rise and Fall of ECW
...,...,...
496,2004,Primus: Hallucino-Genetics Live 2004
497,2003,Broadway's Lost Treasures
498,1989,Glory: Bonus Material
499,2000,Under Suspicion


Rating

In [3]:
# Load the four rating files. Each file interleaves header rows of the form
# "<movie_id>:" with data rows "<customer_id>,<rating>,<date>".
# Customer_ID is forced to str so header rows and data rows share one dtype no
# matter how the parser chunks the file.
RATING_COLUMNS = ["Customer_ID", "Rating", "Date"]
rating_files = [f"combined_data_{part}.txt" for part in range(1, 5)]

df = pd.concat(
    [
        pd.read_csv(path, sep=",", names=RATING_COLUMNS, dtype={"Customer_ID": str})
        for path in rating_files
    ],
    ignore_index=True,
)

# A header row ends with ':'; the movie ID it carries applies to every data row
# that follows, up to the next header. Forward-filling the stripped header value
# replaces the original row-by-row Python loop.
is_movie_header = df["Customer_ID"].str.endswith(":")
movie_ids = df["Customer_ID"].where(is_movie_header).str.rstrip(":").ffill()

df.insert(0, "Movie_ID", movie_ids)
# Drop the header rows themselves, plus any data row that appears before the
# first header (it has no movie to belong to).
df = df[~is_movie_header].dropna(subset=["Movie_ID"])
df = df.astype({"Movie_ID": "int", "Customer_ID": "int"})
df["Date"] = pd.to_datetime(df["Date"], format="%Y-%m-%d")
df = df.sort_values(by=["Movie_ID", "Date", "Customer_ID"]).reset_index(drop=True)
# Optional: uncomment to work on a small subset while experimenting.
# df = df[df["Movie_ID"] < 501]

df

Unnamed: 0,Movie_ID,Customer_ID,Rating,Date
0,1,1596531,5.0,2004-01-23
1,1,1366860,4.0,2004-01-26
2,1,1181550,3.0,2004-02-01
3,1,1227322,4.0,2004-02-06
4,1,2413320,4.0,2004-02-06
...,...,...,...,...
2798699,500,2118461,5.0,2005-12-19
2798700,500,2491226,2.0,2005-12-19
2798701,500,961541,4.0,2005-12-21
2798702,500,1721510,4.0,2005-12-21


Variables used by the recommendation functions

In [4]:
def _rating_stats(ratings, key):
    """Summarise ratings per `key` and flag the sparsely rated groups.

    Returns a tuple of:
      - a frame indexed by `key` with the 'count' and 'mean' of 'Rating',
      - the 80th-percentile rating count, rounded to a whole number,
      - the index of groups whose count falls below that benchmark.
    """
    stats = ratings.groupby(key)['Rating'].agg(['count', 'mean'])
    benchmark = round(stats['count'].quantile(0.8), 0)
    sparse_ids = stats.loc[stats['count'] < benchmark].index
    return stats, benchmark, sparse_ids


# Per-movie and per-customer rating statistics with their "too few ratings" lists.
df_movie_summary, movie_benchmark, drop_movie_list = _rating_stats(df, 'Movie_ID')
df_cust_summary, cust_benchmark, drop_cust_list = _rating_stats(df, 'Customer_ID')

# Customer x movie matrix of ratings (NaN where a customer did not rate a movie).
df_pt = df.pivot_table(values='Rating', index='Customer_ID', columns='Movie_ID')

## Recommendation models

Recommend with Collaborative Filtering

In [5]:
# Surprise expects (user, item, rating) triples, in that column order.
rating_triples = df[['Customer_ID', 'Movie_ID', 'Rating']]

reader = Reader()  # library defaults
data = Dataset.load_from_df(rating_triples, reader)

# Cross-validate an SVD model; the resulting metrics dict is the cell output.
svd = SVD()
cross_validate(svd, data, measures=['RMSE', 'MAE'])

{'test_rmse': array([0.98764577, 0.9881501 , 0.98928323, 0.98897319, 0.98833283]),
 'test_mae': array([0.77691932, 0.77733161, 0.77798188, 0.77734425, 0.77742266]),
 'fit_time': (33.512972831726074,
  33.722288846969604,
  34.18374800682068,
  33.805906534194946,
  34.28094840049744),
 'test_time': (7.635077714920044,
  6.232989549636841,
  6.920273780822754,
  6.236044883728027,
  6.1862335205078125)}

Create a function that shows which movies a customer liked in the past:

In [6]:
def liked(num):
    """Report the movies that customer `num` rated with a score of 5.

    Reads the module-level `df` (ratings) and `df_title` (titles indexed by
    Movie_ID).

    Returns:
        A message string when the customer has no 5-score ratings; otherwise a
        tuple of (header string, Series of titles indexed by Movie_ID).
    """
    is_customer = df['Customer_ID'] == num
    is_top_score = df['Rating'] == 5
    favourites = df[is_customer & is_top_score].set_index('Movie_ID')

    # Left-join on Movie_ID to attach the title of every 5-score movie.
    titles = favourites.join(df_title)['Title']

    if titles.empty:
        return f"Customer {num} has not given any movie with 5 scores yet."
    return f"Customer {num} liked record:\n", titles

# Pick a random rating row by position. random.choice() on a Series performs a
# label lookup and, on some Python 3.11 releases, evaluates the Series'
# truthiness (which raises), so draw the position explicitly instead.
liked(df["Customer_ID"].iloc[random.randrange(len(df))])

('Customer 2157639 liked record:\n',
 Movie_ID
 143    The Game
 Name: Title, dtype: object)

Create a function that predicts which movies a customer would love to watch:

In [7]:
def recommend_cust(num, top_n=10, refit=True):
    """Print the movies with the highest estimated rating for one customer.

    Reads the module-level `df`, `df_title`, `drop_movie_list`, `reader` and
    `svd`.

    Args:
        num: Customer ID to score movies for.
        top_n: Number of recommendations to print (default 10).
        refit: When True (default) `svd` is re-trained on the full rating set
            before predicting. Pass False to reuse the model as it currently is,
            e.g. for repeated calls on unchanged data.

    Returns:
        None. The ranked table is printed.
    """
    # Candidate movies: every title except the sparsely rated ones. The explicit
    # copy makes the column assignment below act on an independent frame rather
    # than on a slice of another frame.
    candidates = df_title.reset_index()
    candidates = candidates[~candidates['Movie_ID'].isin(drop_movie_list)].copy()

    if refit:
        # Train on every available rating (no hold-out).
        full_data = Dataset.load_from_df(df[['Customer_ID', 'Movie_ID', 'Rating']], reader)
        svd.fit(full_data.build_full_trainset())

    candidates['Estimate_Score'] = [
        svd.predict(num, movie_id).est for movie_id in candidates['Movie_ID']
    ]
    ranked = (
        candidates
        .drop(columns='Movie_ID')
        .sort_values('Estimate_Score', ascending=False)
    )
    # Header on its own line so the table's column headers stay aligned.
    print(f"Recommendation for customer {num}:\n{ranked.head(top_n)}")

# Pick a random rating row by position. random.choice() on a Series performs a
# label lookup and, on some Python 3.11 releases, evaluates the Series'
# truthiness (which raises), so draw the position explicitly instead.
recommend_cust(df["Customer_ID"].iloc[random.randrange(len(df))])

Recommendation for customer 31724:      Year_Release                                    Title  Estimate_Score
32          2000           Aqua Teen Hunger Force: Vol. 1        4.650285
358         2003            Absolutely Fabulous: Series 5        4.643055
470         1931                              City Lights        4.361014
151         1995                        A Little Princess        4.349601
269         2001               Sex and the City: Season 4        4.308298
43          1996                           Spitfire Grill        4.268676
164         1982  Richard Pryor: Live on the Sunset Strip        4.252555
214         1998                 That '70s Show: Season 1        4.232000
404         1987                          Wings of Desire        4.168572
174         1992                           Reservoir Dogs        4.133983


Recommend with Pearson's r correlations

We use Pearson's r to measure the linear correlation between the review scores of the selected movie and those of every other movie, then list the 10 movies with the highest correlations:

In [10]:
def recommend_movie(movie_title, min_count):
    """Print the 10 movies whose ratings correlate most with `movie_title`.

    Reads the module-level `df_title`, `df_pt` (customer x movie rating matrix)
    and `df_movie_summary` (per-movie 'count' and 'mean').

    Args:
        movie_title: Exact title to look up in `df_title`. If several movies
            share the title, the first match is used.
        min_count: Only movies with strictly more than this many ratings are
            listed. Note that 'count' is a movie's total number of ratings, not
            the number of customers it shares with the target movie.

    Raises:
        IndexError: If no movie in `df_title` has the given title.
    """
    # Resolve the title first so an unknown title fails before anything prints.
    matches = df_title.index[df_title['Title'] == movie_title]
    if len(matches) == 0:
        raise IndexError(f'No movie titled "{movie_title}" found in df_title.')
    movie_id = int(matches[0])

    print(f'''Top 10 movies recommended based on Pearsons'R correlation by movie "{movie_title}":''')

    # Column-wise Pearson correlation of every movie against the target movie;
    # movies for which no correlation can be computed come back as NaN and are
    # dropped.
    pearson = df_pt.corrwith(df_pt[movie_id]).rename('PearsonR').dropna()
    pearson.index = pearson.index.map(int)

    corr_target = (
        pearson.sort_values(ascending=False)
        .to_frame()
        .join(df_title)
        .join(df_movie_summary)[['PearsonR', 'Title', 'count', 'mean']]
    )
    print(corr_target[corr_target['count'] > min_count][:10].to_string(index=False))

# df_title is indexed by Movie_ID (starting at 1), so random.choice() on the
# Series would look up labels 0..N-1: label 0 raises KeyError and the last movie
# could never be drawn. Select by position instead.
recommend_movie(df_title["Title"].iloc[random.randrange(len(df_title))], 0)

Top 10 movies recommended based on Pearsons'R correlation by moive "Querelle":
 PearsonR                                               Title  count     mean
 1.000000            Marilyn Manson: Fear of a Satanic Planet    121 2.975207
 1.000000                                            Querelle    890 2.778652
 1.000000 Ashtanga Yoga: Beginner's Practice with Nicki Doane    108 2.712963
 1.000000                                      One Last Dance    652 2.627301
 1.000000                       Jimmy Buffett: MiniMatinee #1     90 2.988889
 1.000000                                     IFilm: Deranged    163 3.012270
 1.000000                   GTO: Great Teacher Onizuka: Set 2    384 4.138021
 1.000000                                          Elfen Lied    266 4.251880
 1.000000                Primus: Hallucino-Genetics Live 2004    235 3.710638
 0.986908  Cartoon Network Halloween: 9 Creepy Cartoon Capers    335 3.388060
