# Netflix Movie Recommendation
* Notebook: https://www.kaggle.com/laowingkin/netflix-movie-recommendation
* Data: https://www.kaggle.com/netflix-inc/netflix-prize-data?select=qualifying.txt

In [1]:
import pandas as pd
import numpy as np
import math
import re
from scipy.sparse import csr_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from surprise import Reader, Dataset, SVD

'''
Note: surprise.evaluate() was deprecated in Surprise 1.0.5 (functionally
replaced by model_selection.cross_validate()) and removed in 1.1.0.
With Surprise >= 1.1.0 the old import fails, so cross_validate() is
imported below instead.
'''
# from surprise import evaluate 
from surprise.model_selection.validation import cross_validate

sns.set_style("darkgrid")

In [2]:
def readFile(file_path, rows=100000):
    """Parse the first ``rows`` lines of a Netflix Prize ``combined_data`` file.

    The file interleaves movie header lines of the form ``<movie_id>:`` with
    rating lines of the form ``<customer_id>,<rating>,<date>``. Every rating
    line is attributed to the most recent header line above it.

    Args:
        file_path: Path of the text file to read.
        rows: Maximum number of physical lines to consume. Header (and blank)
            lines count toward this limit, so fewer than ``rows`` ratings
            may be returned.

    Returns:
        A ``pandas.DataFrame`` with the columns ``Cust_Id`` (str),
        ``Movie_Id`` (int), ``Rating`` (str) and ``Date`` (str), one row per
        rating line read.

    Raises:
        ValueError: If a rating line appears before any movie header, if a
            header is not an integer, or if a rating line does not have
            exactly three comma-separated fields.
    """
    data_dict = {'Cust_Id' : [], 'Movie_Id' : [], 'Rating' : [], 'Date' : []}
    movie_id = None

    # The context manager closes the file even if a line fails to parse.
    with open(file_path, "r") as f:
        for count, raw_line in enumerate(f, start=1):
            if count > rows:
                break

            # Strip the line terminator (if any) instead of slicing a fixed
            # number of characters, so a final line without '\n' still parses.
            line = raw_line.strip()
            if not line:
                continue

            if ':' in line:
                movie_id = int(line.rstrip(':'))
            else:
                if movie_id is None:
                    raise ValueError(
                        "rating line %d appears before any movie header" % count
                    )
                customer_id, rating, date = line.split(',')
                data_dict['Cust_Id'].append(customer_id)
                data_dict['Movie_Id'].append(movie_id)
                data_dict['Rating'].append(rating)
                data_dict['Date'].append(date)

    return pd.DataFrame(data_dict)

In [3]:
# Load the first 100000 lines of each of the four rating files, in order.
df1, df2, df3, df4 = (
    readFile(path, rows=100000)
    for path in (
        './data/netflix/combined_data_1.txt',
        './data/netflix/combined_data_2.txt',
        './data/netflix/combined_data_3.txt',
        './data/netflix/combined_data_4.txt',
    )
)

In [4]:
# Preview the first rows parsed from combined_data_1.txt.
df1.head()

Unnamed: 0,Cust_Id,Movie_Id,Rating,Date
0,1488844,1,3,2005-09-06
1,822109,1,5,2005-05-13
2,885013,1,4,2005-10-19
3,30878,1,4,2005-12-26
4,823519,1,3,2004-05-03


In [5]:
# Preview the first rows parsed from combined_data_2.txt.
df2.head()

Unnamed: 0,Cust_Id,Movie_Id,Rating,Date
0,2532865,4500,4,2005-07-26
1,573364,4500,3,2005-06-20
2,1696725,4500,3,2004-02-27
3,1253431,4500,3,2004-03-31
4,1265574,4500,2,2003-09-01


In [6]:
# Preview the first rows parsed from combined_data_3.txt.
df3.head()

Unnamed: 0,Cust_Id,Movie_Id,Rating,Date
0,1277134,9211,1,2003-12-02
1,2435457,9211,2,2005-06-01
2,2338545,9211,3,2001-02-17
3,2218269,9211,1,2002-12-27
4,441153,9211,4,2002-10-11


In [7]:
# Preview the first rows parsed from combined_data_4.txt.
df4.head()

Unnamed: 0,Cust_Id,Movie_Id,Rating,Date
0,2385003,13368,4,2004-07-08
1,659432,13368,3,2005-03-16
2,751812,13368,2,2002-12-16
3,2625420,13368,2,2004-05-25
4,1650301,13368,1,2005-08-30


In [8]:
# readFile() stores ratings as text; convert them to floats in every frame.
for frame in (df1, df2, df3, df4):
    frame['Rating'] = frame['Rating'].astype(float)

In [9]:
# Stack the four per-file frames into a single ratings table.
# DataFrame.append() was deprecated in pandas 1.4 and removed in 2.0, so
# pd.concat() is used; ignore_index=True renumbers the rows 0..len(df)-1.
df = pd.concat([df1, df2, df3, df4], ignore_index=True)
df.shape

(399899, 4)

In [10]:
# Preview the first ten rows of the combined ratings table.
df.head(10)

Unnamed: 0,Cust_Id,Movie_Id,Rating,Date
0,1488844,1,3.0,2005-09-06
1,822109,1,5.0,2005-05-13
2,885013,1,4.0,2005-10-19
3,30878,1,4.0,2005-12-26
4,823519,1,3.0,2004-05-03
5,893988,1,3.0,2005-11-17
6,124105,1,4.0,2004-08-05
7,1248029,1,3.0,2004-04-22
8,1842128,1,4.0,2004-05-09
9,2238063,1,3.0,2005-05-11


In [11]:
# Number of rating rows per movie, most-rated movies first.
df['Movie_Id'].value_counts()

28       39752
13384    37884
4506     33731
4520     26500
9235     20214
         ...  
9221       105
4502       104
9           95
7           93
9227        88
Name: Movie_Id, Length: 101, dtype: int64

In [None]:
# Load the movie catalogue; the file has no header row, so the column names
# are supplied explicitly.
# NOTE(review): titles that themselves contain commas may not fit a
# three-column parse -- confirm against the actual file.
df_title = pd.read_csv(
    './data/netflix/movie_titles.csv',
    encoding="ISO-8859-1",
    header=None,
    names=['Movie_Id', 'Year', 'Name'],
)
df_title.head(10)

In [None]:
# Wrap the (user, item, rating) columns in a Surprise dataset using the
# default Reader settings.
reader = Reader()
data = Dataset.load_from_df(
    df[['Cust_Id', 'Movie_Id', 'Rating']],
    reader=reader,
)

# Evaluate an SVD model with 5-fold cross-validation, reporting RMSE and MAE.
svd = SVD()
cross_validate(algo=svd, data=data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

In [None]:
# Titles that customer 785314 rated 5 stars.
# Cust_Id is compared as a string because readFile() stores it as text.
df_785314 = df[(df['Cust_Id'] == '785314') & (df['Rating'] >= 5)]
df_785314 = df_785314.set_index('Movie_Id')
# DataFrame.join() matches on the index. df_title carries a default 0-based
# row index, so it must be keyed by Movie_Id first; otherwise every movie is
# paired with the title of a different catalogue row.
df_785314 = df_785314.join(df_title.set_index('Movie_Id'))['Name']
df_785314.head(df_785314.shape[0])

In [None]:
# Fit the SVD model on the full dataset (every rating in `data`, no held-out fold).
# `data` is reused from the cross-validation cell; the line below rebuilds it if needed.
# data = Dataset.load_from_df(df[['Cust_Id', 'Movie_Id', 'Rating']], reader)

trainset = data.build_full_trainset()
svd.fit(trainset)

In [None]:
# Work on a copy so the estimate column is not added to df_title itself.
titles = df_title.copy()

# Predict customer 785314's rating for every catalogue movie.
# The user id must be passed in the same form it was trained with: readFile()
# stores Cust_Id as a string, so the raw id is '785314', not the int 785314.
# An int id would not match any trained user and the estimates would not be
# personalised.
titles['Estimate_Score'] = titles['Movie_Id'].apply(lambda x: svd.predict('785314', x).est)

# Highest predicted rating first.
titles = titles.sort_values(by=['Estimate_Score'], ascending=False)

In [None]:
# Show the ten movies with the highest estimated score.
titles.head(10)