# This project builds a movie recommendation model using the collaborative filtering (CF) method.

## Project Overview

Table of contents:
1. Data Loading, Cleaning and Manipulation
2. Overview data
3. Data mapping
4. Recommendation model

## 1. Data Loading, Cleaning and Manipulation

In [32]:
# Import libraries and packages.

import numpy as np
import pandas as pd
import gc
import math
import scipy

# Import visualization libraries.

import missingno
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt

# Silence all Python warnings to keep cell output readable (note: this also hides pandas warnings raised while modifying the dataset).

import warnings
warnings.filterwarnings('ignore')

# Recommendations
from surprise.model_selection import cross_validate


# Load the dataset (download it if needed)
#from surprise import Dataset
#data = Dataset.load_builtin('ml-100k')

# Use the famous SVD algorithm
#algo = SVD()

# Run 5-fold cross-validation and then print results
#cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)


In [4]:
# Read the first ratings file. The file has no header row, so the two columns
# that are kept (positions 0 and 1) are named explicitly; any further columns
# are dropped by `usecols`.
ratings_part1_path = "C:/Users/PC/Downloads/Dataset/netflix dataset/combined_data_1.txt"
df1 = pd.read_csv(
    ratings_part1_path,
    names=['Cust_Id', 'Rating'],
    usecols=[0, 1],
    header=None,
)

In [5]:
# Report the frame's dimensions, then show every 5,000,000th row as a sample.
shape_line = 'Dataset 1 shape: {}'.format(df1.shape)
print(shape_line)
print('############### Dataset examples ##############\n')
sample_rows = df1.iloc[::5000000, :]
print(sample_rows)

Dataset 1 shape: (24058263, 2)
############### Dataset examples ##############

          Cust_Id  Rating
0              1:     NaN
5000000   2560324     4.0
10000000  2271935     2.0
15000000  1921803     2.0
20000000  1933327     3.0


In [6]:
# Summarise df1: index range, column dtypes and memory usage.
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24058263 entries, 0 to 24058262
Data columns (total 2 columns):
 #   Column   Dtype  
---  ------   -----  
 0   Cust_Id  object 
 1   Rating   float64
dtypes: float64(1), object(1)
memory usage: 367.1+ MB


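To keep the amount of data manageable, only the first ratings file (`df1`) is loaded; the cell below that would load the other three files is left commented out.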

In [7]:
# Optional: load the remaining rating files (df2, df3 and df4). Left commented out so that only df1 is used.

#df2 = pd.read_csv("C:/Users/PC/Downloads/Dataset/netflix dataset/combined_data_2.txt", header = None, names = ['Cust_Id' , 'Rating'], usecols = [0,1])
#df3 = pd.read_csv("C:/Users/PC/Downloads/Dataset/netflix dataset/combined_data_3.txt", header = None, names = ['Cust_Id' , 'Rating'], usecols = [0,1])
#df4 = pd.read_csv("C:/Users/PC/Downloads/Dataset/netflix dataset/combined_data_4.txt", header = None, names = ['Cust_Id' , 'Rating'], usecols = [0,1])

#print('Dataset 2 shape: {}'.format(df2.shape))
#print('Dataset 3 shape: {}'.format(df3.shape))
#print('Dataset 4 shape: {}'.format(df4.shape))

In [8]:
# Work on the first file only. This binds `df` to the same object as `df1`
# (no copy is made), so the column assignment below is visible through both names.
df = df1
# Disabled: stack the other three files onto df1 (requires df2, df3 and df4 to be loaded).
#df = df1.append(df2, ignore_index = True)
#df = df.append(df3, ignore_index = True)
#df = df.append(df4, ignore_index = True)

# Make sure the ratings are stored as floating point numbers.
df['Rating'] = df['Rating'].astype(float)


# Disabled: delete the df1, df2, df3, df4 objects to save memory
# (only valid once all four frames have been loaded and combined).
#del df1,df2,df3,df4

In [9]:
# Print the (rows, columns) shape of the working DataFrame.
print('Dataset Shape:\t{}'.format(df.shape))

Dataset Shape:	(24058263, 2)


## 2. Overview data

In [10]:
# Preview the first rows; row 0 is a movie header ("1:") and therefore has no rating.
df.head()

Unnamed: 0,Cust_Id,Rating
0,1:,
1,1488844,3.0
2,822109,5.0
3,885013,4.0
4,30878,4.0


In [11]:
# Count missing values per column. A missing Rating marks a movie-ID header
# row, so the Rating count is also the number of movies in the file.
nan_counts = df.isnull().sum()
print("Number of NaN values (NaN Value in Rating = MoviesID) :\n" + str(nan_counts))

Number of NaN values (NaN Value in Rating = MoviesID) :
Cust_Id       0
Rating     4499
dtype: int64


The count above shows 4,499 rows without a rating. Each of these is a movie-ID header row, so the raw text file contains 4,499 movies.

In [12]:
# Build `movie_np`: one movie id per *rating* row, in file order, so it can be
# attached as the Movie_Id column once the header rows are removed.
#
# Each movie starts with a header row (Rating is NaN) followed by that movie's
# rating rows. Movie ids are assigned sequentially (1, 2, 3, ...) in header
# order rather than parsed from the header text; this assumes the file lists
# movies in consecutive id order starting at 1, with a header on the first row.

# Positions of the header rows, kept in the same two-column layout
# ('index', 'Rating') as before in case other cells read `df_nan`.
is_header = df['Rating'].isnull()
df_nan = is_header[is_header].to_frame().reset_index()

header_rows = df_nan['index'].to_numpy()
if header_rows.size == 0:
    raise ValueError(
        "No movie header rows (NaN ratings) found in df; "
        "this cell must run on the raw frame, before the header rows are removed."
    )

# Number of rating rows following each header: the gap to the next header,
# with len(df) acting as the sentinel after the last header.
ratings_per_movie = np.diff(np.append(header_rows, len(df))) - 1

# Total number of movies (also the id given to the last movie).
movie_id = len(header_rows)

# Repeat each id once per rating row in a single vectorised step; growing the
# array with np.append inside a loop would recopy it for every movie.
# dtype=float keeps the array float64; it is cast to int when it is attached
# to the DataFrame.
movie_np = np.repeat(np.arange(1, movie_id + 1, dtype=float), ratings_per_movie)

print('Movie numpy: {}'.format(movie_np))
print('Length: {}'.format(len(movie_np)))

Movie numpy: [1.000e+00 1.000e+00 1.000e+00 ... 4.499e+03 4.499e+03 4.499e+03]
Length: 24053764


In [13]:
# Remove the movie-ID header rows (rows without a rating) and attach the
# movie id of every remaining rating row.
has_rating = pd.notnull(df['Rating'])

# movie_np must hold exactly one id per rating row; fail early with a clear
# message instead of letting pandas report a generic length mismatch.
n_ratings = int(has_rating.sum())
if len(movie_np) != n_ratings:
    raise ValueError(
        "movie_np has {} entries but df has {} rating rows".format(len(movie_np), n_ratings)
    )

# `.assign` returns a new frame, so no column is written into a filtered slice
# of the original (which would trigger pandas' SettingWithCopyWarning).
df = df[has_rating].assign(
    Movie_Id=movie_np.astype(int),
    Cust_Id=lambda rated: rated['Cust_Id'].astype(int),
)
print('-Dataset examples-')
print(df.iloc[::5000000, :])

-Dataset examples-
          Cust_Id  Rating  Movie_Id
1         1488844     3.0         1
5000996    501954     2.0       996
10001962   404654     5.0      1962
15002876   886608     2.0      2876
20003825  1193835     2.0      3825


In [14]:
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() only appends a stray "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 24053764 entries, 1 to 24058262
Data columns (total 3 columns):
 #   Column    Dtype  
---  ------    -----  
 0   Cust_Id   int32  
 1   Rating    float64
 2   Movie_Id  int32  
dtypes: float64(1), int32(2)
memory usage: 550.5 MB
None


## 3. Data mapping

In [23]:
# Load the movie lookup table (no header row in the file) and key it by movie id.
titles_path = 'C:/Users/PC/Downloads/Dataset/netflix dataset/movie_titles.csv'
df_title = pd.read_csv(
    titles_path,
    names=['Movie_Id', 'Year', 'Name'],
    header=None,
    encoding="ISO-8859-1",
).set_index('Movie_Id')
print(df_title.head(10))

            Year                          Name
Movie_Id                                      
1         2003.0               Dinosaur Planet
2         2004.0    Isle of Man TT 2004 Review
3         1997.0                     Character
4         1994.0  Paula Abdul's Get Up & Dance
5         2004.0      The Rise and Fall of ECW
6         1997.0                          Sick
7         1992.0                         8 Man
8         2004.0    What the #$*! Do We Know!?
9         1991.0      Class of Nuke 'Em High 2
10        2001.0                       Fighter


Now the data is clean and ready for recommendation purposes.

Limitations of the data cleaning so far:
1. No filtering was applied, so outliers among ratings or users are still in the data.
2. Visualization was skipped, so the data were not inspected before modelling and the model may overfit or underfit.

How to improve data quality:
1. Remove movies with too few reviews.
2. Remove customers who have given too few reviews.

## 4. Recommendation model (CF method)

In [31]:
import random

from surprise import Reader, Dataset, SVD

# Seed both RNGs that Surprise relies on (fold generation and the random
# initialisation of the SVD factors) so the scores below are reproducible.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Number of rating rows used for the evaluation, kept small for a fast run.
# Caveat: these are the *first* rows of the frame, which is ordered by movie,
# so they cover only the lowest movie ids rather than a representative sample.
N_ROWS = 100000

# Ratings are on Surprise's default 1-5 scale; stated explicitly for clarity.
reader = Reader(rating_scale=(1, 5))

# Surprise expects the columns in (user, item, rating) order.
data = Dataset.load_from_df(df[['Cust_Id', 'Movie_Id', 'Rating']][:N_ROWS], reader)
svd = SVD()

# 5-fold cross-validation; the returned dict of per-fold scores is the cell output.
cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.0470  1.0507  1.0494  1.0518  1.0538  1.0505  0.0023  
MAE (testset)     0.8401  0.8432  0.8378  0.8358  0.8311  0.8376  0.0041  
Fit time          3.28    3.31    3.35    3.35    3.36    3.33    0.03    
Test time         0.15    0.08    0.08    0.08    0.08    0.10    0.03    


{'test_rmse': array([1.0469793 , 1.05067979, 1.04943131, 1.05178534, 1.05376989]),
 'test_mae': array([0.84006676, 0.8432026 , 0.83775227, 0.83576443, 0.83112553]),
 'fit_time': (3.2800793647766113,
  3.3087587356567383,
  3.352403163909912,
  3.3533456325531006,
  3.357743263244629),
 'test_time': (0.14813542366027832,
  0.08007264137268066,
  0.08307576179504395,
  0.08100128173828125,
  0.08307528495788574)}