# **NETFLIX RECOMMENDATION ENGINE - PROJECT**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# The ratings file has no header row. Only the first two fields (positions 0 and 1)
# are loaded, and they are labelled Customer_id and Ratings.
netflix = pd.read_csv(
    r'/content/drive/MyDrive/Intellipaat/Copy of Copy of combined_data_1.txt.zip',
    header=None,
    usecols=[0, 1],
    names=['Customer_id', 'Ratings'],
)
netflix

Unnamed: 0,Customer_id,Ratings
0,1:,
1,1488844,3.0
2,822109,5.0
3,885013,4.0
4,30878,4.0
...,...,...
24058258,2591364,2.0
24058259,1791000,2.0
24058260,512536,5.0
24058261,988963,3.0


In [None]:
# Movie header rows (e.g. "1:") have no rating, so every NaN in the Ratings
# column marks one movie; counting the NaNs gives the number of movies.
total_movies = netflix['Ratings'].isna().sum()
total_movies

4499

In [None]:
# Customer_id holds real customer ids as well as the movie header markers.
# Customers repeat across movies, so count distinct values and then subtract
# the header markers (one per movie).
distinct_ids = netflix['Customer_id'].nunique()
total_customers = distinct_ids - total_movies
total_customers

470758

In [None]:
# Every row is either a rating or a movie header row (whose rating is NaN),
# so ratings = all rows minus the header rows.
row_count = netflix.shape[0]
total_ratings = row_count - total_movies
total_ratings

24053764

In [None]:
# Build a Movie_id for every row so the data can be reshaped into three columns
# (Customer_id, Ratings, Movie_id). Header rows look like "<movie id>:" and every
# rating row that follows belongs to that movie until the next header appears.
# This uses vectorised pandas operations instead of a Python-level loop over
# every row of the frame; movie_col is a Series aligned with netflix's index.
raw_ids = netflix['Customer_id']
is_movie_header = raw_ids.str.contains(':', regex=False)
movie_col = (
    raw_ids.where(is_movie_header)              # keep only the "<id>:" header values, NaN elsewhere
           .str.replace(':', '', regex=False)   # strip the colon, leaving the id text
           .ffill()                             # carry each movie id down over its rating rows
           .astype(int)                         # assumes the first row is a header, so no NaN remains
)

In [None]:
# Attach the per-row movie ids as a new Movie_id column.
netflix = netflix.assign(Movie_id=movie_col)
netflix

Unnamed: 0,Customer_id,Ratings,Movie_id
0,1:,,1
1,1488844,3.0,1
2,822109,5.0,1
3,885013,4.0,1
4,30878,4.0,1
...,...,...,...
24058258,2591364,2.0,4499
24058259,1791000,2.0,4499
24058260,512536,5.0,4499
24058261,988963,3.0,4499


In [None]:
# Drop every row containing a NaN; these are the movie header rows, whose
# Ratings value is missing.
netflix = netflix.dropna()

In [None]:
# Print a summary of the frame: index range, column dtypes and memory usage.
netflix.info()

<class 'pandas.core.frame.DataFrame'>
Index: 24053764 entries, 1 to 24058262
Data columns (total 3 columns):
 #   Column       Dtype  
---  ------       -----  
 0   Customer_id  object 
 1   Ratings      float64
 2   Movie_id     int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 734.1+ MB


In [None]:
# Customer_id was read as object dtype; cast that column to integers.
netflix = netflix.astype({'Customer_id': int})

In [None]:
# Display the frame after dropping the header rows and casting Customer_id.
netflix

Unnamed: 0,Customer_id,Ratings,Movie_id
1,1488844,3.0,1
2,822109,5.0,1
3,885013,4.0,1
4,30878,4.0,1
5,823519,3.0,1
...,...,...,...
24058258,2591364,2.0,4499
24058259,1791000,2.0,4499
24058260,512536,5.0,4499
24058261,988963,3.0,4499


# **Pre Filtering of Data**

In [None]:
# FILTER 1 - exclude movies with few ratings.
# First step: count how many ratings each movie has received.
rating_count = netflix['Ratings'].groupby(netflix['Movie_id']).count()
rating_count

Unnamed: 0_level_0,Ratings
Movie_id,Unnamed: 1_level_1
1,547
2,145
3,2012
4,142
5,1140
...,...
4495,614
4496,9519
4497,714
4498,269


In [None]:
# Cut-off for movies: the 60th percentile of the per-movie rating counts,
# rounded to a whole number. It is a number of ratings, not a rating value.
movie_count_p60 = rating_count.quantile(0.6)
benchmark = round(movie_count_p60)
benchmark

908

In [None]:
# Movie ids whose rating count is strictly below the cut-off; these are the
# movies to remove.
below_movie_cutoff = rating_count.lt(benchmark)
rejected_movies = rating_count.loc[below_movie_cutoff].index
rejected_movies

Index([   1,    2,    4,    7,    9,   10,   11,   12,   13,   14,
       ...
       4480, 4481, 4486, 4487, 4491, 4494, 4495, 4497, 4498, 4499],
      dtype='int64', name='Movie_id', length=2699)

In [None]:
# FILTER 2 - exclude customers who have rated only a few movies.
# First step: count how many ratings each customer has given.
customer_rating = netflix['Ratings'].groupby(netflix['Customer_id']).count()
customer_rating

Unnamed: 0_level_0,Ratings
Customer_id,Unnamed: 1_level_1
6,153
7,195
8,21
10,49
25,4
...,...
2649404,12
2649409,10
2649421,3
2649426,74


In [None]:
# Cut-off for customers: the 60th percentile of the per-customer rating counts,
# rounded to a whole number.
customer_count_p60 = customer_rating.quantile(0.6)
benchmark1 = round(customer_count_p60)
benchmark1

36

In [None]:
# Customer ids whose rating count is strictly below the customer cut-off;
# these are the customers to remove.
below_customer_cutoff = customer_rating.lt(benchmark1)
rejected_customers = customer_rating.loc[below_customer_cutoff].index
rejected_customers

Index([      8,      25,      33,      83,      94,     126,     130,     133,
           142,     149,
       ...
       2649337, 2649343, 2649351, 2649376, 2649379, 2649384, 2649401, 2649404,
       2649409, 2649421],
      dtype='int64', name='Customer_id', length=282042)

In [None]:
# Keep only the rows whose movie AND customer both survived the filters.
# isin(...) flags rows belonging to a rejected id and ~ inverts that flag.
movie_ok = ~netflix['Movie_id'].isin(rejected_movies)
customer_ok = ~netflix['Customer_id'].isin(rejected_customers)
netflix = netflix[movie_ok & customer_ok]
netflix

Unnamed: 0,Customer_id,Ratings,Movie_id
696,712664,5.0,3
697,1331154,4.0,3
698,2632461,3.0,3
699,44937,5.0,3
700,656399,4.0,3
...,...,...,...
24056842,1055714,5.0,4496
24056843,2643029,4.0,4496
24056844,267802,4.0,4496
24056845,1559566,3.0,4496


In [None]:
# Load the movie titles file: ISO-8859-1 encoded, no header row, first three
# fields only, labelled Movie_id, Year and Name.
# The encoding name was previously misspelled as 'ISO=8859-1', which only worked
# because Python normalises codec names; it is now spelled correctly.
df = pd.read_csv(
    r"/content/drive/MyDrive/Intellipaat/Copy of Copy of movie_titles.csv",
    encoding='ISO-8859-1',
    header=None,
    usecols=[0, 1, 2],
    names=['Movie_id', 'Year', 'Name'],
)
df

Unnamed: 0,Movie_id,Year,Name
0,1,2003.0,Dinosaur Planet
1,2,2004.0,Isle of Man TT 2004 Review
2,3,1997.0,Character
3,4,1994.0,Paula Abdul's Get Up & Dance
4,5,2004.0,The Rise and Fall of ECW
...,...,...,...
17765,17766,2002.0,Where the Wild Things Are and Other Maurice Se...
17766,17767,2004.0,Fidel Castro: American Experience
17767,17768,2000.0,Epoch
17768,17769,2003.0,The Company


# **Model Building**

In [None]:
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate

In [None]:
# Convert the ratings frame into a Surprise Dataset (first 5,000,000 rows only).
# Dataset.load_from_df requires the columns in the order (user id, item id, rating).
# The columns were previously passed as (Customer_id, Ratings, Movie_id), so the
# movie ids were treated as the ratings and the reported RMSE was meaningless.
# rating_scale=(1, 5) is Surprise's default Reader scale, written out explicitly.
ratings_for_surprise = netflix[['Customer_id', 'Movie_id', 'Ratings']][:5000000]
data = Dataset.load_from_df(ratings_for_surprise, reader=Reader(rating_scale=(1, 5)))

In [None]:
# SVD recommender from Surprise, created with the library's default settings.
model=SVD()

In [None]:
# 3-fold cross-validation: the data is split into 3 folds and the model is
# trained and scored once per fold. This only *evaluates* the model (lower RMSE
# is better); it does not search for or select hyper-parameters.
cv_results = cross_validate(model, data, measures=['RMSE'], cv=3)

# Fit on all loaded ratings so that the model.predict(...) calls made afterwards
# use a model trained on the full dataset rather than on the training portion
# of a single cross-validation fold.
model.fit(data.build_full_trainset())

# Keep the cross-validation scores as the cell's displayed result.
cv_results

{'test_rmse': array([682.38166682, 682.62281278, 682.62361919]),
 'fit_time': (99.28128671646118, 98.88892030715942, 109.0086362361908),
 'test_time': (27.498905897140503, 26.78399133682251, 26.00049376487732)}

# **Recommendations**

In [None]:
# Display the filtered ratings frame again for reference.
netflix

Unnamed: 0,Customer_id,Ratings,Movie_id
696,712664,5.0,3
697,1331154,4.0,3
698,2632461,3.0,3
699,44937,5.0,3
700,656399,4.0,3
...,...,...,...
24056842,1055714,5.0,4496
24056843,2643029,4.0,4496
24056844,267802,4.0,4496
24056845,1559566,3.0,4496


In [None]:
# Pre-filter the titles table down to movies that can be recommended.
# rejected_movies only lists ids that occur in the ratings data, so removing
# those ids alone leaves every title that has no ratings at all; such titles
# end up with one identical estimate per customer. Keep only titles that are
# not rejected AND still have ratings in `netflix`.
rated_movie_ids = netflix['Movie_id'].unique()
not_rejected = ~df['Movie_id'].isin(rejected_movies)
has_ratings = df['Movie_id'].isin(rated_movie_ids)
dff = df[not_rejected & has_ratings].copy()
dff

Unnamed: 0,Movie_id,Year,Name
2,3,1997.0,Character
4,5,2004.0,The Rise and Fall of ECW
5,6,1997.0,Sick
7,8,2004.0,What the #$*! Do We Know!?
15,16,1996.0,Screamers
...,...,...,...
17765,17766,2002.0,Where the Wild Things Are and Other Maurice Se...
17766,17767,2004.0,Fidel Castro: American Experience
17767,17768,2000.0,Epoch
17768,17769,2003.0,The Company


In [None]:
# Recommendations are produced per customer: start from an independent copy of
# the filtered titles table for customer 1331154.
df_1331154 = dff.copy(deep=True)

In [None]:
# Add an Estimate_score column: the model's predicted rating (the `.est` field
# of the prediction) by customer 1331154 for each movie id in the table.
customer_id = 1331154
df_1331154['Estimate_score'] = df_1331154['Movie_id'].map(
    lambda movie_id: model.predict(customer_id, movie_id).est
)
df_1331154

Unnamed: 0,Movie_id,Year,Name,Estimate_score
2,3,1997.0,Character,3.941151
4,5,2004.0,The Rise and Fall of ECW,3.611672
5,6,1997.0,Sick,3.229288
7,8,2004.0,What the #$*! Do We Know!?,3.109587
15,16,1996.0,Screamers,3.043015
...,...,...,...,...
17765,17766,2002.0,Where the Wild Things Are and Other Maurice Se...,3.678690
17766,17767,2004.0,Fidel Castro: American Experience,3.678690
17767,17768,2000.0,Epoch,3.678690
17768,17769,2003.0,The Company,3.678690


In [None]:
# Order the movies from highest to lowest predicted rating.
df_1331154 = df_1331154.sort_values(
    by='Estimate_score',
    ascending=False,
)
df_1331154

Unnamed: 0,Movie_id,Year,Name,Estimate_score
67,68,2004.0,Invader Zim,4.622165
241,242,1995.0,Neon Genesis Evangelion: The End of Evangelion,4.451302
250,251,2000.0,Midsomer Murders: Strangler's Wood,4.422190
240,241,1959.0,North by Northwest,4.400847
75,76,1952.0,I Love Lucy: Season 2,4.290379
...,...,...,...,...
23,24,1981.0,My Bloody Valentine,2.712367
126,127,1987.0,Fatal Beauty,2.673847
25,26,2004.0,Never Die Alone,2.638090
224,225,2004.0,The Cookout,2.609705


In [None]:
# Same procedure for customer 1559566: copy the filtered titles table, then
# store the model's predicted rating for every movie in Estimate_score.
df_1559566 = dff.copy(deep=True)
df_1559566['Estimate_score'] = df_1559566['Movie_id'].map(
    lambda movie_id: model.predict(1559566, movie_id).est
)
df_1559566

Unnamed: 0,Movie_id,Year,Name,Estimate_score
2,3,1997.0,Character,3.808457
4,5,2004.0,The Rise and Fall of ECW,3.881053
5,6,1997.0,Sick,2.994684
7,8,2004.0,What the #$*! Do We Know!?,3.200631
15,16,1996.0,Screamers,3.213220
...,...,...,...,...
17765,17766,2002.0,Where the Wild Things Are and Other Maurice Se...,3.654081
17766,17767,2004.0,Fidel Castro: American Experience,3.654081
17767,17768,2000.0,Epoch,3.654081
17768,17769,2003.0,The Company,3.654081


In [None]:
# Show the movies for customer 1559566 from highest to lowest predicted rating.
# The sorted result is only displayed; df_1559566 itself is left unsorted.
df_1559566.sort_values(
    by='Estimate_score',
    ascending=False,
)

Unnamed: 0,Movie_id,Year,Name,Estimate_score
269,270,2001.0,Sex and the City: Season 4,4.881134
222,223,2003.0,Chappelle's Show: Season 1,4.550713
266,267,1994.0,Touched by an Angel: Season 1,4.495164
32,33,2000.0,Aqua Teen Hunger Force: Vol. 1,4.331779
105,106,2004.0,Stevie Ray Vaughan and Double Trouble: Live at...,4.317419
...,...,...,...,...
207,208,1947.0,The Three Stooges: Sing a Song of Six Pants,2.695814
25,26,2004.0,Never Die Alone,2.593592
180,181,2004.0,The Last Shot,2.523234
224,225,2004.0,The Cookout,2.335688
