# Importing packages

**Configuration packages**

In [144]:
import warnings
warnings.filterwarnings('ignore')

**Classical packages**

In [146]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

**Machine Learning packages**

In [148]:
#####

## Exploration Data Analysis

**Loading the dataset**

In [151]:
netflix_data= pd.read_csv(r'combined_data.zip', header=None, names=['Cust_ID', 'Ratings'], usecols=[0,1])

**Reading the dadaset**

In [153]:
#top 10 rows of the data
netflix_data.head()

Unnamed: 0,Cust_ID,Ratings
0,1:,
1,1488844,3.0
2,822109,5.0
3,885013,4.0
4,30878,4.0


In [154]:
#last 10 rows of the data
netflix_data.tail()

Unnamed: 0,Cust_ID,Ratings
24058258,2591364,2.0
24058259,1791000,2.0
24058260,512536,5.0
24058261,988963,3.0
24058262,1704416,3.0


**Note:** *As we observed here are two columns in this dataset- customer ID and Ratings. Each section's group rows and each group has been named under `cust_ID` as `1:, 2:, 3:` are nothing but movie count, and corresponding to the group names, we have null values under `Ratings` columns. So null values are nothing but the total number of movies we have. So before removing null values, we need to seperate the count of movies into a new column at first.*

### Working on `customer ID` and `Ratings`

**calculating the unique number of customers**

In [158]:
#Checking the data type of each column
netflix_data.dtypes

Cust_ID     object
Ratings    float64
dtype: object

In [159]:
#Checking missing values
print(f'Cust_ID: {netflix_data.isna().sum()[0]}, Ratings: {netflix_data.isna().sum()[1]}')

Cust_ID: 0, Ratings: 4499


In [160]:
#Shape of the data
print(f'Rows: {netflix_data.shape[0]}, Cols: {netflix_data.shape[1]}')

Rows: 24058263, Cols: 2


In [161]:
#Unique number of customers
netflix_data['Cust_ID'].nunique()

475257

In [162]:
#storing data of the count of total movies and the count of unique customers (including all the movie IDs)
movie_count= netflix_data.isna().sum()[1]
cust_count= netflix_data['Cust_ID'].nunique()
print(f'movie_count: {movie_count}, cust_count: {cust_count} ')

movie_count: 4499, cust_count: 475257 


In [163]:
#unique number of customers
print(f'Unique number of customers (excluding all the movie IDs)-> cust_count - movie_count -> {cust_count-movie_count}')

Unique number of customers (excluding all the movie IDs)-> cust_count - movie_count -> 470758


**Note:** *As all the group names are of object type, the cust_id column is set as an object by default. So, our next step is to separate the count of movies from the cust_ID column by creating a new column and typecast the data type of the customer ID column from object to int.*

**Removing the count of movies from customer ID column into the new column**

In [166]:
#seperating the count of movies into a list
count= None
movie_ID=[]

for cust_id in netflix_data['Cust_ID']:
    if ':' in cust_id:
        count= int(cust_id.replace(':',''))
    movie_ID.append(count)

In [167]:
#Inserting the list of movie_IDs into a separate column
netflix_data['movie_ID']= movie_ID

In [168]:
netflix_data.head()

Unnamed: 0,Cust_ID,Ratings,movie_ID
0,1:,,1
1,1488844,3.0,1
2,822109,5.0,1
3,885013,4.0,1
4,30878,4.0,1


In [169]:
#Removing all the null values from the data
netflix_data= netflix_data[netflix_data['Ratings'].notnull()]

In [170]:
netflix_data.head()

Unnamed: 0,Cust_ID,Ratings,movie_ID
1,1488844,3.0,1
2,822109,5.0,1
3,885013,4.0,1
4,30878,4.0,1
5,823519,3.0,1


In [171]:
#typecasting the data type of customer ID column from object to int type
netflix_data['Cust_ID']= netflix_data['Cust_ID'].astype(int)

In [172]:
netflix_data.dtypes

Cust_ID       int32
Ratings     float64
movie_ID      int64
dtype: object

In [173]:
netflix_data.shape

(24053764, 3)

### Grouping the count of `Ratings` for each movie

In [175]:
#Creating a new DataFrame by grouping the movieID with the count of ratings
netflix_movie_summary= netflix_data.groupby(by='movie_ID')['Ratings'].agg(['count']).reset_index()

In [176]:
netflix_movie_summary.head()

Unnamed: 0,movie_ID,count
0,1,547
1,2,145
2,3,2012
3,4,142
4,5,1140


### Count of top rated movies

**Creating a benchmark value for the recommendation**

In [179]:
benchmark= np.round(netflix_movie_summary['count'].quantile(0.6), 0)
benchmark

908.0

**Note :-** *`908` is the 60th percentile of all the values of count column.*

In [181]:
#Keeping records of drop movie list
drop_movie_list= netflix_movie_summary[netflix_movie_summary['count'] < benchmark]

In [182]:
drop_movie_list

Unnamed: 0,movie_ID,count
0,1,547
1,2,145
3,4,142
6,7,93
8,9,95
...,...,...
4493,4494,130
4494,4495,614
4496,4497,714
4497,4498,269


In [183]:
len(drop_movie_list)

2699

In [184]:
len(netflix_movie_summary['movie_ID'])

4499

In [185]:
4499-2699

1800

**Count of top rated reviews of all the movies is `1800`.**

**Note :-** *`2699` rows have reviews below the benchmark. So we deducted that with the total movie count and hence we have got total reviews above the benchmark.*

### Grouping the count of `Ratings` for each customer

In [189]:
#Creating a new DataFrame by grouping the movieID with the count of ratings
netflix_cust_summary= netflix_data.groupby(by='Cust_ID')['Ratings'].agg(['count']).reset_index()

In [190]:
netflix_cust_summary.head()

Unnamed: 0,Cust_ID,count
0,6,153
1,7,195
2,8,21
3,10,49
4,25,4


### Count of maximum review of customers

**Creating a benchmark value for the maximum reviews**

In [193]:
benchmark= np.round(netflix_cust_summary['count'].quantile(0.6), 0)
benchmark

36.0

**Note :-** *`908` is the 60th percentile of all the values of count column.*

In [195]:
#Keeping records of drop movie list
drop_cust_list= netflix_cust_summary[netflix_cust_summary['count'] < benchmark]

In [196]:
drop_cust_list

Unnamed: 0,Cust_ID,count
2,8,21
4,25,4
5,33,11
9,83,10
11,94,27
...,...,...
470750,2649384,7
470752,2649401,30
470753,2649404,12
470754,2649409,10


In [197]:
len(drop_cust_list)

282042

In [198]:
netflix_data.shape[0]

24053764

In [199]:
24053764-282042

23771722

**Count of maximum review of customers is `2,37,71,722`.**

**Note :-** *`2,82,042` rows have a review of customers below the benchmark. So we deducted that with the total rows of customers and hence we have got the count of maximum review of customers.*

### Preparing the `final dataset` for `recommendation`.

In [203]:
drop_movie_list

Unnamed: 0,movie_ID,count
0,1,547
1,2,145
3,4,142
6,7,93
8,9,95
...,...,...
4493,4494,130
4494,4495,614
4496,4497,714
4497,4498,269


In [204]:
drop_cust_list

Unnamed: 0,Cust_ID,count
2,8,21
4,25,4
5,33,11
9,83,10
11,94,27
...,...,...
470750,2649384,7
470752,2649401,30
470753,2649404,12
470754,2649409,10


In [205]:
drop_movie_list.dtypes

movie_ID    int64
count       int64
dtype: object

In [206]:
drop_cust_list.dtypes

Cust_ID    int32
count      int64
dtype: object

In [207]:
netflix_data.dtypes

Cust_ID       int32
Ratings     float64
movie_ID      int64
dtype: object

In [208]:
#Fetching out those rows that are above the benchmark
netflix_data= netflix_data[~netflix_data['movie_ID'].isin(drop_movie_list)]
netflix_data= netflix_data[~netflix_data['Cust_ID'].isin(drop_cust_list)]

In [209]:
netflix_data

Unnamed: 0,Cust_ID,Ratings,movie_ID
1,1488844,3.0,1
2,822109,5.0,1
3,885013,4.0,1
4,30878,4.0,1
5,823519,3.0,1
...,...,...,...
24058258,2591364,2.0,4499
24058259,1791000,2.0,4499
24058260,512536,5.0,4499
24058261,988963,3.0,4499


**Note :-** **No Output**

# Model Building

In [212]:
# Importing movie title dataset
df_title= pd.read_csv('movie_titles.csv', encoding= 'ISO-8859-1', header=None, names=['movie_ID', 'year', 'names'], usecols=[0,1,2])

In [213]:
# Reading movie title dataset
df_title

Unnamed: 0,movie_ID,year,names
0,1,2003.0,Dinosaur Planet
1,2,2004.0,Isle of Man TT 2004 Review
2,3,1997.0,Character
3,4,1994.0,Paula Abdul's Get Up & Dance
4,5,2004.0,The Rise and Fall of ECW
...,...,...,...
17765,17766,2002.0,Where the Wild Things Are and Other Maurice Se...
17766,17767,2004.0,Fidel Castro: American Experience
17767,17768,2000.0,Epoch
17768,17769,2003.0,The Company


In [214]:
#Checking dtype of movie title dataset
df_title.dtypes

movie_ID      int64
year        float64
names        object
dtype: object

## Using `Surprise package` for model building

In [216]:
#installing surprise package
! pip install scikit-surprise



In [217]:
# importing surprise package
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate

`SVD (Singular value Decomposition)` is used for recommendation engine etc

*The surprise library in Python is used for building and analyzing recommender systems. It provides tools to work with collaborative
 filtering algorithms,
which are commonly used to recommend items (such as movies, books, or products) to users based on their past interactions or preferences.*

In [219]:
# making instance of reader
reader= Reader()

In [220]:
# converting top 100k rows of netflix_data into SVD readable format
data= Dataset.load_from_df(netflix_data[['Cust_ID', 'movie_ID', 'Ratings']][:100000], reader)

In [221]:
# trying to access the data directly
data

<surprise.dataset.DatasetAutoFolds at 0x25631b8a690>

In the above code, we are using `load_drom_df` method from the `Dataset` module which will load these data using the reader object in `SVD readable format`. We cannot access this data directly. This data is only readable by `SVD`.

In [223]:
# making instance of SVD class
model= SVD()

In [224]:
# cross validation of the model
cross_validate(model, data, measures=['RMSE'], cv=5)

{'test_rmse': array([1.05041348, 1.0520367 , 1.05651731, 1.05711496, 1.04421618]),
 'fit_time': (2.2140631675720215,
  1.133246898651123,
  1.2220573425292969,
  1.3380494117736816,
  1.4674568176269531),
 'test_time': (0.38111042976379395,
  0.08752632141113281,
  0.12255072593688965,
  0.10651707649230957,
  0.43009066581726074)}

This above line of code is using the cross_validate function from the sklearn.model_selection module to evaluate the performance of a machine learning model on a dataset.

model -> The machine learning model to be evaluated.
data -> The dataset to be used for evaluation.
measures -> specifies the performance metrics to be used for evaluation.
cv=5 -> specifies the data will be divided into 5 subparts and the model will be trained and evaluated in each part.


### Recommendation

In [227]:
netflix_data

Unnamed: 0,Cust_ID,Ratings,movie_ID
1,1488844,3.0,1
2,822109,5.0,1
3,885013,4.0,1
4,30878,4.0,1
5,823519,3.0,1
...,...,...,...
24058258,2591364,2.0,4499
24058259,1791000,2.0,4499
24058260,512536,5.0,4499
24058261,988963,3.0,4499


In [228]:
df_title

Unnamed: 0,movie_ID,year,names
0,1,2003.0,Dinosaur Planet
1,2,2004.0,Isle of Man TT 2004 Review
2,3,1997.0,Character
3,4,1994.0,Paula Abdul's Get Up & Dance
4,5,2004.0,The Rise and Fall of ECW
...,...,...,...
17765,17766,2002.0,Where the Wild Things Are and Other Maurice Se...
17766,17767,2004.0,Fidel Castro: American Experience
17767,17768,2000.0,Epoch
17768,17769,2003.0,The Company


In [229]:
user_885013= df_title.copy(deep=True)

In [230]:
user_885013

Unnamed: 0,movie_ID,year,names
0,1,2003.0,Dinosaur Planet
1,2,2004.0,Isle of Man TT 2004 Review
2,3,1997.0,Character
3,4,1994.0,Paula Abdul's Get Up & Dance
4,5,2004.0,The Rise and Fall of ECW
...,...,...,...
17765,17766,2002.0,Where the Wild Things Are and Other Maurice Se...
17766,17767,2004.0,Fidel Castro: American Experience
17767,17768,2000.0,Epoch
17768,17769,2003.0,The Company


**Note :-** *These dataset also contains movies less than 908, the benchmark value which has to be dropped*

In [232]:
#Removies all the movies that are less than the 908 reviews
user_885013= user_885013[~user_885013['movie_ID'].isin(drop_movie_list)]

In [233]:
user_885013

Unnamed: 0,movie_ID,year,names
0,1,2003.0,Dinosaur Planet
1,2,2004.0,Isle of Man TT 2004 Review
2,3,1997.0,Character
3,4,1994.0,Paula Abdul's Get Up & Dance
4,5,2004.0,The Rise and Fall of ECW
...,...,...,...
17765,17766,2002.0,Where the Wild Things Are and Other Maurice Se...
17766,17767,2004.0,Fidel Castro: American Experience
17767,17768,2000.0,Epoch
17768,17769,2003.0,The Company


In [313]:
# based on estimation scored after predicitng the model data, the estimation score will be in the range of 1-5
# if the score is more than 3.5 we can recomend those movies to user
# SVD as it is a recomendation engione, the output of the model ie, predicted data is the estaimation score on which it wil decided the movied
# to be recomended or Not

# here in this code we are creating a new coulmns infront of each movie name on which we decide to recomend a movie

#Note: this is only for one user 1331154
user_885013['Estimated_score']= user_885013['movie_ID'].apply(lambda x: model.predict(885013,x).est)

In [314]:
user_885013

Unnamed: 0,movie_ID,year,names,Estimated_score
0,1,2003.0,Dinosaur Planet,3.858903
1,2,2004.0,Isle of Man TT 2004 Review,3.640710
2,3,1997.0,Character,4.089825
3,4,1994.0,Paula Abdul's Get Up & Dance,3.037665
4,5,2004.0,The Rise and Fall of ECW,4.807227
...,...,...,...,...
17765,17766,2002.0,Where the Wild Things Are and Other Maurice Se...,3.604853
17766,17767,2004.0,Fidel Castro: American Experience,3.604853
17767,17768,2000.0,Epoch,3.604853
17768,17769,2003.0,The Company,3.604853


In [315]:
user_885013.sort_values(by='Estimated_score', ascending=False).head()

Unnamed: 0,movie_ID,year,names,Estimated_score
4,5,2004.0,The Rise and Fall of ECW,4.807227
12,13,2003.0,Lord of the Rings: The Return of the King: Ext...,4.475403
24,25,1997.0,Inspector Morse 31: Death Is Now My Neighbour,4.242465
27,28,2002.0,Lilo and Stitch,4.194491
2,3,1997.0,Character,4.089825


**Note:-** *These are the top 5 movies that should be recommended to the user*