# Movie Recommendation using Netflix Movie Reviews




This project builds a movie recommendation system from Netflix movie ratings. The dataset contains 16,380,166 ratings given by 143,458 users to 1,291 movies. Each rating is an integer from 1 to 5.


**Table of Contents**



#### 1.  Load Rating Data
#### 2.  Load Movie Data
#### 3.  Analyze Data
#### 4.  Recommendation Model
#### 4.1 Collaborative Filtering - SVD

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
! pip install scikit-surprise

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting scikit-surprise
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/772.0 KB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m772.0/772.0 KB[0m [31m30.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp38-cp38-linux_x86_64.whl size=3366578 sha256=cbb97494fde6689160e7d6be30a8e49742e213d4ee85f646e1f8f0d6ab902ccd
  Stored in directory: /root/.cache/pip/wheels/af/db/86/2c18183a80ba05da35bf0fb7417aac5cddbd93bcb1b92fd3ea
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-sur

In [None]:
from surprise import Reader, Dataset, SVD
from surprise import accuracy
from surprise.model_selection import train_test_split

# 1. Load Rating Data

In [None]:
# Load the ratings table from the working directory; the columns used below are
# User_ID, Rating and Movie_ID.
df = pd.read_csv('Netflix_Dataset_Rating.csv')
# Bare expression: shows pandas' truncated preview of the frame.
df

Unnamed: 0,User_ID,Rating,Movie_ID
0,712664,5,3
1,1331154,4,3
2,2632461,3,3
3,44937,5,3
4,656399,4,3
...,...,...,...
16380161,1841951,3,4302
16380162,2562900,3,4302
16380163,1588735,5,4302
16380164,464989,2,4302


In [None]:
# Inspect the dtype of each ratings column.
df.dtypes

User_ID     int64
Rating      int64
Movie_ID    int64
dtype: object

In [None]:
# Print the row count, column dtypes and memory footprint of the ratings frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16380166 entries, 0 to 16380165
Data columns (total 3 columns):
 #   Column    Dtype
---  ------    -----
 0   User_ID   int64
 1   Rating    int64
 2   Movie_ID  int64
dtypes: int64(3)
memory usage: 374.9 MB


In [None]:
# Summary statistics of the Rating column.
# NOTE: astype('int') truncates every statistic toward zero, so the fractional part of
# the mean and std is not shown.
df['Rating'].describe().astype('int')

count    16380166
mean            3
std             1
min             1
25%             3
50%             4
75%             4
max             5
Name: Rating, dtype: int64

In [None]:
# Count the distinct users, rating values and movies in the ratings table.
# sep="\n" puts the heading on its own line; with print()'s default " " separator the
# first row of the Series was indented by one space and misaligned with the rest.
print("Unique Values :", df.nunique(), sep="\n")

Unique Values :
 User_ID     143458
Rating           5
Movie_ID      1291
dtype: int64


# 2. Load Movie Data

In [None]:
# Load the movie catalogue from the working directory; the columns used below are
# Movie_ID, Year and Name.
df_title = pd.read_csv('Netflix_Dataset_Movie.csv')
# Bare expression: shows pandas' truncated preview of the frame.
df_title

Unnamed: 0,Movie_ID,Year,Name
0,1,2003,Dinosaur Planet
1,2,2004,Isle of Man TT 2004 Review
2,3,1997,Character
3,4,1994,Paula Abdul's Get Up & Dance
4,5,2004,The Rise and Fall of ECW
...,...,...,...
17765,17766,2002,Where the Wild Things Are and Other Maurice Se...
17766,17767,2004,Fidel Castro: American Experience
17767,17768,2000,Epoch
17768,17769,2003,The Company


In [None]:
# Inspect the dtype of each catalogue column.
df_title.dtypes

Movie_ID     int64
Year         int64
Name        object
dtype: object

In [None]:
# Print the row count, non-null counts, dtypes and memory footprint of the catalogue.
df_title.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17770 entries, 0 to 17769
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Movie_ID  17770 non-null  int64 
 1   Year      17770 non-null  int64 
 2   Name      17770 non-null  object
dtypes: int64(2), object(1)
memory usage: 416.6+ KB


In [None]:
# Summary statistics of the release year.
# NOTE: astype('int') truncates every statistic toward zero (including mean and std).
df_title['Year'].describe().astype('int')

count    17770
mean      1990
std         16
min       1915
25%       1985
50%       1997
75%       2002
max       2005
Name: Year, dtype: int64

In [None]:
# Count the distinct movie ids, release years and names in the catalogue.
# sep="\n" puts the heading on its own line; with print()'s default " " separator the
# first row of the Series was indented by one space and misaligned with the rest.
print("Unique Values :", df_title.nunique(), sep="\n")

Unique Values :
 Movie_ID    17770
Year           91
Name        17297
dtype: int64


# 3. Analyze Data

In [None]:
# Number of ratings submitted by each user, most active users first.
# (The variable name says "products"; each rated item in this dataset is a movie.)
no_of_rated_products_per_users = df.groupby(by='User_ID')['Rating'].count().sort_values(ascending=False)
# Show the five users with the most ratings.
no_of_rated_products_per_users.head()

User_ID
305344     1284
387418     1279
2439493    1266
2118461    1247
1664010    1201
Name: Rating, dtype: int64

In [None]:
# Distribution of the per-user rating counts.
no_of_rated_products_per_users.describe()

count    143458.000000
mean        114.180917
std          76.317957
min           5.000000
25%          63.000000
50%          89.000000
75%         139.000000
max        1284.000000
Name: Rating, dtype: float64

In [None]:
# Number of ratings received by each movie, most-rated movies first.
no_of_rated_products_per_movies = df.groupby(by='Movie_ID')['Rating'].count().sort_values(ascending=False)
# Show the five movies with the most ratings.
no_of_rated_products_per_movies.head()

Movie_ID
1905    117075
2452    102721
571     101450
3860     98545
2862     95053
Name: Rating, dtype: int64

In [None]:
# Distribution of the per-movie rating counts.
no_of_rated_products_per_movies.describe()

count      1291.000000
mean      12687.967467
std       17519.555423
min        1042.000000
25%        2614.000000
50%        5222.000000
75%       14708.000000
max      117075.000000
Name: Rating, dtype: float64

In [None]:
# Per-movie rating count and mean rating, indexed by Movie_ID.
f = ['count','mean']
df_movie_summary = df.groupby('Movie_ID')['Rating'].agg(f)
# Convert every index label with int().
df_movie_summary.index = df_movie_summary.index.map(int)
# Popularity cut-off: the 70th percentile of the per-movie rating counts, rounded.
movie_benchmark = round(df_movie_summary['count'].quantile(0.7),0)
# Ids of rated movies whose rating count is below the cut-off; Recommendation() excludes them.
drop_movie_list = df_movie_summary[df_movie_summary['count'] < movie_benchmark].index

# Catalogue re-indexed by Movie_ID.
# NOTE(review): `df__title` (double underscore) is not read by any cell visible in this
# notebook; Recommendation() uses `df_title` - confirm whether this copy is still needed.
df__title = df_title.set_index('Movie_ID')

# 4. Recommendation Model


## 4.1 Collaborative Filtering - SVD

In [None]:
# Wrap the ratings for Surprise. The 1-5 scale (Reader's default) is stated explicitly
# because it matches the min/max of the Rating column.
data = Dataset.load_from_df(df[['User_ID', 'Movie_ID', 'Rating']], Reader(rating_scale=(1, 5)))

# Hold out 30% of the ratings for evaluation; the split is seeded so it is repeatable.
trainset, testset = train_test_split(data, test_size=0.3, random_state=10)

# Fit on the training split ONLY. The previous version overwrote `trainset` with
# data.build_full_trainset(), which also contains every rating in `testset`, so the RMSE
# computed on `testset` was measured on data the model had already seen.
# random_state makes the SVD factor initialisation reproducible across runs.
# If a model trained on all ratings is wanted for serving, refit a separate instance on
# data.build_full_trainset() after the evaluation instead of reusing this one.
model = SVD(random_state=10)

model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fc113044610>

In [None]:
# Score the fitted model on the held-out ratings.
predictions = model.test(testset)

# Root-mean-squared error between estimated and true ratings. It only measures
# generalisation if `model` was fitted without the ratings contained in `testset`.
accuracy.rmse(predictions)

RMSE: 0.7361


0.7360989994351077

In [None]:
def Recommendation(given_user_id, n_movies, exclude_rated=False):
    """Return the ``n_movies`` titles with the highest estimated rating for a user.

    Reads the notebook-level objects ``df_title`` (catalogue), ``df_movie_summary``
    (per-movie rating counts), ``drop_movie_list`` (ids below the popularity
    cut-off), ``model`` (fitted Surprise model) and, when ``exclude_rated`` is
    true, ``df`` (ratings).

    Parameters
    ----------
    given_user_id : int
        Raw user id, as it appears in the ``User_ID`` column of the ratings.
    n_movies : int
        Number of rows to return (passed to ``DataFrame.head``).
    exclude_rated : bool, default False
        When true, movies the user has already rated are removed from the
        candidates. The default keeps the previous behaviour.

    Returns
    -------
    pandas.DataFrame
        Columns ``Year``, ``Name`` and ``Estimated_Rating``, sorted by
        ``Estimated_Rating`` descending, with a fresh 0..n-1 index.
    """
    movie_ids = df_title['Movie_ID']

    # Candidates are movies that have rating data AND meet the popularity cut-off.
    # The catalogue is far larger than the set of rated movies; for an item the model
    # has never seen, SVD can only return a baseline estimate with no item signal, so
    # scoring those titles is wasted work and can surface meaningless recommendations.
    is_candidate = movie_ids.isin(df_movie_summary.index) & ~movie_ids.isin(drop_movie_list)

    if exclude_rated:
        already_rated = df.loc[df['User_ID'] == given_user_id, 'Movie_ID']
        is_candidate &= ~movie_ids.isin(already_rated)

    # Explicit copy: a column is added below, and assigning onto a filtered view of
    # df_title can raise SettingWithCopyWarning.
    candidates = df_title.loc[is_candidate].copy()

    candidates['Estimated_Rating'] = [
        model.predict(given_user_id, movie_id).est
        for movie_id in candidates['Movie_ID']
    ]

    return (
        candidates
        .drop(columns='Movie_ID')
        .sort_values('Estimated_Rating', ascending=False)
        .reset_index(drop=True)
        .head(n_movies)
    )

### Movie Recommendation for User - 712664

In [None]:
# Ten highest-estimated titles for user 712664.
Recommendation(712664,10)

Unnamed: 0,Year,Name,Estimated_Rating
0,1974,The Godfather,5.0
1,1954,Seven Samurai,5.0
2,1992,Reservoir Dogs,5.0
3,1991,The Simpsons: Season 3,4.950384
4,1978,The Deer Hunter,4.891069
5,1961,The Hustler,4.890522
6,1990,The Simpsons: Treehouse of Horror,4.871523
7,1949,The Third Man,4.853952
8,1989,The Simpsons: Season 1,4.847204
9,1975,Jaws,4.8335


### Movie Recommendation for User - 2643029

In [None]:
# Five highest-estimated titles for user 2643029.
Recommendation(2643029,5)

Unnamed: 0,Year,Name,Estimated_Rating
0,2004,Eternal Sunshine of the Spotless Mind,4.79006
1,2003,Whale Rider,4.744551
2,1999,Being John Malkovich,4.681243
3,2002,Spirited Away,4.59376
4,2004,Napoleon Dynamite,4.571332
