# Movie Recommendation using Netflix Movie Reviews




This project builds a movie recommendation system from Netflix movie ratings. The dataset contains 17,337,458 ratings given by 143,458 users to 1,350 movies. Each rating is an integer from 1 to 5.

### *Please upvote if you find this notebook helpful!*
So let's get started


**Table of Contents**



#### 1.  Load Rating Data
#### 2.  Load Movie Data
#### 3.  Analyze Data
#### 4.  Recommendation Model
#### 4.1 Collaborative Filtering - SVD
#### 4.2 Pearson Correlation Method

In [None]:
import pandas as pd
import numpy as np
from surprise import Reader, Dataset, SVD
from surprise import accuracy
from surprise.model_selection import train_test_split
import matplotlib.pyplot as plt
import warnings; warnings.simplefilter('ignore')
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, plot, iplot

# 1. Load Rating Data

In [None]:
# Read the ratings CSV into `df`; later cells use its User_ID, Movie_ID and
# Rating columns.
df = pd.read_csv('../input/netflix-movie-rating-dataset/Netflix_Dataset_Rating.csv')
# Bare expression: the notebook renders the frame as the cell output.
df

In [None]:
# Show the dtype pandas inferred for each column of the ratings frame.
df.dtypes

In [None]:
# Print pandas' summary of the ratings frame (columns, non-null counts, dtypes).
df.info()

In [None]:
# Summary statistics of the Rating column; the cast to int truncates the
# fractional part of mean, std and the quartiles.
df['Rating'].describe().astype('int')

In [None]:
# Number of distinct values in each column of the ratings frame.
print("Unique Values :\n",df.nunique())

# 2. Load Movie Data

In [None]:
# Read the movie metadata CSV into `df_title`; later cells use its Movie_ID,
# Name and Year columns.
df_title = pd.read_csv('../input/netflix-movie-rating-dataset/Netflix_Dataset_Movie.csv')
# Bare expression: the notebook renders the frame as the cell output.
df_title

In [None]:
# Show the dtype pandas inferred for each column of the movie frame.
df_title.dtypes

In [None]:
# Print pandas' summary of the movie frame (columns, non-null counts, dtypes).
df_title.info()

In [None]:
# Summary statistics of the Year column; the cast to int truncates the
# fractional part of mean, std and the quartiles.
df_title['Year'].describe().astype('int')

In [None]:
# Number of distinct values in each column of the movie frame.
print("Unique Values :\n",df_title.nunique())

# 3. Analyze Data

In [None]:
# Number of ratings each user has given, most active users first.
no_of_rated_products_per_users = df.groupby(by='User_ID')['Rating'].count().sort_values(ascending=False)
# Display the five users with the most ratings.
no_of_rated_products_per_users.head()

In [None]:
# Summary statistics of the per-user rating counts.
no_of_rated_products_per_users.describe()


In [None]:
# Per-user rating count at every quantile 0.00, 0.01, ..., 1.00.
# interpolation='higher' returns an observed count instead of interpolating
# between two neighbouring observations.
quantiles = no_of_rated_products_per_users.quantile(np.arange(0,1.01,0.01), interpolation='higher')

plt.figure(figsize=(10,10))
plt.title("Quantiles and their Values")
quantiles.plot()
# Mark every 5th and every 25th quantile on top of the line.
plt.scatter(x=quantiles.index[::5], y=quantiles.values[::5], c='orange', label="quantiles with 0.05 intervals")
plt.scatter(x=quantiles.index[::25], y=quantiles.values[::25], c='m', label = "quantiles with 0.25 intervals")
plt.ylabel('No of ratings given by user')
# The x axis carries the quantile itself (the Series index); the value at that
# quantile is what the y axis shows.
plt.xlabel('Quantile')
plt.legend(loc='best')
plt.show()

In [None]:
# Number of ratings each movie has received, most rated movies first.
no_of_rated_products_per_movies = df.groupby(by='Movie_ID')['Rating'].count().sort_values(ascending=False)
# Display the five movies with the most ratings.
no_of_rated_products_per_movies.head()

In [None]:
# Summary statistics of the per-movie rating counts.
no_of_rated_products_per_movies.describe()

In [None]:
# Per-movie rating count at every quantile 0.00, 0.01, ..., 1.00.
# interpolation='higher' returns an observed count instead of interpolating
# between two neighbouring observations.
quantiles = no_of_rated_products_per_movies.quantile(np.arange(0,1.01,0.01), interpolation='higher')

plt.figure(figsize=(10,10))
plt.title("Quantiles and their Values")
quantiles.plot()
# Mark every 5th and every 25th quantile on top of the line.
plt.scatter(x=quantiles.index[::5], y=quantiles.values[::5], c='orange', label="quantiles with 0.05 intervals")
plt.scatter(x=quantiles.index[::25], y=quantiles.values[::25], c='m', label = "quantiles with 0.25 intervals")
plt.ylabel('No of ratings given to movies')
# The x axis carries the quantile itself (the Series index); the value at that
# quantile is what the y axis shows.
plt.xlabel('Quantile')
plt.legend(loc='best')
plt.show()

In [None]:
# Movie metadata keyed by Movie_ID, used for joins by the correlation-based
# recommender.
df__title = df_title.set_index('Movie_ID')

# Per-movie statistics: number of ratings received and their mean.
f = ['count', 'mean']
df_movie_summary = df.groupby('Movie_ID')['Rating'].agg(f)
df_movie_summary.index = df_movie_summary.index.map(int)

# Movies whose rating count is below the 70th percentile are collected in
# drop_movie_list; the SVD recommender excludes them from its candidates.
movie_benchmark = round(df_movie_summary['count'].quantile(q=0.7), 0)
drop_movie_list = df_movie_summary.loc[df_movie_summary['count'].lt(movie_benchmark)].index

In [None]:
# Number of ratings for each distinct rating value.
data = df['Rating'].value_counts()

# Bar labels show each rating value's share of all ratings, in percent.
trace = go.Bar(x = data.index,y = data.values,text = ['{:.1f} %'.format(val) for val in (data.values / df.shape[0] * 100)],textposition = 'auto')

# The bars count ratings, not movies, so the y axis is labelled accordingly.
layout = dict(title = 'Distribution of {} Movie-ratings'.format(df.shape[0]), xaxis = dict(title = 'Rating'), yaxis = dict(title = 'Number of ratings'))

fig = go.Figure(data=[trace], layout=layout)

iplot(fig)

In [None]:
# Number of movies per release year.
data = df_title['Year'].value_counts()

# Bar labels show each year's share of all movies, in percent.
trace = go.Bar(x = data.index,y = data.values,text = ['{:.1f} %'.format(val) for val in (data.values / df_title.shape[0] * 100)],textposition = 'auto')

# The title reports the number of movies, so it is taken from df_title
# (one row per movie), not from the ratings frame df.
layout = dict(title = 'Distribution of {} Movie - Year-wise'.format(df_title.shape[0]), xaxis = dict(title = 'Year'), yaxis = dict(title = 'Movies'))

fig = go.Figure(data=[trace], layout=layout)

iplot(fig)

In [None]:
# Movies released in 2004, then re-bound to those movies joined with all of
# their ratings (one row per rating).
df_2004 = df_title[df_title['Year']==2004]
df_2004 = df_2004.merge(df,on='Movie_ID')

# Number of ratings for each rating value among 2004 releases.
data = df_2004['Rating'].value_counts()

# Bar labels show each rating value's share of the 2004 ratings, in percent.
trace = go.Bar(x = data.index,y = data.values,text = ['{:.1f} %'.format(val) for val in (data.values / df_2004.shape[0] * 100)],textposition = 'auto')

# The title reports how many ratings are plotted, i.e. the rows of df_2004,
# not the size of the full ratings frame. The bars count ratings, so the
# y axis is labelled accordingly.
layout = dict(title = 'Distribution of {} Movie-ratings released in 2004'.format(df_2004.shape[0]), xaxis = dict(title = 'Rating'), yaxis = dict(title = 'Number of ratings'))

fig = go.Figure(data=[trace], layout=layout)

iplot(fig)

In [None]:
# Movie_ID with the highest rating count (idxmax returns the first one on ties).
req_MID = df_movie_summary['count'].idxmax()
# All ratings given to that movie.
df_max_ratings = df[df['Movie_ID'] ==req_MID]

# Take the title as a plain string. Series.to_string() is a display formatter
# (some pandas versions pad the value with leading whitespace), so it is not a
# reliable way to extract a value. Fall back to the id if the title is missing.
title_matches = df_title.loc[df_title['Movie_ID']==req_MID, 'Name']
Movie_name_max_raings = str(title_matches.iloc[0]) if not title_matches.empty else 'Movie_ID {}'.format(req_MID)

# Number of ratings for each rating value for this movie.
data = df_max_ratings['Rating'].value_counts()

# Bar labels show each rating value's share of this movie's ratings, in percent.
trace = go.Bar(x = data.index,y = data.values,text = ['{:.1f} %'.format(val) for val in (data.values / df_max_ratings.shape[0] * 100)],textposition = 'auto')

# Explicit spaces around the title so the heading does not depend on padding
# in the extracted name.
layout = dict(title = 'Distribution of Most Rated Movie - {} Movie-Ratings'.format(Movie_name_max_raings))

fig = go.Figure(data=[trace], layout=layout)

iplot(fig)

# 4. Recommendation Model


## 4.1 Collaborative Filtering - SVD

In [None]:
# SVD matrix-factorisation model from Surprise; verbose=True prints progress
# for each of the 10 training epochs.
model = SVD(n_epochs=10,verbose = True)

# Surprise expects the columns in (user, item, rating) order. Reader() uses
# its default rating scale.
data = Dataset.load_from_df(df[['User_ID', 'Movie_ID', 'Rating']], Reader())

# Hold out 30% of the ratings for evaluation; random_state makes the split
# reproducible.
trainset, testset = train_test_split(data, test_size=0.3,random_state=10)

# Fit on the training split only. Do not replace `trainset` with
# data.build_full_trainset() before evaluating: the full trainset contains
# every rating in `testset`, so the RMSE would be measured on data the model
# has already seen and would be optimistically biased.
model.fit(trainset)

In [None]:
# Predict a rating for every entry of the held-out testset.
predictions = model.test(testset)

# Compute the root-mean-squared error of those predictions; verbose=True also
# prints it.
accuracy.rmse(predictions, verbose=True)

In [None]:
def Recommendation(given_user_id,n_movies):
    """Return the movies with the highest SVD-estimated rating for one user.

    Candidates are all rows of the module-level ``df_title`` whose ``Movie_ID``
    is not in ``drop_movie_list``. Each candidate is scored with
    ``model.predict(given_user_id, movie_id).est``.

    Args:
        given_user_id: User id passed to ``model.predict``.
        n_movies: Number of rows to return.

    Returns:
        DataFrame with the remaining ``df_title`` columns (``Movie_ID``
        removed) plus ``Estimated_Rating``, sorted by estimated rating in
        descending order, with a fresh 0-based index.

    Note:
        Movies the user has already rated are not filtered out.
    """
    def estimate(movie_id):
        # Estimated rating of this user for a single movie.
        return model.predict(given_user_id, movie_id).est

    is_candidate = ~df_title['Movie_ID'].isin(drop_movie_list)
    candidates = df_title[is_candidate].copy()
    candidates['Estimated_Rating'] = candidates['Movie_ID'].apply(estimate)

    ranked = candidates.drop(columns=['Movie_ID']).sort_values('Estimated_Rating', ascending=False)
    return ranked.reset_index(drop=True).head(n_movies)

### Movie Recommendation for User - 712664

In [None]:
# Ten highest-estimated movies for user 712664.
Recommendation(712664,10)

### Movie Recommendation for User - 2643029

In [None]:
# Ten highest-estimated movies for user 2643029.
Recommendation(2643029,10)

## 4.2 Pearson Correlation Method

In [None]:
# User x movie rating matrix: one row per User_ID, one column per Movie_ID.
# Cells for movies a user has not rated are NaN.
df_p = pd.pivot_table(df,values='Rating',index='User_ID',columns='Movie_ID')

def Recommend_based_on_movie(movie_title, min_count, top_n=10):
    """Return the movies whose user ratings correlate most with a given movie.

    The Pearson correlation is computed between the target movie's column of
    the user x movie matrix ``df_p`` and every other column. Because the
    target column is correlated with itself, the queried movie normally
    appears in the result with a correlation of 1.0.

    Args:
        movie_title: Exact value of ``Name`` in ``df__title``. If several
            movies share the title, the first match is used.
        min_count: Only movies with strictly more than this many ratings are
            kept.
        top_n: Maximum number of rows to return (default 10, the previously
            hard-coded limit).

    Returns:
        DataFrame with columns ``PearsonR``, ``Name``, ``count`` and ``mean``,
        sorted by ``PearsonR`` in descending order, with a fresh 0-based index.

    Raises:
        IndexError: If no movie named ``movie_title`` exists in ``df__title``.
    """
    matching_ids = df__title.index[df__title['Name'] == movie_title]
    if len(matching_ids) == 0:
        # Same exception type as the bare `[0]` lookup would raise, but with
        # a message that names the missing title.
        raise IndexError("No movie titled {!r} found in df__title".format(movie_title))
    i = int(matching_ids[0])

    target = df_p[i]
    # Correlate every movie column with the target column. Movies with no
    # defined correlation (NaN) are dropped.
    similar_to_target = df_p.corrwith(target)
    corr_target = similar_to_target.dropna().sort_values(ascending=False).to_frame('PearsonR')
    corr_target.index = corr_target.index.map(int)

    # Attach the title and the per-movie rating statistics by Movie_ID.
    corr_target = corr_target.join(df__title).join(df_movie_summary)[['PearsonR', 'Name', 'count', 'mean']]
    corr_target = corr_target[corr_target['count'] > min_count].head(top_n)
    return corr_target.reset_index(drop=True)

### Recommendation based on given movie - "The Education of Little Tree"

In [None]:
# Movies most correlated with "The Education of Little Tree"; min_count=0
# keeps every movie that has at least one rating.
Recommend_based_on_movie("The Education of Little Tree", 0)

### Recommendation based on given movie - "Character"

In [None]:
# Movies most correlated with "Character"; min_count=0 keeps every movie that
# has at least one rating.
Recommend_based_on_movie("Character", 0)