# Netflix Recommendation Engine

In [2]:
import numpy as np

In [3]:
import pandas as pd

In [4]:
from sklearn.feature_extraction import text

In [5]:
from sklearn.metrics.pairwise import cosine_similarity

In [8]:
data = pd.read_csv("Data set/netflixData.csv")
print(data.head())

                                Show Id                          Title  \
0  cc1b6ed9-cf9e-4057-8303-34577fb54477                       (Un)Well   
1  e2ef4e91-fb25-42ab-b485-be8e3b23dedb                         #Alive   
2  b01b73b7-81f6-47a7-86d8-acb63080d525  #AnneFrank - Parallel Stories   
3  b6611af0-f53c-4a08-9ffa-9716dc57eb9c                       #blackAF   
4  7f2d4170-bab8-4d75-adc2-197f7124c070               #cats_the_mewvie   

                                         Description  \
0  This docuseries takes a deep dive into the luc...   
1  As a grisly virus rampages a city, a lone man ...   
2  Through her diary, Anne Frank's story is retol...   
3  Kenya Barris and his family navigate relations...   
4  This pawesome documentary explores how our fel...   

                      Director  \
0                          NaN   
1                       Cho Il   
2  Sabina Fedeli, Anna Migotto   
3                          NaN   
4             Michael Margolis   

             

In [9]:
#To check the number of null values in each column

print(data.isnull().sum())

Show Id                  0
Title                    0
Description              0
Director              2064
Genres                   0
Cast                   530
Production Country     559
Release Date             3
Rating                   4
Duration                 3
Imdb Score             608
Content Type             0
Date Added            1335
dtype: int64


In [10]:
#Checking total no of rows
print(len(data))

5967


In [42]:
#considering only required columns:
modified_data = data[["Title","Description","Content Type", "Genres"]]

modified_data.head(10)

Unnamed: 0,Title,Description,Content Type,Genres
0,(Un)Well,This docuseries takes a deep dive into the luc...,TV Show,Reality TV
1,#Alive,"As a grisly virus rampages a city, a lone man ...",Movie,"Horror Movies, International Movies, Thrillers"
2,#AnneFrank - Parallel Stories,"Through her diary, Anne Frank's story is retol...",Movie,"Documentaries, International Movies"
3,#blackAF,Kenya Barris and his family navigate relations...,TV Show,TV Comedies
4,#cats_the_mewvie,This pawesome documentary explores how our fel...,Movie,"Documentaries, International Movies"
5,#FriendButMarried,"Pining for his high school crush for years, a ...",Movie,"Dramas, International Movies, Romantic Movies"
6,#FriendButMarried 2,As Ayu and Ditto finally transition from best ...,Movie,"Dramas, International Movies, Romantic Movies"
7,#realityhigh,When nerdy high schooler Dani finally attracts...,Movie,Comedies
8,#Rucker50,This documentary celebrates the 50th anniversa...,Movie,"Documentaries, Sports Movies"
9,#Selfie,"Two days before their final exams, three teen ...",Movie,"Comedies, Dramas, International Movies"


In [34]:
modified_data2 = data[["Title","Description","Content Type", "Genres", "Director"]]
modified_data2.head()
drop_null= modified_data2.dropna()
len(drop_null)

3903

In [43]:
print(modified_data)

                              Title  \
0                          (Un)Well   
1                            #Alive   
2     #AnneFrank - Parallel Stories   
3                          #blackAF   
4                  #cats_the_mewvie   
...                             ...   
5962                      الف مبروك   
5963                   دفعة القاهرة   
5964                           海的儿子   
5965                        반드시 잡는다   
5966            최강전사 미니특공대 : 영웅의 탄생   

                                            Description Content Type  \
0     This docuseries takes a deep dive into the luc...      TV Show   
1     As a grisly virus rampages a city, a lone man ...        Movie   
2     Through her diary, Anne Frank's story is retol...        Movie   
3     Kenya Barris and his family navigate relations...      TV Show   
4     This pawesome documentary explores how our fel...        Movie   
...                                                 ...          ...   
5962  On his wedding day, a

In [44]:
#Dropping all null value rows
modified_data.dropna()
len(modified_data)


5967

In [45]:
#cleaning title column

#imports

import nltk #For processing text

import re #regular expression for matching

nltk.download('stopwords')

from nltk.corpus import stopwords

stemmer = nltk.SnowballStemmer("english")  #Makes a sentence to its basic form EX: liked to like

import string

stopword=set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to C:\Users\Pallempati
[nltk_data]     Sowmya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [46]:
#creating method for cleaning any text data
def clean_text(text):
    text = str(text).lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    text = [word for word in text.split(' ') if word not in stopword]
    text=" ".join(text)
    text = [stemmer.stem(word) for word in text.split(' ')]
    text=" ".join(text)
    return text

In [47]:
#Using above created method for cleaning title column
print(modified_data)

modified_data["Title"] = modified_data["Title"].apply(clean_text)
modified_data["Title"]

                              Title  \
0                          (Un)Well   
1                            #Alive   
2     #AnneFrank - Parallel Stories   
3                          #blackAF   
4                  #cats_the_mewvie   
...                             ...   
5962                      الف مبروك   
5963                   دفعة القاهرة   
5964                           海的儿子   
5965                        반드시 잡는다   
5966            최강전사 미니특공대 : 영웅의 탄생   

                                            Description Content Type  \
0     This docuseries takes a deep dive into the luc...      TV Show   
1     As a grisly virus rampages a city, a lone man ...        Movie   
2     Through her diary, Anne Frank's story is retol...        Movie   
3     Kenya Barris and his family navigate relations...      TV Show   
4     This pawesome documentary explores how our fel...        Movie   
...                                                 ...          ...   
5962  On his wedding day, a

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  modified_data["Title"] = modified_data["Title"].apply(clean_text)


0                           unwel
1                            aliv
2       annefrank  parallel stori
3                         blackaf
4                    catsthemewvi
                  ...            
5962                    الف مبروك
5963                 دفعة القاهرة
5964                         海的儿子
5965                      반드시 잡는다
5966           최강전사 미니특공대  영웅의 탄생
Name: Title, Length: 5967, dtype: object

In [48]:
print(modified_data.head())

                       Title  \
0                      unwel   
1                       aliv   
2  annefrank  parallel stori   
3                    blackaf   
4               catsthemewvi   

                                         Description Content Type  \
0  This docuseries takes a deep dive into the luc...      TV Show   
1  As a grisly virus rampages a city, a lone man ...        Movie   
2  Through her diary, Anne Frank's story is retol...        Movie   
3  Kenya Barris and his family navigate relations...      TV Show   
4  This pawesome documentary explores how our fel...        Movie   

                                           Genres  
0                                      Reality TV  
1  Horror Movies, International Movies, Thrillers  
2             Documentaries, International Movies  
3                                     TV Comedies  
4             Documentaries, International Movies  


In [49]:
# Using Genre column as a feature to recommend similar content to user

#creating list for Genre column
genre_list = modified_data["Genres"].tolist()
genre_list

['Reality TV',
 'Horror Movies, International Movies, Thrillers',
 'Documentaries, International Movies',
 'TV Comedies',
 'Documentaries, International Movies',
 'Dramas, International Movies, Romantic Movies',
 'Dramas, International Movies, Romantic Movies',
 'Comedies',
 'Documentaries, Sports Movies',
 'Comedies, Dramas, International Movies',
 'Comedies, Dramas, International Movies',
 'Comedies, International Movies, Romantic Movies',
 'Comedies, Dramas, International Movies',
 'International TV Shows, Romantic TV Shows, TV Dramas',
 'Docuseries, Science & Nature TV',
 'Dramas, International Movies, Sports Movies',
 'Movies',
 'Dramas, International Movies',
 'Dramas, International Movies',
 'Horror Movies, International Movies',
 'Crime TV Shows, TV Dramas, TV Mysteries',
 'Crime TV Shows, Docuseries',
 'Documentaries',
 'Documentaries',
 'Comedies, Dramas, Independent Movies',
 'Dramas, Independent Movies, International Movies',
 'Dramas, International Movies',
 'Dramas, Thril

In [50]:
#creating a vectorizer based on genre list to train the model
tfidf = text.TfidfVectorizer(input = genre_list , stop_words = "english")
tfidf

TfidfVectorizer(input=['Reality TV',
                       'Horror Movies, International Movies, Thrillers',
                       'Documentaries, International Movies', 'TV Comedies',
                       'Documentaries, International Movies',
                       'Dramas, International Movies, Romantic Movies',
                       'Dramas, International Movies, Romantic Movies',
                       'Comedies', 'Documentaries, Sports Movies',
                       'Comedies, Dramas, International Movies',
                       'Comedies, Dramas, In...
                       'Horror Movies, International Movies',
                       'Crime TV Shows, TV Dramas, TV Mysteries',
                       'Crime TV Shows, Docuseries', 'Documentaries',
                       'Documentaries', 'Comedies, Dramas, Independent Movies',
                       'Dramas, Independent Movies, International Movies',
                       'Dramas, International Movies', 'Dramas, Thrillers'

In [51]:
#Training the model using fit transform

tfidf_matrix = tfidf.fit_transform(genre_list)
tfidf_matrix

<5967x44 sparse matrix of type '<class 'numpy.float64'>'
	with 22096 stored elements in Compressed Sparse Row format>

In [59]:
#checks similar values and returns a higher value if matches
similarity = cosine_similarity(tfidf_matrix)
similarity

array([[1.        , 0.        , 0.        , ..., 0.32075218, 0.        ,
        0.        ],
       [0.        , 1.        , 0.30428612, ..., 0.07587812, 0.68953015,
        0.15936057],
       [0.        , 0.30428612, 1.        , ..., 0.11962968, 0.27899812,
        0.12562419],
       ...,
       [0.32075218, 0.07587812, 0.11962968, ..., 1.        , 0.25478887,
        0.        ],
       [0.        , 0.68953015, 0.27899812, ..., 0.25478887, 1.        ,
        0.110801  ],
       [0.        , 0.15936057, 0.12562419, ..., 0.        , 0.110801  ,
        1.        ]])

In [66]:
#assigning each title to a index by eliminating duplicates
indices = pd.Series(modified_data.index, index = modified_data['Title']).drop_duplicates()
print(indices)

Title
unwel                           0
aliv                            1
annefrank  parallel stori       2
blackaf                         3
catsthemewvi                    4
                             ... 
الف مبروك                    5962
دفعة القاهرة                 5963
海的儿子                         5964
반드시 잡는다                      5965
최강전사 미니특공대  영웅의 탄생           5966
Length: 5967, dtype: int64


In [82]:
#recommendation algorithm for top 10 similar movies sorted by similarity scores of genres

def netflix_recommendation(title, similarity = similarity):
    index = indices[title]
    similarity_scores = list(enumerate(similarity[index]))
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
    
    similarity_scores = similarity_scores[60:90]
    print(similarity_scores)
    movieindices = [i[0] for i in similarity_scores]
    return modified_data['Title'].iloc[movieindices]
                               
print(netflix_recommendation("girlfriend"))
                               
    

[(1225, 0.872378329739641), (1235, 0.872378329739641), (1425, 0.872378329739641), (1426, 0.872378329739641), (1653, 0.872378329739641), (1787, 0.872378329739641), (1826, 0.872378329739641), (1839, 0.872378329739641), (1882, 0.872378329739641), (2756, 0.872378329739641), (3004, 0.872378329739641), (3474, 0.872378329739641), (3554, 0.872378329739641), (3772, 0.872378329739641), (4032, 0.872378329739641), (4201, 0.872378329739641), (4323, 0.872378329739641), (4954, 0.872378329739641), (5152, 0.872378329739641), (5271, 0.872378329739641), (163, 0.8036877889018903), (654, 0.8036877889018903), (775, 0.8036877889018903), (1125, 0.8036877889018903), (1500, 0.8036877889018903), (1926, 0.8036877889018903), (2250, 0.8036877889018903), (2483, 0.8036877889018903), (2547, 0.8036877889018903), (2577, 0.8036877889018903)]
1225                             dead
1235                 dear white peopl
1425                          eastsid
1426                             easi
1653                          