###Importing the required packages

In [61]:
import pandas as pd
import numpy as np
import neattext.functions as nfx
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import warnings
warnings.filterwarnings("ignore")

In [62]:
df=pd.read_csv('netflix.csv')
df.head()

Unnamed: 0,show_id,type,title,director,country,date_added,release_year,rating,duration,listed_in
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,United States,9/25/2021,2020,PG-13,90 min,Documentaries
1,s3,TV Show,Ganglands,Julien Leclercq,France,9/24/2021,2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act..."
2,s6,TV Show,Midnight Mass,Mike Flanagan,United States,9/24/2021,2021,TV-MA,1 Season,"TV Dramas, TV Horror, TV Mysteries"
3,s14,Movie,Confessions of an Invisible Girl,Bruno Garotti,Brazil,9/22/2021,2021,TV-PG,91 min,"Children & Family Movies, Comedies"
4,s8,Movie,Sankofa,Haile Gerima,United States,9/24/2021,1993,TV-MA,125 min,"Dramas, Independent Movies, International Movies"


###Changing the column name

In [63]:
df.rename(columns={'listed_in':'genres'},inplace=True)

In [64]:
df.head()

Unnamed: 0,show_id,type,title,director,country,date_added,release_year,rating,duration,genres
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,United States,9/25/2021,2020,PG-13,90 min,Documentaries
1,s3,TV Show,Ganglands,Julien Leclercq,France,9/24/2021,2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act..."
2,s6,TV Show,Midnight Mass,Mike Flanagan,United States,9/24/2021,2021,TV-MA,1 Season,"TV Dramas, TV Horror, TV Mysteries"
3,s14,Movie,Confessions of an Invisible Girl,Bruno Garotti,Brazil,9/22/2021,2021,TV-PG,91 min,"Children & Family Movies, Comedies"
4,s8,Movie,Sankofa,Haile Gerima,United States,9/24/2021,1993,TV-MA,125 min,"Dramas, Independent Movies, International Movies"


### Counting no of movies,tv shows in dataset

In [68]:
df['type'].value_counts()

Movie      6126
TV Show    2664
Name: type, dtype: int64

### Movie Recommendation Engine

In [9]:
movies_df = df[df['type'] == 'Movie'].reset_index(drop= True)
movies_df.head()

Unnamed: 0,show_id,type,title,director,country,date_added,release_year,rating,duration,genres
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,United States,9/25/2021,2020,PG-13,90 min,Documentaries
1,s14,Movie,Confessions of an Invisible Girl,Bruno Garotti,Brazil,9/22/2021,2021,TV-PG,91 min,"Children & Family Movies, Comedies"
2,s8,Movie,Sankofa,Haile Gerima,United States,9/24/2021,1993,TV-MA,125 min,"Dramas, Independent Movies, International Movies"
3,s10,Movie,The Starling,Theodore Melfi,United States,9/24/2021,2021,PG-13,104 min,"Comedies, Dramas"
4,s939,Movie,Motu Patlu in the Game of Zones,Suhas Kadav,India,5/1/2021,2019,TV-Y7,87 min,"Children & Family Movies, Comedies, Music & Mu..."


### Checking duplicates in movie dataframe

In [69]:
movies_df.duplicated().sum()

0

### Checking Whether it has any null values

In [13]:
movies_df.isnull().sum()

show_id         0
type            0
title           0
director        0
country         0
date_added      0
release_year    0
rating          0
duration        0
genres          0
dtype: int64

### Acquiring required columns into a new dataframe

In [15]:
movies = movies_df[['title','director','country','rating','genres']]
movies.head()

Unnamed: 0,title,director,country,rating,genres
0,Dick Johnson Is Dead,Kirsten Johnson,United States,PG-13,Documentaries
1,Confessions of an Invisible Girl,Bruno Garotti,Brazil,TV-PG,"Children & Family Movies, Comedies"
2,Sankofa,Haile Gerima,United States,TV-MA,"Dramas, Independent Movies, International Movies"
3,The Starling,Theodore Melfi,United States,PG-13,"Comedies, Dramas"
4,Motu Patlu in the Game of Zones,Suhas Kadav,India,TV-Y7,"Children & Family Movies, Comedies, Music & Mu..."


In [25]:
movies.describe().T

Unnamed: 0,count,unique,top,freq
title,6126,6124,22-Jul,2
director,6126,4355,Given,173
country,6126,79,United States,2395
rating,6126,14,TV-MA,2062
genres,6126,278,"Dramas, International Movies",362


### Removing Stopwords, Special Characters using Neattext package

In [29]:
# Remove stopwords
movies['director'] = movies['director'].apply(nfx.remove_stopwords)
movies['country'] = movies['country'].apply(nfx.remove_stopwords)
movies['genres'] = movies['genres'].apply(nfx.remove_stopwords)

# # Remove special characters
movies['country'] = movies['country'].apply(nfx.remove_special_characters)

movies.head()

Unnamed: 0,title,director,country,rating,genres
0,Dick Johnson Is Dead,Kirsten Johnson,United States,PG-13,Documentaries
1,Confessions of an Invisible Girl,Bruno Garotti,Brazil,TV-PG,"Children & Family Movies, Comedies"
2,Sankofa,Haile Gerima,United States,TV-MA,"Dramas, Independent Movies, International Movies"
3,The Starling,Theodore Melfi,United States,PG-13,"Comedies, Dramas"
4,Motu Patlu in the Game of Zones,Suhas Kadav,India,TV-Y7,"Children & Family Movies, Comedies, Music & Mu..."


### Vectorizing the Data(Converting words into real number)

In [32]:
countVector = CountVectorizer(binary= True)
country = countVector.fit_transform(movies['country']).toarray()

countVector = CountVectorizer(binary= True,
                             tokenizer=lambda x:x.split(','))
director = countVector.fit_transform(movies['director']).toarray()
genres = countVector.fit_transform(movies['genres']).toarray()

In [33]:
binary_director = pd.DataFrame(director).transpose()
#binary_cast = pd.DataFrame(cast).transpose()
binary_country = pd.DataFrame(country).transpose()
binary_genres = pd.DataFrame(genres).transpose()

In [34]:
movies_binary = pd.concat([binary_director,binary_country, binary_genres], axis=0,ignore_index=True)
movies_binary.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4997,4998,4999,5000,5001,5002,5003,5004,5005,5006
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6121,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6122,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6123,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6124,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [35]:
movies_sim = cosine_similarity(movies_binary.T)
movies_sim

array([[1.        , 0.        , 0.40824829, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 1.        , 0.        , ..., 0.        , 0.        ,
        0.28867513],
       [0.40824829, 0.        , 1.        , ..., 0.16666667, 0.40824829,
        0.        ],
       ...,
       [0.        , 0.        , 0.16666667, ..., 1.        , 0.40824829,
        0.23570226],
       [0.        , 0.        , 0.40824829, ..., 0.40824829, 1.        ,
        0.28867513],
       [0.        , 0.28867513, 0.        , ..., 0.23570226, 0.28867513,
        1.        ]])

In [37]:
movies_sim.shape

(6126, 6126)

### TV Recommendation Enginer

In [38]:
# Getting Tv Shows
tv_show = df[df['type'] == 'TV Show'].reset_index(drop= True)
tv_show.head()

Unnamed: 0,show_id,type,title,director,country,date_added,release_year,rating,duration,genres
0,s3,TV Show,Ganglands,Julien Leclercq,France,9/24/2021,2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act..."
1,s6,TV Show,Midnight Mass,Mike Flanagan,United States,9/24/2021,2021,TV-MA,1 Season,"TV Dramas, TV Horror, TV Mysteries"
2,s9,TV Show,The Great British Baking Show,Andy Devonshire,United Kingdom,9/24/2021,2021,TV-14,9 Seasons,"British TV Shows, Reality TV"
3,s4,TV Show,Jailbirds New Orleans,Not Given,Pakistan,9/24/2021,2021,TV-MA,1 Season,"Docuseries, Reality TV"
4,s15,TV Show,Crime Stories: India Detectives,Not Given,Pakistan,9/22/2021,2021,TV-MA,1 Season,"British TV Shows, Crime TV Shows, Docuseries"


### Checking duplicates in TV Show dataframe

In [39]:
# Checking for duplicate
tv_show.duplicated().sum()

0

### Checking Duplicates

In [40]:
# Checking for null values
tv_show.isnull().sum()


show_id         0
type            0
title           0
director        0
country         0
date_added      0
release_year    0
rating          0
duration        0
genres          0
dtype: int64

### Acquiring required columns into a dataframe

In [41]:
# Selecting features for working
tv_df = tv_show[['title','director','country', 'rating', 'genres']]
tv_df.head()

Unnamed: 0,title,director,country,rating,genres
0,Ganglands,Julien Leclercq,France,TV-MA,"Crime TV Shows, International TV Shows, TV Act..."
1,Midnight Mass,Mike Flanagan,United States,TV-MA,"TV Dramas, TV Horror, TV Mysteries"
2,The Great British Baking Show,Andy Devonshire,United Kingdom,TV-14,"British TV Shows, Reality TV"
3,Jailbirds New Orleans,Not Given,Pakistan,TV-MA,"Docuseries, Reality TV"
4,Crime Stories: India Detectives,Not Given,Pakistan,TV-MA,"British TV Shows, Crime TV Shows, Docuseries"


In [42]:
tv_df.describe().T

Unnamed: 0,count,unique,top,freq
title,2664,2663,9-Feb,2
director,2664,226,Not Given,2415
country,2664,59,United States,845
rating,2664,9,TV-MA,1143
genres,2664,235,Kids' TV,219


Vectorising the Data(Convert words into real numbers)

In [43]:

tv_df['country'] = tv_df['country'].apply(nfx.remove_stopwords)
tv_df['genres'] = tv_df['genres'].apply(nfx.remove_stopwords)

tv_df['country'] = tv_df['country'].apply(nfx.remove_special_characters)

tv_df.head()

Unnamed: 0,title,director,country,rating,genres
0,Ganglands,Julien Leclercq,France,TV-MA,"Crime TV Shows, International TV Shows, TV Act..."
1,Midnight Mass,Mike Flanagan,United States,TV-MA,"TV Dramas, TV Horror, TV Mysteries"
2,The Great British Baking Show,Andy Devonshire,United Kingdom,TV-14,"British TV Shows, Reality TV"
3,Jailbirds New Orleans,Not Given,Pakistan,TV-MA,"Docuseries, Reality TV"
4,Crime Stories: India Detectives,Not Given,Pakistan,TV-MA,"British TV Shows, Crime TV Shows, Docuseries"


In [44]:
countVector = CountVectorizer(binary= True)
country = countVector.fit_transform(tv_df['country']).toarray()

countVector = CountVectorizer(binary= True,
                             tokenizer=lambda x:x.split(','))
#cast = countVector.fit_transform(tv_df['cast']).toarray()
genres = countVector.fit_transform(tv_df['genres']).toarray()

In [45]:
#tv_binary_cast = pd.DataFrame(cast).transpose()
tv_binary_country = pd.DataFrame(country).transpose()
tv_binary_genres = pd.DataFrame(genres).transpose()

In [46]:
tv_binary = pd.concat([ tv_binary_country, tv_binary_genres], axis=0,ignore_index=True)
tv_binary.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,93,94,95,96,97,98,99,100,101,102
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2659,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2660,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2661,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2662,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [47]:
tv_sim = cosine_similarity(tv_binary.T)
tv_sim

array([[1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 1.        , 0.2236068 , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.2236068 , 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.35355339,
        0.35355339],
       [0.        , 0.        , 0.        , ..., 0.35355339, 1.        ,
        1.        ],
       [0.        , 0.        , 0.        , ..., 0.35355339, 1.        ,
        1.        ]])

In [49]:
tv_sim.shape

(2664, 2664)

#  Recommendation Engine Testing

In [50]:
def recommend(title):
    if title in movies_df['title'].values:
        movies_index = movies_df[movies_df['title'] == title].index.item()
        scores = dict(enumerate(movies_sim[movies_index]))
        sorted_scores = dict(sorted(scores.items(), key=lambda x: x[1], reverse=True))

        selected_movies_index = [id for id, scores in sorted_scores.items()]
        selected_movies_score = [scores for id, scores in sorted_scores.items()]

        rec_movies = movies_df.iloc[selected_movies_index]
        rec_movies['similiarity'] = selected_movies_score

        movie_recommendation = rec_movies.reset_index(drop=True)
        return movie_recommendation[1:6] # Skipping the first row

    elif title in tv_show['title'].values:
        tv_index = tv_show[tv_show['title'] == title].index.item()
        scores = dict(enumerate(tv_sim[tv_index]))
        sorted_scores = dict(sorted(scores.items(), key=lambda x: x[1], reverse=True))

        selected_tv_index = [id for id, scores in sorted_scores.items()]
        selected_tv_score = [scores for id, scores in sorted_scores.items()]

        rec_tv = tv_show.iloc[selected_tv_index]
        rec_tv['similiarity'] = selected_tv_score

        tv_recommendation = rec_tv.reset_index(drop=True)
        return tv_recommendation[1:6] # Skipping the first row

    else:
        print("Title not in dataset. Please check spelling.")

# Testing


In [53]:
recommend('Fida')

Unnamed: 0,show_id,type,title,director,country,date_added,release_year,rating,duration,genres,similiarity
1,s552,Movie,Haseen Dillruba,Vinil Mathew,India,7/2/2021,2021,TV-MA,136 min,"International Movies, Romantic Movies, Thrillers",0.8
2,s1109,Movie,Irul,Naseef Yusuf Izuddin,India,4/2/2021,2021,TV-MA,91 min,"International Movies, Thrillers",0.67082
3,s1266,Movie,The Girl on the Train,Ribhu Dasgupta,India,2/26/2021,2021,TV-MA,120 min,"International Movies, Thrillers",0.67082
4,s1679,Movie,Raman Raghav 2.0,Anurag Kashyap,India,11/19/2020,2016,TV-MA,133 min,"International Movies, Thrillers",0.67082
5,s1987,Movie,Nee Enge En Anbe,Sekhar Kammula,India,9/17/2020,2014,TV-14,137 min,"International Movies, Thrillers",0.67082


In [54]:
recommend('Kal Ho Naa Ho')

Unnamed: 0,show_id,type,title,director,country,date_added,release_year,rating,duration,genres,similiarity
1,s196,Movie,EMI: Liya Hai To Chukana Padega,Saurabh Kabra,India,8/27/2021,2008,TV-14,128 min,"Comedies, Dramas, International Movies",0.8
2,s200,Movie,Koi Aap Sa,Partho Mitra,India,8/27/2021,2006,TV-14,135 min,"Comedies, Dramas, International Movies",0.8
3,s218,Movie,The Dirty Picture,Milan Luthria,India,8/27/2021,2011,TV-14,145 min,"Comedies, Dramas, International Movies",0.8
4,s260,Movie,Pahuna,Paakhi Tyrewala,India,8/18/2021,2018,TV-PG,82 min,"Comedies, Dramas, International Movies",0.8
5,s388,Movie,Mimi,Laxman Utekar,India,7/27/2021,2021,TV-14,132 min,"Comedies, Dramas, International Movies",0.8


In [57]:
recommend('Dick Johnson Is Dead')

Unnamed: 0,show_id,type,title,director,country,date_added,release_year,rating,duration,genres,similiarity
1,s1071,Movie,Why Did You Kill Me?,Fredrick Munk,United States,4/14/2021,2021,TV-MA,84 min,Documentaries,0.75
2,s379,Movie,Fantastic Fungi,Louie Schwartzberg,United States,7/28/2021,2019,TV-14,81 min,Documentaries,0.75
3,s463,Movie,This Changes Everything,Tom Donahue,United States,7/15/2021,2019,TV-MA,96 min,Documentaries,0.75
4,s717,Movie,FTA,Francine Parker,United States,6/15/2021,1972,TV-MA,104 min,Documentaries,0.75
5,s723,Movie,Sir! No Sir!,David Zeiger,United States,6/15/2021,2005,TV-MA,84 min,Documentaries,0.75


In [58]:
recommend('Stranger Things')

Unnamed: 0,show_id,type,title,director,country,date_added,release_year,rating,duration,genres,similiarity
1,s3188,TV Show,Nightflyers,Not Given,United States,12/2/2019,2018,TV-MA,1 Season,"TV Horror, TV Mysteries, TV Sci-Fi & Fantasy",1.0
2,s3686,TV Show,Stranger Things,Not Given,United States,7/4/2019,2019,TV-14,3 Seasons,"TV Horror, TV Mysteries, TV Sci-Fi & Fantasy",1.0
3,s6954,TV Show,Helix,Not Given,United States,11/10/2018,2015,TV-MA,1 Season,"TV Horror, TV Mysteries, TV Sci-Fi & Fantasy",1.0
4,s5889,TV Show,Hemlock Grove,Eli Roth,United States,10/23/2015,2015,TV-MA,3 Seasons,"TV Horror, TV Mysteries, TV Thrillers",0.8
5,s242,TV Show,Manifest,Not Given,United States,8/21/2021,2021,TV-14,3 Seasons,"TV Dramas, TV Mysteries, TV Sci-Fi & Fantasy",0.8


In [60]:
recommend('I Hate Luv Storys')

Unnamed: 0,show_id,type,title,director,country,date_added,release_year,rating,duration,genres,similiarity
1,s27,Movie,Minsara Kanavu,Rajiv Menon,India,9/21/2021,1997,TV-PG,147 min,"Comedies, International Movies, Music & Musicals",0.8
2,s203,Movie,Kyaa Kool Hai Hum,Sangeeth Sivan,India,8/27/2021,2005,TV-MA,165 min,"Comedies, International Movies, Music & Musicals",0.8
3,s1019,Movie,Jaane Tu... Ya Jaane Na,Abbas Tyrewala,India,4/17/2021,2008,TV-14,147 min,"Comedies, International Movies, Music & Musicals",0.8
4,s1676,Movie,Break Ke Baad,Danish Aslam,India,11/19/2020,2010,TV-14,108 min,"Comedies, International Movies, Music & Musicals",0.8
5,s1796,Movie,Ajab Prem Ki Ghazab Kahani,Rajkumar Santoshi,India,10/25/2020,2009,TV-14,150 min,"Comedies, International Movies, Music & Musicals",0.8


In [75]:
recommend('The Kissing Booth')

Unnamed: 0,show_id,type,title,director,country,date_added,release_year,rating,duration,genres,similiarity
1,s2214,Movie,The Kissing Booth 2,Vince Marcello,United Kingdom,7/24/2020,2020,TV-14,133 min,"Comedies, Romantic Movies",1.0
2,s4869,Movie,The Kissing Booth,Vince Marcello,United Kingdom,5/11/2018,2018,TV-14,105 min,"Comedies, Romantic Movies",1.0
3,s2694,Movie,Love Wedding Repeat,Dean Craig,United Kingdom,4/10/2020,2020,TV-MA,101 min,"Comedies, International Movies, Romantic Movies",0.730297
4,s7392,Movie,Man Up,Ben Palmer,United Kingdom,9/4/2018,2015,R,88 min,"Comedies, International Movies, Romantic Movies",0.730297
5,s349,Movie,Poms,Zara Hayes,United Kingdom,8/1/2021,2019,PG-13,91 min,Comedies,0.67082
