In [1]:
import numpy as np
import pandas as pd
import plotly.express as px 

from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

data=pd.read_csv('netflix_titles.csv')
data

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...
...,...,...,...,...,...,...,...,...,...,...,...,...
8802,s8803,Movie,Zodiac,David Fincher,"Mark Ruffalo, Jake Gyllenhaal, Robert Downey J...",United States,"November 20, 2019",2007,R,158 min,"Cult Movies, Dramas, Thrillers","A political cartoonist, a crime reporter and a..."
8803,s8804,TV Show,Zombie Dumb,,,,"July 1, 2019",2018,TV-Y7,2 Seasons,"Kids' TV, Korean TV Shows, TV Comedies","While living alone in a spooky town, a young g..."
8804,s8805,Movie,Zombieland,Ruben Fleischer,"Jesse Eisenberg, Woody Harrelson, Emma Stone, ...",United States,"November 1, 2019",2009,R,88 min,"Comedies, Horror Movies",Looking to survive in a world taken over by zo...
8805,s8806,Movie,Zoom,Peter Hewitt,"Tim Allen, Courteney Cox, Chevy Chase, Kate Ma...",United States,"January 11, 2020",2006,PG,88 min,"Children & Family Movies, Comedies","Dragged from civilian life, a former superhero..."


In [2]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      6173 non-null   object
 4   cast          7982 non-null   object
 5   country       7976 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8803 non-null   object
 9   duration      8804 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB


In [3]:
data.isnull().sum()

show_id            0
type               0
title              0
director        2634
cast             825
country          831
date_added        10
release_year       0
rating             4
duration           3
listed_in          0
description        0
dtype: int64

In [4]:
#filling nan values
data['cast']=data['cast'].fillna('No Cast Specified')
data['director']=data['director'].fillna('No Director Specified')
data['country'] = data['country'].fillna('Not Mentioned')

In [5]:
#filling null values with 0
data = data.fillna(0)

In [6]:
#distribution of content ratings on Netflix
z = data.groupby(['rating']).size().reset_index(name='counts')

px.pie(z, values='counts', names='rating', 
                  title='Distribution of Content Ratings on Netflix',
                  color_discrete_sequence=px.colors.qualitative.Set3)

In [7]:
#top 5 successful directors (add series)
data['director']=data['director'].fillna('No Director Specified')
filtered_directors=pd.DataFrame()
filtered_directors=data['director'].str.split(',',expand=True).stack()
filtered_directors=filtered_directors.to_frame()
filtered_directors.columns=['director']
directors=filtered_directors.groupby(['director']).size().reset_index(name='Total Content')
directors=directors[directors.director !='No Director Specified']
directors=directors.sort_values(by=['Total Content'],ascending=False)
directorsTop5=directors.head()
directorsTop5=directorsTop5.sort_values(by=['Total Content'])

px.bar(directorsTop5,x='director',y='Total Content', color='director', title='Top 5 Directors on Netflix')

In [8]:
#top 5 successful actors
data['cast']=data['cast'].fillna('No Cast Specified')
filtered_cast=pd.DataFrame()
filtered_cast=data['cast'].str.split(',',expand=True).stack()
filtered_cast=filtered_cast.to_frame()
filtered_cast.columns=['Actor']
actors=filtered_cast.groupby(['Actor']).size().reset_index(name='Total Content')
actors=actors[actors.Actor !='No Cast Specified']
actors=actors.sort_values(by=['Total Content'],ascending=False)
actorsTop5=actors.head()
actorsTop5=actorsTop5.sort_values(by=['Total Content'])

px.bar(actorsTop5,x='Actor',y='Total Content', title='Top 5 Actors on Netflix', color='Actor')

In [9]:
#Top 10 countries with most releases of TV Shows and Movies
data['country'] = data['country'].fillna('Not Mentioned')
data['country'] = data['country'].apply(lambda x: x.split(",")[0])

c=data['country'].value_counts()
c = c.head(10)

px.bar(data, x=c.index, y=c, title='Top 10 countries with most releases of TV Shows and Movies')

In [10]:
#Trend of content produced over the years on Netflix
df1=data[['type','release_year']]
df1=df1.rename(columns={"release_year": "Release Year"})
df2=df1.groupby(['Release Year','type']).size().reset_index(name='Total Content')
df2=df2[df2['Release Year']>=2010]
px.line(df2, x="Release Year", y="Total Content", color='type',title='Trend of content produced over the years on Netflix')

In [11]:
#cleaning the data of title field
import nltk
import re
nltk.download('stopwords')
stemmer = nltk.SnowballStemmer("english")
from nltk.corpus import stopwords
import string
stopword=set(stopwords.words('english'))

def clean(text):
    text = str(text).lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    text = [word for word in text.split(' ') if word not in stopword]
    text=" ".join(text)
    text = [stemmer.stem(word) for word in text.split(' ')]
    text=" ".join(text)
    return text
data["title"] = data["title"].apply(clean)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [12]:
#vll use soup column as feature 2 recommend similar content
count = text.CountVectorizer(stop_words="english")
count_matrix = count.fit_transform(data['listed_in'])
similarity = cosine_similarity(count_matrix, count_matrix)

In [13]:
#resetting the index of dataframe
data=data.reset_index()

In [14]:
#setting title column as index 
indices = pd.Series(data.index, index=data['title']).fillna(0)
indices

title
dick johnson dead         0
blood  water              1
gangland                  2
jailbird new orlean       3
kota factori              4
                       ... 
zodiac                 8802
zombi dumb             8803
zombieland             8804
zoom                   8805
zubaan                 8806
Length: 8807, dtype: int64

In [15]:
#creating the function 2 recommend content
def get_recommendations_new(title, cosine_sim = similarity):

    title=title.replace(' ','').lower()

    if title not in data['title'].unique():
      print('This Movie / Tv Show is not in our database')
      
    else:
      idx = indices[title]

      sim_scores = list(enumerate(cosine_sim[idx]))

      sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

      sim_scores = sim_scores[1:11]

      movie_indices = [i[0] for i in sim_scores]

      return data['title'].iloc[movie_indices]

In [16]:
#taking mvi / tv show name as input & printing the recommended titles
mvi = input("Enter Movie / Tv Show name to recommend : ")
print('----------------------------------------------------')
print('Recommended Movies and Tv Shows for ' + mvi +' are : ')
get_recommendations_new(mvi)

Enter Movie / Tv Show name to recommend : PK
----------------------------------------------------
Recommended Movies and Tv Shows for PK are : 


79           tughlaq durbar telugu
159                      love puff
195    emi liya hai chukana padega
199                     koi aap sa
217                   dirti pictur
258                          leagu
259                         pahuna
298                     quam money
387                           mimi
452                    perfect fit
Name: title, dtype: object

In [17]:
show_id = input('Enter show id : ')	
Type = input('Enter Type (Movie / Tv Show) : ')
title = input('Enter Title name : ')	
director = input('Enter Director Name : ')	
cast = input('Enter Cast names : ')	
country = input('Enter Country name : ')	
date_added = input('Enter Date added (in the format September 25, 2021) : ')	
release_year = input('Enter Year (in the format 2022) : ')	
rating = input('Enter name of rating : ')	
duration = input('Enter Duration (in the format 55 min / 2 Season) : ')	
listed_in = input('Enter in which genere it is listed (in the format TV Shows, TV Dramas, ...... : )')	
description = input('Enter Description of the Movie / Tv Show : ')	
index = show_id

Enter show id : 8807
Enter Type (Movie / Tv Show) : Movie
Enter Title name : Bahubali
Enter Director Name : Rajmouli
Enter Cast names : Prabhas, Anushka, Rana, Tamannah
Enter Country name : India
Enter Date added (in the format September 25, 2021) : August 19, 2017
Enter Year (in the format 2022) : 2017
Enter name of rating : IMDb
Enter Duration (in the format 55 min / 2 Season) : 168
Enter in which genere it is listed (in the format TV Shows, TV Dramas, ...... : )Action, Historical
Enter Description of the Movie / Tv Show : ight for kingdom from 2 generations


In [18]:
data = data.append(pd.Series([index, show_id, Type, title, director, cast, country, date_added, release_year, rating, duration, listed_in, description],
                             index=data.columns),
                   ignore_index=True)
data

Unnamed: 0,index,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,0,s1,Movie,dick johnson dead,Kirsten Johnson,No Cast Specified,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,1,s2,TV Show,blood water,No Director Specified,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,2,s3,TV Show,gangland,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",Not Mentioned,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,3,s4,TV Show,jailbird new orlean,No Director Specified,No Cast Specified,Not Mentioned,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,4,s5,TV Show,kota factori,No Director Specified,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8803,8803,s8804,TV Show,zombi dumb,No Director Specified,No Cast Specified,Not Mentioned,"July 1, 2019",2018,TV-Y7,2 Seasons,"Kids' TV, Korean TV Shows, TV Comedies","While living alone in a spooky town, a young g..."
8804,8804,s8805,Movie,zombieland,Ruben Fleischer,"Jesse Eisenberg, Woody Harrelson, Emma Stone, ...",United States,"November 1, 2019",2009,R,88 min,"Comedies, Horror Movies",Looking to survive in a world taken over by zo...
8805,8805,s8806,Movie,zoom,Peter Hewitt,"Tim Allen, Courteney Cox, Chevy Chase, Kate Ma...",United States,"January 11, 2020",2006,PG,88 min,"Children & Family Movies, Comedies","Dragged from civilian life, a former superhero..."
8806,8806,s8807,Movie,zubaan,Mozez Singh,"Vicky Kaushal, Sarah-Jane Dias, Raaghav Chanan...",India,"March 2, 2019",2015,TV-14,111 min,"Dramas, International Movies, Music & Musicals",A scrappy but poor boy worms his way into a ty...


In [19]:
import pickle
pickle.dump(data.to_dict(),open('movies.pkl','wb'))