#Import Libaries and data

In [82]:
import numpy as np # Importing the NumPy library for performing linear algebra operations
import pandas as pd # Importing the Pandas library for data manipulation and preparation
import plotly.express as px # Importing Plotly Express for interactive data visualization
from textblob import TextBlob # Importing TextBlob for sentiment analysis of textual data

# Reading the Netflix titles data from a CSV file into a Pandas DataFrame
df = pd.read_csv('netflix_titles.csv')

## Checking number of rows and columns in data

In [83]:
df.shape

(8807, 12)

# Checking content available in Dataset

In [84]:
df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


#Filtering countries like USA, India, UK

In [85]:
# Defining a list of main countries of the interest
main_country = ['United States', 'India','United Kingdom']
# Creating a boolean mask to filter rows in the DataFrame where the 'country' column matches any of the countries in the main_country list
mask = df['country'].isin(main_country)

In [86]:
# Creating a new dataframe called new_df containing only rows where the country columns matches any of the countries in the main_country list
new_df = df[mask]
print(new_df)

     show_id     type                          title         director  \
0         s1    Movie           Dick Johnson Is Dead  Kirsten Johnson   
4         s5  TV Show                   Kota Factory              NaN   
8         s9  TV Show  The Great British Baking Show  Andy Devonshire   
9        s10    Movie                   The Starling   Theodore Melfi   
15       s16  TV Show              Dear White People              NaN   
...      ...      ...                            ...              ...   
8799   s8800    Movie                          Zenda   Avadhoot Gupte   
8802   s8803    Movie                         Zodiac    David Fincher   
8804   s8805    Movie                     Zombieland  Ruben Fleischer   
8805   s8806    Movie                           Zoom     Peter Hewitt   
8806   s8807    Movie                         Zubaan      Mozez Singh   

                                                   cast         country  \
0                                               

In [87]:
new_df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...
8,s9,TV Show,The Great British Baking Show,Andy Devonshire,"Mel Giedroyc, Sue Perkins, Mary Berry, Paul Ho...",United Kingdom,"September 24, 2021",2021,TV-14,9 Seasons,"British TV Shows, Reality TV",A talented batch of amateur bakers face off in...
9,s10,Movie,The Starling,Theodore Melfi,"Melissa McCarthy, Chris O'Dowd, Kevin Kline, T...",United States,"September 24, 2021",2021,PG-13,104 min,"Comedies, Dramas",A woman adjusting to life after a loss contend...
15,s16,TV Show,Dear White People,,"Logan Browning, Brandon P. Bell, DeRon Horton,...",United States,"September 22, 2021",2021,TV-MA,4 Seasons,"TV Comedies, TV Dramas",Students of color navigate the daily slights a...


In [88]:
new_df.shape

(4209, 12)

#Taking the count of ratings available

In [89]:
# Grouping the DataFrame df by the rating column and calculating the size of each group, then resetting the index and assigning it to x
x = new_df.groupby(['rating']).size().reset_index(name='counts')
print(x)

      rating  counts
0     66 min       1
1     74 min       1
2     84 min       1
3          G      29
4      NC-17       1
5         NR      44
6         PG     168
7      PG-13     301
8          R     474
9      TV-14    1028
10      TV-G     114
11     TV-MA    1353
12     TV-PG     460
13      TV-Y     111
14     TV-Y7     120
15  TV-Y7-FV       2
16        UR       1


#Creating the Piechart based on Content rating

In [90]:
# Creating a pie chart using Plotly Express (px) with data 'x', where 'counts' represent the values and 'rating' represents the categories.
pieChart = px.pie(x, values='counts', names='rating', title='Distribution of content ratings on Netflix')
pieChart.show()

#Analyzing the top 5 Directors on Netflix

In [91]:
# Filling missing values in the director column with the string 'Director not specified'
new_df['director']=new_df['director'].fillna('Director not specified')
# Creating an empty DataFrame called directors_list
directors_list = pd.DataFrame()
# Splitting the director column by commas into separate columns, expanding them into a DataFrame,
# and stacking the resulting columns to create a single series containing all directors
directors_list = new_df['director'].str.split(',', expand=True).stack()
# Converting the series containing directors names into a DataFrame
directors_list = directors_list.to_frame()
# Renaming the column of the DataFrame directors_list to Director
directors_list.columns = ['Director']
print(directors_list)

                      Director
0    0         Kirsten Johnson
4    0  Director not specified
8    0         Andy Devonshire
9    0          Theodore Melfi
15   0  Director not specified
...                        ...
8799 0          Avadhoot Gupte
8802 0           David Fincher
8804 0         Ruben Fleischer
8805 0            Peter Hewitt
8806 0             Mozez Singh

[4591 rows x 1 columns]




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [92]:
# Grouping the DataFrame directors_list by the Director column and calculating the size of each group,
# then resetting the index and assigning it to directors with the column name Total Count
directors = directors_list.groupby(['Director']).size().reset_index(name='Total Count')
# Filtering out rows in the directors DataFrame where the director's name is Director not specified
directors = directors[directors.Director != 'Director not specified']
# Sorting the directors DataFrame by the Total Count column in descending order
directors = directors.sort_values(by=['Total Count'], ascending = False)
print(directors)

             Director  Total Count
1222        Jay Karas           15
1632     Marcus Raboy           15
1220      Jay Chapman           12
851      David Dhawan            9
2335  Shannon Hartman            9
...               ...          ...
963     Elle Callahan            1
964       Ellen Brown            1
965     Ellen Seidler            1
966       Ellena Wood            1
2679      Àlex Pastor            1

[2679 rows x 2 columns]


In [93]:
# Extracting the top 5 directors with the highest total count of appearances from the sorted DataFrame directors
top5Directors = directors.head()
print(top5Directors)

             Director  Total Count
1222        Jay Karas           15
1632     Marcus Raboy           15
1220      Jay Chapman           12
851      David Dhawan            9
2335  Shannon Hartman            9


#Creating a barplot for Top 5 Directors

In [94]:
# Sorting the DataFrame 'top5Directors' by the 'Total Count' column in ascending order
top5Directors = top5Directors.sort_values(by=['Total Count'])
# Creating a bar chart using Plotly Express (px) with data from 'top5Directors', where 'Total Count' is plotted on the x-axis and 'Director' on the y-axis
barChart = px.bar(top5Directors, x='Total Count', y = 'Director', title = 'Top 5 Directors on Netflix')
barChart.show()

# Analyzing the top 5 Actors on Netflix

In [95]:
# Filling missing values in the cast column with the string 'No cast specified'
new_df['cast']=new_df['cast'].fillna('No cast specified')

# Creating an empty DataFrame called cast_df
cast_df = pd.DataFrame()

# Splitting the cast column by commas into separate columns, expanding them into a DataFrame,
# and stacking the resulting columns to create a single series containing all actors
cast_df = df['cast'].str.split(',',expand=True).stack()

# Converting the series containing actors names into a DataFrame
cast_df = cast_df.to_frame()

# Renaming the column of the DataFrame cast_df to Actor
cast_df.columns = ['Actor']

# Grouping the DataFrame cast_df by the Actor column and calculating the size of each group,
# then resetting the index and assigning it to 'actors' with the column name Total Count
actors = cast_df.groupby(['Actor']).size().reset_index(name='Total Count')

# Filtering out rows in the actors DataFrame where the actor's name is No cast specified
actors = actors[actors.Actor != 'No cast specified']

# Sorting the actors DataFrame by the Total Count column in descending order
actors = actors.sort_values(by=['Total Count'], ascending=False)

# Extracting the top 5 actors with the highest total count of appearances from the sorted DataFrame actors
top5Actors = actors.head()

# Sorting the top5Actors DataFrame by the Total Count column in ascending order
top5Actors = top5Actors.sort_values(by=['Total Count'])

# Creating a bar chart using Plotly Express with the Total Count column as x-axis and Actor column as y-axis,
# and setting the title of the chart
barChart2 = px.bar(top5Actors, x='Total Count', y='Actor', title='Top 5 Actors on Netflix')

# Displaying the bar chart using Plotly
barChart2.show()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



#Analyzing the content produced on netflix based on years

In [96]:
# Creating a new DataFrame 'df1' by selecting only the 'type' and 'release_year' columns from the original DataFrame 'df'
df1 = new_df[['type', 'release_year']]

# Renaming the columns of the DataFrame 'df1' to 'Release Year' and 'Type'
df1 = df1.rename(columns={"release_year": "Release Year", "type": "Type"})

# Grouping the DataFrame 'df1' by the 'Release Year' and 'Type' columns, calculating the size of each group,
# then resetting the index and assigning it to 'df2' with the column name 'Total Count'
df2 = df1.groupby(['Release Year', 'Type']).size().reset_index(name='Total Count')
print(df2)

     Release Year     Type  Total Count
0            1942    Movie            2
1            1943    Movie            3
2            1944    Movie            2
3            1945    Movie            3
4            1945  TV Show            1
..            ...      ...          ...
102          2019  TV Show          166
103          2020    Movie          255
104          2020  TV Show          200
105          2021    Movie           77
106          2021  TV Show          114

[107 rows x 3 columns]


In [97]:
# Filtering out rows in the 'df2' DataFrame where the 'Release Year' is greater than or equal to 2000
df2 = df2[df2['Release Year'] >= 2000]

# Creating a line chart using Plotly Express with the 'Release Year' column as the x-axis, 'Total Count' column as the y-axis,
# and 'Type' column as the color to represent different types of content, and setting the title of the chart
graph = px.line(df2, x="Release Year", y="Total Count", color="Type", title="Trend of Content Produced on Netflix Every Year")

# Displaying the line chart using Plotly
graph.show()

#Sentiment Analysis of Netflix Content

In [98]:
# Creating a new DataFrame 'df3' by selecting only the 'release_year' and 'description' columns from the original DataFrame 'df'
df3 = new_df[['release_year', 'description']]

# Renaming the columns of the DataFrame 'df3' to 'Release Year' and 'Description'
df3 = df3.rename(columns={'release_year': 'Release Year', 'description': 'Description'})

# Iterating over each row in the DataFrame 'df3'
for index, row in df3.iterrows():

  # Extracting the description from the current row
  d = row['Description']
  # Creating a TextBlob object for sentiment analysis on the description
  testimonial = TextBlob(d)
  # Calculating the polarity of the sentiment
  p = testimonial.sentiment.polarity

  # Determining the sentiment label based on the polarity value
  if p == 0:
    sent = 'Neutral'
  elif p > 0:
    sent = 'Positive'
  else:
    sent = 'Negative'

  # Assigning the sentiment label to the 'Sentiment' column in the DataFrame 'df3' for the current row
  df3.loc[index, 'Sentiment'] = sent

# Grouping the DataFrame 'df3' by the 'Release Year' and 'Sentiment' columns, calculating the size of each group,
# then resetting the index and assigning it to 'df3' with the column name 'Total Count'
df3 = df3.groupby(['Release Year', 'Sentiment']).size().reset_index(name='Total Count')

# Filtering out rows in the 'df3' DataFrame where the 'Release Year' is greater than 2005
df3 = df3[df3['Release Year'] > 2005]

# Creating a bar chart using Plotly Express with the 'Release Year' column as the x-axis, 'Total Count' column as the y-axis,
# and 'Sentiment' column as the color to represent different sentiments, and setting the title of the chart
barGraph = px.bar(df3, x="Release Year", y="Total Count", color="Sentiment", title="Sentiment Analysis of Content on Netflix")

# Displaying the bar chart using Plotly
barGraph.show()

#Content-Based Recommendation

In [99]:
from sklearn.feature_extraction.text import TfidfVectorizer# Importing the TfidfVectorizer class from the scikit-learn feature extraction module for converting text to feature vectors using TF-IDF
from sklearn.metrics.pairwise import cosine_similarity# Importing the cosine_similarity function from the scikit-learn metrics pairwise module for calculating cosine similarity between samples
from sklearn.metrics.pairwise import linear_kernel# Importing the linear_kernel function from the scikit-learn metrics pairwise module for computing the linear kernel between two samples

#Data Preprocessing

In [100]:
import re# Importing the re module, which provides support for regular expressions (regex) in Python

In [101]:
# Using a lambda function with the apply method to apply a regex substitution to each element in the 'description' column of the DataFrame 'df',
# replacing any occurrences of '-', '.', ':', ',', and '“' with a space
df['description'] = df['description'].apply(lambda x: re.sub(r'[-.:,“]', ' ', x))
df['description'].head()

0    As her father nears the end of his life  filmm...
1    After crossing paths at a party  a Cape Town t...
2    To protect his family from a powerful drug lor...
3    Feuds  flirtations and toilet talk go down amo...
4    In a city of coaching centers known to train I...
Name: description, dtype: object

#Construction of the TF-IDF Matrix

In [102]:
# Checking if there are any null values in the 'description' column of the DataFrame 'new_df', and printing the result
print(df["description"].isnull().any())
# Filling any null values in the 'description' column of the DataFrame 'new_df' with an empty string
df['description'] = df['description'].fillna('')

False


In [103]:
# Creating a TfidfVectorizer object with English stop words removed, which will be used to convert text data into TF-IDF feature vectors
tfidf = TfidfVectorizer(stop_words='english')

In [104]:
# Using the fit_transform method of the TfidfVectorizer object to transform the 'description' column of the DataFrame 'new_df' into a TF-IDF matrix
tfidf_matrix = tfidf.fit_transform(df['description'])
# Returns the feature names (i.e., each word in each column) of the TF-IDF vectorizer.
tfidf.get_feature_names_out
# Converts the TF-IDF matrix to a NumPy array.
tfidf_matrix.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

#Construction of the Cosine Similarity Matrix

In [105]:
# Computes the similarities between documents using the cosine similarity metric based on the TF-IDF matrix.
similarity = cosine_similarity(tfidf_matrix,
                               tfidf_matrix)

In [106]:
similarity

array([[1.        , 0.        , 0.        , ..., 0.        , 0.01538292,
        0.        ],
       [0.        , 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.        , 0.        ,
        0.02230089],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.01538292, 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.02230089, ..., 0.        , 0.        ,
        1.        ]])

#Generating Recommendations Based on Similarity

In [107]:
# Creates a Series using the 'title' column as the index.
# This contains the indices corresponding to each course title.
indices = pd.Series(df.index, index=df['title']).drop_duplicates()

In [108]:
indices[0:10]

title
Dick Johnson Is Dead                0
Blood & Water                       1
Ganglands                           2
Jailbirds New Orleans               3
Kota Factory                        4
Midnight Mass                       5
My Little Pony: A New Generation    6
Sankofa                             7
The Great British Baking Show       8
The Starling                        9
dtype: int64

In [109]:
indices.index.value_counts()

title
Dick Johnson Is Dead                     1
Ip Man 2                                 1
Hannibal Buress: Comedy Camisado         1
Turbo FAST                               1
Masha's Tales                            1
                                        ..
Love for Sale 2                          1
ROAD TO ROMA                             1
Good Time                                1
Captain Underpants Epic Choice-o-Rama    1
Zubaan                                   1
Name: count, Length: 8807, dtype: int64

In [110]:
# If multiple movies or TV Shows have the same title,
# it keeps only the last one. This ensures that each movies or TV Shows has a unique index.
indices = indices[~indices.index.duplicated(keep='last')]

In [111]:
# Finds the index of the TV Shows with the title 'Peaky Blinders'.
film_index = indices["Peaky Blinders"]

# Retrieves an array containing the similarities of the 'Peaky Blinder' shows with other shows.
similarity[film_index]

array([0.        , 0.        , 0.        , ..., 0.01938176, 0.        ,
       0.        ])

In [112]:
# Creating a DataFrame 'similarity_scores' from the similarity scores corresponding to a particular film index, with a column named "score"
similarity_scores = pd.DataFrame(similarity[film_index],
                                 columns=["score"])

In [113]:
# Sorting the DataFrame 'similarity_scores' by the "score" column in descending order and selecting the indices of the top 4 most similar films,
# excluding the first index (which corresponds to the film itself)
film_indices = similarity_scores.sort_values("score", ascending=False)[1:5].index

In [114]:
# Extracting the titles of the films from the DataFrame 'df' based on the indices stored in 'film_indices'
df["title"].iloc[film_indices]

7683     Our Godfather
2646    My Stupid Boss
3133               Don
8293          The Fear
Name: title, dtype: object

In [115]:
# Define a function named get_recommendations_film which takes a film title and an optional cosine similarity matrix as input parameters
def get_recommendations_film(title, cosine_sim=similarity):

    # Creating a Series named 'indices' with film titles as index and corresponding indices as values, dropping any duplicate indices
    indices = pd.Series(df.index, index=df['title']).drop_duplicates()

    # Removing duplicate indices from the Series, keeping only the last occurrence
    indices = indices[~indices.index.duplicated(keep='last')]

    # Getting the index of the input film title
    film_index = indices[title]

    # Retrieving the cosine similarity scores for the film with the given index
    cosine_sim[film_index]

    # Creating a DataFrame 'similarity_scores' with the cosine similarity scores for the film, with a column named "score"
    similarity_scores = pd.DataFrame(cosine_sim[film_index], columns=["score"])

    # Get the indices of the top 4 most similar films, excluding the first index (which corresponds to the film itself)
    course_indices = similarity_scores.sort_values("score", ascending=False)[1:5].index

    # Return the titles of the recommended films based on the indices obtained above
    return df['title'].iloc[course_indices]

In [116]:
df['title'].sample(10)

810                    Streets of Fire
7731                Peter and the Farm
2597                    Dangerous Lies
1442        Korean Pork Belly Rhapsody
1881      American Pie 9: Girls' Rules
8790     You Don't Mess with the Zohan
4433                    Do Dooni Chaar
1385    Demon Slayer: Kimetsu no Yaiba
6002              2307: Winter's Dream
8220                           The Box
Name: title, dtype: object

In [117]:
get_recommendations_film('The Twilight Saga: Eclipse')

450                 The Twilight Saga: New Moon
451                                    Twilight
447    The Twilight Saga: Breaking Dawn: Part 1
448    The Twilight Saga: Breaking Dawn: Part 2
Name: title, dtype: object

In [118]:
get_recommendations_film('Vikings Unearthed')

2250                              The Old Guard
7596    Norm of the North: King Sized Adventure
1629                                     U-Turn
5468            Sarah Silverman A Speck of Dust
Name: title, dtype: object

In [119]:
get_recommendations_film("Marvel's The Defenders")

1658                 Valentino
3752    Marvel's Jessica Jones
4655        Marvel's Iron Fist
4495        Marvel's Daredevil
Name: title, dtype: object