In [None]:
import pandas as pd
pd.plotting.register_matplotlib_converters()
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from plotly.offline import init_notebook_mode, iplot
import time
from mlxtend.preprocessing import TransactionEncoder
print("Setup Complete")

In [None]:
pip install pyforest

In [None]:
# NOTE(review): star import. Later cells use names such as `go`, `np` and `Counter`
# without a visible import before first use; presumably they are supplied by this
# line. Confirm, and prefer explicit imports so the dependencies are visible.
from pyforest import *

In [None]:
netflix = pd.read_csv("../input/netflix-shows/netflix_titles.csv")

# Derive the year and month in which each title was added.
# Whitespace is stripped first so that padded date strings parse like the rest.
netflix["date_added"] = pd.to_datetime(netflix["date_added"].str.strip())
netflix["year_added"] = netflix["date_added"].dt.year
netflix["month_added"] = netflix["date_added"].dt.month

# Split `duration` into two columns using its first space-separated token:
#   season_count - the token when the text mentions "Season", otherwise ""
#   duration     - the token when it does not, otherwise ""
# Missing durations are treated as "" so they end up blank in both columns
# instead of raising a TypeError on the substring test.
duration_raw = netflix["duration"].fillna("")
is_seasonal = duration_raw.str.contains("Season", regex=False)
leading_token = duration_raw.str.split(" ").str[0]
netflix["season_count"] = leading_token.where(is_seasonal, "")
netflix["duration"] = leading_token.where(~is_seasonal, "")
netflix.head()

In [None]:
netflix_films = pd.read_csv("../input/netflix-shows/netflix_titles.csv")

In [None]:
netflix_summary=pd.read_csv("../input/netflix-shows/netflix_titles.csv")
netflix_summary.head()

In [None]:
netflix['genre'] = netflix['listed_in'].apply(lambda x :  x.replace(' ,',',').replace(', ',',').split(',')) 
netflix['genre'].head()

In [None]:
netflix_shows=netflix[netflix['type']=='TV Show']

In [None]:
imdb_ratings=pd.read_csv('/kaggle/input/imdb-extensive-dataset/IMDb ratings.csv',usecols=['weighted_average_vote'])
imdb_titles=pd.read_csv('/kaggle/input/imdb-extensive-dataset/IMDb movies.csv', usecols=['title', 'year', 'genre'])
ratings = pd.DataFrame({'Title': imdb_titles.title,
                       'Release Year': imdb_titles.year,
                       'Rating': imdb_ratings.weighted_average_vote,
                       'Genre': imdb_titles.genre})
ratings.drop_duplicates(subset=['Title', 'Release Year', 'Rating'], inplace=True)
ratings.shape

Applying an inner join on the IMDb ratings dataset and the Netflix dataset to retrieve the content that both has a rating on IMDb and is available on Netflix.

In [None]:
# DataFrame.dropna() returns a new frame, so the result has to be kept;
# calling it as a bare statement leaves the incomplete rows in place.
ratings_complete = ratings.dropna()

# Keep only titles present in both datasets, best rated first.
joint_data = ratings_complete.merge(netflix_summary, left_on='Title', right_on='title', how='inner')
joint_data = joint_data.sort_values(by='Rating', ascending=False)

The top rated 20 films on Netflix are:

In [None]:
import plotly.express as px
top_rated=joint_data[0:20]
fig =px.sunburst(
    top_rated,
    path=['title','country'],
    values='Rating',
    color='Rating')
fig.show()

In [None]:
import plotly.express as px
top_rated

Countries with the highest rated content.

In [None]:
# Number of joined titles per country, largest first, as a one-column frame.
country_count = (
    joint_data['country']
    .value_counts()
    .sort_values(ascending=False)
    .to_frame()
)
# The fifteen countries with the most titles.
top_countries = country_count.iloc[:15]
top_countries

As you can see, the United States is the single most dominant country producing the highest rated content, with its closest rival being India.

In [None]:
import plotly.express as px
data = dict(
    number=[799, 701, 107, 56, 50, 40, 36, 35, 33, 30, 26, 22, 22, 20],
    country=["United States", "India", "United Kingdom", "Canada", "Philippines", "Spain", "South Korea",
         "Indonesia", "France", "Australia", "Nigeria", "Turkey", "Mexico", "Egypt"])
fig = px.funnel(data, x='number', y='country')
fig.show()

In [None]:
plt.figure(figsize=(11,10))
sns.set(style="darkgrid")
ax = sns.countplot(x="rating", data=netflix_films, palette="Set2", order=netflix_films['rating'].value_counts().index[0:15])

Most Netflix content is aimed towards TV-MA audiences followed by TV-14 and TV-PG audiences. It is also interesting to note that there is no content classified as TV-Y7-FV, UR and NC-17 which could be a potential growth area.

In [None]:
d1 = netflix[netflix["type"] == "TV Show"]
d2 = netflix[netflix["type"] == "Movie"]

In [None]:
import plotly.figure_factory as ff
x1 = d2['duration'].fillna(0.0).astype(float)
fig = ff.create_distplot([x1], ['a'], bin_size=0.9, curve_type='normal', colors=["#2ad65b"])
fig.update_layout(title_text='Distplot with Normal Distribution')
fig.show()

From the above normal distribution graph, it becomes evident that the standard running time for films is about 98 minutes.

In [None]:
col = 'season_count'

# season_count is stored as text; convert to integers so the table sorts
# numerically (1, 2, ..., 10) rather than lexicographically ("1", "10", "2").
seasons = pd.to_numeric(d1[col], errors="coerce").dropna().astype(int)

# rename_axis + reset_index(name=...) gives the columns [col, "count"] no matter
# how the installed pandas version labels the output of value_counts().
vc1 = seasons.value_counts().rename_axis(col).reset_index(name="count")
# Share of all TV shows, computed once for the whole column.
vc1['percent'] = 100 * vc1['count'] / vc1['count'].sum()
vc1 = vc1.sort_values(col)

trace1 = go.Bar(x=vc1[col], y=vc1["count"], name="TV Shows", marker=dict(color="#a932de"))
data = [trace1]
layout = go.Layout(title="Seasons", legend=dict(x=0.1, y=1.1, orientation="h"))
fig = go.Figure(data, layout=layout)
fig.show()

The vast majority of Netflix TV Shows run for only 1 season with the second and third highest television content spanning for 2 and 3 seasons respectively, and so on.

In [None]:
d1 = netflix[netflix["type"] == "TV Show"]
d2 = netflix[netflix["type"] == "Movie"]

col = "year_added"


def _count_table(series, label):
    """Tabulate how often each value occurs in `series`.

    Returns a DataFrame with the columns [label, "count", "percent"], sorted by
    `label`. rename_axis + reset_index(name=...) is used so the column names do
    not depend on how the installed pandas version labels value_counts() output.
    """
    table = series.value_counts().rename_axis(label).reset_index(name="count")
    table['percent'] = 100 * table['count'] / table['count'].sum()
    return table.sort_values(label)


vc1 = _count_table(d1[col], col)
vc2 = _count_table(d2[col], col)

trace1 = go.Scatter(x=vc1[col], y=vc1["count"], name="TV Shows", marker=dict(color="#a152de"))
trace2 = go.Scatter(x=vc2[col], y=vc2["count"], name="Movies", marker=dict(color="#8ad72b"))
data = [trace1, trace2]
layout = go.Layout(title="TV and Movie Content added over the years", legend=dict(x=0.1, y=1.1, orientation="h"))
fig = go.Figure(data, layout=layout)
fig.show()

From 2014 onwards, the gap between movies and TV shows on Netflix widens markedly, with Netflix focusing more on movies.

In [None]:
col = "listed_in"
categories = ", ".join(d2['listed_in']).split(", ")
counter_list = Counter(categories).most_common(50)
labels = [_[0] for _ in counter_list][::-1]
values = [_[1] for _ in counter_list][::-1]
trace1 = go.Bar(y=labels, x=values, orientation="h", name="TV Shows", marker=dict(color="#b722de"))

data = [trace1]
layout = go.Layout(title="New Content Added Over Time", legend=dict(x=0.1, y=1.1, orientation="h"))
fig = go.Figure(data, layout=layout)
fig.show()

# Releases by Year

In [None]:
plt.figure(figsize=(12,11))
plt.title("Total number of movies released by Netflix by Year")
sns.set(style="white")
ax = sns.countplot(y="release_year", data=netflix, palette="Set1", order=netflix['release_year'].value_counts().index[0:15])

As you can see from this bar graph, 2018 was the biggest year for movie releases by Netflix.

In [None]:
# Assign the filled column back instead of calling fillna(inplace=True) on a
# column selection, which newer pandas versions warn may not update the frame.
netflix['country'] = netflix['country'].fillna('')


def _split_unique(values):
    """Return the set of non-empty, stripped tokens obtained by splitting each value on commas."""
    tokens = {part.strip() for value in values for part in value.split(',')}
    tokens.discard('')
    return tokens


countries_list = _split_unique(netflix['country'].unique())
categories_list = _split_unique(netflix['listed_in'].unique())

# Title-count matrix: one row per country, one column per category.
country_category_netflix = pd.DataFrame(index=sorted(countries_list),
    columns=sorted(categories_list))

# Each category mask is computed once and reused for every country.
# regex=False makes the names match literally rather than as regex patterns.
# NOTE(review): matching is by substring, so a name contained in a longer name
# is counted for both.
category_masks = {category: netflix['listed_in'].str.contains(category, regex=False)
                  for category in categories_list}
for country in countries_list:  # previously `countries.list`, an undefined name
    country_mask = netflix['country'].str.contains(country, regex=False)
    for category, category_mask in category_masks.items():
        country_category_netflix.loc[country, category] = \
        int((country_mask & category_mask).sum())

In [None]:
# Title-count matrix: one row per country, one column per category.
country_category_netflix = pd.DataFrame(index=sorted(countries_list),
    columns=sorted(categories_list))
for country in countries_list:  # previously `countries.list`, an undefined name
    # Hoisted: the country mask does not change across categories.
    # regex=False makes the names match literally rather than as regex patterns.
    country_mask = netflix['country'].str.contains(country, regex=False)
    for category in categories_list:
        category_mask = netflix['listed_in'].str.contains(category, regex=False)
        country_category_netflix.loc[country, category] = \
        int((country_mask & category_mask).sum())

# Content in United States

In [None]:
netflix_us=netflix[netflix['country'] == 'United States']
nannef=netflix_us.dropna()
import plotly.express as px
fig = px.treemap(nannef, path=['country', 'rating'],
                color='rating', hover_data=['rating', 'title'], color_continuous_scale='Purples')
fig.show()

It is notable that the content in the United States is heavily categorised as TV-MA. Although, there is a more similar distribution of content grouped under the TV-14, R, TV-PG, and PG-13 ratings. Personally, I believe that this emphasises making television/films more accessible to broader audiences rather than one target audience.

In [None]:
from collections import Counter

country_data = netflix['country']

# Flatten every row into one comma-separated string, normalise the spacing
# around commas, then count how often each country name appears.
joined_countries = ','.join(country_data)
joined_countries = joined_countries.replace(' ,', ',').replace(', ', ',')
occurrences = Counter(joined_countries.split(','))

country_count = pd.Series(dict(occurrences)).sort_values(ascending=False)
top20country = country_count.head(20)

In [None]:
col = "listed_in"
# Count how often each category label occurs across the rows of d2.
categories = ", ".join(d2['listed_in']).split(", ")
counter_list = Counter(categories).most_common(50)
# Reverse both lists so the most common category ends up at the top of the chart.
labels = [_[0] for _ in counter_list][::-1]
values = [_[1] for _ in counter_list][::-1]  # closing bracket was missing (SyntaxError)
# go.Line is deprecated; a Scatter trace in "lines" mode draws the same line.
# The colour needs the leading '#' to be a valid hex colour.
trace_1 = go.Scatter(y=labels, x=values, mode="lines", name="TV Shows",
                line=dict(color="#a678de"))

data = [trace_1]
layout = go.Layout(title="Content added over time", legend=dict(x=0.1, y=1.1, orientation="h"))
fig = go.Figure(data, layout=layout)
fig.show()

In [None]:
newest_us_series=netflix_us.sort_values(by='release_year', ascending=False)[0:25]

In [None]:
newest_us_series

In [None]:
fig = go.Figure(data=[go.Table(header=dict(values=['Title', 'Release Year']),
                              cells=dict(values=[newest_us_series['title'], newest_us_series['release_year']]))
                                   ])
fig.show()

In [None]:
oldest_us_films = netflix_us.sort_values(by='release_year', ascending=True)[0:25]
oldest_us_films = oldest_us_films[oldest_us_films['duration'] != ""]
oldest_us_films[['title', 'release_year']][:15]

In [None]:
# Table of the oldest US titles. The frame built in the preceding cell is
# `oldest_us_films`; `oldest_us_series` was never defined.
fig = go.Figure(data=[go.Table(header=dict(values=['Title', 'Release Year']),
                              cells=dict(values=[oldest_us_films['title'], oldest_us_films['release_year']]))
                                   ])
fig.show()

# Content from India

In [None]:
netflix_india=netflix[netflix['country'] == 'India']
nannef=netflix_india.dropna()
import plotly.express as px
fig = px.treemap(nannef, path=['country', 'rating'],
                color='rating', hover_data=['rating', 'title'], color_continuous_scale='Oranges')
fig.show()

Interestingly, Indian content has a predominant focus on TV-14 compared to TV-MA content. This could account for the increased talent amongst teenagers and pre-teenagers, particularly in the Bollywood industry. There is also a more evenly spread cluster with the TV-Y7, TV-G, TV-Y, PG-13, NR, PG, R, and TV-Y7-FV ratings which could indicate less attention or specialism across that produced content.

In [None]:
newest_india_series=netflix_india.sort_values(by='release_year', ascending=False)[0:10]

In [None]:
fig = go.Figure(data=[go.Table(header=dict(values=['Title', 'Release Year']),
                              cells=dict(values=[newest_india_series['title'], newest_india_series['release_year']]))
                                   ])
fig.show()

In [None]:
oldest_india_films = netflix_india.sort_values(by='release_year', ascending=True)[0:25]
oldest_india_films = oldest_india_films[oldest_india_films['duration'] != ""]
oldest_india_films[['title', 'release_year']][:15]

# United Kingdom Content

In [None]:
netflix_uk=netflix[netflix['country'] == 'United Kingdom']
nannef=netflix_uk.dropna()
import plotly.express as px
fig = px.treemap(nannef, path=['country', 'rating'],
                color='rating', hover_data=['rating', 'title'], color_continuous_scale='Greens')
fig.show()

In [None]:
newest_uk_series=netflix_uk.sort_values(by='release_year', ascending=False)[0:25]

In [None]:
fig = go.Figure(data=[go.Table(header=dict(values=['Title', 'Release Year']),
                              cells=dict(values=[newest_uk_series['title'], newest_uk_series['release_year']]))
                                   ])
fig.show()

In [None]:
oldest_uk_films = netflix_uk.sort_values(by='release_year', ascending=True)[0:25]
oldest_uk_films = oldest_uk_films[oldest_uk_films['duration'] != ""]
oldest_uk_films[['title', 'release_year']][:15]

# Canadian Content

In [None]:
netflix_ca=netflix[netflix['country'] == 'Canada']
nannef=netflix_ca.dropna()
import plotly.express as px
fig = px.treemap(nannef, path=['country', 'rating'],
                color='rating', hover_data=['rating', 'title'], color_continuous_scale='Blues')
fig.show()

In [None]:
newest_ca_series=netflix_ca.sort_values(by='release_year', ascending=False)[0:25]

In [None]:
fig = go.Figure(data=[go.Table(header=dict(values=['Title', 'Release Year']),
                              cells=dict(values=[newest_ca_series['title'], newest_ca_series['release_year']]))
                                   ])
fig.show()

In [None]:
oldest_ca_films_series = netflix_ca.sort_values(by='release_year', ascending=True)[0:25]
oldest_ca_films_series = oldest_ca_films_series[oldest_ca_films_series['duration'] != ""]
oldest_ca_films_series[['title', 'release_year']][:15]

# Spanish Content

In [None]:
netflix_sp=netflix[netflix['country'] == 'Spain']
nannef=netflix_sp.dropna()
import plotly.express as px
fig = px.treemap(nannef, path=['country', 'rating'],
                color='rating', hover_data=['rating', 'title'], color_continuous_scale='Reds')
fig.show()

In [None]:
newest_sp_series=netflix_sp.sort_values(by='release_year', ascending=False)[0:25]

In [None]:
fig = go.Figure(data=[go.Table(header=dict(values=['Title', 'Release Year']),
                              cells=dict(values=[newest_sp_series['title'], newest_sp_series['release_year']]))
                                   ])
fig.show()

In [None]:
oldest_sp_films_series = netflix_sp.sort_values(by='release_year', ascending=True)[0:25]
oldest_sp_films_series = oldest_sp_films_series[oldest_sp_films_series['duration'] != ""]
oldest_sp_films_series[['title', 'release_year']][:15]

# South Korean Content

In [None]:
netflix_sk=netflix[netflix['country'] == 'South Korea']
nannef=netflix_sk.dropna()
import plotly.express as px
fig = px.treemap(nannef, path=['country', 'rating'],
                color='rating', hover_data=['rating', 'title'], color_continuous_scale='Yellows')
fig.show()

In [None]:
newest_sk_series=netflix_sk.sort_values(by='release_year', ascending=False)[0:25]

In [None]:
fig = go.Figure(data=[go.Table(header=dict(values=['Title', 'Release Year']),
                              cells=dict(values=[newest_sk_series['title'], newest_sk_series['release_year']]))
                                   ])
fig.show()

In [None]:
oldest_sk_films_series = netflix_sk.sort_values(by='release_year', ascending=True)[0:25]
oldest_sk_films_series = oldest_sk_films_series[oldest_sk_films_series['duration'] != ""]
oldest_sk_films_series[['title', 'release_year']][:15]

# Indonesian Content

In [None]:
netflix_ia=netflix[netflix['country'] == 'Indonesia']
nannef=netflix_ia.dropna()
import plotly.express as px
fig = px.treemap(nannef, path=['country', 'rating'],
                color='rating', hover_data=['rating', 'title'], color_continuous_scale='Magentas')
fig.show()

In [None]:
newest_ia_series=netflix_ia.sort_values(by='release_year', ascending=False)[0:25]

In [None]:
fig = go.Figure(data=[go.Table(header=dict(values=['Title', 'Release Year']),
                              cells=dict(values=[newest_ia_series['title'], newest_ia_series['release_year']]))
                                   ])
fig.show()

In [None]:
oldest_ia_films_series = netflix_ia.sort_values(by='release_year', ascending=True)[0:25]
oldest_ia_films_series = oldest_ia_films_series[oldest_ia_films_series['duration'] != ""]
oldest_ia_films_series[['title', 'release_year']][:15]

# French Content

In [None]:
netflix_fr=netflix[netflix['country'] == 'France']
nannef=netflix_fr.dropna()
import plotly.express as px
fig = px.treemap(nannef, path=['country', 'rating'],
                color='rating', hover_data=['rating', 'title'], color_continuous_scale='Violets')
fig.show()

In [None]:
newest_fr_series=netflix_fr.sort_values(by='release_year', ascending=False)[0:25]

In [None]:
fig = go.Figure(data=[go.Table(header=dict(values=['Title', 'Release Year']),
                              cells=dict(values=[newest_fr_series['title'], newest_fr_series['release_year']]))
                                   ])
fig.show()

In [None]:
oldest_fr_films_series = netflix_fr.sort_values(by='release_year', ascending=True)[0:25]
oldest_fr_films_series = oldest_fr_films_series[oldest_fr_films_series['duration'] != ""]
oldest_fr_films_series[['title', 'release_year']][:15]

# Australian Content

In [None]:
netflix_au=netflix[netflix['country'] == 'Australia']
nannef=netflix_au.dropna()
import plotly.express as px
fig = px.treemap(nannef, path=['country', 'rating'],
                color='rating', hover_data=['rating', 'title'], color_continuous_scale='Opals')
fig.show()

In [None]:
newest_au_series=netflix_au.sort_values(by='release_year', ascending=False)[0:25]

In [None]:
fig = go.Figure(data=[go.Table(header=dict(values=['Title', 'Release Year']),
                              cells=dict(values=[newest_au_series['title'], newest_au_series['release_year']]))
                                   ])
fig.show()

In [None]:
netflix['description'].head(10)

In [None]:
# Import TfidfVectorizer from scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer

# Create a TF-IDF vectoriser that ignores English stop words such as 'the' and 'a'.
tfidf = TfidfVectorizer(stop_words='english')

# Replace missing descriptions with an empty string so they can be vectorised
netflix['description'] = netflix['description'].fillna('')

# Fit the vectoriser on the descriptions and transform them into the TF-IDF matrix
tfidf_matrix = tfidf.fit_transform(netflix['description'])

# Output the shape of tfidf_matrix
tfidf_matrix.shape

In [None]:
# Import linear_kernel
from sklearn.metrics.pairwise import linear_kernel

# Compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [None]:
netflix.head()

In [None]:
netflix.tail()

In [None]:
netflix.info()

In [None]:
# identify the number of missing data points per column
missing_values_count = netflix.isnull().sum()

# look at the # of missing points in the first 50 columns
missing_values_count[0:50]

In [None]:
# Total number of cells (rows x columns). DataFrame.size gives this directly,
# replacing np.product, which is deprecated and removed in NumPy 2.0.
total_cells = netflix.size
total_missing = missing_values_count.sum()

# Percentage of all cells that are missing
percent_missing = (total_missing/total_cells) * 100
print(percent_missing)

# **Rename the date_added column to release date**

In [None]:
# Swap the text "date_added" for "release_date" in every column label.
netflix.columns = [label.replace("date_added", "release_date") for label in netflix.columns]

In [None]:
pd.options.display.max_rows = None

# **Data Cleaning and replacing NaN Values**

In [None]:
netflix["country"].unique().size

# **Column data is unreliable in "Country" and there are multiple countries listed in some cases**

In [None]:
netflix["country"].value_counts(dropna=False).tail(10)

In [None]:
netflix["country"].sample(50)

# **Then, update the country column.**

In [None]:
netflix["country"] = netflix["country"].str.split(",").str[0]

In [None]:
netflix["country"].unique().size

In [None]:
netflix["country"].value_counts().head(10)

# **Now, we are left with the NaN values to handle.**

In [None]:
netflix["country"].mode()[0]

In [None]:
# An earlier cell replaces missing countries with '' rather than leaving NaN,
# so fillna() alone would not touch them. Turn blanks back into missing values,
# then fill them with the most frequent real country.
country_known = netflix["country"].mask(netflix["country"].eq(""))
netflix["country"] = country_known.fillna(country_known.mode()[0])

In [None]:
netflix["country"].head(15)

In [None]:
netflix[netflix["rating"].isnull()]

In [None]:
netflix["rating"].value_counts(dropna=False).head(10)

In [None]:
netflix.loc[netflix["title"] == "13TH: A Conversation with Oprah Winfrey & Ava DuVernay", "rating"] = 'TV-PG'
netflix.loc[netflix["title"] == "Gargantia on the Verdurous Planet", "rating"] = 'TV-G'
netflix.loc[netflix["title"] == "Little Lunch", "rating"] = 'TV-Y7'
netflix.loc[netflix["title"] == "Louis C.K. 2017", "rating"] = 'TV-MA'
netflix.loc[netflix["title"] == "Louis C.K.: Hilarious", "rating"] = 'TV-MA'
netflix.loc[netflix["title"] == "Louis C.K.: Live at the Comedy Store", "rating"] = 'TV-MA'
netflix.loc[netflix["title"] == "My Honor Was Loyalty", "rating"] = 'PG-13'

In [None]:
netflix["rating"].value_counts(dropna=False).head(10)

In [None]:
netflix[netflix["rating"].isnull()]

In [None]:
netflix[netflix["release_date"].isna()]

In [None]:
# Dates to backfill for the titles whose release_date is missing.
# NOTE(review): the Anthony Bourdain entry previously read "April 14 6, 2013",
# which is not a valid date; "April 14, 2013" is assumed to be the intended value.
missing_release_dates = {
    "A Young Doctor's Notebook and Other Stories": "December 6, 2012",
    "Anthony Bourdain: Parts Unknown": "April 14, 2013",
    "Frasier": "September 16, 1993",
    "Friends": "September 22, 1994",
    "Gunslinger Girl": "May 21, 2002",
    "Kikoriki": "May 17, 2004",
    "La Familia P. Luche": "November 29, 2002",
    "Maron": "May 3, 2013",
    "Red vs. Blue": "April 1, 2003",
    "The Adventures of Figaro Pho": "August 31, 2015",
}
for show_title, date_text in missing_release_dates.items():
    # Store a real timestamp so the column does not end up holding raw strings.
    netflix.loc[netflix["title"] == show_title, "release_date"] = pd.Timestamp(date_text)

In [None]:
netflix[netflix["release_date"].isnull()]

In [None]:
netflix["listed_in"].value_counts().head(10)

In [None]:
netflix["listed_in"].value_counts().tail(10)

In [None]:
netflix["listed_in"] = netflix["listed_in"].str.split(",").str[0]

In [None]:
netflix["listed_in"] = netflix["listed_in"].str.strip()

In [None]:
netflix["listed_in"].value_counts()[:10]

In [None]:
netflix.isnull().sum()

In [None]:
netflix["country"].value_counts().head()

# Exploratory Data Analysis

* What does each category mean?
* What content is available in various countries?
* Is Netflix increasingly focusing on television rather than films in recent years?

* TV-MA: This programme is aimed at mature audiences only and is therefore unsuitable for children under 17.
* TV-14: This programme includes some material that parents would find unsuitable for children under 14.
* TV-PG: This programme has material that parents would find unsuitable for younger children.
* R: Contains some adult material. Parents are encouraged to learn more about the film before taking their young children with them.
* PG-13: Parents are urged to be cautious. Some material may be inappropriate for children under 13.
* NR/UR: If a film has not been classified for a specific rating or is an extended version of a submitted film.
* PG: Some material may be unsuitable for children. It may contain some material parents might be wary of for their young children.
* TV-Y7: This programme is designed for children aged 7 and above.
* TV-G: This programme is suitable for all ages.
* TV-Y: These programmes are aimed at a very young audience, including children from ages 2-6.
* TV-Y7-FV: Programming with fantasy violence that is recommended for ages 7 and above.
* G: All ages admitted. Nothing that would offend parents for being viewed by children.
* NC: No children under 17 allowed. The content is appropriate only for adults.

From these definitions, we learn that NR and UR are virtually the same rating (not rated and unrated).
Uncut or extended versions of films that are classified "Unrated" even include ratings stating that the uncut version of a film has content that varies from the theatrical release and may be unsuitable for children. Therefore, this must be corrected.

In [None]:
# Fold the 'UR' rating into 'NR' with one vectorised assignment instead of
# visiting every row in a Python loop.
netflix.loc[netflix['rating'] == 'UR', 'rating'] = 'NR'

In [None]:
netflix["type"].value_counts()

In [None]:
sns.countplot(x = "type", data = netflix)

In [None]:
netflix["type"].value_counts().plot(kind = "pie", figsize=(10,10),autopct='%1.1f%%')

In [None]:
# seaborn has no `setgrid` attribute, so the bare expression raised AttributeError.
# NOTE(review): "darkgrid" mirrors the style used earlier in the notebook; confirm
# it is the intended look for the plots that follow.
sns.set_style("darkgrid")

In [None]:
plt.figure(figsize=(8,6))
netflix['rating'].value_counts(normalize=True).plot.bar()
plt.title('Distribution of rating categories')
plt.xlabel('rating')
plt.ylabel('relative frequency')
plt.show()

In [None]:
plt.figure(figsize=(10,8))
sns.countplot(x='rating', hue='type', data=netflix)
plt.title('comparing frequency between type and rating')
plt.show()

Evidently from this bar chart, we can surmise that movies are the dominant category in the majority of rating categories in Netflix, with the exception of TV-Y and TV-Y7.

In [None]:
netflix['country'].value_counts().sort_values(ascending=False).head(10)

In [None]:
netflix['country'].value_counts().sort_values(ascending=False).tail()

In [None]:
most_productive_countries=netflix[(netflix['country']=='United States')|(netflix['country']=='India')|(netflix['country']=='United Kingdom')|
                                  (netflix['country']=='Canada')|(netflix['country']=='Japan')|(netflix['country']=='France')]

plt.figure(figsize=(10,8))
sns.countplot(x='country',hue='type',data=most_productive_countries)
plt.title('Comparisons between the types of content that the most productive countries create')
plt.show()

According to this bar graph, Japan is the only country to produce more TV shows than movies.

In [None]:
for i in most_productive_countries['country'].unique():
    print(i)
    print(most_productive_countries[most_productive_countries['country']==i]['rating'].value_counts(normalize=True)*100)
    print('-'*10)

# Most Featured Actors on Netflix Films

In [None]:
def country_trace(country, flag = "movie"):
    """Build a horizontal bar trace of the most frequent cast names for a country.

    Rows whose `country` text contains `country` (case-insensitive) are flagged
    in the `from_us` column of the global `netflix` frame. With flag == "movie"
    only rows with a non-empty `duration` are kept; otherwise only rows with a
    non-empty `season_count`. The 25 most common cast entries are counted, blank
    names are dropped, and the rest are returned as a `go.Bar` trace.
    """
    needle = country.lower()
    netflix["from_us"] = netflix['country'].fillna("").apply(
        lambda text: 1 if needle in text.lower() else 0)
    matching = netflix[netflix["from_us"] == 1]

    # Pick the column that marks the requested content type.
    filter_column = "duration" if flag == "movie" else "season_count"
    matching = matching[matching[filter_column] != ""]

    names = ", ".join(matching['cast'].fillna("")).split(", ")
    ranked = [(name, count) for name, count in Counter(names).most_common(25) if name != ""]
    # Reverse so the most frequent name is listed last.
    ranked.reverse()

    bar_labels = [name + "  " for name, _ in ranked]
    bar_values = [count for _, count in ranked]
    return go.Bar(y=bar_labels, x=bar_values, orientation="h", name="", marker=dict(color="#a678de"))

from plotly.subplots import make_subplots
traces = []
titles = ["United States", "","India","", "United Kingdom", "Canada","", "Spain","", "Japan"]
for title in titles:
    if title != "":
        traces.append(country_trace(title))

fig = make_subplots(rows=2, cols=5, subplot_titles=titles)
fig.add_trace(traces[0], 1,1)
fig.add_trace(traces[1], 1,3)
fig.add_trace(traces[2], 1,5)
fig.add_trace(traces[3], 2,1)
fig.add_trace(traces[4], 2,3)
fig.add_trace(traces[5], 2,5)

fig.update_layout(height=1200, showlegend=False)
fig.show()

In [None]:
netflix.director.value_counts()[1:20].sort_values(ascending=False).plot(kind='bar', width=0.5, color='yellow');

In [None]:
small = netflix[netflix["type"] == "Movie"]
small = small[small["country"] == "United Kingdom"]

col = "director"
categories = ", ".join(small[col].fillna("")).split(", ")
import collections
counter_list = collections.Counter(categories).most_common(12)
counter_list = [_ for _ in counter_list if _[0] != ""]
labels = [_[0] for _ in counter_list][::-1]
values = [_[1] for _ in counter_list][::-1]
trace1 = go.Bar(y=labels, x=values, orientation="h", name="TV Shows", marker=dict(color="orange"))

data = [trace1]
layout = go.Layout(title="Movie Directors from the UK with most content", legend=dict(x=0.1, y=1.1, orientation="h"))
fig = go.Figure(data, layout=layout)
fig.show()

In [None]:
# Title counts for the 19 most frequent `director` values among movies.
# Naming the column explicitly avoids relying on the label value_counts()
# gives its result, which differs between pandas versions.
director_counts = netflix.groupby('type')['director'].value_counts()['Movie'][0: 19]
data = director_counts.to_frame(name='director')

plt.barh(data.index, data['director'], color = "#007f5c")

# Plot title
plt.title('Most Popular Directors for Movies', fontsize=18, fontweight='bold')

# Display graph
plt.show()

# Analysis of Countries

In [None]:
# subset dataset and split 
country = netflix.loc[netflix.country.notnull(), 'country'].astype('str').apply(lambda t: t.split(', '))

# Convert DataFrame column into list of strings
country = list(country)

# number of movies/TV shows without null values
len(country)

In [None]:
# Instantiate encoder and identify unique country
encoder = TransactionEncoder().fit(country)

# One-hot encode
onehot_country = encoder.transform(country)

# Convert one-hot encoded data to DataFrame and set show_id as index
onehot_country = pd.DataFrame(onehot_country, columns = encoder.columns_, index=netflix.loc[netflix.country.notnull(), 'show_id'])

# Print the one-hot encoded country dataset
onehot_country.head()

# Cast Analysis

In [None]:
# Subset dataset and split
cast = netflix.loc[netflix.cast.notnull(),'cast'].astype('str').apply(lambda t: t.split(', '))

# Convert DataFrame column into a list of strings
cast = list(cast)

# Number of movies/TV Shows
len(cast)

In [None]:
# Implement encoder and detect unique records
encoder = TransactionEncoder().fit(cast)

# One-hot encode
onehot = encoder.transform(cast)

# Convert one-hot encoded data to DataFrame and set show_id as index
onehot_cast = pd.DataFrame(onehot, columns = encoder.columns_, index=netflix.loc[netflix.cast.notnull(),'show_id'])

# Print the one-hot encoded dataset
onehot_cast.shape

In [None]:
onehot_cast.sum().sort_values(ascending=False).head()

In [None]:
# Function that returns information about cast's Movies/TV Shows
def cast(actor):
    """Return the rows of `netflix` whose `cast` text contains `actor`.

    Matching is a literal, case-sensitive substring test. Rows with a missing
    cast never match; converting them to text first would make them match a
    search for 'nan'.
    """
    matches = netflix["cast"].str.contains(actor, regex=False, na=False)
    data = netflix[matches]
    return(data)

In [None]:
# Set a function to the top featured actor and reveal the first 5 movies/TV Shows
cast('Anupam Kher').head()

In [None]:
# Reset index
onehot_cast = onehot_cast.reset_index()

In [None]:
usa = netflix[netflix.country == 'United States']
usa.head()

In [None]:
# Attach the content type (Movie / TV Show) to each one-hot encoded cast row
cast_country = onehot_cast.merge(netflix[['show_id', 'type']], how='left', on='show_id')

# Attach the one-hot encoded countries. That frame carries show_id in its index,
# so the join key is given explicitly instead of relying on shared column names.
cast_country = cast_country.merge(onehot_country, how='left', left_on='show_id', right_index=True)

In [None]:
# Filter by movie category and US
cast_us = cast_country.loc[(cast_country.type == 'Movie') 
                           & (cast_country['United States'] == True)]

In [None]:
# One entry per (title, cast member) for the titles in `usa`. Each name is
# stripped because splitting on ',' leaves a leading space on every name after
# the first, which would count the same person under two spellings.
cast_usa = (
    usa.set_index("title")["cast"]
    .str.split(',', expand=True)
    .stack()
    .str.strip()
    .reset_index(level=1, drop=True)
)
# Show a sample rather than printing the whole Series.
cast_usa.head()

In [None]:
# Calculate the overall number of American films of actors/actresses
us_cast_count = cast_us.loc[:,onehot_cast.columns].drop('show_id', axis=1)\
                        .sum().sort_values(ascending=False)

In [None]:
# Top-5 actors/actresses. The series computed in the preceding cell is
# `us_cast_count`; `cast_usa_count` was never defined.
us_cast_count.head()

In [None]:
cast('Adam Sandler').head()

In [None]:
cast('James Franco').head()

In [None]:
cast('Samuel L. Jackson').head()

In [None]:
cast('Fred Tatasciore').head()

In [None]:
cast('Judy Greer').head()

In [None]:
cast('Jennifer Garner').head()

In [None]:
cast('Amy Adams').head()

In [None]:
cast('Lucy Liu').head()

In [None]:
plt.figure(figsize=(13,7))
plt.title("Most Featured American Actors/Actresses on Netflix",size='20')
sns.countplot(y = cast_usa, order=cast_usa.value_counts().index[:10], palette='Paired')
plt.show()

In [None]:
usa_actors = cast_usa.value_counts()
usa_actors.head()

# Director Analysis

In [None]:
# Subset dataset and split
director = netflix.loc[netflix.director.notnull(),'director'].astype('str').apply(lambda t: t.split(', '))

# Convert DataFrame column into a list of strings
director = list(director)

# Number of movies/TV Shows
len(director)

In [None]:
# Implement encoder and detect unique records
encoder = TransactionEncoder().fit(director)

# One-hot encode
onehot = encoder.transform(director)

# Convert one-hot encoded data to DataFrame and set show_id as index
onehot_director = pd.DataFrame(onehot, columns = encoder.columns_, index=netflix.loc[netflix.director.notnull(),'show_id'])

# Print the one-hot encoded dataset
onehot_director.shape

In [None]:
onehot_director.sum().sort_values(ascending=False).head()

In [None]:
# Function defining the Movies/TV Shows by director
def director(name, frame=None):
    """Return the titles whose ``director`` field mentions ``name``.

    Parameters
    ----------
    name : str
        Director to look for. Matched as a plain, case-sensitive substring of
        the ``director`` column, so co-directed titles are included.
    frame : pandas.DataFrame, optional
        Table with a ``director`` column. When omitted, the notebook-level
        ``netflix`` frame is used.

    Returns
    -------
    pandas.DataFrame
        The rows of ``frame`` whose director field contains ``name``.

    Notes
    -----
    Defining this function rebinds the name ``director``, which an earlier cell
    uses for the list of director-name lists fed to the encoder.
    """
    if frame is None:
        frame = netflix
    # Missing directors become "" so they can never match a name;
    # regex=False keeps characters such as "." or "(" in a name literal.
    credited = frame['director'].fillna('').astype(str).str.contains(name, regex=False)
    return frame[credited]

In [None]:
# First rows returned by director() for Jan Suter
director('Jan Suter').head()

In [None]:
# Distinct values of the country column among the rows returned for Jan Suter
director('Jan Suter').country.unique()

In [None]:
# Tally individual performers, not whole cast strings: value_counts() on the raw
# `cast` column ranks complete cast lists, so every movie's comma-separated cast
# is first split into one name per row.
movie_cast = netflix.loc[netflix['type'] == 'Movie', 'cast'].dropna()
actor_names = movie_cast.str.split(',').explode().str.strip()
# Blank entries (e.g. a placeholder used for missing casts) are not actors
top_movie_actors = actor_names[actor_names != ''].value_counts().head(19)

plt.barh(top_movie_actors.index, top_movie_actors.values, color = "#007f5c")

# Plot title
plt.title('Most Popular Movie Actors', fontsize=18, fontweight='bold')

# Display graph
plt.show()

In [None]:
import collections


def plot_top_movie_directors(country, title_name=None, top_n=12):
    """Show a horizontal bar chart of the directors with the most movies from one country.

    Parameters
    ----------
    country : str
        Value the ``country`` column must equal exactly; titles that list
        several countries are therefore not included.
    title_name : str, optional
        How the country is spelled in the chart title. Defaults to ``country``.
    top_n : int, optional
        Maximum number of directors to draw.

    The figure is displayed with ``fig.show()``; nothing is returned.
    """
    if title_name is None:
        title_name = country

    movies = netflix[(netflix["type"] == "Movie") & (netflix["country"] == country)]

    # Co-directors are separated by ", ". Blank entries (titles without a director)
    # are dropped BEFORE taking the top N, so they cannot use up one of the slots.
    names = ", ".join(movies["director"].fillna("")).split(", ")
    tally = collections.Counter(name.strip() for name in names if name.strip())
    ranked = tally.most_common(top_n)

    # Reversed so the most prolific director is drawn at the top of the chart
    labels = [name for name, _ in ranked][::-1]
    values = [count for _, count in ranked][::-1]

    # These are movies, so the trace is named accordingly
    bars = go.Bar(y=labels, x=values, orientation="h", name="Movies", marker=dict(color="orange"))
    layout = go.Layout(title="Movie Directors from " + title_name + " with most content",
                       legend=dict(x=0.1, y=1.1, orientation="h"))
    fig = go.Figure([bars], layout=layout)
    fig.show()


# (value in the country column, spelling used in the chart title)
DIRECTOR_CHART_COUNTRIES = [
    ("United States", "the US"),
    ("India", "India"),
    ("Canada", "Canada"),
    ("Spain", "Spain"),
    ("South Korea", "South Korea"),
    ("Indonesia", "Indonesia"),
    ("France", "France"),
    ("Australia", "Australia"),
]

for chart_country, chart_title_name in DIRECTOR_CHART_COUNTRIES:
    plot_top_movie_directors(chart_country, chart_title_name)

# Network Analysis of Actors & Directors

In [None]:
import networkx as nx

In [None]:
# Replace (not delete) missing values with a single-space placeholder so the
# strip/split/str.contains calls in the network-graph code can run on every row.
# The result is assigned back to the frame: calling fillna(inplace=True) on a
# selected column is chained assignment, which pandas deprecates and which leaves
# the frame untouched when copy-on-write is enabled.
text_columns = ['director', 'cast', 'country']
netflix[text_columns] = netflix[text_columns].fillna(' ')

In [None]:
def draw_network_graph(country_name, font_size=25):
    """Draw which of a country's most frequent actors worked with which of its most frequent directors.

    Titles are selected when their ``country`` field contains ``country_name``
    (plain substring). Among those titles, the 50 actors and the 50 directors
    credited on the most titles become nodes (blue = actor, red = director), and
    an edge actor -> director is added when both are credited on the same title.
    The node with the highest degree is printed before the figure is shown.

    Parameters
    ----------
    country_name : str
        Text to look for in the ``country`` column of the notebook-level ``netflix`` frame.
    font_size : int, optional
        Font size of the node labels.

    Notes
    -----
    Names are compared exactly, one credited person at a time. Substring/regex
    matching of a name against the whole cast string over-counts people whose
    name is contained in another name and fails on names containing regex
    metacharacters. Counting in a single pass over the rows also avoids one
    full-frame scan per name and the row-by-row ``DataFrame.append`` (removed
    in pandas 2.0).
    """
    from collections import Counter  # local import keeps the cell self-contained

    def names_in(field):
        """Distinct, stripped names of a comma-separated field, in order of appearance."""
        if not isinstance(field, str):
            return []
        stripped = (part.strip() for part in field.split(','))
        return list(dict.fromkeys(name for name in stripped if name))

    # One (actors, directors) pair per title attributed to the country
    credits = [
        (names_in(cast), names_in(director))
        for cast, director, country in zip(netflix.cast, netflix.director, netflix.country)
        if isinstance(country, str) and country_name in country
    ]

    # Number of selected titles each person is credited on
    actor_titles = Counter()
    director_titles = Counter()
    for actors, directors in credits:
        actor_titles.update(actors)
        director_titles.update(directors)

    def top_50(tally):
        # Only names containing a space take part in the ranking (single-word names
        # are left out); ties keep the order of first appearance, so the selection
        # is the same on every run.
        return [name for name, _ in tally.most_common() if ' ' in name][:50]

    top_50_actors = top_50(actor_titles)
    top_50_directors = top_50(director_titles)
    actor_set = set(top_50_actors)
    director_set = set(top_50_directors)

    G = nx.DiGraph()
    G.add_nodes_from(top_50_actors)
    G.add_nodes_from(top_50_directors)
    for actors, directors in credits:
        for actor in actors:
            if actor not in actor_set:
                continue
            for director in directors:
                if director in director_set:
                    G.add_edge(actor, director)

    if G.number_of_nodes() == 0:
        # Nothing matched: max() below would raise on an empty graph
        print('No actors or directors found for ' + country_name)
        return

    #Blue nodes for actors and red for directors
    color_map = ['blue' if node in actor_set else 'red' for node in G]

    plt.figure(1,figsize=(30,30))
    nx.draw(G,node_color=color_map, with_labels=True,font_color='green',font_size=font_size)
    print('Max connections: '+ str(max(dict(G.degree()).items(), key = lambda x : x[1])))
    plt.show()

In [None]:
# Actor/director network for the titles whose country field contains 'United States'
draw_network_graph('United States')

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.cluster import MiniBatchKMeans

# Build the tfidf matrix with the descriptions
start_time = time.time()  # NOTE(review): not read again in the visible part of the notebook
text_content = netflix['description']
vector = TfidfVectorizer(max_df=0.4,           # ignore terms found in more than 40% of the descriptions
                         min_df=1,             # keep terms found in at least one description
                         stop_words='english', # remove stop words
                         lowercase=True,       # convert everything to lower case
                         use_idf=True,         # weight terms by inverse document frequency
                         norm='l2',            # scale every description vector to unit length
                         smooth_idf=True       # add one to document frequencies; prevents divide-by-zero
                         )
tfidf = vector.fit_transform(text_content)

# Clustering  Kmeans
k = 200
# Fixed seed: without it every run assigns the titles to different clusters
kmeans = MiniBatchKMeans(n_clusters = k, random_state = 42)
kmeans.fit(tfidf)

# Per cluster, term indices ordered from the heaviest to the lightest centre weight
centers = kmeans.cluster_centers_.argsort()[:,::-1]
# get_feature_names() was removed from newer scikit-learn releases; use whichever exists
if hasattr(vector, 'get_feature_names_out'):
    terms = vector.get_feature_names_out()
else:
    terms = vector.get_feature_names()
# Ten heaviest terms of cluster i: [terms[j] for j in centers[i, :10]]

# The descriptions are already vectorised in `tfidf`; transforming them a second
# time would only rebuild the same matrix
request_transform = tfidf
# new column cluster based on the description
netflix['cluster'] = kmeans.predict(request_transform)

netflix['cluster'].value_counts().head()

# Most Featured Actors on Netflix

In [None]:
def country_trace(country, flag = "movie"):
    """Build a horizontal bar trace of the most credited cast members for one country.

    Parameters
    ----------
    country : str
        Matched case-insensitively as a substring of the ``country`` column of
        the notebook-level ``netflix`` frame.
    flag : str, optional
        ``"movie"`` keeps the rows with a non-empty ``duration``; any other
        value keeps the rows with a non-empty ``season_count``.

    Returns
    -------
    plotly.graph_objects.Bar
        Up to 25 names and their number of credited titles, ordered so the most
        credited name is drawn at the top.

    Notes
    -----
    The country filter is a local mask, so calling this function does not write
    to ``netflix``. Blank names (missing casts, including a blank placeholder)
    are removed before the top 25 are taken, so they neither appear as a bar
    nor use up a slot.
    """
    from collections import Counter  # local import keeps the cell self-contained

    wanted = country.lower()
    in_country = netflix['country'].fillna("").apply(lambda x : wanted in x.lower())
    small = netflix[in_country]
    if flag == "movie":
        small = small[small["duration"] != ""]
    else:
        small = small[small["season_count"] != ""]

    names = (name.strip() for name in ", ".join(small['cast'].fillna("")).split(", "))
    tags = Counter(name for name in names if name).most_common(25)

    # Two trailing spaces keep the labels clear of the axis
    labels = [name + "  " for name, _ in tags]
    values = [count for _, count in tags]
    trace = go.Bar(y=labels[::-1], x=values[::-1], orientation="h", name="", marker=dict(color="#a831de"))
    return trace

from plotly.subplots import make_subplots

# Empty strings are placeholders: they leave every second cell of the 2x5 grid untitled
titles = ["United States", "","India","", "United Kingdom", "Canada","", "Spain","", "Japan"]
traces = [country_trace(name) for name in titles if name != ""]

fig = make_subplots(rows=2, cols=5, subplot_titles=titles)

# (row, column) of the grid cell for each trace, in the order the traces were built
grid_slots = [(1, 1), (1, 3), (1, 5), (2, 1), (2, 3), (2, 5)]
for bars, (grid_row, grid_col) in zip(traces, grid_slots):
    fig.add_trace(bars, grid_row, grid_col)

fig.update_layout(height=1200, showlegend=False)
fig.show()

# Actors Appearing in the Most TV Shows on Netflix

In [None]:
# The empty string leaves the middle cell of the 1x3 grid untitled
titles = ["United States","", "United Kingdom"]
traces = [country_trace(name, flag="tv_shows") for name in titles if name != ""]

fig = make_subplots(rows=1, cols=3, subplot_titles=titles)

# First trace in the left cell, second trace in the right cell
for bars, grid_col in zip(traces, (1, 3)):
    fig.add_trace(bars, 1, grid_col)

fig.update_layout(height=600, showlegend=False)
fig.show()

In [None]:
def draw_network_graph(country_name, font_size=15):
    """Draw which of a country's most frequent actors worked with which of its most frequent directors.

    Titles are selected when their ``country`` field contains ``country_name``
    (plain substring). Among those titles, the 50 actors and the 50 directors
    credited on the most titles become nodes (blue = actor, red = director), and
    an edge actor -> director is added when both are credited on the same title.
    The node with the highest degree is printed before the figure is shown.

    Parameters
    ----------
    country_name : str
        Text to look for in the ``country`` column of the notebook-level ``netflix`` frame.
    font_size : int, optional
        Font size of the node labels.

    Notes
    -----
    NOTE(review): this notebook defines ``draw_network_graph`` twice; this later
    definition is the one in effect from this cell onwards.

    Names are compared exactly, one credited person at a time. Substring/regex
    matching of a name against the whole cast string over-counts people whose
    name is contained in another name and fails on names containing regex
    metacharacters. Counting in a single pass over the rows also avoids one
    full-frame scan per name and the row-by-row ``DataFrame.append`` (removed
    in pandas 2.0).
    """
    from collections import Counter  # local import keeps the cell self-contained

    def names_in(field):
        """Distinct, stripped names of a comma-separated field, in order of appearance."""
        if not isinstance(field, str):
            return []
        stripped = (part.strip() for part in field.split(','))
        return list(dict.fromkeys(name for name in stripped if name))

    # One (actors, directors) pair per title attributed to the country
    credits = [
        (names_in(cast), names_in(director))
        for cast, director, country in zip(netflix.cast, netflix.director, netflix.country)
        if isinstance(country, str) and country_name in country
    ]

    # Number of selected titles each person is credited on
    actor_titles = Counter()
    director_titles = Counter()
    for actors, directors in credits:
        actor_titles.update(actors)
        director_titles.update(directors)

    def top_50(tally):
        # Only names containing a space take part in the ranking (single-word names
        # are left out); ties keep the order of first appearance, so the selection
        # is the same on every run.
        return [name for name, _ in tally.most_common() if ' ' in name][:50]

    top_50_actors = top_50(actor_titles)
    top_50_directors = top_50(director_titles)
    actor_set = set(top_50_actors)
    director_set = set(top_50_directors)

    G = nx.DiGraph()
    G.add_nodes_from(top_50_actors)
    G.add_nodes_from(top_50_directors)
    for actors, directors in credits:
        for actor in actors:
            if actor not in actor_set:
                continue
            for director in directors:
                if director in director_set:
                    G.add_edge(actor, director)

    if G.number_of_nodes() == 0:
        # Nothing matched: max() below would raise on an empty graph
        print('No actors or directors found for ' + country_name)
        return

    #Blue nodes for actors and red for directors
    color_map = ['blue' if node in actor_set else 'red' for node in G]

    plt.figure(1,figsize=(30,30))
    nx.draw(G,node_color=color_map, with_labels=True,font_color='green',font_size=font_size)
    print('Max connections: '+ str(max(dict(G.degree()).items(), key = lambda x : x[1])))
    plt.show()

In [None]:
# Actor/director network for the titles whose country field contains 'India'
draw_network_graph('India')