In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Collect the full path of every file below the Kaggle input directory, then print them
# one per line in walk order.
input_paths = [
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from numpy import percentile
import pandas_profiling
import missingno as msno
import matplotlib.pyplot as plt
from matplotlib.text import Text
from matplotlib.lines import Line2D
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.patches import Rectangle, Polygon
import plotly.express as px
import plotly.io as pio
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.colors import n_colors
from plotly.subplots import make_subplots
from plotly.offline import init_notebook_mode, iplot
import seaborn as sns
import warnings
from scipy.stats import norm
import math
import time
import networkx as nx
import cufflinks as cf
from IPython.display import Markdown
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from PIL import Image
from collections import Counter

# Silence every warning category for the rest of the notebook (deprecation notices included).
warnings.filterwarnings('ignore')
# Default matplotlib look. NOTE(review): later cells call sns.set(), which presumably
# overrides this style for the seaborn/matplotlib figures — confirm.
plt.style.use('fivethirtyeight')

In [None]:
# Load the Netflix titles catalogue from the Kaggle input directory and preview the first rows.
df = pd.read_csv("/kaggle/input/netflix-shows/netflix_titles.csv")
df.head()

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics using describe()'s default column selection.
df.describe()

In [None]:
# Summary statistics for every column, including the text columns.
df.describe(include="all")

In [None]:
# Plain-text overview of the raw frame: size, column names, total missing cells and
# per-column cardinality.
n_rows, n_cols = df.shape
feature_names = list(df.columns)
total_missing = df.isnull().values.sum()
print('Rows     :', n_rows)
print('Columns  :', n_cols)
print('\nFeatures :\n     :', feature_names)
print('\nMissing values    :', total_missing)
print('\nUnique values :  \n', df.nunique())

In [None]:
# missingno matrix of the raw frame, used here to eyeball where values are missing.
msno.matrix(df)
plt.title("Missing Quantities Distribution", fontsize=40)
plt.show()

In [None]:
# Automated profile report of the raw frame.
# NOTE(review): pandas_profiling appears to have been renamed upstream (ydata-profiling) —
# confirm the kernel image still provides this module.
profile = pandas_profiling.ProfileReport(df)
profile

In [None]:
# missingno bar chart of the raw frame; `n` is not read again before the heatmap cell reassigns it.
n = msno.bar(df,color='purple')

In [None]:
# Frequency of each raw country string.
df['country'].value_counts()

In [None]:
# Number of missing values per column.
df.isnull().sum()

In [None]:
# Replace missing countries with the placeholder "NA".
# Assign the result back instead of calling replace(..., inplace=True) on the column
# selection: with pandas copy-on-write the chained in-place call modifies a temporary
# and leaves `df` unchanged.
df['country'] = df['country'].fillna("NA")
df['country'].value_counts()

In [None]:
df['date_added'].value_counts()

In [None]:
netflix_date = df[['date_added']].replace(np.nan,'Not Added')
netflix_date["release_month"] = netflix_date['date_added'].apply(lambda x: x.lstrip().split(" ")[0])
netflix_date.head()

In [None]:
netflix_date["release_month"].value_counts()

In [None]:
# Map the 'Not' placeholder (first word of 'Not Added') to 0.
# Assigned back rather than replaced in place on the column selection, which does not
# update the frame under pandas copy-on-write.
netflix_date['release_month'] = netflix_date['release_month'].replace('Not', 0)
netflix_date["release_month"].value_counts()

In [None]:
netflix_date.drop("date_added",axis=1,inplace=True)
netflix_date.head()

In [None]:
netflix = pd.concat([df,netflix_date],axis=1)
netflix.head()

In [None]:
# Derive date parts and split `duration` into a movie duration and a season count.
# Both are computed from the raw `df` columns (same index as `netflix`), so re-running
# this cell gives the same result instead of wiping season_count on the second pass.

# Other cells lstrip() date_added before splitting it, so leading whitespace is expected;
# strip it so every row matches the date format pandas infers from the first value.
raw_dates = df['date_added'].str.strip()
netflix['date_added'] = pd.to_datetime(raw_dates)
netflix['year_added'] = netflix['date_added'].dt.year
netflix['month_added'] = netflix['date_added'].dt.month

# A missing duration is treated as empty text, which yields "" in both derived columns
# (the original lambda raised TypeError on NaN).
raw_duration = df['duration'].fillna("")
has_seasons = raw_duration.str.contains("Season", regex=False)
leading_value = raw_duration.str.split(" ").str[0]
netflix['season_count'] = leading_value.where(has_seasons, "")
netflix['duration'] = leading_value.where(~has_seasons, "")

netflix.head()

In [None]:
netflix["rating"].value_counts()

In [None]:
netflix.isnull().sum()

In [None]:
netflix["rating"].replace(np.nan,"TV-MA",inplace=True)
netflix.isnull().sum()

In [None]:
netflix['type'].value_counts()

In [None]:
netflix.head()

In [None]:
sns.set()
sns.countplot(x="type",data=netflix)
plt.title("Number of Movies vs Number of TV Shows", fontsize=40)
plt.show()

In [None]:
# Two-column frame (type, count) for the pie chart.
# rename_axis + reset_index(name=...) fixes the column names explicitly. The previous
# rename({'index': 'type', 'type': 'count'}) relied on pandas 1.x naming; on pandas >= 2.0
# it turned the label column into a second 'count' column and categ['type'] failed.
categ = netflix['type'].value_counts().rename_axis('type').reset_index(name='count')
fig = go.Figure([go.Pie(labels=categ['type'], values=categ['count'], pull=[0.05, 0], marker=dict(colors=["#6ad49b", "#a678de"]))])
fig = fig.update_traces(hoverinfo='label+percent', textinfo='value+percent', textfont_size=15, insidetextorientation='radial')
fig = fig.update_layout(title='Types of Content on Netflix', title_x=0.5)
fig.show()

In [None]:
# Donut chart of the content-type split.
type_counts = netflix['type'].value_counts()
labels = type_counts.index
values = type_counts.values
title = 'Content Type'

slice_outline = {'color': "#000000", 'width': 2}
trace = go.Pie(
    labels=labels,
    values=values,
    title=title,
    hoverinfo='percent+value',
    textinfo='percent',
    textposition='inside',
    hole=0.7,
    showlegend=True,
    marker={'colors': ['cyan', 'gold'], 'line': slice_outline},
)
fig = go.Figure(data=[trace])
fig.show()

In [None]:
# Side-by-side view of the content-type split: matplotlib pie (left) and seaborn count
# plot (right), both ordered by frequency.
type_counts = netflix['type'].value_counts()
f, ax = plt.subplots(1, 2, figsize=(18, 8))
type_counts.plot.pie(explode=[0, 0.1], autopct='%1.1f%%', ax=ax[0], shadow=True)
ax[0].set_title('Distribution of Content Type on Netflix', fontsize=30)
ax[0].set_ylabel('Count')
# `x` must be passed by keyword: seaborn >= 0.12 accepts only `data` positionally, so the
# old positional call raises a TypeError.
sns.countplot(x='type', data=netflix, ax=ax[1], order=type_counts.index)
ax[1].set_title('Count of Contents', fontsize=30)
plt.show()

In [None]:
movies = netflix[netflix['type']=='Movie']
shows = netflix[netflix['type']=='TV Show']

movies.head()

In [None]:
shows.head()

In [None]:
plt.figure(figsize=(12,9))
plt.title("Rating Distribution of Content on Netflix", fontsize=30)
sns.countplot(x="rating",data=netflix,order= netflix['rating'].value_counts().index[0:14])

In [None]:
plt.figure(figsize=(12,9))
plt.title("Rating Distribution of Movies on Netflix", fontsize=30)
sns.countplot(x="rating",data=movies,order= movies['rating'].value_counts().index[0:14])

In [None]:
plt.figure(figsize=(12,9))
plt.title("Rating Distribution of Shows on Netflix", fontsize=30)
sns.countplot(x="rating",data=shows,order= shows['rating'].value_counts().index[0:14])

In [None]:
fig = px.pie(netflix,values = netflix['rating'].value_counts()[:10],names = netflix['rating'].value_counts()[:10].index,labels= netflix['rating'].value_counts()[:10].index)
fig.update_traces(textposition ='inside',textinfo='percent+label')
fig.update_layout(title='Rating Distribution of Content on Netflix', title_x=0.5)
fig.show()

In [None]:
fig = px.pie(movies,values = movies['rating'].value_counts()[:10],names = movies['rating'].value_counts()[:10].index,labels= movies['rating'].value_counts()[:10].index)
fig.update_traces(textposition ='inside',textinfo='percent+label')
fig.update_layout(title='Rating Distribution of Movies on Netflix', title_x=0.5)
fig.show()

In [None]:
fig = px.pie(shows,values = shows['rating'].value_counts()[:10],names = shows['rating'].value_counts()[:10].index,labels= shows['rating'].value_counts()[:10].index)
fig.update_traces(textposition ='inside',textinfo='percent+label')
fig.update_layout(title='Rating Distribution of Shows on Netflix', title_x=0.5)
fig.show()

In [None]:
r = netflix['rating'].value_counts()[:15].to_dict()
data = dict(
    count = r.values(),
    rating = r.keys())
fig = px.funnel(data, x='count', y='rating')
fig.update_layout(title='Count of Content Ratings on Netflix', title_x=0.5)
fig.show()

In [None]:
r = movies['rating'].value_counts()[:15].to_dict()
data = dict(
    count = r.values(),
    rating = r.keys())
fig = px.funnel(data, x='count', y='rating')
fig.update_layout(title='Count of Movie Ratings on Netflix', title_x=0.5)
fig.show()

In [None]:
r = shows['rating'].value_counts()[:15].to_dict()
data = dict(
    count = r.values(),
    rating = r.keys())
fig = px.funnel(data, x='count', y='rating')
fig.update_layout(title='Count of Show Ratings on Netflix', title_x=0.5)
fig.show()

In [None]:
# Rating counts for movies and TV shows drawn as two bar traces on one dark-themed figure.
# Labels and counts are read straight from the value_counts Series: reset_index() only
# produced an 'index' column on pandas 1.x, so the old column lookups raise KeyError on
# pandas >= 2.0.
movies_rating = movies['rating'].value_counts()
shows_rating = shows['rating'].value_counts()

trace1 = go.Bar(x = movies_rating.index,
                y = movies_rating.values,
                name = "Movies",
                marker = dict(color = 'rgb(249, 6, 6)',
                             line = dict(color='rgb(0, 0, 0)', width=1.5))
               )

trace2 = go.Bar(x = shows_rating.index,
                y = shows_rating.values,
                name = "TV Shows",
                marker = dict(color = 'rgb(26, 118, 255)',
                             line = dict(color='rgb(0, 0, 0)', width=1.5))
               )

layout = go.Layout(template='plotly_dark', title='Rating by Content Type', xaxis=dict(title='Rating'), yaxis=dict(title='Count'))
fig = go.Figure(data=[trace1, trace2], layout=layout)
fig.show()

In [None]:
sns.set()
plt.figure(figsize=(30,9))
sns.countplot(x="release_year",data= netflix,order = netflix['release_year'].value_counts().index[0:20])
plt.title("Content Released over the years", fontsize=40)
plt.xticks(rotation=45)
plt.show()

In [None]:
sns.set()
plt.figure(figsize=(30,9))
sns.countplot(x="release_year",data= movies,order = movies['release_year'].value_counts().index[0:20])
plt.title("Movies released over the years", fontsize=40)
plt.xticks(rotation=45)
plt.show()

In [None]:
sns.set()
plt.figure(figsize=(30,9))
sns.countplot(x="release_year",data= shows,order = shows['release_year'].value_counts().index[0:20])
plt.title("Shows released over the years", fontsize=40)
plt.xticks(rotation=45)
plt.show()

In [None]:
fig = px.pie(netflix,values = netflix['release_year'].value_counts()[:10],names = netflix['release_year'].value_counts()[:10].index,labels= netflix['release_year'].value_counts()[:10].index)
fig.update_layout(title="Pie Chart for Distribution of Content based on Year Released")
fig.update_traces(textposition ='inside',textinfo='percent+label')
fig.show()

In [None]:
fig = px.pie(movies,values = movies['release_year'].value_counts()[:10],names = movies['release_year'].value_counts()[:10].index,labels= movies['release_year'].value_counts()[:10].index)
fig.update_layout(title="Pie Chart for Distribution of Movies based on Year Released")
fig.update_traces(textposition ='inside',textinfo='percent+label')
fig.show()

In [None]:
fig = px.pie(shows,values = shows['release_year'].value_counts()[:10],names = shows['release_year'].value_counts()[:10].index,labels= shows['release_year'].value_counts()[:10].index)
fig.update_layout(title="Pie Chart for Distribution of Shows based on Year Released")
fig.update_traces(textposition ='inside',textinfo='percent+label')
fig.show()

In [None]:
r = netflix['release_year'].value_counts()[:15].to_dict()
data = dict(
    count = r.values(),
    year = r.keys())
fig = px.funnel(data, x='year', y='count')
fig.update_layout(title="Funnel Plot for Distribution of Content based on Year Released")
fig.show()

In [None]:
r = movies['release_year'].value_counts()[:15].to_dict()
data = dict(
    count = r.values(),
    year = r.keys())
fig = px.funnel(data, x='year', y='count')
fig.update_layout(title="Funnel Plot for Distribution of Movies based on Year Released")
fig.show()

In [None]:
r = shows['release_year'].value_counts()[:15].to_dict()
data = dict(
    count = r.values(),
    year = r.keys())
fig = px.funnel(data, x='year', y='count')
fig.update_layout(title="Funnel Plot for Distribution of Shows based on Year Released")
fig.show()

In [None]:
content = netflix['release_year'].value_counts().reset_index()
trace1 = go.Bar(x = content['index'],
                y = content['release_year'],
                marker = dict(color='rgb(255, 165, 0)',
                             line = dict(color='rgb(0, 0, 0)', width=1.5))
               )
layout = go.Layout(template='plotly_dark', title='Content Release over the years', xaxis=dict(title='Year'), yaxis=dict(title='Count'))
fig = go.Figure(data=[trace1], layout=layout)
fig.show()

In [None]:
content = movies['release_year'].value_counts().reset_index()
trace1 = go.Bar(x = content['index'],
                y = content['release_year'],
                marker = dict(color='rgb(255, 165, 0)',
                             line = dict(color='rgb(0, 0, 0)', width=1.5))
               )
layout = go.Layout(template='plotly_dark', title='Movies Release over the years', xaxis=dict(title='Year'), yaxis=dict(title='Count'))
fig = go.Figure(data=[trace1], layout=layout)
fig.show()

In [None]:
content = shows['release_year'].value_counts().reset_index()
trace1 = go.Bar(x = content['index'],
                y = content['release_year'],
                marker = dict(color='rgb(255, 165, 0)',
                             line = dict(color='rgb(0, 0, 0)', width=1.5))
               )
layout = go.Layout(template='plotly_dark', title='Shows Release over the years', xaxis=dict(title='Year'), yaxis=dict(title='Count'))
fig = go.Figure(data=[trace1], layout=layout)
fig.show()

In [None]:
# Release-year counts for movies and TV shows as two bar traces on one figure.
# The value_counts Series is used directly; the 'index' column that reset_index() produced
# on pandas 1.x does not exist on pandas >= 2.0.
movies_release = movies['release_year'].value_counts()
shows_release = shows['release_year'].value_counts()

trace1 = go.Bar(x = movies_release.index,
                y = movies_release.values,
                name = "Movies",
                marker = dict(color = 'rgb(249, 6, 6)',
                             line = dict(color='rgb(0, 0, 0)', width=1.5))
               )

trace2 = go.Bar(x = shows_release.index,
                y = shows_release.values,
                name = "TV Shows",
                marker = dict(color = 'rgb(26, 118, 255)',
                             line = dict(color='rgb(0, 0, 0)', width=1.5))
               )

layout = go.Layout(template='plotly_dark', title='Release Years by Content Type', xaxis=dict(title='Years'), yaxis=dict(title='Count'))
fig = go.Figure(data=[trace1, trace2], layout=layout)
fig.show()

In [None]:
sns.set()
plt.figure(figsize=(20,8))
sns.countplot(x="release_month",data= netflix,order = netflix['release_month'].value_counts().index)
plt.title("Content Released over the Months", fontsize=40)
plt.xticks(rotation=45)
plt.show()

In [None]:
sns.set()
plt.figure(figsize=(20,8))
sns.countplot(x="release_month",data= movies,order = movies['release_month'].value_counts().index)
plt.title("Movies Released over the Months", fontsize=40)
plt.xticks(rotation=45)
plt.show()

In [None]:
sns.set()
plt.figure(figsize=(20,8))
sns.countplot(x="release_month",data= shows,order = shows['release_month'].value_counts().index)
plt.title("Shows Released over the Months", fontsize=40)
plt.xticks(rotation=45)
plt.show()

In [None]:
fig = px.pie(netflix,values = netflix['release_month'].value_counts(),names = netflix['release_month'].value_counts().index,labels= netflix['release_month'].value_counts().index)
fig.update_layout(title="Pie Chart for Distribution of Content based on Month Released")
fig.update_traces(textposition ='inside',textinfo='percent+label')
fig.show()

In [None]:
fig = px.pie(movies,values = movies['release_month'].value_counts(),names = movies['release_month'].value_counts().index,labels= movies['release_month'].value_counts().index)
fig.update_layout(title="Pie Chart for Distribution of Movies based on Month Released")
fig.update_traces(textposition ='inside',textinfo='percent+label')
fig.show()

In [None]:
fig = px.pie(shows,values = shows['release_month'].value_counts(),names = shows['release_month'].value_counts().index,labels= shows['release_month'].value_counts().index)
fig.update_layout(title="Pie Chart for Distribution of Shows based on Month Released")
fig.update_traces(textposition ='inside',textinfo='percent+label')
fig.show()

In [None]:
r = netflix['release_month'].value_counts().to_dict()
data = dict(
    count = r.values(),
    month = r.keys())
fig = px.funnel(data, x='month', y='count')
fig.update_layout(title="Funnel Plot for Distribution of Content based on Month Released")
fig.show()

In [None]:
r = movies['release_month'].value_counts().to_dict()
data = dict(
    count = r.values(),
    month = r.keys())
fig = px.funnel(data, x='month', y='count')
fig.update_layout(title="Funnel Plot for Distribution of Movies based on Month Released")
fig.show()

In [None]:
r = shows['release_month'].value_counts().to_dict()
data = dict(
    count = r.values(),
    month = r.keys())
fig = px.funnel(data, x='month', y='count')
fig.update_layout(title="Funnel Plot for Distribution of Shows based on Month Released")
fig.show()

In [None]:
content = netflix['release_month'].value_counts().reset_index()
trace1 = go.Bar(x = content['index'],
                y = content['release_month'],
                marker = dict(color='rgb(255, 165, 0)',
                             line = dict(color='rgb(0, 0, 0)', width=1.5))
               )
layout = go.Layout(template='plotly_dark', title='Content Release over the months', xaxis=dict(title='Months'), yaxis=dict(title='Count'))
fig = go.Figure(data=[trace1], layout=layout)
fig.show()

In [None]:
content = movies['release_month'].value_counts().reset_index()
trace1 = go.Bar(x = content['index'],
                y = content['release_month'],
                marker = dict(color='rgb(255, 165, 0)',
                             line = dict(color='rgb(0, 0, 0)', width=1.5))
               )
layout = go.Layout(template='plotly_dark', title='Movies Release over the months', xaxis=dict(title='Months'), yaxis=dict(title='Count'))
fig = go.Figure(data=[trace1], layout=layout)
fig.show()

In [None]:
content = shows['release_month'].value_counts().reset_index()
trace1 = go.Bar(x = content['index'],
                y = content['release_month'],
                marker = dict(color='rgb(255, 165, 0)',
                             line = dict(color='rgb(0, 0, 0)', width=1.5))
               )
layout = go.Layout(template='plotly_dark', title='Shows Release over the months', xaxis=dict(title='Months'), yaxis=dict(title='Count'))
fig = go.Figure(data=[trace1], layout=layout)
fig.show()

In [None]:
# release_month counts for movies and TV shows as two bar traces on one figure.
# The value_counts Series is used directly; the 'index' column that reset_index() produced
# on pandas 1.x does not exist on pandas >= 2.0.
movies_release = movies['release_month'].value_counts()
shows_release = shows['release_month'].value_counts()

trace1 = go.Bar(x = movies_release.index,
                y = movies_release.values,
                name = "Movies",
                marker = dict(color = 'rgb(249, 6, 6)',
                             line = dict(color='rgb(0, 0, 0)', width=1.5))
               )

trace2 = go.Bar(x = shows_release.index,
                y = shows_release.values,
                name = "TV Shows",
                marker = dict(color = 'rgb(26, 118, 255)',
                             line = dict(color='rgb(0, 0, 0)', width=1.5))
               )

layout = go.Layout(template='plotly_dark', title='Release Months by Content Type', xaxis=dict(title='Months'), yaxis=dict(title='Count'))
fig = go.Figure(data=[trace1, trace2], layout=layout)
fig.show()

In [None]:
sns.set()
plt.figure(figsize=(20,8))
sns.countplot(x="year_added",data= netflix,order = netflix['year_added'].value_counts().index[0:20])
plt.title("Content Added over the years", fontsize=40)
plt.xticks(rotation=45)
plt.show()

In [None]:
sns.set()
plt.figure(figsize=(20,8))
sns.countplot(x="year_added",data= movies,order = movies['year_added'].value_counts().index[0:20])
plt.title("Movies Added over the years", fontsize=40)
plt.xticks(rotation=45)
plt.show()

In [None]:
sns.set()
plt.figure(figsize=(20,8))
sns.countplot(x="year_added",data= shows,order = shows['year_added'].value_counts().index[0:20])
plt.title("Shows Added over the years", fontsize=40)
plt.xticks(rotation=45)
plt.show()

In [None]:
fig = px.pie(netflix,values = netflix['year_added'].value_counts(),names = netflix['year_added'].value_counts().index,labels= netflix['year_added'].value_counts().index)
fig.update_layout(title="Pie Chart for Distribution of Content based on Year Added")
fig.update_traces(textposition ='inside',textinfo='percent+label')
fig.show()

In [None]:
fig = px.pie(movies,values = movies['year_added'].value_counts(),names = movies['year_added'].value_counts().index,labels= movies['year_added'].value_counts().index)
fig.update_layout(title="Pie Chart for Distribution of Movies based on Year Added")
fig.update_traces(textposition ='inside',textinfo='percent+label')
fig.show()

In [None]:
fig = px.pie(shows,values = shows['year_added'].value_counts(),names = shows['year_added'].value_counts().index,labels= shows['year_added'].value_counts().index)
fig.update_layout(title="Pie Chart for Distribution of Shows based on Year Added")
fig.update_traces(textposition ='inside',textinfo='percent+label')
fig.show()

In [None]:
r = netflix['year_added'].value_counts().to_dict()
data = dict(
    count = r.values(),
    year = r.keys())
fig = px.funnel(data, x='year', y='count')
fig.update_layout(title="Funnel Plot for Distribution of Content based on Year Added")
fig.show()

In [None]:
r = movies['year_added'].value_counts().to_dict()
data = dict(
    count = r.values(),
    year = r.keys())
fig = px.funnel(data, x='year', y='count')
fig.update_layout(title="Funnel Plot for Distribution of Movies based on Year Added")
fig.show()

In [None]:
r = shows['year_added'].value_counts().to_dict()
data = dict(
    count = r.values(),
    year = r.keys())
fig = px.funnel(data, x='year', y='count')
fig.update_layout(title="Funnel Plot for Distribution of Shows based on Year Added")
fig.show()

In [None]:
content = netflix['year_added'].value_counts().reset_index()
trace1 = go.Bar(x = content['index'],
                y = content['year_added'],
                marker = dict(color='rgb(255, 165, 0)',
                             line = dict(color='rgb(0, 0, 0)', width=1.5))
               )
layout = go.Layout(template='plotly_dark', title='Content Added over the years', xaxis=dict(title='Year'), yaxis=dict(title='Count'))
fig = go.Figure(data=[trace1], layout=layout)
fig.show()

In [None]:
content = movies['year_added'].value_counts().reset_index()
trace1 = go.Bar(x = content['index'],
                y = content['year_added'],
                marker = dict(color='rgb(255, 165, 0)',
                             line = dict(color='rgb(0, 0, 0)', width=1.5))
               )
layout = go.Layout(template='plotly_dark', title='Movies Added over the years', xaxis=dict(title='Year'), yaxis=dict(title='Count'))
fig = go.Figure(data=[trace1], layout=layout)
fig.show()

In [None]:
content = shows['year_added'].value_counts().reset_index()
trace1 = go.Bar(x = content['index'],
                y = content['year_added'],
                marker = dict(color='rgb(255, 165, 0)',
                             line = dict(color='rgb(0, 0, 0)', width=1.5))
               )
layout = go.Layout(template='plotly_dark', title='Shows Added over the years', xaxis=dict(title='Year'), yaxis=dict(title='Count'))
fig = go.Figure(data=[trace1], layout=layout)
fig.show()

In [None]:
# Year-added counts for movies and TV shows as two bar traces on one figure.
# The value_counts Series is used directly; the 'index' column that reset_index() produced
# on pandas 1.x does not exist on pandas >= 2.0.
movies_release = movies['year_added'].value_counts()
shows_release = shows['year_added'].value_counts()

trace1 = go.Bar(x = movies_release.index,
                y = movies_release.values,
                name = "Movies",
                marker = dict(color = 'rgb(249, 6, 6)',
                             line = dict(color='rgb(0, 0, 0)', width=1.5))
               )

trace2 = go.Bar(x = shows_release.index,
                y = shows_release.values,
                name = "TV Shows",
                marker = dict(color = 'rgb(26, 118, 255)',
                             line = dict(color='rgb(0, 0, 0)', width=1.5))
               )

layout = go.Layout(template='plotly_dark', title='Year Added by Content Type', xaxis=dict(title='Year'), yaxis=dict(title='Count'))
fig = go.Figure(data=[trace1, trace2], layout=layout)
fig.show()

In [None]:
sns.set()
plt.figure(figsize=(20,8))
sns.countplot(x="month_added",data= netflix,order = netflix['month_added'].value_counts().index)
plt.title("Content Added over the Months", fontsize=40)
plt.xticks(rotation=45)
plt.show()

In [None]:
sns.set()
plt.figure(figsize=(20,8))
sns.countplot(x="month_added",data= movies,order = movies['month_added'].value_counts().index)
plt.title("Movies Added over the Months", fontsize=40)
plt.xticks(rotation=45)
plt.show()

In [None]:
sns.set()
plt.figure(figsize=(20,8))
sns.countplot(x="month_added",data= shows,order = shows['month_added'].value_counts().index)
plt.title("Shows Added over the Months", fontsize=40)
plt.xticks(rotation=45)
plt.show()

In [None]:
fig = px.pie(netflix,values = netflix['month_added'].value_counts(),names = netflix['month_added'].value_counts().index,labels= netflix['month_added'].value_counts().index)
fig.update_layout(title="Pie Chart for Distribution of Content based on Month Added")
fig.update_traces(textposition ='inside',textinfo='percent+label')
fig.show()

In [None]:
fig = px.pie(movies,values = movies['month_added'].value_counts(),names = movies['month_added'].value_counts().index,labels= movies['month_added'].value_counts().index)
fig.update_layout(title="Pie Chart for Distribution of Movies based on Month Added")
fig.update_traces(textposition ='inside',textinfo='percent+label')
fig.show()

In [None]:
fig = px.pie(shows,values = shows['month_added'].value_counts(),names = shows['month_added'].value_counts().index,labels= shows['month_added'].value_counts().index)
fig.update_layout(title="Pie Chart for Distribution of Shows based on Month Added")
fig.update_traces(textposition ='inside',textinfo='percent+label')
fig.show()

In [None]:
r = netflix['month_added'].value_counts().to_dict()
data = dict(
    count = r.values(),
    month = r.keys())
fig = px.funnel(data, x='month', y='count')
fig.update_layout(title="Funnel Plot for Distribution of Content based on Month Added")
fig.show()

In [None]:
r = movies['month_added'].value_counts().to_dict()
data = dict(
    count = r.values(),
    month = r.keys())
fig = px.funnel(data, x='month', y='count')
fig.update_layout(title="Funnel Plot for Distribution of Movies based on Month Added")
fig.show()

In [None]:
r = shows['month_added'].value_counts().to_dict()
data = dict(
    count = r.values(),
    month = r.keys())
fig = px.funnel(data, x='month', y='count')
fig.update_layout(title="Funnel Plot for Distribution of Shows based on Month Added")
fig.show()

In [None]:
content = netflix['month_added'].value_counts().reset_index()
trace1 = go.Bar(x = content['index'],
                y = content['month_added'],
                marker = dict(color='rgb(255, 165, 0)',
                             line = dict(color='rgb(0, 0, 0)', width=1.5))
               )
layout = go.Layout(template='plotly_dark', title='Content Added over the months', xaxis=dict(title='Month'), yaxis=dict(title='Count'))
fig = go.Figure(data=[trace1], layout=layout)
fig.show()

In [None]:
content = movies['month_added'].value_counts().reset_index()
trace1 = go.Bar(x = content['index'],
                y = content['month_added'],
                marker = dict(color='rgb(255, 165, 0)',
                             line = dict(color='rgb(0, 0, 0)', width=1.5))
               )
layout = go.Layout(template='plotly_dark', title='Movies Added over the months', xaxis=dict(title='Month'), yaxis=dict(title='Count'))
fig = go.Figure(data=[trace1], layout=layout)
fig.show()

In [None]:
content = shows['month_added'].value_counts().reset_index()
trace1 = go.Bar(x = content['index'],
                y = content['month_added'],
                marker = dict(color='rgb(255, 165, 0)',
                             line = dict(color='rgb(0, 0, 0)', width=1.5))
               )
layout = go.Layout(template='plotly_dark', title='Shows Added over the months', xaxis=dict(title='Month'), yaxis=dict(title='Count'))
fig = go.Figure(data=[trace1], layout=layout)
fig.show()

In [None]:
# Month-added counts for movies and TV shows as two bar traces on one figure.
# The value_counts Series is used directly; the 'index' column that reset_index() produced
# on pandas 1.x does not exist on pandas >= 2.0.
movies_added = movies['month_added'].value_counts()
shows_added = shows['month_added'].value_counts()

trace1 = go.Bar(x = movies_added.index,
                y = movies_added.values,
                name = "Movies",
                marker = dict(color = 'rgb(249, 6, 6)',
                             line = dict(color='rgb(0, 0, 0)', width=1.5))
               )

trace2 = go.Bar(x = shows_added.index,
                y = shows_added.values,
                name = "TV Shows",
                marker = dict(color = 'rgb(26, 118, 255)',
                             line = dict(color='rgb(0, 0, 0)', width=1.5))
               )

layout = go.Layout(template='plotly_dark', title='Months Added by Content Type', xaxis=dict(title='Month'), yaxis=dict(title='Count'))
fig = go.Figure(data=[trace1, trace2], layout=layout)
fig.show()

In [None]:
df.head()

In [None]:
shows_date = df[['date_added']].dropna()
shows_date['year'] = shows_date['date_added'].apply(lambda x: x.split(', ')[-1])
shows_date['month'] = shows_date['date_added'].apply(lambda x: x.lstrip().split(' ')[0])
month_order = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December'][::-1]
n = shows_date.groupby('year')['month'].value_counts().unstack().fillna(0)[month_order].T
plt.figure(figsize=(10, 7), dpi=200)
plt.pcolor(n, cmap='gist_heat_r', edgecolors='white', linewidths=2) # heatmap
plt.xticks(np.arange(0.5, len(n.columns), 1), n.columns, fontsize=7, fontfamily='serif')
plt.yticks(np.arange(0.5, len(n.index), 1), n.index, fontsize=7, fontfamily='serif')

plt.title('Netflix Contents Update', fontsize=12, fontfamily='serif', fontweight='bold', position=(0.20, 1.0+0.02))
cbar = plt.colorbar()

cbar.ax.tick_params(labelsize=8) 
cbar.ax.minorticks_on()
plt.show()

In [None]:
# Ten country values with the most titles, as a vertical bar chart.
group_country_movies = (
    netflix.groupby('country')['show_id']
    .count()
    .sort_values(ascending=False)
    .head(10)
)
plt.subplots(figsize=(15, 8))
group_country_movies.plot(kind='bar', fontsize=12)
# kind='bar' puts the countries on the x axis and the counts on the y axis; the two axis
# labels were previously swapped.
plt.xlabel('Country', fontsize=12)
plt.ylabel('Content Count', fontsize=12)
plt.title('Content count by Country', fontsize=12)
# Render explicitly like the other plotting cells (this replaces a stray plt.ioff()).
plt.show()

In [None]:
# Movies vs TV shows for the 15 most frequent country values.
sns.set()
top_countries = netflix['country'].value_counts().index[:15]
plt.figure(figsize=(25, 9))
chart = sns.countplot(data=netflix, x="country", hue="type", order=top_countries)
plt.setp(chart.get_xticklabels(), rotation=45)
plt.show()

In [None]:
# Share of the 10 most frequent country values.
# The counts are computed once and passed as plain arrays; the unused data frame and the
# Index passed as `labels` (plotly expects a dict there) are dropped.
country_counts = netflix['country'].value_counts().head(10)
fig = px.pie(names=country_counts.index, values=country_counts.values)
fig.update_traces(textposition ='inside',textinfo='percent+label')
fig.show()

In [None]:
fig = go.Figure([go.Bar(y=netflix['country'].value_counts()[:20].index,
                        x=netflix['country'].value_counts()[:20],
                        orientation="h",
                        name="",
                        marker=dict(color="#a678de"))])
fig = fig.update_layout(title='Countries with most Content', height=700, legend=dict(x=0.1, y=1.1, orientation="h"))
fig.show()

In [None]:
c = netflix['country'].value_counts()[:15].to_dict()
data = dict(
    count = c.values(),
    country = c.keys())
fig = px.funnel(data, x='count', y='country')
fig.show()

In [None]:
# Seaborn bar chart of the 20 most frequent country values.
top_countries = netflix['country'].value_counts().head(20)
plt.figure(figsize=(20, 10))
# x and y must be passed by keyword: seaborn >= 0.12 accepts only `data` positionally,
# so the old positional call raises a TypeError.
sns.barplot(x=top_countries.index, y=top_countries, palette="RdGy")
plt.xlabel('Countries', fontsize=20)
plt.ioff()
plt.show()

In [None]:
import squarify
y = netflix['country'].value_counts()[:20]
fig = plt.figure(figsize=(15, 10))
squarify.plot(sizes = y.values, label = y.index, color = sns.color_palette("RdGy", n_colors = 20), 
              linewidth=4, text_kwargs={'fontsize':14, 'fontweight':'bold'})
plt.title('Top 20 producing countries', position=(0.5, 1.0+0.03), fontsize=20, fontweight='bold')
plt.axis('off')
plt.show()

In [None]:
# Word cloud built from every value of the `country` column.
# Missing values are dropped first: str.join raises TypeError on a float NaN.
country_text = " ".join(netflix['country'].dropna().astype(str))
wordcloud = WordCloud(background_color = "black",width=1730,height=970).generate(country_text)
plt.figure(figsize=(15,10))
plt.imshow(wordcloud,interpolation = 'bilinear')
plt.axis("off")
# Other cells in this notebook describe this column as the producing country.
plt.title("Content Producing Countries on Netflix",color="black",fontsize=40)
plt.show()

In [None]:
# Titles per country as a two-column frame: `country`, `TotalShows`.
# rename_axis + reset_index(name=...) names both columns explicitly and keeps
# TotalShows as an integer column (a `.values` round-trip would make it object dtype).
Country = (
    netflix["country"]
    .value_counts()
    .rename_axis("country")
    .reset_index(name="TotalShows")
)
Country.head()

In [None]:
# World map shaded by the number of titles per country.
# `color` is what actually drives the shading; `labels` must be a dict of
# column name -> display name, and `featureidkey` only applies to GeoJSON input.
fig = px.choropleth(
    # Coerce to numeric so the colour scale is continuous even if TotalShows is object dtype.
    Country.assign(TotalShows=pd.to_numeric(Country["TotalShows"])),
    locations="country",
    locationmode="country names",
    color="TotalShows",
    labels={"TotalShows": "Total shows"},
)
fig.show()

In [None]:
Country2 = netflix.groupby('year_added')['country'].value_counts().reset_index(name='counts')
Country2.head()

In [None]:
fig  = px.choropleth(Country2, locations="country", color="counts", locationmode="country names",
                    animation_frame="year_added", range_color=[0, 200], color_continuous_scale=px.colors.sequential.OrRd)
fig.update_layout(title="Comparison by Country")
fig.show()

In [None]:
# Word cloud of all words appearing in titles.
# dropna/astype(str) keeps str.join from failing on a missing title.
title_text = " ".join(netflix['title'].dropna().astype(str))
wordcloud = WordCloud(background_color = "black",width=1730,height=970).generate(title_text)
plt.figure(figsize=(15,10))
plt.imshow(wordcloud,interpolation = 'bilinear')
plt.axis("off")
plt.title("Titles", color="black", fontsize=40)
# The extra plt.figure(1, ...) call was dropped: it ran after drawing, so it could
# never resize this plot and could only select or create another figure.
plt.show()

In [None]:
# Word cloud of distinct titles, with anything from the first "(" onward removed
# and the wordcloud package's default stop words filtered out.
wrds = netflix['title'].str.split("(").str[0].value_counts().keys()
wordcloud = WordCloud(stopwords=STOPWORDS,scale = 5,background_color = "black",width=1730,height=970, colormap="rainbow").generate(" ".join(wrds))
plt.figure(figsize=(15,10))
plt.imshow(wordcloud,interpolation = 'bilinear')
plt.axis("off")
plt.title("Key Words in Movie Titles", color="black", fontsize=40)
# The extra plt.figure(1, ...) call was dropped: it ran after drawing, so it could
# never resize this plot and could only select or create another figure.
plt.show()

In [None]:
netflix["listed_in"].value_counts()

In [None]:
netflix["listed_in"].value_counts()[0:25]

In [None]:
top_listed_in = netflix["listed_in"].value_counts()[:25]

In [None]:
fig = px.pie(netflix,values = top_listed_in,names = top_listed_in.index,labels= top_listed_in.index)
fig.update_traces(textposition ='inside',textinfo='percent+label')
fig.show()

In [None]:
# Horizontal bar chart of the most frequent `listed_in` values (top_listed_in).
plt.figure(figsize=(20, 10))
# Counts on x, labels on y; passed by keyword because recent seaborn releases
# no longer accept x/y positionally.
sns.barplot(x=top_listed_in.values, y=top_listed_in.index)
plt.xlabel('Count', fontsize=20)
plt.ylabel('Movie Listing', fontsize=20)
plt.show()

In [None]:
c = netflix['listed_in'].value_counts()[:25].to_dict()
data = dict(
    count = c.values(),
    genre = c.keys())
fig = px.funnel(data, x='count', y='genre')
fig.show()

In [None]:
genres = list(movies['listed_in'])
gen = []

for i in genres:
    i = list(i.split(','))
    for j in i:
        gen.append(j.replace(' ',''))
gen[:20]

In [None]:
g = Counter(gen)
g

In [None]:
text = list(set(gen))
plt.rcParams['figure.figsize']=(13, 13)
wordcloud = WordCloud(max_font_size=50, max_words=100, background_color='black').generate(str(text))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

In [None]:
g = {k: v for k, v in sorted(g.items(), key=lambda item: item[1], reverse=True)}
fig, ax = plt.subplots()
x = list(g.keys())
y = list(g.values())
ax.vlines(x, ymin=0, ymax=y)
ax.plot(x, y, "o")
ax.set_xticklabels(x, rotation=90)
ax.set_ylabel("Count of Movie Genres")
ax.set_title("Genres")

In [None]:
g = {k: v for k, v in sorted(g.items(), key=lambda item: item[1], reverse=True)}
data = dict(
    count = g.values(),
    genre = g.keys())
fig = px.funnel(data, x='count', y='genre')
fig.show()

In [None]:
gen = pd.DataFrame.from_dict(g, orient='index', columns=['Count'])
gen.head()

In [None]:
# 20 most frequent `listed_in` values among TV shows, largest bar at the top.
plt.figure(figsize=(20, 10))
shows['listed_in'].value_counts()[:20][::-1].plot(kind="barh", color="black")
# The slice above keeps 20 rows, so the title says 20.
plt.title("Top 20 Genres of TV Shows", fontsize=18)
plt.show()

In [None]:
# Character count of each description.
netflix['length'] = netflix['description'].str.len()
# Show the frame that was just modified so the new column is visible.
netflix.head()

In [None]:
# Histogram + KDE of description length (netflix['length'] is description.str.len()).
plt.figure(figsize=(12,5))

plot = sns.distplot(netflix['length'])
# Title and axis labels describe the plotted column, not prices.
plot.set_title("Distribution of Description Length", fontsize=20)
plot.set_xlabel("Description length (characters)", fontsize=15)
plot.set_ylabel("Density", fontsize=15)

plt.show()

In [None]:
# Shaded KDE of description length (netflix['length'] is description.str.len()).
plt.figure(figsize=(12,5))
sns.set(style="darkgrid")
plot = sns.kdeplot(netflix['length'], shade=True)
# Title and axis labels describe the plotted column, not prices.
plot.set_title("Distribution of Description Length", fontsize=20)
plot.set_xlabel("Description length (characters)", fontsize=15)
plot.set_ylabel("Density", fontsize=15)

plt.show()

In [None]:
netflix['duration'].head()

In [None]:
plt.figure(figsize=(12,5))
sns.set(style="darkgrid")
plot = sns.kdeplot(data=movies['duration'], shade=True)
plot.set_title("Distribution of Movie Duration", fontsize=20)

plt.show()

In [None]:
# Keep the leading token of each duration ("90 min" -> "90"); any value that
# mentions "Season" becomes an empty string.
movies['duration'] = movies.apply(lambda x : x['duration'].split(" ")[0] if "Season" not in x['duration'] else "", axis = 1)
# to_numeric(errors='coerce') turns the "" placeholders into NaN so that fillna
# can replace them; a direct astype(float) raises ValueError on "".
x1 = pd.to_numeric(movies['duration'], errors='coerce').fillna(0.0).astype(float)
fig = ff.create_distplot([x1], ['a'], bin_size=0.7, curve_type='normal', colors=['#6ad49b'])
fig.update_layout(title_text='Distplot with Normal Distribution')
fig.show()

In [None]:
trace = go.Histogram(x = netflix['duration'],
                     xbins = dict(size=0.5),
                     marker = dict(color = 'rgb(26, 118, 255)')
                    )
layout = go.Layout(template = 'plotly_dark', title='Distribution of Durations', xaxis=dict(title='Duration'))
fig = go.Figure(data=[trace], layout=layout)
fig.show()

In [None]:
def duration_in_country(country):
    """Return the durations of movies whose `country` equals `country`, as ints.

    Reads the notebook-level `netflix` frame. Each duration string has the
    characters in ' minSeaso' stripped from both ends before int() is applied.
    """
    wanted_rows = (netflix.country == country) & (netflix.type == 'Movie')
    raw_durations = netflix.loc[wanted_rows, 'duration']
    return raw_durations.apply(lambda text: int(text.strip(' minSeaso')))

# One KDE panel per country showing the spread of movie durations.
Countries = ['United States', 'India', 'Australia', 'Mexico', 'Spain', 'United Kingdom', 'Italy', 'Germany', 'France', 'Canada']
fig, axes = plt.subplots(len(Countries), 1, figsize=(18,18), constrained_layout=True)
fig.suptitle("Variation in Movie Durations in Various Countries", fontsize=16)

for ax, country in zip(axes, Countries):
    country_mov_dur = duration_in_country(country)
    # The keyword is lowercase `label`; `Label` is not a Line2D property.
    sns.kdeplot(country_mov_dur, label=f"Movie Duration in {country}", color="red", ax=ax)
    ax.legend()  # without a legend the per-panel label is never shown
plt.show()

In [None]:
old = netflix.sort_values('release_year', ascending=True)
old_movies = old[old['duration']!=""][:20]
old_movies

In [None]:
old_shows = old[old['season_count']!=""][:20]
old_shows

In [None]:
newest = netflix.sort_values(by='release_year', ascending=False)
newest_movies = newest[newest['duration']!=''][:20]
newest_movies

In [None]:
newest_shows = newest[newest['season_count']!=''][:20]
newest_shows

In [None]:
fig = go.Figure(data=[go.Table(header=dict(values=['title', 'release_year'],
                                           fill_color='paleturquoise'),
                               cells=dict(values=[old_shows['title'], old_shows['release_year']],
                                         fill_color='pink')
                              )])
fig.show()

In [None]:
fig = go.Figure(data=[go.Table(header=dict(values=['title', 'year_added'],
                                           fill_color='paleturquoise'),
                               cells=dict(values=[old_shows['title'], old_shows['year_added']],
                                         fill_color='pink')
                              )])
fig.show()

In [None]:
fig = go.Figure(data=[go.Table(header=dict(values=['title', 'release_year'],
                                           fill_color='paleturquoise'),
                               cells=dict(values=[old_movies['title'], old_movies['release_year']],
                                         fill_color='pink')
                              )])
fig.show()

In [None]:
fig = go.Figure(data=[go.Table(header=dict(values=['title', 'release_year'],
                                           fill_color='paleturquoise'),
                               cells=dict(values=[newest_shows['title'], newest_shows['release_year']],
                                         fill_color='pink')
                              )])
fig.show()

In [None]:
fig = go.Figure(data=[go.Table(header=dict(values=['title', 'release_year'],
                                           fill_color='paleturquoise'),
                               cells=dict(values=[newest_movies['title'], newest_movies['release_year']],
                                         fill_color='pink')
                              )])
fig.show()

In [None]:
# Titles per release year for movies (m) and shows (s), with each year's share in percent.
# rename_axis + reset_index(name=...) names the columns explicitly instead of relying on
# the column names reset_index() happens to produce, which differ between pandas versions.
m = movies['release_year'].value_counts().rename_axis('release_year').reset_index(name='count')
m['percent'] = 100 * m['count'] / m['count'].sum()  # total computed once, not per row
m = m.sort_values('release_year')

s = shows['release_year'].value_counts().rename_axis('release_year').reset_index(name='count')
s['percent'] = 100 * s['count'] / s['count'].sum()
s = s.sort_values('release_year')

m.head()

In [None]:
trace1 = go.Bar(x=m['release_year'], y=m['count'], name="Movies", marker=dict(color="#a678de"))
trace2 = go.Bar(x=s['release_year'], y=s['count'], name="TV Shows", marker=dict(color="#6ad49b"))
data = [trace1, trace2]
layout = go.Layout(title="Content Released over the Years", legend=dict(x=0.1, y=1.1, orientation="h"), template="plotly_dark")
fig = go.Figure(data, layout=layout)
fig.show()

In [None]:
# Titles per `year_added` for movies (m) and shows (s), with each year's share in percent.
# rename_axis + reset_index(name=...) names the columns explicitly instead of relying on
# the column names reset_index() happens to produce, which differ between pandas versions.
m = movies['year_added'].value_counts().rename_axis('year_added').reset_index(name='count')
m['percent'] = 100 * m['count'] / m['count'].sum()  # total computed once, not per row
m = m.sort_values('year_added')

s = shows['year_added'].value_counts().rename_axis('year_added').reset_index(name='count')
s['percent'] = 100 * s['count'] / s['count'].sum()
s = s.sort_values('year_added')

m.head()

In [None]:
trace1 = go.Bar(x=m['year_added'], y=m['count'], name="Movies", marker=dict(color="#a678de"))
trace2 = go.Bar(x=s['year_added'], y=s['count'], name="TV Shows", marker=dict(color="#6ad49b"))
data = [trace1, trace2]
layout = go.Layout(title="Content Added over the Years", legend=dict(x=0.1, y=1.1, orientation="h"), template="plotly_dark")
fig = go.Figure(data, layout=layout)
fig.show()

In [None]:
# Titles per `month_added` for movies (m) and shows (s), with each month's share in percent.
# rename_axis + reset_index(name=...) names the columns explicitly instead of relying on
# the column names reset_index() happens to produce, which differ between pandas versions.
m = movies['month_added'].value_counts().rename_axis('month_added').reset_index(name='count')
m['percent'] = 100 * m['count'] / m['count'].sum()  # total computed once, not per row
m = m.sort_values('month_added')

s = shows['month_added'].value_counts().rename_axis('month_added').reset_index(name='count')
s['percent'] = 100 * s['count'] / s['count'].sum()
s = s.sort_values('month_added')

m.head()

In [None]:
trace1 = go.Bar(x=m['month_added'], y=m['count'], name="Movies", marker=dict(color="#a678de"))
trace2 = go.Bar(x=s['month_added'], y=s['count'], name="TV Shows", marker=dict(color="#6ad49b"))
data = [trace1, trace2]
layout = go.Layout(title="Content Added over the Months", legend=dict(x=0.1, y=1.1, orientation="h"), template="plotly_dark")
fig = go.Figure(data, layout=layout)
fig.show()

In [None]:
# Number of shows per season count, drawn as a bar chart.
# rename_axis + reset_index(name=...) names the columns explicitly instead of relying on
# the column names reset_index() happens to produce, which differ between pandas versions.
s = shows['season_count'].value_counts().rename_axis('season_count').reset_index(name='count')
s['percent'] = 100 * s['count'] / s['count'].sum()
# Order rows by the numeric value of season_count. NOTE(review): a later cell calls
# int() on this column, so it may hold strings, where a plain sort would put "10"
# before "2". The column itself is left unchanged.
season_order = pd.to_numeric(s['season_count'], errors='coerce').sort_values().index
s = s.loc[season_order]

trace1 = go.Bar(x=s['season_count'], y=s['count'], name="TV Shows", marker=dict(color="#a678de"))
data = [trace1]
layout = go.Layout(title="TV Show Seasons", legend=dict(x=0.1, y=1.1, orientation="h"), 
                   template="plotly_dark", 
                   xaxis=dict(title="Number of Seasons"), 
                   yaxis=dict(title="Number of Shows"))
fig = go.Figure(data, layout=layout)
fig.show()

In [None]:
trace = go.Histogram(x = netflix['season_count'], marker = dict(color='rgb(249, 6, 6)'))
layout = go.Layout(template='plotly_dark', title='Seasons of TV Shows', 
                   xaxis=dict(title="Number of Seasons"),
                   yaxis=dict(title="Number of Shows")
                  )
fig = go.Figure(data=[trace], layout=layout)
fig.show()

In [None]:
s.head()

In [None]:
col = ['title', 'season_count']
top = shows[col]
top = top.sort_values(by='season_count', ascending=False)
top

In [None]:
# Convert season_count to integers in one vectorised step. assign() returns a new
# frame, which avoids chained `top[col].iloc[i] = ...` writes that may land on a copy.
top = top.assign(season_count=top['season_count'].astype(int))
# Re-sort now that the values are numeric; a sort done while they were strings
# would have ordered them lexicographically.
top = top.sort_values(by='season_count', ascending=False)

In [None]:
top25 = top[:25]
top25

In [None]:
top25.plot(kind='bar',x='title',y='season_count', figsize=(12, 7))

In [None]:
fig = go.Figure(data=[go.Table(header=dict(values=['title', 'season_count']),
                              cells = dict(values=[top25['title'], top25['season_count']], fill_color = 'lavender'))])
fig.show()

In [None]:
bottom25 = top[::-1][:25]
bottom25

In [None]:
fig = go.Figure(data=[go.Table(header=dict(values=['title', 'season_count']),
                              cells = dict(values=[bottom25['title'], bottom25['season_count']], fill_color = 'lavender'))])
fig.show()

In [None]:
# Titles per rating for movies (m) and shows (s), with each rating's share in percent.
# rename_axis + reset_index(name=...) names the columns explicitly instead of relying on
# the column names reset_index() happens to produce, which differ between pandas versions.
m = movies['rating'].value_counts().rename_axis('rating').reset_index(name='count')
m['percent'] = 100 * m['count'] / m['count'].sum()  # total computed once, not per row
m = m.sort_values('rating')

s = shows['rating'].value_counts().rename_axis('rating').reset_index(name='count')
s['percent'] = 100 * s['count'] / s['count'].sum()
s = s.sort_values('rating')

m.head()

In [None]:
trace1 = go.Bar(x=m['rating'], y=m['count'], name="Movies", marker=dict(color="#a678de"))
trace2 = go.Bar(x=s['rating'], y=s['count'], name="TV Shows", marker=dict(color="#6ad49b"))
data = [trace1, trace2]
layout = go.Layout(title="Content Added over the Years according to rating", legend=dict(x=0.1, y=1.1, orientation="h"), template="plotly_dark")
fig = go.Figure(data, layout=layout)
fig.show()

In [None]:
# Treemap of directors nested under country, using only rows with no missing values.
n = netflix.dropna()
# No plt.figure() here: plotly does not draw on matplotlib figures, so the call
# only produced an empty 100x50-inch canvas.
fig = px.treemap(n, path=['country', 'director'],
                 color='director',
                 hover_data=['director', 'title'],
                 color_continuous_scale='Purple')
fig.show()

In [None]:
# Treemap of directors for rows whose country is exactly 'India' (complete rows only).
indian = netflix[netflix['country']=='India']
indian = indian.dropna()
# No plt.figure() here: plotly does not draw on matplotlib figures, so the call
# only produced an empty 100x50-inch canvas.
fig = px.treemap(indian, path=['country', 'director'],
                 color='director',
                 hover_data=['director', 'title'],
                 color_continuous_scale='Purple')
fig.show()

In [None]:
# Treemap of directors for rows whose country is exactly 'Spain' (complete rows only).
# Stored under its own name so the India subset in `indian` is not overwritten.
spanish = netflix[netflix['country']=='Spain']
spanish = spanish.dropna()
# No plt.figure() here: plotly does not draw on matplotlib figures, so the call
# only produced an empty 100x50-inch canvas.
fig = px.treemap(spanish, path=['country', 'director'],
                 color='director',
                 hover_data=['director', 'title'],
                 color_continuous_scale='Purple')
fig.show()

In [None]:
# Number of movies / TV shows added in each year from 2000 to 2020 inclusive.
years = range(2000, 2021)
# Type masks are built once instead of once per year; the comprehensions also
# avoid rebinding the notebook-level names `g` and `h` to throwaway frames.
is_movie = netflix['type'] == 'Movie'
is_show = netflix['type'] == 'TV Show'
movies_per_year = [int((is_movie & (netflix.year_added == year)).sum()) for year in years]
shows_per_year = [int((is_show & (netflix.year_added == year)).sum()) for year in years]

movies_per_year[:20]

In [None]:
shows_per_year[:20]

In [None]:
trace1 = go.Scatter(x=[i for i in range(2000, 2021)], y=movies_per_year, mode='lines+markers', name='Movies')
trace2 = go.Scatter(x=[i for i in range(2000, 2021)], y=shows_per_year, mode='lines+markers', name='Shows')

data = [trace1, trace2]
layout = go.Layout(title="Content added over the years", legend=dict(x=0.1, y=1.1, orientation='h'), template="plotly_dark")
fig = go.Figure(data, layout=layout)
fig.show()

In [None]:
# Movies / TV shows added per year (2000-2020) for rows whose country is exactly 'India'.
years = list(range(2000, 2021))
# Masks are built once instead of once per year; the comprehensions also avoid
# rebinding the notebook-level names `g` and `h` to throwaway frames.
is_indian = netflix.country == 'India'
is_movie = netflix['type'] == 'Movie'
is_show = netflix['type'] == 'TV Show'
indian_movies_per_year = [int((is_movie & is_indian & (netflix.year_added == year)).sum()) for year in years]
indian_shows_per_year = [int((is_show & is_indian & (netflix.year_added == year)).sum()) for year in years]

trace1 = go.Scatter(x=years, y=indian_movies_per_year, mode='lines+markers', name='Movies')
trace2 = go.Scatter(x=years, y=indian_shows_per_year, mode='lines+markers', name='Shows')

data = [trace1, trace2]
layout = go.Layout(title="Indian Content added over the years", legend=dict(x=0.1, y=1.1, orientation='h'), template="plotly_dark")
fig = go.Figure(data, layout=layout)
fig.show()

In [None]:
# Movies / TV shows added per year (2000-2020) for rows whose country is exactly 'United States'.
years = list(range(2000, 2021))
# Masks are built once instead of once per year; the comprehensions also avoid
# rebinding the notebook-level names `g` and `h` to throwaway frames.
is_american = netflix.country == 'United States'
is_movie = netflix['type'] == 'Movie'
is_show = netflix['type'] == 'TV Show'
american_movies_per_year = [int((is_movie & is_american & (netflix.year_added == year)).sum()) for year in years]
american_shows_per_year = [int((is_show & is_american & (netflix.year_added == year)).sum()) for year in years]

trace1 = go.Scatter(x=years, y=american_movies_per_year, mode='lines+markers', name='Movies')
trace2 = go.Scatter(x=years, y=american_shows_per_year, mode='lines+markers', name='Shows')

data = [trace1, trace2]
# Title typo fixed ("Anerican" -> "American").
layout = go.Layout(title="American Content added over the years", legend=dict(x=0.1, y=1.1, orientation='h'), template="plotly_dark")
fig = go.Figure(data, layout=layout)
fig.show()

In [None]:
def content_in_countries(country):
    """Plot how many movies and TV shows were added each year for one country.

    Reads the notebook-level `netflix` frame, counts rows per year from 2000 to
    2020 whose `country` equals `country`, and shows a two-line plotly figure.
    Returns nothing.
    """
    years = [i for i in range(2000, 2021)]
    movies_per_year = []
    shows_per_year = []

    for year in years:
        in_year_and_country = (netflix.year_added == year) & (netflix.country==country)
        movie_rows = netflix.loc[(netflix['type']=='Movie') & in_year_and_country]
        show_rows = netflix.loc[(netflix['type']=='TV Show') & in_year_and_country]
        movies_per_year.append(len(movie_rows))
        shows_per_year.append(len(show_rows))

    traces = [
        go.Scatter(x=years, y=movies_per_year, mode='lines+markers', name='Movies'),
        go.Scatter(x=years, y=shows_per_year, mode='lines+markers', name='Shows'),
    ]
    layout = go.Layout(title=f"Content added over the years in {country}", legend=dict(x=0.1, y=1.1, orientation='h'), template="plotly_dark")
    go.Figure(traces, layout=layout).show()
    
Countries = ['United States', 'India', 'Australia', 'Mexico', 'Spain', 'United Kingdom', 'Italy', 'Germany', 'France', 'Canada']
for i in Countries:
    content_in_countries(i)

In [None]:
indian_movies = movies[movies['country']=='India']
indian_directors = ", ".join(indian_movies['director'].fillna('')).split(", ")
top_indian_directors = Counter(indian_directors).most_common(11)
top_indian_directors

In [None]:
# Drop the blank name produced by missing directors, then cap at 10 entries:
# if no blank was among the 11 most common, 11 names would otherwise remain.
top_indian_directors = [entry for entry in top_indian_directors if entry[0] != ""][:10]
top_indian_directors

In [None]:
indian_directors = [_[0] for _ in top_indian_directors][::-1]
movie_counts = [_[1] for _ in top_indian_directors][::-1]
indian_directors

In [None]:
movie_counts

In [None]:
trace = go.Bar(x = indian_directors, y = movie_counts, marker = dict(color='rgb(255, 165, 0)', 
                                                     line = dict(color ='rgb(0,0,0)',width=1.5)))
layout = go.Layout(template = 'plotly_dark', title="Top 10 Movie Directors from India with Maximum content",
                  xaxis = dict(title='Directors'), yaxis=dict(title='Count'))
fig = go.Figure(data=[trace], layout=layout)
fig.show()

In [None]:
indian_actors = ", ".join(indian_movies['cast'].fillna('')).split(", ")
top_indian_actors = Counter(indian_actors).most_common(11)
top_indian_actors

In [None]:
# Bar chart of the most frequent cast members in Indian movies.
# Missing casts were filled with '' before counting, so drop that blank name
# and keep at most 10 entries to match the chart title.
top_indian_actors = [entry for entry in top_indian_actors if entry[0] != ""][:10]
# Reversed so the bars run from least to most frequent.
indian_actors = [name for name, _count in top_indian_actors][::-1]
movie_counts = [count for _name, count in top_indian_actors][::-1]
trace = go.Bar(x = indian_actors, y = movie_counts, marker = dict(color='rgb(255, 165, 0)', 
                                                     line = dict(color ='rgb(0,0,0)',width=1.5)))
layout = go.Layout(template = 'plotly_dark', title="Top 10 Movie Actors from India with Maximum content",
                  xaxis = dict(title='Actors'), yaxis=dict(title='Count'))
fig = go.Figure(data=[trace], layout=layout)
fig.show()

In [None]:
american_movies = movies[movies['country']=='United States']
american_directors = ", ".join(american_movies['director'].fillna('')).split(", ")
top_american_directors = Counter(american_directors).most_common(11)
top_american_directors

In [None]:
# Drop the blank name produced by missing directors, then cap at 10 entries:
# if no blank was among the 11 most common, 11 names would otherwise remain.
top_american_directors = [entry for entry in top_american_directors if entry[0] != ""][:10]
top_american_directors

In [None]:
american_directors = [_[0] for _ in top_american_directors][::-1]
movie_counts = [_[1] for _ in top_american_directors][::-1]
trace = go.Bar(x = american_directors, y = movie_counts, marker = dict(color='rgb(255, 0, 0)', 
                                                     line = dict(color ='rgb(0,0,0)',width=1.5)))
layout = go.Layout(template = 'plotly_dark', title="Top 10 Movie Directors from America with Maximum content",
                  xaxis = dict(title='Directors'), yaxis=dict(title='Count'))
fig = go.Figure(data=[trace], layout=layout)
fig.show()

In [None]:
american_actors = ", ".join(american_movies['cast'].fillna('')).split(", ")
top_american_actors = Counter(american_actors).most_common(11)
top_american_actors

In [None]:
# Drop the blank name produced by missing casts, then cap at 10 entries:
# if no blank was among the 11 most common, 11 names would otherwise remain.
top_american_actors = [entry for entry in top_american_actors if entry[0] != ""][:10]
top_american_actors

In [None]:
american_actors = [_[0] for _ in top_american_actors][::-1]
movie_counts = [_[1] for _ in top_american_actors][::-1]
trace = go.Bar(x = american_actors, y = movie_counts, marker = dict(color='rgb(255, 0, 0)', 
                                                     line = dict(color ='rgb(0,0,0)',width=1.5)))
layout = go.Layout(template = 'plotly_dark', title="Top 10 Movie Actors from America with Maximum content",
                  xaxis = dict(title='Actors'), yaxis=dict(title='Count'))
fig = go.Figure(data=[trace], layout=layout)
fig.show()

In [None]:
!pip install pywaffle

In [None]:
from pywaffle import Waffle

# Waffle chart of the 20 most frequent `country` values.
# The counts are computed once and drive values, labels, palette size and legend
# layout, so nothing depends on an unrelated notebook variable.
top_countries = netflix['country'].value_counts()[:20]
fig = plt.figure(
    FigureClass = Waffle,
    rows = 13,
    columns = 21,
    values = top_countries,
    labels = ["{}({})".format(a, b) for a,b in zip(top_countries.index, top_countries)],
    colors = sns.color_palette("RdGy", n_colors=len(top_countries)),
    # max(1, ...) keeps the legend valid even with fewer than four countries.
    legend = {'loc':'lower left', 'bbox_to_anchor': (0, -0.3), 'ncol': max(1, len(top_countries)//4), 'framealpha': 0},
    font_size = 15,
    figsize=(12, 12),
    icon_legend = True
)

plt.title('Top 20 Content Producing Countries', position = (0.5, 1.0+0.03), fontsize=40, fontweight='bold')
plt.show()

In [None]:
# Waffle chart of the 20 most frequent `country` values, colour-blind-friendly palette.
# The counts are computed once and drive values, labels, palette size and legend
# layout, so nothing depends on an unrelated notebook variable.
top_countries = netflix['country'].value_counts()[:20]
fig = plt.figure(
    FigureClass = Waffle,
    rows = 13,
    columns = 21,
    values = top_countries,
    labels = ["{}({})".format(a, b) for a,b in zip(top_countries.index, top_countries)],
    colors = sns.color_palette("colorblind", n_colors=len(top_countries)),
    # max(1, ...) keeps the legend valid even with fewer than four countries.
    legend = {'loc':'lower left', 'bbox_to_anchor': (0, -0.3), 'ncol': max(1, len(top_countries)//4), 'framealpha': 0},
    font_size = 15,
    figsize=(12, 12),
    icon_legend = True
)

plt.title('Top 20 Content Producing Countries', position = (0.5, 1.0+0.03), fontsize=40, fontweight='bold')
plt.show()

# **RECOMMENDATIONS**

In [None]:
!pip install rake-nltk
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.cluster import MiniBatchKMeans
import time
from rake_nltk import Rake
import networkx as nx

In [None]:
df.head(20)

**Recommendations using CountVectorizer and Cosine Similarity and Rake()**

In [None]:
# Working frame for the content-based recommender.
# .copy() detaches it from `df`, so the in-place edits in later cells neither
# touch the raw data nor trigger SettingWithCopyWarning.
new_df = df[['title', 'director', 'cast', 'listed_in', 'description']].copy()
new_df.head(10)

In [None]:
# Remove rows with missing values, then rows where any text column is whitespace-only.
col = ['title', 'director', 'cast', 'listed_in', 'description']
new_df = new_df.dropna()
# Check each text column cell by cell. Iterating rows and testing
# `type(row) == str` can never match, because each row is a Series.
is_blank = new_df[col].apply(lambda column: column.astype(str).str.isspace()).any(axis=1)
blanks = list(new_df.index[is_blank])  # index labels of the rows being removed
new_df = new_df.drop(blanks)
new_df.head(10)

In [None]:
# Extract RAKE keywords from each description into a new `key_words` column.
# Results are collected in a list and assigned as a whole column: rows yielded by
# iterrows() may be copies, so writing to `row[...]` is not guaranteed to reach new_df.
extracted_key_words = []
for description in new_df['description']:
    # Rake => uses english stopwords from NLTK, discards all punctuation
    r = Rake()
    r.extract_keywords_from_text(description)

    # Dictionary of key-word -> degree score; only the words are kept
    key_words_dict_scores = r.get_word_degrees()
    extracted_key_words.append(list(key_words_dict_scores.keys()))

# A Series aligned on new_df's index stores one list per row.
new_df['key_words'] = pd.Series(extracted_key_words, index=new_df.index, dtype=object)

new_df['key_words'].head()

In [None]:
key_words_dict_scores

In [None]:
new_df.drop(columns=['description'], inplace=True)
new_df.head(10)

In [None]:
new_df['cast'] = new_df['cast'].map(lambda x: x.split(',')[:3])
new_df['listed_in'] = new_df['listed_in'].map(lambda x: x.lower().split(','))
new_df['director'] = new_df['director'].map(lambda x: x.split(' '))

new_df.head(10)

In [None]:
new_df.iterrows()

In [None]:
# Normalise people names: lower-case with spaces removed, so each name is one token.
# Column-wise .map() is used because rows yielded by iterrows() may be copies,
# so writing to `row[...]` is not guaranteed to reach new_df.
new_df['cast'] = new_df['cast'].map(lambda names: [x.lower().replace(' ','') for x in names])
new_df['director'] = new_df['director'].map(lambda parts: ''.join(parts).lower())

new_df.head(10)

In [None]:
def _as_text(value):
    """Flatten one cell to text: lists/tuples are space-joined, anything else goes through str()."""
    if isinstance(value, (list, tuple)):
        # Join with a space so every list element stays a separate token;
        # an empty separator would fuse them into a single word.
        return ' '.join(str(item) for item in value)
    return str(value)

# Build one whitespace-separated string per row from every feature column.
columns = new_df.columns
feature_columns = [c for c in columns if c != 'bag_of_words']  # makes the cell safe to re-run
# Assigned as a whole column: rows yielded by iterrows() may be copies, so writing
# to `row[...]` is not guaranteed to reach new_df.
new_df['bag_of_words'] = [
    ' '.join(_as_text(value) for value in row) + ' '
    for row in new_df[feature_columns].itertuples(index=False, name=None)
]

new_df.head()

In [None]:
new_df.set_index('title', inplace=True)
new_df.head()

In [None]:
count = CountVectorizer()
count_matrix = count.fit_transform(new_df['bag_of_words'])
count_matrix

In [None]:
indices = pd.Series(new_df.index)
indices

In [None]:
cosine = cosine_similarity(count_matrix, count_matrix)
cosine

In [None]:
def recommendations(Title, cosine_sim = cosine):
    """Return the 10 titles whose bag-of-words vectors are most similar to `Title`.

    Reads the notebook-level `indices` (positional Series of titles) and
    `new_df` (indexed by title) at call time.

    Parameters:
        Title: title to look up in `indices`.
        cosine_sim: square similarity matrix; row i corresponds to position i in `indices`.

    Returns:
        list of up to 10 titles, most similar first.

    Raises:
        IndexError: if `Title` is not present in `indices`.
    """
    matches = indices[indices == Title].index
    if len(matches) == 0:
        # Same exception type as an empty-index lookup, but with a readable message.
        raise IndexError(f"Title {Title!r} not found in `indices`")
    idx = matches[0]
    score_series = pd.Series(cosine_sim[idx]).sort_values(ascending=False)
    # Position 0 is skipped on the assumption that the title itself ranks first.
    top_10_indices = list(score_series.iloc[1:11].index)

    all_titles = list(new_df.index)  # built once, not once per recommendation
    recommended = [all_titles[i] for i in top_10_indices]
    return recommended

recommendations('3 Idiots')

In [None]:
recommendations('Naruto')

In [None]:
recommendations('Brother')

In [None]:
recommendations('Hellboy')

**Recommendations using TfidfVectorizer and Cosine Similarity**

In [None]:
# Work on a copy: `new_df = df` would only alias the raw frame, so the fillna
# below would silently modify `df` as well.
new_df = df.copy()
# Removing Stopwords
tfidf = TfidfVectorizer(stop_words='english')
new_df['description'] = new_df['description'].fillna('')
tfidf_matrix = tfidf.fit_transform(new_df['description'])
tfidf_matrix.shape

In [None]:
# There are 16151 words described for 6234 movies

tfidf_matrix

In [None]:
# Compute Linear Kernel between tfidf_matrix and itself
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
cosine_sim

In [None]:
indices = pd.Series(new_df.index, index=new_df['title'])
indices

In [None]:
# Keep one entry per title. The Series values are row labels (already unique), so
# drop_duplicates() on the values removes nothing; the duplicates that matter are
# repeated titles in the index, which make `indices[title]` return a Series.
indices = indices[~indices.index.duplicated(keep='first')]
indices

In [None]:
def get_recommendations(title, cosine_sim=cosine_sim):
    """Return the 10 titles whose descriptions are most similar to `title`.

    Reads the notebook-level `indices` (title -> row position) and `new_df`.
    NOTE: a later cell in this notebook defines another function with this same
    name, which replaces this one once that cell has run.

    Parameters:
        title: title to look up in `indices`.
        cosine_sim: square similarity matrix aligned with the rows of `new_df`.

    Returns:
        Series of up to 10 titles taken from `new_df['title']`, most similar first.

    Raises:
        KeyError: if `title` is not in `indices`.
    """
    idx = indices[title]
    if isinstance(idx, pd.Series):
        # Duplicate titles make the lookup return several positions; use the first.
        idx = idx.iloc[0]
    # Exclude the query row explicitly rather than assuming it sorts first:
    # another row with an identical description ties with it at the top score.
    similarity_scores = [(i, score) for i, score in enumerate(cosine_sim[idx]) if i != idx]
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
    similarity_scores = similarity_scores[:10]
    movie_indices = [i[0] for i in similarity_scores]
    return new_df['title'].iloc[movie_indices]

get_recommendations('3 Idiots')

# **Adamic Adar measure**

It is a measure used to compute the closeness of nodes based on their shared neighbors.

* x and y are 2 nodes (2 Movies)
* N(one_node) is a function that returns the set of nodes adjacent to one_node

$$\mathrm{adamicAdar}(x, y) = \sum_{u \in N(x) \cap N(y)} \frac{1}{\log(|N(u)|)}$$
 
In other words: for each node u that is adjacent to both x and y, add 1/log(N(u)) to the measure.

The quantity 1/log(N(u)) determines the importance of u in the measure.

* If x and y share a node u that has many adjacent nodes, that node is not very relevant: N(u) is high → 1/log(N(u)) is low.
* If x and y share a node u that has few adjacent nodes, that node is highly relevant: N(u) is low → 1/log(N(u)) is higher.

In [None]:
netflix.head()

In [None]:
def _split_names(cell):
    """Split a comma-separated cell into a list of stripped names; missing values give []."""
    if pd.isna(cell):
        return []
    return [i.strip() for i in cell.split(",")]

# Day of month the title was added, followed by one list-valued column per
# comma-separated text column (same column order as before).
netflix['day'] = netflix['date_added'].dt.day
for list_column, text_column in [
    ('directors', 'director'),
    ('categories', 'listed_in'),
    ('actors', 'cast'),
    ('countries', 'country'),
]:
    netflix[list_column] = netflix[text_column].apply(_split_names)
netflix.head()

In [None]:
netflix.shape

# **K Means Clustering with TF-IDF**

In [None]:
start_time = time.time()
text_content = netflix['description']
vector = TfidfVectorizer(max_df = 0.4,           # drop words that occur more than max_df %
                         stop_words = 'english', # remove stop words
                         lowercase = True,       # everything to lowercase
                         use_idf = True,
                         norm = u'l2',
                         smooth_idf = True       # prevent divide by zero errors
                         )

tfidf = vector.fit_transform(text_content)

In [None]:
# Cluster the TF-IDF description vectors into k groups and label every title.
k = 200
# A fixed random_state makes the cluster assignment reproducible between runs.
kmeans = MiniBatchKMeans(n_clusters = k, random_state = 42)
kmeans.fit(tfidf)
# For each cluster, feature indices ordered from largest to smallest centre weight.
centres = kmeans.cluster_centers_.argsort()[:, ::-1]
# get_feature_names() was removed from recent scikit-learn releases in favour of
# get_feature_names_out(); fall back to the old name on versions that lack the new one.
if hasattr(vector, 'get_feature_names_out'):
    terms = list(vector.get_feature_names_out())
else:
    terms = vector.get_feature_names()

request_transform = vector.transform(netflix['description'])
netflix['cluster'] = kmeans.predict(request_transform)
netflix['cluster'].value_counts().head()

In [None]:
def find_similar(tfidf_matrix, index, top_n = 5):
    """Return the row positions of the `top_n` rows most similar to row `index`.

    Similarity is the linear kernel between row `index` and every row of
    `tfidf_matrix`; the query row itself is left out of the result.
    """
    query_row = tfidf_matrix[index:index+1]
    scores = linear_kernel(query_row, tfidf_matrix).flatten()
    ranked_positions = scores.argsort()[::-1]  # highest score first
    neighbours = []
    for position in ranked_positions:
        if position != index:
            neighbours.append(position)
    return neighbours[0:top_n]

In [None]:
# Build an undirected graph linking every title to its actors, categories,
# directors, countries and to a per-title "Sim(...)" node that is in turn
# linked to the most similar titles returned by find_similar().
G = nx.Graph(label="MOVIE")
start_time = time.time()
for i, row in netflix.iterrows():
    # Progress message whenever the index label is a multiple of 1000.
    if(i%1000==0):
        print(" iter {} -- {} seconds --".format(i, time.time()-start_time))
    # Title node, keyed by the title string, carrying show id, type and rating.
    G.add_node(row['title'], key=row['show_id'], label='MOVIE', mtype=row['type'], rating=row['rating'])
    for element in row['actors']:
        G.add_node(element, label='PERSON')
        G.add_edge(row['title'], element, label='ACTED_IN')
    for element in row['categories']:
        G.add_node(element, label='CAT')
        G.add_edge(row['title'], element, label='CAT_IN')
    for element in row['directors']:
        G.add_node(element, label='PERSON')
        G.add_edge(row['title'], element, label='DIRECTED')
    for element in row['countries']:
        G.add_node(element, label='COU')
        G.add_edge(row['title'], element, label='COU_IN')
        
    # NOTE(review): the index label `i` is used as a row position in `tfidf`, and the
    # returned positions are used as labels in `.loc` below; this assumes `netflix`
    # has a default 0..n-1 index. TODO confirm.
    # NOTE(review): this rebinds the notebook-level name `indices`, which the earlier
    # recommendations() / TF-IDF get_recommendations() functions read at call time.
    indices = find_similar(tfidf, i, top_n=5)
    # The similarity node name uses only the first 15 characters of the title, so
    # titles sharing that prefix map to the same "Sim(...)" node.
    snode = "Sim("+row['title'][:15].strip()+")"
    G.add_node(snode, label='SIMILAR')
    G.add_edge(row['title'], snode, label='SIMILARITY')
    for element in indices:
        G.add_edge(snode, netflix['title'].loc[element], label='SIMILARITY')
print(" finish -- {} seconds --".format(time.time() - start_time))

In [None]:
indices

In [None]:
def get_all_adj_nodes(list_in):
    """Return the given nodes plus all of their direct neighbours in `G`.

    The result is a list without duplicates; its order is not defined.
    """
    collected = set()
    for node in list_in:
        collected.add(node)
        collected.update(G.neighbors(node))
    return list(collected)

In [None]:
def draw_sub_graph(sub_graph):
    """Draw the subgraph of `G` induced by the nodes in `sub_graph`.

    Nodes are coloured by their 'label' attribute. A node whose label is missing
    or not in the mapping is drawn gray, so the colour list always has exactly
    one entry per node (a shorter list is rejected by the drawing call).
    """
    label_colors = {
        'MOVIE': 'blue',
        'PERSON': 'red',
        'CAT': 'green',
        'COU': 'yellow',
        'SIMILAR': 'orange',
        'CLUSTER': 'orange',
    }
    subgraph = G.subgraph(sub_graph)
    colors = [
        label_colors.get(G.nodes[e].get('label'), 'gray')
        for e in subgraph.nodes()
    ]
    nx.draw(subgraph, with_labels=True, font_weight='bold', node_color=colors)
    plt.show()

In [None]:
list_in = ['3 Idiots', 'PK']
sub_graph = get_all_adj_nodes(list_in)
draw_sub_graph(sub_graph)

In [None]:
def get_recommendations(root):
    """Rank other 'MOVIE' nodes of `G` by an Adamic-Adar style score against `root`.

    For every neighbour shared between `root` and another node labelled 'MOVIE',
    1 / log(degree of that neighbour) is added to the other node's score.

    Returns a Series of scores indexed by node name, highest first.
    """
    # movie -> neighbours it shares with root, in discovery order
    shared_neighbours = {}
    for neighbour in G.neighbors(root):
        for candidate in G.neighbors(neighbour):
            if candidate == root:
                continue
            if G.nodes[candidate]['label'] == 'MOVIE':
                shared_neighbours.setdefault(candidate, []).append(neighbour)

    movies = []
    weight = []
    for movie, via_nodes in shared_neighbours.items():
        score = 0.0
        for via in via_nodes:
            score += 1 / math.log(G.degree(via))
        movies.append(movie)
        weight.append(score)

    result = pd.Series(data=np.array(weight), index=movies)
    return result.sort_values(ascending=False)

In [None]:
result = get_recommendations('3 Idiots')
result

In [None]:
reco = list(result.index[:4].values)
reco.extend(['3 Idiots'])
sub_graph = get_all_adj_nodes(reco)
draw_sub_graph(sub_graph)