In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objs as pltgraph
import plotly.express as pl

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **Data Preprocessing**

**The foremost step is to understand the dataset.**

In [None]:
# Load the Netflix titles CSV into `df` and preview its first rows.
df = pd.read_csv("/kaggle/input/netflix-shows/netflix_titles.csv")
df.head()

In [None]:
# Number of rows for each distinct value of the 'type' column.
df['type'].value_counts()

In [None]:
# Summary of the frame: columns, non-null counts and dtypes.
df.info()

In [None]:
# Column labels of the frame.
df.columns


In [None]:
# Number of missing values in each column.
df.isnull().sum()

**Let us find out about the missing data in every column in the dataset.**

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

def missing_vals(data, thresh = 20, color = 'red', edgecolor = 'black', height = 2, width = 14):
    """Plot the percentage of missing values in every column as a bar chart.

    Bars are drawn in descending order of missing percentage. A horizontal
    red line marks ``thresh``, with one text label above it and one below it.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are checked with ``isnull()``.
    thresh : float, default 20
        Percentage at which the reference line is drawn.
    color, edgecolor
        Forwarded to the bar plot.
    height, width : float
        Passed to ``plt.figure`` as ``figsize = (width, height)``.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.
    """
    plt.figure(figsize = (width, height))

    # Share of null entries per column, expressed as a percentage.
    percentage = data.isnull().mean() * 100
    percentage.sort_values(ascending = False).plot.bar(color = color, edgecolor = edgecolor)
    plt.axhline(y = thresh, color = 'r', linestyle = '-')

    plt.title('The percentage of Missing values in every column', fontsize = 20, weight = 'bold')

    # `percentage` has one entry per column, so its length is the number of bars;
    # dividing by 1.7 anchors both labels a little right of the middle bar.
    label_x = len(percentage) / 1.7
    plt.text(label_x, thresh + 12.5, f'Columns with more than {thresh}% missing values',
             fontsize = 12, color = 'purple', ha = 'left', va = 'top')
    plt.text(label_x, thresh - 5, f'Columns with less than {thresh}% missing values',
             fontsize = 12, color = 'orange', ha = 'left', va = 'top')

    plt.xlabel('Columns', size = 15, weight = 'bold')
    plt.ylabel('Missing values percentage')
    plt.yticks(weight = 'bold')

    plt.show()

In [None]:
# Missing-value percentages per column, with bars coloured from a 15-colour 'Reds' palette.
missing_vals(df, thresh = 20, color = sns.color_palette('Reds',15))

Since the data about directors is essential in carrying out data analysis, we cannot drop that field. 

Instead, let us remove every row that has a missing value in any column.

In [None]:
# Keep an independent copy of the frame before cleaning: a plain
# `previous_df = df` would only bind a second name to the same object,
# so the in-place dropna below would remove the rows from both.
previous_df = df.copy()
# Drop every row that has a missing value in any column.
df.dropna(inplace = True)

In [None]:
# Same plot again, now that rows with missing values have been dropped from `df`.
missing_vals(df, thresh = 20, color = sns.color_palette('Reds',15))

Let us check if we have any null values present in our dataset.

In [None]:
# Total number of null cells across the whole frame.
df.isnull().sum().sum()


# **Exploratory Data Analysis**

In [None]:
# Number of rows for each distinct value of the 'rating' column.
df['rating'].value_counts()

In [None]:
# Rating labels in the order returned by value_counts(); used to order the bars.
ratings_content = df.rating.value_counts().index

plt.figure(figsize=(12,10))
sns.set_style("dark")
sns.countplot(data=df, y="rating", order=ratings_content, palette="rocket_r")
plt.title("Number of titles per rating")
plt.show()

In [None]:
import plotly.express as px

# Count rows per (country, type, rating); each remaining column then holds a non-null count.
df_country_year = df.groupby(['country', 'type', 'rating']).count().reset_index()
# Running sum of the 'title' counts within each country, following the grouped row order.
df_country_year['total'] = df_country_year.groupby('country')['title'].cumsum()

# Animated map with one frame per rating; countries are coloured by 'total'.
rating_sorted = df_country_year.sort_values(by='rating')
px.choropleth(
    rating_sorted,
    locations='country',
    locationmode='country names',
    color='total',
    range_color=[0, 1000],
    animation_frame='rating',
    title='Country wise statistics of Ratings',
)

**From the above graphs, it is evident that the largest number of titles are rated *TV-MA*, which suggests that most viewers prefer mature content.**

In [None]:
# Donut chart: share of rows per release year.
year_counts = df['release_year'].value_counts()
labels = year_counts.index
sizes = year_counts.values

fig1, ax1 = plt.subplots(figsize = (8, 8))
ax1.pie(sizes, labels = labels,
        shadow = True, startangle = 90, autopct='%1.1f%%', rotatelabels = True)

# A white disc drawn over the centre turns the pie into a donut.
centre_circle = plt.Circle((0, 0), 0.70, fc = 'white')
ax1.add_artist(centre_circle)

# Equal aspect ratio ensures that pie is drawn as a circle
ax1.axis('equal')
plt.tight_layout()
plt.show()

**From the above pie chart, we can infer that the largest number of releases occurred in 2017.**

In [None]:
# Ten most frequent 'listed_in' values and the number of rows carrying each.
top_listed_in = df['listed_in'].value_counts().head(10)
index = top_listed_in.index
values = top_listed_in.values

plt.figure(figsize = (6, 6))
# x and y are passed as vectors, so no `data=` frame is needed.
listed_in = sns.barplot(x = index, y = values)
listed_in.set_xticklabels(labels = index, rotation = 90)
plt.show()

In [None]:
# First comma-separated entry of 'listed_in' becomes the title's 'genere'.
df['genere'] = df['listed_in'].apply(lambda listing: listing.split(',')[0])

# Number of titles for every (rating, genere) pair.
df_genere = (
    df[['genere', 'rating', 'title']]
    .groupby(by=['rating', 'genere'])
    .count()
    .reset_index()
)

# The five 'genere' values with the highest total title count.
to_10_genere = (
    df_genere[['genere', 'title']]
    .groupby(by='genere')
    .sum()
    .sort_values(by='title', ascending=False)
    .reset_index()['genere']
    .head(5)
)

# Sunburst restricted to those values: inner ring is rating, outer ring is genere.
in_top_genere = df_genere['genere'].isin(to_10_genere)
px.sunburst(
    df_genere[in_top_genere],
    title='Visualization of Ratings and Genre',
    path=['rating', 'genere'],
    values='title',
)

In [None]:
# One entry per (title, director). The director field is comma-separated, and
# splitting on ',' leaves a leading space on every name after the first, so the
# names are stripped; otherwise "Name" and " Name" are counted as two people.
director_stack = (
    df.set_index("title").director
    .str.split(',', expand=True)
    .stack()
    .reset_index(level=1, drop=True)
    .str.strip()
)  # credits: csafrit

plt.figure(figsize=(13,7))
plt.title("Top 10 Directors with Highest Number of Content",size='20')
sns.countplot(y = director_stack, order=director_stack.value_counts().index[:10], palette='Paired')
plt.show()

In [None]:
# Re-read the CSV into `df`. This replaces the frame used so far, so from here on
# the rows removed by the earlier dropna are present again and the derived
# 'genere' column no longer exists.
df = pd.read_csv('/kaggle/input/netflix-shows/netflix_titles.csv')
import math
def roundup(x):
    """Round ``x`` up to the next multiple of 100, then add a further 100."""
    hundreds = math.ceil(x / 100.0)
    return (int(hundreds) + 1) * 100
# Bar chart of the number of rows per 'type', with each bar annotated with its count.
sns.countplot(x='type',data=df)
ax = plt.gca()
y_max = df['type'].value_counts().max()
# Derive the upper limit from the tallest bar rather than a fixed number, so the
# count labels keep their headroom whatever the size of the dataset.
ax.set_ylim([0, roundup(y_max)])
for p in ax.patches:
    # Centre the label horizontally on the bar and sit it on top of the bar.
    ax.text(p.get_x() + p.get_width()/2., p.get_height(), '%d' % int(p.get_height()),
            fontsize=14, color='black', ha='center', va='bottom')
plt.title('Total number of TV Shows & Movies',size='12')
plt.show()

In [None]:
# Interactive scatter of 'release_year', coloured by 'type', with the title shown on hover.
# NOTE(review): no `y` is passed — presumably plotly falls back to the row index; confirm.
px.scatter(df, x='release_year', color='type', hover_data=['title'])

In [None]:
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

# Concatenate every description into one string. Missing descriptions are
# skipped because str.join raises TypeError on non-string items such as NaN.
text = " ".join(df['description'].dropna().astype(str))
print(len(text))

# Build the word cloud from the combined text and show it without axes.
wordcloud = WordCloud(background_color="black").generate(text)
plt.figure(figsize=(20,6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()