# Netflix Data Analysis

In [None]:
import numpy as np 
import pandas as pd
import pandas_profiling
import missingno as msno
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.io as pio
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Recursively walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the Netflix titles CSV into `df` and preview the first rows.
titles_csv = "/kaggle/input/netflix-shows/netflix_titles.csv"
df = pd.read_csv(titles_csv)
df.head()

In [None]:
# Print a summary of the frame: columns, non-null counts and dtypes.
df.info()

In [None]:
# Build a pandas-profiling report for the whole frame; the bare `profile`
# expression on the last line makes the report the cell's output.
profile = pandas_profiling.ProfileReport(df)
profile

In [None]:
pip install sweetviz

In [None]:
# Imported here rather than at the top because sweetviz is installed in the preceding cell.
import sweetviz as sv
# Analyze the frame with sweetviz and write the resulting report to Analyst.html.
netflix_data_report = sv.analyze(df)
netflix_data_report.show_html('Analyst.html')

In [None]:
from IPython.display import IFrame
# Embed the generated HTML report inline in the notebook output.
IFrame(src = 'Analyst.html',width=1000,height=600)

In [None]:
# missingno bar chart for `df`, used to see which columns have missing values.
n = msno.bar(df,color='Orange')

The `director`, `cast`, `country`, `rating`, and `date_added` columns contain null values, so let's deal with those before doing any visualization.

We don't need the `director` and `cast` columns for our visualizations, and since both contain missing values, we can drop them.

In [None]:
# Drop the two columns that are not used in the analysis. Re-binding `df`
# (instead of inplace=True) together with errors="ignore" keeps this cell
# idempotent: re-running it does not raise KeyError once the columns are gone.
df = df.drop(columns=["director", "cast"], errors="ignore")

In [None]:
# Preview the frame again to confirm `director` and `cast` are gone.
df.head()

In [None]:
# Count how many titles carry each distinct `country` value.
df['country'].value_counts()

The `country` column is important for our visualizations. Since it has some null values, we replace them with "United States": the United States has the largest number of shows, and Netflix was also created in the United States.

In [None]:
# Fill missing countries with "United States".
# The result is assigned back to the column: calling an inplace method on the
# intermediate `df['country']` object is chained assignment, which warns in
# pandas 2.x and does not update `df` at all under Copy-on-Write.
df['country'] = df['country'].fillna("United States")

In [None]:
# Inspect the distinct `date_added` values and how often each occurs.
df['date_added'].value_counts()

Since we already have `release_year`, we don't need the year from the `date_added` column. The month is important for visualizing our data, though, so let's extract the month from `date_added` and replace the null values with 0.

In [None]:
# One-column frame holding `date_added`, with missing dates marked as 'Not Added'.
netflix_date = df[['date_added']].fillna('Not Added')
# The month is the first space-separated token after stripping leading whitespace
# (so the 'Not Added' placeholder yields 'Not').
netflix_date["release_month"] = (
    netflix_date['date_added'].str.lstrip().str.split(" ").str[0]
)
netflix_date.head()

In [None]:
# Count titles per extracted month token (the 'Not' placeholder is still present here).
netflix_date["release_month"].value_counts()

In [None]:
# Rows with a missing date were labelled 'Not Added', so their extracted
# "month" is 'Not'; recode that placeholder as 0.
# Assign the result back instead of using inplace=True on the selected column,
# which is chained assignment and does not update the frame under Copy-on-Write.
netflix_date['release_month'] = netflix_date['release_month'].replace('Not', 0)

In [None]:
# Re-check the month counts after recoding the 'Not' placeholder.
netflix_date["release_month"].value_counts()

In [None]:
# Keep only the extracted month column. Re-binding with errors="ignore" makes
# the cell safe to re-run (an inplace drop raises KeyError the second time).
netflix_date = netflix_date.drop(columns="date_added", errors="ignore")
netflix_date.head()

In [None]:
# Attach the extracted month column to the main frame, side by side.
frames_to_join = [df, netflix_date]
netflix = pd.concat(frames_to_join, axis="columns")
netflix.head()

In [None]:
# The raw `date_added` column is no longer needed once the month is extracted.
# Re-binding with errors="ignore" keeps the cell idempotent on re-run.
netflix = netflix.drop(columns="date_added", errors="ignore")
netflix.head()

In [None]:
# Count how many titles carry each rating.
netflix["rating"].value_counts()

In [None]:
# Number of titles with no rating.
netflix["rating"].isna().sum()

Since the `rating` column has only 10 null values, let's replace them with TV-MA, which is the most common rating.

In [None]:
# Fill missing ratings with "TV-MA".
# The result is assigned back to the column because an inplace method on the
# selected column is chained assignment and does not update `netflix` under
# Copy-on-Write.
netflix["rating"] = netflix["rating"].fillna("TV-MA")
# Remaining null count per column, to confirm nothing is left to fill.
netflix.isnull().sum()

We have now successfully removed all the null values, so we can visualize our data.

In [None]:
# Preview the cleaned frame that the visualizations below are built from.
netflix.head()

Let's find out the number of movies and TV shows.

In [None]:
# Bar chart of the number of titles per `type`.
sns.set()
type_axes = sns.countplot(data=netflix, x="type")
plt.show()

So Netflix has around 5,500 movies and almost 2,500 TV shows.

In [None]:
# Bar chart of the 14 most frequent ratings, most frequent first.
plt.figure(figsize=(12, 9))
sns.countplot(x="rating", data=netflix, order=netflix['rating'].value_counts().index[0:14])
# Render explicitly, like the other plot cells, so the Axes repr is not echoed as cell output.
plt.show()

So the most common rating is TV-MA, followed by TV-14.

In [None]:
# The 40 release years with the most titles, most frequent first.
top_release_years = netflix['release_year'].value_counts().index[:40]

sns.set()
plt.figure(figsize=(30, 9))
sns.countplot(data=netflix, x="release_year", order=top_release_years)
plt.xticks(rotation=45)
plt.show()

So the highest total number of movies and TV shows was released in 2018.

Let's see in which month directors most prefer to release their movies and TV shows. (Here `release_month` is the month extracted from the `date_added` column.)

In [None]:
# The 12 most frequent `release_month` values, most frequent first.
top_months = netflix['release_month'].value_counts().index[:12]

sns.set()
plt.figure(figsize=(20, 8))
sns.countplot(data=netflix, x="release_month", order=top_months)
plt.xticks(rotation=45)
plt.show()

Most directors prefer to release their movies and TV shows in December, since December is the month of vacations.

In [None]:
# The 15 most frequent ratings, each bar split by title type.
top_ratings = netflix['rating'].value_counts().index[:15]

sns.set()
plt.figure(figsize=(25, 9))
sns.countplot(data=netflix, x="rating", hue="type", order=top_ratings)
plt.xticks(rotation=45)
plt.show()

For both movies and TV shows, TV-MA is the most common rating, but for TV shows TV-14 is almost as common as TV-MA.

In [None]:
# First five entries of the per-country title counts.
netflix["country"].value_counts().head()

In [None]:
# The 15 most frequent `country` values, each bar split by title type.
top_countries = netflix['country'].value_counts().index[:15]

sns.set()
plt.figure(figsize=(25, 9))
sns.countplot(data=netflix, x="country", hue="type", order=top_countries)
plt.xticks(rotation=45)
plt.show()

So the United States provides the highest number of movies and TV shows; India is in second place, with the next-highest number of movies.

In [None]:
# Title counts for the eight most frequent `country` values.
country_counts = netflix['country'].value_counts()
top = country_counts.iloc[:8]
top.index

In [None]:
# Pie chart of the eight most frequent countries.
# Counts and names are passed directly as array-likes, so no data frame
# argument is needed. `labels` is deliberately not passed: it expects a dict
# mapping column names to display labels, not the category names themselves.
fig = px.pie(values=top, names=top.index)
fig.update_traces(textposition='inside', textinfo='percent+label')
fig.show()

The United States has the highest number of movies and TV shows.

Let's find out which genres of movies and TV shows Netflix provides the most.

In [None]:
# Title counts for the 25 most frequent `listed_in` values.
listed_in_counts = netflix["listed_in"].value_counts()
top_listed_in = listed_in_counts.iloc[:25]
top_listed_in.head()

In [None]:
# The 25 most frequent `listed_in` values, drawn as horizontal bars.
genre_order = netflix["listed_in"].value_counts().index[:25]

sns.set()
plt.figure(figsize=(30, 15))
sns.countplot(data=netflix, y='listed_in', order=genre_order)
plt.xticks(rotation=90)
plt.show()

In [None]:
# Pie chart of the 25 most frequent `listed_in` values.
# Counts and names are passed directly as array-likes, so no data frame
# argument is needed. `labels` is deliberately not passed: it expects a dict
# mapping column names to display labels, not the category names themselves.
fig = px.pie(values=top_listed_in, names=top_listed_in.index)
fig.update_traces(textposition='inside', textinfo='percent+label')
fig.show()

Netflix provides documentary movies and TV shows the most, with stand-up comedy in second place.

Let's see a list of the oldest movies and TV shows on Netflix.

In [None]:
# Order titles from the earliest release year to the latest and show the first 20.
old = netflix.sort_values(by="release_year")
oldest_columns = ["title", "type", "country", "release_year"]
old[oldest_columns].head(20)

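All of the oldest movies and TV shows on Netflix are from the United States. (Keep in mind that missing `country` values were filled with "United States" earlier, which may contribute to this result.)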

A list of kids' TV shows on Netflix:

In [None]:
# Select every title whose `listed_in` text mentions "Kids' TV".
# A substring match is used instead of exact equality so that titles listed
# under several genres are not silently excluded.
# NOTE(review): assumes `listed_in` can hold more than one genre per title — confirm against the data.
is_kids_tv = netflix["listed_in"].str.contains("Kids' TV", regex=False, na=False)
kids_show = netflix[is_kids_tv].reset_index()
kids_show[["title", "country", "release_year"]].head(10)

Let's check whether India has any movies on Netflix.

In [None]:
# Titles whose `country` value is exactly "India".
from_india = netflix["country"].eq("India")
netflix.loc[from_india]

In [None]:
# Two-column table of title counts per country.
# It is built directly from the value counts rather than through `.values`,
# so `TotalShows` keeps its integer dtype instead of becoming object.
Country = (
    netflix["country"]
    .value_counts()
    .rename_axis("country")
    .reset_index(name="TotalShows")
)
Country.head()