# Importing libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# Loading the Data

In [None]:
# Show the kernel's current working directory; the CSV in the next cell is
# loaded through a relative path, which is resolved against this directory.
import os
os.getcwd()

In [None]:
# Location of the raw titles file, relative to the working directory.
csv_path = "../input/netflix-shows/netflix_titles.csv"

# `net` keeps the untouched raw frame; later cells derive cleaned copies from it.
net = pd.read_csv(csv_path)
net.head()

# Data Exploration

In [None]:
# Print a summary of the raw frame (columns, non-null counts and dtypes).
net.info()

In [None]:
# Number of missing values in each column of the raw frame.
net.isnull().sum()

#### Director, cast, country and date_added columns have missing data

In [None]:
# (rows, columns) of the raw frame, for comparison with the cleaned frame later on.
net.shape

## Drop the director column as it has too many missing data points

In [None]:
# Derive a new frame without the `director` column; `net` itself is left unchanged.
netflix = net.drop(columns=["director"])
netflix.head()

## Drop rows with missing data in the cast, country and date_added columns

In [None]:
# dropna() with no arguments removes every row that has a missing value in
# any of the remaining columns, not only in cast, country and date_added.
netflix = netflix.dropna()
netflix.head()

In [None]:
# Re-count missing values per column to confirm that none remain after cleaning.
netflix.isnull().sum()

In [None]:
# (rows, columns) of the cleaned frame.
netflix.shape

#### Now all the data has been cleaned 

## Analysis and Visualization

### Comparing content on netflix

In [None]:
# Count each content type once and reuse the SAME Series for the wedge sizes
# and for the labels. Previously the sizes were sorted ascending while the
# labels came from an unsorted value_counts() (descending), so the two labels
# were attached to the wrong slices.
type_counts = netflix['type'].value_counts().sort_values()

# Pull out only the first (smallest) slice, whatever the number of types is.
explode = [0.1] + [0] * (len(type_counts) - 1)

plt.figure(figsize=(20,5))
plt.pie(type_counts, labels=type_counts.index,
        explode=explode, autopct='%1.1f%%', colors=['gold','red'])
plt.title('Share of Movies and TV Shows on Netflix')

plt.show()

##### Movies are the majority, making up approximately 72% of the content on Netflix

## Top 20 countries with most content

In [None]:
# Number of titles per distinct value of the `country` column.
country_counts = netflix['country'].value_counts()

# Order from most to least frequent, then keep the 20 leading entries.
count = country_counts.sort_values(ascending=False)
first20 = count.iloc[:20]
first20

In [None]:
# Figure 1: horizontal bars, one per entry of `first20` (count on x, country on y).
bar_fig, bar_ax = plt.subplots(figsize=(15, 8))
sns.barplot(x=first20.values, y=first20.index, ax=bar_ax)
bar_ax.set_title('Top 20 Content Producing countries on Netflix')

# Figure 2: the same counts as a pie chart. The percentages are shares of the
# top-20 total only, because only `first20` is passed to the pie.
pie_fig, pie_ax = plt.subplots(figsize=(20, 9))
pie_ax.pie(first20, labels=first20.index, autopct='%1.1f%%')
pie_ax.set_title('Top 20 Content Producing countries on Netflix')

plt.show()

#### The United States is the top content-producing country on Netflix, with approximately 44% of the titles from the top 20 countries, followed by India, the United Kingdom and Japan

## Count of content added yearly

In [None]:
# Parse the `date_added` column into datetimes, then keep only the year part.
date_index = pd.DatetimeIndex(netflix['date_added'])
added_yr = date_index.year
added_yr

In [None]:
# Number of titles added in each year; value_counts() lists the most frequent year first.
added_yrcount = added_yr.value_counts()
added_yrcount

In [None]:
# Bars are ordered from the year with the most additions to the one with the fewest.
year_order = added_yr.value_counts().index

plt.figure(figsize=(15,8))

# The years must be passed by keyword: seaborn 0.11 deprecates a bare
# positional vector, and from 0.12 on the only positional argument is `data`.
sns.countplot(x=added_yr, order=year_order)

plt.title('Count of content added yearly')
plt.xlabel('Year added')
plt.ylabel('Number of titles')

plt.show()

#### 2019 had the most content added to Netflix, with 1848 titles.
#### However, note that the data for 2021 was incomplete at the time of analysis

## Top 20 Movie Genres


In [None]:
# Boolean mask selecting the rows whose `type` is exactly 'Movie'.
is_movie = netflix['type'] == 'Movie'
movies = netflix[is_movie]
movies.head()

In [None]:
# Compute the `listed_in` frequencies once and reuse them for both axes
# instead of running the same value_counts pipeline twice. Each distinct
# `listed_in` string is counted as a whole.
top_movie_genres = (
    movies['listed_in']
    .value_counts()
    .sort_values(ascending=False)
    .head(20)
)

plt.figure(figsize=(15,8))

sns.barplot(x=top_movie_genres.index, y=top_movie_genres.values, palette='autumn')

plt.xticks(rotation=80)
plt.title('Top 20 Netflix Movie Genre')  # was misspelled "Neflix"
plt.xlabel('Genre (listed_in)')
plt.ylabel('Number of movies')

plt.show()

"Dramas, International Movies" is the number one movie genre combination, followed by Stand-Up Comedy

## Top 20 Netflix TV Show Genres

In [None]:
# Boolean mask selecting the rows whose `type` is exactly 'TV Show'.
is_tv_show = netflix['type'].eq('TV Show')
TV = netflix.loc[is_tv_show]
TV.head(10)

In [None]:
# The 20 most frequent `listed_in` values among TV shows, most common first.
tv_genre_counts = TV['listed_in'].value_counts().sort_values(ascending=False)
top_tv_genres = tv_genre_counts.head(20)

plt.figure(figsize=(15,8))

sns.barplot(
    x=top_tv_genres.index,
    y=top_tv_genres.values,
    palette='cool',
)

plt.xticks(rotation=80)
plt.title('Top 20 Netflix TV Shows Genre')

plt.show()

#### Kids' TV is the number one TV show genre on Netflix

##  Distribution of Movie and TV Show Rating


In [None]:
# Titles per rating, most frequent rating first, for each content type.
movie_ratings = movies['rating'].value_counts().sort_values(ascending=False)
tv_ratings = TV['rating'].value_counts().sort_values(ascending=False)

# Figure 1: movies.
movie_fig, movie_ax = plt.subplots(figsize=(15, 8))
sns.barplot(x=movie_ratings.index, y=movie_ratings.values,
            palette='autumn_r', ax=movie_ax)
movie_ax.set_title('Distribution of Netflix Movies Rating')

# Figure 2: TV shows.
tv_fig, tv_ax = plt.subplots(figsize=(15, 8))
sns.barplot(x=tv_ratings.index, y=tv_ratings.values,
            palette='cool_r', ax=tv_ax)
tv_ax.set_title("Distribution of Netflix TV Show Rating")

plt.show()

#### TV-MA is the most common rating for both Netflix movies and TV shows, while TV-Y7-FV has the least content

## Distribution of Movies Duration

In [None]:
from scipy.stats import norm

# Extract the first run of digits from each `duration` string (presumably values
# formatted like "90 min" -- confirm against the data).
#  * the raw string keeps "\d" from being parsed as an invalid Python escape;
#  * expand=False returns a Series instead of a one-column DataFrame;
#  * the cast makes the values numeric explicitly instead of relying on the
#    plotting function to coerce strings.
movie_minutes = (
    movies['duration']
    .str.extract(r'(\d+)', expand=False)
    .astype(float)
)

plt.figure(figsize=(15,8))

# Normalised histogram plus a fitted normal curve (fit=norm), no KDE.
# NOTE: sns.distplot is deprecated since seaborn 0.11; it is kept here so the
# cell keeps working with the seaborn version this notebook was written for.
sns.distplot(movie_minutes, fit=norm, kde=False, color='brown')
plt.title('Distribution of Netflix Movies Duration')
plt.xlabel('Time in Minutes')
plt.show()

#### The majority of the movies are approximately 80 to 120 minutes long

## Percentage Distribution of TV Shows Duration

In [None]:
# Percentage of TV shows for each distinct `duration` value, most common first.
duration_pct = (TV['duration'].value_counts() / TV.shape[0]) * 100

plt.figure(figsize=(15,8))

ax = sns.barplot(x=duration_pct.index, y=duration_pct.round(2).values, palette='RdGy')

plt.title('Percentage Distribution of TV Shows Duration')
plt.xticks(rotation=80)

# Write every bar's height slightly above its top edge, centred on the bar.
for bar in ax.patches:
    bar_height = bar.get_height()
    label_x = bar.get_x() + bar.get_width()/2
    label_y = bar.get_y() + bar_height*1.02
    ax.annotate(f'{bar_height}', (label_x, label_y), ha='center')

#### Shows with 16 seasons are among the least common, while almost 80% of the shows have only 1 or 2 seasons

#### 