# Importing Libraries

In [None]:
%matplotlib inline

# Data Processing and Visualization Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Data Overview

In [None]:
# Read the Netflix titles CSV into a DataFrame
netflix_csv_path = '../input/netflix-shows/netflix_titles.csv'
df = pd.read_csv(netflix_csv_path)

In [None]:
# Preview the first five rows of the dataset
df.head(5)

In [None]:
# Print the (rows, columns) shape first, then the column labels
for dataset_summary in (df.shape, df.columns):
    print(dataset_summary)

There are 12 columns

In [None]:
# Data type pandas assigned to each column
column_dtypes = df.dtypes
column_dtypes

In [None]:
# Number of missing values in each column
df.isna().sum()

# Handling missing data

'director', 'cast', and 'country' contain a significant portion of null values, so we can't simply delete those rows. Instead, we replace the missing values with 'No Data'.

'date_added' and 'rating' contain only a small number of null values, so we can delete those rows.

In [None]:
# Columns with many missing values: keep the rows and mark the gap explicitly.
# The filled columns are assigned back to `df` instead of calling
# `df[col].replace(..., inplace=True)`: mutating a column selection in place is
# chained assignment, which warns in pandas 2.x and does not modify `df` at all
# once Copy-on-Write is enabled.
placeholder_cols = ['country', 'cast', 'director']
df[placeholder_cols] = df[placeholder_cols].fillna('No Data')

# Columns with only a few missing values: drop the affected rows.
df = df.dropna(subset=["date_added", "rating"])

In [None]:
# Re-check the number of missing values per column after the cleaning step
df.isna().sum()

# EDA- Exploratory Data Analysis

# More Movies or TV shows?

In [None]:
# Bar chart of the number of titles per content type, followed by the raw counts
type_counts = df['type'].value_counts()
sns.countplot(data=df, x='type', palette="YlGnBu")
print(type_counts)

On Netflix there are more movies than TV shows.

# In which month and year was the most and the least content added?

In [None]:
# Parse `date_added` into datetimes and derive the year each title was added.
added_raw = df['date_added']
if not pd.api.types.is_datetime64_any_dtype(added_raw):
    # NOTE(review): some exports of this dataset reportedly contain values
    # padded with whitespace (e.g. " August 4, 2017"); pandas >= 2 infers a
    # single strict format and raises on those. Stripping is a no-op for
    # clean values. The dtype guard keeps this cell safe to re-run.
    added_raw = added_raw.str.strip()
df['date_added'] = pd.to_datetime(added_raw)
df['year_added'] = df['date_added'].dt.year

# Number of titles added per year
df['year_added'].value_counts()

The most content was added in 2019, while only one title was added in 2010.

In [None]:
# Month number in which each title was added, then the number of titles per month
added_month = df['date_added'].dt.month
df['month_added'] = added_month
df['month_added'].value_counts()

The most content was added in December, while the least was added in February.

In [None]:
# Titles added per (year, month): one row per year, one column per month,
# with 0 where nothing was added in that month.
month_vs_year = (
    df.groupby('year_added')['month_added']
    .value_counts()
    .unstack()
    .fillna(0)
)
ax = sns.heatmap(month_vs_year, cmap="YlGnBu")
# heatmap draws the DataFrame's columns along x and its index along y, so
# months belong on the x-axis and years on the y-axis (the labels were swapped).
ax.set_xlabel('Month_added').set_size(20)
ax.set_ylabel('Year_added').set_size(20)

In 2020, January was the month in which the most content was added and February was the month in which the least content was added.

# Lets analyse content rating

In [None]:
# Number of titles carrying each content rating
rating_counts = df['rating'].value_counts()
print(rating_counts)

The different ratings means:

TV-MA: This program is intended to be viewed by adults and therefore may be unsuitable for children under 17.

TV-14: This program contains some material that many parents would find unsuitable for children under 14 years of age.

TV-PG: Parental guidance is recommended; these programs may be unsuitable for younger children.

R: Restricted, Children Under 17 Require Accompanying Parent or Adult Guardian. This rating means the film contains adult material such as adult activity, harsh language, intense graphic violence, drug abuse and nudity. 

PG-13: This rating is a stronger caution for parents that content included may not be appropriate for children under 13 (pre-teen ages).

TV-Y: This program is aimed at a very young audience, including children from ages 2–6.

TV-Y7: This program is designed for children age 7 and above.

PG: Parents urged to give 'parental guidance.' May contain some material parents might not like for their young children.

TV-G: Most parents would find this program suitable for all ages. Programs rated TV-G are generally suitable for all ages.      

NR: Not rated. The film has not been submitted for a rating, or it is an uncut version of a film that was submitted.

G: General Audiences - All Ages Admitted. This is a Motion Picture Association rating for films that the organization believes are suitable for everyone.

TV-Y7-FV: TV shows for Children Seven Years of Age or Older that may contain Fantasy Violence    

UR: Unrated, It means that the film hasn’t been submitted to the MPAA for a rating. There’s no law that says a film has to have a rating. 

NC-17: No One 17 and Under Admitted. These films are too adult for children. The rating doesn’t mean that they are obscene or pornographic, but that the content is only appropriate for adult audiences.

In [None]:
# Titles per rating, with separate bars for each content type
rating_fig = plt.figure(figsize=(10, 4))
sns.countplot(data=df, x='rating', hue='type', palette="YlGnBu")

TV-MA has the highest number of titles, while NC-17 has the lowest.

# Countries with most content

In [None]:
# Number of titles per raw `country` entry (an entry may list several countries)
country_entry_counts = df['country'].value_counts()
print(country_entry_counts)

In [None]:
from collections import Counter

# Count how many titles each individual country appears in.
country = df['country']
filtered_countries = country[country != 'No Data']

# A single entry may list several countries separated by commas. Each name is
# stripped so that "India" and " India" are tallied as the same country, and
# empty tokens (e.g. from a trailing comma) are skipped.
country_count = pd.Series(dict(Counter(
    name.strip()
    for entry in filtered_countries
    for name in entry.split(',')
    if name.strip()
))).sort_values(ascending=False)
top20 = country_count.head(20)

plt.figure(figsize=(15,5))
sns.barplot(x= top20.index, y=top20)
plt.xticks(rotation=60)
plt.title('Top 20 countries with most contents', fontsize=15, fontweight='bold')
plt.xlabel('Country')
plt.ylabel('Number of titles')
plt.show()

Most content was produced in the USA, followed by India, the United Kingdom and Canada.

# Duration of contents 

Total Duration is given in minutes or number of seasons. 

The duration of movies is given in minutes; the duration of TV shows is given in seasons.

In [None]:
# Split the catalogue by content type. `.copy()` makes each subset an
# independent DataFrame, so adding or overwriting columns on it later does not
# trigger SettingWithCopyWarning or depend on whether the slice is a view of `df`.
movies_df = df[df['type'] == "Movie"].copy()
tvshows_df = df[df['type'] == "TV Show"].copy()

In [None]:
# Pull the leading number out of each movie's `duration` text and make it numeric.
# - raw-string regex: '\d' in a normal string is an invalid escape sequence
# - expand=False returns a Series rather than a one-column DataFrame
# - astype(str) first, so re-running the cell on already-numeric data still works
# - pd.to_numeric leaves missing durations as NaN instead of crashing the way
#   int('nan') does
# - assign() builds a new frame instead of writing into a slice of `df`
movie_minutes = movies_df['duration'].astype(str).str.extract(r'(\d+)', expand=False)
movies_df = movies_df.assign(duration=pd.to_numeric(movie_minutes))

# movie duration distribution
plt.figure(figsize=(10,8))
sns.set_style("darkgrid")
# `fill=` is the current name of the deprecated `shade=` argument
sns.kdeplot(data=movies_df["duration"], fill=True)
plt.title('kdeplot of duration of Movies',fontweight="bold")


Most movies on Netflix have a duration of 80-120 minutes.

In [None]:
# Pull the leading number out of each TV show's `duration` text and make it numeric.
# Raw-string regex avoids the invalid '\d' escape; expand=False yields a Series;
# astype(str) keeps the cell re-runnable; pd.to_numeric tolerates missing values;
# assign() avoids writing into a slice of `df`.
season_counts = tvshows_df['duration'].astype(str).str.extract(r'(\d+)', expand=False)
tvshows_df = tvshows_df.assign(duration=pd.to_numeric(season_counts))

# distribution of the number of seasons per TV show
plt.figure(figsize=(10,8))
sns.set_style("darkgrid")
# refer to the column by name, since the frame is already passed via `data`
sns.countplot(x='duration', data=tvshows_df, palette="hls");
plt.title('countplot of TV shows seasons',fontweight="bold")

Most TV shows have only 1 season.

# Top 20 Genres

In [None]:
from collections import Counter

# Count how many titles each individual genre appears in.
listed_in = df['listed_in']

# A title may list several genres separated by commas. Each name is stripped so
# that "Dramas" and " Dramas" are tallied as the same genre, and empty tokens
# are skipped.
genre_count = pd.Series(dict(Counter(
    genre.strip()
    for entry in listed_in
    for genre in entry.split(',')
    if genre.strip()
))).sort_values(ascending=False)
top20genre = genre_count.head(20)

plt.figure(figsize=(15,5))
sns.barplot(x= top20genre.index, y=top20genre)
plt.xticks(rotation=60)
plt.title('Top 20 genre with most contents', fontsize=15, fontweight='bold')
plt.xlabel('genre')
plt.ylabel('Number of titles')
plt.show()

In [None]:
# Ten most frequent `listed_in` entries among TV shows (computed once, used for both axes)
top10_tv_listed_in = tvshows_df["listed_in"].value_counts().head(10)

plt.figure(figsize=(15, 5))
sns.barplot(
    x=top10_tv_listed_in.index,
    y=top10_tv_listed_in.values,
    palette="YlGnBu",
)
plt.xticks(rotation=60)
plt.title("Top10 Genre in TV shows", fontsize=15, fontweight="bold")
plt.show()

In [None]:
# Ten most frequent `listed_in` entries among movies (computed once, used for both axes)
top10_movie_listed_in = movies_df["listed_in"].value_counts().head(10)

plt.figure(figsize=(15, 5))
sns.barplot(
    x=top10_movie_listed_in.index,
    y=top10_movie_listed_in.values,
    palette="YlGnBu",
)
plt.xticks(rotation=60)
plt.title("Top10 Genre in Movies", fontsize=15, fontweight="bold")
plt.show()

In [None]:
# Count how many titles each individual director appears on.
# NOTE: relies on `Counter` having been imported by an earlier cell.
director = df['director']
filtered_director = director[director != 'No Data']

# A title may credit several directors separated by commas. Each name is
# stripped so that "Name" and " Name" are tallied as the same person, and empty
# tokens are skipped.
director_count = pd.Series(dict(Counter(
    name.strip()
    for entry in filtered_director
    for name in entry.split(',')
    if name.strip()
))).sort_values(ascending=False)
top20director = director_count.head(20)


plt.figure(figsize=(15,6))
sns.barplot(x= top20director.index, y=top20director)
plt.xticks(rotation=60)
plt.title('Top 20 director with most contents', fontsize=15, fontweight='bold')
plt.xlabel('director')
plt.ylabel('Number of titles')
plt.show()

# Top 20 Actors

In [None]:
# Count how many titles each individual actor appears in.
# NOTE: relies on `Counter` having been imported by an earlier cell.
actor = df['cast']
filtered_actor = actor[actor != 'No Data']

# A title's cast is a comma-separated list. Each name is stripped so that
# "Name" and " Name" are tallied as the same person, and empty tokens are skipped.
actor_count = pd.Series(dict(Counter(
    name.strip()
    for entry in filtered_actor
    for name in entry.split(',')
    if name.strip()
))).sort_values(ascending=False)
top20actor = actor_count.head(20)


plt.figure(figsize=(15,5))
sns.barplot(x= top20actor.index, y=top20actor)
plt.xticks(rotation=60)
plt.title('Top 20 actor with most contents', fontsize=15, fontweight='bold')
plt.xlabel('actor')
plt.ylabel('Number of titles')
plt.show()