In [None]:
## Importing the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so it also hides deprecation and
# chained-assignment warnings that later cells may raise.
warnings.filterwarnings('ignore')

In [None]:
## Importing the dataset
# Location of the Netflix titles CSV, relative to the notebook's working directory.
csv_path = '../input/netflix-shows/netflix_titles.csv'
dataset = pd.read_csv(csv_path)

In [None]:
# Preview the first ten rows of the raw dataset.
dataset.head(10)

In [None]:
# Print the column names, dtypes and non-null counts of the raw dataset.
dataset.info()

In [None]:
# Splitting the comma separated values in the country column
# Strip stray leading/trailing separators before splitting so that a value such as
# ", France" or "Poland," cannot produce an empty or comma-suffixed country name.
# NOTE(review): harmless when no such values exist -- confirm against the raw CSV.
country_lists = dataset['country'].str.strip(', ').str.split(', ')
# explode() emits one row per country and repeats the original index for each of them.
ds1 = dataset.assign(country=country_lists).explode('country')
ds1

In [None]:
# Sanity check on the split: list every exploded row that belongs to one title
matches_title = ds1['show_id'] == 81145628
ds1.loc[matches_title]

In [None]:
## Visualising the dataset with type and year of release
# The style sheet has to be active BEFORE the figure is created: a figure takes its
# face colour from rcParams at creation time, so switching to 'dark_background'
# afterwards would leave this first figure with a light canvas and light text.
plt.style.use('dark_background')
fig, ax = plt.subplots(figsize=(15,8))
# One bar per (type, release_year) combination.
sns.countplot(x='type',data=dataset,hue='release_year',ax=ax)
ax.set_xlabel('Type- Movie or TV show',fontsize=15)
ax.set_ylabel('Count',fontsize=15)
# Three columns keep the long list of release years compact.
ax.legend(loc='best',ncol=3)
plt.show()

### As we can see, TV shows have gained a lot of traction in the last ten years. Up until the late 1990s, there were very few TV shows. On the other hand, Netflix movies date back as early as the 1940s but became more relevant after the 1990s.

In [None]:
## Visualising the dataset with type and selected countries
myCountries = [
    'India', 'France', 'United States', 'United Kingdom', 'Germany',
    'Australia', 'Brazil', 'China', 'Singapore', 'Russia',
    'Canada', 'Spain', 'Denmark', 'South Korea', 'Mexico',
]
# Keep only the exploded rows whose country is one of the selected ones.
df = ds1.loc[ds1['country'].isin(myCountries)]

plt.figure(figsize=(15, 12))
ax = sns.countplot(data=df, x='type', hue='country')
# Write each bar's count slightly above it, rotated so it fits the narrow bars.
for bar in ax.patches:
    bar_height = bar.get_height()
    left, bottom = bar.get_xy()
    label_position = (left + 0.02, bottom + bar_height + 15)
    ax.annotate(f'{bar_height:.0f}', label_position, rotation=90, size=12)
plt.xlabel('Type- Movie or TV show', fontsize=15)
plt.ylabel('Count', fontsize=15)
plt.legend(loc='best', ncol=1, fontsize=12)
plt.show()

### Movies are quite popular among the masses in the US, India, the UK, Spain, France, etc., whereas in countries such as South Korea and Russia, TV shows are more popular than movies.

In [None]:
# Cross-check of the plotted count: list the 'Movie' rows for the United States
is_us = df['country'] == 'United States'
is_movie = df['type'] == 'Movie'
df.loc[is_us & is_movie]

In [None]:
## Visualising the dataset with type for the past 10 years
# Release years 2010 through 2020 inclusive.
yearlist = list(range(2010, 2021))
recentYears = dataset.loc[dataset['release_year'].isin(yearlist)]

plt.figure(figsize=(12, 10))
ax = sns.countplot(data=recentYears, x='type', hue='release_year')
# Label every bar with its count, rotated and nudged above the bar top.
for bar in ax.patches:
    bar_height = bar.get_height()
    left, bottom = bar.get_xy()
    label_position = (left + 0.02, bottom + bar_height + 8)
    ax.annotate(f'{bar_height:.0f}', label_position, rotation=90, size=12)
plt.xlabel('Type- Movie or TV show', fontsize=15)
plt.ylabel('Count', fontsize=15)
plt.legend(loc='best', ncol=1)
plt.show()

### As we can see in the above plot, the number of TV shows surpassed the number of movies for two consecutive years, 2019 and 2020. This may suggest that Netflix TV shows have become quite popular among the audience in recent years. We can also see an increasing trend in the number of movies and TV shows until 2018/2019.

In [None]:
# Splitting the comma separated values in the listed_in column
# Turn each 'listed_in' string into a list of genres, then emit one row per genre.
genre_lists = ds1['listed_in'].str.split(', ')
ds2 = ds1.assign(listed_in=genre_lists).explode('listed_in')
ds2

In [None]:
# Removing rows that are exact duplicates after the explode operation.
# With no `subset`, rows are compared on ALL columns, not on show_id alone, so the
# per-country / per-genre rows of a title are kept and only fully identical rows
# (e.g. a title that presumably lists the same country or genre twice) are dropped.
ds2 = ds2.drop_duplicates()

In [None]:
# Preview the first ten rows of the country- and genre-exploded frame.
ds2.head(10)

In [None]:
# Visualising the dataset with movie content available in selected countries
# One stacked subplot per selected country.
fig, axes = plt.subplots(len(myCountries),1,figsize=(12,100),squeeze=False)
for i, country in enumerate(myCountries):
    ax = axes[i][0]
    # `ds2` holds a single country per row after the explode step, so use an exact
    # match (as the TV-show cell does). A substring/regex search would also pull in
    # other countries whose name merely contains this one.
    movieOnly = ds2[(ds2['type']=='Movie') & (ds2['country']==country)]
    # Bars are drawn from the most to the least frequent genre.
    genre_order = movieOnly['listed_in'].value_counts().index
    sc = sns.countplot(x='listed_in',data=movieOnly,ax=ax,order=genre_order)
    ax.set_title('Movies in ' + country,fontweight='bold')
    ax.set_xlabel('Genre of Netflix movies')
    # Tick labels must follow the same order as the bars; labelling them with
    # unique() (order of first appearance) would attach genre names to the wrong bars.
    ax.set_xticklabels(genre_order,rotation=90)
    # Write the count on top of every bar.
    for p in sc.patches:
        height = p.get_height()
        x, y = p.get_xy()
        sc.annotate('{:.0f}'.format(height), (x+0.2, y + height))
# Lay the grid out once, after every subplot has been drawn.
fig.tight_layout(pad=5)
plt.show()

In [None]:
# Visualising the dataset with TV content available in selected countries

# One stacked subplot per selected country.
fig, axes = plt.subplots(len(myCountries),1,figsize=(12,100),squeeze=False)
for i, country in enumerate(myCountries):
    ax = axes[i][0]
    # `ds2` holds a single country per row after the explode step.
    TVOnly = ds2[(ds2['type']=='TV Show') & (ds2['country']==country)]
    # Bars are drawn from the most to the least frequent genre.
    genre_order = TVOnly['listed_in'].value_counts().index
    sc = sns.countplot(x='listed_in',data=TVOnly,ax=ax,order=genre_order)
    ax.set_title('TV Shows in ' + country,fontweight='bold')
    ax.set_xlabel('Genre of Netflix TV shows')
    # Tick labels must follow the same order as the bars; labelling them with
    # unique() (order of first appearance) would attach genre names to the wrong bars.
    ax.set_xticklabels(genre_order,rotation=90)
    # Write the count on top of every bar.
    for p in sc.patches:
        height = p.get_height()
        x, y = p.get_xy()
        sc.annotate('{:.0f}'.format(height), (x+0.2, y + height))
# Lay the grid out once, after every subplot has been drawn.
fig.tight_layout(pad=2)
plt.show()

In [None]:
# Formatting the date_added column
# Parse the date_added values into pandas datetimes and store them back on ds1.
parsed_date_added = pd.to_datetime(ds1['date_added'])
ds1['date_added'] = parsed_date_added

In [None]:
# Display ds1 again -- presumably to inspect the converted date_added column.
ds1

In [None]:
# Visualising the number of movies added in 2019.
# `ds1` was exploded on country, so a title occupies one row per country and all of
# those rows share the original index label. Keep the first row per label so each
# title is counted once instead of once per country.
titles = ds1.loc[~ds1.index.duplicated(keep='first')]
# Select on the year the title was ADDED (date_added), which is what the plot's
# title and axis describe; filtering on release_year would mix in titles added in
# other years. Rows with a missing date_added are excluded by the comparison.
added_in_2019 = (titles['date_added'].dt.year==2019) & (titles['type']=='Movie')
# assign() builds a new frame, avoiding a column write on a filtered slice.
df = titles[added_in_2019].assign(month=lambda d: d['date_added'].dt.month)
# NOTE(review): re-check the written conclusion that follows these plots after re-running.
sns.countplot(x='month',data=df)
plt.xlabel('Months-2019')
plt.ylabel('Number of movies')
plt.title('Number of movies added monthwise in 2019')
plt.show()

In [None]:
# Visualising the number of TV shows added in 2019.
# `ds1` was exploded on country, so a title occupies one row per country and all of
# those rows share the original index label. Keep the first row per label so each
# title is counted once instead of once per country.
titles = ds1.loc[~ds1.index.duplicated(keep='first')]
# Select on the year the title was ADDED (date_added), which is what the plot's
# title and axis describe; filtering on release_year would mix in titles added in
# other years. Rows with a missing date_added are excluded by the comparison.
added_in_2019 = (titles['date_added'].dt.year==2019) & (titles['type']=='TV Show')
# assign() builds a new frame, avoiding a column write on a filtered slice.
df = titles[added_in_2019].assign(month=lambda d: d['date_added'].dt.month)
# NOTE(review): re-check the written conclusion that follows this plot after re-running.
sns.countplot(x='month',data=df)
plt.xlabel('Months-2019')
plt.ylabel('Number of TV shows')
plt.title('Number of TV shows added monthwise in 2019')
plt.show()

### The highest number of movies was added in November of 2019. Interestingly, the highest number of TV shows was also added in November of 2019.