In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the Netflix titles CSV into a DataFrame and preview the first rows.
netflix_csv_path = '/kaggle/input/netflix-shows/netflix_titles.csv'
data = pd.read_csv(netflix_csv_path)
data.head()

In [None]:
# Shape of the loaded DataFrame as a (rows, columns) tuple.
data.shape

We have 6234 rows and 12 columns. Let's list all the columns.


In [None]:
# Column labels of the loaded DataFrame.
data.columns

Are there any null values?


In [None]:
# Number of missing values in each column.
missing_mask = data.isna()
missing_mask.sum()

Director, cast, country, date added and rating have missing values. We'll deal with this further along the line.

For now, I will drop the show_id column, as it does not seem to add any real value to the analysis.

In [None]:
# Drop the show_id column; nothing later in this notebook reads it.
# Reassigning (rather than inplace=True) together with errors='ignore' keeps
# the cell safe to re-run: a second execution no longer raises KeyError
# because the column is already gone.
data = data.drop(columns = ['show_id'], errors = 'ignore')

# Plots

Let's see how many movies and TV shows we have across our dataset.

In [None]:
# Bar chart with one bar per value of the `type` column.
plt.figure(figsize=(8, 6))
sns.set_style('darkgrid')
type_axes = sns.countplot(data=data, x='type', palette='Set2')
type_axes.set_title('Plot Depicting Number of Movies and TV Shows in Dataset')
plt.show()

Another thing I am interested in is how Movies and TV Shows are distributed across the years. There are a whole lot of TV shows that we have come to love, and it would be interesting to see when this all started to pick up.

But as our dataset is skewed more towards the Movie side of the spectrum, I am not very sure if we can get a very clear picture.

Lets try anyways.

In [None]:
# Count titles for every (release_year, type) pair. The earlier version put
# the DataFrame's row index on the y-axis, which draws the average row
# position per year rather than how many titles were released that year.
# crosstab fills year/type combinations that have no titles with 0, so the
# lines drop to zero instead of skipping those years.
titles_per_year = (
    pd.crosstab(data['release_year'], data['type'])
      .stack()
      .reset_index(name = 'title_count')
)

plt.figure(figsize = (18,8))
sns.set_style('darkgrid')
# After aggregation there is exactly one value per year and type, so no
# confidence interval is requested (the old `ci` argument is not needed).
sns.lineplot(x = 'release_year', y = 'title_count', hue = 'type',
             palette = 'Set2', data = titles_per_year)
plt.xlabel('Release Year')
plt.ylabel('Number of Titles')
plt.title('Plot Depicting Number of Movies and TV Shows per Year')
plt.show()

I will create two different datasets. One for Movies and one for TV Shows.

In [None]:
# Split the catalogue by title type.
# .copy() makes each subset an independent DataFrame: later cells drop rows
# and overwrite the `duration` column on these frames, which would otherwise
# be chained assignment on a slice of `data` (SettingWithCopyWarning, hidden
# here by the global warnings filter).
movies_data = data[data['type'] == 'Movie'].copy()
tv_shows_data = data[data['type'] == 'TV Show'].copy()

## Movies

In [None]:
# Number of missing values in each column of the movie subset.
movies_missing_mask = movies_data.isna()
movies_missing_mask.sum()

Dropping all rows that contain null values.

In [None]:
# Remove every movie row that has a missing value in any column.
# Reassigning instead of inplace=True avoids mutating a frame that was
# produced by boolean filtering of `data`, and keeps the cell chain-friendly.
movies_data = movies_data.dropna(axis = 0)

### Number of Movies per Year

In [None]:
# Bar chart: number of movies for each release year.
plt.figure(figsize=(18, 8))
sns.set_style('darkgrid')
movie_year_axes = sns.countplot(data=movies_data, x='release_year', palette='Set2')
plt.xticks(rotation=90)
movie_year_axes.set_title('Number of Movies per Year')
plt.show()

Note that the number of movies is very low in 2020. No wonder there! This is a year we would all have liked to skip ahead of. At least I would.

The number of movies grew substantially ever since 2014. I wonder if Netflix started to produce originals at this time.


### **How many movies per country (top20) ?**

In [None]:
# Bar chart restricted to the 20 most frequent values of `country`.
top_movie_countries = movies_data['country'].value_counts().head(20).index

plt.figure(figsize=(18, 8))
sns.set_style('darkgrid')
movie_country_axes = sns.countplot(
    data=movies_data,
    x='country',
    order=top_movie_countries,
    palette='Set2',
)
plt.xticks(rotation=90)
movie_country_axes.set_title('Number of movies as per country')
plt.show()

USA and India are major players in Movies department.

### **Movies per country/Year (top 5)**

In [None]:
# Keep only movies whose `country` value exactly matches one of these five names.
country_cols = ['United States', 'India', 'United Kingdom', 'Canada', 'Spain']
in_selected_country = movies_data['country'].isin(country_cols)
movies_country_year_data = movies_data.loc[in_selected_country]
movies_country_year_data.head()

In [None]:
# Movie counts for the 10 most frequent release years, split by country.
busiest_movie_years = (
    movies_country_year_data['release_year'].value_counts().head(10).index
)

plt.figure(figsize=(18, 8))
sns.set_style('darkgrid')
movie_year_country_axes = sns.countplot(
    data=movies_country_year_data,
    x='release_year',
    hue='country',
    order=busiest_movie_years,
    palette='Set2',
)
plt.xticks(rotation=90)
movie_year_country_axes.set_title(
    'Count Distribution of Movies per Years (TOP 10) vs Country (TOP 5)'
)
movie_year_country_axes.legend(loc='upper right')
plt.show()

### **Count of Ratings (top 10)**

In [None]:
# Horizontal bar chart of the 10 most frequent movie ratings.
top_movie_ratings = movies_data['rating'].value_counts().head(10).index

plt.figure(figsize=(18, 8))
sns.set_style('darkgrid')
movie_rating_axes = sns.countplot(
    data=movies_data,
    y='rating',
    order=top_movie_ratings,
    palette='Set2',
)
movie_rating_axes.set_title('Count of Ratings (TOP 10)')
plt.show()

### **Ratings (top10) across Countries (top 5)**

In [None]:
# Counts of the 10 most frequent ratings in the country subset, split by country.
top_subset_ratings = (
    movies_country_year_data['rating'].value_counts().head(10).index
)

plt.figure(figsize=(18, 8))
sns.set_style('darkgrid')
rating_country_axes = sns.countplot(
    data=movies_country_year_data,
    x='rating',
    hue='country',
    order=top_subset_ratings,
    palette='Set2',
)
rating_country_axes.set_title('Ratings (TOP 10) across Countries (TOP 5)')
rating_country_axes.legend(loc='upper right')
plt.show()

### **Listed In?**

The feature listed_in describes the genre/class of the movies/TV shows. 

#### **Count of Genre (top 15)**

In [None]:
# Bar chart of the 15 most frequent `listed_in` values among movies.
top_movie_genres = movies_data['listed_in'].value_counts().head(15).index

plt.figure(figsize=(18, 8))
sns.set_style('darkgrid')
movie_genre_axes = sns.countplot(
    data=movies_data,
    x='listed_in',
    order=top_movie_genres,
    palette='Set2',
)
plt.xticks(rotation=90)
movie_genre_axes.set_xlabel('Genre')
movie_genre_axes.set_title('Counts of Genre')
plt.show()

### Duration

In [None]:
# Convert "<n> min" strings into integer minutes.
# Casting to str before using the .str accessor makes the cell idempotent:
# re-running it after the column is already numeric no longer raises
# AttributeError. The unit is removed outright (not swapped for a space) and
# any surrounding whitespace is stripped before the integer cast.
movies_data['duration'] = (
    movies_data['duration']
    .astype(str)
    .str.replace(' min', '', regex = False)
    .str.strip()
    .astype(int)
)
# Preview only the first rows instead of echoing the whole column.
movies_data['duration'].head()

In [None]:
# Density estimate of movie durations.
plt.figure(figsize = (8,6))
# `fill` is the current name of the deprecated `shade` keyword in seaborn.
sns.kdeplot(data = movies_data['duration'], fill = True)
# Title spelling corrected ("Kernal" -> "Kernel").
plt.title('Kernel Density Estimation for Duration')
plt.show()

Most of the movies range from 90-120 mins which is pretty standard.

# TV Shows

In [None]:
# Number of missing values in each column of the TV show subset.
tv_missing_mask = tv_shows_data.isna()
tv_missing_mask.sum()

In [None]:
# Remove every TV show row that has a missing value in any column.
# Reassigning instead of inplace=True avoids mutating a frame that was
# produced by boolean filtering of `data`, and keeps the cell chain-friendly.
tv_shows_data = tv_shows_data.dropna(axis = 0)

### Number of TV Shows per Year

In [None]:
# Bar chart: number of TV shows for each release year.
plt.figure(figsize=(18, 8))
sns.set_style('darkgrid')
tv_year_axes = sns.countplot(data=tv_shows_data, x='release_year', palette='Set2')
plt.xticks(rotation=90)
tv_year_axes.set_title('Number of TV Shows per Year')
plt.show()

This is a much cleaner chart than its movies counterpart. Not a surprise as TV Series were not a cultural norm back in 40s-80s. 

The boom in number of TV Series produced each year can be seen steadily increasing since 2014. 

Again 2020 being all crazy argument can be made for TV Shows as well but on further thought it can have something to do with when this dataset was compiled. If it was compiled earlier this year then naturally the number of TV Shows/Movies would be less.

### Top 20 Countries having most number of TV Shows.

In [None]:
# Bar chart restricted to the 20 most frequent values of `country` among TV shows.
top_tv_countries = tv_shows_data['country'].value_counts().head(20).index

plt.figure(figsize=(18, 8))
sns.set_style('darkgrid')
tv_country_axes = sns.countplot(
    data=tv_shows_data,
    x='country',
    order=top_tv_countries,
    palette='Set2',
)
plt.xticks(rotation=90)
tv_country_axes.set_title('Top 20 Countries with Most TV Shows')
plt.show()

In contrast with the Movies, India has receded to 6th position in the number of TV shows present. Netflix is fairly new in India and TV series are just starting to gain momentum here, so I am not very surprised.

Also, we can see that the UK has a large number of TV shows to its name. And surprisingly, Japan, South Korea and Turkey are leading as well. This is a surprise, at least to me!

### Number of TV Shows Each Year for Top 5 countries

In [None]:
# Keep only TV shows whose `country` value exactly matches one of these five names.
country_cols_tv = ['United States', 'United Kingdom', 'Japan', 'South Korea', 'Turkey']
in_selected_tv_country = tv_shows_data['country'].isin(country_cols_tv)
tv_country_year_data = tv_shows_data.loc[in_selected_tv_country]
tv_country_year_data.head()

In [None]:
# TV show counts for the 15 most frequent release years, split by country.
busiest_tv_years = (
    tv_country_year_data['release_year'].value_counts().head(15).index
)

plt.figure(figsize=(18, 8))
sns.set_style('darkgrid')
tv_year_country_axes = sns.countplot(
    data=tv_country_year_data,
    x='release_year',
    hue='country',
    order=busiest_tv_years,
    palette='Set2',
)
plt.xticks(rotation=90)
tv_year_country_axes.set_title(
    'Number of TV Shows Each Year(TOP 15) per Country (TOP 5)'
)
tv_year_country_axes.legend(loc='upper right')
plt.show()

Very interesting chart, I must say. The United States has consistently outperformed all the other countries across the chart.
1. UK had no consistency in productions before 2015. It had 1 TV show in 2001,then again in 2006, a couple more in 2012 before it became consistent.
2. South Korea has had TV shows ever since 2012
3. Turkey has shown no consistency and still made the list. Kudos!

### Count of Ratings

In [None]:
# Horizontal bar chart of the (up to) 10 most frequent TV show ratings.
top_tv_ratings = tv_shows_data['rating'].value_counts().head(10).index

plt.figure(figsize=(18, 8))
sns.set_style('darkgrid')
tv_rating_axes = sns.countplot(
    data=tv_shows_data,
    y='rating',
    order=top_tv_ratings,
    palette='Set2',
)
plt.yticks(rotation=90)
tv_rating_axes.set_title('Top 10 Ratings for TV Shows')
plt.show()

Well, there are only 7 ratings that are present for TV Shows.

### Ratings Across Top 5 Countries

In [None]:
# Rating counts for the selected TV countries, split by country.
plt.figure(figsize = (18,8))
sns.set_style('darkgrid')
sns.countplot(x = 'rating', hue = 'country', palette = 'Set2',
             data = tv_country_year_data)
# The keyword must be lowercase `rotation`: the capitalised `Rotation` is not
# a valid Text property and is rejected by current Matplotlib releases.
plt.xticks(rotation = 90)
plt.legend(loc = 'upper right')
# Title spelling corrected ("Distribuition" -> "Distribution").
plt.title('Countrywise Distribution of Ratings')
plt.show()

The United States is the only country with an R-rated show! ;)

### Genres (Listed In)

In [None]:
# Bar chart of the 10 most frequent `listed_in` values among TV shows.
top_tv_genres = tv_shows_data['listed_in'].value_counts().head(10).index

plt.figure(figsize=(18, 8))
sns.set_style('darkgrid')
tv_genre_axes = sns.countplot(
    data=tv_shows_data,
    x='listed_in',
    order=top_tv_genres,
    palette='Set2',
)
plt.xticks(rotation=90)
tv_genre_axes.set_title('Distribution for Top 10 Genres')
plt.show()

#### Genre distribution in top 5 countries

In [None]:
# Counts of the 10 most frequent `listed_in` values in the country subset, split by country.
top_subset_genres = (
    tv_country_year_data['listed_in'].value_counts().head(10).index
)

plt.figure(figsize=(18, 8))
sns.set_style('darkgrid')
tv_genre_country_axes = sns.countplot(
    data=tv_country_year_data,
    x='listed_in',
    hue='country',
    order=top_subset_genres,
    palette='Set2',
)
plt.xticks(rotation=90)
tv_genre_country_axes.set_title('Distribution for Top 10 Genres for Top 5 Countries')
tv_genre_country_axes.legend(loc='upper right')
plt.show()

Nice Distribution. :P

### Duration (Number of Seasons)

In [None]:
# Distinct raw values of the `duration` column for TV shows.
tv_shows_data['duration'].unique()

In [None]:
# Strip the " Season" / " Seasons" suffix so only the season count remains
# (still as text; the integer cast happens in the next cell).
# Casting to str first keeps this cell re-runnable after the column has been
# converted to int, and a single anchored pattern handles both singular and
# plural without leaving stray spaces behind.
tv_shows_data['duration'] = (
    tv_shows_data['duration']
    .astype(str)
    .str.replace(r'\s*Seasons?$', '', regex = True)
    .str.strip()
)
tv_shows_data['duration'].unique()

In [None]:
# Cast the season counts to integers, going through str first.
season_count_text = tv_shows_data['duration'].astype('str')
tv_shows_data['duration'] = season_count_text.astype('int')

In [None]:
# Density estimate of the number of seasons per TV show.
plt.figure(figsize = (8,6))
sns.set_style('darkgrid')
# `fill` is the current name of the deprecated `shade` keyword in seaborn.
sns.kdeplot(data = tv_shows_data['duration'], fill = True)
# Title spelling corrected ("Kernal" -> "Kernel").
plt.title('Kernel Density Estimation for Duration of TV Shows')
plt.show()

So most of the TV Shows have just 1 season and a few have 2 seasons. Netflix does have a tendency to cancel shows as well. Could that be the reason?

### I will add more analysis with time.
## Any and all feedback or suggestions are welcome. Let me know down in the comments what you think or suggest any improvements.
# If you like this Kernel, please leave an upvote!
# Cheers!!
