In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for folder, _, names in os.walk('/kaggle/input'):
    # Print the full path of every file found directly inside this folder.
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

%matplotlib inline

In [None]:
sns.color_palette('viridis')

In [None]:
# Apply the style sheet first: plt.style.use() overwrites every rcParam the sheet
# defines, so the explicit overrides below must come after it to be sure they stick.
plt.style.use('fivethirtyeight')
plt.rcParams['figure.figsize'] = (15,4)
plt.rcParams['figure.dpi'] = 200

# Two accent colours taken from a single lookup of the 'viridis' palette.
viridis = sns.color_palette('viridis')
blue = viridis[1]
green = viridis[4]

In [None]:
df = pd.read_csv('/kaggle/input/netflix-shows/netflix_titles.csv')

In [None]:
df.head()

In [None]:
df.dtypes # Default data types assigned to the columns

In [None]:
df.info()

In [None]:
df.nunique()

In [None]:
df.isnull().sum()

In [None]:
df.duplicated(subset='title').sum() # No duplicate records

In [None]:
# Convert 'date_added' to datetime. Surrounding whitespace is stripped first because
# recent pandas versions infer a single format for the column, and a padded value
# (e.g. ' August 4, 2017') can fail to match it.
# The dtype guard keeps this cell re-runnable once the column is already datetime.
date_added = df['date_added']
if not pd.api.types.is_datetime64_any_dtype(date_added):
    date_added = date_added.str.strip()
df['date_added'] = pd.to_datetime(date_added)

In [None]:
# Derive the calendar year and month of 'date_added' as two new columns.
added_on = df['date_added'].dt
df['year_added'], df['month_added'] = added_on.year, added_on.month

In [None]:
# Dropping the 'show_id' column. errors='ignore' keeps the cell re-runnable:
# without it a second run raises KeyError because the column is already gone.
df = df.drop(columns='show_id', errors='ignore')

In [None]:
df.head()

### Count of TV shows vs Movies on Netflix

In [None]:
tv_vs_movies_count = df['type'].value_counts()
tv_vs_movies_count

In [None]:
# Pie chart of the content-type split; the second wedge is pulled out slightly.
pie_style = {
    'shadow': True,
    'autopct': '%1.1f%%',
    'wedgeprops': {'edgecolor': 'black'},
    'explode': [0, 0.1],
    'colors': [blue, green],
    'startangle': 90,
    'textprops': {'fontsize': 8},
}

plt.figure(figsize=(4,4), dpi=120)
plt.pie(tv_vs_movies_count.values, labels=tv_vs_movies_count.index, **pie_style)
plt.title('Movies vs TV Shows comparison', fontsize=10);

### Directors with the most titles

In [None]:
# Ten most frequent values of 'director'; rows without a director are ignored.
director_names = df['director'].dropna()
most_directions = director_names.value_counts().head(10)
most_directions

In [None]:
# Horizontal bars: one per director, bar length = number of titles.
sns.barplot(x=most_directions.values, y=most_directions.index, color=blue)
# One tick per whole title, sized from the data instead of a fixed upper bound.
plt.xticks(np.arange(1, most_directions.max() + 1))
plt.xlabel('# of directions')
plt.ylabel('Directors')
plt.title('Most number of directions')
# Lay out last so the title is taken into account when margins are computed.
plt.tight_layout();

### Most content per country

In [None]:
# For many records the 'country' column contains several comma-separated values, as shown below

In [None]:
# Rows whose 'country' value contains a comma; missing countries count as "no match".
df.loc[df['country'].str.contains(',', na=False)].head()

In [None]:
# The function below counts every occurrence of a country to get an accurate result.
# For example, if a record lists 2 countries - India and USA - it is counted once for each of them.

In [None]:
tracker = {} # Dictionary to keep a count of every occurrence of a country


def count_country(country_str):
    """Add every country named in ``country_str`` to the module-level ``tracker`` tally.

    Parameters
    ----------
    country_str : str
        Comma-separated country names, e.g. ``'India, United States'``.

    Returns
    -------
    None
        The counts are recorded by mutating ``tracker``.
    """
    for raw_name in country_str.split(','):
        # Splitting on the bare comma and trimming tolerates irregular spacing;
        # empty pieces (from a leading or trailing comma) are not real countries.
        country = raw_name.strip()
        if country:
            tracker[country] = tracker.get(country, 0) + 1

In [None]:
tracker = {} # Resetting here, so re-running this cell cannot double-count
# Tally every non-null 'country' value; the trailing ';' hides the all-None result of apply().
df['country'].dropna().apply(count_country);

In [None]:
tracker # Accurate country count (including both movies and tv shows)

In [None]:
# Turn the {country: count} tally into a two-column frame: 'country' and 'count'.
country_count = (
    pd.DataFrame(data=tracker.values(), index=tracker.keys(), columns=['Count'])
    .reset_index()
    .rename(columns={'index': 'country', 'Count': 'count'})
)

In [None]:
country_count_top10 = country_count.sort_values('count', ascending=False).head(10)
country_count_top10

In [None]:
# Bar chart of the ten countries with the highest title count.
sns.barplot(data=country_count_top10, x='country', y='count', color=green)
plt.xlabel('Countries')
plt.ylabel('Count of content')
plt.title('Most content by country')
plt.tight_layout();

#### Breakdown of the above count on 'Movies' and 'TV Shows'

In [None]:
tracker = {} # Resetting

# Shared mask: rows that have a country recorded.
has_country = df['country'].notnull()

movies = df.loc[(df['type'] == 'Movie') & has_country]   # All movies where country is non null
tv = df.loc[(df['type'] == 'TV Show') & has_country]     # All TV shows where country is non null

In [None]:
tracker = {} # Resetting here as well, so re-running this cell cannot double-count
# dropna() guards against `movies` having been rebound further down the notebook to a
# frame that still contains null countries; the trailing ';' hides apply()'s all-None result.
movies['country'].dropna().apply(count_country);

In [None]:
tracker # Count of Movies by country

In [None]:
# Build a frame from the movie tally (a column for the TV-show count is added later)

content_by_country = (
    pd.DataFrame(data=tracker.values(), index=tracker.keys(), columns=['Count'])
    .reset_index()
    .rename(columns={'index': 'country', 'Count': 'movies'})
)

content_by_country

In [None]:
tracker = {} # Resetting: the TV-show tally starts from an empty dictionary

# Tally the countries of every TV show; count_country records them in `tracker`.
tv['country'].apply(count_country)

In [None]:
tracker # Count of TV shows per country

In [None]:
# Adding the above dictionary to the content_by_country dataframe for a head-to-head comparison.
# An outer merge keeps countries that appear only in the TV tally, and missing counts
# become 0 instead of NaN so that sums and plots built from this frame stay complete.

tv_counts = pd.DataFrame({'country': list(tracker.keys()), 'tv_shows': list(tracker.values())})
content_by_country = (
    content_by_country
    .drop(columns='tv_shows', errors='ignore')   # keeps the cell re-runnable
    .merge(tv_counts, on='country', how='outer')
    .fillna({'movies': 0, 'tv_shows': 0})
    .astype({'movies': 'int64', 'tv_shows': 'int64'})
)

In [None]:
# Validation: total titles per country (movies + TV shows). The top rows should agree with
# the combined per-country count computed earlier (presumably the country_count_top10 table).
# Note: '+' propagates NaN, so a missing value in either column yields a NaN total.
content_by_country['total_content'] = content_by_country['movies'] + content_by_country['tv_shows']


# Ten countries with the largest combined total.
top_content = content_by_country.sort_values('total_content', ascending=False).head(10)
top_content

In [None]:
# Visualizing the content per country based on the type of content

x_ticks = np.arange(len(top_content))   # one slot per plotted country, however many there are
width=0.25

# Two bars per country, side by side: movies on the left, TV shows on the right.
plt.bar(x_ticks-width, top_content['movies'], label='Movies', width=width, color=blue)
plt.bar(x_ticks, top_content['tv_shows'], label='TV Shows', width=width, color=green)

# Centre each country label under its pair of bars.
plt.xticks(ticks=x_ticks-(width/2), labels=top_content.country)
plt.title('Content comparison')
plt.xlabel('Countries')
plt.ylabel('Count of content')

plt.legend();


### Distribution of duration for Movies and TV Shows

In [None]:
movies = df.loc[df['type'] == 'Movie'] # All records with type 'movies'

In [None]:
len(movies) # Total records 

In [None]:
movies['duration'].str.contains('min').sum() # All records have time in 'mins'

In [None]:
# Strip the ' min' suffix and store the duration as a number.
# The frame is rebuilt from df with the same 'Movie' filter and assign(), which avoids
# writing into a slice of df (SettingWithCopyWarning) and keeps this cell safe to re-run.
movies = df.loc[df['type'] == 'Movie'].assign(
    duration=lambda frame: pd.to_numeric(frame['duration'].str.replace(' min', '', regex=False))
)

In [None]:
movies.head(2)

In [None]:
# Bin edges every 20 minutes. The top edge grows with the longest movie so that no title
# falls beyond the last bin and silently drops out of the histogram; it never shrinks
# below the original 0-280 range.
bin_width = 20
longest = movies['duration'].max()
top_edge = max(280, int(np.ceil(longest / bin_width)) * bin_width)
duration_bins = np.arange(0, top_edge + bin_width, bin_width)

sns.histplot(data=movies, x='duration', bins=duration_bins, kde=True)
plt.xticks(ticks=duration_bins)
plt.title('Movie Duration Distribution')
plt.ylabel('# of Movies')
plt.xlabel('Duration in minutes')
plt.tight_layout();

In [None]:
tv_shows = df.loc[df['type'] == 'TV Show'] # All records with type 'TV Show'

In [None]:
len(tv_shows['duration']) # Total records with type 'TV Show'

In [None]:
tv_shows['duration'].str.contains('Season').sum() # Duration for all TV Shows is measured in Seasons

In [None]:
tv_shows['duration'].value_counts()

In [None]:
# Bars are ordered from the most to the least common 'duration' value.
season_order = tv_shows['duration'].value_counts().index

plt.figure(figsize=(15,6), dpi=200)
sns.countplot(data=tv_shows, x='duration', order=season_order, color=green)
plt.xticks(rotation=90)

plt.title("TV Shows duration's Distribution")
plt.xlabel('Duration in Seasons')
plt.ylabel('# of TV Shows')
plt.tight_layout();

### Trend of TV Shows and Movies added per year since 2015

In [None]:
df['year_added'].value_counts().sort_index(ascending=False)

In [None]:
# Titles added from 2015 onwards. The variable name is kept for compatibility even though
# the window is "everything since start_year", not a fixed ten years.
start_year = 2015
past_10_years = df.loc[df['year_added'] >= start_year]

# Sorted years present in the window; cast to int for the labels because the years
# may be stored as floats (e.g. 2015.0).
ticks = sorted(past_10_years['year_added'].unique())
labels = [int(tick) for tick in ticks]

sns.countplot(data=past_10_years, x='year_added', hue='type', order=ticks, palette='viridis')
# One tick per year actually present, rather than a hard-coded count.
plt.xticks(ticks=np.arange(len(labels)), labels=labels)
# The x axis carries the years and the y axis the number of titles.
plt.xlabel('Year added')
plt.ylabel('Content added')
plt.legend(loc=("upper left"), ncol=2)
plt.title('Content added per year since 2015');


### Analyzing the 'Rating' field

In [None]:
ratings = df.loc[df['rating'].notnull()]
ratings.head(2)

In [None]:
# Number of titles per rating, split by content type.
sns.countplot(x='rating', hue='type', data=ratings, palette='viridis')
plt.title('Rating categorization for movies/tv shows')
plt.xlabel('Rating')
plt.ylabel('Movies/TV Shows')
plt.legend(loc='upper right')
plt.tight_layout();

### Most popular genre in Movies and TV Shows

In [None]:
movies.head(2)

In [None]:
genre_bucket = {} # Dictionary to keep a count of every occurrence of a genre

def count_genres(genres):
    """Add every genre named in ``genres`` to the module-level ``genre_bucket`` tally.

    Parameters
    ----------
    genres : str
        Comma-separated genre names, e.g. ``'Dramas, Comedies'``.

    Returns
    -------
    None
        The counts are recorded by mutating ``genre_bucket``.
    """
    for raw_name in genres.split(','):
        # Splitting on the bare comma and trimming tolerates irregular spacing;
        # empty pieces (from a leading or trailing comma) are skipped.
        genre = raw_name.strip()
        if genre:
            genre_bucket[genre] = genre_bucket.get(genre, 0) + 1


In [None]:
genre_bucket = {} # Resetting here, so re-running this cell cannot double-count
# Tally the genres of every movie; the trailing ';' hides the all-None result of apply().
movies['listed_in'].apply(count_genres);

In [None]:
genre_bucket

In [None]:
# Turn the {genre: count} tally into a two-column frame: 'genre' and 'count'.
most_popular_movie_genres = (
    pd.DataFrame(genre_bucket.values(), index=genre_bucket.keys(), columns=['count'])
    .reset_index()
    .rename(columns={'index': 'genre'})
)

In [None]:
most_popular_movie_genres = most_popular_movie_genres.sort_values('count', ascending=False)
most_popular_movie_genres.head()

In [None]:
# Horizontal bar chart with one bar per movie genre.
plt.figure(figsize=(15,8), dpi=150)
sns.barplot(x='count', y='genre', data=most_popular_movie_genres)
plt.ylabel('Movie Genres')
plt.xlabel('')
plt.title('Most popular movie genres');

In [None]:
tv_shows.head(2)

In [None]:
genre_bucket = {} # Resetting

In [None]:
genre_bucket = {} # Resetting here as well, so re-running this cell cannot double-count
# Tally the genres of every TV show; the trailing ';' hides the all-None result of apply().
tv_shows['listed_in'].apply(count_genres);

In [None]:
genre_bucket

In [None]:
# Turn the {genre: count} tally into a two-column frame: 'genre' and 'count'.
most_popular_tv_genres = (
    pd.DataFrame(genre_bucket.values(), index=genre_bucket.keys(), columns=['count'])
    .reset_index()
    .rename(columns={'index': 'genre'})
)

In [None]:
most_popular_tv_genres = most_popular_tv_genres.sort_values('count', ascending=False)
most_popular_tv_genres.head()

In [None]:
# Horizontal bar chart with one bar per TV-show genre.
plt.figure(figsize=(15,8), dpi=150)
sns.barplot(x='count', y='genre', data=most_popular_tv_genres)
plt.ylabel('TV Show Genres')
plt.xlabel('')
plt.title('Most popular tv show genres');

In [None]:
# If you have come this far, I would really appreciate it if you could leave constructive feedback in the comments below!