# Importing Libraries

In [None]:
!pip install siuba

In [None]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns
import datetime
from siuba.dply.forcats import fct_lump

import os
# Print the full path of every file found below the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

### Setting the Style

In [None]:
# Use seaborn's 'darkgrid' style for the plots drawn in this notebook.
sns.set_theme(style = 'darkgrid')

# Importing Data

In [None]:
# Load the raw Netflix titles CSV from the Kaggle input directory.
df = pd.read_csv('/kaggle/input/netflix-shows/netflix_titles.csv')
# Work on a copy so the raw frame `df` is left untouched by the preprocessing.
netflix = df.copy()
# Preview the first rows.
netflix.head()

### Inspecting the Data

In [None]:
# Percentage of missing values in every column.
row_total = netflix.shape[0]
netflix.isna().sum().div(row_total).mul(100)

# Preprocessing

In [None]:
# Fill the missing values of each column with an explicit placeholder.
netflix['director'] = netflix['director'].fillna(value = 'No Director')
netflix['cast'] = netflix['cast'].fillna(value = 'No Cast')
# NOTE: missing countries are imputed as 'United States', so every per-country
# count computed from this frame includes those imputed rows.
netflix['country'] = netflix['country'].fillna(value = 'United States')
netflix['rating'] = netflix['rating'].fillna(value = 'Other')

# Parse the dates BEFORE filling the gaps. Filling first would mix a datetime
# object into a column of date strings, which makes the later parsing depend on
# how the installed pandas handles mixed input. String values are stripped
# defensively so stray surrounding whitespace cannot upset format inference;
# non-string values pass through, which keeps this cell safe to re-run.
added_raw = netflix['date_added'].map(lambda value: value.strip() if isinstance(value, str) else value)
netflix['date_added'] = pd.to_datetime(added_raw).fillna(pd.Timestamp(2020, 1, 1))

# Keep the 5 most frequent rating levels and lump every other level together.
netflix['rating'] = fct_lump(netflix['rating'], n = 5)

# Month number in which each title was added.
netflix['month'] = netflix['date_added'].dt.month

In [None]:
# Re-check the percentage of missing values per column after the fills.
missing_per_column = netflix.isna().sum()
missing_per_column / netflix.shape[0] * 100

# Creating Useful Functions for Efficiency!

In [None]:
def column_list_tokenizer_count(df, subset_cols, secondary_col):
    """Split a comma-separated text column into individual tokens.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame. It is not modified.
    subset_cols : str
        Name of the column holding comma-separated values. Rows where this
        column is missing are skipped.
    secondary_col : str or None
        Optional name of a second column whose value is paired with every
        token of the same row.

    Returns
    -------
    pandas.DataFrame
        * ``secondary_col is None``: one column with one whitespace-stripped
          token per row, in input order. No counting is done in this mode.
        * otherwise: one row per distinct (secondary value, token) pair plus
          a ``'count'`` column with its frequency, most frequent first.

    Notes
    -----
    For backward compatibility the output headers are still taken from the
    data: the token column is named after the last raw (unstripped) token
    that was seen, and the secondary column after the last secondary value.
    Callers should therefore address the columns by position or rename them.
    For an empty input, which used to raise ``NameError``, the headers fall
    back to ``subset_cols`` and ``secondary_col``.
    """
    clean_df = df.dropna(subset = [subset_cols])

    # Fallback headers for the case where there is nothing to tokenize.
    token_header = subset_cols
    secondary_header = secondary_col

    if secondary_col is None:
        tokens = []
        for cell in clean_df[subset_cols]:
            for raw_token in str(cell).strip(' ').split(','):
                tokens.append([raw_token.strip()])
                token_header = raw_token
        return pd.DataFrame(data = tokens, columns = [token_header])

    pairs = []
    for secondary_value, cell in zip(clean_df[secondary_col], clean_df[subset_cols]):
        secondary_header = secondary_value
        for raw_token in str(cell).strip(' ').split(','):
            pairs.append([secondary_value, raw_token.strip()])
            token_header = raw_token

    if not pairs:
        return pd.DataFrame(columns = [secondary_header, token_header, 'count'])

    token_data = pd.DataFrame(data = pairs, columns = [secondary_header, token_header])
    # Naming the counts Series explicitly gives a 'count' column regardless of
    # what the installed pandas calls the result of value_counts().
    return token_data.value_counts().rename('count').reset_index()

In [None]:
def plot_bar(x_var, y_var, df, num_colors, title_name, xlabel_name, ylabel_name, hue_col):
    """Draw a seaborn bar plot on a new 10x6 figure and show it.

    Parameters
    ----------
    x_var, y_var : str
        Column names passed to ``sns.barplot`` as ``x`` and ``y``.
    df : pandas.DataFrame
        Data to plot.
    num_colors : int
        Number of shades generated for the dark red palette.
    title_name, xlabel_name, ylabel_name : str
        Figure title and axis labels.
    hue_col : str or None
        Column used for hue grouping; ``None`` draws ungrouped bars.
    """
    plt.figure(figsize = (10,6))
    # The base colour is an RGB hex string, so it is declared as 'rgb';
    # 'hsl' is not one of the input spaces dark_palette documents.
    bar_palette = sns.dark_palette(color = '#b60c26', n_colors = num_colors, reverse = True, input = 'rgb')
    # hue = None is seaborn's default, so a single call covers both the grouped
    # and the ungrouped case. ci = None is the documented way to switch the
    # error bars off (False is not a valid interval size).
    sns.barplot(x = x_var, y = y_var, data = df, ci = None, hue = hue_col, palette = bar_palette)
    plt.title(title_name, fontdict = {'fontsize': 16, 'fontweight': 'bold'})
    plt.xlabel(xlabel_name)
    plt.ylabel(ylabel_name)
    plt.show()

# Genres Distribution between Movies and TV Shows

In [None]:
# Count how often each genre occurs per content type. The helper names its
# output columns after values found in the data, so the columns are labelled by
# position here rather than by whatever the last row of the dataset contains.
genre_data = (column_list_tokenizer_count(df = netflix, subset_cols = 'listed_in', secondary_col = 'type')
             .set_axis(['type', 'genre', 'count'], axis = 1))

In [None]:
# Ten most frequent genres among movies.
movie_genres = genre_data.loc[genre_data['type'] == 'Movie']
top_genre_movies = movie_genres.nlargest(n = 10, columns = 'count')
top_genre_movies

In [None]:
# Bar chart of the ten most frequent movie genres.
plot_bar(x_var = 'count', y_var = 'genre', df = top_genre_movies, num_colors = 10,
         title_name = 'Top 10 Genres in Movies',
         xlabel_name = 'Frequency', ylabel_name = 'Name of Genres',
         hue_col = None)

In [None]:
# Ten most frequent genres among TV shows.
show_genres = genre_data.loc[genre_data['type'] == 'TV Show']
top_genre_shows = show_genres.nlargest(n = 10, columns = 'count')
top_genre_shows

In [None]:
# Bar chart of the ten most frequent TV show genres.
plot_bar(x_var = 'count', y_var = 'genre', df = top_genre_shows, num_colors = 10,
         title_name = 'Top 10 Genres in TV Shows',
         xlabel_name = 'Frequency', ylabel_name = 'Name of Genres',
         hue_col = None)

# Content Type Distribution in the World

In [None]:
# Number of titles per content type, split by rating level.
plt.figure(figsize = (10, 6))
sns.countplot(x = 'type', hue = 'rating', data = netflix, palette = sns.color_palette("icefire"))
# Title spelling corrected ("Distibution" -> "Distribution").
plt.title('Rating Type Distribution in Movies and TV Shows', fontdict = {'fontsize': 16, 'fontweight': 'bold'})
plt.xlabel('Content Type')
plt.ylabel('Rating Frequency')
plt.show()

In [None]:
# Count how many titles list each country and keep the ten largest.
# The helper names its single output column after a value found in the data,
# and the header pandas gives the counts differs between versions, so both
# columns are labelled by position instead of being renamed by header.
top_10_countries = (column_list_tokenizer_count(df = netflix, subset_cols = 'country', secondary_col = None)
                   .value_counts()
                   .reset_index()
                   .set_axis(['country', 'count'], axis = 1)
                   .nlargest(n = 10, columns = 'count'))

plot_bar(x_var = 'count',
         y_var = 'country',
         df = top_10_countries, 
         xlabel_name = 'Frequency', 
         ylabel_name ='Name of Countries', 
         title_name = 'Countries with the Most Content', 
         num_colors = 10, 
         hue_col = None)

In [None]:
# Per-country title counts split by content type. The helper's column headers
# come from the data, so the columns are labelled by position.
country_data = (column_list_tokenizer_count(df = netflix, subset_cols = 'country',secondary_col = 'type')
                .set_axis(['type', 'country', 'count'], axis = 1))
# Keep only the countries that made the overall top ten.
top_countries_content = country_data[country_data['country'].isin(top_10_countries['country'])]
top_countries_content.head()

In [None]:
# Grouped bar chart: content type counts for each of the top countries.
plot_bar(x_var = 'count', y_var = 'country', df = top_countries_content, num_colors = 2,
         title_name = 'Which Content Type does each Country produce the most?',
         xlabel_name = 'Frequency', ylabel_name = 'Name of Countries',
         hue_col = 'type')

In [None]:
# Titles per content type and release year, for release years 2007 up to
# (but not including) 2021.
released = netflix['release_year']
in_window = released.ge(2007) & released.lt(2021)
years_count_type = (netflix.loc[in_window]
                    .groupby(by = ['type', 'release_year'], as_index = False)['show_id']
                    .count()
                    .rename(columns = {'show_id': 'count'}))
years_count_type.head()

In [None]:
# Line chart of yearly title counts, one line per content type.
plt.figure(figsize = (10, 6))
# The base colour is an RGB hex string, so it is declared as 'rgb';
# 'hsl' is not one of the input spaces dark_palette documents.
type_palette = sns.dark_palette(color = '#b60c26', n_colors = 2, reverse = True, input = 'rgb')
sns.lineplot(x = 'release_year', y = 'count', hue = 'type', data = years_count_type, 
             palette = type_palette)
plt.title('The Growth of Movies/TV Shows over the years', fontdict = {'fontsize': 16, 'fontweight': 'bold'})
plt.xlabel('Release Year')
# One tick per release year present in the data.
plt.xticks(ticks = years_count_type['release_year'].unique())
plt.ylabel('Frequency')
plt.show()

In [None]:
# Number of titles for every (month, rating) combination.
monthly_rating_counts = netflix.groupby(by = ['month', 'rating'])['month'].count()
net_rating = monthly_rating_counts.rename('count').reset_index()

# One line per rating level, with a distinct marker and dash style each.
plt.figure(figsize = (10,6))
sns.lineplot(data = net_rating,
             x = 'month', y = 'count',
             hue = 'rating', style = 'rating',
             markers= True,
             palette = sns.color_palette("icefire"))

plt.title('Which Rating Type does Netflix\n put more into their Platform Per Month?', 
           fontdict = {'fontsize': 16, 'fontweight': 'bold'})
month_ticks = net_rating['month'].unique()
plt.xticks(ticks = month_ticks)
plt.xlabel('Month')
plt.ylabel('Frequency')
plt.show()

In [None]:
# Number of titles for every (month, rating, content type) combination.
net_type = (netflix
            .groupby(by = ['month', 'rating', 'type'])['month']
            .count()
            .to_frame()
            .rename(columns = {'month':'count'})
            .reset_index())

# One facet per content type, one line per rating level.
g = sns.FacetGrid(net_type, col = 'type', hue = 'rating', palette = sns.color_palette('icefire'))
g.map(sns.lineplot, 'month', 'count')
# Facet title spelling corrected ("Plaform" -> "Platform").
g.set_titles(col_template = 'Which rating type does Netflix\n put more into their Platform for\n {col_name}s Per Month?')
g.set_axis_labels('Month', 'Frequency')
g.set(xticks = net_type['month'].unique())
g.fig.subplots_adjust(wspace = .15, hspace = .25)
g.add_legend()
plt.show()

# Director Analysis for Movies and TV Shows

In [None]:
# Per-director title counts split by content type. The helper's column headers
# come from the data, so the columns are labelled by position.
director_data = (column_list_tokenizer_count(df = netflix, subset_cols = 'director', secondary_col = 'type')
                .set_axis(['type', 'director', 'count'], axis = 1))
director_data.head()

In [None]:
# Ten directors with the most movies; the 'No Director' entry is left out.
movie_rows = director_data['type'] == 'Movie'
named_director = director_data['director'] != 'No Director'
top_director_movies = director_data[movie_rows & named_director].nlargest(n = 10, columns = 'count')

plot_bar(x_var = 'count',
         y_var = 'director',
         df = top_director_movies,
         num_colors = 10,
         title_name = 'Top 10 Famous Directors in Movies',
         xlabel_name = 'Director Frequency',
         ylabel_name = 'Name of Directors',
         hue_col = None)

In [None]:
# Ten directors with the most TV shows; the 'No Director' entry is left out.
show_rows = director_data['type'] == 'TV Show'
credited_director = director_data['director'] != 'No Director'
top_director_shows = director_data[show_rows & credited_director].nlargest(n = 10, columns = 'count')

plot_bar(x_var = 'count',
         y_var = 'director',
         df = top_director_shows,
         num_colors = 10,
         title_name = 'Top 10 Famous Directors in TV Shows',
         xlabel_name = 'Director Frequency',
         ylabel_name = 'Name of Directors',
         hue_col = None)

# Actor Analysis for Movies and TV Shows

In [None]:
# Per-actor title counts split by content type. The helper's column headers
# come from the data, so the columns are labelled by position.
actor_data = (column_list_tokenizer_count(df = netflix, subset_cols = 'cast', secondary_col = 'type')
             .set_axis(['type', 'cast', 'count'], axis = 1))
actor_data.head()

In [None]:
# Ten actors appearing in the most movies; the 'No Cast' entry is left out.
movie_cast_rows = actor_data['type'] == 'Movie'
named_cast = actor_data['cast'] != 'No Cast'
top_actor_movies = actor_data[movie_cast_rows & named_cast].nlargest(n = 10, columns = 'count')

plot_bar(x_var = 'count',
         y_var = 'cast',
         df = top_actor_movies,
         num_colors = 10,
         title_name = 'Top 10 Famous Actors in Movies',
         xlabel_name = 'Actor Frequency',
         ylabel_name = 'Name of Actors',
         hue_col = None)

In [None]:
# Ten actors appearing in the most TV shows; the 'No Cast' entry is left out.
show_cast_rows = actor_data['type'] == 'TV Show'
credited_cast = actor_data['cast'] != 'No Cast'
top_actor_shows = actor_data[show_cast_rows & credited_cast].nlargest(n = 10, columns = 'count')

plot_bar(x_var = 'count',
         y_var = 'cast',
         df = top_actor_shows,
         num_colors = 10,
         title_name = 'Top 10 Famous Actors in TV Shows',
         xlabel_name = 'Actor Frequency',
         ylabel_name = 'Name of Actors',
         hue_col = None)