# Steps followed in this notebook :

1. Importing required packages

2. Preparing Data for analysis:
     
    a. Creating a new column 'year_added'
    
    b. Keeping only titles added in the last 10 years (2010–2020)
    
    c. Replacing 'nan' values in ['director', 'cast', 'country', 'rating'] col with 'Other'
    
    d. Collecting a list of all directors, actors, genres and countries
    
    e. Converting 'listed_in', 'country', 'director' and 'cast' into lists of genres, countries, directors and actors respectively
    
    f. Converting duration into a bin with different lengths
    
3. Analysing the data:

    a. Bar Plot to see which (Movie / TV Shows) has the most number of contents
    
    b. Which country has the most number of content in the last 10 years, also the top genres made in the last 10 years.
    
    c. A function 'top_actor_or_director' with input parameters dataframe, country (any of the countries) and attribute ('director' or 'cast') which draws a pie chart of the top n (parameter 'top', any int) directors or actors in that specific country
    
    d. Bar plot showing number of content (movie and TV show) added per year
    
    e. Bar plot showing the total number of Movies and TV Shows per ratings respectively
    
    f. A function 'genre_actor_or_director' with input parameters dataframe, name (of an actor / director) and a boolean flag set to True for actor or director respectively, which plots a pie chart of the total number of movies / TV shows done in each genre by that actor / director
    
    g. Bar chart showing when is the most content added during the year
  

## 1. Importing required packages

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from datetime import datetime # convert to datetime
import collections
import itertools
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the Netflix titles CSV from the Kaggle input directory into a DataFrame.
netflix = pd.read_csv('/kaggle/input/netflix-shows/netflix_titles.csv')

In [None]:
# Preview the first rows of the raw data.
netflix.head()

In [None]:
# Summary of the frame's columns, dtypes and non-null counts.
netflix.info()

# 2. Preparing data for analysis

In [None]:
# Remove the 'show_id' column.
# Reassigning the result (instead of inplace=True) together with
# errors='ignore' keeps this cell safe to re-run: a second execution no longer
# raises KeyError because the column is already gone.
netflix = netflix.drop(columns='show_id', errors='ignore')

### percentage of null values in the data

In [None]:

# Share of missing values per column, expressed as a percentage.
netflix.isnull().mean()*100

## a. Creating a new column 'year_added'

1. Extracted from 'date_added'
2. Contains the year in which the show/movie was added
3. Rows where 'date_added' was missing are filled in from the release year
4. Converting 'date_added' to datetime and removing rows which have only year information (keeping rows with month, day and year information)

In [None]:
# Rows without a 'date_added' value fall back to their 'release_year'.
missing_date = netflix["date_added"].isna()
netflix.loc[missing_date, 'date_added'] = netflix.loc[missing_date, 'release_year']

In [None]:
# Date strings end with the four-digit year; the release-year fallback values
# are not strings and are simply stringified.
# Strip before slicing, as the date-parsing cell does, so stray trailing
# whitespace cannot corrupt the extracted year.
netflix['year_added'] = netflix['date_added'].apply(
    lambda value: value.strip()[-4:] if isinstance(value, str) else str(value)
)

In [None]:
def _parse_added_date(value):
    """Parse 'Month day, Year' strings; return any non-string value unchanged."""
    if type(value) is str:
        return datetime.strptime(value.strip(), '%B %d, %Y')
    return value

netflix.date_added = netflix.date_added.apply(_parse_added_date)

# Values that stringify to pure digits are the release-year fallbacks
# (a year only, no month or day); those rows are discarded.
year_only = netflix['date_added'].apply(lambda value: str(value).isdigit())
netflix = netflix[~year_only]

## b. Keeping only titles added in the last 10 years (2010–2020)

In [None]:
# 'year_added' holds year strings, so these comparisons are lexicographic;
# for four-digit years that matches chronological order.
# Keeps titles added from 2010 up to and including 2020.
in_window = (netflix['year_added'] >= '2010') & (netflix['year_added'] < '2021')

# .copy() detaches the result from the unfiltered frame, so the column
# assignments made in later cells do not trigger SettingWithCopyWarning.
netflix = netflix[in_window].copy()

## c. Replacing 'nan' values in ['director', 'cast', 'country', 'rating'] col with 'Other'

In [None]:
# Replace missing values in these columns with the placeholder 'Other'.
# Assigning back through the frame, rather than calling fillna(inplace=True)
# on an attribute-accessed column, guarantees the change lands in `netflix`
# even when pandas hands back a copy of the column (e.g. under Copy-on-Write).
placeholder_cols = ['director', 'cast', 'country', 'rating']
netflix[placeholder_cols] = netflix[placeholder_cols].fillna('Other')

In [None]:
# Rows in which director, cast, country and rating are all the 'Other'
# placeholder are removed.
all_placeholder = (
    netflix[['director', 'cast', 'country', 'rating']].eq('Other').all(axis=1)
)
idx_drop = netflix.index[all_placeholder]
netflix.drop(idx_drop, inplace=True)

## d. Collecting a list of all directors, actors, genres and countries

In [None]:
def _unique_entries(column):
    """Return the sorted, distinct comma-separated entries found in a column of strings."""
    entries = {part.strip() for cell in column for part in cell.split(',')}
    # Stray commas leave empty fragments behind. discard() drops them without
    # raising when none exist (list.remove('') raises ValueError in that case).
    entries.discard('')
    # Sorting gives the same order on every run; plain set order is arbitrary.
    return sorted(entries)

genre = _unique_entries(netflix.listed_in)
countries = _unique_entries(netflix.country)
actors = _unique_entries(netflix.cast)
director = _unique_entries(netflix.director)

## e. Converting 'listed_in', 'country', 'director' and 'cast' into lists of genres, countries, directors and actors respectively

In [None]:
def _split_csv(text):
    """Split a comma-separated string into a list of whitespace-trimmed entries."""
    return [entry.strip() for entry in text.strip().split(',')]

# (source column, new list-valued column) pairs, created in this order.
_LIST_COLUMNS = [
    ('listed_in', 'genre'),
    ('country', 'country_list'),
    ('director', 'director_list'),
    ('cast', 'cast_list'),
]
for source_col, list_col in _LIST_COLUMNS:
    netflix[list_col] = netflix[source_col].apply(_split_csv)

## f. Converting duration into a bin with different lengths

In [None]:
# Durations given in minutes ("<n> min") are replaced by a length category;
# every other duration value is left untouched.
# The last bin is open-ended so runtimes above 240 minutes are labelled
# 'lengthy' instead of silently turning into NaN.
bins = [0, 90, 120, 180, np.inf]
labels = ['short', 'medium', 'average', 'lengthy']

# na=False treats missing durations as "not in minutes" so the mask stays boolean.
is_minutes = netflix['duration'].str.contains(' min', na=False)
minutes = (
    netflix.loc[is_minutes, 'duration']
    .str.replace(' min', '', regex=False)
    .astype(int)
)

# One .loc assignment on the frame itself: chained assignment through
# netflix.duration.loc[...] is not guaranteed to write back to `netflix`.
netflix.loc[is_minutes, 'duration'] = pd.cut(minutes, bins, labels=labels).astype(object)

In [None]:
# Number of titles per distinct 'duration' value after the binning above.
netflix.duration.value_counts()

### Below we can see the data prepared for analysis

In [None]:
# First ten rows of the prepared frame.
netflix.head(10)

In [None]:
# (rows, columns) of the prepared frame.
netflix.shape

In [None]:
# Save the prepared frame as 'netflix_processed.csv' (relative path), without the index column.
netflix.to_csv('netflix_processed.csv', index = False)

########################################################################################################################################

In [None]:
# Quick look at the first two rows.
netflix.head(2)

# 3. Visually Analysing the data

## a. From the below plot we can see that Movies dominate

In [None]:
# Number of titles per content type, drawn as a bar chart.
type_counts = netflix.groupby('type')['type'].count()
type_counts.plot(kind='bar')

## b. We can see that US and India are the countries where most of the content is made. Also further below we can see the top genres made in the last 10 years

In [None]:
def bar_plot(df, col_name, col_list, min_count=100):
    """Draw a bar chart of how often each category occurs in a list-valued column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to analyse.
    col_name : str
        Name of a column whose cells are lists of category labels.
    col_list : list
        Categories to count; labels not in this list are ignored.
    min_count : int, default 100
        Only categories occurring strictly more than this many times are drawn.

    Returns
    -------
    None
        The chart is drawn with the most frequent category first. If no
        category passes ``min_count`` a message is printed and nothing is drawn.
    """
    # explode() yields one row per list element, so value_counts() tallies every
    # occurrence in a single pass instead of one full-column scan per category.
    occurrences = df[col_name].explode().value_counts()
    occurrences = occurrences[occurrences.index.isin(col_list)]
    frequent = occurrences[occurrences > min_count].sort_values(ascending=False)

    # Plotting an empty Series fails, so report and stop before creating a figure.
    if frequent.empty:
        print("No entry of '{}' occurs more than {} times.".format(col_name, min_count))
        return

    plt.figure(figsize=(20,10))
    frequent.plot(kind='bar')

In [None]:
# Frequency of each country across the 'country_list' column.
bar_plot(netflix, 'country_list', countries)

In [None]:
# Frequency of each genre across the 'genre' column.
bar_plot(netflix, 'genre', genre)

## c. A function 'top_actor_or_director' that takes a country (any of the countries) and an attribute ('director' or 'cast') and draws a pie chart of the top n (parameter 'top', any int) directors or actors in that specific country

In [None]:
def top_actor_or_director(df, country, attribute, top = 2, other = False):
    """Draw a pie chart of the most frequent names among one country's titles.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a comma-separated 'country' column and the column named
        by ``attribute``.
    country : str
        Country to select. A title is selected when ``country`` equals one of
        the comma-separated entries of its 'country' value.
    attribute : str
        Column holding comma-separated names, e.g. 'director' or 'cast'.
    top : int, default 2
        Maximum number of names to show.
    other : bool, default False
        Whether the placeholder name 'Other' may appear in the chart.

    Returns
    -------
    None
        The chart is shown with ``plt.show()``. If no name is left to plot a
        message is printed and nothing is drawn.
    """
    def split_entries(text):
        return [part.strip() for part in text.split(',')]

    # Compare against whole entries instead of using str.contains, which treats
    # the query as a regular expression and also matches substrings
    # (e.g. 'Niger' would select titles from 'Nigeria').
    in_country = df['country'].fillna('').apply(
        lambda text: country in split_entries(text)
    )

    tally = collections.Counter(
        name
        for cell in df.loc[in_country, attribute].dropna()
        for name in split_entries(cell)
        if name  # skip empty fragments left by stray commas
    )
    if not other:
        tally.pop('Other', None)

    # most_common orders by count, highest first.
    m = dict(tally.most_common(top))

    if not m:
        print("No '{}' entries to plot for {}.".format(attribute, country))
        return

    total = sum(m.values())

    def absolute_value(pct):
        # autopct passes each wedge's percentage; turn it back into a count
        # and return it as text so the label reads '3' rather than '3.0'.
        return '{:d}'.format(int(round(pct * total / 100)))

    plt.figure(figsize=(15,10))
    # Largest offset for the first (most frequent) wedge, none for the last.
    explode = tuple(np.linspace(0, 0.25, len(m))[::-1])

    # Pass concrete lists: NumPy does not treat dict views as sequences.
    plt.pie(list(m.values()), labels=list(m.keys()), autopct=absolute_value,
            shadow=True, startangle=90, explode=explode)

    plt.title ('Top ' + str(top) + ' ' + attribute + ' of' + ' ' + country, size=15, weight="bold")

    plt.show()

In [None]:
# Top 7 directors for India, leaving out the 'Other' placeholder.
top_actor_or_director(netflix, country='India', attribute='director', top = 7, other = False)

In [None]:
# Top 7 cast members for the United States ('other' keeps its default of False).
top_actor_or_director(netflix, country='United States', attribute='cast', top = 7)

## d. Bar plot showing number of content (movie and TV show) added per year

In [None]:
# Titles added per year, with one bar per content type inside each year group.
added_per_year = netflix.groupby(['year_added', 'type'])['type'].count().unstack(level=1)
added_per_year.plot(kind='bar', subplots=False, figsize=(15, 8))
plt.show()

## e. The bar plot below shows the total number of Movies and TV Shows per ratings respectively

In [None]:
# Number of movies per rating.
movies = netflix[netflix['type'] == 'Movie']
movies.groupby('rating')['rating'].count().plot(kind='bar', figsize=(15, 8))

In [None]:
# Number of TV shows per rating.
tv_shows = netflix[netflix['type'] == 'TV Show']
tv_shows.groupby('rating')['rating'].count().plot(kind='bar', figsize=(15, 8))

## f. A function 'genre_actor_or_director' that takes the dataframe, the name of an actor / director and a boolean flag set to True for actor or director respectively, and plots a pie chart of the total number of movies / TV shows done in each genre by that actor / director

In [None]:
def genre_actor_or_director(df, name, actor = False, director = False):
    """Draw a pie chart of the genres a given actor or director appears in.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the comma-separated columns 'cast', 'director' and
        'listed_in'.
    name : str
        Person to look up. A title is selected when ``name`` equals one of the
        comma-separated entries of the searched column.
    actor : bool, default False
        Search the 'cast' column. Takes precedence when both flags are True.
    director : bool, default False
        Search the 'director' column.

    Returns
    -------
    str or None
        The message 'Please specify if an actor or director' when neither flag
        is set; otherwise None after showing the chart, or after printing a
        message when no title matches.
    """
    if actor:
        people_col = 'cast'
    elif director:
        people_col = 'director'
    else:
        return('Please specify if an actor or director')

    def split_entries(text):
        return [part.strip() for part in text.split(',')]

    # Compare against whole names instead of using str.contains, which treats
    # the query as a regular expression and also matches substrings, mixing in
    # titles of other people whose names merely contain `name`.
    credited = df[people_col].fillna('').apply(
        lambda text: name in split_entries(text)
    )

    tally = collections.Counter(
        entry
        for cell in df.loc[credited, 'listed_in'].dropna()
        for entry in split_entries(cell)
    )
    # most_common() orders by count, highest first.
    m = dict(tally.most_common())

    if not m:
        print("No titles found for '{}' in column '{}'.".format(name, people_col))
        return

    total = sum(m.values())

    def absolute_value(pct):
        # autopct passes each wedge's percentage; turn it back into a count
        # and return it as text so the label reads '3' rather than '3.0'.
        return '{:d}'.format(int(round(pct * total / 100)))

    plt.figure(figsize=(15,10))
    # Largest offset for the first (most frequent) wedge, none for the last.
    explode = tuple(np.linspace(0, 0.25, len(m))[::-1])

    # Pass concrete lists: NumPy does not treat dict views as sequences.
    plt.pie(list(m.values()), labels=list(m.keys()), autopct=absolute_value,
            shadow=True, startangle=90, explode=explode)

    plt.title ('Top genre of ' + name, size=15, weight="bold")

    plt.show()

In [None]:
# Genre breakdown for an actor (searches the 'cast' column).
genre_actor_or_director(netflix, 'Leonardo DiCaprio', actor = True)

In [None]:
# Genre breakdown for a director (searches the 'director' column).
genre_actor_or_director(netflix, 'Christopher Nolan', director = True)

## g. From the bar chart below we can see that most of the content is added towards the last quarter of the year

In [None]:
# Convert 'date_added' to datetimes, then store the month number each title was added.
added_on = pd.to_datetime(netflix['date_added'])
netflix['date_added'] = added_on
netflix['content_added_month'] = added_on.dt.month

In [None]:
# Titles added per calendar month, with one bar per content type inside each month group.
added_per_month = netflix.groupby(['content_added_month', 'type'])['type'].count().unstack(level=1)
added_per_month.plot(kind='bar', subplots=False, figsize=(15, 8))
plt.show()