In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import math
import nltk
import pandas as pd
from nltk.corpus import stopwords
from  wordcloud import WordCloud
import matplotlib as plt
import random
from collections import Counter

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **Get a look at the Data**

In [None]:
# Load the Netflix titles CSV into `df`; the helper functions defined in later
# cells read this notebook-wide frame directly.
df = pd.read_csv('/kaggle/input/netflix-shows/netflix_titles.csv')
# Preview the first rows.
df.head()

# Basic and Quick Analysis


In [None]:
# Print pandas' summary of the frame.
df.info()

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Number of distinct values in each column.
df.nunique()

**Helper Functions**

In [None]:
def generic_filter(colName, value):
    """Return the rows of the notebook-wide ``df`` where ``colName`` equals ``value``.

    The comparison is an exact equality test on the column; the frame is the
    global ``df``, not an argument.
    """
    row_matches = df[colName] == value
    return df[row_matches]

def get_uniques(df, colName):
    """Return the distinct values of ``df[colName]`` as a list, without NaN.

    NaN is not equal to itself, so the ``value == value`` test filters it out
    while keeping every other distinct value.
    """
    distinct_values = df[colName].unique().tolist()
    return [value for value in distinct_values if value == value]

def generic_get(df, colname):
    """Fetch the values of ``colname`` from ``df``.

    ``'listed_in'`` is special-cased: instead of the raw column, the result of
    ``get_genres(df)`` is returned. Every other column is returned as-is.
    """
    if colname != 'listed_in':
        return df[colname]
    return get_genres(df)

def get_length(colName):
    """Return the number of entries in column ``colName`` of the notebook-wide ``df``."""
    column = df[colName]
    return len(column)

 

**Data Specific Functions**

In [None]:

def get_num_titles_together(actor, director):
    """Count the titles in the notebook-wide ``df`` that involve both people.

    Args:
        actor: Name to look for inside the ``cast`` column.
        director: Name to look for inside the ``director`` column.

    Returns:
        The number of rows whose ``cast`` contains ``actor`` and whose
        ``director`` contains ``director``.
    """
    # Names are data, not patterns: match them literally so characters such as
    # "." or "(" are not interpreted as regular-expression syntax.
    # na=False makes rows with a missing cast/director count as "no match".
    has_actor = df['cast'].str.contains(actor, regex=False, na=False)
    has_director = df['director'].str.contains(director, regex=False, na=False)
    return int((has_actor & has_director).sum())

def get_genres(df):
    """Return every genre label found in the ``listed_in`` column of ``df``.

    Each distinct ``listed_in`` value is split on commas and every piece is
    trimmed of surrounding whitespace. The result is not de-duplicated: a
    genre appears once for each distinct ``listed_in`` value that mentions it.

    Args:
        df: Frame with a ``listed_in`` column of comma-separated genre labels.

    Returns:
        A list of genre label strings.
    """
    genres = []
    for grouped_genres in get_uniques(df, 'listed_in'):
        for genre in grouped_genres.split(','):
            genre = genre.strip()
            # Skip empty fragments (consecutive or trailing commas); indexing
            # genre[0] on an empty string would raise IndexError.
            if genre:
                genres.append(genre)
    return genres

def get_actors(df):
    """Return every actor name found in the ``cast`` column of ``df``.

    Each distinct ``cast`` value is split on commas and every piece is trimmed
    of surrounding whitespace. The result is not de-duplicated: a name appears
    once for each distinct ``cast`` value that mentions it.

    Args:
        df: Frame with a ``cast`` column of comma-separated names.

    Returns:
        A list of actor name strings.
    """
    actors = []
    for grouped_actors in get_uniques(df, 'cast'):
        for actor in grouped_actors.split(','):
            actor = actor.strip()
            # Skip empty fragments (consecutive or trailing commas); indexing
            # actor[0] on an empty string would raise IndexError.
            if actor:
                actors.append(actor)
    return actors


def get_directors(df):
    """Return every director name found in the ``director`` column of ``df``.

    Each distinct ``director`` value is split on commas and every piece is
    trimmed of surrounding whitespace. The result is not de-duplicated: a name
    appears once for each distinct ``director`` value that mentions it.

    Args:
        df: Frame with a ``director`` column of comma-separated names.

    Returns:
        A list of director name strings.
    """
    directors = []
    for grouped_director in get_uniques(df, 'director'):
        for director in grouped_director.split(','):
            director = director.strip()
            # Skip empty fragments (consecutive or trailing commas); indexing
            # director[0] on an empty string would raise IndexError.
            if director:
                directors.append(director)
    return directors


def get_unique_genres(df):
    """Return the genre labels from ``get_genres(df)`` with duplicates removed.

    The labels pass through a ``set``, so the order of the returned list is
    not defined.
    """
    distinct_genres = set(get_genres(df))
    return list(distinct_genres)

def get_unique_countries(df):
    """Return the distinct country names found in the ``country`` column.

    Each distinct ``country`` value is split on commas, every piece is trimmed
    of surrounding whitespace, and empty pieces are discarded.

    Args:
        df: Frame with a ``country`` column of comma-separated country names.

    Returns:
        A sorted list of unique, non-empty country names. Sorting gives a
        stable order, unlike iterating over a set.
    """
    countries = set()
    for grouped_countries in get_uniques(df, 'country'):
        # Only text values can be split; anything else (e.g. a float) is ignored.
        if not isinstance(grouped_countries, str):
            continue
        for country in grouped_countries.split(','):
            country = country.strip()
            # Drop empty pieces explicitly rather than slicing off "the first"
            # element of a set, whose order is arbitrary.
            if country:
                countries.add(country)
    return sorted(countries)

def get_content_by_country(country):
    """Return the rows of the notebook-wide ``df`` whose ``country`` mentions ``country``.

    Args:
        country: Country name to look for.

    Returns:
        The subset of ``df`` whose ``country`` value contains ``country`` as a
        literal substring. Rows with a missing country never match.
    """
    # regex=False: the name is matched literally, so punctuation in a country
    # name is not interpreted as regular-expression syntax.
    # NOTE(review): this is still substring matching, so a name that is
    # contained in a longer country name also selects that longer country's rows.
    mentions_country = df['country'].str.contains(country, regex=False, na=False)
    return df[mentions_country]


def get_words_from_genre(genre):
    """Build one space-separated string of description words for ``genre``.

    Rows are selected with ``generic_filter('listed_in', genre)``, i.e. titles
    whose ``listed_in`` value is exactly ``genre``. Each description is split
    on single spaces, double quotes are stripped from every word, and English
    stop words (plus a lone "-") are dropped.

    Args:
        genre: Genre label to collect description words for.

    Returns:
        The remaining words joined by single spaces; an empty string when no
        row matches or nothing survives the filtering.
    """
    sub = generic_filter('listed_in', genre)
    descriptions = generic_get(sub, 'description')

    # Load the stop-word list once and keep it in a set for fast membership
    # tests, instead of re-reading it for every description.
    stop_words = set(stopwords.words('english'))
    stop_words.add("-")

    text = []
    for description in descriptions:
        for word in description.split(' '):
            # str.replace returns a new string, so the result must be
            # assigned back for the quotes to actually be removed.
            word = word.replace('"', '')
            if word and word not in stop_words:
                text.append(word)
    return " ".join(text)

**Functions that Plot**

In [None]:


def plot_most_popular_genres(country, n):
    """Draw a bar chart of the ``n`` most common genre labels for ``country``.

    The labels are whatever ``generic_get(..., 'listed_in')`` returns for the
    rows selected by ``get_content_by_country(country)``; they are tallied
    with ``Counter`` and the top ``n`` are plotted.
    """
    genre_counts = Counter(generic_get(get_content_by_country(country), 'listed_in'))
    top_genres = pd.DataFrame(genre_counts.most_common(n), columns=['Genre', 'Count'])
    chart_title = 'Top ' + str(n) + ' Popular Genres in ' + country
    top_genres.plot.bar(x='Genre', y='Count', title=chart_title)

def plot_most_popular_ratings(country, n):
    """Draw a bar chart of the ``n`` most common ratings for ``country``.

    The ``rating`` values of the rows selected by
    ``get_content_by_country(country)`` are tallied with ``Counter`` and the
    top ``n`` are plotted.
    """
    rating_counts = Counter(generic_get(get_content_by_country(country), 'rating'))
    top_ratings = pd.DataFrame(rating_counts.most_common(n), columns=['Rating', 'Count'])
    chart_title = 'Top ' + str(n) + ' Popular Ratings in ' + country
    top_ratings.plot.bar(x='Rating', y='Count', title=chart_title)

def plot_show_type_over_time(df, years):
    """Line-plot how many movies and TV shows were released in each year.

    NOTE: a later cell of this notebook defines another function with the
    same name (taking ``x, years`` and filtering on ``year_added``); once that
    cell has run, it replaces this definition.

    Args:
        df: Frame with ``release_year`` and ``type`` columns.
        years: Iterable of release years to include, in plotting order.

    Returns:
        None. A line chart with one line per title type is drawn.
    """
    years = list(years)
    data = {'Movies': [], 'TV Shows': []}
    for year in years:
        released = df[df['release_year'] == year]
        # Count rows per type. The type label is "TV Show" (singular), the
        # same value the later definition of this function filters on.
        data['Movies'].append(int((released['type'] == "Movie").sum()))
        data['TV Shows'].append(int((released['type'] == "TV Show").sum()))

    counts_by_year = pd.DataFrame(data, index=years)
    counts_by_year.plot.line()

def genre_wordcloud(genre):
    """Render a word cloud of the description words for ``genre``.

    This is best-effort: a genre with no description text is skipped, and any
    failure while building or drawing the cloud is reported and skipped so
    that a loop over many genres keeps going.

    Args:
        genre: Genre label, also used as the figure title.

    Returns:
        None.
    """
    # Imported locally so the function does not rely on other code having
    # already loaded the pyplot submodule (``import matplotlib`` alone does
    # not make ``matplotlib.pyplot`` available).
    import matplotlib.pyplot as pyplot

    try:
        text = get_words_from_genre(genre)
        if not text.strip():
            # Nothing to draw for this genre.
            return
        wordcloud = WordCloud(max_font_size=40).generate(text)
        pyplot.figure()
        pyplot.imshow(wordcloud, interpolation="bilinear")
        pyplot.axis("off")
        pyplot.title(genre)
        # show() takes no image argument; the figure built above is displayed.
        pyplot.show()
    except Exception as err:
        # Report the failure instead of hiding it, but do not abort the caller.
        print("Could not draw word cloud for " + repr(genre) + ": " + repr(err))
        return

# Understanding what content is available in different countries



We can look at the amount of content by genre available in each country. I collect a random sample from the list of unique countries and use a helper function to plot each sampled country's top 'n' genres. Some countries have fewer than 'n' genres, so their charts show fewer bars.

In [None]:
# Number of countries to draw at random.
sample_size=5

unique_countries =get_unique_countries(df)

# No random seed is set in the visible cells, so the sampled countries
# presumably differ from run to run.
sample= random.sample(unique_countries, sample_size)

# Upper bound on the number of genres shown per country.
number_of_genres_to_plot=5

# One bar chart per sampled country.
for country in sample:
    plot_most_popular_genres(country, number_of_genres_to_plot)

Similar to the previous example, we can look at the count of the top n ratings for a sample of countries. Some countries have fewer than 'n' ratings.

In [None]:
# Number of countries to draw at random.
sample_size=5

unique_countries =get_unique_countries(df)

# A fresh random draw: these are not necessarily the countries sampled in the
# genre cell.
sample= random.sample(unique_countries, sample_size)

# Upper bound on the number of ratings shown per country.
top_n_ratings=5

# One bar chart per sampled country.
for country in sample:
    plot_most_popular_ratings(country, top_n_ratings)


# Identifying similar content by matching text-based features


Let's look at the most popular words used in the descriptions of titles and generate a word cloud for each genre. I use a helper function to collect the words of each show's description, according to genre. I then remove stop words based on the English NLTK stop-word list.

In [None]:

# `genres` is reused by the heat-map cell below, which indexes into it.
genres = get_unique_genres(df)
# One word cloud per genre.
for genre in genres:
    genre_wordcloud(genre)

We can analyze each genre to generate a heat map based on the intersection of the words in their descriptions. 

In [None]:

# Build the set of distinct description words for each genre once, rather than
# re-reading the descriptions for every (i, j) pair.
# The text has to be split into words first: calling set() on the raw string
# would produce a set of single characters, not of words.
genre_word_sets = [set(get_words_from_genre(genre_name).split()) for genre_name in genres]

# intersection_matrix[i, j] = number of words shared by genres[i] and genres[j].
intersection_matrix = np.zeros(shape=(len(genres), len(genres)), dtype=int)

for i, words_i in enumerate(genre_word_sets):
    for j, words_j in enumerate(genre_word_sets):
        intersection_matrix[i, j] = len(words_i & words_j)

heatmap = pd.DataFrame(intersection_matrix, index=genres, columns=genres)

# Some genres produce no description text at all (for example when no title is
# listed under exactly that genre), so their row and column contain only zeros.
# Drop every all-zero row, then every all-zero column.
heatmap = heatmap[(heatmap.T != 0).any()]
heatmap = heatmap.T[(heatmap != 0).any()].T

Setting the style for the dataframe that will be used as a heatmap. 

In [None]:

# Shade the cells with the 'Greens' colormap and add a caption; as the last
# expression of the cell, the styled table is what gets displayed.
heatmap.style.background_gradient(cmap='Greens')\
    .set_caption('Lengths of Set Intersections of Words in Genre Descriptions')
        


# Has Netflix been increasingly focusing on TV rather than movies in recent years?

Collect a list of `years` from the `date_added` column

In [None]:
# Extract the year each title was added from the `date_added` text.
#   years                   -> int years, only for rows that have a date
#   year_data['year_added'] -> one string per row; '0' marks a missing date
dates = list(df['date_added'])
year_data = {'year_added': []}
years = []
for date in dates:
    if date == date:  # NaN is not equal to itself, so this is False for missing dates
        # The year is the last four characters; strip first so trailing
        # whitespace cannot shift the slice.
        year = date.strip()[-4:]
        years.append(int(year))
        year_data['year_added'].append(year)
    else:
        year_data['year_added'].append('0')


Using a boxplot to show the distribution of years. We can see that:
* **median** is 2018
* **Q1** 2014-2017
* **Q2** 2017-2018
* **Q3** 2018-2019
* **Q4** 2019-2020



In [None]:
# `plt` is the top-level `matplotlib` package in this notebook, hence `plt.pyplot`.
fig1, ax1 = plt.pyplot.subplots()
ax1.set_title('Distribution of Years that Titles are Added')
# Horizontal box plot of the added-years. The positional `0` and `''` are
# presumably the `notch` and `sym` arguments (no notch, no outlier markers) —
# TODO confirm against the installed matplotlib version.
ax1.boxplot(years,0, '',vert=False)



To analyze the data, I am going to use the median as the minimum year and filter out the years that don't meet this criterion.

In [None]:
# Hard-coded median, read off the box plot rather than computed from `years`.
median = 2018
min_year = median
# Keep one entry per title added in or after the median year (duplicates are kept).
recent_years = [year for year in years if year >= median ]


In [None]:
# NOTE(review): `yr` is not referenced by any other visible cell.
yr ={"year_added":years}
# Adds a `year_added` column to `df` in place: the year as a string, or '0'
# where `date_added` was missing.
df['year_added']=year_data['year_added']

In [None]:
# Preview the frame again, now including the `year_added` column.
df.head()

I use a sub-dataframe to keep only the columns we need, and then use a lambda expression to select the rows whose `year_added` contains any of the years in `recent_years`.


In [None]:
# `recent_years` holds one entry per title, so de-duplicate before using it as
# a lookup list; otherwise every row is compared against thousands of
# repeated year strings.
recent_years_str = sorted(set(map(str, recent_years)))
sub = df[['type','year_added']]
# True for rows whose `year_added` string contains one of the recent years.
mask = sub.year_added.apply(lambda x: any(item in x for item in recent_years_str))
# Displayed only: the filtered frame is not stored, so `sub` keeps every row.
sub[mask]


Let's take a look at the growth of TV titles.

In [None]:
def plot_show_type_over_time(x, years):
    """Line-plot the ratio of TV shows to movies added in each year.

    Args:
        x: Frame with a ``type`` column and a ``year_added`` column holding
            the year as a string.
        years: Years to include, in plotting order; each is converted with
            ``str`` before being compared with ``year_added``.

    Returns:
        None. A line chart of ``TV shows / movies`` per year is drawn.
    """
    ratios = []
    for year in years:
        added = x[x['year_added'] == str(year)]
        tv_count = int((added['type'] == "TV Show").sum())
        movie_count = int((added['type'] == "Movie").sum())
        # A year without any movie has no defined ratio; NaN leaves a gap in
        # the line instead of raising ZeroDivisionError.
        ratios.append(tv_count / movie_count if movie_count else float('nan'))

    # Name the series so the legend entry is not blank.
    ratio_by_year = pd.DataFrame({'TV Show / Movie ratio': ratios}, index=years)
    ratio_by_year.plot.line()

# Distinct year strings in ascending order. "0" is the placeholder written for
# rows with a missing date; filtering it out (instead of list.remove) also
# works when no date is missing.
v = sorted(year_str for year_str in sub['year_added'].unique() if year_str != "0")
plot_show_type_over_time(sub, v)