## Objective

I wanted to analyse the dataset with a focus on a specific set of questions. In my analysis, I walk through each question and answer it with both visualizations and text.


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns


from itertools import chain
from collections import Counter
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Loading the dataset

In [None]:
# Read the Netflix titles CSV into a DataFrame, then report its
# (rows, columns) shape followed by its column labels.
data = pd.read_csv('/kaggle/input/netflix-shows/netflix_titles.csv')

for dataset_summary in (data.shape, data.columns):
    print(dataset_summary)

In [None]:
# Print pandas' built-in summary of the DataFrame (index range, columns,
# non-null counts and dtypes) to spot columns with missing values.
data.info()

# Analysis

Looking to answer the following questions:

1. [Who are the top 10 cast members who appear in the most Netflix Movies and TV Shows? Just Movies? Just TV Shows?](#q1)
2. [What is the distribution of the release year for both Movies and TV Shows? Just movies? Just TV Shows?](#q2)
3. [What is the frequency and distribution of Netflix Movies' duration?](#q3)
4. [What is the relationship between Duration and Ratings for Movies? For TV Shows?](#q4)
5. [Do Movies or TV Shows have more cast members?](#q5)
6. [Which Movies and TV Shows were filmed in the largest number of countries?](#q6)
7. [Which is the most popular location to film for both Movies and TV shows? Just Movies? Just TV Shows?](#q7)
8. [What are the top 10 most popular Movie/TV Show categories? Just Movies? Just TV Shows?](#q8)

### 1. <a id='q1'></a>  Who are the top 10 cast members who appear in the most Netflix Movies and TV Shows? Just Movies? Just TV Shows?


In [None]:
## Preprocessing

def _split_cast(cast_series):
    """Expand a comma-separated cast Series into one cast member per column.

    Each piece is stripped of surrounding whitespace. Splitting on "," alone
    leaves a leading space on every name after the first, which makes the
    same person show up under two different keys ("Name" and " Name") when
    appearances are counted later.
    """
    return cast_series.str.split(",", expand=True).apply(lambda column: column.str.strip())


## Identifying the cast members in both Movie and TV shows - unnest cast
cast_col = data['cast']
cast_members = _split_cast(cast_col)

# Identifying the cast members in either Movie or TV Shows
movie_cast = _split_cast(data.loc[data['type'] == "Movie", 'cast'])
tv_cast = _split_cast(data.loc[data['type'] == "TV Show", 'cast'])

In [None]:
# Identifying the number of cast members per TV Show and Movie on Netflix.
# Each row of `cast_members` holds one cast member per column, so the count is
# the number of non-null cells in that row (rows that are entirely null get 0).
# The count is vectorised instead of looping over rows with `iloc`.
num_cast_members = cast_members.notna().sum(axis=1).tolist()

# adding new column 'num_cast' to dataset
data['num_cast'] = num_cast_members


In [None]:
## Build a list of lists from each expanded cast table, dropping None/NaN values.
## There is one inner list per column of the table (i.e. per cast position).

def _non_null_values_by_column(frame):
    """Return one list per column of ``frame`` containing its non-null values.

    An explicit comprehension is used instead of ``frame.apply(...).tolist()``:
    ``apply`` returns a DataFrame (which has no ``tolist``) whenever every
    column yields a list of the same length, so the old form only worked while
    the columns had differing numbers of non-null values.
    """
    return [column.dropna().tolist() for _, column in frame.items()]


cast_members_list = _non_null_values_by_column(cast_members)
movie_cast_list = _non_null_values_by_column(movie_cast)
tv_cast_list = _non_null_values_by_column(tv_cast)


### --- retrieved functions start --- # 
# Helper that pulls the leading element out of every entry of a list of
# sequences, e.g. the name out of each (name, count) pair.

def Extract(lst):
    """Return a list holding element 0 of each item in ``lst``."""
    leading_elements = []
    for entry in lst:
        leading_elements.append(entry[0])
    return leading_elements

def converttostr(input_seq, seperator):
    """Concatenate the strings in ``input_seq`` with ``seperator`` placed between them."""
    return seperator.join(input_seq)

### ------- retrieved functions end --- #

# Tally how often each cast member appears: overall, in Movies, and in TV Shows.
occurences = Counter(chain.from_iterable(cast_members_list))
movie_occurences = Counter(chain.from_iterable(movie_cast_list))
tv_occurences = Counter(chain.from_iterable(tv_cast_list))

# Report the ten most frequent names for each scope, in the same order as the tallies above.
cast_tallies = (
    ("Movies and TV Shows", occurences),
    ("Movies", movie_occurences),
    ("TV Shows", tv_occurences),
)
for scope, tally in cast_tallies:
    answer = converttostr(Extract(tally.most_common(10)), ",")
    print(f'The top 10 cast members who appear in the most Netflix {scope} are: \n {answer} \n')


In [None]:
## Visualizing the answers

## ---- Data for the stacked bar chart ---- ##

# Complete per-type appearance counts, so that every overall top-10 name can be
# split into its Movie and TV Show contributions.
all_movie_occur_dict = dict(movie_occurences)
all_tv_occur_dict = dict(tv_occurences)
top10 = dict(occurences.most_common(10)).keys()

# Per-type counts for the overall top 10; a name missing from one type contributes 0.
top10_cast_by_movie = [all_movie_occur_dict.get(member, 0) for member in top10]
top10_cast_by_tv = [all_tv_occur_dict.get(member, 0) for member in top10]

# Top-10 tables: overall, Movies only, TV Shows only.
occur_dict, movie_occur_dict, tv_occur_dict = (
    dict(tally.most_common(10))
    for tally in (occurences, movie_occurences, tv_occurences)
)

# Names and counts for the first bar chart (kept in most-common order).
keys = list(occur_dict)
num_occur = list(occur_dict.values())

# --- Overall top 10 ---
plt.figure(figsize = (10,3))
plt.title("Top 10 Most Common Cast Members Who Appear the Most in Netflix Movies and TV Shows",fontsize=16)
plt.xticks(rotation=45 , ha='right')
plt.xlabel('Cast Members')
plt.ylabel('# of Occurences')
plt.bar(keys, num_occur,color= 'black', edgecolor='black')
plt.show()

# --- Overall top 10, stacked by type (TV Show counts sit on top of Movie counts) ---
top10_names = list(top10)
plt.figure(figsize = (10,3))
plt.title("Top 10 Most Common Cast Members Who Appear the Most in Netflix Movies and TV Shows",fontsize=16)
plt.bar(top10_names, top10_cast_by_movie, color='b',edgecolor='black')
plt.bar(top10_names, top10_cast_by_tv, bottom=top10_cast_by_movie, color='r',edgecolor='black')
plt.legend(['Movies', 'TV Shows'],loc="upper right")
plt.xlabel('Cast Members')
plt.ylabel('# of Occurences')
plt.xticks(rotation=45 , ha='right')
plt.show()

# --- Movies only (bars ordered alphabetically by name, not by count) ---
movie_names, movie_counts = zip(*sorted(movie_occur_dict.items()))
plt.figure(figsize = (10,3))
plt.title("Top 10 Most Common Cast Members Who Appear the Most in Netflix Movies",fontsize=16)
plt.xticks(rotation=45 , ha='right')
plt.bar(movie_names, movie_counts, edgecolor='black')
plt.xlabel('Cast Members')
plt.ylabel('# of Occurences')
plt.show()

# --- TV Shows only (bars ordered alphabetically by name, not by count) ---
tv_names, tv_counts = zip(*sorted(tv_occur_dict.items()))
plt.figure(figsize = (10,3))
plt.title("Top 10 Most Common Cast Members Who Appear the Most in Netflix TV Shows",fontsize=16)
plt.xticks(rotation=45 , ha='right')
plt.bar(tv_names, tv_counts, color='r',edgecolor='black')
plt.xlabel('Cast Members')
plt.ylabel('# of Occurences')
plt.show()

### 2. <a id="q2"></a> What is the distribution of the release year for both Movies and TV Shows? Just movies? Just TV Shows?


In [None]:
# One release-year distribution plot per scope: overall, Movies, TV Shows.
# Each entry is (release years to plot, extra distplot keyword arguments, plot title).
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; consider
# migrating to histplot/displot once the target seaborn version is confirmed.
release_year_panels = [
    (data['release_year'],
     {'color': 'black'},
     "Distribution of the Overall Release Year of Netflix' Movies and TV Shows"),
    (data.loc[data['type'] == "Movie", 'release_year'],
     {},
     "Distribution of the Overall Release Year of Netflix' Movies"),
    (data.loc[data['type'] == "TV Show", 'release_year'],
     {'color': 'r'},
     "Distribution of the Overall Release Year of Netflix' TV Shows"),
]

plt.figure()
for release_years, distplot_kwargs, panel_title in release_year_panels:
    sns.distplot(release_years, **distplot_kwargs)
    plt.title(panel_title)
    plt.xlabel('Release Year')
    plt.show()


More of the Movies and TV Shows on Netflix were released in the past few years than in any earlier period.

### 3. <a id="q3"></a> What is the frequency and distribution of Netflix Movies' duration?

In [None]:
## Getting subsets of data for visualization
subset = pd.DataFrame(data, columns = ['show_id','title','type','rating', 'duration'])
tv_subset = subset[subset['type'] == "TV Show"]

# For the movie subset, turn the textual duration (e.g. "90 min") into an
# integer number of minutes in a new 'Duration_mins' column.
#  * The leading digits are extracted with a regex. `str.rstrip('min')` strips
#    any trailing run of the characters m/i/n rather than the suffix "min".
#  * Rows whose duration is missing or has no digits are dropped instead of
#    making `astype(int)` raise.
#  * assign/dropna/astype each return a new frame, so the new column is never
#    written into a filtered view of `subset` (no SettingWithCopyWarning).
movie_subset = (
    subset[subset['type'] == "Movie"]
    .assign(Duration_mins=lambda frame: pd.to_numeric(
        frame['duration'].str.extract(r'(\d+)', expand=False), errors='coerce'))
    .dropna(subset=['Duration_mins'])
    .astype({'Duration_mins': int})
)

# Smoothed distribution of movie durations.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases.
sns.distplot(movie_subset['Duration_mins'])
plt.title("Distribution of Netflix Movie Duration in minutes")
plt.xlabel('Duration')
plt.show()

# Frequency histogram of movie durations.
movie_subset.hist(column='Duration_mins', bins=30, grid=False, alpha=0.5,edgecolor="black")
plt.title("Histogram of Netflix Movie Duration in minutes")
plt.xlabel('Duration')
plt.show()

### 4. <a id = "q4"></a> What is the relationship between Duration and Ratings for Movies? For TV Shows?

As a note, Movie and TV Show durations are not recorded in the same way (movie durations are given in minutes), so the analysis is done on Movies and TV Shows separately.

In [None]:
## Creating bins for Duration_mins (25-minute wide intervals up to 300 minutes)
bins = [0, 25, 50, 75, 100,125, 150, 175, 200, 225, 250, 275, 300]
# assign() returns a new frame, so the binned column is not written into a
# filtered view of another DataFrame (avoids SettingWithCopyWarning).
movie_subset = movie_subset.assign(binned=pd.cut(movie_subset['Duration_mins'], bins))

# only keeping the necessary data for pivot
tv_subset_sub = tv_subset[['show_id','duration','rating']]
movie_subset_sub = movie_subset[['show_id','binned','rating']]

# Pivot tables for the heatmaps: number of titles (count of show_id) for every
# duration x rating combination, with 0 where a combination never occurs.
tv_pivot = pd.pivot_table(tv_subset_sub, index=['duration'], columns= ['rating'], aggfunc='count',fill_value =0)
# observed=False keeps empty duration bins as rows; it is spelled out so the
# result does not depend on the pandas default for categorical grouping.
movie_pivot = pd.pivot_table(movie_subset_sub, index=['binned'], columns= ['rating'], aggfunc='count',fill_value =0, observed=False)

# Identifying all the cols and indexes needed for the new df for heatmap
# (column level 1 holds the rating labels; level 0 is the counted column name)
tv_cols = list(tv_pivot.columns.get_level_values(level=1))
tv_index = list(tv_pivot.index.get_level_values(level=0))

movie_cols = list(movie_pivot.columns.get_level_values(level=1))
movie_index = list(movie_pivot.index.get_level_values(level=0))

# creates the df in preparation for the heatmaps
tv_output = pd.DataFrame(tv_pivot.values, index=tv_index, columns=tv_cols)
movie_output = pd.DataFrame(movie_pivot.values, index=movie_index, columns=movie_cols)

# The pivot orders the TV duration labels as text, which puts e.g. "10 ..."
# before "2 ...". Re-order the rows by the leading number in each label
# (presumably a season count - confirm against the data); labels without a
# number are placed last.
tv_duration_numbers = (
    pd.Series(tv_index, dtype=object).str.extract(r'(\d+)', expand=False).astype(float)
)
tv_output = tv_output.iloc[np.argsort(tv_duration_numbers.to_numpy(), kind='stable')]

## Heatmaps

# --- Movies --
plt.figure(figsize = (16,16))
sns.heatmap(movie_output,cmap='Blues', linewidths=0.5, annot=True, annot_kws={"fontsize":10}, fmt='g',linecolor='gray')
plt.yticks(rotation=0)
plt.ylabel('Duration of Movies, in mins')
plt.xlabel('Rating of Movies')
plt.title('Heatmap for the Count of Netflix Movies by Duration and Ratings',fontsize=16)
plt.show()

# --- TV Shows ---
plt.figure(figsize = (16,16))
sns.heatmap(tv_output,cmap='Reds', linewidths=0.5, annot=True, annot_kws={"fontsize":10}, fmt='g',linecolor='gray')
plt.yticks(rotation=0)
plt.ylabel('Duration of TV Show')
plt.xlabel('Rating of TV Show')
plt.title('Heatmap for the count of Netflix TV Shows by Duration and Ratings',fontsize=16)
plt.show()



### 5. <a id = "q5"></a> Do Movies or TV Shows have more cast members? 

In [None]:
## Finding out if movies or TV shows have more cast members
# (data.hist(column="num_cast", by="type", ...) was tried, but colouring each group differently was not found)

# One histogram of cast size per scope: (rows to plot, bar colour, title).
cast_size_panels = [
    (data, 'black', "Histogram for Netflix Movies and TV Shows by # of Cast Members"),
    (data[data['type'] == "Movie"], 'b', "Histogram for Netflix Movies by # of Cast Members"),
    (data[data['type'] == "TV Show"], 'r', "Histogram for Netflix TV Shows by # of Cast Members"),
]
for titles_frame, bar_color, panel_title in cast_size_panels:
    titles_frame.hist(column="num_cast", figsize=(10,3), color=bar_color, grid=False, edgecolor='black')
    plt.title(panel_title)
    plt.show()

# Smoothed distribution of cast size across all titles.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases.
sns.distplot(data['num_cast'], color='black')
plt.title("Distribution of Netflix Movies and TV Shows by # of Cast Members")
plt.xlabel('# of Cast Members')
plt.show()

# Movies and TV Shows overlaid on the same axes for a direct comparison.
movie_cast_sizes = data.loc[data['type'] == "Movie", 'num_cast']
tv_cast_sizes = data.loc[data['type'] == "TV Show", 'num_cast']
sns.distplot(movie_cast_sizes)
sns.distplot(tv_cast_sizes, color='r')
plt.title("Distribution of Netflix Movies and TV Shows by # of Cast Members")
plt.xlabel('# of Cast Members')
plt.legend(['Movies', 'TV Shows'],loc="upper right")
plt.show()

In general, based on the dataset, it looks like there are more cast members in movies than in TV Shows. 

### 6. <a id = "q6"> </a> Which Movies and TV Shows were filmed in the largest number of countries? 

In [None]:
## Expand the comma-separated 'country' field into one country per column,
## for all titles, for Movies only and for TV Shows only.
is_movie_row = data['type'] == "Movie"
is_tv_row = data['type'] == "TV Show"

countries = data['country'].str.split(",", expand=True)
countries_movies = data.loc[is_movie_row, 'country'].str.split(",", expand=True)
countries_tv = data.loc[is_tv_row, 'country'].str.split(",", expand=True)

# Remove the whitespace that splitting on "," leaves around each country name.
countries = countries.apply(lambda column: column.str.strip())
countries_movies = countries_movies.apply(lambda column: column.str.strip())
countries_tv = countries_tv.apply(lambda column: column.str.strip())

In [None]:
# Identifying the number of countries per TV Show and Movie on Netflix.
# Each row of `countries` holds one country per column, so the count is the
# number of non-null cells in that row (titles without a country get 0).
# This replaces a row-by-row loop whose if/else branches were identical.
num_countries = countries.notna().sum(axis=1).tolist()
data['num_countries'] = num_countries

In [None]:
## Which movies and TV shows were filmed in the most amount of countries?

def _ten_titles_with_most_countries(titles_frame):
    """Return title and num_countries for the 10 rows with the highest num_countries."""
    ranked = titles_frame.sort_values(by='num_countries',ascending=False)
    return ranked[['title', 'num_countries']].head(10)


movie_country = _ten_titles_with_most_countries(data[data['type'] == "Movie"])
tv_country = _ten_titles_with_most_countries(data[data['type'] == "TV Show"])
overall_countries = _ten_titles_with_most_countries(data)

# For every scope: draw the horizontal bar chart, then print the titles as text.
# Each entry is (scope label used in the texts, top-10 frame, seaborn palette).
country_count_panels = [
    ("Movies and TV Shows", overall_countries, "gray"),
    ("Movies", movie_country, "Blues_d"),
    ("TV Shows", tv_country, "Reds_d"),
]
for scope, top_titles, palette_name in country_count_panels:
    plt.figure(figsize = (15,6))
    sns.barplot(x='num_countries', y='title',data=top_titles, palette=palette_name,  linewidth=1, edgecolor='k')
    plt.title(f"Top 10 Netflix {scope} filmed in the Amount of Countries",fontsize=16)
    plt.xlabel("# of Countries")
    plt.show()

    answer = converttostr(top_titles['title'].to_list(), ",")
    print(f'The top 10 Netflix {scope} that appeared in the most amount of countries are: \n {answer} \n')


### 7. <a id = "q7"></a> Which is the most popular location to film for both Movies and TV shows? Just Movies? Just TV Shows? 

In [None]:
## Finding the most popular movies/tv shows location

def _non_null_country_lists(frame):
    """Return one list per column of ``frame`` holding its non-null values."""
    return [column.dropna().tolist() for _, column in frame.items()]


## lists of lists of country names, with None and nan values removed
countries_list = _non_null_country_lists(countries)
countries_movie_list = _non_null_country_lists(countries_movies)
countries_tv_list = _non_null_country_lists(countries_tv)

# count how often each country appears: overall, in Movies, in TV Shows
country_occurences = Counter(chain.from_iterable(countries_list))
country_occurences_movies = Counter(chain.from_iterable(countries_movie_list))
country_occurences_tv = Counter(chain.from_iterable(countries_tv_list))

## finding THE TOP 10 Values
occur_country_dict = dict(country_occurences.most_common(10))
movie_country_occur_dict = dict(country_occurences_movies.most_common(10))
tv_country_occur_dict = dict(country_occurences_tv.most_common(10))

keys = list(occur_country_dict.keys())
num_occur = list(occur_country_dict.values())


plt.figure(figsize = (10,3))
plt.title("Top 10 Most Common Locations Filmed found in Netflix Movies and TV Shows")
plt.xticks(rotation=45 , ha='right')
plt.xlabel('Countries')
plt.ylabel('# of Occurences')
plt.bar(keys, num_occur,color= 'black', edgecolor='black')
plt.show()


## --- NEEDED FOR STACKED CHART ONLY ---

## occurrences of every country per type, used to split each overall top-10
## country into its Movie and TV Show contributions
all_movie_country_occur_dict = dict(country_occurences_movies)
all_tv_country_occur_dict = dict(country_occurences_tv)
top10_countries = dict(country_occurences.most_common(10)).keys()

# Per-type counts for the overall top-10 countries (0 when a country never
# appears for that type). The TV lookup uses the TV *country* counts; checking
# the cast-member dictionary here made every TV Show bar come out as 0.
top10_country_by_movie = [all_movie_country_occur_dict.get(country, 0) for country in top10_countries]
top10_country_by_tv = [all_tv_country_occur_dict.get(country, 0) for country in top10_countries]


## STACKED BAR CHART
plt.figure(figsize = (10,3))
plt.title("Top 10 Most Common Locations Filmed found in Netflix Movies and TV Shows")
plt.bar(list(top10_countries), top10_country_by_movie, color='b',edgecolor='black')
plt.bar(list(top10_countries), top10_country_by_tv, bottom=top10_country_by_movie, color='r',edgecolor='black')
plt.legend(['Movies', 'TV Shows'],loc="upper right")
plt.xlabel('Countries')
plt.ylabel('# of Occurences')
plt.xticks(rotation=45 , ha='right')
plt.show()


# Movies only (bars ordered alphabetically by country, not by count)
plt.figure(figsize = (10,3))
plt.title("Top 10 Most Common Locations Filmed found in Netflix Movies")
plt.xticks(rotation=45 , ha='right')
plt.bar(*zip(*sorted(movie_country_occur_dict.items())),edgecolor='black')
plt.xlabel('Countries')
plt.ylabel('# of Occurences')
plt.show()


# TV Shows only (bars ordered alphabetically by country, not by count)
plt.figure(figsize = (10,3))
plt.title("Top 10 Most Common Locations Filmed found in Netflix TV Shows")
plt.xticks(rotation=45 , ha='right')
plt.bar(*zip(*sorted(tv_country_occur_dict.items())), color='r',edgecolor='black')
plt.xlabel('Countries')
plt.ylabel('# of Occurences')
plt.show()

# Text results. Each line prints its own answer; the Movie and TV Show lines
# used to repeat the overall answer.
print("---- Results: ---- ")
answer = converttostr(keys, ",")
print(f'The Top 10 Most Common Locations Filmed found in Netflix Movies and TV Shows are: \n {answer} \n')
answer_movie = converttostr(Extract(country_occurences_movies.most_common(10)), ",")
print(f'The top 10 Most Common Locations Filmed found in Netflix Movies are: \n {answer_movie} \n')
answer_tv = converttostr(Extract(country_occurences_tv.most_common(10)), ",")
print(f'The top 10 Most Common Locations Filmed found in Netflix TV Shows are: \n {answer_tv} \n')


In [None]:
## Heatmap: how many Movies / TV Shows list each number of countries

# Count titles (show_id) for every type x num_countries combination; 0 where none exist.
country_subset = data[['show_id','type','num_countries']]
country_pivot = pd.pivot_table(
    country_subset,
    index=['type'],
    columns=['num_countries'],
    aggfunc='count',
    fill_value=0,
)

# Rebuild a plain frame: rows are the types, columns the country counts
# (column level 1 of the pivot; level 0 is the counted column name).
country_index = list(country_pivot.index.get_level_values(level=0))
country_cols = list(country_pivot.columns.get_level_values(level=1))
country_output = pd.DataFrame(country_pivot.values, index=country_index, columns=country_cols)

# --- Movies & TV Shows --
plt.figure(figsize=(10,3), dpi=80, facecolor='w', edgecolor='black')
sns.heatmap(country_output,cmap="Greys_r", linewidths=0.5, annot=True, annot_kws={"fontsize":10}, fmt='g',linecolor='gray')
plt.yticks(rotation=0)
plt.ylabel('Type')
plt.xlabel('Number of Countries Filmed')
plt.title('Heatmap for the Count of Netflix Movies and TV Shows by Number of Countries and Type',fontsize=16)
plt.show()

### 8. <a id ="q8"> </a> What are the top 10 most popular Movie/TV Show categories? Just Movies? Just TV Shows?

In [None]:
## most popular Movie/TV Show by Category

def _split_categories(listed_in):
    """Expand a comma-separated ``listed_in`` Series into one stripped category per column."""
    return listed_in.str.split(",", expand=True).apply(lambda column: column.str.strip())


def _present_values_by_row(frame):
    """Return one list per row of ``frame`` containing only its non-missing cells.

    ``pd.notna`` is used rather than ``!= None`` so that cells holding NaN are
    skipped as well as cells holding None.
    """
    return [[cell for cell in row if pd.notna(cell)] for row in frame.values.tolist()]


## Identifying the categories of all titles, of Movies only and of TV Shows only
category = _split_categories(data['listed_in'])
category_movies = _split_categories(data.loc[data['type'] == "Movie", 'listed_in'])
category_tv = _split_categories(data.loc[data['type'] == "TV Show", 'listed_in'])

category_df = pd.DataFrame(category)
category_movies_df = pd.DataFrame(category_movies)
category_tv_df = pd.DataFrame(category_tv)

category_list = _present_values_by_row(category_df)
category_movies_list = _present_values_by_row(category_movies_df)
category_tv_list = _present_values_by_row(category_tv_df)


# count how often each category appears: overall, in Movies, in TV Shows
occurences_category = Counter(chain.from_iterable(category_list))
movie_occurences_category = Counter(chain.from_iterable(category_movies_list))
tv_occurences_category = Counter(chain.from_iterable(category_tv_list))


## finding THE TOP 10 Values
occur_category_dict = dict(occurences_category.most_common(10))
movie_category_occur_dict = dict(movie_occurences_category.most_common(10))
tv_category_occur_dict = dict(tv_occurences_category.most_common(10))

keys = list(occur_category_dict.keys())
num_occur = list(occur_category_dict.values())

plt.figure(figsize = (10,3))
plt.title("Top 10 Most Common Categories Who Appear the Most in Netflix Movies and TV Shows")
plt.xticks(rotation=45 , ha='right')
plt.xlabel('Categories')
plt.ylabel('# of Occurences')
plt.bar(keys, num_occur,color= 'black', edgecolor='black')
plt.show()

## --- NEEDED FOR STACKED CHART ONLY ---

## occurrences of every category per type, used to split each overall top-10
## category into its Movie and TV Show contributions
all_movie_categories_occur_dict = dict(movie_occurences_category)
all_tv_categories_occur_dict = dict(tv_occurences_category)
top10_categories = dict(occurences_category.most_common(10)).keys()

# per-type counts for the overall top-10 categories (0 when absent for that type)
top10_categories_by_movie = [all_movie_categories_occur_dict.get(name, 0) for name in top10_categories]
top10_categories_by_tv = [all_tv_categories_occur_dict.get(name, 0) for name in top10_categories]


## STACKED BAR CHART
plt.figure(figsize = (10,3))
plt.title("Top 10 Most Common Category found in Netflix Movies and TV Shows")
plt.bar(list(top10_categories), top10_categories_by_movie, color='b',edgecolor='black')
plt.bar(list(top10_categories), top10_categories_by_tv, bottom=top10_categories_by_movie, color='r',edgecolor='black')
plt.legend(['Movies', 'TV Shows'],loc="upper right")
plt.xlabel('Categories')
plt.ylabel('# of Occurences')
plt.xticks(rotation=45 , ha='right')
plt.show()


# Movies only (bars ordered alphabetically by category, not by count)
plt.figure(figsize = (10,3))
plt.title("Top 10 Most Common Category found in Netflix Movies")
plt.xticks(rotation=45 , ha='right')
plt.bar(*zip(*sorted(movie_category_occur_dict.items())),edgecolor='black')
plt.xlabel('Categories')
plt.ylabel('# of Occurences')
plt.show()

# TV Shows only (bars ordered alphabetically by category, not by count)
plt.figure(figsize = (10,3))
plt.title("Top 10 Most Common Category found in Netflix TV Shows")
plt.xticks(rotation=45 , ha='right')
plt.bar(*zip(*sorted(tv_category_occur_dict.items())), color='r',edgecolor='black')
plt.xlabel('Categories')
plt.ylabel('# of Occurences')
plt.show()

# Text results. Each line prints its own answer; the Movie and TV Show lines
# used to repeat the overall answer.
print("---- Results: ---- ")
answer = converttostr(keys, ",")
print(f'The Top 10 Most Common Category found in Netflix Movies and TV Shows are: \n {answer} \n')
answer_movie = converttostr(Extract(movie_occurences_category.most_common(10)), ",")
print(f'The Top 10 Most Common Category found in Netflix Movies are: \n {answer_movie} \n')
answer_tv = converttostr(Extract(tv_occurences_category.most_common(10)), ",")
print(f'The Top 10 Most Common Category found in Netflix TV Shows are: \n {answer_tv} \n')


## WORDCLOUDS


In [None]:
import re
from gensim import models, corpora
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords

from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator 



In [None]:
# Columns needed for the description analysis (word clouds and word counts).
# .copy() makes `subset` an independent frame: a later cell adds a column to
# it, which would otherwise be a write into a slice of `data`.
subset = data[['type', 'description']].copy()

# NOTE(review): NUM_TOPICS is not used by any cell visible in this notebook.
NUM_TOPICS = 10
# English stop words from NLTK. The list is fetched through `nltk.corpus`
# rather than the imported name `stopwords`, because a later cell rebinds
# `stopwords` to a plain list and re-running this cell afterwards would fail.
# Note that this assignment replaces the STOPWORDS set imported from wordcloud.
STOPWORDS = nltk.corpus.stopwords.words('english')



In [None]:
## WORDCLOUD BY NETFLIX TYPE

# Concatenate all descriptions of each type into one text; missing
# descriptions are skipped so that str.join never receives a non-string.
movie_text = " ".join(subset.loc[subset['type'] == "Movie", 'description'].dropna())
tv_text = " ".join(subset.loc[subset['type'] == "TV Show", 'description'].dropna())

# creating the wordclouds
wordcloud_movie = WordCloud(max_words=1000, background_color="white").generate(movie_text)
wordcloud_tv = WordCloud(max_words = 1000, background_color="white").generate(tv_text)

# Create both panels in a single subplots() call. Calling plt.subplots() and
# then plt.subplot(1, 2, k) leaves the first, full-size empty Axes in the
# figure on matplotlib versions that no longer remove overlapping Axes.
fig, (ax_movie, ax_tv) = plt.subplots(1, 2, figsize=(20,20))

ax_movie.set_title('Movie Description WordCloud', fontsize = 20)
ax_movie.imshow(wordcloud_movie)
ax_movie.axis("off")

ax_tv.set_title("TV Description WordCloud", fontsize = 20)
ax_tv.imshow(wordcloud_tv)
ax_tv.axis("off")

plt.show()

## WordClouds with the removal of stopwords

In [None]:
# creating a stopword list: NLTK's English stop words plus a bare 's'.
# NOTE: from here on the name `stopwords` is this list, not the nltk.corpus
# object imported earlier; the word-count cell below relies on the list.
stopwords = nltk.corpus.stopwords.words('english') + ['s']

# creating the wordclouds with the stop words excluded
wordcloud_movie = WordCloud(stopwords= stopwords, max_words=1000, background_color="black", normalize_plurals = True).generate(movie_text)
wordcloud_tv = WordCloud(stopwords= stopwords, max_words = 1000, background_color="black", normalize_plurals = True).generate(tv_text)

# Create both panels in a single subplots() call. Calling plt.subplots() and
# then plt.subplot(1, 2, k) leaves the first, full-size empty Axes in the
# figure on matplotlib versions that no longer remove overlapping Axes.
fig, (ax_movie, ax_tv) = plt.subplots(1, 2, figsize=(20,20))

ax_movie.set_title('Movie Description WordCloud \n Some Stopwords Removed', fontsize = 20)
ax_movie.imshow(wordcloud_movie)
ax_movie.axis("off")

ax_tv.set_title("TV Description WordCloud \n Some Stopwords Removed", fontsize = 20)
ax_tv.imshow(wordcloud_tv)
ax_tv.axis("off")

plt.show()

In [None]:
# counting the Most Popular Words in each description
num_top = 10

# `stopwords` is the list built in the stop-word word cloud cell; a set gives
# constant-time membership tests while filtering every word.
stopword_set = set(stopwords)


def _without_stopwords(text):
    """Lower-case every whitespace-separated word of ``text`` and drop the stop words."""
    lowered_words = (word.lower() for word in text.split())
    return ' '.join(word for word in lowered_words if word not in stopword_set)


# Removing punctuation and stopwords.
# regex=True is required: without it recent pandas versions treat the pattern
# as literal text and no punctuation is replaced. The raw string keeps \w and
# \s from being read as (invalid) string escapes.
cleaned_descriptions = (
    subset['description']
    .str.replace(r'[^\w\s]', ' ', regex=True)
    .apply(_without_stopwords)
)
# assign() returns a new frame, so nothing is written into a slice of `data`
# and no warning needs to be silenced.
subset = subset.assign(description_without_stopwords=cleaned_descriptions)


# finding all the words in the descriptions
words = subset['description_without_stopwords'].str.cat(sep=" ").split()

# finding all the words in the movie descriptions
movie_words = (subset.loc[subset['type'] == "Movie", 'description_without_stopwords']
        .str.cat(sep=" ")
        .split()
        )

# finding all the words in the tv show descriptions
tv_words = (subset.loc[subset['type'] == "TV Show", 'description_without_stopwords']
        .str.cat(sep=" ")
        .split()
        )


# output - the `num_top` most frequent words per scope, as Word/Frequency tables
most_common_words = pd.DataFrame(Counter(words).most_common(num_top), columns=['Word', 'Frequency'])
most_common_movie_words = pd.DataFrame(Counter(movie_words).most_common(num_top), columns=['Word', 'Frequency'])
most_common_tv_words = pd.DataFrame(Counter(tv_words).most_common(num_top), columns=['Word', 'Frequency'])


# One bar chart per scope, side by side. The three Axes come from a single
# subplots() call so that no empty full-size Axes is left behind the panels.
fig, word_axes = plt.subplots(1, 3, figsize=(20,3))
word_panels = [
    (most_common_words, "purple", "Most Common Words Found in Netflix Descriptions"),
    (most_common_movie_words, "blue", "Most Common Words Found in Netflix Movie Descriptions"),
    (most_common_tv_words, "red", "Most Common Words Found in Netflix TV Show Descriptions"),
]
for ax, (word_table, bar_color, panel_title) in zip(word_axes, word_panels):
    ax.bar(word_table['Word'].tolist(), word_table['Frequency'].tolist(), color=bar_color, edgecolor="white")
    ax.set_title(panel_title)
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
plt.show()



In [None]:
#Another Idea for some Plots

# fig=plt.figure(figsize=(20,3))
# # plt.subplots(figsize=(20,3))
# ax1 = plt.subplot(131)
# ax2 = plt.subplot(132, sharey=ax1)
# ax3 = plt.subplot(133)


Thank you for reading! Feedback is welcome :) 


