First, we import the libraries we need.

In [None]:
import numpy as np 
import pandas as pd 

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

#for data visualization        
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

After importing the libraries, we read the dataset. To get a good understanding of the dataset, we can use tools such as .info(), .count(), and so on.

In [None]:
# Load the Netflix titles CSV into a DataFrame.
netflix = pd.read_csv('../input/netflix-shows/netflix_titles.csv')
# Preview the first five rows.
netflix.head(5)

In [None]:
# Print the column names, non-null counts and dtypes of the dataset.
netflix.info()
# Author's observation from the output: show_id and release_year are integers
# and the other columns are objects (re-check against your copy of the CSV).
# The columns director, cast, country, date_added and rating have different
# non-null counts, so we check next whether they contain NaN values.

In [None]:
# Count the NaN values in each column.
netflix.isna().sum()
# There are NaN values; what we do next depends on what we need.
# We can drop those rows, fill them with another value, or leave them as they
# are if we are not going to use those columns.

The first column we are going to visualize is release_year.
We can build a summary table showing how many Netflix titles were released in each year. What we are going to do is: (1) group the dataset by release_year and count the titles in each group (stored in a column named show_id), (2) change the release_year type to str, (3) filter release_year, keeping only the years that have at least 30 Netflix titles, and (4) visualize the result as a bar chart using matplotlib and seaborn.

In [None]:
#1. Group by release year and count the titles in each group.
#   The count column is named 'show_id'.
summary_release = (
    netflix
    .groupby('release_year')
    .size()
    .reset_index(name='show_id')
)

In [None]:
#2. Convert release_year to str so the years are treated as category labels
summary_release = summary_release.astype({'release_year': str})

In [None]:
# Check the column dtypes after the conversion.
summary_release.dtypes

In [None]:
#3. Keep only the years with at least 30 titles
has_enough_titles = summary_release['show_id'] >= 30
summary_release = summary_release[has_enough_titles]

In [None]:
#4. Visualize it as a bar chart
plt.figure(figsize=(20,7))
graph = sns.barplot(data=summary_release, x='release_year', y='show_id')
plt.ylabel('Counts')
plt.xlabel('Release Year')
plt.title('Release Counts per Year')

# Write each bar's count just above it. The label is centred using the bar's
# own width rather than a hard-coded offset, so it stays centred if the bar
# width is ever changed.
for p in graph.patches:
    graph.annotate(f'{p.get_height():.0f}',
                   (p.get_x() + p.get_width() / 2, p.get_height()),
                   ha='center', va='bottom',
                   color='black')

We can use the same steps to visualize the rating column.

In [None]:
#1. Group by rating and count the titles in each group.
#   The count column is named 'show_id'.
summary_rating = (
    netflix
    .groupby('rating')
    .size()
    .reset_index(name='show_id')
)

In [None]:
#2. Display the table first; an additional step is needed before plotting
#   to make the ratings easier to read.
summary_rating

In [None]:
# Sort the ratings from most to least frequent; a sorted chart is easier
# for the viewer to read.
summary_rating = summary_rating.sort_values(
    by=['show_id'], ascending=False, na_position='first')
summary_rating

In [None]:
#3. Visualize it as a bar chart
# Set the font size before creating the figure: rcParams only apply to text
# created afterwards, so setting it later would leave the title and axis
# labels at whatever size earlier cells happened to configure.
plt.rcParams['font.size'] = 12
plt.figure(figsize=(15,7))
graph = sns.barplot(data=summary_rating, x='rating', y='show_id')
plt.ylabel('Counts')
plt.xlabel('Rating')
plt.title('Counts per Rating')

# Write each bar's count just above it, centred on the bar's own width
# rather than a hard-coded offset.
for p in graph.patches:
    graph.annotate(f'{p.get_height():.0f}',
                   (p.get_x() + p.get_width() / 2, p.get_height()),
                   ha='center', va='bottom',
                   color='black')

Another option for interpreting the data is a pie chart. We can use this method to visualize the share of Movies and TV Shows in the type column.

In [None]:
#1. Group by content type and count the titles in each group.
#   The count column is named 'show_id'.
summary_types = (
    netflix
    .groupby('type')
    .size()
    .reset_index(name='show_id')
)
summary_types

We can visualize it using the raw counts, or make it easier to understand by converting the counts to percentages.

In [None]:
#2. Percentage share of the first two rows of summary_types, rounded to
#   two decimals (the rows are in the order shown by summary_types)
type_counts = summary_types['show_id']
total_titles = type_counts[0] + type_counts[1]
percent0 = round((type_counts[0] / total_titles) * 100, 2)
percent1 = round((type_counts[1] / total_titles) * 100, 2)
percent = [percent0, percent1]

In [None]:
#3. Visualize it as a pie chart
# Configure fonts and colours before creating the figure: rcParams only apply
# to artists created afterwards, so setting them after ax.pie() would style
# the legend but not the percentage labels on the wedges.
plt.rcParams['font.sans-serif'] = 'Arial'
plt.rcParams['font.family'] = 'sans-serif'
plt.rcParams['xtick.color'] = '#000000'
plt.rcParams['font.size']=20

labels = ['Movie', 'TV Show']
sizes = [percent0, percent1]

fig1, ax = plt.subplots()
# autopct prints each wedge's share with one decimal place.
ax.pie(sizes, autopct='%1.1f%%', startangle=90)

# Equal aspect ratio keeps the pie circular.
ax.axis('equal')
plt.tight_layout()

ax.legend(labels=labels, frameon=False, bbox_to_anchor=(0.8,0.5))

A more challenging idea for interpreting the dataset is to count how often an actor has appeared in Netflix content. What we are going to do is: (1) make a list of actors, (2) drop the NaN values, (3) change the column types, (4) filter the actors, showing only those who have appeared in at least 20 Netflix titles, (5) sort the result, and (6) visualize it as a bar chart using matplotlib and seaborn.

In [None]:
#1. Making a list of actors
#2. Drop the NaN value
# Each non-null `cast` cell is split on commas into individual names.
casts = []
for cast in netflix.cast.dropna():
    casts.extend(str(cast).split(","))

# Trim surrounding whitespace and skip empty tokens left by stray leading or
# trailing commas, so they are not counted as a name.
casts2 = [name.strip() for name in casts if name.strip()]

# Unique names (in sorted order) and the number of times each one occurs.
unique, counts = np.unique(casts2, return_counts=True)

# Stacking names and counts into a single array coerces the counts to
# strings; they are converted back to integers in a later step.
cast_count = np.asarray((unique, counts)).T

# Two columns: 0 holds the name, 1 holds the count.
summary_cast = pd.DataFrame(cast_count)

In [None]:
# Number of non-null entries in each column.
summary_cast.count()

In [None]:
# Inspect the column dtypes before converting them.
summary_cast.dtypes

In [None]:
#3. Convert column 0 (names) to str and column 1 (counts) to int
summary_cast = summary_cast.astype({0: str, 1: int})

In [None]:
# Check the column dtypes after the conversion.
summary_cast.dtypes

In [None]:
#4. Keep only the actors with at least 20 appearances
#5. Sort summary_cast from most to fewest appearances
frequent_actors = summary_cast[1] >= 20
summary_cast = summary_cast[frequent_actors].sort_values(
    by=1, ascending=False, na_position='first')
summary_cast

In [None]:
#6. Visualize it as a bar chart
# Set the font size before creating the figure: rcParams only apply to text
# created afterwards, so setting it later would leave the title and axis
# labels at whatever size earlier cells happened to configure.
plt.rcParams['font.size'] = 10
plt.figure(figsize=(24,8))
graph = sns.barplot(data=summary_cast, x=0, y=1)
plt.ylabel('Counts')
plt.xlabel('Actors')
plt.title('Actors Played in Movies or TV Show')

# Write each bar's count just above it, centred on the bar's own width
# rather than a hard-coded offset.
for p in graph.patches:
    graph.annotate(f'{p.get_height():.0f}',
                   (p.get_x() + p.get_width() / 2, p.get_height()),
                   ha='center', va='bottom',
                   color='black')

We can use the same steps to visualize the countries where the content was produced.

In [None]:
#1. Making a list of countries
#2. Drop the NaN value
# Each non-null `country` cell is split on commas into individual countries.
countries = []
for country in netflix.country.dropna():
    countries.extend(str(country).split(","))

# Trim surrounding whitespace and skip empty tokens left by stray leading or
# trailing commas, so they are not counted as a country.
countries2 = [name.strip() for name in countries if name.strip()]

# Unique countries (in sorted order) and the number of times each one occurs.
unique, counts = np.unique(countries2, return_counts=True)

# Stacking names and counts into a single array coerces the counts to
# strings; they are converted back to integers in a later step.
country_count = np.asarray((unique, counts)).T

# Two columns: 0 holds the country, 1 holds the count.
summary_country = pd.DataFrame(country_count)

In [None]:
# Number of non-null entries in each column.
summary_country.count()

In [None]:
# Inspect the column dtypes before converting them.
summary_country.dtypes

In [None]:
#3. Convert column 0 (countries) to str and column 1 (counts) to int
summary_country = summary_country.astype({0: str, 1: int})

In [None]:
#4. Keep only the countries with at least 100 titles
#5. Sort summary_country from most to fewest titles
frequent_countries = summary_country[1] >= 100
summary_country = summary_country[frequent_countries].sort_values(
    by=1, ascending=False, na_position='first')

In [None]:
#6. Visualize it as a bar chart
# Set the font size before creating the figure: rcParams only apply to text
# created afterwards, so setting it later would leave the title and axis
# labels at whatever size earlier cells happened to configure.
plt.rcParams['font.size'] = 12
plt.figure(figsize=(22.5,8))
graph = sns.barplot(data=summary_country, x=0, y=1)
plt.ylabel('Counts')
plt.xlabel('Countries')
plt.title('The Origin of Netflix Content')

# Write each bar's count just above it, centred on the bar's own width
# rather than a hard-coded offset.
for p in graph.patches:
    graph.annotate(f'{p.get_height():.0f}',
                   (p.get_x() + p.get_width() / 2, p.get_height()),
                   ha='center', va='bottom',
                   color='black')