In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re
import plotly.express as px
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go 

import os
# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))



In [None]:
# Load the Netflix titles CSV into the DataFrame used by every cell below.
netflix = pd.read_csv('../input/netflix-shows/netflix_titles.csv')

In [None]:
# Preview the first rows of the raw dataset.
netflix.head()

## Okay, the first things we need to deal with are `date_added` and `duration`!

In [None]:
# First look at date_added: how often each raw value occurs.
netflix['date_added'].value_counts()

In [None]:
# First look at duration: how often each raw value occurs.
netflix['duration'].value_counts()

# Hmm... okay, we should create new columns to hold the date parts (month, day, year) and the duration extracted from these fields

In [None]:
def find_month(entry):
    '''Return the alphabetic words of ``entry`` joined by single spaces.

    ``entry`` is converted with ``str()`` first, so a missing value (NaN)
    comes back as the literal string ``'nan'``; an input without any
    letters yields an empty string.
    '''
    text = str(entry)
    words = re.compile("[a-zA-Z]+").findall(text)
    return " ".join(words)

In [None]:
def find_day(entry):
    '''Extract the day of the month from a date string such as "January 1, 2020".

    The day is the one- or two-digit number immediately followed by a comma.
    ``entry`` is converted with ``str()`` first, so missing values (NaN) are
    accepted.

    Returns the day as a string of digits (e.g. ``'1'`` or ``'14'``), or
    ``None`` when no such number is present.
    '''
    # re.search with a bounded quantifier is used instead of a greedy ``.*``
    # prefix: a greedy prefix would swallow the first digit of two-digit days,
    # and requiring exactly two digits would drop days 1-9.
    match = re.search(r'(\d{1,2}),', str(entry))
    if match is None:
        return None
    return match.group(1)

In [None]:
def find_year(entry):
    '''Extract a four-digit year (first digit 1-3) from ``entry``.

    ``entry`` is converted with ``str()`` first. Because of the greedy
    prefix, the last qualifying four-digit run is the one returned.

    Returns the year as a string, or ``None`` when nothing matches.
    '''
    found = re.compile(r'.*([1-3][0-9]{3})').match(str(entry))
    return None if found is None else found.group(1)

In [None]:
# Derive month / day / year columns from the raw `date_added` strings.
# find_month turns a missing date into the text 'nan', which is relabelled here.
netflix['month_added'] = (
    netflix['date_added']
    .apply(find_month)
    .apply(lambda x: str(x).replace('nan','Not available'))
)
netflix['day_added'] = netflix['date_added'].apply(find_day)
netflix['year_added'] = netflix['date_added'].apply(find_year)

In [None]:
# Preview the frame again, now with the month_added / day_added / year_added columns.
netflix.head()

In [None]:
def extract_min(entry):
    '''Extract a duration in minutes from a string such as "90 min".

    ``entry`` is converted with ``str()`` first, so missing values (NaN) are
    accepted.

    Returns the number preceding " min" as an ``int``, or ``None`` when the
    text contains no "<digits> min" pattern (e.g. "2 Seasons").
    '''
    # Capture the digits directly rather than splitting the whole match on
    # spaces, which raised ValueError whenever anything (even a leading
    # space) came before the number.
    match = re.search(r'(\d+) min', str(entry))
    if match is None:
        return None
    return int(match.group(1))
    


In [None]:
def extract_season(entry):
    '''Extract a season count from a string such as "2 Seasons".

    ``entry`` is converted with ``str()`` first, so missing values (NaN) are
    accepted.

    Returns the number preceding " Season" as an ``int``, or ``None`` when
    the text contains no "<digits> Season" pattern (e.g. "90 min").
    '''
    # Capture the digits directly rather than splitting the whole match on
    # spaces, which raised ValueError whenever anything (even a leading
    # space) came before the number.
    match = re.search(r'(\d+) Season', str(entry))
    if match is None:
        return None
    return int(match.group(1))

In [None]:
# Split `duration` into two numeric columns: minutes (via extract_min) and
# season count (via extract_season). Each helper returns None when its
# pattern is absent from the value.
netflix['duration_min'] = netflix['duration'].apply(extract_min)
netflix['seasons'] = netflix['duration'].apply(extract_season)

In [None]:
# Summarise the columns, their dtypes and non-null counts.
netflix.info()

In [None]:
# Number of missing values in each column.
netflix.isnull().sum()

In [None]:
# Titles added per day of the month, counted separately for movies and TV shows.
df_aux_movie_day_adeed = (
    netflix.loc[netflix['type']=='Movie', 'day_added']
    .value_counts()
    .to_frame()
    .reset_index()
)
df_aux_movie_day_adeed.columns = ['day','count']
df_aux_movie_day_adeed = df_aux_movie_day_adeed.sort_values('day')

df_aux_tvshow_day_adeed = (
    netflix.loc[netflix['type']=='TV Show', 'day_added']
    .value_counts()
    .to_frame()
    .reset_index()
)
df_aux_tvshow_day_adeed.columns = ['day','count']
df_aux_tvshow_day_adeed = df_aux_tvshow_day_adeed.sort_values('day')

# One bar trace per content type, drawn side by side.
trace_movie_day_adeed = go.Bar(
    name="Movies",
    x=df_aux_movie_day_adeed['day'],
    y=df_aux_movie_day_adeed["count"],
    marker={
        'color': 'rgb(249, 6, 6)',
        'line': {'color': 'rgb(0,0,0)', 'width': 1.5},
    },
)

trace_tvshow_day_adeed = go.Bar(
    name="TV Show",
    x=df_aux_tvshow_day_adeed['day'],
    y=df_aux_tvshow_day_adeed["count"],
    marker={
        'color': 'rgb(26, 118, 255)',
        'line': {'color': 'rgb(0,0,0)', 'width': 1.5},
    },
)

layout = go.Layout(
    title='Content day added',
    xaxis={'title': 'day'},
    yaxis={'title': 'Count'},
    hovermode='closest',
    template="presentation",
)
fig = go.Figure(data=[trace_movie_day_adeed, trace_tvshow_day_adeed], layout=layout)
fig.show()

In [None]:
# Month names in calendar order, and the matching 1-based month numbers.
keys = [
    "January", "February", "March", "April",
    "May", "June", "July", "August",
    "September", "October", "November", "December",
]
values = list(range(1, 13))

In [None]:
# Lookup table from month name to month number, used to order months chronologically.
dictionary_month = dict(zip(keys, values))

In [None]:
# Titles added per month, counted separately for movies and TV shows.
# Values that are not month names get no number from the lookup and are
# dropped; the remaining rows are ordered by calendar month.
df_aux_movie_month_added = (
    netflix.loc[netflix['type']=='Movie', 'month_added']
    .value_counts()
    .to_frame()
    .reset_index()
)
df_aux_movie_month_added.columns = ['month','count']
df_aux_movie_month_added['num_month'] = df_aux_movie_month_added['month'].map(dictionary_month)
df_aux_movie_month_added = df_aux_movie_month_added.dropna(axis=0).sort_values('num_month')

df_aux_tvshow_month_added = (
    netflix.loc[netflix['type']=='TV Show', 'month_added']
    .value_counts()
    .to_frame()
    .reset_index()
)
df_aux_tvshow_month_added.columns = ['month','count']
df_aux_tvshow_month_added['num_month'] = df_aux_tvshow_month_added['month'].map(dictionary_month)
df_aux_tvshow_month_added = df_aux_tvshow_month_added.dropna(axis=0).sort_values('num_month')

# One bar trace per content type, drawn side by side.
trace_movie_month_added = go.Bar(
    name="Movies",
    x=df_aux_movie_month_added['month'],
    y=df_aux_movie_month_added["count"],
    marker={
        'color': 'rgb(249, 6, 6)',
        'line': {'color': 'rgb(0,0,0)', 'width': 1.5},
    },
)

trace_tvshow_month_added = go.Bar(
    name="TV Show",
    x=df_aux_tvshow_month_added['month'],
    y=df_aux_tvshow_month_added["count"],
    marker={
        'color': 'rgb(26, 118, 255)',
        'line': {'color': 'rgb(0,0,0)', 'width': 1.5},
    },
)

layout = go.Layout(
    title='Content month added',
    xaxis={'title': 'Month'},
    yaxis={'title': 'Count'},
    hovermode='closest',
    template="presentation",
)
fig = go.Figure(data=[trace_movie_month_added, trace_tvshow_month_added], layout=layout)
fig.show()

In [None]:
# Titles per release year, counted separately for movies and TV shows.
df_aux_movie_release = (
    netflix.loc[netflix['type']=='Movie', 'release_year']
    .value_counts()
    .to_frame()
    .reset_index()
)
df_aux_movie_release.columns = ['year','count']
df_aux_movie_release = df_aux_movie_release.sort_values('year')

df_aux_tvshow_release = (
    netflix.loc[netflix['type']=='TV Show', 'release_year']
    .value_counts()
    .to_frame()
    .reset_index()
)
df_aux_tvshow_release.columns = ['year','count']
df_aux_tvshow_release = df_aux_tvshow_release.sort_values('year')

# One bar trace per content type, drawn side by side.
trace_movie_release = go.Bar(
    name="Movies",
    x=df_aux_movie_release['year'],
    y=df_aux_movie_release["count"],
    marker={
        'color': 'rgb(249, 6, 6)',
        'line': {'color': 'rgb(0,0,0)', 'width': 1.5},
    },
)

trace_tvshow_release = go.Bar(
    name="TV Show",
    x=df_aux_tvshow_release['year'],
    y=df_aux_tvshow_release["count"],
    marker={
        'color': 'rgb(26, 118, 255)',
        'line': {'color': 'rgb(0,0,0)', 'width': 1.5},
    },
)

layout = go.Layout(
    title='Content Release Year',
    xaxis={'title': 'Year'},
    yaxis={'title': 'Count'},
    hovermode='closest',
    template="presentation",
)
fig = go.Figure(data=[trace_movie_release, trace_tvshow_release], layout=layout)
fig.show()

# Please note that 2020 is not over yet, so less data is available for it

In [None]:
# Titles per year of addition to the catalogue, counted separately for
# movies and TV shows.
df_aux_movie_added = (
    netflix.loc[netflix['type']=='Movie', 'year_added']
    .value_counts()
    .to_frame()
    .reset_index()
)
df_aux_movie_added.columns = ['year','count']
df_aux_movie_added = df_aux_movie_added.sort_values('year')

df_aux_tvshow_added = (
    netflix.loc[netflix['type']=='TV Show', 'year_added']
    .value_counts()
    .to_frame()
    .reset_index()
)
df_aux_tvshow_added.columns = ['year','count']
df_aux_tvshow_added = df_aux_tvshow_added.sort_values('year')

# One bar trace per content type, drawn side by side.
trace_movie_added = go.Bar(
    name="Movies",
    x=df_aux_movie_added['year'],
    y=df_aux_movie_added["count"],
    marker={'color': 'rgb(255, 30, 30)', 'line': {'width': 2}},
)

trace_tvshow_added = go.Bar(
    name="TV Show",
    x=df_aux_tvshow_added['year'],
    y=df_aux_tvshow_added["count"],
    marker={'color': 'rgb(30, 30, 255)', 'line': {'width': 2}},
)

layout = go.Layout(
    title='Year added',
    xaxis={'title': 'Year'},
    yaxis={'title': 'Count'},
    hovermode='closest',
    template="presentation",
)

fig = go.Figure(data=[trace_movie_added, trace_tvshow_added], layout=layout)
fig.show()

In [None]:
# Pie chart of how many titles fall under each value of the `type` column.
labels = list(netflix['type'].value_counts().index)
values = list(netflix['type'].value_counts().values)

fig = go.Figure(
    data=[go.Pie(labels=labels, values=values)],
    layout=go.Layout(title={'text': "Type distribution"}),
)
fig.show()

In [None]:
# Word cloud built from all titles concatenated into a single text.
plt.rcParams['figure.figsize'] = (13, 13)
wordcloud = WordCloud(
    max_words=150,
    width=1000,
    height=1000,
    background_color='white',
    stopwords=STOPWORDS,
)
# generate() returns the same WordCloud object, now populated.
wordcloud = wordcloud.generate(' '.join(netflix['title']))
plt.imshow(wordcloud)
plt.axis('off')
plt.title('Most Frequent Words in Title', fontsize=30)
plt.show()

In [None]:
# Word cloud built from all descriptions concatenated into a single text.
plt.rcParams['figure.figsize'] = (13, 13)
wordcloud = WordCloud(
    max_words=150,
    width=1000,
    height=1000,
    background_color='white',
    stopwords=STOPWORDS,
)
# generate() returns the same WordCloud object, now populated.
wordcloud = wordcloud.generate(' '.join(netflix['description']))
plt.imshow(wordcloud)
plt.axis('off')
plt.title('Most Frequent Words in Description', fontsize=30)
plt.show()

In [None]:
# Preview the frame once more before looking at the duration columns.
netflix.head()

In [None]:
# Count titles per type among the rows where a duration in minutes was parsed.
netflix[netflix.duration_min.notnull()]['type'].value_counts()

In [None]:
# Bar chart of how many titles share each duration in minutes.
data = [
    go.Bar(
        orientation='v',
        text="Duration in minutes, Occurrences ",
        x=netflix['duration_min'].value_counts().index,
        y=netflix['duration_min'].value_counts(),
    )
]

layout = go.Layout(
    title='Duration in minutes - histogram - MOVIES',
    height=500,
    template="plotly_dark",
    showlegend=False,
    hovermode='closest',
    xaxis={'title': 'Duration in minutes', 'ticklen': 1, 'zeroline': True, 'gridwidth': 1},
    yaxis={'title': 'Count', 'ticklen': 1, 'zeroline': True, 'gridwidth': 1},
)

fig = go.Figure(data=data, layout=layout)
fig.show()

In [None]:
# Bar chart of how many titles share each season count.
data = [
    go.Bar(
        orientation='v',
        text="Nº Season(s), Occurrences ",
        x=netflix['seasons'].value_counts().index,
        y=netflix['seasons'].value_counts(),
    )
]

layout = go.Layout(
    title='Duration in Nº Season(s) - Histogram - TV SHOW',
    height=500,
    template="plotly_dark",
    showlegend=False,
    hovermode='closest',
    xaxis={'title': 'Nº Season(s)', 'ticklen': 1, 'zeroline': True, 'gridwidth': 1},
    yaxis={'title': 'Count', 'ticklen': 1, 'zeroline': True, 'gridwidth': 1},
)

fig = go.Figure(data=data, layout=layout)
fig.show()

In [None]:
# Rating counts, computed separately for movies and TV shows.
df_aux_movie_rating = pd.DataFrame(netflix.loc[netflix['type']=='Movie'].rating.value_counts()).reset_index()
df_aux_movie_rating.columns = ['rating','count']

df_aux_tvshow_rating = pd.DataFrame(netflix.loc[netflix['type']=='TV Show'].rating.value_counts()).reset_index()
df_aux_tvshow_rating.columns = ['rating','count']

# trace1: movie ratings. The legend name and colour previously belonged to
# the TV-show trace, so the chart attributed each series to the wrong type.
trace1 = go.Bar(
                y = df_aux_movie_rating['count'],
                x = df_aux_movie_rating['rating'],
                name = "Movies",
                marker = dict(color = 'rgb(255, 30, 30)',
                              line=dict(width=3)))
# trace2: TV-show ratings.
trace2 = go.Bar(
                y = df_aux_tvshow_rating['count'],
                x = df_aux_tvshow_rating['rating'],
                name = "TV Shows",
                marker = dict(color = 'rgb(30, 30, 255)',
                              line=dict(width=3)))


layout = go.Layout(template= "presentation",title = 'Content type by rate!' ,
                   xaxis = dict(title = 'Rating'), yaxis = dict(title = 'Count'))

fig = go.Figure(data = [trace1, trace2], layout = layout)
fig.show()

# Work in progress!

[Inspiration & Credits](https://www.kaggle.com/vikassingh1996/netflix-movies-and-shows-plotly-recommender-sys)