In [93]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import plotly.express as px
import platform
from pandas.core.common import SettingWithCopyWarning

# 운영체제별 한글 폰트 설정
if platform.system() == 'Darwin': # Mac 환경 폰트 설정
    plt.rc('font', family='AppleGothic')
elif platform.system() == 'Windows': # Windows 환경 폰트 설정
    plt.rc('font', family='Malgun Gothic')

plt.rc('axes', unicode_minus=False) # 마이너스 폰트 설정


# 글씨 선명하게 출력하는 설정
%config InlineBackend.figure_format = 'retina'

# Warning 제거
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning) 
warnings.simplefilter(action='ignore', category=UserWarning) 
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

In [94]:
netf = pd.read_csv(r'data\netflix_titles.csv')
continental = pd.read_csv(r'data\country-continents.csv')

# pre-processing

In [95]:
netf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      6173 non-null   object
 4   cast          7982 non-null   object
 5   country       7976 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8803 non-null   object
 9   duration      8804 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB


In [96]:
netf.nunique()

show_id         8807
type               2
title           8807
director        4528
cast            7692
country          748
date_added      1767
release_year      74
rating            17
duration         220
listed_in        514
description     8775
dtype: int64

In [97]:
netf_edit = netf.dropna(subset = ['rating', 'date_added', 'duration',])        

In [98]:
netf_edit['country'] = netf_edit['country'].fillna('unknown')

In [99]:
netf_edit['date_added'] = netf_edit['date_added'].astype('datetime64')

In [100]:
netf_edit['date_added_yr'] = netf_edit['date_added'].dt.year

In [101]:
netf_edit['zenre'] = netf_edit['listed_in'].apply(lambda x: x.split(', ')[0])

In [102]:
netf_edit.loc[365,'country'] = 'France, Algeria'
netf_edit.loc[193,'country'] = 'South Korea'

In [103]:
netf_edit['country_edit'] = netf_edit['country'].apply(lambda x: x.split(',')[0])

In [104]:
ratings_ages = pd.DataFrame({
    'TV-PG': ['Older Kids'],
    'TV-MA': ['Adults'],
    'TV-Y7-FV': ['Older Kids'],
    'TV-Y7': ['Older Kids'],
    'TV-14': ['Teens'],
    'R': ['Adults'],
    'TV-Y': ['Kids'],
    'NR': ['Adults'],
    'PG-13': ['Teens'],
    'TV-G': ['Kids'],
    'PG': ['Older Kids'],
    'G': ['Kids'],
    'UR': ['Adults'],
    'NC-17': ['Adults']
}).T.reset_index()

In [105]:
ratings_ages.columns = ['rating','rating_summary']

In [106]:
netf_edit = pd.merge(netf_edit,ratings_ages, on='rating')

In [107]:
netf_edit = netf_edit[netf_edit['date_added_yr']>2015]

In [108]:
df = netf_edit.reset_index()

In [109]:
df.drop(labels='index',axis=1,inplace=True)

In [110]:
df

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,date_added_yr,zenre,country_edit,rating_summary
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,2021-09-25,2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm...",2021,Documentaries,United States,Teens
1,s10,Movie,The Starling,Theodore Melfi,"Melissa McCarthy, Chris O'Dowd, Kevin Kline, T...",United States,2021-09-24,2021,PG-13,104 min,"Comedies, Dramas",A woman adjusting to life after a loss contend...,2021,Comedies,United States,Teens
2,s28,Movie,Grown Ups,Dennis Dugan,"Adam Sandler, Kevin James, Chris Rock, David S...",United States,2021-09-20,2010,PG-13,103 min,Comedies,Mourning the loss of their beloved junior high...,2021,Comedies,United States,Teens
3,s29,Movie,Dark Skies,Scott Stewart,"Keri Russell, Josh Hamilton, J.K. Simmons, Dak...",United States,2021-09-19,2013,PG-13,97 min,"Horror Movies, Sci-Fi & Fantasy",A family’s idyllic suburban life shatters when...,2021,Horror Movies,United States,Teens
4,s30,Movie,Paranoia,Robert Luketic,"Liam Hemsworth, Gary Oldman, Amber Heard, Harr...","United States, India, France",2021-09-19,2013,PG-13,106 min,Thrillers,"Blackmailed by his company's CEO, a low-level ...",2021,Thrillers,United States,Teens
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8647,s7318,Movie,Little Singham aur Kaal ka Mahajaal,Prakash Satam,,unknown,2018-12-01,2018,TV-Y7-FV,68 min,"Children & Family Movies, Comedies","When Kaal, the vilest demon on earth, threaten...",2018,Children & Family Movies,unknown,Older Kids
8648,s7514,Movie,Motu Patlu: King of Kings,Suhas Kadav,"Saurav Chakrabarty, Vinay Pathak",India,2017-07-06,2016,TV-Y7-FV,109 min,"Children & Family Movies, Comedies",Motu and Patlu want to help a circus lion get ...,2017,Children & Family Movies,India,Older Kids
8649,s7059,Movie,Immoral Tales,Walerian Borowczyk,"Lise Danvers, Fabrice Luchini, Charlotte Alexa...",France,2019-06-06,1974,UR,103 min,"Dramas, International Movies, Romantic Movies",This anthology illustrates the timeless nature...,2019,Dramas,France,Adults
8650,s7989,Movie,Sex Doll,Sylvie Verheyde,"Hafsia Herzi, Ash Stymest, Karole Rocher, Paul...","United Kingdom, France",2017-07-10,2016,UR,103 min,"Dramas, International Movies, Romantic Movies",A high-end call girl falls for a handsome man ...,2017,Dramas,United Kingdom,Adults


# processing & visualising

## country

In [111]:
df_c = df[df['country_edit']!='unknown'].groupby(['country_edit','date_added_yr'],as_index=False).agg(country_count = ('show_id','count'))
df_c

Unnamed: 0,country_edit,date_added_yr,country_count
0,Argentina,2016,3
1,Argentina,2017,10
2,Argentina,2018,23
3,Argentina,2019,16
4,Argentina,2020,13
...,...,...,...
323,Vietnam,2019,1
324,Vietnam,2020,1
325,Vietnam,2021,2
326,West Germany,2017,1


In [112]:
import pycountry
def generate_iso_alpha(dataframe, country_col):
    """
    Appends as 'ISO' column, which contains the alpha3 country code 
    to a pandas dataframe by utilizing the pycountry countries API
    :param dataframe: Pandas Dataframe 
    :param country_col: string Column that contains the country name
    :return: Pandas Dataframe with appended ISO column
    """
    for idx in dataframe.index:
        country = dataframe.loc[idx, country_col]
        try:
            iso = pycountry.countries.get(name=country).alpha_3
            dataframe.loc[idx, 'ISO'] = iso
        except Exception as e:
            try:
                iso = pycountry.countries.get(common_name=country).alpha_3
                dataframe.loc[idx, 'ISO'] = iso
            except Exception as e:
                print(country, e)
                pass
    return dataframe

generate_iso_alpha(df_c, 'country_edit')

df_c.dropna(subset=['ISO'],inplace=True)

Czech Republic 'NoneType' object has no attribute 'alpha_3'
Czech Republic 'NoneType' object has no attribute 'alpha_3'
Czech Republic 'NoneType' object has no attribute 'alpha_3'
Iran 'NoneType' object has no attribute 'alpha_3'
Russia 'NoneType' object has no attribute 'alpha_3'
Russia 'NoneType' object has no attribute 'alpha_3'
Russia 'NoneType' object has no attribute 'alpha_3'
Russia 'NoneType' object has no attribute 'alpha_3'
Russia 'NoneType' object has no attribute 'alpha_3'
Soviet Union 'NoneType' object has no attribute 'alpha_3'
Syria 'NoneType' object has no attribute 'alpha_3'
West Germany 'NoneType' object has no attribute 'alpha_3'


In [113]:
import plotly.express as px

fig = px.scatter_geo(df_c, 
                     locations="ISO",
                     hover_name="country_edit", size="country_count",
                     #animation_frame="date_added_yr",
                     projection="orthographic")
fig.show()

# import plotly.io as po
# po.write_html(fig, file='bubble_map.html', auto_open=True)

In [114]:
import plotly.graph_objects as go
ratio_total   = list(df.groupby(['country_edit'],as_index=False).agg(country_edit_count=('show_id','count')).loc[:,'country_edit_count'])
labels_total  = list(df.groupby(['country_edit'],as_index=False).agg(country_edit_count=('show_id','count')).loc[:,'country_edit'])

# pull is given as a fraction of the pie radius
fig = go.Figure(data=[go.Pie(labels=labels_total, values=ratio_total)])
fig.update_layout(margin=dict(t=20, b=20, l=10, r=20))
fig.update_traces(textposition='inside', textinfo='percent+label')
fig.show()

In [115]:
fig = px.bar(df.groupby(['date_added_yr','country_edit',],as_index=False).agg(country_count = ('show_id','count')), x="date_added_yr", y="country_count", color="country_edit", text_auto=True)
fig.show()

In [116]:
import plotly.graph_objects as go
ratio_total   = list(df[df['date_added_yr']==2016].groupby(['country_edit'],as_index=False).agg(country_edit_count=('show_id','count')).loc[:,'country_edit_count'])
labels_total  = list(df[df['date_added_yr']==2016].groupby(['country_edit'],as_index=False).agg(country_edit_count=('show_id','count')).loc[:,'country_edit'])

# pull is given as a fraction of the pie radius
fig = go.Figure(data=[go.Pie(labels=labels_total, values=ratio_total)])
fig.update_layout(margin=dict(t=20, b=20, l=10, r=20))
fig.update_traces(textposition='inside', textinfo='percent+label')
fig.show()

In [117]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots


fig = make_subplots(1, 2, specs=[[{'type':'domain'}, {'type':'domain'}]])
#############################
#2016
ratio_total_2016   = list(df[df['date_added_yr']==2016].groupby(['country_edit'],as_index=False).agg(country_edit_count=('show_id','count')).loc[:,'country_edit_count'])
labels_total_2016  = list(df[df['date_added_yr']==2016].groupby(['country_edit'],as_index=False).agg(country_edit_count=('show_id','count')).loc[:,'country_edit'])
fig.add_trace(go.Pie(labels=ratio_total_2016, values=labels_total_2016), 1, 1)

#2017
ratio_total_2017   = list(df[df['date_added_yr']==2017].groupby(['country_edit'],as_index=False).agg(country_edit_count=('show_id','count')).loc[:,'country_edit_count'])
labels_total_2017  = list(df[df['date_added_yr']==2017].groupby(['country_edit'],as_index=False).agg(country_edit_count=('show_id','count')).loc[:,'country_edit'])
fig.add_trace(go.Pie(labels=ratio_total_2017, values=labels_total_2017), 1, 2)


fig.show()

In [118]:
labels = ["Asia", "Europe", "Africa", "Americas", "Oceania"]

fig = make_subplots(1, 2, specs=[[{'type':'domain'}, {'type':'domain'}]],
                    subplot_titles=['1980', '2007'])
fig.add_trace(go.Pie(labels=labels, values=[4, 7, 1, 7, 0.5], scalegroup='one',
                     name="World GDP 1980"), 1, 1)
fig.add_trace(go.Pie(labels=labels, values=[21, 15, 3, 19, 1], scalegroup='one',
                     name="World GDP 2007"), 1, 2)

fig.update_layout(title_text='World GDP')
fig.show()

## zenre

In [120]:
import plotly.graph_objects as go
ratio_total   = list(df.groupby(['zenre'],as_index=False).agg(zenre_count=('show_id','count')).loc[:,'zenre_count'])
labels_total  = list(df.groupby(['zenre'],as_index=False).agg(zenre_count=('show_id','count')).loc[:,'zenre'])

# pull is given as a fraction of the pie radius
fig = go.Figure(data=[go.Pie(labels=labels_total, values=ratio_total)])
fig.update_layout(margin=dict(t=20, b=20, l=10, r=20))
fig.update_traces(textposition='inside', textinfo='percent+label')
fig.show()

In [121]:
fig = px.bar(df.groupby(['date_added_yr','zenre',],as_index=False).agg(country_count = ('show_id','count')), x="date_added_yr", y="country_count", color="zenre", text_auto=True)
fig.show()

## rating

In [122]:
import plotly.graph_objects as go
ratio_total   = list(df.groupby(['rating_summary'],as_index=False).agg(date_added_yr_count=('show_id','count')).loc[:,'date_added_yr_count'])
labels_total  = list(df.groupby(['rating_summary'],as_index=False).agg(date_added_yr_count=('show_id','count')).loc[:,'rating_summary'])

# pull is given as a fraction of the pie radius
fig = go.Figure(data=[go.Pie(labels=labels_total, values=ratio_total)])
fig.update_layout(margin=dict(t=20, b=20, l=10, r=20))
fig.update_traces(textposition='inside', textinfo='percent+label')
fig.show()

In [126]:
fig = px.bar(df.groupby(['date_added_yr','rating_summary',],as_index=False).agg(country_count = ('show_id','count')), x="date_added_yr", y="country_count", color="rating_summary",category_orders={1:'Adults',2:'Teens',3:'Older kids',4:'Kids'} , text_auto=True)
fig.show()