## Import Data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt

In [2]:
df = pd.read_csv('E:/Data Analyst Portofilio Data/Datasets/Netflex/disney_plus_titles.csv')
df

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Duck the Halls: A Mickey Mouse Christmas Special,"Alonso Ramirez Ramos, Dave Wasson","Chris Diamantopoulos, Tony Anselmo, Tress MacN...",,"November 26, 2021",2016,TV-G,23 min,"Animation, Family",Join Mickey and the gang as they duck the halls!
1,s2,Movie,Ernest Saves Christmas,John Cherry,"Jim Varney, Noelle Parker, Douglas Seale",,"November 26, 2021",1988,PG,91 min,Comedy,Santa Claus passes his magic bag to a new St. ...
2,s3,Movie,Ice Age: A Mammoth Christmas,Karen Disher,"Raymond Albert Romano, John Leguizamo, Denis L...",United States,"November 26, 2021",2011,TV-G,23 min,"Animation, Comedy, Family",Sid the Sloth is on Santa's naughty list.
3,s4,Movie,The Queen Family Singalong,Hamish Hamilton,"Darren Criss, Adam Lambert, Derek Hough, Alexa...",,"November 26, 2021",2021,TV-PG,41 min,Musical,"This is real life, not just fantasy!"
4,s5,TV Show,The Beatles: Get Back,,"John Lennon, Paul McCartney, George Harrison, ...",,"November 25, 2021",2021,,1 Season,"Docuseries, Historical, Music",A three-part documentary from Peter Jackson ca...
...,...,...,...,...,...,...,...,...,...,...,...,...
1445,s1446,Movie,X-Men Origins: Wolverine,Gavin Hood,"Hugh Jackman, Liev Schreiber, Danny Huston, wi...","United States, United Kingdom","June 4, 2021",2009,PG-13,108 min,"Action-Adventure, Family, Science Fiction",Wolverine unites with legendary X-Men to fight...
1446,s1447,Movie,Night at the Museum: Battle of the Smithsonian,Shawn Levy,"Ben Stiller, Amy Adams, Owen Wilson, Hank Azar...","United States, Canada","April 2, 2021",2009,PG,106 min,"Action-Adventure, Comedy, Family",Larry Daley returns to rescue some old friends...
1447,s1448,Movie,Eddie the Eagle,Dexter Fletcher,"Tom Costello, Jo Hartley, Keith Allen, Dickon ...","United Kingdom, Germany, United States","December 18, 2020",2016,PG-13,107 min,"Biographical, Comedy, Drama","True story of Eddie Edwards, a British ski-jum..."
1448,s1449,Movie,Bend It Like Beckham,Gurinder Chadha,"Parminder Nagra, Keira Knightley, Jonathan Rhy...","United Kingdom, Germany, United States","September 18, 2020",2003,PG-13,112 min,"Buddy, Comedy, Coming of Age",Despite the wishes of their traditional famili...


## Data Preprocessing

In [3]:
df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Duck the Halls: A Mickey Mouse Christmas Special,"Alonso Ramirez Ramos, Dave Wasson","Chris Diamantopoulos, Tony Anselmo, Tress MacN...",,"November 26, 2021",2016,TV-G,23 min,"Animation, Family",Join Mickey and the gang as they duck the halls!
1,s2,Movie,Ernest Saves Christmas,John Cherry,"Jim Varney, Noelle Parker, Douglas Seale",,"November 26, 2021",1988,PG,91 min,Comedy,Santa Claus passes his magic bag to a new St. ...
2,s3,Movie,Ice Age: A Mammoth Christmas,Karen Disher,"Raymond Albert Romano, John Leguizamo, Denis L...",United States,"November 26, 2021",2011,TV-G,23 min,"Animation, Comedy, Family",Sid the Sloth is on Santa's naughty list.
3,s4,Movie,The Queen Family Singalong,Hamish Hamilton,"Darren Criss, Adam Lambert, Derek Hough, Alexa...",,"November 26, 2021",2021,TV-PG,41 min,Musical,"This is real life, not just fantasy!"
4,s5,TV Show,The Beatles: Get Back,,"John Lennon, Paul McCartney, George Harrison, ...",,"November 25, 2021",2021,,1 Season,"Docuseries, Historical, Music",A three-part documentary from Peter Jackson ca...


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1450 entries, 0 to 1449
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       1450 non-null   object
 1   type          1450 non-null   object
 2   title         1450 non-null   object
 3   director      977 non-null    object
 4   cast          1260 non-null   object
 5   country       1231 non-null   object
 6   date_added    1447 non-null   object
 7   release_year  1450 non-null   int64 
 8   rating        1447 non-null   object
 9   duration      1450 non-null   object
 10  listed_in     1450 non-null   object
 11  description   1450 non-null   object
dtypes: int64(1), object(11)
memory usage: 136.1+ KB


In [5]:
# Convert data type for date_added colum to datetime
df["date_added"] = pd.to_datetime(df['date_added'])

# Add new columns
df['year_added'] = df['date_added'].dt.year
df['month_added'] = df['date_added'].dt.month
df['day_added'] = df['date_added'].dt.day

In [6]:
# check null values
df.isnull().sum()

show_id           0
type              0
title             0
director        473
cast            190
country         219
date_added        3
release_year      0
rating            3
duration          0
listed_in         0
description       0
year_added        3
month_added       3
day_added         3
dtype: int64

In [7]:
# drop  unnecessary features/columns

# drop column description
df.drop('show_id',axis=1,inplace=True)

# drop column director
df.drop('director',axis=1,inplace=True)

# drop column cast
df.drop('cast',axis=1,inplace=True)

# drop column description
df.drop('description',axis=1,inplace=True)

In [8]:
# the mode value of country column
df['country'].value_counts().idxmax()

'United States'

In [9]:
# the mode value of rating column
df['rating'].value_counts().idxmax()

'TV-G'

In [10]:
# Replacing null values by the mode of column

df['country'].replace(np.nan,'United States',inplace=True)

df['rating'].replace(np.nan,'TV-MA',inplace=True)

# drop the null values for rest columns

df.dropna(inplace=True)

In [11]:
# check null values
df.isnull().sum()

type            0
title           0
country         0
date_added      0
release_year    0
rating          0
duration        0
listed_in       0
year_added      0
month_added     0
day_added       0
dtype: int64

In [12]:
# check duplicated values
df.duplicated().value_counts()

False    1447
dtype: int64

In [13]:
df.head()

Unnamed: 0,type,title,country,date_added,release_year,rating,duration,listed_in,year_added,month_added,day_added
0,Movie,Duck the Halls: A Mickey Mouse Christmas Special,United States,2021-11-26,2016,TV-G,23 min,"Animation, Family",2021.0,11.0,26.0
1,Movie,Ernest Saves Christmas,United States,2021-11-26,1988,PG,91 min,Comedy,2021.0,11.0,26.0
2,Movie,Ice Age: A Mammoth Christmas,United States,2021-11-26,2011,TV-G,23 min,"Animation, Comedy, Family",2021.0,11.0,26.0
3,Movie,The Queen Family Singalong,United States,2021-11-26,2021,TV-PG,41 min,Musical,2021.0,11.0,26.0
4,TV Show,The Beatles: Get Back,United States,2021-11-25,2021,TV-MA,1 Season,"Docuseries, Historical, Music",2021.0,11.0,25.0


In [14]:
df[['year_added','month_added','day_added']] = df[['year_added','month_added','day_added']].astype('int')

In [15]:
# Calculate summary statistics for categorical columns
df.describe(include='object')

Unnamed: 0,type,title,country,rating,duration,listed_in
count,1447,1447,1447,1447,1447,1447
unique,2,1447,89,10,158,329
top,Movie,Duck the Halls: A Mickey Mouse Christmas Special,United States,TV-G,1 Season,"Animation, Comedy, Family"
freq,1052,1,1222,318,219,124


In [16]:
df['type'].value_counts()

Movie      1052
TV Show     395
Name: type, dtype: int64

In [17]:
df['country'].value_counts()

United States                                                  1222
United States, Canada                                            30
United Kingdom                                                   23
United States, United Kingdom                                    22
Canada, United States                                            17
                                                               ... 
Ireland, United States                                            1
Ireland, United Kingdom, United States, South Korea, Canada       1
United States, South Korea, France                                1
United States, Australia, France, Canada                          1
United States, Ireland                                            1
Name: country, Length: 89, dtype: int64

In [18]:
df['rating'].value_counts()

TV-G        318
TV-PG       301
G           253
PG          236
TV-Y7       129
TV-14        79
PG-13        66
TV-Y         49
TV-Y7-FV     13
TV-MA         3
Name: rating, dtype: int64

In [19]:
df['duration'].value_counts()

1 Season     219
2 Seasons     81
3 Seasons     43
44 min        42
7 min         42
            ... 
120 min        1
56 min         1
19 min         1
30 min         1
68 min         1
Name: duration, Length: 158, dtype: int64

In [20]:
df['listed_in'].value_counts()

Animation, Comedy, Family                        124
Action-Adventure, Animation, Comedy               77
Action-Adventure, Animation, Kids                 45
Animals & Nature, Documentary, Family             40
Action-Adventure, Animation, Family               40
                                                ... 
Docuseries, Kids                                   1
Animals & Nature, Family, Reality                  1
Dance, Music, Musical                              1
Comedy, Family, Lifestyle                          1
Action-Adventure, Animals & Nature, Animation      1
Name: listed_in, Length: 329, dtype: int64

In [21]:
df['release_year'].value_counts()

2021    125
2020    114
2019     99
2017     69
2018     65
       ... 
1928      1
1944      1
1970      1
1945      1
1968      1
Name: release_year, Length: 90, dtype: int64

In [22]:
df.head()

Unnamed: 0,type,title,country,date_added,release_year,rating,duration,listed_in,year_added,month_added,day_added
0,Movie,Duck the Halls: A Mickey Mouse Christmas Special,United States,2021-11-26,2016,TV-G,23 min,"Animation, Family",2021,11,26
1,Movie,Ernest Saves Christmas,United States,2021-11-26,1988,PG,91 min,Comedy,2021,11,26
2,Movie,Ice Age: A Mammoth Christmas,United States,2021-11-26,2011,TV-G,23 min,"Animation, Comedy, Family",2021,11,26
3,Movie,The Queen Family Singalong,United States,2021-11-26,2021,TV-PG,41 min,Musical,2021,11,26
4,TV Show,The Beatles: Get Back,United States,2021-11-25,2021,TV-MA,1 Season,"Docuseries, Historical, Music",2021,11,25


## Exploratory Data Analysis

In [23]:
df.sample(10)

Unnamed: 0,type,title,country,date_added,release_year,rating,duration,listed_in,year_added,month_added,day_added
762,Movie,Breaking2,United States,2019-11-12,2017,TV-PG,55 min,"Documentary, Sports",2019,11,12
475,Movie,Hamilton: History Has Its Eyes on You,United States,2020-07-10,2020,TV-PG,47 min,"Historical, Musical",2020,7,10
530,Movie,Star Wars: The Rise of Skywalker (Episode IX),United States,2020-05-04,2019,PG-13,142 min,"Action-Adventure, Family, Science Fiction",2020,5,4
685,Movie,Marvel Studios' Thor: Ragnarok,United States,2019-12-05,2017,PG-13,132 min,"Action-Adventure, Comedy, Fantasy",2019,12,5
1321,Movie,The Princess and the Frog,United States,2019-11-12,2009,G,99 min,"Animation, Comedy, Family",2019,11,12
514,TV Show,The Greeks,United States,2020-06-05,2016,TV-PG,1 Season,"Docuseries, Historical",2020,6,5
1393,Movie,Twas the Night,United States,2019-11-12,2001,TV-G,87 min,"Comedy, Crime, Fantasy",2019,11,12
1386,Movie,Treasure of Matecumbe,United States,2019-11-12,1976,G,116 min,"Action-Adventure, Family, Mystery",2019,11,12
908,Movie,Geek Charming,United States,2019-11-12,2011,TV-G,99 min,"Comedy, Coming of Age, Drama",2019,11,12
807,Movie,"Davy Crockett, King of the Wild Frontier",United States,2019-11-12,1955,PG,93 min,"Action-Adventure, Family, Historical",2019,11,12


In [24]:
df_movie = df[df['type'] == 'Movie']

df_tv_show = df[df['type'] == 'TV Show']

In [25]:
# 10 country that give highest movies

df_movie[['type','country']].groupby(['country']).count()['type'].nlargest(10).to_frame()

Unnamed: 0_level_0,type
country,Unnamed: 1_level_1
United States,883
"United States, Canada",25
"United States, United Kingdom",20
"Canada, United States",16
"United Kingdom, United States",13
United Kingdom,11
"United States, Australia",8
"United States, France",4
"United States, United Kingdom, Australia",4
"Australia, United States",3


In [26]:
# 10 country that give highest TV Show

df_tv_show[['type','country']].groupby(['country']).count()['type'].nlargest(10).to_frame()

Unnamed: 0_level_0,type
country,Unnamed: 1_level_1
United States,339
United Kingdom,12
"United States, Canada",5
"United States, South Korea",3
Argentina,2
Canada,2
"United States, United Kingdom",2
"Argentina, Mexico",1
"Australia, United Kingdom",1
"Canada, United States",1


In [27]:
# 10 highest duration for movies

df_movie[['type','duration']].groupby(['duration']).count()['type'].nlargest(10).to_frame()

Unnamed: 0_level_0,type
duration,Unnamed: 1_level_1
44 min,42
7 min,42
8 min,41
9 min,33
45 min,31
94 min,28
90 min,27
93 min,27
88 min,26
89 min,24


In [28]:
# 10 highest duration for TV Show

df_tv_show[['type','duration']].groupby(['duration']).count()['type'].nlargest(10).to_frame()

Unnamed: 0_level_0,type
duration,Unnamed: 1_level_1
1 Season,219
2 Seasons,81
3 Seasons,43
4 Seasons,24
5 Seasons,11
7 Seasons,5
6 Seasons,3
10 Seasons,2
8 Seasons,2
9 Seasons,2


In [29]:
# 10 highest rating for movies

df_movie[['type','rating']].groupby(['rating']).count()['type'].nlargest(10).to_frame()

Unnamed: 0_level_0,type
rating,Unnamed: 1_level_1
G,253
PG,235
TV-G,233
TV-PG,181
PG-13,66
TV-14,37
TV-Y7,36
TV-Y7-FV,7
TV-Y,3
TV-MA,1


In [30]:
# 10 highest rating for TV Show

df_tv_show[['type','rating']].groupby(['rating']).count()['type'].nlargest(10).to_frame()

Unnamed: 0_level_0,type
rating,Unnamed: 1_level_1
TV-PG,120
TV-Y7,93
TV-G,85
TV-Y,46
TV-14,42
TV-Y7-FV,6
TV-MA,2
PG,1


In [31]:
# 10 highest listed-in for movies

df_movie[['type','listed_in']].groupby(['listed_in']).count()['type'].nlargest(10).to_frame()

Unnamed: 0_level_0,type
listed_in,Unnamed: 1_level_1
"Animation, Comedy, Family",119
"Action-Adventure, Animation, Comedy",44
"Action-Adventure, Animation, Family",40
"Animals & Nature, Documentary, Family",40
"Animals & Nature, Documentary",35
"Animation, Family, Fantasy",31
"Action-Adventure, Comedy, Family",28
"Animation, Family",25
Documentary,25
"Documentary, Historical",21


In [32]:
# 10 highest listed-in for TV Show

df_tv_show[['type','listed_in']].groupby(['listed_in']).count()['type'].nlargest(10).to_frame()

Unnamed: 0_level_0,type
listed_in,Unnamed: 1_level_1
"Action-Adventure, Animation, Kids",39
"Animals & Nature, Docuseries, Family",39
"Action-Adventure, Animation, Comedy",33
"Animation, Kids",15
"Action-Adventure, Animation, Fantasy",12
"Action-Adventure, Animals & Nature, Docuseries",10
"Animals & Nature, Docuseries",10
"Docuseries, Historical",10
"Comedy, Coming of Age, Family",8
"Comedy, Coming of Age, Kids",7


In [33]:
# 10 highest Year for Added movies on Netflix

df_movie[['type','year_added']].groupby(['year_added']).count()['type'].nlargest(10).to_frame()

Unnamed: 0_level_0,type
year_added,Unnamed: 1_level_1
2019,630
2020,230
2021,192


In [34]:
# 10 highest Year for Added TV Show on Netflix

df_tv_show[['type','year_added']].groupby(['year_added']).count()['type'].nlargest(10).to_frame()

Unnamed: 0_level_0,type
year_added,Unnamed: 1_level_1
2021,138
2019,135
2020,122


In [35]:
# 10 highest Year release for movies on Netflix

df_movie[['type','release_year']].groupby(['release_year']).count()['type'].nlargest(10).to_frame()

Unnamed: 0_level_0,type
release_year,Unnamed: 1_level_1
2020,74
2021,70
2019,61
2011,35
2014,34
2017,33
2009,32
2018,32
2010,30
2016,30


In [36]:
# 10 highest Year release for TV Show on Netflix

df_tv_show[['type','release_year']].groupby(['release_year']).count()['type'].nlargest(10).to_frame()

Unnamed: 0_level_0,type
release_year,Unnamed: 1_level_1
2021,55
2020,40
2019,38
2017,36
2018,33
2016,30
2015,23
2011,17
2014,15
2012,12
