# Library

In [1502]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import ast

# Import Data

In [1503]:
# Read the raw Netflix catalogue (comma-separated, UTF-8) and show it.
raw_csv_path = '../data/data-raw/netflix_dataset.csv'
df_data = pd.read_csv(raw_csv_path, sep=',', encoding='utf-8')
df_data

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...
...,...,...,...,...,...,...,...,...,...,...,...,...
8802,s8803,Movie,Zodiac,David Fincher,"Mark Ruffalo, Jake Gyllenhaal, Robert Downey J...",United States,"November 20, 2019",2007,R,158 min,"Cult Movies, Dramas, Thrillers","A political cartoonist, a crime reporter and a..."
8803,s8804,TV Show,Zombie Dumb,,,,"July 1, 2019",2018,TV-Y7,2 Seasons,"Kids' TV, Korean TV Shows, TV Comedies","While living alone in a spooky town, a young g..."
8804,s8805,Movie,Zombieland,Ruben Fleischer,"Jesse Eisenberg, Woody Harrelson, Emma Stone, ...",United States,"November 1, 2019",2009,R,88 min,"Comedies, Horror Movies",Looking to survive in a world taken over by zo...
8805,s8806,Movie,Zoom,Peter Hewitt,"Tim Allen, Courteney Cox, Chevy Chase, Kate Ma...",United States,"January 11, 2020",2006,PG,88 min,"Children & Family Movies, Comedies","Dragged from civilian life, a former superhero..."


# Simple EDA

In [1504]:
# Show the dtype of every column in the loaded frame.
df_data.dtypes

show_id         object
type            object
title           object
director        object
cast            object
country         object
date_added      object
release_year     int64
rating          object
duration        object
listed_in       object
description     object
dtype: object

In [1505]:
# (rows, columns) of the loaded frame.
df_data.shape

(8807, 12)

In [1506]:
# Number of missing values per column.
df_data.isnull().sum()

show_id            0
type               0
title              0
director        2634
cast             825
country          831
date_added        10
release_year       0
rating             4
duration           3
listed_in          0
description        0
dtype: int64

In [1507]:
# Number of rows that are exact duplicates of an earlier row.
df_data.duplicated().sum()

np.int64(0)

In [1508]:
# Count distinct (non-null) values per column and tabulate them as
# one row per column: 'kolom' = column name, 'jumlah_data_unik' = count.
distinct_per_column = {name: df_data[name].nunique() for name in df_data.columns}
unique_counts = pd.DataFrame(
    list(distinct_per_column.items()),
    columns=['kolom', 'jumlah_data_unik'],
)

unique_counts

Unnamed: 0,kolom,jumlah_data_unik
0,show_id,8807
1,type,2
2,title,8807
3,director,4528
4,cast,7692
5,country,748
6,date_added,1767
7,release_year,74
8,rating,17
9,duration,220


# Data Manipulation

In [1509]:
# Remove the columns that are not used further in this notebook.
unused_columns = ['show_id', 'rating']
df_data = df_data.drop(unused_columns, axis='columns')

print(df_data.shape)
df_data.columns

(8807, 10)


Index(['type', 'title', 'director', 'cast', 'country', 'date_added',
       'release_year', 'duration', 'listed_in', 'description'],
      dtype='object')

In [1510]:
# Map the original English column names to the Indonesian labels used
# by every later cell.
indonesian_labels = {
    'type': 'tipe',
    'title': 'judul film',
    'director': 'sutradara',
    'cast': 'pemeran',
    'country': 'negara',
    'date_added': 'tanggal rilis netflix',
    'release_year': 'tahun rilis',
    'duration': 'durasi film',
    'listed_in': 'genre',
    'description': 'synopsis',
}
df_data = df_data.rename(indonesian_labels, axis='columns')

In [1511]:
# Keep every row that is not a TV show. Take an explicit copy so that the
# column assignments and row drops in the following cells operate on an
# independent frame instead of a slice of the original (avoids
# SettingWithCopyWarning and ambiguous view/copy semantics).
df_data = df_data[df_data['tipe'] != 'TV Show'].copy()

# Sanity check: list the remaining values of 'tipe'.
df_data['tipe'].unique()

array(['Movie'], dtype=object)

In [1512]:
# Fill missing director / cast / country with a placeholder label.
# NOTE(review): the placeholder is spelled 'Unknow' (sic); anything that
# filters on it downstream must use the same spelling.
placeholder_by_column = dict.fromkeys(['sutradara', 'pemeran', 'negara'], 'Unknow')
df_data = df_data.fillna(value=placeholder_by_column)

# Remaining missing values per column.
df_data.isna().sum()

tipe                     0
judul film               0
sutradara                0
pemeran                  0
negara                   0
tanggal rilis netflix    0
tahun rilis              0
durasi film              3
genre                    0
synopsis                 0
dtype: int64

In [1513]:
# Discard every row that still contains a missing value in any column.
df_data = df_data.dropna(axis=0, how='any')

# Confirm that no missing values remain.
df_data.isna().sum()

tipe                     0
judul film               0
sutradara                0
pemeran                  0
negara                   0
tanggal rilis netflix    0
tahun rilis              0
durasi film              0
genre                    0
synopsis                 0
dtype: int64

In [1514]:
# Strip the unit suffix from the duration text so only the number remains.
# The column is cast to str first so the .str accessor is always valid.
duration_text = df_data['durasi film'].astype(str)
for unit_suffix in (" min", " Seasons", " Season"):
    # Literal (non-regex) removal, applied in this exact order.
    duration_text = duration_text.str.replace(unit_suffix, "", regex=False)

df_data['durasi film'] = duration_text
df_data['durasi film']

0        90
6        91
7       125
9       104
12      127
       ... 
8801     96
8802    158
8804     88
8805     88
8806    111
Name: durasi film, Length: 6128, dtype: object

In [1515]:
# Release year -> integer. pd.to_numeric parses numeric strings directly
# (and leaves already-numeric values untouched), so there is no need to
# evaluate each cell with ast.literal_eval; non-numeric text raises.
df_data['tahun rilis'] = pd.to_numeric(df_data['tahun rilis']).astype(int)

# Duration -> integer. The unit suffix was stripped in the previous cell,
# so each value is expected to be a plain number stored as text.
df_data['durasi film'] = pd.to_numeric(df_data['durasi film']).astype(int)

# Genre -> list of trimmed genre names (split on commas); non-string
# values become an empty list.
df_data['genre'] = df_data['genre'].apply(
    lambda x: [i.strip() for i in x.split(',')] if isinstance(x, str) else []
)

# Netflix release date -> datetime. Values look like "September 25, 2021";
# surrounding whitespace is stripped and the format is given explicitly so
# parsing does not depend on format inference from the first row.
df_data['tanggal rilis netflix'] = pd.to_datetime(
    df_data['tanggal rilis netflix'].str.strip(),
    format='%B %d, %Y',
)


df_data.dtypes

tipe                             object
judul film                       object
sutradara                        object
pemeran                          object
negara                           object
tanggal rilis netflix    datetime64[ns]
tahun rilis                       int64
durasi film                       int64
genre                            object
synopsis                         object
dtype: object

In [1516]:
# Literal (non-regex) replacements applied to the stringified genre list,
# strictly in this order. Each pair is (text to find, replacement).
genre_replacements = (
    # Remove the list/quote punctuation left by converting the list to str.
    ("[", ""),
    ("]", ""),
    ("'", ""),
    ("'s ", ""),   # cannot match any more: every "'" was removed just above
    ("\"", ""),
    # Remove the media-type words from the genre names.
    ("TV Shows", ""),
    ("Movies", ""),
    ("TV ", ""),
    ("TV", ""),
    ("Series", ""),
    # Tidy the spacing left behind by the removals.
    (" ,", ","),
    ("  ", " "),
    # Shorten a few specific genre names.
    ("Stand-Up Comedy & Talk Shows", "Stand-Up Comedy"),
    ("Classic & Cult", "Cult"),
    ("Spanish-Language", "Spanish"),
)

# Cast to str first so the .str accessor is always valid.
genre_text = df_data['genre'].astype(str)
for old_text, new_text in genre_replacements:
    genre_text = genre_text.str.replace(old_text, new_text, regex=False)
df_data['genre_temp'] = genre_text

print(df_data['genre'].isnull().sum())

# Compare the original genre value with the cleaned text side by side.
df_data[['genre', 'genre_temp']]

0


Unnamed: 0,genre,genre_temp
0,[Documentaries],Documentaries
6,[Children & Family Movies],Children & Family
7,"[Dramas, Independent Movies, International Mov...","Dramas, Independent, International"
9,"[Comedies, Dramas]","Comedies, Dramas"
12,"[Dramas, International Movies]","Dramas, International"
...,...,...
8801,"[Dramas, International Movies, Thrillers]","Dramas, International, Thrillers"
8802,"[Cult Movies, Dramas, Thrillers]","Cult, Dramas, Thrillers"
8804,"[Comedies, Horror Movies]","Comedies, Horror"
8805,"[Children & Family Movies, Comedies]","Children & Family, Comedies"


In [1517]:
# Remove a comma at the very end of the country text (regex ",$").
# The column is cast to str first so the .str accessor is always valid.
country_text = df_data['negara'].astype(str)
df_data['negara'] = country_text.str.replace(",$", "", regex=True)

df_data['negara']

0                                           United States
6                                                  Unknow
7       United States, Ghana, Burkina Faso, United Kin...
9                                           United States
12                                Germany, Czech Republic
                              ...                        
8801                         United Arab Emirates, Jordan
8802                                        United States
8804                                        United States
8805                                        United States
8806                                                India
Name: negara, Length: 6128, dtype: object

In [1518]:
def _explode_delimited(frame, source, target=None, sep=', '):
    """Return a copy of ``frame`` with one row per item of a delimited column.

    The text in ``frame[source]`` is cast to ``str`` and split on ``sep``;
    the pieces are written to ``target`` (defaults to ``source``), the frame
    is exploded on that column and every piece is whitespace-stripped.
    The original index labels are kept, so they repeat for exploded rows.

    Parameters
    ----------
    frame : pandas.DataFrame
        Input frame; it is not modified.
    source : str
        Column holding the delimited text.
    target : str, optional
        Column that receives the individual items (overwritten in place if
        it already exists, so column order is unchanged).
    sep : str, default ', '
        Literal separator between items.

    Returns
    -------
    pandas.DataFrame
        The exploded frame.
    """
    target = source if target is None else target
    exploded = frame.copy()
    exploded[target] = exploded[source].astype(str).str.split(sep)
    exploded = exploded.explode(target)
    exploded[target] = exploded[target].str.strip()
    return exploded


# Genre: split the cleaned 'genre_temp' text into one genre per row, store it
# in 'genre', then drop the temporary column.
df_data = _explode_delimited(df_data, 'genre_temp', target='genre').drop(columns=['genre_temp'])

# Cast, director and country: one value per row each, processed in this
# order. Every combination of values becomes its own row.
for multi_value_column in ('pemeran', 'sutradara', 'negara'):
    df_data = _explode_delimited(df_data, multi_value_column)

In [1519]:
# For 'genre' and then 'negara': print the number of fully duplicated rows
# followed by the number of empty-string values in that column.
for checked_column in ('genre', 'negara'):
    duplicate_rows = df_data.duplicated().sum()
    blank_values = df_data[checked_column].astype(str).eq('').sum()
    print('Jumlah data duplikat: ', duplicate_rows)
    print('jumalah string kosong: ', blank_values)

Jumlah data duplikat:  55
jumalah string kosong:  409
Jumlah data duplikat:  55
jumalah string kosong:  24


In [1520]:
# Remove exact duplicate rows and confirm none are left.
df_data = df_data.drop_duplicates()
remaining_duplicates = df_data.duplicated().sum()
print('Jumlah data duplikat: ', remaining_duplicates, '\n')

# Treat empty genre strings as missing, then drop those rows.
df_data['genre'] = df_data['genre'].replace({'': np.nan})
df_data = df_data.dropna(subset=['genre'])

# Report how many missing genre values remain.
nan_akhir = df_data['genre'].isna().sum()
print(f"Jumlah nilai kosong : {nan_akhir}")

# Distinct genre labels after cleaning.
df_data['genre'].unique()

Jumlah data duplikat:  0 

Jumlah nilai kosong : 0


array(['Documentaries', 'Children & Family', 'Dramas', 'Independent',
       'International', 'Comedies', 'Thrillers', 'Romantic',
       'Music & Musicals', 'Horror', 'Sci-Fi & Fantasy',
       'Action & Adventure', 'Classic', 'Anime Features', 'Sports',
       'Cult', 'Faith & Spirituality', 'LGBTQ', 'Stand-Up Comedy'],
      dtype=object)

In [1521]:
# Treat empty country strings as missing, then drop those rows.
df_data['negara'] = df_data['negara'].replace({'': np.nan})
df_data = df_data.dropna(subset=['negara'])

# Report how many missing country values remain.
nan_akhir = df_data['negara'].isna().sum()
print(f"Jumlah nilai kosong : {nan_akhir}")

# Distinct country labels after cleaning.
df_data['negara'].unique()

Jumlah nilai kosong : 0


array(['United States', 'Unknow', 'Ghana', 'Burkina Faso',
       'United Kingdom', 'Germany', 'Ethiopia', 'Czech Republic', 'India',
       'France', 'China', 'Canada', 'South Africa', 'Japan', 'Nigeria',
       'Spain', 'Australia', 'Mexico', 'Italy', 'Romania', 'Argentina',
       'Venezuela', 'Hong Kong', 'Nepal', 'New Zealand', 'Brazil',
       'Greece', 'Colombia', 'Belgium', 'Switzerland', 'Bulgaria',
       'Algeria', 'Poland', 'Israel', 'Saudi Arabia', 'Thailand',
       'Indonesia', 'Egypt', 'Denmark', 'Kuwait', 'Netherlands',
       'Singapore', 'Malaysia', 'South Korea', 'Vietnam', 'Hungary',
       'Lebanon', 'Syria', 'Philippines', 'United Arab Emirates',
       'Sweden', 'Qatar', 'Mauritius', 'Austria', 'Turkey', 'Russia',
       'Taiwan', 'Cameroon', 'Palestine', 'Ireland', 'Kenya', 'Chile',
       'Uruguay', 'Cambodia', 'Bangladesh', 'Portugal', 'Cayman Islands',
       'Norway', 'Iceland', 'Serbia', 'Malta', 'Luxembourg', 'Namibia',
       'Angola', 'Peru', 'Mozambiqu

# Final Preview

In [1522]:
# Display the processed frame as the cell output.
df_data

Unnamed: 0,tipe,judul film,sutradara,pemeran,negara,tanggal rilis netflix,tahun rilis,durasi film,genre,synopsis
0,Movie,Dick Johnson Is Dead,Kirsten Johnson,Unknow,United States,2021-09-25,2020,90,Documentaries,"As her father nears the end of his life, filmm..."
6,Movie,My Little Pony: A New Generation,Robert Cullen,Vanessa Hudgens,Unknow,2021-09-24,2021,91,Children & Family,Equestria's divided. But a bright-eyed hero be...
6,Movie,My Little Pony: A New Generation,José Luis Ucha,Vanessa Hudgens,Unknow,2021-09-24,2021,91,Children & Family,Equestria's divided. But a bright-eyed hero be...
6,Movie,My Little Pony: A New Generation,Robert Cullen,Kimiko Glenn,Unknow,2021-09-24,2021,91,Children & Family,Equestria's divided. But a bright-eyed hero be...
6,Movie,My Little Pony: A New Generation,José Luis Ucha,Kimiko Glenn,Unknow,2021-09-24,2021,91,Children & Family,Equestria's divided. But a bright-eyed hero be...
...,...,...,...,...,...,...,...,...,...,...
8806,Movie,Zubaan,Mozez Singh,Manish Chaudhary,India,2019-03-02,2015,111,Music & Musicals,A scrappy but poor boy worms his way into a ty...
8806,Movie,Zubaan,Mozez Singh,Meghna Malik,India,2019-03-02,2015,111,Music & Musicals,A scrappy but poor boy worms his way into a ty...
8806,Movie,Zubaan,Mozez Singh,Malkeet Rauni,India,2019-03-02,2015,111,Music & Musicals,A scrappy but poor boy worms his way into a ty...
8806,Movie,Zubaan,Mozez Singh,Anita Shabdish,India,2019-03-02,2015,111,Music & Musicals,A scrappy but poor boy worms his way into a ty...


# Export

In [1524]:
# Write the processed frame to an Excel workbook, without the index column.
processed_xlsx_path = '../data/data-processed/netflix-film-tv-show.xlsx'
df_data.to_excel(processed_xlsx_path, index=False)