# Data Cleansing

In [138]:
import pandas as pd
import numpy as np

# Turn off pandas' chained-assignment check (SettingWithCopyWarning) for the whole notebook.
# NOTE(review): a global switch like this can hide writes that land on a temporary copy;
# prefer an explicit `.copy()` after filtering so the warning never needs silencing.
pd.options.mode.chained_assignment = None

## Movies

In [139]:
# Load the raw movies table and display it (pandas renders a truncated preview).
movies_df = pd.read_csv('datasets/movies.csv')
movies_df

Unnamed: 0,id,name,date,tagline,description,minute,rating
0,1000001,Barbie,2023.0,She's everything. He's just Ken.,Barbie and Ken are having the time of their li...,114.0,3.86
1,1000002,Parasite,2019.0,Act like you own the place.,"All unemployed, Ki-taek's family takes peculia...",133.0,4.56
2,1000003,Everything Everywhere All at Once,2022.0,The universe is so much bigger than you realize.,An aging Chinese immigrant is swept up in an i...,140.0,4.30
3,1000004,Fight Club,1999.0,Mischief. Mayhem. Soap.,A ticking-time-bomb insomniac and a slippery s...,139.0,4.27
4,1000005,La La Land,2016.0,Here's to the fools who dream.,"Mia, an aspiring actress, serves lattes to mov...",129.0,4.09
...,...,...,...,...,...,...,...
941592,1941593,神笛,,,,,
941593,1941594,蟲極道蜜団子抗争編 壱ノ巻,,,Shinjuku forest at night. In the sap taverns o...,30.0,
941594,1941595,蟲極道蜜団子抗争編 弐ノ巻,,,"The city that never sleeps, where insects gath...",30.0,
941595,1941596,重生,,,"In a world where order has broken down, darkne...",,


In [140]:
# Inspect the dtype pandas inferred for each column.
movies_df.dtypes

id               int64
name            object
date           float64
tagline         object
description     object
minute         float64
rating         float64
dtype: object

In [141]:
# (rows, columns) of the raw movies table.
movies_df.shape

(941597, 7)

In [142]:
# Show every row that has a missing value in at least one column.
movies_df.loc[movies_df.isna().any(axis="columns")]

Unnamed: 0,id,name,date,tagline,description,minute,rating
34,1000035,Black Swan,2010.0,,A journey through the psyche of a young baller...,108.0,4.15
68,1000069,Past Lives,2023.0,,"Nora and Hae Sung, two childhood friends, are ...",106.0,4.18
133,1000134,Toy Story,1995.0,,"Led by Woody, Andy's toys live happily in his ...",81.0,4.12
146,1000147,Requiem for a Dream,2000.0,,The hopes and dreams of four ambitious people ...,102.0,4.10
162,1000163,Asteroid City,2023.0,,Set in a fictional American desert town circa ...,105.0,3.52
...,...,...,...,...,...,...,...
941592,1941593,神笛,,,,,
941593,1941594,蟲極道蜜団子抗争編 壱ノ巻,,,Shinjuku forest at night. In the sap taverns o...,30.0,
941594,1941595,蟲極道蜜団子抗争編 弐ノ巻,,,"The city that never sleeps, where insects gath...",30.0,
941595,1941596,重生,,,"In a world where order has broken down, darkne...",,


In [143]:
# Number of missing values in each column.
movies_df.isna().sum()

id                  0
name               10
date            91913
tagline        802210
description    160812
minute         181570
rating         850598
dtype: int64

In [144]:
# Remove records whose 'name' is missing.
# The explicit .copy() makes the filtered result an independent DataFrame, so
# later modifications of movies_df are unambiguous and do not raise
# SettingWithCopyWarning (no need to rely on the warning being silenced).
movies_df = movies_df[movies_df['name'].notna()].copy()

In [145]:
# 'id' is meant to become the index, so confirm no id value occurs more than once.
int(movies_df["id"].duplicated().sum())

0

In [146]:
# ids are unique, so use them as the row index
movies_df = movies_df.set_index("id")

In [147]:
# Column typing.
# 'date' and 'minute' are read as float64 because they contain missing values;
# cast both to pandas' nullable integer dtype so they hold whole numbers and <NA>.
# TODO: decide whether the text columns ('name', 'tagline', 'description')
# should be converted to the 'string' dtype; they are left as 'object' for now.
movies_df = movies_df.astype({'date': 'Int64', 'minute': 'Int64'})

In [148]:
# Give the year and runtime columns self-describing names.
movies_df = movies_df.rename(
    columns={
        'date': 'release_year',
        'minute': 'duration_in_minutes',
    }
)

## Languages

In [149]:
# Load the raw languages table and display it (pandas renders a truncated preview).
lang_df = pd.read_csv('datasets/languages.csv')
lang_df

Unnamed: 0,id,type,language
0,1000001,Language,English
1,1000002,Primary language,Korean
2,1000002,Spoken language,English
3,1000002,Spoken language,German
4,1000002,Spoken language,Korean
...,...,...,...
1038757,1941593,Language,Chinese
1038758,1941594,Language,English
1038759,1941595,Language,English
1038760,1941596,Language,Chinese


In [157]:
# Inspect the column dtypes.
# NOTE(review): the saved output lists 'type' as category although the cast to
# category only appears in a later cell — this looks like out-of-order
# execution; confirm with Restart Kernel -> Run All.
lang_df.dtypes

id             int64
type        category
language      object
dtype: object

In [159]:
# Count missing values per column.
lang_df.isna().sum()

id          0
type        0
language    0
dtype: int64

In [160]:
# 'type' holds only a handful of distinct labels, so store it as a categorical,
# then list the labels that actually occur.
lang_df = lang_df.astype({'type': 'category'})
lang_df['type'].unique()

['Language', 'Primary language', 'Spoken language']
Categories (3, object): ['Language', 'Primary language', 'Spoken language']

In [167]:
# Count rows that are exact duplicates of an earlier row (all columns equal).
lang_df.duplicated().sum()

np.int64(0)

## Actors

In [170]:
# Load the raw actors table and display it (pandas renders a truncated preview).
actors_df = pd.read_csv('datasets/actors.csv')
actors_df

Unnamed: 0,id,name,role
0,1000001,Margot Robbie,Barbie
1,1000001,Ryan Gosling,Ken
2,1000001,America Ferrera,Gloria
3,1000001,Ariana Greenblatt,Sasha
4,1000001,Issa Rae,Barbie
...,...,...,...
5798445,1941596,Marc Ma,Ba Cai/巴莱
5798446,1941596,线雨轩,Tata/塔塔
5798447,1941596,Jiang Yixuan,Zuo Yila（Zoila）/佐伊拉
5798448,1941597,Hiroshi Mikami,


In [172]:
# Inspect the dtype pandas inferred for each column.
actors_df.dtypes

id       int64
name    object
role    object
dtype: object

In [185]:
# Count missing values per column.
actors_df.isna().sum()

id            0
name          0
role    1361121
dtype: int64

In [197]:
# Report how many rows are exact duplicates before removing them. Only the last
# expression of a cell is displayed, so the count has to be printed explicitly;
# otherwise it is computed and silently thrown away.
print(f"exact duplicate rows: {actors_df.duplicated().sum()}")
actors_df = actors_df.drop_duplicates()

# Among rows without a role, count repeated (id, name) pairs.
# NOTE(review): after drop_duplicates() this is expected to be 0 by construction —
# two rows that agree on id and name and both lack a role are exact duplicates.
# If the intent was to find the same actor listed twice for one film regardless
# of role, the check should run on the full frame — confirm.
actors_df[actors_df['role'].isna()].duplicated(subset=['id', 'name']).sum()

np.int64(0)

In [184]:
# keep only the rows where the actor's name is present
actors_df = actors_df[actors_df['name'].notna()]

## Countries

In [199]:
# Load the raw countries table and display it (pandas renders a truncated preview).
countries_df = pd.read_csv('datasets/countries.csv')
countries_df

Unnamed: 0,id,country
0,1000001,UK
1,1000001,USA
2,1000002,South Korea
3,1000003,USA
4,1000004,Germany
...,...,...
693471,1941593,China
693472,1941594,USA
693473,1941595,USA
693474,1941596,China


In [201]:
# Inspect the dtype pandas inferred for each column.
countries_df.dtypes

id          int64
country    object
dtype: object

In [203]:
# Count missing values per column.
countries_df.isna().sum()

id         0
country    0
dtype: int64

In [205]:
# Count rows that are exact duplicates of an earlier row (all columns equal).
countries_df.duplicated().sum()

np.int64(0)

## Crew

In [208]:
# Load the raw crew table and display it (pandas renders a truncated preview).
crew_df = pd.read_csv('datasets/crew.csv')
crew_df

Unnamed: 0,id,role,name
0,1000001,Director,Greta Gerwig
1,1000001,Producer,Tom Ackerley
2,1000001,Producer,Margot Robbie
3,1000001,Producer,Robbie Brenner
4,1000001,Producer,David Heyman
...,...,...,...
4720178,1941596,Casting,线雨轩
4720179,1941596,Editor,Eric Kwong Chi-Leung
4720180,1941596,Cinematography,Kenny Tse
4720181,1941596,Composer,胡小欧


In [211]:
# Inspect the dtype pandas inferred for each column.
crew_df.dtypes

id       int64
role    object
name    object
dtype: object

In [229]:
# check for NaN values
crew_df.isna().sum()
crew_df['name'] = crew_df['name'].replace('Unknown', np.nan)