# Data Cleaning

In [1]:
import pandas as pd
import numpy as np

# Turn off pandas' chained-assignment check for the whole notebook
# (this silences SettingWithCopyWarning).
# NOTE(review): the setting is global and hides the warning everywhere below —
# confirm it is still required.
pd.options.mode.chained_assignment = None

## Movies

In [None]:
# Load the movies table from CSV and display it.
movies_df = pd.read_csv(filepath_or_buffer='datasets/movies.csv')
movies_df

In [None]:
# Inspect the dtype of each column.
movies_df.dtypes

In [None]:
# (number of rows, number of columns)
movies_df.shape

In [None]:
# Show every row that has at least one missing value.
movies_df.loc[movies_df.isna().any(axis='columns')]

In [None]:
# Count missing values per column.
movies_df.isna().sum()

In [None]:
# Remove records whose 'name' is missing.
# .copy() detaches the filtered result from the frame it was taken from, so any
# later column assignment works on an independent DataFrame rather than on a
# slice (the situation SettingWithCopyWarning exists to flag).
movies_df = movies_df.dropna(subset=['name']).copy()

In [None]:
# Check whether the 'id' column has duplicate values (number of repeated ids).
int(movies_df["id"].duplicated().sum())

In [None]:
# Then use 'id' as the index.
# Reassigning the result (instead of inplace=True) keeps the step explicit and
# chainable; pandas gains nothing from the in-place form.
movies_df = movies_df.set_index("id")

In [None]:
# typing columns

# The text columns ('name', 'tagline', 'description') are left untouched;
# casting them to the 'string' dtype is optional and currently not done.

# 'date' and 'minute' become pandas nullable integers ('Int64').
movies_df = movies_df.astype({'date': 'Int64', 'minute': 'Int64'})

In [None]:
# Rename columns (reassigning the result rather than mutating in place).
movies_df = movies_df.rename(
    columns={'minute': 'duration_in_minutes', 'date': 'release_year'}
)

## Languages

In [78]:
# Load the languages table from CSV and display it.
lang_df = pd.read_csv(filepath_or_buffer='datasets/languages.csv')
lang_df

Unnamed: 0,id,type,language
0,1000001,Language,English
1,1000002,Primary language,Korean
2,1000002,Spoken language,English
3,1000002,Spoken language,German
4,1000002,Spoken language,Korean
...,...,...,...
1038757,1941593,Language,Chinese
1038758,1941594,Language,English
1038759,1941595,Language,English
1038760,1941596,Language,Chinese


In [None]:
# Inspect the dtype of each column.
lang_df.dtypes

In [None]:
# Count missing values per column.
lang_df.isna().sum()

In [80]:
# typing columns: store 'type' as a categorical, then list its distinct values
lang_df = lang_df.astype({'type': 'category'})
lang_df['type'].unique()

192

In [None]:
# Count rows that are exact duplicates of an earlier row.
lang_df.duplicated().sum()

## Actors

In [None]:
# Load the actors table from CSV and display it.
actors_df = pd.read_csv(filepath_or_buffer='datasets/actors.csv')
actors_df

In [None]:
# Inspect the dtype of each column.
actors_df.dtypes

In [None]:
# Count missing values per column.
actors_df.isna().sum()

In [None]:
# check duplicated values

# Report the number of fully duplicated rows before removing them. A bare
# expression in the middle of a cell is evaluated but never displayed, so the
# count is printed explicitly.
print("fully duplicated rows:", actors_df.duplicated().sum())
actors_df = actors_df.drop_duplicates()

# Among the rows without a 'role', count repeats of the same (id, name) pair.
actors_df[actors_df['role'].isna()].duplicated(subset=['id', 'name']).sum()

In [None]:
# Drop the rows whose 'name' is missing; other columns may still contain NaN.
actors_df = actors_df.dropna(subset=['name'])

## Countries

In [None]:
# Load the countries table from CSV and display it.
countries_df = pd.read_csv(filepath_or_buffer='datasets/countries.csv')
countries_df

In [None]:
# Inspect the dtype of each column.
countries_df.dtypes

In [None]:
# Count missing values per column.
countries_df.isna().sum()

In [None]:
# Count rows that are exact duplicates of an earlier row.
countries_df.duplicated().sum()

## Crew

In [54]:
# Load the crew table from CSV and display it.
crew_df = pd.read_csv(filepath_or_buffer='datasets/crew.csv')
crew_df

Unnamed: 0,id,role,name
0,1000001,Director,Greta Gerwig
1,1000001,Producer,Tom Ackerley
2,1000001,Producer,Margot Robbie
3,1000001,Producer,Robbie Brenner
4,1000001,Producer,David Heyman
...,...,...,...
4720178,1941596,Casting,线雨轩
4720179,1941596,Editor,Eric Kwong Chi-Leung
4720180,1941596,Cinematography,Kenny Tse
4720181,1941596,Composer,胡小欧


In [68]:
# Inspect the dtype of each column.
crew_df.dtypes

id         int64
role    category
name      object
dtype: object

In [67]:
# typing columns: store 'role' as a categorical
# (its distinct values can be listed with crew_df['role'].unique())
crew_df = crew_df.astype({'role': 'category'})

In [None]:
# check for NaN values

# Treat the placeholder string 'Unknown' as a missing name first, so that the
# per-column count below includes those rows.
crew_df['name'] = crew_df['name'].replace('Unknown', np.nan)

# Last expression of the cell, so the counts are actually displayed.
crew_df.isna().sum()

In [74]:
# check for duplicated values

# Capture every row that takes part in a duplicate group (keep=False marks all
# occurrences) before the duplicates are removed.
crew_duplicate_rows = crew_df[crew_df.duplicated(keep=False)]
print("duplicated rows to drop:", crew_df.duplicated().sum())

crew_df = crew_df.drop_duplicates()

# Displayed as the cell output; expressions placed before the assignment above
# would be evaluated and silently discarded.
crew_duplicate_rows

## Genres

In [40]:
# Load the genres table from CSV and display it.
genres_df = pd.read_csv(filepath_or_buffer='datasets/genres.csv')
genres_df

Unnamed: 0,id,genre
0,1000001,Comedy
1,1000001,Adventure
2,1000002,Comedy
3,1000002,Thriller
4,1000002,Drama
...,...,...
1046844,1941563,Drama
1046845,1941566,Crime
1046846,1941569,Crime
1046847,1941596,Action


In [43]:
# Inspect the dtype of each column.
genres_df.dtypes

id        int64
genre    object
dtype: object

In [45]:
# Count missing values per column.
genres_df.isna().sum()

id       0
genre    0
dtype: int64

In [47]:
# Count rows that are exact duplicates of an earlier row.
genres_df.duplicated().sum()

np.int64(0)

## Posters

In [2]:
# Load the posters table from CSV and display it.
poster_df = pd.read_csv(filepath_or_buffer='datasets/posters.csv')
poster_df

Unnamed: 0,id,link
0,1000001,https://a.ltrbxd.com/resized/film-poster/2/7/7...
1,1000002,https://a.ltrbxd.com/resized/film-poster/4/2/6...
2,1000003,https://a.ltrbxd.com/resized/film-poster/4/7/4...
3,1000004,https://a.ltrbxd.com/resized/film-poster/5/1/5...
4,1000005,https://a.ltrbxd.com/resized/film-poster/2/4/0...
...,...,...
941592,1941593,
941593,1941594,
941594,1941595,https://a.ltrbxd.com/resized/film-poster/1/1/8...
941595,1941596,https://a.ltrbxd.com/resized/film-poster/1/1/8...


In [5]:
# Inspect the dtype of each column.
poster_df.dtypes

id       int64
link    object
dtype: object

In [12]:
# check for NaN values

# Printed explicitly: a bare expression that is not the last line of the cell
# is evaluated but never shown.
print(poster_df.isna().sum())

# Keep the rows without a link for inspection before they are removed.
posters_missing_link = poster_df[poster_df['link'].isna()]

# Drop every row that contains a missing value.
poster_df = poster_df.dropna()

posters_missing_link

In [13]:
# Count rows that are exact duplicates of an earlier row.
poster_df.duplicated().sum()

np.int64(0)

## Releases

In [29]:
# Load the releases table from CSV and display it.
releases_df = pd.read_csv(filepath_or_buffer='datasets/releases.csv')
releases_df

Unnamed: 0,id,country,date,type,rating
0,1000001,Andorra,2023-07-21,Theatrical,
1,1000001,Argentina,2023-07-20,Theatrical,ATP
2,1000001,Australia,2023-07-19,Theatrical,PG
3,1000001,Australia,2023-10-01,Digital,PG
4,1000001,Austria,2023-07-20,Theatrical,
...,...,...,...,...,...
1332777,1940967,USA,1909-01-01,Theatrical,
1332778,1940968,Sweden,1908-11-11,Theatrical,
1332779,1940969,France,1902-01-01,Theatrical,
1332780,1940970,France,1902-01-01,Theatrical,


In [28]:
# Inspect the dtype of each column.
releases_df.dtypes

id                  int64
country            object
date       datetime64[ns]
type               object
rating             object
dtype: object

In [41]:
# typing columns

# Parse 'date' with an explicit format, so a value that does not match
# YYYY-MM-DD raises instead of being guessed.
releases_df['date'] = pd.to_datetime(releases_df['date'], format='%Y-%m-%d')

# Store 'type' as a categorical.
releases_df['type'] = releases_df['type'].astype('category')

# List the distinct release types. Placed last so it is displayed; in the
# middle of the cell the result would be discarded.
releases_df['type'].unique()

In [42]:
# Count missing values per column.
releases_df.isna().sum()
# Disabled check, kept for reference: select rows whose 'date' string is not
# exactly 10 characters long.
# releases_df[releases_df['date'].str.len() != 10]

id              0
country         0
date            0
type            0
rating     998802
dtype: int64

In [45]:
# Check for duplicate values: count rows that exactly repeat an earlier row.
releases_df.duplicated().sum()

np.int64(0)

## Studios

In [47]:
# Load the studios table from CSV and display it.
studios_df = pd.read_csv(filepath_or_buffer='datasets/studios.csv')
studios_df

Unnamed: 0,id,studio
0,1000001,LuckyChap Entertainment
1,1000001,Heyday Films
2,1000001,NB/GG Pictures
3,1000001,Mattel
4,1000001,Warner Bros. Pictures
...,...,...
679278,1941596,上海猫眼影业有限公司
679279,1941596,坏小子（北京）传媒有限公司
679280,1941596,亚太国影（重庆）文化传媒有限公司
679281,1941596,凤凰传奇影业有限公司


In [49]:
# Inspect the dtype of each column.
studios_df.dtypes

id         int64
studio    object
dtype: object

In [56]:
# check for NaN values

# Printed explicitly: a bare expression that is not the last line of the cell
# is evaluated but never shown.
print(studios_df.isna().sum())

# Keep the rows without a studio name for inspection before they are removed.
studios_missing_name = studios_df[studios_df['studio'].isna()]

# Drop every row that contains a missing value.
studios_df = studios_df.dropna()

studios_missing_name

In [64]:
# check for duplicated values

# Capture every row that takes part in a duplicate group (keep=False marks all
# occurrences) before the duplicates are removed.
studios_duplicate_rows = studios_df[studios_df.duplicated(keep=False)]
print("duplicated rows to drop:", studios_df.duplicated().sum())

studios_df = studios_df.drop_duplicates()

# Displayed as the cell output; expressions placed before the assignment above
# would be evaluated and silently discarded.
studios_duplicate_rows

## Themes

In [68]:
# Load the themes table from CSV and display it.
themes_df = pd.read_csv(filepath_or_buffer='datasets/themes.csv')
themes_df

Unnamed: 0,id,theme
0,1000001,Humanity and the world around us
1,1000001,Crude humor and satire
2,1000001,Moving relationship stories
3,1000001,Emotional and captivating fantasy storytelling
4,1000001,Surreal and thought-provoking visions of life ...
...,...,...
125636,1835643,Noir and dark crime dramas
125637,1835643,Intriguing and suspenseful murder mysteries
125638,1849827,Faith and religion
125639,1849827,Faith and spiritual journeys


In [70]:
# Inspect the dtype of each column.
themes_df.dtypes

id        int64
theme    object
dtype: object

In [74]:
# Number of distinct themes. Printed explicitly, because only the last
# expression of a cell is displayed automatically.
print("distinct themes:", len(themes_df['theme'].unique()))

# The distinct theme labels themselves.
themes_df['theme'].unique()

array(['Humanity and the world around us', 'Crude humor and satire',
       'Moving relationship stories',
       'Emotional and captivating fantasy storytelling',
       'Surreal and thought-provoking visions of life and death',
       'Quirky and endearing relationships',
       'Amusing jokes and witty satire',
       'Laugh-out-loud relationship entanglements',
       'Intense violence and sexual transgression',
       'Twisted dark psychological thriller',
       'Heartbreaking and moving family drama',
       'Enduring stories of family and marital drama',
       'Touching and sentimental family stories',
       'Intense political and terrorist thrillers',
       'Powerful stories of heartbreak and suffering',
       'Dreamlike, quirky, and surreal storytelling',
       'Challenging or sexual themes & twists',
       'Graphic violence and brutal revenge', 'Song and dance',
       'Dazzling vocal performances and musicals',
       'Captivating relationships and charming romance',


In [76]:
# Count missing values per column.
themes_df.isna().sum()

id       0
theme    0
dtype: int64

In [77]:
# Count rows that are exact duplicates of an earlier row.
themes_df.duplicated().sum()

np.int64(0)

## The Oscar Awards

In [122]:
# Load the Oscar awards table from CSV and display it.
oscars_df = pd.read_csv(filepath_or_buffer='datasets/the_oscar_awards.csv')
oscars_df

Unnamed: 0,year_film,year_ceremony,ceremony,category,name,film,winner
0,1927,1928,1,ACTOR,Richard Barthelmess,The Noose,False
1,1927,1928,1,ACTOR,Emil Jannings,The Last Command,True
2,1927,1928,1,ACTRESS,Louise Dresser,A Ship Comes In,False
3,1927,1928,1,ACTRESS,Janet Gaynor,7th Heaven,True
4,1927,1928,1,ACTRESS,Gloria Swanson,Sadie Thompson,False
...,...,...,...,...,...,...,...
10884,2023,2024,96,WRITING (Original Screenplay),Written by Celine Song,Past Lives,False
10885,2023,2024,96,JEAN HERSHOLT HUMANITARIAN AWARD,,,True
10886,2023,2024,96,HONORARY AWARD,"To Angela Bassett, who has inspired audiences ...",,True
10887,2023,2024,96,HONORARY AWARD,"To Mel Brooks, for his comedic brilliance, pro...",,True


In [123]:
# Inspect the dtype of each column.
oscars_df.dtypes

year_film         int64
year_ceremony     int64
ceremony          int64
category         object
name             object
film             object
winner             bool
dtype: object

In [124]:
# typing columns

# Distinct ceremony numbers. Printed explicitly, because a bare expression in
# the middle of a cell is evaluated but never displayed.
print(oscars_df['ceremony'].unique())

# year_film is expected to be always <= year_ceremony; enforce it instead of
# leaving the check commented out.
assert (oscars_df['year_film'] <= oscars_df['year_ceremony']).all(), \
    "found rows where year_film > year_ceremony"

# Store 'category' as a categorical.
oscars_df['category'] = oscars_df['category'].astype('category')

In [128]:
# check for NaN values

# Printed explicitly: only the last expression of a cell is displayed, so a
# bare count placed here would be discarded.
print(oscars_df.isna().sum())

# Inspect the rows of one award category.
oscars_df[oscars_df['category'] == "JEAN HERSHOLT HUMANITARIAN AWARD"]

Unnamed: 0,year_film,year_ceremony,ceremony,category,name,film,winner
3259,1956,1957,29,JEAN HERSHOLT HUMANITARIAN AWARD,Y. Frank Freeman,,True
3357,1957,1958,30,JEAN HERSHOLT HUMANITARIAN AWARD,Samuel Goldwyn,,True
3589,1959,1960,32,JEAN HERSHOLT HUMANITARIAN AWARD,Bob Hope,,True
3710,1960,1961,33,JEAN HERSHOLT HUMANITARIAN AWARD,Sol Lesser,,True
3833,1961,1962,34,JEAN HERSHOLT HUMANITARIAN AWARD,George Seaton,,True
3955,1962,1963,35,JEAN HERSHOLT HUMANITARIAN AWARD,Steve Broidy,,True
4324,1965,1966,38,JEAN HERSHOLT HUMANITARIAN AWARD,Edmond L. DePatie,,True
4447,1966,1967,39,JEAN HERSHOLT HUMANITARIAN AWARD,George Bagnall,,True
4557,1967,1968,40,JEAN HERSHOLT HUMANITARIAN AWARD,Gregory Peck,,True
4665,1968,1969,41,JEAN HERSHOLT HUMANITARIAN AWARD,Martha Raye,,True


In [117]:
# Count rows that are exact duplicates of an earlier row.
# NOTE(review): this cell only counts; nothing is dropped here — confirm
# whether any duplicates reported should be removed.
oscars_df.duplicated().sum()

np.int64(10774)