# Data Cleaning

In [None]:
import pandas as pd
import numpy as np

# Turn off pandas' chained-assignment check (SettingWithCopyWarning) for the
# whole session, so column assignments on filtered frames below stay silent.
pd.options.mode.chained_assignment = None

## Movies

In [None]:
# Load the movies table from CSV and preview it.
movies_df = pd.read_csv('datasets/movies.csv')
movies_df

In [None]:
# Inspect the dtype pandas inferred for each column.
movies_df.dtypes

In [None]:
# (number of rows, number of columns) of the loaded table.
movies_df.shape

In [None]:
# Show every record that has a missing value in at least one column.
movies_df.loc[movies_df.isna().any(axis='columns')]

In [None]:
# Count missing values per column.
movies_df.isna().sum()

In [None]:
# Drop records that have no 'name'.
# Take an explicit copy: the following cells modify this frame (index, dtype
# casts, rename), and a copy guarantees those edits act on an independent
# object instead of relying on SettingWithCopyWarning being suppressed.
movies_df = movies_df[movies_df['name'].notna()].copy()

In [None]:
# Number of rows whose 'id' repeats the 'id' of an earlier row
# (0 means every id is unique).
int(movies_df["id"].duplicated().sum())

In [None]:
# Use 'id' as the row index (modifies movies_df in place).
# NOTE(review): after this runs 'id' is the index rather than a column, so
# re-running this cell without reloading the data is expected to fail.
movies_df.set_index("id", inplace=True)

In [None]:
# Column dtypes.
#
# The text columns ('name', 'tagline', 'description') are left as loaded;
# cast them with .astype('string') here if that turns out to be necessary.
#
# 'date' and 'minute' become the nullable integer dtype 'Int64', which can
# hold whole numbers alongside missing values.
movies_df = movies_df.astype({'date': 'Int64', 'minute': 'Int64'})

In [None]:
# Give the two numeric columns self-describing names.
movies_df = movies_df.rename(
    columns={
        'minute': 'duration_in_minutes',
        'date': 'release_year',
    }
)

## Languages

In [None]:
# Load the languages table from CSV and preview it.
lang_df = pd.read_csv('datasets/languages.csv')
lang_df

In [None]:
# Inspect the dtype pandas inferred for each column.
lang_df.dtypes

In [None]:
# Count missing values per column.
lang_df.isna().sum()

In [None]:
# Store 'type' as a pandas categorical, then list its distinct values.
lang_df = lang_df.astype({'type': 'category'})
lang_df['type'].unique()

In [None]:
# Count rows that are exact duplicates of an earlier row.
lang_df.duplicated().sum()

## Actors

In [None]:
# Load the actors table from CSV and preview it.
actors_df = pd.read_csv('datasets/actors.csv')
actors_df

In [None]:
# Inspect the dtype pandas inferred for each column.
actors_df.dtypes

In [None]:
# Count missing values per column.
actors_df.isna().sum()

In [None]:
# Exact duplicate rows: report how many there are, then drop them.
# The count is printed explicitly because a notebook cell only echoes its
# last expression; as a bare expression here it would be thrown away.
print('duplicated rows:', actors_df.duplicated().sum())
actors_df = actors_df.drop_duplicates()

# Among the rows with a missing 'role', count how many repeat an earlier
# (id, name) pair.
actors_df[actors_df['role'].isna()].duplicated(subset=['id', 'name']).sum()

In [None]:
# Drop the rows whose 'name' is missing; NaN in other columns is kept.
actors_df = actors_df.dropna(subset=['name'])

## Countries

In [None]:
# Load the countries table from CSV and preview it.
countries_df = pd.read_csv('datasets/countries.csv')
countries_df

In [None]:
# Inspect the dtype pandas inferred for each column.
countries_df.dtypes

In [None]:
# Count missing values per column.
countries_df.isna().sum()

In [None]:
# Count rows that are exact duplicates of an earlier row.
countries_df.duplicated().sum()

## Crew

In [None]:
# Load the crew table from CSV and preview it.
crew_df = pd.read_csv('datasets/crew.csv')
crew_df

In [None]:
# Inspect the dtype pandas inferred for each column.
crew_df.dtypes

In [None]:
# Store 'role' as a pandas categorical.
# (Run crew_df['role'].unique() to see the distinct roles.)
crew_df = crew_df.astype({'role': 'category'})

In [None]:
# Treat the literal placeholder 'Unknown' in 'name' as a missing value.
crew_df['name'] = crew_df['name'].replace('Unknown', np.nan)

# Count missing values per column. This runs after the replacement so the
# 'name' count includes the former placeholders, and it is the cell's last
# expression so the notebook actually displays it.
crew_df.isna().sum()

In [None]:
# Report exact duplicate rows, then drop them.
# Both reports are printed explicitly: the cell ends with an assignment, so
# bare expressions placed before it would never be shown.
print('duplicated rows:', crew_df.duplicated().sum())
# keep=False lists every member of each duplicate group, not only the repeats.
print(crew_df[crew_df.duplicated(keep=False)])

crew_df = crew_df.drop_duplicates()

## Genres

In [None]:
# Load the genres table from CSV and preview it.
genres_df = pd.read_csv('datasets/genres.csv')
genres_df

In [None]:
# Inspect the dtype pandas inferred for each column.
genres_df.dtypes

In [None]:
# Count missing values per column.
genres_df.isna().sum()

In [None]:
# Count rows that are exact duplicates of an earlier row.
genres_df.duplicated().sum()

## Posters

In [None]:
# Load the posters table from CSV and preview it.
poster_df = pd.read_csv('datasets/posters.csv')
poster_df

In [None]:
# Inspect the dtype pandas inferred for each column.
poster_df.dtypes

In [None]:
# Report missing values, then drop the incomplete rows.
# The reports are printed explicitly: the cell ends with an assignment, so
# bare expressions placed before it would never be shown.
print(poster_df.isna().sum())
print(poster_df[poster_df['link'].isna()])

# Drops rows with a missing value in ANY column.
# NOTE(review): if only a missing 'link' should disqualify a row, use
# dropna(subset=['link']) instead — confirm the intent.
poster_df = poster_df.dropna()

In [None]:
# Count rows that are exact duplicates of an earlier row.
poster_df.duplicated().sum()

## Releases

In [None]:
# Load the releases table from CSV and preview it.
releases_df = pd.read_csv('datasets/releases.csv')
releases_df

In [None]:
# Inspect the dtype pandas inferred for each column.
releases_df.dtypes

In [None]:
# Parse 'date' as a datetime using the fixed YYYY-MM-DD layout.
releases_df['date'] = pd.to_datetime(releases_df['date'], format='%Y-%m-%d')

# Store 'type' as a pandas categorical, then list its distinct values.
# The listing is the cell's last expression so the notebook displays it;
# placed before the assignment it would be computed and thrown away.
releases_df['type'] = releases_df['type'].astype('category')
releases_df['type'].unique()

In [None]:
# Count missing values per column.
releases_df.isna().sum()
# Optional check for malformed date strings (anything not 10 characters long).
# NOTE(review): the .str accessor needs the raw string column, so this is
# only expected to work before 'date' is converted with pd.to_datetime.
# releases_df[releases_df['date'].str.len() != 10]

In [None]:
# Count rows that are exact duplicates of an earlier row.
releases_df.duplicated().sum()

## Studios

In [None]:
# Load the studios table from CSV and preview it.
studios_df = pd.read_csv('datasets/studios.csv')
studios_df

In [None]:
# Inspect the dtype pandas inferred for each column.
studios_df.dtypes

In [None]:
# Report missing values, then drop the incomplete rows.
# The reports are printed explicitly: the cell ends with an assignment, so
# bare expressions placed before it would never be shown.
print(studios_df.isna().sum())
print(studios_df[studios_df['studio'].isna()])

# Drops rows with a missing value in any column.
studios_df = studios_df.dropna()

In [None]:
# Report exact duplicate rows, then drop them.
# Both reports are printed explicitly: the cell ends with an assignment, so
# bare expressions placed before it would never be shown.
print('duplicated rows:', studios_df.duplicated().sum())
# keep=False lists every member of each duplicate group, not only the repeats.
print(studios_df[studios_df.duplicated(keep=False)])

studios_df = studios_df.drop_duplicates()

## Themes

In [None]:
# Load the themes table from CSV and preview it.
themes_df = pd.read_csv('datasets/themes.csv')
themes_df

In [None]:
# Inspect the dtype pandas inferred for each column.
themes_df.dtypes

In [None]:
# How many distinct themes there are. Printed explicitly because only the
# cell's last expression is echoed by the notebook.
print('distinct themes:', len(themes_df['theme'].unique()))

# The distinct theme values themselves.
themes_df['theme'].unique()

In [None]:
# Count missing values per column.
themes_df.isna().sum()

In [None]:
# Count rows that are exact duplicates of an earlier row.
themes_df.duplicated().sum()

## The Oscar Awards

In [None]:
# Load the Oscar awards table from CSV and preview it.
oscars_df = pd.read_csv('datasets/the_oscar_awards.csv')
oscars_df

In [None]:
# Inspect the dtype pandas inferred for each column.
oscars_df.dtypes

In [None]:
# List the distinct 'ceremony' values. Printed explicitly: the cell ends
# with an assignment, so a bare expression here would never be shown.
print(oscars_df['ceremony'].unique())

# Expected invariant: year_film <= year_ceremony. To list any violations:
# oscars_df[oscars_df['year_film'] > oscars_df['year_ceremony']]

# Store 'category' as a pandas categorical.
oscars_df['category'] = oscars_df['category'].astype('category')

In [None]:
# Count missing values per column. Printed explicitly because only the
# cell's last expression is echoed by the notebook.
print(oscars_df.isna().sum())

# Inspect the rows of this one category.
# NOTE(review): presumably where the missing values come from — confirm.
oscars_df[oscars_df['category'] == "JEAN HERSHOLT HUMANITARIAN AWARD"]

In [None]:
# Count exact duplicate rows. Printed explicitly because only the cell's
# last expression is echoed by the notebook.
print('duplicated rows:', oscars_df.duplicated().sum())

# keep=False lists every member of each duplicate group, not only the repeats.
oscars_df[oscars_df.duplicated(keep=False)]

## Rotten Tomatoes Reviews

In [None]:
# Load the Rotten Tomatoes reviews table from CSV and preview it.
reviews_df = pd.read_csv('datasets/rotten_tomatoes_reviews.csv')
reviews_df

In [None]:
# Inspect the dtype pandas inferred for each column.
reviews_df.dtypes

In [None]:
# Column dtypes: 'review_type' becomes a pandas categorical and
# 'review_date' is parsed as a datetime using the fixed YYYY-MM-DD layout.
reviews_df = reviews_df.assign(
    review_type=reviews_df["review_type"].astype('category'),
    review_date=pd.to_datetime(reviews_df["review_date"], format='%Y-%m-%d'),
)

In [None]:
# Drop the redundant 'review_' prefix from these four column names.
reviews_df = reviews_df.rename(
    {
        'review_type': 'type',
        'review_score': 'score',
        'review_date': 'date',
        'review_content': 'content',
    },
    axis='columns',
)

In [None]:
# Count missing values per column.
reviews_df.isna().sum()

In [None]:
# Report exact duplicate rows, then drop them.
# The count is printed explicitly: the cell ends with an assignment, so a
# bare expression placed before it would never be shown.
print('duplicated rows:', reviews_df.duplicated().sum())

reviews_df = reviews_df.drop_duplicates()