# Data Cleaning
There is one section for each dataset (CSV file).
Every section has:
1. Data Understanding
2. Data Cleaning
3. Final Result

First of all, the library imports and options:

In [None]:
import pandas as pd
import numpy as np

from utils.movies import find_matching

# Silence pandas' chained-assignment (SettingWithCopy) warning, which would
# otherwise be raised when columns of a filtered DataFrame are reassigned.
# NOTE(review): this is a global switch, so it also hides genuine
# "assignment on a copy" mistakes.
pd.options.mode.chained_assignment = None

# BUG ALERT: in PyCharm 2024.3 the error "Table data could not be loaded" is
# often caused by columns exceeding the display area. Enlarging the display
# area (e.g. by hiding sidebars such as the project source files panel)
# usually resolves it.

## Movies

In [None]:
# Import 'movies.csv' dataset (path relative to the working directory)
movies_df = pd.read_csv('datasets/movies.csv')

### 1. Data Understanding

In [None]:
# Preview the first rows to see which columns are available
movies_df.head()

In [None]:
# (number of rows, number of columns)
movies_df.shape

In [None]:
# Data type pandas inferred for each column
movies_df.dtypes

### 2. Data Cleaning

In [None]:
# Give the columns more descriptive names, then show the resulting header
movies_df = movies_df.rename(columns={
    'name': 'title',
    'minute': 'duration_in_minutes',
    'date': 'release_year',
})
movies_df.columns

In [None]:
# Check for null values: number of missing entries per column
movies_df.isna().sum()

There are null values in most of the columns.
'**release_year**', '**tagline**', '**description**', '**duration_in_minutes**' and '**rating**' don't cause any problems so we'll keep them.

In [None]:
# A few movies have no title and cannot be used: set them aside so they can
# be displayed for inspection, then remove them from the dataset
missing_title_mask = movies_df['title'].isna()
no_title = movies_df[missing_title_mask]
movies_df = movies_df.dropna(subset=['title'])
no_title

In [None]:
# Check for duplicate rows: number of rows identical to an earlier row
movies_df.duplicated().sum()

In [None]:
# Promote 'id' to the index, but only when it uniquely identifies each movie
# (previously the index was set even when duplicates were found)
duplicates_id = int(movies_df['id'].duplicated().sum())
if duplicates_id == 0:
    movies_df.set_index("id", inplace=True)
else:
    print("'id' is not unique: keeping it as a regular column")
print("'id' duplicates:", duplicates_id)

In [None]:
# Cast year and duration to pandas' nullable integer type ('Int64'),
# which can hold whole numbers alongside missing values
int_columns = ['release_year', 'duration_in_minutes']
for column in int_columns:
    movies_df[column] = movies_df[column].astype('Int64')
movies_df[int_columns].dtypes

#### Deep Clean
Let's look inside some columns to see their most frequent values.

In [None]:
# Occurrences of each distinct description, most frequent first
movies_df['description'].value_counts()

Many movies seem to have a placeholder such as "Plot Unavailable" (or similar) as their description instead of a null value. The other fields seem fine.<br>
Let's try to fix as many as possible (only the most frequent variations are handled, so this is not 100% accurate).

In [None]:
from utils.movies import null_description_keywords

# Find the rows whose description is a placeholder rather than a real plot.
# A copy is taken so the manual check below still shows the original texts.
matches = find_matching(movies_df, 'description', null_description_keywords).copy()

# Replace the placeholders with NaN in movies_df itself. Previously NaN was
# assigned on the object returned by find_matching, which left movies_df
# unchanged.
# NOTE(review): assumes find_matching returns the matching rows of movies_df
# with their original index labels - confirm against utils.movies.
movies_df.loc[matches.index, 'description'] = np.nan

# Manual check to be sure to not overwrite real descriptions
matches['description'].value_counts()

### 3. Final Result

All datasets reference the **movies** dataset. A movie is uniquely identified by its **id**, and a movie id can have multiple occurrences in the other datasets.

In [None]:
# First rows of the cleaned movies dataset
movies_df.head()

## Languages

In [None]:
# Import 'languages.csv' dataset (path relative to the working directory)
lang_df = pd.read_csv('datasets/languages.csv')

### 1. Data Understanding

In [None]:
# Preview the first rows to see which columns are available
lang_df.head()

In [None]:
# (number of rows, number of columns)
lang_df.shape

In [None]:
# Data type pandas inferred for each column
lang_df.dtypes

### 2. Data Cleaning

In [None]:
# Rename 'id' to 'movie_id'
# NOTE(review): presumably because the column holds the id of the movie the row refers to - confirm
lang_df = lang_df.rename(columns={'id': 'movie_id'})

In [None]:
# Check for null values: number of missing entries per column
lang_df.isna().sum()

In [None]:
# Check for duplicate rows: number of rows identical to an earlier row
lang_df.duplicated().sum()

In [None]:
# Collect the distinct values of 'type' (before the conversion, for display)
# and store the column with the category data type
lang_types = lang_df['type'].unique()
lang_df = lang_df.astype({'type': 'category'})
lang_types

### 3. Final Result

In [None]:
# First rows of the cleaned languages dataset
lang_df.head()

## Actors

In [None]:
# Import 'actors.csv' dataset (path relative to the working directory)
actors_df = pd.read_csv('datasets/actors.csv')

### 1. Data Understanding

In [None]:
# Preview the first rows to see which columns are available
actors_df.head()

In [None]:
# (number of rows, number of columns)
actors_df.shape

In [None]:
# Data type pandas inferred for each column
actors_df.dtypes

### 2. Data Cleaning

In [None]:
# Rename 'id' to 'movie_id'
# NOTE(review): presumably because the column holds the id of the movie the row refers to - confirm
actors_df = actors_df.rename(columns={'id': 'movie_id'})

In [None]:
# Check for NaN values: number of missing entries per column
actors_df.isna().sum()

There are a lot of missing roles, but there is nothing to do about it.

In [None]:
# A few actors have no name and cannot be used: set them aside so they can
# be displayed for inspection, then remove them from the dataset
unnamed_mask = actors_df['name'].isna()
no_name = actors_df[unnamed_mask]
actors_df = actors_df.dropna(subset=['name'])
no_name

In [None]:
# Report how many rows repeat an earlier row and keep a small sample of the
# duplicated groups (every occurrence included) for display
duplicated_mask = actors_df.duplicated(keep=False)
print('Duplicated rows:', actors_df.duplicated().sum())
actors_duplicates = actors_df[duplicated_mask].head(6)

# Remove the duplicates, keeping the first occurrence of each row
actors_df = actors_df.drop_duplicates()

actors_duplicates

## Countries

In [None]:
# Import 'countries.csv' dataset and display it
countries_df = pd.read_csv('datasets/countries.csv')
countries_df

In [None]:
# Data type pandas inferred for each column
countries_df.dtypes

In [None]:
# Check for NaN values: number of missing entries per column
countries_df.isna().sum()

In [None]:
# Check for duplicated rows: number of rows identical to an earlier row
countries_df.duplicated().sum()

## Crew

In [None]:
# Import 'crew.csv' dataset and display it
crew_df = pd.read_csv('datasets/crew.csv')
crew_df

In [None]:
# Data type pandas inferred for each column
crew_df.dtypes

In [None]:
# Typing columns: store 'role' with the category data type
# (its distinct values can be inspected with crew_df['role'].unique())
crew_df = crew_df.astype({'role': 'category'})

In [None]:
# Treat the placeholder name 'Unknown' as a missing value
crew_df['name'] = crew_df['name'].replace('Unknown', np.nan)

# Check for NaN values. This runs after the replacement so the placeholders
# are counted, and it is the last expression so the notebook displays it.
crew_df.isna().sum()

In [None]:
# Check for duplicated rows. The count is printed because a notebook only
# displays the last expression of a cell.
print('Duplicated rows:', crew_df.duplicated().sum())
crew_duplicates = crew_df[crew_df.duplicated(keep=False)]

# Drop the duplicates, keeping the first occurrence of each row
crew_df = crew_df.drop_duplicates()

# Show every occurrence of the rows that were duplicated
crew_duplicates

## Genres

In [None]:
# Import 'genres.csv' dataset and display it
genres_df = pd.read_csv('datasets/genres.csv')
genres_df

In [None]:
# Data type pandas inferred for each column
genres_df.dtypes

In [None]:
# Check for NaN values: number of missing entries per column
genres_df.isna().sum()

In [None]:
# Check for duplicated rows: number of rows identical to an earlier row
genres_df.duplicated().sum()

## Posters

In [None]:
# Import 'posters.csv' dataset and display it
poster_df = pd.read_csv('datasets/posters.csv')
poster_df

In [None]:
# Data type pandas inferred for each column
poster_df.dtypes

In [None]:
# Check for NaN values. The counts are printed because a notebook only
# displays the last expression of a cell.
print(poster_df.isna().sum())

# Set the rows without a poster link aside for inspection
no_link = poster_df[poster_df['link'].isna()]

# Drop every row that contains a missing value
poster_df = poster_df.dropna()

no_link

In [None]:
# Check for duplicated rows: number of rows identical to an earlier row
poster_df.duplicated().sum()

## Releases

In [None]:
# Import 'releases.csv' dataset and display it
releases_df = pd.read_csv('datasets/releases.csv')
releases_df

In [None]:
# Data type pandas inferred for each column
releases_df.dtypes

In [None]:
# Typing columns
# Parse 'date' as datetime using the explicit YYYY-MM-DD format
releases_df['date'] = pd.to_datetime(releases_df['date'], format='%Y-%m-%d')

# Collect the distinct values of 'type' (before the conversion, for display)
# and store the column with the category data type
release_types = releases_df['type'].unique()
releases_df['type'] = releases_df['type'].astype('category')

# Last expression of the cell, so the notebook actually displays it
release_types

In [None]:
# Check for NaN values: number of missing entries per column
releases_df.isna().sum()
# NOTE: the old string-length check on 'date'
# (releases_df['date'].str.len() != 10) only works while the column still
# holds strings, i.e. before it is converted with pd.to_datetime.

In [None]:
# Check for duplicated rows: number of rows identical to an earlier row
releases_df.duplicated().sum()

## Studios

In [None]:
# Import 'studios.csv' dataset and display it
studios_df = pd.read_csv('datasets/studios.csv')
studios_df

In [None]:
# Data type pandas inferred for each column
studios_df.dtypes

In [None]:
# Check for NaN values. The counts are printed because a notebook only
# displays the last expression of a cell.
print(studios_df.isna().sum())

# Set the rows without a studio name aside for inspection
no_studio = studios_df[studios_df['studio'].isna()]

# Drop every row that contains a missing value
studios_df = studios_df.dropna()

no_studio

In [None]:
# Check for duplicated rows. The count is printed because a notebook only
# displays the last expression of a cell.
print('Duplicated rows:', studios_df.duplicated().sum())
studios_duplicates = studios_df[studios_df.duplicated(keep=False)]

# Drop the duplicates, keeping the first occurrence of each row
studios_df = studios_df.drop_duplicates()

# Show every occurrence of the rows that were duplicated
studios_duplicates

## Themes

In [None]:
# Import 'themes.csv' dataset and display it
themes_df = pd.read_csv('datasets/themes.csv')
themes_df

In [None]:
# Data type pandas inferred for each column
themes_df.dtypes

In [None]:
# Distinct themes: print how many there are (a bare expression in the middle
# of a cell is not displayed), then show the values themselves
theme_values = themes_df['theme'].unique()
print('Distinct themes:', len(theme_values))

theme_values

In [None]:
# Check for NaN values: number of missing entries per column
themes_df.isna().sum()

In [None]:
# Check for duplicated rows: number of rows identical to an earlier row
themes_df.duplicated().sum()

## The Oscar Awards

In [None]:
# Import 'the_oscar_awards.csv' dataset and display it
oscars_df = pd.read_csv('datasets/the_oscar_awards.csv')
oscars_df

In [None]:
# Data type pandas inferred for each column
oscars_df.dtypes

In [None]:
# Typing columns: store 'category' with the category data type
oscars_df['category'] = oscars_df['category'].astype('category')

# year_film always <= year_ceremony
# oscars_df[oscars_df['year_film'] > oscars_df['year_ceremony']]

# Distinct values of 'ceremony'. Kept as the last expression so the notebook
# displays it (previously it was evaluated first and its result discarded).
oscars_df['ceremony'].unique()

In [None]:
# Check for NaN values. The counts are printed because a notebook only
# displays the last expression of a cell.
print(oscars_df.isna().sum())

# Rows of the "JEAN HERSHOLT HUMANITARIAN AWARD" category, shown for inspection
# NOTE(review): presumably these rows account for the missing values - confirm
oscars_df[oscars_df['category'] == "JEAN HERSHOLT HUMANITARIAN AWARD"]

In [None]:
# Check for duplicated rows. The count is printed because a notebook only
# displays the last expression of a cell.
print('Duplicated rows:', oscars_df.duplicated().sum())

# Show every occurrence of the duplicated rows
oscars_df[oscars_df.duplicated(keep=False)]

## Rotten Tomatoes Reviews

In [None]:
# Import 'rotten_tomatoes_reviews.csv' dataset and display it
reviews_df = pd.read_csv('datasets/rotten_tomatoes_reviews.csv')
reviews_df

In [None]:
# Data type pandas inferred for each column
reviews_df.dtypes

In [None]:
# Typing columns: 'review_type' becomes categorical and 'review_date' is
# parsed as datetime using the explicit YYYY-MM-DD format
reviews_df = reviews_df.astype({"review_type": 'category'})
parsed_dates = pd.to_datetime(reviews_df["review_date"], format='%Y-%m-%d')
reviews_df["review_date"] = parsed_dates

In [None]:
# Rename columns: drop the 'review_' prefix from the column names
short_names = {
    'review_type': 'type',
    'review_score': 'score',
    'review_date': 'date',
    'review_content': 'content',
}
reviews_df = reviews_df.rename(columns=short_names)

In [None]:
# Check for NaN values: number of missing entries per column
reviews_df.isna().sum()

In [None]:
# Check for duplicated rows. The count is printed because a notebook only
# displays the last expression of a cell.
print('Duplicated rows:', reviews_df.duplicated().sum())

# Drop the duplicates, keeping the first occurrence of each row
reviews_df = reviews_df.drop_duplicates()