## Import

In [None]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


## Exploring data

### Read raw data from file

In [None]:
# Load the raw dataset into a DataFrame
movie_df = pd.read_csv(filepath_or_buffer='../data/raw/data.csv')

# Preview the top five records
movie_df.head(n=5)

### How many rows and how many columns does the raw data have?

In [None]:
# Number of records (rows) and number of fields (columns) in the raw frame
n_rows = len(movie_df.index)
n_cols = len(movie_df.columns)

(n_rows, n_cols)

### What is the meaning of each row?

Each row describes one title (a movie or a TV show) on Netflix, identified by its `show_id`.

### Are there duplicated rows?

In [None]:
# Check whether the frame contains at least one duplicated row.
# duplicated() flags only the repeats (the first occurrence stays False), so the
# right reduction is any(): all() can never be True on a non-empty frame.
is_duplicate = movie_df.duplicated().any()

is_duplicate

In [None]:
# Remove exact duplicate rows, keeping the first occurrence of each.
# drop_duplicates() simply returns an equal frame when nothing is duplicated, so
# no guard on a separately computed flag is needed; rebinding (instead of
# inplace=True) keeps this cell safe to re-run.
movie_df = movie_df.drop_duplicates()

### What is the meaning of each column?

- show_id : Unique ID for each movie / TV show
- type : Type of the title (Movie or TV Show)
- title : Title of the movie / TV show
- director : Director of the movie / TV show
- cast : Cast of the movie / TV show
- country : Country where the movie / TV show was produced
- date_added : Date when the movie / TV show was added to Netflix
- release_year : Year when the movie / TV show was released
- rating : Rating of the movie / TV show
- duration : Duration of the movie / TV show
- listed_in : Genre of the movie / TV show
- description : Description of the movie / TV show


### What is the current data type of each column? Are there columns having inappropriate data types?


In [None]:
# Print a concise summary of the frame (columns, non-null counts, dtypes)
# so that columns stored with an unsuitable dtype can be spotted.
movie_df.info()

In [None]:
# Convert the 'date_added' column to datetime.
# errors='coerce' turns any value that cannot be parsed into NaT instead of raising.
movie_df['date_added'] = pd.to_datetime(movie_df['date_added'], errors='coerce')

### For each numerical column, how are the values distributed?

In [None]:
# Keep only the columns whose dtype is not 'object'.
num_col_info_df = movie_df.select_dtypes(exclude='object')

def missing_ratio(s):
    """Return the percentage of missing values in ``s``, rounded to 2 decimals.

    Parameters
    ----------
    s : pandas.Series
        Column to inspect.

    Returns
    -------
    float
        Share of missing entries in ``s``, expressed in percent (0-100).
    """
    fraction_missing = s.isna().mean()
    return np.round(fraction_missing * 100, 2)

# Summarise every selected column: share of missing values (in percent),
# smallest value and largest value.
num_col_info_df = num_col_info_df.aggregate(func=[missing_ratio, "min", "max"])
num_col_info_df

### For each categorical column, how are the values distributed?

In [None]:
# Restrict the frame to its categorical (object-dtype) columns
cate_col_info_df = movie_df.select_dtypes(include='object')

# Collect one record per categorical column and build the summary frame in a
# single constructor call. Growing a DataFrame row by row with .loc is slow and
# forces every column to object dtype.
summary_records = [
    {
        'Column': column,
        # nunique() ignores missing values
        'Number of Unique Values': cate_col_info_df[column].nunique(),
        # percentage of missing entries, see missing_ratio()
        'Missing Ratio': missing_ratio(cate_col_info_df[column]),
        # first (up to) five distinct values, in order of appearance
        'Sample Values': cate_col_info_df[column].unique()[:5],
    }
    for column in cate_col_info_df.columns
]

# Passing `columns` keeps the header (and its order) even when there are no
# categorical columns at all.
summary_df = pd.DataFrame(
    summary_records,
    columns=['Column', 'Number of Unique Values', 'Missing Ratio', 'Sample Values'],
)

# Display the summary DataFrame
summary_df


In [None]:
# Drop the rows that lack 'date_added' or 'rating'; rebinding instead of
# inplace=True keeps the cell safe to re-run.
movie_df = movie_df.dropna(axis=0, subset=['date_added', 'rating'])

# Fill the remaining gaps in the descriptive columns with a placeholder.
# 'rating' is deliberately not listed: rows with a missing rating were just
# dropped above, so filling it would be a no-op.
movie_df = movie_df.fillna(
    {column: 'Unknown' for column in ['director', 'country', 'cast', 'duration']}
)

# Confirm the non-null counts after cleaning
movie_df.info()

### Save the processed data to `../data/processed/data.csv`

In [None]:
# Persist the cleaned frame as comma-separated UTF-8 text, without the index column.
movie_df.to_csv(
    "../data/processed/data.csv",
    sep=",",
    encoding="utf-8",
    index=False,
)