## Import

In [1]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


## Exploring data

### Read raw data from file

In [2]:
# Read data from CSV file
movie_df = pd.read_csv('../data/raw/data.csv')


In [3]:
# Display the first 5 lines of data
movie_df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


### How many rows and how many columns does the raw data have?

In [4]:
n_rows, n_cols = movie_df.shape

n_rows, n_cols

(8807, 12)

### What is the meaning of each row?

- Each row represents information about a movie/ tv show on Netflix

### Are there duplicated rows?

In [5]:
# Check for duplicate rows
is_duplicate = movie_df.duplicated().all()

is_duplicate

False

In [6]:
if is_duplicate == True:
    movie_df.drop_duplicates(inplace=True)

### What is the meaning of each column?

- show_id : Unique ID for each movie/ tv show 
- type : Type of the movie/ tv show (Movie or TV Show) 
- title : Title of the movie/ tv show 
- director : Director of the movie/ tv show 
- cast : Cast of the movie/ tv show 
- country : Country where the movie/ tv show was produced 
- date_added : Date when the movie/ tv show was added to Netflix 
- release_year : Year when the movie/ tv show was released 
- rating : Rating of the movie/ tv show 
- duration : Duration of the movie/ tv show 
- listed_in : Genre of the movie/ tv show 
- description : Description of the movie/ tv show 


### What is the current data type of each column? Are there columns having inappropriate data types?


In [7]:
movie_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      6173 non-null   object
 4   cast          7982 non-null   object
 5   country       7976 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8803 non-null   object
 9   duration      8804 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB


In [8]:
# Chuyển cột 'date_added' thành kiểu datetime
movie_df['date_added'] = pd.to_datetime(movie_df['date_added'], errors='coerce')

### With each numerical column, how are values distributed?

In [9]:
num_col_info_df = movie_df.select_dtypes(exclude='object')

def missing_ratio(s):
    return (s.isna().mean() * 100).round(2)
    # raise NotImplementedError()

num_col_info_df = num_col_info_df.agg([missing_ratio, "min", "max"])
num_col_info_df

Unnamed: 0,date_added,release_year
missing_ratio,1.11,0.0
min,2008-01-01 00:00:00,1925.0
max,2021-09-25 00:00:00,2021.0


### With each categorical column, how are values distributed?

In [10]:
cate_col_info_df = movie_df.select_dtypes(include='object')

# Create an empty DataFrame to store the summary information
summary_df = pd.DataFrame(columns=['Column', 'Number of Unique Values', 'Missing Ratio', 'Sample Values'])

# Display distribution, missing values, and unique values for each categorical column
for column in cate_col_info_df.columns:
    unique_values = cate_col_info_df[column].nunique()
    missing_values = missing_ratio(cate_col_info_df[column])
    
    sample_values = cate_col_info_df[column].unique()[:5]
    
    
    # Use loc to add rows to the summary DataFrame
    summary_df.loc[len(summary_df)] = [column, unique_values, missing_values, sample_values]

# Display the summary DataFrame
summary_df


Unnamed: 0,Column,Number of Unique Values,Missing Ratio,Sample Values
0,show_id,8807,0.0,"[s1, s2, s3, s4, s5]"
1,type,2,0.0,"[Movie, TV Show]"
2,title,8807,0.0,"[Dick Johnson Is Dead, Blood & Water, Gangland..."
3,director,4528,29.91,"[Kirsten Johnson, nan, Julien Leclercq, Mike F..."
4,cast,7692,9.37,"[nan, Ama Qamata, Khosi Ngema, Gail Mabalane, ..."
5,country,748,9.44,"[United States, South Africa, nan, India, Unit..."
6,rating,17,0.05,"[PG-13, TV-MA, PG, TV-14, TV-PG]"
7,duration,220,0.03,"[90 min, 2 Seasons, 1 Season, 91 min, 125 min]"
8,listed_in,514,0.0,"[Documentaries, International TV Shows, TV Dra..."
9,description,8775,0.0,"[As her father nears the end of his life, film..."


In [11]:
## Drop NaN
movie_df.dropna(inplace=True,axis=0,subset=['date_added','rating'])


movie_df['director'] = movie_df['director'].fillna('Unknown')
movie_df['country'] = movie_df['country'].fillna('Unknown')
movie_df['cast'] = movie_df['cast'].fillna('Unknown')
movie_df['rating'] = movie_df['rating'].fillna('Unknown')
movie_df['duration' ]= movie_df['duration'].fillna('Unknown')

movie_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8705 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   show_id       8705 non-null   object        
 1   type          8705 non-null   object        
 2   title         8705 non-null   object        
 3   director      8705 non-null   object        
 4   cast          8705 non-null   object        
 5   country       8705 non-null   object        
 6   date_added    8705 non-null   datetime64[ns]
 7   release_year  8705 non-null   int64         
 8   rating        8705 non-null   object        
 9   duration      8705 non-null   object        
 10  listed_in     8705 non-null   object        
 11  description   8705 non-null   object        
dtypes: datetime64[ns](1), int64(1), object(10)
memory usage: 884.1+ KB


### Save to processed.csv

In [13]:
movie_df.to_csv("../data/processed/data.csv",sep = ',', encoding= 'utf-8', index=False) 