# MOVIES.CSV CLEANING

In [115]:
from pathlib import Path

import pandas as pd

In [116]:
# Folder (relative path) that holds the raw CSV files.
data_directory = 'data_csv/'

# Load the raw movies table, letting pandas infer the column types.
# NOTE(review): read_csv's default NA markers turn cells such as 'NA' or 'null' into
# missing values, so a movie whose title is one of those words would load without a
# name - confirm whether that explains the few missing titles found later on.
movies_csv_path = data_directory + 'movies.csv'
movies_df = pd.read_csv(movies_csv_path)

# Bare last expression: render the loaded frame as the cell output.
movies_df

Unnamed: 0,id,name,date,tagline,description,minute,rating
0,1000001,Barbie,2023.0,She's everything. He's just Ken.,Barbie and Ken are having the time of their li...,114.0,3.86
1,1000002,Parasite,2019.0,Act like you own the place.,"All unemployed, Ki-taek's family takes peculia...",133.0,4.56
2,1000003,Everything Everywhere All at Once,2022.0,The universe is so much bigger than you realize.,An aging Chinese immigrant is swept up in an i...,140.0,4.30
3,1000004,Fight Club,1999.0,Mischief. Mayhem. Soap.,A ticking-time-bomb insomniac and a slippery s...,139.0,4.27
4,1000005,La La Land,2016.0,Here's to the fools who dream.,"Mia, an aspiring actress, serves lattes to mov...",129.0,4.09
...,...,...,...,...,...,...,...
941592,1941593,神笛,,,,,
941593,1941594,蟲極道蜜団子抗争編 壱ノ巻,,,Shinjuku forest at night. In the sap taverns o...,30.0,
941594,1941595,蟲極道蜜団子抗争編 弐ノ巻,,,"The city that never sleeps, where insects gath...",30.0,
941595,1941596,重生,,,"In a world where order has broken down, darkne...",,


In [117]:
# Size of the loaded table as a (rows, columns) tuple.
movies_df.shape

(941597, 7)

In [118]:
# Inspect the dtype pandas assigned to each column when reading the CSV.
movies_df.dtypes

id               int64
name            object
date           float64
tagline         object
description     object
minute         float64
rating         float64
dtype: object

## CHANGING TYPES OF OBJECT COLUMNS

In [119]:
# Convert every column that needs a better dtype in a single astype() call:
#   - the free-text columns go from generic 'object' to pandas' dedicated 'string' dtype;
#   - 'date' (a release year that was loaded as float64 and contains missing values)
#     goes to the nullable 'Int64' dtype, which stores whole numbers and keeps the
#     missing entries as <NA>. float64 casts to 'Int64' directly, so no intermediate
#     conversion to 'object' is needed.
movies_df = movies_df.astype({
    'name': 'string',
    'tagline': 'string',
    'description': 'string',
    'date': 'Int64',
})

# Show the resulting dtypes to confirm the conversion.
movies_df.dtypes

id                      int64
name           string[python]
date                    Int64
tagline        string[python]
description    string[python]
minute                float64
rating                float64
dtype: object

## NOW WE TRY TO UNDERSTAND THE COLUMN VALUES

In [120]:
# Collect the column labels into a plain Python list and print them.
column_values = movies_df.columns.tolist()
print(column_values)

['id', 'name', 'date', 'tagline', 'description', 'minute', 'rating']


In [121]:
# Give the columns more self-explanatory names.
readable_column_names = {
    'id': 'movie_id',
    'name': 'movie_name',
    'date': 'year_of_release',
    'minute': 'minutes',
}

# rename() returns a new frame; rebinding the result (rather than inplace=True) keeps
# the step explicit and safe to re-run (labels that are already renamed are ignored).
movies_df = movies_df.rename(columns=readable_column_names)
print(list(movies_df.columns.values))

['movie_id', 'movie_name', 'year_of_release', 'tagline', 'description', 'minutes', 'rating']


## ARE THERE NA VALUES IN THE DATASET?

In [122]:
# Count the missing entries of every column in one pass over the frame.
na_counts = movies_df.isna().sum()

# Keep one named counter per column.
not_available_movie_ids = na_counts['movie_id']
not_available_movie_names = na_counts['movie_name']
not_available_years_release = na_counts['year_of_release']
not_available_taglines = na_counts['tagline']
not_available_descriptions = na_counts['description']
not_available_minutes = na_counts['minutes']
not_available_movies_ratings = na_counts['rating']

# (label, count) pairs in print order; every label after the first starts with a
# newline so that each counter is reported on its own line.
na_report = [
    ('Not available movies ids: ', not_available_movie_ids),
    ('\nNot available movie names: ', not_available_movie_names),
    ('\nNot available years of release: ', not_available_years_release),
    ('\nNot available taglines: ', not_available_taglines),
    ('\nNot available descriptions: ', not_available_descriptions),
    ('\nNot available minutes: ', not_available_minutes),
    ('\nNot available movies ratings: ', not_available_movies_ratings),
]

# Flatten the pairs and hand them to a single print() call.
print(*[piece for pair in na_report for piece in pair])

Not available movies ids:  0 
Not available movie names:  10 
Not available years of release:  91913 
Not available taglines:  802210 
Not available descriptions:  160812 
Not available minutes:  181570 
Not available movies ratings:  850598


We can fill some of the NA values with a default placeholder:

In [123]:
# Placeholder text used for each free-text column when the value is missing.
text_placeholders = {
    'tagline': 'No tagline available for this movie.',
    'description': 'No description available for this movie.',
}

# Replace the missing entries column by column.
for text_column, placeholder in text_placeholders.items():
    movies_df[text_column] = movies_df[text_column].fillna(placeholder)

In [124]:
# Show the rows that still have no movie name.
name_is_missing = movies_df['movie_name'].isna()
movies_df.loc[name_is_missing]

Unnamed: 0,movie_id,movie_name,year_of_release,tagline,description,minutes,rating
287514,1287515,,2015.0,No tagline available for this movie.,NONE is a short film that explores the balance...,4.0,
617642,1617643,,,No tagline available for this movie.,No description available for this movie.,,
646520,1646521,,2008.0,No tagline available for this movie.,No description available for this movie.,,
648185,1648186,,,No tagline available for this movie.,No description available for this movie.,,
720294,1720295,,,No tagline available for this movie.,"In this directorial debut of Eden Ewardson, he...",8.0,
725369,1725370,,,No tagline available for this movie.,No description available for this movie.,,
741481,1741482,,,No tagline available for this movie.,No description available for this movie.,90.0,
840337,1840338,,,No tagline available for this movie.,No description available for this movie.,,
883228,1883229,,,No tagline available for this movie.,No description available for this movie.,,
894771,1894772,,,No tagline available for this movie.,No description available for this movie.,,


Some of these rows carry enough information (a description or a release year) to identify the movie. After some research on the internet, we can fill in two of the missing names:

In [125]:
# Manually researched fixes for two movies whose name was missing.
# The rows are selected by movie_id rather than by row label: the id identifies the
# movie itself, whereas the row label only matches while the frame keeps its original
# ordering and index.

# movie_id 1720295 (row label 720294 in the listing of missing names).
is_memorandum = movies_df['movie_id'] == 1720295
movies_df.loc[is_memorandum, 'movie_name'] = 'Memorandum of Softness Green'
movies_df.loc[is_memorandum, 'year_of_release'] = 2023

# movie_id 1287515 (row label 287514); its description refers to the film as "NONE".
is_none_short_film = movies_df['movie_id'] == 1287515
movies_df.loc[is_none_short_film, 'movie_name'] = 'NONE'

## ARE THERE ANY DUPLICATES?

In [126]:
# Flag every row that is an exact copy of an earlier row (all columns are compared),
# then count the flagged rows.
# NOTE(review): movie_id takes part in the comparison, so if ids are unique this count
# is always 0 - confirm whether duplicates should be searched without the id column.
duplicated_rows = movies_df.duplicated()
num_duplicated_movies = duplicated_rows.sum()
print('There are a total of {} duplicated movies'.format(num_duplicated_movies))

There are a total of 0 duplicated movies


## ARE THERE ANY UNREASONABLE VALUES?

In [127]:
# Summary statistics of the runtime column, used to look for implausible durations.
movies_df['minutes'].describe()

count    760027.000000
mean         65.776516
std         154.828161
min           1.000000
25%          15.000000
50%          62.000000
75%          92.000000
max       72000.000000
Name: minutes, dtype: float64

In [128]:
# 240 minutes = 4 hours: list every entry whose runtime is longer than that.
runs_over_240_minutes = movies_df['minutes'] > 240
movies_df[runs_over_240_minutes]

Unnamed: 0,movie_id,movie_name,year_of_release,tagline,description,minutes,rating
401,1000402,The Queen's Gambit,2020,No tagline available for this movie.,"In a Kentucky orphanage in the 1950s, a young ...",393.0,4.23
454,1000455,Squid Game,2021,"45,6 Billion Won Is Child's Play",Hundreds of cash-strapped players accept a str...,495.0,3.73
514,1000515,WandaVision,2021,Experience a new vision of reality.,Wanda Maximoff and Vision—two super-powered be...,350.0,3.75
633,1000634,Zack Snyder's Justice League,2021,Us united.,Determined to ensure Superman's ultimate sacri...,242.0,3.41
647,1000648,Loki,2021,Loki's time has come.,After stealing the Tesseract during the events...,615.0,3.78
...,...,...,...,...,...,...,...
940453,1940454,Malinche,2018,No tagline available for this movie.,"The story of Marina, Malintzin or Malinche - a...",250.0,
940531,1940532,Macross Δ,2016,No tagline available for this movie.,"Macross Delta is set in the year 2067, 8 years...",650.0,
940547,1940548,Saturday Wide Theatre: Plastic Surgery and Fac...,2015,No tagline available for this movie.,4 shocking tales revolving around plastic surg...,380.0,
940617,1940618,C³ Cube x Cursed x Curious,2011,No tagline available for this movie.,A high school student named Yachi Haruaki rece...,300.0,


Most of these high minute values belong to TV series, so it is not unusual for some of these entries to run longer than 4 hours.

In [129]:
# Summary statistics of the release year, used to look for implausible years.
movies_df['year_of_release'].describe()

count       849685.0
mean     1998.522329
std        27.534861
min           1874.0
25%           1986.0
50%           2010.0
75%           2019.0
max           2031.0
Name: year_of_release, dtype: Float64

In [130]:
# Summary statistics of the rating column, used to look for out-of-range ratings.
movies_df['rating'].describe()

count    90999.000000
mean         3.244043
std          0.417281
min          0.880000
25%          3.020000
50%          3.300000
75%          3.510000
max          4.690000
Name: rating, dtype: float64

## SAVING THE CLEANED DATASET

In [131]:
# Destination of the cleaned dataset.
cleaned_csv_path = Path('data_cleaned/movies_cleaned.csv')

# to_csv does not create missing folders, so make sure the output directory exists;
# this lets the notebook run end to end on a fresh checkout.
cleaned_csv_path.parent.mkdir(parents=True, exist_ok=True)

# index=False: do not write the row index as an extra column.
movies_df.to_csv(cleaned_csv_path, index=False)

In [132]:
# Drop this reference to the DataFrame so its memory can be reclaimed
# (provided nothing else still references it).
movies_df = None