In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the two source CSV files into separate frames (df <- movies.csv, df1 <- TV.csv).
df, df1 = map(pd.read_csv, ("movies.csv", "TV.csv"))

# Concatenate and Drop irrelevant columns

In [3]:
# Stack the two frames row-wise; ignore_index=True renumbers the rows 0..n-1.
frames = [df, df1]
df = pd.concat(frames, axis=0, ignore_index=True)

In [4]:
# Remove the columns this notebook treats as irrelevant.
# NOTE(review): 'Unnamed: 0' is still present after this step (see the head()
# output) -- presumably a leftover index column from the CSVs; confirm whether
# it should be dropped as well.
unused_columns = [
    'quality',
    'netflixid',
    'date_released',
    'image_portrait',
    'image_landscape',
]
df = df.drop(columns=unused_columns)

In [5]:
# Preview the first five rows of the combined frame.
df.head(n=5)

Unnamed: 0,title,type,titlereleased,rating,actors,director,category,imdb,runtime,description,language
0,#Alive,Movie,2020,TV-MA,"Yoo Ah-in, Park Shin-hye",Cho Il,"Horror, Zombie, Korean",6.2/10,98 mins,"As a grisly virus rampages a city, a lone man ...",Korean
1,#AnneFrank - Parallel Stories,Movie,2019,TV-14,"Helen Mirren, Gengher Gatti","Sabina Fedeli, Anna Migotto",Documentary,,94 mins,"Through her diary, Anne Frank's story is retol...",English
2,#FriendButMarried,Movie,2018,TV-G,"Adipati Dolken, Vanesha Prescilla, Refal Hady,...",Rako Prijanto,"Biography, Drama",7.1/10,102 min,"Pining for his high school crush for years, a ...",Indonesian
3,#FriendButMarried 2,Movie,2020,TV-G,"Adipati Dolken, Mawar Eva de Jongh, Vonny Corn...",Rako Prijanto,"Biography, Drama, Romance",,104 mins,As Ayu and Ditto finally transition from best ...,Indonesian
4,#realityhigh,Movie,2017,TV-14,"Nesta Cooper, Kate Walsh, John Michael Higgins...",Fernando Lebrija,,5.2/10,99 minutes,When nerdy high schooler Dani finally attracts...,English


# Split Actors, Directors, and Categories

In [6]:
# Actor split: expand the ", "-separated `actors` string into one column per
# position (actor_0, actor_1, ...) and append those columns to df.
split_df = df['actors'].str.split(", ", expand=True)
actor_col_list = [f"actor_{idx}" for idx in range(split_df.shape[1])]
split_df = split_df.set_axis(actor_col_list, axis=1)
df = pd.concat(objs=[df, split_df], axis="columns")

In [7]:
# Director split: expand the ", "-separated `director` string into one column
# per position (director_0, director_1, ...) and append those columns to df.
split_df = df['director'].str.split(", ", expand=True)
director_col_list = [f"director_{idx}" for idx in range(split_df.shape[1])]
split_df = split_df.set_axis(director_col_list, axis=1)
df = pd.concat(objs=[df, split_df], axis="columns")

In [8]:
# Category split: expand the ", "-separated `category` string into one column
# per position (category_0, category_1, ...) and append those columns to df.
split_df = df['category'].str.split(", ", expand=True)
category_col_list = [f"category_{idx}" for idx in range(split_df.shape[1])]
split_df = split_df.set_axis(category_col_list, axis=1)
df = pd.concat(objs=[df, split_df], axis="columns")

# IMDb Score

In [9]:
# Keep only the part before the "/" (e.g. "6.2/10" -> "6.2").
imdb_score = df['imdb'].str.split('/').str.get(0)
# 'N' and '0' are placeholders rather than real scores; mark them as missing.
# ('N' is presumably what is left of an "N/A" entry after the split.)
df['imdb'] = imdb_score.replace(['N', '0'], np.nan)

It looks like a few of the supposed IMDb scores are actually Rotten Tomatoes scores, so I will replace those scores manually.

In [10]:
# Hand-picked corrections: each key is a mis-entered score string, each value
# the score that replaces it.
# NOTE(review): matching is by score value, not by title, so any other row
# holding the same string is rewritten too.
imdb_corrections = {
    '0.58': '6.9',  # a knight's tale
    '0.92': '6.9',  # funan
    '0.64': '6.9',  # the addams family (1991)
    '0.19': '6.4',  # the ottoman lieutenant
}
df['imdb'] = df['imdb'].replace(imdb_corrections)

In [11]:
# Convert the score column to a float dtype.
df['imdb'] = df['imdb'].astype('float64')

## Runtime

Seems like a fair amount of work, so let's only do this if we're sure we will use this column.

In [12]:
# List the distinct raw runtime strings to see which unit spellings occur.
pd.unique(df['runtime'])

array(['98 mins', '94 mins', '102 min', '104 mins', '99 minutes',
       '148 minutes', '82 min', '108 minutes', '44 minutes', '100 mins',
       '111 min', '95 min', '92 min', '130 min', '105 mins', '91 min',
       '100 minutes', '36 minutes', '101 min', '124 minutes', '114 mins',
       '102 minutes', '90 min', '117 min', '119 min', '122 minutes',
       '99 min', '109 min', '144 minutes', '112 min', '163 minutes',
       '15 min', '83 min', '103 min', '111 mins', '115 minutes',
       '92 mins', '81 min', '98 min', '105 minutes', '119 mins',
       '95 minutes', '75 minutes', '128 minutes', '100 min', '77 minutes',
       '147 minutes', '93 min', '89 min', '28 minutes', '94 min',
       '104 minutes', '106 min', '95 mins', '123 min', '87 min',
       '92 minutes', '85 minutes', '93 minutes', '21 minutes',
       '86 minutes', '136 minutes', '107 mins', '60 mins', '67 minutes',
       '120 minutes', '136 min', '29 minutes', '101 mins', '30 mins',
       '167 minutes', '101 minutes',

# Language

There are a lot of inconsistencies in the spacing and naming of languages. I've replaced all that I found, but look through the unique values of the `language` column below to see if anything suspicious stands out.

In [13]:
# Normalise the `language` column in two passes:
#   1. strip leading/trailing whitespace, which covers every 'German ',
#      ' Spanish', ... variant at once instead of listing them one by one;
#   2. map the remaining known bad values to a single canonical language.
# Matching is on the whole cell value, so a key such as 'Telegu' does not
# touch the longer multi-language strings that merely contain it.
language_fixes = {
    # Spelling / naming variants.
    'Telegu': 'Telugu',
    'Bengali': 'Bangla',
    'Panjabi': 'Punjabi',
    'Brazilian Portuguese': 'Portuguese',
    # Values from other fields that ended up in the language column.
    'TV-MA': 'English',       # Rolling Stones Documentary 2021
    'Comedy': 'Portuguese',   # Lulli
    # Fix: this was mapped to 'Portuguese' (copy-paste from the line above),
    # contradicting its own "(Japanese)" annotation.
    'tt7866320': 'Japanese',  # Ride or Die (Japanese)
    # Multi-language entries are collapsed to the first language listed.
    'Malayalam, Hindi, Tamil, Telugu, Kannada': 'Malayalam',  # Kurup
    'Tamil / Hindi / Telegu / Malayalam / Kannada': 'Tamil',  # Annaatthe
    'Tamil, Hindi, Kannada, Malayalam, Telugu': 'Tamil',      # Beast, Etharkkum Thunindhavan
    'Tamil & Hindi': 'Tamil',                                 # Boomika
    'Tamil & Telegu': 'Tamil',                                # Tughlaq Durbar
    'Telugu, Tamil, Kannada': 'Telugu',                       # Wild Dog
    'Tamil, Telugu, Malayalam, Kannada': 'Tamil',             # Katteri
}
# .str.strip() leaves NaN untouched; keys above are written in stripped form.
df['language'] = df['language'].str.strip().replace(language_fixes)


In [14]:
# List the distinct language values left after cleaning, for a manual check.
pd.unique(df['language'])

array(['Korean', 'English', 'Indonesian', 'Spanish', 'Arabic', 'Marathi',
       'Italian', nan, 'Hindi', 'Polish', 'Japanese', 'Swahili',
       'Portuguese', 'Mandarin', 'Turkish', 'Cantonese', 'Albanian',
       'Filipino', 'Danish', 'Tamil', 'Telugu', 'Malayalam', 'Urdu',
       'Thai', 'Yoruba', 'Malay', 'German', 'Zulu', 'French', 'Icelandic',
       'Dutch', 'Norwegian', 'Wolof', 'Finnish', 'Kannada', 'Punjabi',
       'Flemish', 'Bangla', 'Swedish', 'Tagalog', 'Vietnamese', 'Chinese',
       'Afrikaans', 'Russian', 'Swiss German', 'Khmer', 'Hebrew', 'Dari',
       'Czech', 'Xhosa', 'Dzongkha', 'Hausa', 'Yiddish', 'Hungarian',
       'Akan', 'Gujarati', 'Romanian', 'Catalan', 'Georgian', 'Pashtu',
       'Persian', 'Fulah', 'Ukranian', 'Galician', 'Luxembourgish',
       'Nepali', 'Croatian'], dtype=object)

# Export

In [15]:
# Write the cleaned frame to disk without the row index.
output_path = "data_cleaned.csv"
df.to_csv(output_path, index=False)