In [17]:
import pandas as pd
import json
import ast
import numpy as np

In [24]:
'''
low_memory : bool, default True
    Internally process the file in chunks, resulting in lower memory use
    while parsing, but possibly mixed type inference.  To ensure no mixed
    types either set False, or specify the type with the `dtype` parameter.
    Note that the entire file is read into a single DataFrame regardless,
    use the `chunksize` or `iterator` parameter to return the data in chunks.
    (Only valid with C parser).
'''
df = pd.read_csv('movies_metadata.csv', low_memory=False)
df.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [25]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  45466 non-null  object 
 1   belongs_to_collection  4494 non-null   object 
 2   budget                 45466 non-null  object 
 3   genres                 45466 non-null  object 
 4   homepage               7782 non-null   object 
 5   id                     45466 non-null  object 
 6   imdb_id                45449 non-null  object 
 7   original_language      45455 non-null  object 
 8   original_title         45466 non-null  object 
 9   overview               44512 non-null  object 
 10  popularity             45461 non-null  object 
 11  poster_path            45080 non-null  object 
 12  production_companies   45463 non-null  object 
 13  production_countries   45463 non-null  object 
 14  release_date           45379 non-null  object 
 15  re

In [26]:
# json dictionary is coming in as a string
df['genres'][0]

"[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}]"

In [27]:
# Do not need adult, original title, video, or homepage columns
df.drop(columns= ['adult', 'original_title', 'video', 'homepage'], inplace=True)

In [28]:
# converting string json data into json dictionary
json_columns = ['belongs_to_collection', 'genres', 'production_countries', 'production_companies', 'spoken_languages']
# ast.literal_eval() converts json string to json dictionary
for col in json_columns:
    df[col] = df[col].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else np.nan)

df.head()

Unnamed: 0,belongs_to_collection,budget,genres,id,imdb_id,original_language,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",862,tt0114709,en,"Led by Woody, Andy's toys live happily in his ...",21.946943,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,7.7,5415.0
1,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",8844,tt0113497,en,When siblings Judy and Peter discover an encha...,17.015539,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,6.9,2413.0
2,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",15602,tt0113228,en,A family wedding reignites the ancient feud be...,11.7129,/6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg,"[{'name': 'Warner Bros.', 'id': 6194}, {'name'...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,6.5,92.0
3,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",31357,tt0114885,en,"Cheated on, mistreated and stepped on, the wom...",3.859495,/16XOMpEaLWkrcPqSQqhTmeJuqQl.jpg,[{'name': 'Twentieth Century Fox Film Corporat...,"[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,6.1,34.0
4,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",11862,tt0113041,en,Just when George Banks has recovered from his ...,8.387519,/e64sOI48hQXyru7naBFyssKFxVd.jpg,"[{'name': 'Sandollar Productions', 'id': 5842}...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,5.7,173.0


In [69]:
# How to flatten nested columns
print(df.isna().sum())
def pipify(x, key):
    xlist = []
    if type(x) != list:
        return np.nan
    if len(x) == 0:
        return np.nan
    for i in x:
        xlist.append(str(i[key]))
    if len(xlist) == 0:
        return np.nan
    return '|'.join(xlist)

df['belongs_to_collection'] = df['belongs_to_collection'].apply(lambda x: x['name'] if isinstance(x, dict) else np.nan)
df['genres'] = df['genres'].apply(lambda x: pipify(x, 'name')).replace('', np.nan)
df['spoken_languages'] = df['spoken_languages'].apply(lambda x: pipify(x, 'name')).replace('', np.nan)
df['production_companies'] = df['production_companies'].apply(lambda x: pipify(x, 'name')).replace('', np.nan)
print(df.isna().sum())

belongs_to_collection    40972
budget                       0
genres                       0
id                           0
imdb_id                     17
original_language           11
overview                   954
popularity                   5
poster_path                386
production_companies         3
production_countries         3
release_date                87
revenue                      6
runtime                    263
spoken_languages             6
status                      87
tagline                  25054
title                        6
vote_average                 6
vote_count                   6
dtype: int64
belongs_to_collection    40975
budget                       0
genres                    2442
id                           0
imdb_id                     17
original_language           11
overview                   954
popularity                   5
poster_path                386
production_companies     11881
production_countries         3
release_date              

In [83]:
# clean numerical columns
df['budget'] = pd.to_numeric(df['budget'], errors = 'coerce').replace(0, np.nan) / 1000000
df['revenue'] = pd.to_numeric(df['revenue'], errors = 'coerce').replace(0, np.nan) / 1000000
df.rename(columns={'budget': 'budget_musd', 'revenue': 'revenue_musd'}, inplace=True)

df['runtime'] = df['runtime'].replace(0, np.nan)
df['id'] = pd.to_numeric(df['id'], errors='coerce')
df['popularity'] = pd.to_numeric(df['popularity'], errors='coerce')
df['vote_count'] = pd.to_numeric(df['vote_count'], errors='coerce')
df['vote_average'] = pd.to_numeric(df['vote_average'], errors='coerce')
df.loc[df['vote_average'] == 0, 'vote_average'] = np.nan

In [97]:
# clean datetime columns
df['release_date'] = pd.to_datetime(df['release_date'], errors='coerce')
df['release_date'].value_counts(dropna=False).head(25)

2008-01-01    136
2009-01-01    121
2007-01-01    118
2005-01-01    111
2006-01-01    101
2002-01-01     96
2004-01-01     90
NaT            90
2001-01-01     84
2003-01-01     76
1997-01-01     69
2010-01-01     68
1998-01-01     67
1999-01-01     67
2000-01-01     64
1988-01-01     63
1987-01-01     63
1994-01-01     61
1989-01-01     60
2011-01-01     56
1995-01-01     52
1996-01-01     48
1971-01-01     47
1986-01-01     46
1992-01-01     45
Name: release_date, dtype: int64

In [101]:
# clean string columns
df['overview'].replace(['No overview found.', 'No Overview', ' ', '', 'No movie overview available.', 'No overview yet.', 'Released'], np.nan, inplace = True)
df['overview'].value_counts(dropna=False).head(5)
df['tagline'].replace(['-'], np.nan, inplace = True)
df['tagline'].value_counts(dropna=False).head(5)

NaN                              25058
Based on a true story.               7
Be careful what you wish for.        4
Trust no one.                        4
Documentary                          3
Name: tagline, dtype: int64

In [119]:
# remove duplicates
# to find duplicates
df[df.duplicated(subset='id', keep='first')]
print(df[df['id'] == 105045.0].value_counts('id'))
df.drop_duplicates(subset='id', keep='first', inplace=True)
print(df[df['id'] == 105045.0].value_counts('id'))

id
105045.0    1
dtype: int64

In [129]:
# handle missing values
df.isna().sum()
df[df['title'].isna()]
df[df['id'].isna()]
df.dropna(subset=['id', 'title'], how='any', inplace=True)
df.loc[:, ['id', 'title']].isna().sum()

df['id'] = df['id'].astype('int')

# find number of null values in columns
df.notna().sum(axis=1).value_counts().sort_values(ascending=False)

df.dropna(thresh=10, inplace=True)

In [143]:
df['status'].value_counts()
releasedDF = df[df['status'] == 'Released'].copy()

desiredCols = ['id', 'title', 'tagline', 'release_date', 'genres', 'belongs_to_collection', 'original_language', 'budget_musd', 'revenue_musd', 'production_companies', 'production_countries', 'vote_count', 'vote_average', 'popularity', 'runtime', 'overview', 'spoken_languages', 'poster_path']
releasedDF = releasedDF.loc[:, desiredCols]
releasedDF.reset_index(drop=True, inplace=True)
releasedDF.head()
releasedDF['poster_path'] = "<img src='http://image.tmdb.org/t/p/w185/" + releasedDF['poster_path'] + "' style='height:100px;'>"
releasedDF.to_csv('movies_clean.csv', index=False)