## Import Libraries

Import the necessary Python libraries

In [2]:
import ast
import json
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [3]:
# Render matplotlib figures inline, below the cell that creates them.
%matplotlib inline
# Use seaborn's 'whitegrid' style for all plots.
sns.set_style('whitegrid')

## Load Data

Load movies csv file and perform two tasks while loading:

- Convert the `release_date` field to `datetime.date`.
- Parse every column that holds JSON-like data into Python objects (dicts / lists).

In [4]:
def load_movies_metadata(file_path, json_columns=None):
    """Load the movies metadata CSV and parse its date and JSON-like columns.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file; passed straight to ``pd.read_csv``.
    json_columns : iterable of str, optional
        Columns whose cells hold Python-literal text (dicts / lists of dicts).
        Defaults to 'belongs_to_collection', 'genres', 'production_companies',
        'production_countries' and 'spoken_languages'.

    Returns
    -------
    pandas.DataFrame
        Every column is read as text, except that 'release_date' holds
        ``datetime.date`` objects (NaT where the text is not a valid date) and
        each column in ``json_columns`` holds the parsed Python object (NaN
        where the cell is empty).

    Raises
    ------
    ValueError, SyntaxError
        Propagated from ``ast.literal_eval`` when a non-empty cell in a JSON
        column is not a valid Python literal.
    """
    if json_columns is None:
        json_columns = ['belongs_to_collection', 'genres', 'production_companies',
                        'production_countries', 'spoken_languages']

    # Read every column as text so nothing is coerced before it is parsed explicitly.
    df = pd.read_csv(file_path, dtype=str)

    # Convert each release_date to a datetime.date; unparseable values become NaT.
    df['release_date'] = pd.to_datetime(df['release_date'], errors='coerce').dt.date

    def _parse_literal(text):
        # The CSV stores these cells with single quotes, which is not valid JSON,
        # so they are parsed as Python literals rather than with the json module.
        return np.nan if pd.isnull(text) else ast.literal_eval(text)

    for column in json_columns:
        df[column] = df[column].apply(_parse_literal)
    return df

Load the movies metadata csv file

In [5]:
# Location of movies_metadata.csv. Set the MOVIES_METADATA_CSV environment variable to run
# the notebook on another machine; when it is unset the original local path is used.
MOVIES_METADATA_PATH = os.environ.get(
    'MOVIES_METADATA_CSV',
    r"D:\uChicago\Classes\Q2\Data Mining\project\the-movies-dataset\movies_metadata.csv",
)
movies = load_movies_metadata(MOVIES_METADATA_PATH)

In [6]:
# Preview the first three rows to check that the date and JSON-like columns were parsed.
movies.head(3)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92


In [7]:
# (rows, columns) of the loaded metadata.
movies.shape

(45466, 24)

In [8]:
# List the column names available in the metadata.
movies.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

In [15]:
# Spot-check one parsed cell: the value is a Python list of dicts, not a raw string.
movies['spoken_languages'][0]

[{'iso_639_1': 'en', 'name': 'English'}]

In [38]:
# Index labels of the three rows removed before flattening. Dropping positions 19730,
# 29502 and 35585 one after another (each drop shifts later rows up by one) removes
# exactly these labels from a frame with the default 0..n-1 index.
# NOTE(review): presumably these rows are malformed in the source CSV - confirm.
ROWS_TO_DROP = [19730, 29503, 35587]

# Drop by label rather than by position, ignoring labels that are already gone, so that
# re-running this cell does not remove three further, valid rows.
# Assumes `movies` still carries the default index assigned by pd.read_csv.
movies = movies.drop(index=ROWS_TO_DROP, errors='ignore')

In [40]:
# Columns of the flattened frame, in output order.
FLATTENED_COLUMNS = ['belongs_to_collection', 'genre_1', 'genre_2', 'genre_3', 'genre_4',
                     'production_company_1', 'production_company_2', 'production_company_3',
                     'production_country_1', 'production_country_2', 'production_country_3',
                     'spoken_language_1', 'spoken_language_2', 'spoken_language_3']

# (source column in `movies`, prefix of the numbered output columns, number of entries kept)
LIST_COLUMN_SPECS = [
    ('genres', 'genre', 4),
    ('production_companies', 'production_company', 3),
    ('production_countries', 'production_country', 3),
    ('spoken_languages', 'spoken_language', 3),
]


def _leading_names(entries, limit):
    """Return the 'name' values of the first `limit` dict entries that have one.

    Entries without a 'name' key are skipped and do not count towards the limit.
    Anything that is not a list (for example a missing value) yields an empty list.
    """
    if not isinstance(entries, list):
        return []
    names = [entry['name'] for entry in entries if isinstance(entry, dict) and 'name' in entry]
    return names[:limit]


def flatten_movie_record(record):
    """Flatten the nested columns of one movie into a dict of plain values.

    Parameters
    ----------
    record : mapping
        One row of `movies`; must provide 'belongs_to_collection' and every source
        column named in LIST_COLUMN_SPECS.

    Returns
    -------
    dict
        One key per entry of FLATTENED_COLUMNS. Slots with no data stay NaN.
    """
    flat = dict.fromkeys(FLATTENED_COLUMNS, np.nan)

    # The collection is a single dict; keep only its name.
    collection = record['belongs_to_collection']
    if isinstance(collection, dict) and 'name' in collection:
        flat['belongs_to_collection'] = collection['name']

    # Each list-valued column is spread over numbered columns: genre_1, genre_2, ...
    for source, prefix, limit in LIST_COLUMN_SPECS:
        for position, name in enumerate(_leading_names(record[source], limit), start=1):
            flat['{}_{}'.format(prefix, position)] = name
    return flat


# Collect all rows first and build the frame once: DataFrame.append was removed in
# pandas 2.0, and growing a frame row by row copies it on every iteration.
nested_columns = ['belongs_to_collection'] + [source for source, _, _ in LIST_COLUMN_SPECS]
movies_flattened = pd.DataFrame(
    [flatten_movie_record(record) for record in movies[nested_columns].to_dict('records')],
    columns=FLATTENED_COLUMNS,
    dtype=object,
)

In [41]:
# Sanity check: expect one row per movie left in `movies` and 14 flattened columns.
movies_flattened.shape

(45463, 14)

In [42]:
# Preview the flattened name columns.
movies_flattened.head()

Unnamed: 0,belongs_to_collection,genre_1,genre_2,genre_3,genre_4,production_company_1,production_company_2,production_company_3,production_country_1,production_country_2,production_country_3,spoken_language_1,spoken_language_2,spoken_language_3
0,Toy Story Collection,Animation,Comedy,Family,,Pixar Animation Studios,,,United States of America,,,English,,
1,,Adventure,Fantasy,Family,,TriStar Pictures,Teitler Film,Interscope Communications,United States of America,,,English,Français,
2,Grumpy Old Men Collection,Romance,Comedy,,,Warner Bros.,Lancaster Gate,,United States of America,,,English,,
3,,Comedy,Drama,Romance,,Twentieth Century Fox Film Corporation,,,United States of America,,,English,,
4,Father of the Bride Collection,Comedy,,,,Sandollar Productions,Touchstone Pictures,,United States of America,,,English,,


In [43]:
# Scalar columns carried over unchanged from `movies`.
SCALAR_COLUMNS = ['adult', 'budget', 'homepage', 'id', 'imdb_id', 'original_language',
                  'original_title', 'overview', 'popularity', 'poster_path', 'release_date',
                  'revenue', 'runtime', 'status', 'tagline', 'title', 'video',
                  'vote_average', 'vote_count']

# pd.concat(axis=1) pairs rows by index label, not by position. `movies_flattened` is
# numbered 0..n-1 while `movies` has gaps where rows were dropped, so both sides are
# renumbered first; otherwise rows after the first gap are paired with the wrong movie
# and extra all-NaN rows appear.
assert len(movies_flattened) == len(movies), 'expected one flattened row per movie'
merged_all_movies = pd.concat(
    [movies_flattened.reset_index(drop=True), movies[SCALAR_COLUMNS].reset_index(drop=True)],
    axis=1,
)

In [49]:
# Rename the movie identifier column from 'id' to 'movie_id'.
merged_all_movies = merged_all_movies.rename(columns={'id': 'movie_id'})

In [51]:
# Save the flattened movie table as a UTF-8 CSV without the row index.
merged_all_movies.to_csv('flattened_merged_movies.csv',index=False,encoding='utf8')

In [59]:
# Convert 'movie_id' to a numeric dtype (the loader reads every column as text).
# NOTE(review): presumably needed so the key matches 'movie_id' in flattened.csv - confirm.
merged_all_movies['movie_id'] = pd.to_numeric(merged_all_movies['movie_id'])

In [60]:
# Load the credits table from 'flattened.csv'.
# NOTE(review): this file is not created in this notebook; presumably it comes from a
# separate credits-flattening step - confirm.
flattened_credits = pd.read_csv('flattened.csv')

In [61]:
# Join credits with the movie metadata on 'movie_id' (pandas default: inner join).
final_merge = flattened_credits.merge(merged_all_movies, on=['movie_id'])

In [62]:
# Check the size of the merged table.
# NOTE(review): an inner join returning more rows than there are movies suggests
# duplicated 'movie_id' keys in the inputs - confirm before modelling.
final_merge.shape

(45538, 49)

In [63]:
# Preview the merged credits + metadata table.
final_merge.head()

Unnamed: 0,movie_id,actor_1_gender,actor_2_gender,actor_3_gender,actor_4_gender,actor_5_gender,actor_1_name,actor_2_name,actor_3_name,actor_4_name,...,poster_path,release_date,revenue,runtime,status,tagline,title,video,vote_average,vote_count
0,862.0,2.0,2.0,2.0,2.0,2.0,Tom Hanks,Tim Allen,Don Rickles,Jim Varney,...,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,1995-10-30,373554033,81.0,Released,,Toy Story,False,7.7,5415
1,8844.0,2.0,2.0,1.0,0.0,1.0,Robin Williams,Jonathan Hyde,Kirsten Dunst,Bradley Pierce,...,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,1995-12-15,262797249,104.0,Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413
2,15602.0,2.0,2.0,1.0,1.0,1.0,Walter Matthau,Jack Lemmon,Ann-Margret,Sophia Loren,...,/6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg,1995-12-22,0,101.0,Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92
3,31357.0,1.0,1.0,1.0,1.0,2.0,Whitney Houston,Angela Bassett,Loretta Devine,Lela Rochon,...,/16XOMpEaLWkrcPqSQqhTmeJuqQl.jpg,1995-12-22,81452156,127.0,Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34
4,11862.0,2.0,1.0,2.0,1.0,2.0,Steve Martin,Diane Keaton,Martin Short,Kimberly Williams-Paisley,...,/e64sOI48hQXyru7naBFyssKFxVd.jpg,1995-02-10,76578911,106.0,Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173


In [None]:
# List the merged table's columns as a last check before export.
# (The bare name `final` is not defined in this notebook and would raise NameError.)
final_merge.columns

In [None]:
# Write the merged credits + metadata table as the master dataset (UTF-8, no index column).
# `final_merge` is used because no `credits_flattened` variable exists in this notebook.
final_merge.to_csv('master_data.csv',index=False,encoding='utf-8')