In [1]:
import json
import os

import numpy as np
import pandas as pd

In [2]:
# Directory holding the raw input files. Set the MOVIES_ETL_DATA_DIR
# environment variable to point the notebook at another location; when it is
# unset or empty, the original local path below is used, so existing runs
# behave exactly as before.
file_dir = os.environ.get("MOVIES_ETL_DATA_DIR") or "/Users/gaucing/DataAnalytics/movies-etl/Resources"

In [3]:
# Load the Wikipedia movie records from JSON.
# The encoding is pinned to UTF-8 so non-ASCII text decodes identically on
# every platform; without it, open() falls back to the locale's default
# encoding, which is not UTF-8 everywhere.
with open(f"{file_dir}/wikipedia-movies.json", mode="r", encoding="utf-8") as file:
    wiki_movies_raw = json.load(file)

In [4]:
# Read the two CSV inputs that sit next to the Wikipedia JSON file.
# low_memory=False asks pandas to parse the metadata file in one pass instead
# of in chunks, which avoids mixed-type inference within a column.
kaggle_metadata = pd.read_csv(
    filepath_or_buffer=f"{file_dir}/movies_metadata.csv",
    low_memory=False,
)
ratings = pd.read_csv(filepath_or_buffer=f"{file_dir}/ratings.csv")

In [5]:
# Keep only the entries that look like films: each must credit a director
# (under either key), carry an IMDb link, and have no episode count.
wiki_movies = list(filter(
    lambda entry: ("Director" in entry or "Directed by" in entry)
    and "imdb_link" in entry
    and not ("No. of episodes" in entry),
    wiki_movies_raw,
))
len(wiki_movies)

7076

In [41]:
# Keys whose values are alternate titles of a film. clean_movie() gathers
# these into a single "Alternate title(s)" entry.
alt_title_keys = [
    "Also known as",
    "Arabic",
    "Cantonese",
    "Chinese",
    "French",
    "Hangul",
    "Hebrew",
    "Hepburn",
    "Japanese",
    "Literally",
    "Mandarin",
    "McCune–Reischauer",
    "Original title",
    "Polish",
    "Revised Romanization",
    "Romanized",
    "Russian",
    "Simplified",
    "Traditional",
    "Yiddish",
]

# Column consolidation map: {current name: replacement name}.
# It is written as (replacement, current names) groups and flattened, so every
# current name maps to its group's replacement. Group order and the order
# within each group fix the dict's insertion order, which is the order the
# renames are applied in.
# NOTE(review): the two "Productioncompan..." keys end with a space as written;
# presumably that matches the raw field names - confirm before stripping it.
column_names_to_change = {
    current_name: replacement
    for replacement, current_names in (
        ("Animator(s)", ("Animation by",)),
        ("Director", ("Directed by",)),
        ("Distributor", ("Distributed by",)),
        ("Editor(s)", ("Edited by",)),
        ("Composer(s)", ("Music by", "Theme music composer")),
        ("Producer(s)", ("Producer", "Produced by")),
        ("Production company(s)", ("Productioncompanies ", "Productioncompany ")),
        ("Release date", ("Released", "Original release")),
        ("Running time", ("Length",)),
        ("Country(s)", ("Country of origin", "Country")),
        ("Language(s)", ("Original language(s)", "Language")),
        ("Writer(s)", ("Adaptation by", "Screen story by", "Screenplay by",
                       "Story by", "Written by")),
        ("Starring", ("Voices of",)),
    )
    for current_name in current_names
}

def _merge_values(existing, incoming):
    """Combine two values bound for the same column without losing either.

    Identical values collapse to the existing one. Otherwise both are
    flattened into one new list, existing entries first; the input lists are
    not modified.
    """
    if existing == incoming:
        return existing
    merged = []
    for value in (existing, incoming):
        if isinstance(value, list):
            merged.extend(value)
        else:
            merged.append(value)
    return merged


def clean_movie(movie):
    """Return a cleaned copy of one movie record.

    Two clean-ups are applied:

    * Every key listed in ``alt_title_keys`` is removed and gathered into a
      single ``"Alternate title(s)"`` dict (added only if any were found).
    * Every key listed in ``column_names_to_change`` is renamed to its
      replacement. When several keys map to the same replacement, or the
      replacement key already exists, the values are merged via
      ``_merge_values`` instead of the later one silently overwriting the
      earlier one.

    Parameters
    ----------
    movie : dict
        Raw record; it is not modified.

    Returns
    -------
    dict
        New dict with alternate titles grouped and column names consolidated.
    """
    movie = dict(movie)  # shallow copy so the caller's record stays intact

    # Pull all alternate-title fields out of the record and group them.
    alt_titles = {key: movie.pop(key) for key in alt_title_keys if key in movie}
    if alt_titles:
        movie["Alternate title(s)"] = alt_titles

    # Consolidate column names, merging values on collision.
    for old_name, new_name in column_names_to_change.items():
        if old_name not in movie:
            continue
        value = movie.pop(old_name)
        if new_name in movie:
            value = _merge_values(movie[new_name], value)
        movie[new_name] = value

    return movie

In [42]:
# Run every filtered record through clean_movie and tabulate the results.
clean_movies = list(map(clean_movie, wiki_movies))
clean_movies_df = pd.DataFrame(data=clean_movies)
# Display the resulting column names in alphabetical order.
sorted(list(clean_movies_df.columns))

['Alternate title(s)',
 'Animator(s)',
 'Audio format',
 'Based on',
 'Box office',
 'Budget',
 'Cinematography',
 'Color process',
 'Composer(s)',
 'Country(s)',
 'Created by',
 'Director',
 'Distributor',
 'Editor(s)',
 'Executive producer(s)',
 'Followed by',
 'Genre',
 'Label',
 'Language(s)',
 'Narrated by',
 'Original network',
 'Picture format',
 'Preceded by',
 'Producer(s)',
 'Production company(s)',
 'Production location(s)',
 'Recorded',
 'Release date',
 'Running time',
 'Starring',
 'Suggested by',
 'Venue',
 'Writer(s)',
 'imdb_link',
 'title',
 'url',
 'year']

In [44]:
# Extract each film's IMDb ID from its link, then drop duplicate films,
# i.e. rows that share an IMDb ID.
# The pattern accepts "tt" followed by seven OR MORE digits. Matching exactly
# seven would cut an 8-digit ID down to a wrong 7-digit one, which could
# collide with a different film's ID.
clean_movies_df["IMDB ID"] = clean_movies_df["imdb_link"].str.extract(r"(tt\d{7,})")
# NOTE: links with no recognizable ID yield NaN, and drop_duplicates treats
# NaN values as equal to each other, so at most one such row survives.
clean_movies_df.drop_duplicates(subset="IMDB ID", inplace=True)
print(len(clean_movies_df))
clean_movies_df.head()

7033


Unnamed: 0,url,year,imdb_link,title,Based on,Starring,Narrated by,Cinematography,Release date,Running time,...,Created by,Preceded by,Suggested by,Alternate title(s),Recorded,Venue,Label,Color process,Animator(s),IMDB ID
0,https://en.wikipedia.org/wiki/The_Adventures_o...,1990,https://www.imdb.com/title/tt0098987/,The Adventures of Ford Fairlane,"[Characters, by Rex Weiner]","[Andrew Dice Clay, Wayne Newton, Priscilla Pre...","Andrew ""Dice"" Clay",Oliver Wood,"[July 11, 1990, (, 1990-07-11, )]",102 minutes,...,,,,,,,,,,tt0098987
1,"https://en.wikipedia.org/wiki/After_Dark,_My_S...",1990,https://www.imdb.com/title/tt0098994/,"After Dark, My Sweet","[the novel, After Dark, My Sweet, by, Jim Thom...","[Jason Patric, Rachel Ward, Bruce Dern, George...",,Mark Plummer,"[May 17, 1990, (, 1990-05-17, ), (Cannes Film ...",114 minutes,...,,,,,,,,,,tt0098994
2,https://en.wikipedia.org/wiki/Air_America_(film),1990,https://www.imdb.com/title/tt0099005/,Air America,"[Air America, by, Christopher Robbins]","[Mel Gibson, Robert Downey Jr., Nancy Travis, ...",,Roger Deakins,"[August 10, 1990, (, 1990-08-10, )]",113 minutes,...,,,,,,,,,,tt0099005
3,https://en.wikipedia.org/wiki/Alice_(1990_film),1990,https://www.imdb.com/title/tt0099012/,Alice,,"[Alec Baldwin, Blythe Danner, Judy Davis, Mia ...",,Carlo Di Palma,"[December 25, 1990, (, 1990-12-25, )]",106 minutes,...,,,,,,,,,,tt0099012
4,https://en.wikipedia.org/wiki/Almost_an_Angel,1990,https://www.imdb.com/title/tt0099018/,Almost an Angel,,"[Paul Hogan, Elias Koteas, Linda Kozlowski]",,Russell Boyd,"December 19, 1990",95 minutes,...,,,,,,,,,,tt0099018
