In [1]:
import json
import pandas as pd
import numpy as np

import re

from sqlalchemy import create_engine
import psycopg2

# from config import db_password

import time

In [2]:
# 1. Add the clean movie function that takes in the argument, "movie".
#
# This function is part of the "execute" step of the three-step data-cleaning process:
# inspect / plan / execute.
# The inspect and plan steps were completed throughout Module 8, so this notebook consists
# primarily of summary execute steps; a descriptive walk-through of the full process is found in
# the Jupyter notebook file "ETL_StepThree.ipynb" in the repository, or in the online
# material for Module 8. Running this function on the full set of movies in the current Wikipedia
# dataset pares the 193 initial columns down to 39 (after filtering and merging). That count
# can be reproduced by enabling the commented-out `print(len(wiki_movies_df.columns))` line in
# the `extract_transform_load()` function below in this notebook.
# The list of the 39 columns is:
# ['url', 'year', 'imdb_link', 'title', 'Based on', 'Starring', 'Narrated by', 'Cinematography',
# 'Release date', 'Running time', 'Country', 'Language', 'Budget', 'Box office', 'Director',
# 'Distributor', 'Editor(s)', 'Composer(s)', 'Producer(s)', 'Production company(s)', 'Writer(s)',
# 'Genre', 'Original language(s)', 'Original network', 'Executive producer(s)',
# 'Production location(s)', 'Picture format', 'Audio format', 'Voices of', 'Followed by',
# 'Created by', 'Preceded by', 'Suggested by', 'alt_titles', 'Recorded', 'Venue', 'Label',
# 'Color process', 'Animator(s)'].
#
# Below is a list of the initial 193 columns (taken from the Step 4 DataFrame in the D1 notebook
# `ETL_function_test.ipynb`, or reproducible by enabling the commented-out line
# `wiki_movies_df = pd.DataFrame(wiki_movies_raw)` in the `extract_transform_load()` function
# below in *this* notebook) ...
# ['url', 'year', 'imdb_link', 'title', 'Directed by', 'Produced by', 'Screenplay by',
# 'Story by', 'Based on', 'Starring', 'Narrated by', 'Music by', 'Cinematography', 'Edited by',
# 'Productioncompany ', 'Distributed by', 'Release date', 'Running time', 'Country', 'Language',
# 'Budget', 'Box office', 'Written by', 'Genre', 'Theme music composer', 'Country of origin',
# 'Original language(s)', 'Producer(s)', 'Editor(s)', 'Production company(s)', 'Original network',
# 'Original release', 'Productioncompanies ', 'Executive producer(s)', 'Production location(s)',
# 'Distributor', 'Picture format', 'Audio format', 'Voices of', 'Followed by', 'Composer(s)',
# 'Created by', 'Also known as', 'Opening theme', 'No. of episodes', 'Preceded by', 'Author',
# 'Publisher', 'Publication date', 'Media type', 'Pages', 'ISBN', 'OCLC', 'LC Class',
# 'Cover artist', 'Series', 'Set in', 'Adaptation by', 'Suggested by', 'Biographical data',
# 'Born', 'Died', 'Resting place', 'Occupation', 'Years active', 'Spouse(s)', 'Children',
# 'Parent(s)', 'Genres', 'Instruments', 'Labels', 'Website', 'Traditional', 'Mandarin', 'Type',
# 'Industry', 'Fate', 'Founded', 'Founder', 'Headquarters', 'Parent', 'Released', 'Recorded',
# 'Venue', 'Length', 'Label', 'Director', 'Producer', 'Area', 'Coordinates', 'Status',
# 'Opening date', 'Closing date', 'Replaced', 'Replaced by', 'Name', 'Attraction type', 'Music',
# 'Duration', 'Simplified Chinese', 'Traditional Chinese', 'Hanyu Pinyin', 'Literal meaning',
# 'Transcriptions', 'Bopomofo', 'Gwoyeu Romatzyh', 'Wade–Giles', 'IPA', 'Yale Romanization',
# 'Jyutping', 'Hokkien POJ', 'Animation by', 'Color process', 'Engine(s)', 'Genre(s)',
# 'Actor control', 'Production company', 'Release(s)', 'Format(s)', 'Simplified', 'Characters',
# 'Date premiered', 'Place premiered', 'Setting', 'Original language', 'Subject', 'Published',
# 'Dewey Decimal', 'Text', 'Illustrator', 'Original title', 'Published in English', 'French',
# 'Developed by', 'Ending theme', 'No. of seasons', 'Nationality', 'Portrayed by', 'Alias',
# 'Species', 'Gender', 'Family', 'Alma mater', 'Camera setup', 'Novel(s)', 'Comics', 'Film(s)',
# 'Screen story by', 'Hangul', 'Revised Romanization', 'McCune–Reischauer', 'Developer(s)',
# 'Publisher(s)', 'Designer(s)', 'Programmer(s)', 'Artist(s)', 'Writer(s)', 'Engine',
# 'Platform(s)', 'Release', 'Mode(s)', 'Original work', 'Television series', 'Japanese',
# 'Hepburn', 'Literally', 'Cantonese', 'Full name', 'Height', 'Seasons', 'Chinese',
# 'Other names', 'Relatives', 'Yiddish', 'Formerly', 'Key people', 'Total assets', 'Owner', 
# 'Number of employees', 'Divisions', 'Subsidiaries', 'Arabic', 'Romanized', 'Predecessor',
# 'Founders', 'Area served', 'Products', 'Services', 'Russian', 'Hebrew', 'Revenue',
# 'Operating income', 'Polish']

def clean_movie(movie):
    """Return a cleaned copy of a single raw Wikipedia movie record.

    Two things happen to the copy:

    1. Every alternate-title field that is present is removed from the record
       and gathered into one nested dict stored under 'alt_titles' (the key is
       only added when at least one alternate title was found).
    2. Synonymous infobox field names are renamed to one canonical column name.
       When several source fields map to the same target, the rename applied
       last replaces the value written by earlier ones.

    Parameters
    ----------
    movie : dict
        One raw movie record. It is never modified.

    Returns
    -------
    dict
        A new dict holding the consolidated record.
    """
    cleaned = dict(movie)  # shallow copy so the caller's record stays untouched

    # Fields whose values are alternate titles; they are collected in this order.
    alternate_title_fields = (
        'Also known as',
        'Arabic',
        'Cantonese',
        'Chinese',
        'French',
        'Hangul',
        'Hebrew',
        'Hepburn',
        'Japanese',
        'Literally',
        'Mandarin',
        'McCune–Reischauer',
        'Original title',
        'Polish',
        'Revised Romanization',
        'Romanized',
        'Russian',
        'Simplified',
        'Traditional',
        'Yiddish',
    )
    alt_titles = {
        field: cleaned.pop(field)
        for field in alternate_title_fields
        if field in cleaned
    }
    if alt_titles:
        cleaned['alt_titles'] = alt_titles

    # (old name, canonical name) pairs, applied strictly in this order.
    # Names that are already canonical ('Composer(s)', 'Country', 'Director',
    # 'Distributor', 'Editor(s)', 'Producer(s)', 'Production company(s)',
    # 'Release date', 'Running time') need no entry here.
    # 'Narrated by' and 'Voices of' could perhaps be merged into a single
    # 'Narrator(s)' column, but are deliberately left as two columns for now.
    column_renames = (
        ('Adaptation by', 'Writer(s)'),
        ('Animation by', 'Animator(s)'),
        ('Country of origin', 'Country'),
        ('Directed by', 'Director'),
        ('Distributed by', 'Distributor'),
        ('Edited by', 'Editor(s)'),
        ('Length', 'Running time'),
        ('Music by', 'Composer(s)'),
        ('Original release', 'Release date'),
        ('Produced by', 'Producer(s)'),
        ('Producer', 'Producer(s)'),
        ('Productioncompanies ', 'Production company(s)'),
        ('Productioncompany ', 'Production company(s)'),
        ('Released', 'Release date'),
        ('Screen story by', 'Writer(s)'),
        ('Screenplay by', 'Writer(s)'),
        ('Story by', 'Writer(s)'),
        ('Theme music composer', 'Composer(s)'),
        ('Written by', 'Writer(s)'),
    )
    for old_name, new_name in column_renames:
        if old_name in cleaned:
            cleaned[new_name] = cleaned.pop(old_name)

    return cleaned

In [3]:
# 2 Add the function that takes in three arguments;
# Wikipedia data, Kaggle metadata, and MovieLens rating data (from Kaggle)

def extract_transform_load(wiki_path=None, kaggle_path=None, ratings_path=None):
    """Extract the three raw datasets and clean the Wikipedia movie data.

    Parameters
    ----------
    wiki_path : str or path-like, optional
        Wikipedia movies JSON file. Defaults to the notebook-level `wiki_file`.
    kaggle_path : str or path-like, optional
        Kaggle metadata CSV file. Defaults to the notebook-level `kaggle_file`.
    ratings_path : str or path-like, optional
        MovieLens ratings CSV file. Defaults to the notebook-level `ratings_file`.

    Returns
    -------
    tuple
        `(wiki_movies_df, kaggle_metadata, ratings)`. The first item is the
        cleaned Wikipedia DataFrame; the other two are the CSV files exactly as
        read by `pd.read_csv`.

    Notes
    -----
    The cleaned Wikipedia frame gains the columns 'imdb_id', 'box_office',
    'budget', 'release_date' and 'running_time'. The raw 'Box office' and
    'Running time' columns are dropped once parsed; the raw 'Budget' and
    'Release date' columns are kept alongside their parsed counterparts.
    """
    # Fall back to the notebook-level path variables so that the original
    # zero-argument call keeps working.
    if wiki_path is None:
        wiki_path = wiki_file
    if kaggle_path is None:
        kaggle_path = kaggle_file
    if ratings_path is None:
        ratings_path = ratings_file

    def join_if_list(value):
        """Join a list of strings with spaces; return any other value unchanged."""
        return ' '.join(value) if isinstance(value, list) else value

    # Read in the kaggle metadata and MovieLens ratings CSV files as Pandas DataFrames.
    kaggle_metadata = pd.read_csv(kaggle_path, low_memory=False)
    ratings = pd.read_csv(ratings_path)

    # Open and read the Wikipedia data JSON file.
    with open(wiki_path, mode='r') as file:
        wiki_movies_raw = json.load(file)

    # 3. Write a list comprehension to filter out TV shows.
    # Keep only records that name a director, carry an IMDb link, and have no
    # episode count.
    wiki_movies = [
        movie for movie in wiki_movies_raw
        if ('Director' in movie or 'Directed by' in movie)
        and 'imdb_link' in movie
        and 'No. of episodes' not in movie
    ]

    # 4. Write a list comprehension to iterate through the cleaned wiki movies list
    # and call the clean_movie function on each movie.
    clean_movies = [clean_movie(movie) for movie in wiki_movies]

    # 5. Read in the cleaned movies list from Step 4 as a DataFrame.
    # Diagnostic: swap in the next line to inspect the unfiltered, uncleaned columns.
    # wiki_movies_df = pd.DataFrame(wiki_movies_raw)
    wiki_movies_df = pd.DataFrame(clean_movies)

    # Diagnostic: column count / names right after cleaning.
    # print(len(wiki_movies_df.columns))
    # print(wiki_movies_df.columns.to_list())

    # 6. Write a try-except block to catch errors while extracting the IMDb ID using a
    #  regular expression string and dropping any imdb_id duplicates. If there is an
    #  error, capture and print the exception.
    try:
        wiki_movies_df['imdb_id'] = wiki_movies_df['imdb_link'].str.extract(r'(tt\d{7})')
        # Keep the first row for each IMDb ID.
        wiki_movies_df = wiki_movies_df.drop_duplicates(subset=['imdb_id'])
    except (AttributeError, KeyError, SyntaxError, TypeError) as error:
        # Best effort: report the failure (including its message) and carry on.
        print(f"An error ({type(error).__name__}) occurred: {error}")

    #  7. Write a list comprehension to keep the columns that don't have null values
    #  from the wiki_movies_df DataFrame.
    # Keep columns that are less than 90% null (i.e. more than 10% populated).
    max_null_count = len(wiki_movies_df) * 0.9
    wiki_columns_to_keep = [
        column for column in wiki_movies_df.columns
        if wiki_movies_df[column].isnull().sum() < max_null_count
    ]
    # .copy() gives an independent frame, so the column assignments below do not
    # raise SettingWithCopyWarning.
    wiki_movies_df = wiki_movies_df[wiki_columns_to_keep].copy()

    # 8. Create a variable that will hold the non-null values from the "Box office" column.
    box_office = wiki_movies_df['Box office'].dropna()

    # 9. Convert the box office data created in Step 8 to string values using the
    # lambda and join functions.
    box_office = box_office.apply(join_if_list)

    # 10. Write a regular expression to match the six elements of "form_one" of the
    # box office data, e.g. '$123.4 million' (or billion):
    #   a dollar sign, one or more digits, an optional decimal point, zero or more
    #   digits, optional whitespace, then the word 'million' or 'billion'.
    form_one = r'\$\d+\.?\d*\s*[mb]illion'

    # 11. Write a regular expression to match the three elements of "form_two" of the
    # box office data, e.g. '$123,456,789':
    #   a dollar sign, a group of one to three digits, then at least one group of a
    #   comma followed by exactly three digits.
    form_two = r'\$\d{1,3}(?:,\d{3})+'

    # 12. Add the parse_dollars function.
    def parse_dollars(s):
        """Convert a dollar-amount string to a float; return NaN if it is not recognised."""
        # Anything that is not a string (e.g. NaN from a failed extract) stays NaN.
        if not isinstance(s, str):
            return np.nan

        # Form '$###.# million': strip '$', whitespace and letters, then scale.
        if re.match(r'\$\s*\d+\.?\d*\s*million', s, flags=re.I):
            return float(re.sub(r'\$|\s|[a-zA-Z]', '', s)) * 10**6

        # Form '$###.# billion': same stripping, larger scale.
        if re.match(r'\$\s*\d+\.?\d*\s*billion', s, flags=re.I):
            return float(re.sub(r'\$|\s|[a-zA-Z]', '', s)) * 10**9

        # Form '$###,###,###': strip the dollar sign and the commas.
        if re.match(r'\$\s*\d{1,3}(?:[,\.]\d{3})+(?!\s[mb]illion)', s, flags=re.I):
            return float(re.sub(r'\$|,', '', s))

        return np.nan

    money_pattern = f'({form_one}|{form_two})'

    # 13. Clean the box office column in the wiki_movies_df DataFrame.
    wiki_movies_df['box_office'] = (
        box_office.str.extract(money_pattern, flags=re.I)[0].apply(parse_dollars)
    )

    # Drop the original (now converted, no longer needed) Box office column.
    wiki_movies_df = wiki_movies_df.drop('Box office', axis=1)

    # 14. Clean the budget column in the wiki_movies_df DataFrame.
    # List-valued cells are joined first, as for the other columns; otherwise
    # `.str.extract` yields NaN for them.
    budget = wiki_movies_df['Budget'].dropna().apply(join_if_list)
    wiki_movies_df['budget'] = (
        budget.str.extract(money_pattern, flags=re.I)[0].apply(parse_dollars)
    )
    # The original 'Budget' column is intentionally kept next to the parsed 'budget'.

    # 15. Clean the release date column in the wiki_movies_df DataFrame.
    # The four forms parsed are:
    #  1) Full month name, one- to two-digit day, four-digit year (January 1, 2000)
    #  2) Four-digit year, two-digit month, two-digit day, any separator (2000-01-01)
    #  3) Full month name, four-digit year (January 2000)
    #  4) Four-digit year
    month_names = (
        r'(?:January|February|March|April|May|June|July|August'
        r'|September|October|November|December)'
    )
    date_form_one = month_names + r'\s[123]?\d,\s\d{4}'
    date_form_two = r'\d{4}.[01]\d.[0123]\d'
    date_form_three = month_names + r'\s\d{4}'
    date_form_four = r'\d{4}'

    # Non-null release dates, with list values joined into one string.
    release_date = wiki_movies_df['Release date'].dropna().apply(join_if_list)

    # NOTE(review): `infer_datetime_format` is deprecated in pandas 2.x; revisit this
    # call (mixed date formats) before upgrading pandas.
    wiki_movies_df['release_date'] = pd.to_datetime(
        release_date.str.extract(
            f'({date_form_one}|{date_form_two}|{date_form_three}|{date_form_four})')[0],
        infer_datetime_format=True)

    # 16. Clean the running time column in the wiki_movies_df DataFrame.
    running_time = wiki_movies_df['Running time'].dropna().apply(join_if_list)

    # Capture groups 0 and 1 are hours and trailing minutes ('1 h 30');
    # capture group 2 is a plain minutes value ('90 m').
    running_time_extract = running_time.str.extract(
        r'(\d+)\s*ho?u?r?s?\s*(\d*)|(\d+)\s*m')

    # Convert to numbers; empty strings are coerced to NaN and then filled with 0.
    running_time_extract = running_time_extract.apply(
        lambda col: pd.to_numeric(col, errors='coerce')).fillna(0)

    # Use the plain-minutes group when it is non-zero, else hours*60 + minutes.
    wiki_movies_df['running_time'] = running_time_extract.apply(
        lambda row: row[0]*60 + row[1] if row[2] == 0 else row[2], axis=1)

    # Drop the original (now converted) `Running time` column.
    wiki_movies_df = wiki_movies_df.drop('Running time', axis=1)

    # Return the cleaned Wikipedia frame plus the two frames read from CSV.
    return wiki_movies_df, kaggle_metadata, ratings

In [4]:
# 17. Create the path to your file directory and variables for the three files.
# `file_dir` ends with a path separator, so file names are appended to it directly;
# adding another '/' would produce a doubled separator ("Mod_8//...").
file_dir = "../../../DataBootcamp/Mod_8/"
# The Wikipedia data
wiki_file = f'{file_dir}wikipedia-movies.json'
# The Kaggle metadata
kaggle_file = f'{file_dir}movies_metadata.csv'
# The MovieLens rating data.
ratings_file = f'{file_dir}ratings.csv'

In [5]:
# 18. Set the three variables equal to the function created in D1.
# NOTE: this rebinds `wiki_file`, `kaggle_file` and `ratings_file` from the path
# strings set in the previous cell to the three DataFrames returned by the function.
# extract_transform_load() reads those same names as file paths when called without
# arguments, so re-run the path cell before running this cell a second time.
wiki_file, kaggle_file, ratings_file = extract_transform_load()

In [6]:
# 19. Set the wiki_movies_df equal to the wiki_file variable.
# After step 18, `wiki_file` holds the first value returned by
# extract_transform_load() (the cleaned Wikipedia DataFrame), not a file path.
wiki_movies_df = wiki_file

In [7]:
# 20. Check that the wiki_movies_df DataFrame looks like this.
# Preview the first five rows of the cleaned Wikipedia DataFrame.
wiki_movies_df.head(n=5)

Unnamed: 0,url,year,imdb_link,title,Based on,Starring,Cinematography,Release date,Country,Language,...,Editor(s),Composer(s),Producer(s),Production company(s),Writer(s),imdb_id,box_office,budget,release_date,running_time
0,https://en.wikipedia.org/wiki/The_Adventures_o...,1990,https://www.imdb.com/title/tt0098987/,The Adventures of Ford Fairlane,"[Characters, by Rex Weiner]","[Andrew Dice Clay, Wayne Newton, Priscilla Pre...",Oliver Wood,"[July 11, 1990, (, 1990-07-11, )]",United States,English,...,Michael Tronick,"[Cliff Eidelman, Yello]","[Steve Perry, Joel Silver]",Silver Pictures,"[David Arnott, James Cappe]",tt0098987,21400000.0,20000000.0,1990-07-11,102.0
1,"https://en.wikipedia.org/wiki/After_Dark,_My_S...",1990,https://www.imdb.com/title/tt0098994/,"After Dark, My Sweet","[the novel, After Dark, My Sweet, by, Jim Thom...","[Jason Patric, Rachel Ward, Bruce Dern, George...",Mark Plummer,"[May 17, 1990, (, 1990-05-17, ), (Cannes Film ...",United States,English,...,Howard E. Smith,Maurice Jarre,"[Ric Kidney, Robert Redlin]",Avenue Pictures,"[James Foley, Robert Redlin]",tt0098994,2700000.0,6000000.0,1990-05-17,114.0
2,https://en.wikipedia.org/wiki/Air_America_(film),1990,https://www.imdb.com/title/tt0099005/,Air America,"[Air America, by, Christopher Robbins]","[Mel Gibson, Robert Downey Jr., Nancy Travis, ...",Roger Deakins,"[August 10, 1990, (, 1990-08-10, )]",United States,"[English, Lao]",...,"[John Bloom, Lois Freeman-Fox]",Charles Gross,Daniel Melnick,"[Carolco Pictures, IndieProd Company]","[John Eskow, Richard Rush]",tt0099005,57718089.0,35000000.0,1990-08-10,113.0
3,https://en.wikipedia.org/wiki/Alice_(1990_film),1990,https://www.imdb.com/title/tt0099012/,Alice,,"[Alec Baldwin, Blythe Danner, Judy Davis, Mia ...",Carlo Di Palma,"[December 25, 1990, (, 1990-12-25, )]",United States,English,...,Susan E. Morse,,Robert Greenhut,,Woody Allen,tt0099012,7331647.0,12000000.0,1990-12-25,106.0
4,https://en.wikipedia.org/wiki/Almost_an_Angel,1990,https://www.imdb.com/title/tt0099018/,Almost an Angel,,"[Paul Hogan, Elias Koteas, Linda Kozlowski]",Russell Boyd,"December 19, 1990",US,English,...,David Stiven,Maurice Jarre,John Cornell,,Paul Hogan,tt0099018,6939946.0,25000000.0,1990-12-19,95.0


In [8]:
# 21. Check that wiki_movies_df DataFrame columns are correct.
# List every column name present after the cleaning steps.
list(wiki_movies_df.columns)

['url',
 'year',
 'imdb_link',
 'title',
 'Based on',
 'Starring',
 'Cinematography',
 'Release date',
 'Country',
 'Language',
 'Budget',
 'Director',
 'Distributor',
 'Editor(s)',
 'Composer(s)',
 'Producer(s)',
 'Production company(s)',
 'Writer(s)',
 'imdb_id',
 'box_office',
 'budget',
 'release_date',
 'running_time']