In [1]:
# import the packages we expect to use

import pandas as pd
import sqlite3
import re

In [2]:
# Load the two gzipped CSV sources into DataFrames.
# Only the budgets file is given an explicit encoding; the gross file is read
# with pandas' default settings.

tn = pd.read_csv(
    'dsc-phase-2-project-v3/zippedData/tn.movie_budgets.csv.gz',
    encoding='utf-8',
)
bom = pd.read_csv(
    'dsc-phase-2-project-v3/zippedData/bom.movie_gross.csv.gz',
)

In [3]:
# Connect to the IMDB SQLite database file.
#
# The path is written with forward slashes: they work on Windows as well as
# POSIX systems, and they avoid the invalid "\z" / "\i" escape sequences that
# a backslash-separated path produces inside a normal (non-raw) string literal.
#
# NOTE(review): the CSV files are read from 'dsc-phase-2-project-v3/zippedData'
# while this path points at 'Phase_2_Repo/zippedData' — confirm which directory
# is correct.  sqlite3.connect() creates a new, empty database when the file
# does not exist, so a wrong path only surfaces later as "no such table".

conn = sqlite3.connect('Phase_2_Repo/zippedData/im.db')

In [4]:
# Pull every column of the movie_basics table into a DataFrame,
# using the connection opened in the previous cell.

query = """
    SELECT *
    FROM movie_basics
;
"""
df_basics = pd.read_sql(sql=query, con=conn)

In [5]:
# Release the SQLite connection; df_basics already holds the query result
# in memory, so the open handle is no longer needed.

conn.close()

In [6]:
# Replacement table for garbled (mis-encoded) character sequences in movie
# titles.  Each key is the garbled sequence as it appears in the data and each
# value is the text substituted for it.  The table is applied with
# Series.replace(..., regex=True), so keys are treated as regex patterns, and
# the insertion order matters where one key is a prefix of another.

replace_dict = {'â\x80\x99': "'",
                'â\x80\x94': ' - ',
                'â\x80\x93': ' - ',
                'â\x80¦': '…',
                'Ã\xad': 'í',
                'Ä\x81': 'ā',
                'Ã¤': 'ä',
                'Ã¥': 'å',
                'Ã¡': 'á',
                'Ã§': 'ç',
                'Ã©': 'é',
                'Ã¨': 'è',
                'Ã«': 'ë',
                'Ãª': 'ê',
                'à¬': 'ì',   # NOTE(review): key starts with lowercase 'à', unlike the 'Ã…' keys around it — confirm this is intentional
                'Ä«': 'ī',
                'Ã´': 'ô',
                'Ã³': 'ó',
                'Ã¸': 'ø',
                'Ã¹': 'ù',
                'Ã¼': 'ü',
                'Ã»': 'ū',   # NOTE(review): 'Ã»' is normally the garbled form of 'û' (circumflex), not 'ū' (macron) — confirm the intended target
                'Ã': 'à',    # Must come after the two-character 'Ã…' patterns above, because it is a prefix of each of them
                'Â': '',     # Stray 'Â' is dropped; presumably left over from a mis-encoded non-breaking space — TODO confirm
               }

# Record-specific quirks observed in the data:
# record 5707 ("I love you, don't touch me") has the comma coded as an ellipsis
# record 2398 ("Chai Dan Zhuan Jia") has an extra space between "da" and "n"

In [7]:
# Apply the replacement table to both title columns so the garbled sequences
# are rewritten the same way on each side before the frames are compared.
# regex=True makes the keys match as substrings rather than whole cell values.

for _frame, _title_column in ((tn, 'movie'), (df_basics, 'primary_title')):
    _frame[_title_column] = _frame[_title_column].replace(
        to_replace=replace_dict,
        regex=True,
    )

In [8]:
# Left-join the IMDB basics onto the budgets frame by title, keeping every tn row.
# NOTE(review): the join key is the title alone; if a title occurs more than
# once in df_basics, the matching tn row is repeated once per match — confirm
# this is acceptable.  merged_df is also built before the release_date and
# currency columns of tn are converted in the following cells, so it carries
# their unconverted values.
merged_df = pd.merge(
    tn,
    df_basics,
    how='left',
    left_on='movie',
    right_on='primary_title',
)

In [9]:
# Parse tn's release_date column into pandas datetime values
# (no explicit format is given, so pandas infers it).

tn['release_date'] = tn['release_date'].pipe(pd.to_datetime)

In [10]:
# Strips "$" and "," from currency columns and recasts the values as int64.

def clean_currency(columns_to_clean, df=None):
    """Return the given currency columns converted to int64.

    Every value in the selected columns has its "$" and "," characters
    removed and is then cast to int64.  Columns that are already numeric are
    only recast, so re-running the cleaning cell is harmless.

    Parameters
    ----------
    columns_to_clean : list of str
        Names of the columns to convert.
    df : pandas.DataFrame, optional
        Frame to read the columns from.  Defaults to the notebook-level
        ``tn`` frame, which keeps existing one-argument calls working.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only ``columns_to_clean``, each as int64.
        The source frame is not modified.

    Raises
    ------
    ValueError
        If a value cannot be parsed as an integer once "$" and "," are
        removed (for example, a missing value).
    """
    if df is None:
        df = tn  # fall back to the budgets frame loaded earlier in the notebook

    # Raw string: matches a literal "$" or ",".  Inside a character class "$"
    # needs no escaping, which avoids the invalid "\$" escape sequence.
    currency_chars = r'[$,]'

    def _to_int64(column):
        # Already-converted columns have no .str accessor; just recast them.
        if pd.api.types.is_numeric_dtype(column):
            return column.astype('int64')
        # regex=True is required: since pandas 2.0 str.replace treats the
        # pattern as a literal string by default and would strip nothing.
        return column.str.replace(currency_chars, '', regex=True).astype('int64')

    return df[columns_to_clean].apply(_to_int64)

In [11]:
# Convert the three money columns of tn from formatted text to integers by
# running them through clean_currency and writing the result back in place.

columns_to_clean = [
    'production_budget',
    'domestic_gross',
    'worldwide_gross',
]
cleaned_currency = clean_currency(columns_to_clean)
tn[columns_to_clean] = cleaned_currency