In [2]:
import pickle
import pandas as pd
import numpy as np
# Load the two pickled DataFrames. The context managers close each file handle
# as soon as it has been read (a bare open() inside the call never closes it).
# NOTE(review): pickle.load can execute arbitrary code -- only load trusted files.
with open("mojo_movies.pkl", "rb") as mojo_file:
    mojodf = pickle.load(mojo_file)
with open("numdf.pkl", "rb") as num_file:
    numdf = pickle.load(num_file)

In [3]:
# Preview the first rows of numdf before any cleaning.
numdf.head()

Unnamed: 0,budget,domestic_gross,movie_title,release_date,worldwide_gross
0,"$175,000,000","$102,491,776",The Mummy: Tomb of the Dragon Emperor,8/1/2008,"$405,760,225"
1,"$175,000,000","$100,289,690",Evan Almighty,6/22/2007,"$174,131,329"
2,"$175,000,000","$88,246,220",Waterworld,7/28/1995,"$264,246,220"
3,"$175,000,000","$39,175,066",King Arthur: Legend of the Sword,5/12/2017,"$139,950,708"
4,"$175,000,000","$38,362,475",47 Ronin,12/25/2013,"$151,716,815"


In [4]:
# Convert dollar amounts to integers
def dol_to_int(x):
    """Return the integer value of a currency string such as '$175,000,000'."""
    # Delete every '$' and ',' in one pass, then parse what is left.
    return int(x.translate(str.maketrans('', '', '$,')))
# Parse every budget string into an integer.
numdf['budget'] = numdf['budget'].map(dol_to_int)

In [5]:
# The two gross columns get the same dollar-string -> int treatment as budget.
for money_col in ('domestic_gross', 'worldwide_gross'):
    numdf[money_col] = numdf[money_col].apply(dol_to_int)
# Release dates arrive as strings; turn them into pandas datetimes.
numdf['release_date'] = pd.to_datetime(numdf['release_date'])

In [6]:
# Re-inspect numdf after converting the dollar columns and release_date.
numdf.head()

Unnamed: 0,budget,domestic_gross,movie_title,release_date,worldwide_gross
0,175000000,102491776,The Mummy: Tomb of the Dragon Emperor,2008-08-01,405760225
1,175000000,100289690,Evan Almighty,2007-06-22,174131329
2,175000000,88246220,Waterworld,1995-07-28,264246220
3,175000000,39175066,King Arthur: Legend of the Sword,2017-05-12,139950708
4,175000000,38362475,47 Ronin,2013-12-25,151716815


In [7]:
# Preview the first rows of mojodf before its columns are renamed.
mojodf.head()

Unnamed: 0,domestic total gross,genre,movie title,opening weekend gross,opening weekend percentage,rating,release date,runtime
0,65187603,Adventure,Jack the Giant Slayer,27202226,41.7,PG-13,"March 1, 2013",1 hrs 54 min
1,177243721,Family Adventure,Night at the Museum:\nBattle of the Smithsonian,54173286,30.6,PG,"May 22, 2009",1 hrs 45 min
2,65187603,Adventure,Jack the Giant Slayer,27202226,41.7,PG-13,"March 1, 2013",1 hrs 54 min
3,177243721,Family Adventure,Night at the Museum:\nBattle of the Smithsonian,54173286,30.6,PG,"May 22, 2009",1 hrs 45 min
4,57138719,Thriller,Unlawful Entry,10067609,17.6,R,"June 26, 1992",1 hrs 57 min


In [8]:
# Replace mojodf's column labels (assigned by position) with snake_case names.
mojodf.columns = [
    'domestic_gross',
    'genre',
    'movie_title',
    'opening_weekend_gross',
    'opening_weekend_percentage',
    'rating',
    'release_date',
    'runtime',
]

In [9]:
#clean strings
def string_clean(x):
    """Normalise a title: lower-case, trim, drop ':' and turn newlines into spaces."""
    trimmed = str(x).lower().strip()
    # One translation pass: delete colons, map each newline to a single space.
    return trimmed.translate({ord(':'): None, ord('\n'): ' '})

In [10]:
# Normalise the titles in both frames the same way so they can be compared.
mojodf['movie_title'] = mojodf['movie_title'].map(string_clean)
numdf['movie_title'] = numdf['movie_title'].map(string_clean)

In [11]:
# Combine the two dataframes by fuzzy-matching titles that differ slightly.
# difflib ranks candidates by SequenceMatcher similarity ratio ("gestalt pattern
# matching"); it is not a Levenshtein edit distance.
import difflib
def title_matcher(x, candidates=None):
    """Return the closest title to ``x`` among ``candidates``, or NaN if none is close.

    Parameters
    ----------
    x : str
        Cleaned title to look up.
    candidates : iterable of str, optional
        Titles to search. Defaults to ``mojodf['movie_title']`` so the function
        can still be passed directly to ``Series.apply``.

    Returns
    -------
    str or float
        The best match reported by ``difflib.get_close_matches`` (library
        default cutoff), or ``np.nan`` when nothing qualifies or ``x`` is not
        a string.
    """
    if not isinstance(x, str):
        # Missing / non-text titles cannot be matched; keep the "no match" result.
        return np.nan
    if candidates is None:
        candidates = mojodf['movie_title']
    # get_close_matches returns a LIST ordered best-first; empty means no match.
    matches = difflib.get_close_matches(x, candidates)
    # Explicit emptiness check instead of a bare except, so real errors surface.
    # np.nan is used because the np.NaN alias was removed in NumPy 2.0.
    return matches[0] if matches else np.nan

In [12]:
# For each numdf title, record its closest mojodf title (NaN when none is found).
numdf['likely_match'] = numdf['movie_title'].map(title_matcher)

In [13]:
# Take an independent copy: plain assignment would only give numdf a second
# name, so later in-place edits to workingcopy would also alter numdf.
workingcopy = numdf.copy()
numdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5494 entries, 0 to 5493
Data columns (total 6 columns):
budget             5494 non-null int64
domestic_gross     5494 non-null int64
movie_title        5494 non-null object
release_date       5494 non-null datetime64[ns]
worldwide_gross    5494 non-null int64
likely_match       4725 non-null object
dtypes: datetime64[ns](1), int64(3), object(2)
memory usage: 257.6+ KB


In [14]:
# Drop rows for which no close title match was found (likely_match is NaN),
# then confirm the remaining row count.
workingcopy.dropna(axis=0,subset=['likely_match'],inplace=True)
workingcopy.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4725 entries, 0 to 5493
Data columns (total 6 columns):
budget             4725 non-null int64
domestic_gross     4725 non-null int64
movie_title        4725 non-null object
release_date       4725 non-null datetime64[ns]
worldwide_gross    4725 non-null int64
likely_match       4725 non-null object
dtypes: datetime64[ns](1), int64(3), object(2)
memory usage: 258.4+ KB


In [15]:
# Overwrite each title with its matched counterpart so that a join on
# 'movie_title' can use exact equality.
workingcopy['movie_title'] = workingcopy['likely_match']

In [16]:
# Inner-join the two sources on the (now aligned) title column.
cinedf = mojodf.merge(workingcopy, on='movie_title')

In [17]:
# Inspect the merged frame; column names present in both inputs carry _x/_y suffixes.
cinedf.head()

Unnamed: 0,domestic_gross_x,genre,movie_title,opening_weekend_gross,opening_weekend_percentage,rating,release_date_x,runtime,budget,domestic_gross_y,release_date_y,worldwide_gross,likely_match
0,65187603,Adventure,jack the giant slayer,27202226,41.7,PG-13,"March 1, 2013",1 hrs 54 min,100000,10178331,2006-09-29,10243159,jack the giant slayer
1,65187603,Adventure,jack the giant slayer,27202226,41.7,PG-13,"March 1, 2013",1 hrs 54 min,100000,10178331,2006-09-29,10243159,jack the giant slayer
2,177243721,Family Adventure,night at the museum battle of the smithsonian,54173286,30.6,PG,"May 22, 2009",1 hrs 45 min,150000000,177243721,2009-05-22,402231063,night at the museum battle of the smithsonian
3,177243721,Family Adventure,night at the museum battle of the smithsonian,54173286,30.6,PG,"May 22, 2009",1 hrs 45 min,150000000,177243721,2009-05-22,402231063,night at the museum battle of the smithsonian
4,87242834,Fantasy,miss peregrine's home for peculiar children,28871140,33.1,PG-13,"September 30, 2016",2 hrs 7 min,110000000,87242834,2016-09-30,296642834,miss peregrine's home for peculiar children


In [18]:
# Does the merged frame contain any missing value at all?
cinedf.isna().values.any()

True

In [19]:
# Discard every row that contains a missing value, then renumber the index from 0.
cinedf = cinedf.dropna().reset_index(drop=True)

In [20]:
# Verify that no missing values remain.
cinedf.isna().values.any()

False

In [21]:
# Checkpoint the merged frame to disk.
with open('cinedf.pkl', 'wb') as merged_out:
    pickle.dump(cinedf, merged_out)

In [22]:
# Remove the right-hand duplicates of gross/date and the helper match column.
redundant_cols = ['domestic_gross_y', 'release_date_y', 'likely_match']
cinedf = cinedf.drop(columns=redundant_cols)

In [23]:
# Strip the merge suffix from the two columns that kept one; every other
# column already has its final name.
cinedf = cinedf.rename(columns={
    'domestic_gross_x': 'domestic_gross',
    'release_date_x': 'release_date',
})

In [24]:
# One-hot encode 'rating': one indicator column per distinct rating value.
dummy = pd.get_dummies(cinedf['rating'])

In [25]:
# Swap the categorical 'rating' column for its indicator columns.
cinedf = pd.concat([cinedf.drop(columns=['rating']), dummy], axis='columns')

In [26]:
#Convert runtime to numeric value in minutes
def runtime_parser(runtime):
    """Convert a string shaped like '1 hrs 54 min' into total minutes (114)."""
    # Tokens 0 and 2 hold the hour and minute counts; 1 and 3 are the unit labels.
    pieces = runtime.split()
    hours, minutes = int(pieces[0]), int(pieces[2])
    return hours * 60 + minutes

In [27]:
# Replace the runtime strings with their length in minutes, stored as a numeric column.
cinedf['runtime'] = pd.to_numeric(cinedf['runtime'].apply(runtime_parser))

In [28]:
# Parse the release_date strings into pandas datetime values
# (the result is a datetime column, not a plain number).
cinedf['release_date'] = pd.to_datetime(cinedf['release_date'])

In [29]:
# Keep only the first occurrence of each fully identical row.
cinedf = cinedf.drop_duplicates()

In [31]:
# Persist the cleaned frame to disk.
with open('data_clean.pkl', 'wb') as clean_out:
    pickle.dump(cinedf, clean_out)