In [10]:
import os
import json
from pprint import pprint
from collections import Counter
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import math
import pandas as pd
import statsmodels.api as sm
import numpy as np
import datetime as dt


#Put all movie data into a list of dictionaries
def movie_data(movie_file):
    movie_info_path = os.path.join('../data', movie_file)
    json_movie_list = os.listdir(movie_info_path)
    python_movie_list = []
    for json_movie in json_movie_list:
        json_path = os.path.join(movie_info_path, json_movie)
        with open(json_path, 'r') as file:
            python_movie = json.load(file)
            python_movie_list.append(python_movie)
    return python_movie_list

mojo = movie_data('boxofficemojo')
metacritic = movie_data('metacritic')

metacritic_clean = [item for item in metacritic if isinstance(item, dict)]

mojo_df = pd.DataFrame(mojo)
metacritic_df = pd.DataFrame(metacritic_clean)

merged_data = pd.merge(left=mojo_df, right=metacritic_df, how='inner', on=['title'])

def consolodate_director(col_x, col_y):
    director = []
    for i in xrange(len(col_x)):
        if col_x[i] == None:
            director.append(col_y[i])
        if col_y[i] == None:
            director.append(col_x[i])
        director.append(col_x[i])
    return pd.Series(director)

def review_breakdown(review_col, index):
    lst = []
    for group in review_col:
        lst.append(group[index])
    return pd.Series(lst)

def season_by_month(month):
    if month in (12,1,2):
      return 'winter'
    elif month in (3,4,5):
      return 'spring'
    elif month in (6,7,8):
      return 'summer'
    elif month in (9,10,11):
      return 'fall'

def group_ratings(rating):
    if rating not in ('R','PG','PG-13','Not Rated'):
        return 'other'
    else:
        return rating

merged_data['director'] = consolodate_director(merged_data['director_x'], merged_data['director_y'])
merged_data['rating_categories'] = merged_data['rating'].apply(group_ratings)
merged_data['pos_user_reviews'] = review_breakdown(merged_data['num_user_reviews'], 0)
merged_data['nut_user_reviews'] = review_breakdown(merged_data['num_user_reviews'], 1)
merged_data['neg_user_reviews'] = review_breakdown(merged_data['num_user_reviews'], 2)
merged_data['tot_user_reviews'] = review_breakdown(merged_data['num_user_reviews'], 3)
merged_data['pos_critic_reviews'] = review_breakdown(merged_data['num_critic_reviews'], 0)
merged_data['nut_critic_reviews'] = review_breakdown(merged_data['num_critic_reviews'], 1)
merged_data['neg_critic_reviews'] = review_breakdown(merged_data['num_critic_reviews'], 2)
merged_data['tot_critic_reviews'] = review_breakdown(merged_data['num_critic_reviews'], 3)

merged_data.rename(columns={'year_x':'year'}, inplace=True)

del merged_data['year_y']
del merged_data['director_x']
del merged_data['director_y']

merged_data_dropna = merged_data[['production_budget','opening_weekend_take','domestic_gross',
                                  'release_date_wide','widest_release','worldwide_gross','year', 'runtime_minutes',
                                  'metascore','user_score','pos_user_reviews','nut_user_reviews','neg_user_reviews',
                                  'tot_user_reviews','pos_critic_reviews','nut_critic_reviews','neg_critic_reviews',
                                  'tot_critic_reviews','rating','rating_categories']].dropna()

merged_data_dropna['release_month'] = merged_data_dropna['release_date_wide'].apply(lambda x: x[5:7]).astype(int)
merged_data_dropna['season'] = merged_data_dropna['release_month'].apply(season_by_month)

In [1]:
import holidays
from collections import Counter
import datetime

In [2]:
def gen_holiday(end_yr):
    us_hol = []
    for yr in range(1900,end_yr):
        for date in sorted(holidays.US(years=yr).items()):
            us_hol.append(date[0])
    return us_hol

In [3]:
us_holidays = gen_holiday(2015)

In [4]:
def make_date(dt_str):
    if dt_str == None:
        dt_conv = datetime.date(2014,1,3)
    else:
        dt_conv = datetime.date(int(dt_str[0:4]),int(dt_str[5:7]),int(dt_str[8:10]))
    return dt_conv

In [5]:
def hol_weekend(release_date):
    if make_date(release_date) in us_holidays:
        return 1
    elif make_date(release_date) + datetime.timedelta(days=1) in us_holidays:
        return 1
    elif make_date(release_date) + datetime.timedelta(days=2) in us_holidays:
        return 1
    elif make_date(release_date) + datetime.timedelta(days=3) in us_holidays:
        return 1
    elif make_date(release_date) + datetime.timedelta(days=4) in us_holidays:
        return 1
    elif make_date(release_date) + datetime.timedelta(days=5) in us_holidays:
        return 1
    else:
        return 0

In [6]:
# M == 0
def day_of_week(release_date):
    dt_conv = make_date(release_date)
    return dt_conv.weekday()        

In [11]:
merged_data['holiday_weekend'] = merged_data['release_date_wide'].apply(hol_weekend)

In [12]:
merged_data['day_of_week'] = merged_data['release_date_wide'].apply(day_of_week)

In [119]:
merged_data.loc[lambda df: df.holiday_weekend == True, :].head()

Unnamed: 0,alt_title,domestic_gross,mojo_slug,opening_per_theater,opening_weekend_take,production_budget,release_date_limited,release_date_wide,title,widest_release,...,pos_user_reviews,nut_user_reviews,neg_user_reviews,tot_user_reviews,pos_critic_reviews,nut_critic_reviews,neg_critic_reviews,tot_critic_reviews,holiday_weekend,day_of_week
19,27 Dresses (2008),76808654.0,27dresses,7526.0,23007725.0,30000000.0,,2008-01-18,27 Dresses,3074.0,...,25,6,9,40,7,22,2,31,1,4
36,47 Ronin (2013),38362475.0,47ronin,3686.0,9910310.0,175000000.0,,2013-12-25,47 Ronin,2690.0,...,74,22,23,119,1,8,12,21,1,2
38,50 First Dates (2004),120908074.0,50firstdates,11097.0,39852237.0,75000000.0,,2004-02-13,50 First Dates,3612.0,...,52,4,9,65,14,18,6,38,1,4
45,8 Mile (2002),116750901.0,8mile,20745.0,51240555.0,41000000.0,,2002-11-08,8 Mile,2585.0,...,113,7,8,128,34,4,0,38,1,4
52,About Time (2013),15322921.0,abouttime,3965.0,4758070.0,,2013-11-01,2013-11-08,About Time,1280.0,...,37,2,6,45,18,13,3,34,1,4


In [13]:
def in_list(alist, i):
    if isinstance(alist, list):
        return i in alist
    return False
 
def add_dummy(df, var_name):
    dummy = [in_list(i, var_name)*1 for i in df['genre']]
    df[var_name] = dummy
       
pprint(merged_data['genre'].head())
add_dummy(merged_data, 'Drama')
add_dummy(merged_data, 'Comedy')
add_dummy(merged_data, 'Thriller')
add_dummy(merged_data, 'Action')
add_dummy(merged_data, 'Romance')
pprint(merged_data.head())


0                          [Comedy, Romance]
1                   [Drama, Comedy, Romance]
2                              [Documentary]
3    [Adventure, Biography, Drama, Thriller]
4                  [Action, Thriller, Crime]
Name: genre, dtype: object
                           alt_title  domestic_gross              mojo_slug  \
0  10 Things I Hate About You (1999)      38178166.0  10thingsihateaboutyou   
1                    10 Years (2012)        203373.0                10years   
2               The 11th Hour (2007)        707343.0               11thhour   
3                   127 Hours (2010)      18335230.0               127hours   
4                   12 Rounds (2009)      12234694.0               12rounds   

   opening_per_theater  opening_weekend_take  production_budget  \
0               3668.0             8330681.0         30000000.0   
1               7569.0               22707.0                NaN   
2              15213.0               60853.0                NaN   
3    

In [15]:
merged_data_dropna = merged_data[['production_budget','opening_weekend_take','domestic_gross',
                                  'release_date_wide','widest_release','worldwide_gross','year', 'runtime_minutes',
                                  'metascore','user_score','pos_user_reviews','nut_user_reviews','neg_user_reviews',
                                  'tot_user_reviews','pos_critic_reviews','nut_critic_reviews','neg_critic_reviews',
                                  'tot_critic_reviews','rating','rating_categories','holiday_weekend','day_of_week',
                                  'Drama','Comedy','Thriller','Action','Romance']].dropna()
merged_data_dropna['release_month'] = merged_data_dropna['release_date_wide'].apply(lambda x: x[5:7]).astype(int)
merged_data_dropna['season'] = merged_data_dropna['release_month'].apply(season_by_month)

In [19]:
merged_data_dropna['widest_release_sq'] = merged_data_dropna['widest_release'].apply(lambda X:np.log(X))
merged_data_dropna['holiday_weekend_sq'] = merged_data_dropna['holiday_weekend'].apply(lambda X:X**3)

In [21]:
msk = np.random.rand(len(merged_data_dropna)) < 0.8
train = merged_data_dropna[msk]
test = merged_data_dropna[~msk]

In [27]:
X = sm.add_constant(train[['production_budget','runtime_minutes','holiday_weekend',
                            'widest_release_sq','holiday_weekend_sq','widest_release','metascore','day_of_week']].join(
 
linmodel = sm.OLS(Y,X).fit()

linmodel.summary()
                           pd.get_dummies(train['season']).join(
                            pd.get_dummies(train['rating_categories']))))
Y = train['opening_weekend_take']


SyntaxError: invalid syntax (<ipython-input-27-6a87ca2f65ab>, line 6)

In [29]:
X = sm.add_constant(train[['production_budget','runtime_minutes','holiday_weekend',
                            'widest_release_sq','holiday_weekend_sq','widest_release','metascore',
                          'Thriller']].join(
                            pd.get_dummies(train['season']).join(
                            pd.get_dummies(train['rating_categories'])).join(pd.get_dummies(train['day_of_week']))))
Y = train['opening_weekend_take']

linmodel = sm.OLS(Y,X).fit()

linmodel.summary()

0,1,2,3
Dep. Variable:,opening_weekend_take,R-squared:,0.601
Model:,OLS,Adj. R-squared:,0.592
Method:,Least Squares,F-statistic:,70.03
Date:,"Fri, 12 Aug 2016",Prob (F-statistic):,1.7000000000000001e-161
Time:,10:51:51,Log-Likelihood:,-16016.0
No. Observations:,905,AIC:,32070.0
Df Residuals:,885,BIC:,32170.0
Df Model:,19,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,3.818e+05,3.48e+06,0.110,0.913,-6.44e+06 7.2e+06
production_budget,0.1126,0.013,8.446,0.000,0.086 0.139
runtime_minutes,1.556e+04,2.4e+04,0.648,0.517,-3.15e+04 6.26e+04
holiday_weekend,6.14e+05,5.86e+05,1.048,0.295,-5.36e+05 1.76e+06
widest_release_sq,-5.807e+06,7.96e+05,-7.296,0.000,-7.37e+06 -4.24e+06
holiday_weekend_sq,6.14e+05,5.86e+05,1.048,0.295,-5.36e+05 1.76e+06
widest_release,1.483e+04,923.242,16.060,0.000,1.3e+04 1.66e+04
metascore,1.952e+05,2.51e+04,7.773,0.000,1.46e+05 2.44e+05
Thriller,2.242e+06,8.89e+05,2.523,0.012,4.98e+05 3.99e+06

0,1,2,3
Omnibus:,650.077,Durbin-Watson:,1.775
Prob(Omnibus):,0.0,Jarque-Bera (JB):,16726.659
Skew:,2.941,Prob(JB):,0.0
Kurtosis:,23.224,Cond. No.,1.73e+24


In [240]:
train_pred = pd.DataFrame(linmodel.predict(X),index=train.index)
final = pd.concat([train_pred,Y,X],axis=1)
final

Unnamed: 0,0,opening_weekend_take,const,production_budget,runtime_minutes,holiday_weekend,day_of_week,widest_release_sq,holiday_weekend_sq,widest_release,metascore,fall,spring,summer,winter,Not Rated,PG,PG-13,R,other
0,1.846268e+07,8330681.0,1,30000000.0,97.0,0,2,7.745436,0,2311.0,70.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,2.790757e+05,2136801.0,1,18000000.0,94.0,0,4,6.820016,0,916.0,82.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
5,3.229317e+07,21054283.0,1,37000000.0,98.0,0,4,8.146999,0,3453.0,57.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
8,2.191149e+07,20617667.0,1,25000000.0,104.0,0,4,7.913155,0,2733.0,64.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
11,4.449252e+07,65237614.0,1,200000000.0,158.0,0,4,8.144389,0,3444.0,49.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
12,2.895258e+06,113074.0,1,12000000.0,129.0,0,4,4.110874,0,61.0,78.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
14,2.367606e+07,24105943.0,1,35000000.0,123.0,0,4,7.990238,0,2952.0,48.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
20,1.867456e+07,10310672.0,1,43000000.0,103.0,0,4,7.833204,0,2523.0,46.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
24,3.140203e+07,50472480.0,1,76000000.0,107.0,0,4,8.136811,0,3418.0,38.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
25,2.701089e+07,27059130.0,1,61000000.0,109.0,0,4,8.015658,0,3028.0,55.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [25]:
X = sm.add_constant(test[['production_budget','runtime_minutes','holiday_weekend',
                           'widest_release_sq','holiday_weekend_sq','day_of_week','widest_release','metascore']].join(
                                            pd.get_dummies(test['season']).join(
                                            pd.get_dummies(test['rating_categories']))))
Y = test['opening_weekend_take']

In [242]:
test_pred = pd.DataFrame(linmodel.predict(X),index=test.index)

In [243]:
final = pd.concat([test_pred,Y,X],axis=1)

In [244]:
final['diff'] = final[0] - final['opening_weekend_take']

In [245]:
final

Unnamed: 0,0,opening_weekend_take,const,production_budget,runtime_minutes,holiday_weekend,widest_release_sq,holiday_weekend_sq,day_of_week,widest_release,...,fall,spring,summer,winter,Not Rated,PG,PG-13,R,other,diff
15,6.057528e+07,8754168.0,1,13000000.0,93.0,0,7.926964,0,4,2771.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,5.182111e+07
16,2.901233e+07,274454.0,1,20000000.0,124.0,0,6.018593,0,4,411.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2.873787e+07
17,7.578342e+07,36302612.0,1,42000000.0,109.0,0,8.054523,0,4,3148.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,3.948081e+07
18,2.780529e+07,108865.0,1,5000000.0,135.0,0,6.204558,0,3,495.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,2.769642e+07
19,6.172319e+07,23007725.0,1,30000000.0,111.0,1,8.030735,1,4,3074.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,3.871547e+07
27,5.655411e+07,7160521.0,1,62000000.0,125.0,0,7.841886,0,4,2545.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,4.939359e+07
30,6.582604e+07,13330118.0,1,28000000.0,83.0,0,7.968320,0,4,2888.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,5.249593e+07
36,6.719046e+07,9910310.0,1,175000000.0,118.0,1,7.897296,1,2,2690.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,5.728015e+07
47,5.387585e+07,10740446.0,1,30000000.0,79.0,0,7.630461,0,2,2060.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,4.313540e+07
61,7.130928e+07,21157730.0,1,50200000.0,99.0,0,7.954021,0,4,2847.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,5.015155e+07


In [18]:
final.summary()

NameError: name 'final' is not defined