In [1]:
import os
import pandas as pd
import morejson as json
import holcrawl
import holcrawl.shared
from tqdm import tqdm
import numpy as np

In [2]:
# Directory whose .json profile files are loaded into `profiles` further down.
UNITED_DIR_PATH = holcrawl.shared._get_united_dir_path()
# When True, the file-loading loop is wrapped in a tqdm progress bar.
verbose = True

In [3]:
# Demographic keys that are kept when the `avg_rating_per_demo` and
# `votes_per_demo` dict columns are expanded into flat columns.
DEMOGRAPHICS = [
    'aged_under_18',
    'males_under_18',
    'males_aged_45+',
    'females',
    'males_aged_18-29',
    'imdb_staff',
    'imdb_users',
    'males',
    'aged_30-44',
    'females_aged_45+',
    'aged_18-29',
    'females_aged_18-29',
    'aged_45+',
    'males_aged_30-44',
    'top_1000_voters',
    'females_under_18',
    'females_aged_30-44',
    'us_users',
    'non-us_users'
]

In [4]:
def _decompose_dict_column(df, colname, allowed_cols):
    newdf = df[colname].apply(pd.Series)
    newdf = newdf.drop([
        col for col in newdf.columns if col not in allowed_cols], axis=1)
    newdf.columns = [colname+'.'+col for col in newdf.columns]
    return pd.concat([df.drop([colname], axis=1), newdf], axis=1)


def _dummy_list_column(df, colname):
    value_set = set([
        value for value_list in df[colname].dropna() for value in value_list])
    def _value_list_to_dict(value_list):
        try:
            return {
                value : 1 if value in value_list else 0
                for value in value_set}
        except TypeError:
            return {value : 0 for value in value_set}
    df[colname] = df[colname].apply(_value_list_to_dict)
    return _decompose_dict_column(df, colname, list(value_set))

In [5]:
def _list_to_sorted_str(listi):
    return '_'.join(sorted([string.replace('-', '') for string in listi]))

In [6]:
# Keep only the JSON profiles and sort them by file name: os.listdir returns
# entries in arbitrary order, which would make the row order of the resulting
# DataFrame differ between machines and runs. Filtering up front also makes
# the progress bar count only the files that are actually read.
profile_files = sorted(
    file_name for file_name in os.listdir(UNITED_DIR_PATH)
    if os.path.splitext(file_name)[1] == '.json')
if verbose:
    profile_files = tqdm(profile_files)
profiles = []
for profile_file in profile_files:
    if verbose:
        profile_files.set_description('Reading {}'.format(profile_file))
    file_path = os.path.join(UNITED_DIR_PATH, profile_file)
    with open(file_path, 'r') as json_file:
        profiles.append(json.load(json_file))

Reading woman_in_gold.json: 100%|██████████| 180/180 [00:00<00:00, 675.89it/s]           


In [11]:
# Inspect the first opening_weekend_date value (a datetime.date, per the
# saved output). NOTE: needs `df`, which is created in a cell further down.
df['opening_weekend_date'][0]

datetime.date(2014, 4, 18)

In [None]:
def _avg_review_by_opening_generator(colname):
    def _avg_review_by_opening(row):
        return np.mean([
            review['score'] for review in row[colname] 
            if review['review_date'] <= row['opening_weekend_date']
        ])
    return _avg_review_by_opening

In [9]:
# Build the working frame from the loaded profiles, one row per profile
# (presumably each profile is a dict of fields - confirm against the JSON).
df = pd.DataFrame(profiles)

In [23]:
# Year of every opening weekend.
df['opening_weekend_date'].apply(lambda opening: opening.year)

0      2014
1      2014
2      2014
3      2014
4      2014
5      2014
6      2014
7      2014
8      2015
9      2014
10     2014
11     2015
12     2014
13     2014
14     2015
15     2014
16     2015
17     2014
18     2014
19     2015
20     2014
21     2015
22     2015
23     2015
24     2014
25     2014
26     2015
27     2014
28     2015
29     2015
       ... 
150    2015
151    2015
152    2014
153    2014
154    2014
155    2014
156    2014
157    2015
158    2015
159    2014
160    2015
161    2015
162    2015
163    2014
164    2015
165    2015
166    2014
167    2014
168    2014
169    2015
170    2015
171    2014
172    2014
173    2015
174    2014
175    2015
176    2014
177    2014
178    2014
179    2015
Name: opening_weekend_date, dtype: int64

In [22]:
# Day of the year (1-366) of every opening weekend.
df['opening_weekend_date'].apply(
    lambda opening: opening.timetuple().tm_yday)

0      108
1      164
2       66
3       52
4       87
5      108
6      262
7       45
8      114
9       45
10     283
11     149
12     206
13     276
14     198
15     241
16     121
17      52
18      80
19     107
20     311
21     261
22      30
23      16
24     143
25     115
26      23
27      94
28      65
29     107
      ... 
150    100
151    275
152    262
153     38
154     17
155     38
156    199
157    359
158     65
159    269
160    212
161     16
162      2
163    262
164    142
165    198
166    108
167    262
168    360
169     65
170    107
171     38
172     73
173    331
174    122
175    149
176    283
177    339
178    199
179     93
Name: opening_weekend_date, dtype: int64

In [None]:
# Flatten the dict-valued columns into '<column>.<key>' columns.
df = _decompose_dict_column(df, 'avg_rating_per_demo', DEMOGRAPHICS)
df = _decompose_dict_column(df, 'votes_per_demo', DEMOGRAPHICS)
df = _decompose_dict_column(df, 'rating_freq', list(map(str, range(1, 11))))

# One 0/1 indicator column per genre (the unused alternative would be a
# single sorted string per movie, built with _list_to_sorted_str).
df = _dummy_list_column(df, 'genres')

# Mean Metacritic critic / user score over the reviews dated on or before
# the opening weekend.
df['avg_mc_critic_by_opening'] = df.apply(
    _avg_review_by_opening_generator('mc_pro_critic_reviews'), axis=1)
df['avg_mc_user_by_opening'] = df.apply(
    _avg_review_by_opening_generator('mc_user_reviews'), axis=1)

# Calendar features derived from the opening weekend date.
df['opening_month'] = df['opening_weekend_date'].apply(
    lambda opening: opening.month)
df['opening_day'] = df['opening_weekend_date'].apply(
    lambda opening: opening.day)
df['opening_day_of_year'] = df['opening_weekend_date'].apply(
    lambda opening: opening.timetuple().tm_yday)

In [None]:
# List the column labels of the current frame.
df.columns

In [15]:
# Raise the column display limit so that wide frames are shown in full.
pd.set_option('display.max_columns', 999)
df

Unnamed: 0,avg_screens,budget,budget_currency,closing_date,critic_review_count,duration,gross_income,max_screens,mc_avg_user_score,mc_metascore,mc_mixed_rating_frequency,mc_movie_name,mc_negative_rating_frequency,mc_positive_rating_frequency,mc_pro_critic_reviews,mc_user_reviews,metascore,name,num_weekends,opening_weekend_date,opening_weekend_income,opening_weekend_income_currency,rating,rating_count,release_day,release_month,release_year,user_review_count,year,avg_rating_per_demo.aged_18-29,avg_rating_per_demo.aged_30-44,avg_rating_per_demo.aged_45+,avg_rating_per_demo.aged_under_18,avg_rating_per_demo.females,avg_rating_per_demo.females_aged_18-29,avg_rating_per_demo.females_aged_30-44,avg_rating_per_demo.females_aged_45+,avg_rating_per_demo.females_under_18,avg_rating_per_demo.imdb_staff,avg_rating_per_demo.imdb_users,avg_rating_per_demo.males,avg_rating_per_demo.males_aged_18-29,avg_rating_per_demo.males_aged_30-44,avg_rating_per_demo.males_aged_45+,avg_rating_per_demo.males_under_18,avg_rating_per_demo.non-us_users,avg_rating_per_demo.top_1000_voters,avg_rating_per_demo.us_users,votes_per_demo.aged_18-29,votes_per_demo.aged_30-44,votes_per_demo.aged_45+,votes_per_demo.aged_under_18,votes_per_demo.females,votes_per_demo.females_aged_18-29,votes_per_demo.females_aged_30-44,votes_per_demo.females_aged_45+,votes_per_demo.females_under_18,votes_per_demo.imdb_staff,votes_per_demo.imdb_users,votes_per_demo.males,votes_per_demo.males_aged_18-29,votes_per_demo.males_aged_30-44,votes_per_demo.males_aged_45+,votes_per_demo.males_under_18,votes_per_demo.non-us_users,votes_per_demo.top_1000_voters,votes_per_demo.us_users,rating_freq.1,rating_freq.10,rating_freq.2,rating_freq.3,rating_freq.4,rating_freq.5,rating_freq.6,rating_freq.7,rating_freq.8,rating_freq.9,genres.action,genres.adventure,genres.animation,genres.biography,genres.comedy,genres.crime,genres.documentary,genres.drama,genres.family,genres.fantasy,genres.history,genres.horror,genres.music,genres.musical,genres.m
ystery,genres.romance,genres.sci-fi,genres.sport,genres.thriller,genres.war,avg_mc_critic_by_opening,avg_mc_user_by_opening
0,45.000000,4000000.0,$,2014-04-18,94,93,9134,45,6.8,44,10,13 Sins,3,23,"[{'publication': 'RogerEbert.com', 'review_dat...","[{'total_reactions': 1, 'neg_reactions': 0, 'u...",44,13 Sins,1,2014-04-18,9134,$,6.3,25068,,,,69,2014,6.4,6.2,6.3,7.3,6.4,6.5,6.3,6.5,7.6,7.0,6.3,6.3,6.4,6.2,6.3,7.3,6.2,5.8,6.4,8575.0,10068.0,2256.0,55.0,3968.0,1882.0,1525.0,325.0,11.0,4.0,25068.0,17778.0,6605.0,8426.0,1889.0,44.0,12964.0,230.0,3845.0,337,1254,280,544,1255,3351,6991,6984,3093,979,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,49.700000,
1,2605.375000,50000000.0,$,2014-11-07,335,112,191616238,3426,8.1,71,81,22 Jump Street,42,621,"[{'publication': 'Boston Globe', 'review_date'...","[{'total_reactions': 1, 'neg_reactions': 0, 'u...",71,22 Jump Street,8,2014-06-13,57071445,$,7.1,274236,13.0,June,2014.0,322,2014,7.3,6.7,6.3,7.9,7.1,7.3,6.8,6.4,8.2,6.1,7.1,7.0,7.3,6.7,6.3,7.8,6.9,6.2,7.2,128688.0,65135.0,9028.0,2267.0,40031.0,25654.0,8858.0,1383.0,620.0,16.0,274236.0,180467.0,101712.0,55439.0,7490.0,1628.0,119172.0,506.0,35147.0,2883,25688,1901,3165,6626,16774,43760,81836,64989,26614,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,72.844444,7.833333
2,1891.833333,110000000.0,$,2014-05-16,367,102,106369117,3490,6.0,48,179,300: Rise of an Empire,105,277,"[{'publication': 'Chicago Sun-Times', 'review_...","[{'total_reactions': 4, 'neg_reactions': 0, 'u...",48,300: Rise of an Empire,12,2014-03-07,45038460,$,6.2,234420,7.0,March,2014.0,532,2014,6.3,6.1,6.1,6.7,6.3,6.3,6.2,6.2,6.8,6.3,6.2,6.2,6.3,6.1,6.1,6.7,6.1,5.7,6.2,95933.0,73988.0,11634.0,691.0,20214.0,10390.0,6941.0,1219.0,78.0,11.0,234420.0,174235.0,84680.0,66014.0,10209.0,610.0,110596.0,496.0,22633.0,10053,15924,4188,7262,13817,29217,53207,56500,31549,12703,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,49.833333,7.142857
3,938.750000,28000000.0,$,2014-05-09,215,117,30688364,2872,5.6,40,51,3 Days to Kill,24,41,"[{'publication': 'Chicago Sun-Times', 'review_...","[{'total_reactions': 1, 'neg_reactions': 0, 'u...",40,3 Days to Kill,12,2014-02-21,12242218,$,6.2,72337,21.0,February,2014.0,216,2014,6.3,6.0,6.3,6.7,6.3,6.4,6.2,6.5,6.7,6.1,6.2,6.2,6.3,6.0,6.3,6.7,6.1,5.7,6.1,21120.0,28352.0,8328.0,182.0,7668.0,3020.0,2906.0,1089.0,39.0,6.0,72337.0,53624.0,17895.0,25103.0,7092.0,142.0,36147.0,381.0,9738.0,1167,3518,1138,2125,4308,9575,19788,19413,8586,2719,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,45.523810,6.500000
4,29.000000,10000000.0,$,2014-10-10,8,110,1063723,133,8.1,37,0,50 to 1,1,6,"[{'publication': 'Washington Post', 'review_da...","[{'total_reactions': 0, 'neg_reactions': 0, 'u...",37,50 to 1,14,2014-03-28,39638,$,6.6,1482,,,,55,2014,6.5,6.2,6.7,,7.6,6.8,7.4,8.1,,,6.6,6.2,6.4,5.9,6.3,,6.2,5.2,6.7,174.0,456.0,533.0,,457.0,55.0,147.0,235.0,,,1482.0,747.0,117.0,306.0,290.0,,480.0,48.0,405.0,36,433,25,38,56,123,231,279,169,92,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,,
5,1303.600000,4000000.0,$,2014-06-06,69,86,17314483,2310,3.8,17,22,A Haunted House 2,57,27,"[{'publication': 'St. Louis Post-Dispatch', 'r...","[{'total_reactions': 2, 'neg_reactions': 0, 'u...",17,A Haunted House 2,5,2014-04-18,8843875,$,4.7,15409,18.0,April,2014.0,50,2014,5.0,4.3,4.0,5.7,4.8,5.0,4.5,4.1,6.3,,4.7,4.6,4.9,4.3,4.0,5.6,4.5,3.9,4.4,6946.0,4103.0,788.0,161.0,1932.0,1157.0,499.0,104.0,30.0,,15409.0,10838.0,5726.0,3563.0,672.0,131.0,6458.0,172.0,2592.0,1663,1620,1096,1384,1921,2701,2185,1535,902,402,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,21.785714,4.000000
6,2714.000000,28000000.0,$,2014-10-17,256,114,25977365,2714,6.5,57,53,A Walk Among the Tombstones,13,95,"[{'publication': 'RogerEbert.com', 'review_dat...","[{'total_reactions': 4, 'neg_reactions': 0, 'u...",57,A Walk Among the Tombstones,1,2014-09-19,12758780,$,6.5,92251,19.0,September,2014.0,208,2014,6.6,6.5,6.6,7.1,6.5,6.5,6.5,6.8,7.1,6.8,6.5,6.5,6.6,6.5,6.6,7.1,6.5,6.4,6.7,29490.0,34653.0,8845.0,238.0,9328.0,3785.0,3604.0,1130.0,42.0,12.0,92251.0,68213.0,25446.0,30638.0,7580.0,195.0,46413.0,435.0,11518.0,835,3565,592,1158,3042,9591,26634,31444,12168,3222,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,60.323529,5.333333
7,778.090909,12500000.0,$,2014-04-25,90,100,48637684,2253,6.1,62,17,About Last Night,12,27,"[{'publication': 'Tampa Bay Times', 'review_da...","[{'total_reactions': 1, 'neg_reactions': 0, 'u...",62,About Last Night,11,2014-02-14,27838201,$,6.1,17804,14.0,February,2014.0,42,2014,6.3,5.9,5.6,7.1,6.3,6.4,6.0,6.2,6.8,5.3,6.1,6.0,6.2,5.8,5.4,7.3,5.9,5.0,6.2,7620.0,5049.0,1092.0,62.0,4567.0,2619.0,1251.0,284.0,28.0,3.0,17804.0,10224.0,4919.0,3744.0,787.0,34.0,7521.0,160.0,3056.0,545,1392,297,499,1074,2472,4809,4049,1885,782,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,64.642857,8.000000
8,22.250000,,,2015-05-29,50,92,104415,54,6.6,57,6,Adult Beginners,1,5,"[{'publication': 'The Playlist', 'review_date'...","[{'total_reactions': 1, 'neg_reactions': 0, 'u...",57,Adult Beginners,4,2015-04-24,36657,$,5.6,6010,,,,23,2014,5.7,5.5,5.4,5.8,5.8,5.9,5.8,5.8,5.8,7.0,5.6,5.5,5.7,5.4,5.3,5.9,5.4,4.8,5.9,2166.0,2287.0,475.0,17.0,1409.0,724.0,502.0,94.0,6.0,2.0,6010.0,3708.0,1403.0,1746.0,376.0,10.0,2737.0,105.0,1330.0,209,207,151,214,571,1367,1824,1040,323,104,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,59.166667,
9,6.500000,,,2014-03-07,40,97,16740,11,7.1,61,4,Adult World,3,12,"[{'publication': 'Village Voice', 'review_date...","[{'total_reactions': 0, 'neg_reactions': 0, 'u...",61,Adult World,2,2014-02-14,4368,$,6.2,13983,14.0,February,2014.0,44,2013,6.3,5.9,6.0,7.3,6.5,6.5,6.2,6.4,7.4,6.5,6.2,6.0,6.2,5.8,6.0,7.3,6.1,5.5,6.2,6289.0,3941.0,1150.0,75.0,4535.0,3160.0,876.0,210.0,54.0,2.0,13983.0,7277.0,3045.0,3004.0,917.0,19.0,6558.0,156.0,2721.0,234,853,210,389,883,2014,3842,3416,1549,593,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,59.312500,


In [16]:
# Directory that holds the dataset files (the CSV read further down).
dataset_dir = holcrawl.shared._get_dataset_dir_path()

In [17]:
# Load the movies dataset from its CSV file; note that this rebinds `df`.
df = pd.read_csv(os.path.join(dataset_dir, 'movies_dataset.csv'))

In [18]:
pd.options.display.max_columns = 999
# `.ix` was deprecated in pandas 0.20 and removed in 1.0. On the default
# integer index it resolved integer slices by label, so the label-based,
# end-inclusive `.loc[0:2]` selects the same rows 0, 1 and 2.
df.loc[0:2]

Unnamed: 0,avg_screens,budget,budget_currency,closing_date,critic_review_count,duration,gross_income,max_screens,mc_avg_user_score,mc_metascore,mc_mixed_rating_frequency,mc_movie_name,mc_negative_rating_frequency,mc_positive_rating_frequency,mc_pro_critic_reviews,mc_user_reviews,metascore,name,num_weekends,opening_weekend_date,opening_weekend_income,opening_weekend_income_currency,rating,rating_count,release_day,release_month,release_year,user_review_count,year,avg_rating_per_demo.aged_18-29,avg_rating_per_demo.aged_30-44,avg_rating_per_demo.aged_45+,avg_rating_per_demo.aged_under_18,avg_rating_per_demo.females,avg_rating_per_demo.females_aged_18-29,avg_rating_per_demo.females_aged_30-44,avg_rating_per_demo.females_aged_45+,avg_rating_per_demo.females_under_18,avg_rating_per_demo.imdb_staff,avg_rating_per_demo.imdb_users,avg_rating_per_demo.males,avg_rating_per_demo.males_aged_18-29,avg_rating_per_demo.males_aged_30-44,avg_rating_per_demo.males_aged_45+,avg_rating_per_demo.males_under_18,avg_rating_per_demo.non-us_users,avg_rating_per_demo.top_1000_voters,avg_rating_per_demo.us_users,votes_per_demo.aged_18-29,votes_per_demo.aged_30-44,votes_per_demo.aged_45+,votes_per_demo.aged_under_18,votes_per_demo.females,votes_per_demo.females_aged_18-29,votes_per_demo.females_aged_30-44,votes_per_demo.females_aged_45+,votes_per_demo.females_under_18,votes_per_demo.imdb_staff,votes_per_demo.imdb_users,votes_per_demo.males,votes_per_demo.males_aged_18-29,votes_per_demo.males_aged_30-44,votes_per_demo.males_aged_45+,votes_per_demo.males_under_18,votes_per_demo.non-us_users,votes_per_demo.top_1000_voters,votes_per_demo.us_users,rating_freq.1,rating_freq.10,rating_freq.2,rating_freq.3,rating_freq.4,rating_freq.5,rating_freq.6,rating_freq.7,rating_freq.8,rating_freq.9,genres.action,genres.adventure,genres.animation,genres.biography,genres.comedy,genres.crime,genres.documentary,genres.drama,genres.family,genres.fantasy,genres.history,genres.horror,genres.music,genres.musical,genres.m
ystery,genres.romance,genres.sci-fi,genres.sport,genres.thriller,genres.war,avg_mc_critic_by_opening,avg_mc_user_by_opening
0,45.0,4000000.0,$,2014-04-18,94,93,9134,45,6.8,44,10,13 Sins,3,23,"[{'critic': 'Christy Lemire', 'publication': '...","[{'total_reactions': 1, 'user': 'Tss5078', 'po...",44,13 Sins,1,2014-04-18,9134,$,6.3,25068,,,,69,2014,6.4,6.2,6.3,7.3,6.4,6.5,6.3,6.5,7.6,7.0,6.3,6.3,6.4,6.2,6.3,7.3,6.2,5.8,6.4,8575.0,10068.0,2256.0,55.0,3968.0,1882.0,1525.0,325.0,11.0,4.0,25068.0,17778.0,6605.0,8426.0,1889.0,44.0,12964.0,230.0,3845.0,337,1254,280,544,1255,3351,6991,6984,3093,979,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,49.7,
1,2605.375,50000000.0,$,2014-11-07,335,112,191616238,3426,8.1,71,81,22 Jump Street,42,621,"[{'critic': 'Ty Burr', 'publication': 'Boston ...","[{'total_reactions': 1, 'user': 'travellyn', '...",71,22 Jump Street,8,2014-06-13,57071445,$,7.1,274236,13.0,June,2014.0,322,2014,7.3,6.7,6.3,7.9,7.1,7.3,6.8,6.4,8.2,6.1,7.1,7.0,7.3,6.7,6.3,7.8,6.9,6.2,7.2,128688.0,65135.0,9028.0,2267.0,40031.0,25654.0,8858.0,1383.0,620.0,16.0,274236.0,180467.0,101712.0,55439.0,7490.0,1628.0,119172.0,506.0,35147.0,2883,25688,1901,3165,6626,16774,43760,81836,64989,26614,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,72.844444,7.833333
2,1891.833333,110000000.0,$,2014-05-16,367,102,106369117,3490,6.0,48,179,300: Rise of an Empire,105,277,"[{'critic': 'Richard Roeper', 'publication': '...","[{'total_reactions': 4, 'user': 'LXQUICKJUSTIC...",48,300: Rise of an Empire,12,2014-03-07,45038460,$,6.2,234420,7.0,March,2014.0,532,2014,6.3,6.1,6.1,6.7,6.3,6.3,6.2,6.2,6.8,6.3,6.2,6.2,6.3,6.1,6.1,6.7,6.1,5.7,6.2,95933.0,73988.0,11634.0,691.0,20214.0,10390.0,6941.0,1219.0,78.0,11.0,234420.0,174235.0,84680.0,66014.0,10209.0,610.0,110596.0,496.0,22633.0,10053,15924,4188,7262,13817,29217,53207,56500,31549,12703,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,49.833333,7.142857
