In [1]:
import os
import pandas as pd
import morejson as json
import holcrawl
import holcrawl.shared
from tqdm import tqdm
import numpy as np

In [None]:
# Directory that the profile-loading cell below scans for *.json profile files.
UNITED_DIR_PATH = holcrawl.shared._get_united_dir_path()
# When True, the profile-loading loop wraps its file list in a tqdm progress bar.
verbose = True

In [None]:
# Keys kept when the dict-valued columns 'avg_rating_per_demo' and
# 'votes_per_demo' are expanded into one column per key; any other key found
# in those dicts is dropped by _decompose_dict_column.
DEMOGRAPHICS = [
    'aged_under_18',
    'males_under_18',
    'males_aged_45+',
    'females',
    'males_aged_18-29',
    'imdb_staff',
    'imdb_users',
    'males',
    'aged_30-44',
    'females_aged_45+',
    'aged_18-29',
    'females_aged_18-29',
    'aged_45+',
    'males_aged_30-44',
    'top_1000_voters',
    'females_under_18',
    'females_aged_30-44',
    'us_users',
    'non-us_users'
]

In [None]:
def _decompose_dict_column(df, colname, allowed_cols):
    newdf = df[colname].apply(pd.Series)
    newdf = newdf.drop([
        col for col in newdf.columns if col not in allowed_cols], axis=1)
    newdf.columns = [colname+'.'+col for col in newdf.columns]
    return pd.concat([df.drop([colname], axis=1), newdf], axis=1)


def _dummy_list_column(df, colname):
    """One-hot encode a column whose cells are lists of values.

    Every distinct value found in any list becomes a 0/1 indicator column
    named ``'<colname>.<value>'`` and the original column is removed. Cells
    that do not support ``in`` (e.g. NaN standing in for a missing list) get
    0 in every indicator column.

    Args:
        df: DataFrame holding the list-valued column. It is left unmodified.
        colname: Label (a string) of the list-valued column.

    Returns:
        A new DataFrame in which ``colname`` is replaced by the indicator
        columns.
    """
    # Sort the distinct values so the indicator dicts are built in the same
    # order on every run; iterating a set of strings is not stable across
    # interpreter sessions.
    values = sorted(set(
        value for value_list in df[colname].dropna() for value in value_list))

    def _value_list_to_dict(value_list):
        try:
            return {
                value: 1 if value in value_list else 0
                for value in values}
        except TypeError:
            # `in` raises TypeError on a non-container cell such as NaN.
            return {value: 0 for value in values}

    # assign() returns a new frame, so the caller's DataFrame keeps its
    # original list column instead of being overwritten in place.
    encoded = df.assign(**{colname: df[colname].apply(_value_list_to_dict)})
    return _decompose_dict_column(encoded, colname, values)

In [None]:
def _list_to_sorted_str(listi):
    return '_'.join(sorted([string.replace('-', '') for string in listi]))

In [None]:
# Load every *.json profile found in UNITED_DIR_PATH into `profiles`.
# os.listdir() returns names in arbitrary order, so the names are sorted to
# make the row order of the resulting DataFrame reproducible. Non-JSON files
# are filtered out up front so the progress bar only counts files that are
# actually read.
profile_files = sorted(
    file_name for file_name in os.listdir(UNITED_DIR_PATH)
    if os.path.splitext(file_name)[1] == '.json')
if verbose:
    profile_files = tqdm(profile_files)
profiles = []
for profile_file in profile_files:
    if verbose:
        profile_files.set_description('Reading {}'.format(profile_file))
    file_path = os.path.join(UNITED_DIR_PATH, profile_file)
    with open(file_path, 'r') as json_file:
        profiles.append(json.load(json_file))

In [None]:
def _avg_review_by_opening_generator(colname):
    def _avg_review_by_opening(row):
        return np.mean([
            review['score'] for review in row[colname] 
            if review['review_date'] <= row['opening_weekend_date']
        ])
    return _avg_review_by_opening

In [None]:
# Build one row per loaded profile (assumes each profile is a JSON object,
# so its keys become the columns -- TODO confirm).
df = pd.DataFrame(profiles)

In [None]:
# Flatten the nested columns: expand the per-demographic and rating-frequency
# dicts into one column per key, then one-hot encode the genre lists.
# (Unused alternative for genres: collapse each list into a single sorted
# string with _list_to_sorted_str.)
df = (
    df
    .pipe(_decompose_dict_column, 'avg_rating_per_demo', DEMOGRAPHICS)
    .pipe(_decompose_dict_column, 'votes_per_demo', DEMOGRAPHICS)
    .pipe(_decompose_dict_column, 'rating_freq', list(map(str, range(1, 11))))
    .pipe(_dummy_list_column, 'genres')
)
# Mean review score, per review source, over reviews dated no later than the
# opening weekend.
df['avg_mc_critic_by_opening'] = df.apply(
    _avg_review_by_opening_generator('mc_pro_critic_reviews'), axis=1)
df['avg_mc_user_by_opening'] = df.apply(
    _avg_review_by_opening_generator('mc_user_reviews'), axis=1)

In [None]:
# Show the column labels of the current frame.
df.columns

In [None]:
# Raise the column display limit so wide frames are rendered without eliding
# columns, then display the frame.
pd.options.display.max_columns = 999
df

In [7]:
# Directory containing 'movies_dataset.csv', which the next cell reads.
dataset_dir = holcrawl.shared._get_dataset_dir_path()

In [11]:
# Load the previously exported dataset. This rebinds `df`, replacing the
# frame built from the JSON profiles in the cells above.
# NOTE(review): the recorded output below shows 'Unnamed: 0' style columns,
# which usually means the CSV was written with its index included; passing
# index_col=0 here (or index=False when writing) may be wanted -- confirm.
df = pd.read_csv(os.path.join(dataset_dir, 'movies_dataset.csv'))

In [12]:
# Raise the column display limit so every column of the wide frame is shown.
pd.options.display.max_columns = 999
# .ix was deprecated in pandas 0.20 and removed in 1.0. .loc is the
# label-based replacement and includes the end label, so this still shows
# rows 0, 1 and 2 of the default integer index.
df.loc[0:2]

Unnamed: 0.1,Unnamed: 0,avg_screens,budget,budget_currency,closing_date,critic_review_count,duration,gross_income,max_screens,mc_avg_user_score,mc_metascore,mc_mixed_rating_frequency,mc_movie_name,mc_negative_rating_frequency,mc_positive_rating_frequency,mc_pro_critic_reviews,mc_user_reviews,metascore,name,num_weekends,opening_weekend_date,opening_weekend_income,opening_weekend_income_currency,rating,rating_count,release_day,release_month,release_year,user_review_count,year,avg_rating_per_demo.aged_18-29,avg_rating_per_demo.aged_30-44,avg_rating_per_demo.aged_45+,avg_rating_per_demo.aged_under_18,avg_rating_per_demo.females,avg_rating_per_demo.females_aged_18-29,avg_rating_per_demo.females_aged_30-44,avg_rating_per_demo.females_aged_45+,avg_rating_per_demo.females_under_18,avg_rating_per_demo.imdb_staff,avg_rating_per_demo.imdb_users,avg_rating_per_demo.males,avg_rating_per_demo.males_aged_18-29,avg_rating_per_demo.males_aged_30-44,avg_rating_per_demo.males_aged_45+,avg_rating_per_demo.males_under_18,avg_rating_per_demo.non-us_users,avg_rating_per_demo.top_1000_voters,avg_rating_per_demo.us_users,votes_per_demo.aged_18-29,votes_per_demo.aged_30-44,votes_per_demo.aged_45+,votes_per_demo.aged_under_18,votes_per_demo.females,votes_per_demo.females_aged_18-29,votes_per_demo.females_aged_30-44,votes_per_demo.females_aged_45+,votes_per_demo.females_under_18,votes_per_demo.imdb_staff,votes_per_demo.imdb_users,votes_per_demo.males,votes_per_demo.males_aged_18-29,votes_per_demo.males_aged_30-44,votes_per_demo.males_aged_45+,votes_per_demo.males_under_18,votes_per_demo.non-us_users,votes_per_demo.top_1000_voters,votes_per_demo.us_users,rating_freq.1,rating_freq.10,rating_freq.2,rating_freq.3,rating_freq.4,rating_freq.5,rating_freq.6,rating_freq.7,rating_freq.8,rating_freq.9,genres.action,genres.adventure,genres.animation,genres.biography,genres.comedy,genres.crime,genres.documentary,genres.drama,genres.family,genres.fantasy,genres.history,genres.horror,genres.music,genres.mus
ical,genres.mystery,genres.romance,genres.sci-fi,genres.sport,genres.thriller,genres.war,avg_mc_critic_by_opening,avg_mc_user_by_opening
0,0,45.0,4000000.0,$,2014-04-18,94,93,9134,45,6.8,44,10,13 Sins,3,23,"[{'summary': 'Darkly funny and deeply twisted,...","[{'pos_reactions': 1, 'user': 'Tss5078', 'neg_...",44,13 Sins,1,2014-04-18,9134,$,6.3,25068,,,,69,2014,6.4,6.2,6.3,7.3,6.4,6.5,6.3,6.5,7.6,7.0,6.3,6.3,6.4,6.2,6.3,7.3,6.2,5.8,6.4,8575.0,10068.0,2256.0,55.0,3968.0,1882.0,1525.0,325.0,11.0,4.0,25068.0,17778.0,6605.0,8426.0,1889.0,44.0,12964.0,230.0,3845.0,337,1254,280,544,1255,3351,6991,6984,3093,979,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,49.7,
1,1,2605.375,50000000.0,$,2014-11-07,335,112,191616238,3426,8.1,71,81,22 Jump Street,42,621,[{'summary': 'A hugely enjoyable shambles. It’...,"[{'pos_reactions': 1, 'user': 'travellyn', 'ne...",71,22 Jump Street,8,2014-06-13,57071445,$,7.1,274236,13.0,June,2014.0,322,2014,7.3,6.7,6.3,7.9,7.1,7.3,6.8,6.4,8.2,6.1,7.1,7.0,7.3,6.7,6.3,7.8,6.9,6.2,7.2,128688.0,65135.0,9028.0,2267.0,40031.0,25654.0,8858.0,1383.0,620.0,16.0,274236.0,180467.0,101712.0,55439.0,7490.0,1628.0,119172.0,506.0,35147.0,2883,25688,1901,3165,6626,16774,43760,81836,64989,26614,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,72.844444,7.833333
2,2,1891.833333,110000000.0,$,2014-05-16,367,102,106369117,3490,6.0,48,179,300: Rise of an Empire,105,277,[{'summary': 'Even with the uniformly good per...,"[{'pos_reactions': 4, 'user': 'LXQUICKJUSTICE'...",48,300: Rise of an Empire,12,2014-03-07,45038460,$,6.2,234420,7.0,March,2014.0,532,2014,6.3,6.1,6.1,6.7,6.3,6.3,6.2,6.2,6.8,6.3,6.2,6.2,6.3,6.1,6.1,6.7,6.1,5.7,6.2,95933.0,73988.0,11634.0,691.0,20214.0,10390.0,6941.0,1219.0,78.0,11.0,234420.0,174235.0,84680.0,66014.0,10209.0,610.0,110596.0,496.0,22633.0,10053,15924,4188,7262,13817,29217,53207,56500,31549,12703,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,49.833333,7.142857


In [10]:
# List the columns whose label contains 'unnamed' (case-insensitive).
# NOTE(review): this cell's execution count (10) is lower than that of the
# read_csv cell (11), so the recorded output may describe an earlier `df`.
[col for col in df.columns if 'unnamed' in col.lower()]

['Unnamed: 0']