In [2]:
import pandas as pd
import numpy as np

import json
from joblib import Parallel, delayed

In [3]:
# Stream the raw training log in 10,000-row chunks instead of loading it whole.
# Only the `event_data` column is read: it is the sole column process() touches,
# so skipping the rest cuts parsing time, memory, and the amount of data that
# has to be pickled and shipped to each worker process.
train = pd.read_csv(
    'data/raw/train.csv',
    usecols=['event_data'],
    chunksize=10000,
)

In [4]:
# Names of the event_data fields tracked by process(): each one becomes a 0/1
# indicator column telling whether the field was present (non-null) in an
# event's JSON payload. Dotted names are nested keys as flattened by
# json_normalize. Entries are listed alphabetically, one initial letter per row.
little_columns = {
    "animal", "animals",
    "bird_height", "bottle.amount", "bottle.color", "bottles", "bucket",
    "buckets", "buckets_placed", "bug", "buglength",
    "castles_placed", "caterpillar", "caterpillars", "cauldron", "cloud",
    "cloud_size", "container_type", "containers", "correct", "current_containers",
    "destination", "diet", "dinosaur", "dinosaurs", "dinosaurs_placed", "distance",
    "exit_type",
    "filled", "flower", "flowers",
    "group", "growth",
    "has_water", "hat", "hats", "hats_placed", "height", "holding_shell",
    "hole_position", "holes", "house.position", "house.size", "houses",
    "item_type",
    "jar", "jar_filled",
    "launched", "layout.left.chickens", "layout.left.pig",
    "layout.right.chickens", "layout.right.pig",
    "misses", "mode", "molds", "movie_id",
    "object", "object_type", "options",
    "position", "previous_jars", "prompt",
    "resources", "rocket", "round_number", "round_prompt",
    "round_target.animal", "round_target.size", "round_target.type",
    "sand", "scale_contents", "scale_weight", "session_duration", "shell_size",
    "shells", "side", "stage_number", "stumps",
    "target_bucket", "target_containers", "target_distances", "target_size",
    "target_water_level", "target_weight", "time_played", "total_containers",
    "toy", "toy_earned", "tutorial_step",
    "version",
    "water_level", "weight",
}

In [5]:
def process(i, data, columns=None):
    """Build 0/1 presence flags for selected event_data fields of one chunk.

    Each row's ``event_data`` JSON string is parsed and flattened (nested keys
    become dotted names such as ``round_target.size``). For every tracked
    field the result holds 1 where the field is present and non-null in that
    row's payload, and 0 otherwise.

    Parameters
    ----------
    i : int
        Sequence number of the chunk; returned unchanged so the caller can
        restore chunk order after parallel execution.
    data : pandas.DataFrame
        Chunk with an ``event_data`` column of JSON strings.
    columns : iterable of str, optional
        Field names to track. Defaults to the module-level ``little_columns``.

    Returns
    -------
    tuple of (int, pandas.DataFrame)
        ``i`` and a ``uint8`` frame with one row per input row (fresh 0..n-1
        index) and one column per tracked field, in sorted name order.
    """
    # A set has no defined order, and string-set iteration order can differ
    # between worker processes (hash randomization), which left the chunks
    # with misaligned columns. Fix one deterministic order up front.
    ordered = sorted(little_columns if columns is None else columns)

    # pandas >= 1.0 exposes json_normalize at top level; fall back to the old
    # location for earlier releases.
    normalize = getattr(pd, "json_normalize", None)
    if normalize is None:
        normalize = pd.io.json.json_normalize

    # Hand over a plain list so the result always gets a positional index,
    # regardless of the chunk's own row labels.
    records = [json.loads(raw) for raw in data.event_data]
    event_data = normalize(records)

    # notna() marks fields that are present and non-null; reindex keeps only
    # the tracked fields, in the fixed order, filling never-seen ones with 0.
    flags = (
        event_data.notna()
        .reindex(columns=ordered, fill_value=False)
        .astype(np.uint8)
    )
    return i, flags

In [6]:
# Fan the chunks out to all CPU cores. A generator (rather than a list) is
# passed so that joblib pulls chunks from the CSV reader as it dispatches work,
# instead of parsing the entire file into memory before any task starts.
# Each result is an (i, frame) pair as returned by process().
compiled_data = Parallel(n_jobs=-1)(
    delayed(process)(i, data) for i, data in enumerate(train)
)

In [7]:
# Put the (chunk number, frame) pairs back into chunk order, then keep only
# the frames.
compiled_data = [
    frame for _, frame in sorted(compiled_data, key=lambda pair: pair[0])
]

In [8]:
# Stack the per-chunk frames into one table. sort=True pins the alphabetical
# column order seen in the recorded output, independent of the pandas
# version's default, and silences the FutureWarning about unaligned columns.
compiled_data = pd.concat(compiled_data, sort=True)

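FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.

To retain the current behavior and silence the warning, pass 'sort=True'.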


  """Entry point for launching an IPython kernel.


In [11]:
# Every per-chunk frame was built with its own default 0..n-1 index, so the
# concatenated table has repeated row labels. Replace them with one continuous
# 0..N-1 index; drop=True discards the old labels instead of keeping them as a column.
compiled_data = compiled_data.reset_index(drop=True)

In [13]:
# Spot-check the last five rows of the assembled flag table.
compiled_data.tail(n=5)

Unnamed: 0,animal,animals,bird_height,bottle.amount,bottle.color,bottles,bucket,buckets,buckets_placed,bug,...,target_water_level,target_weight,time_played,total_containers,toy,toy_earned,tutorial_step,version,water_level,weight
11341037,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11341038,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11341039,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11341040,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11341041,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
# Drop the name bound to the CSV chunk reader; it was fully consumed when its
# chunks were enumerated for process(), so nothing further can be read from it.
del train