In [8]:
import os
import subprocess

import numpy as np
import pandas as pd
#import matplotlib.pyplot as plt
#import seaborn as sns
from scipy.stats import mode
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

In [9]:
def _log(str):
    os.system(f'echo \"{str}\"')
    print(str)

In [10]:
# Placeholder substituted for missing values before columns are cast to str.
NAN = '__NAN__'
# Directory that holds the competition CSV files.
INPUT_ROOT = '../input/data-science-bowl-2019'
# Columns used to join the label table onto the event rows.
JOIN_KEY = ['installation_id', 'game_session', 'title']
# Name of the label column.
TARGET = 'accuracy_group'

# Columns read from the train/test event files.
FEATURES = {
    'event_id',
    'game_session',
    'timestamp',
    'installation_id',
    'event_count',
    'event_code',
    'game_time',
    'title',
    'type',
    'world',
}
# Columns that get cast to str: every loaded column except the two numeric ones.
CATEGORICAL_FEATURES = FEATURES - {'event_count', 'game_time'}

In [11]:
def _init():
    # Characters such as empty strings '' or numpy.inf are considered NA values
    pd.set_option('use_inf_as_na', True)
    pd.set_option('display.max_columns', 500)
    pd.set_option('display.max_rows', 500)
    
    
_init()

In [12]:
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# Log the path of every file found under INPUT_ROOT, descending into sub-directories.
for dirname, _, filenames in os.walk(INPUT_ROOT):
    for filename in filenames:
        _log(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

../input/data-science-bowl-2019\test.csv
../input/data-science-bowl-2019\test.csv.zip
../input/data-science-bowl-2019\train.csv
../input/data-science-bowl-2019\train.csv.zip
../input/data-science-bowl-2019\train_labels.csv
../input/data-science-bowl-2019\train_labels.csv.zip


In [13]:
%%time
# Read only the columns named in FEATURES from the event files, and only the
# join key plus the target from the label file.
train_raw = pd.read_csv(f'{INPUT_ROOT}/train.csv', usecols=FEATURES)
train_labels = pd.read_csv(f'{INPUT_ROOT}/train_labels.csv', usecols=JOIN_KEY + [TARGET])
test_raw = pd.read_csv(f'{INPUT_ROOT}/test.csv', usecols=FEATURES)
# Summarise dtypes and memory use of the training events.
train_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11341042 entries, 0 to 11341041
Data columns (total 10 columns):
event_id           object
game_session       object
timestamp          object
installation_id    object
event_count        int64
event_code         int64
game_time          int64
title              object
type               object
world              object
dtypes: int64(3), object(7)
memory usage: 865.3+ MB
Wall time: 33.8 s


In [14]:
# Summarise the columns, dtypes and non-null counts of the label table.
train_labels.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17690 entries, 0 to 17689
Data columns (total 4 columns):
game_session       17690 non-null object
installation_id    17690 non-null object
title              17690 non-null object
accuracy_group     17690 non-null int64
dtypes: int64(1), object(3)
memory usage: 552.9+ KB


In [15]:
# Summarise the columns, dtypes and non-null counts of the test events.
test_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1156414 entries, 0 to 1156413
Data columns (total 10 columns):
event_id           1156414 non-null object
game_session       1156414 non-null object
timestamp          1156414 non-null object
installation_id    1156414 non-null object
event_count        1156414 non-null int64
event_code         1156414 non-null int64
game_time          1156414 non-null int64
title              1156414 non-null object
type               1156414 non-null object
world              1156414 non-null object
dtypes: int64(3), object(7)
memory usage: 88.2+ MB


In [16]:
# Log the distinct values of the `type` column in the training events.
vs = sorted(train_raw['type'].unique())
_log(f'{len(vs)} train_raw type={vs}')

4 train_raw type=['Activity', 'Assessment', 'Clip', 'Game']


In [17]:
# Log the distinct values of the `world` column in the training events.
# The message names the column that is actually inspected.
vs = sorted(train_raw['world'].unique())
_log(f'{len(vs)} train_raw world={vs}')

4 train_raw type=['CRYSTALCAVES', 'MAGMAPEAK', 'NONE', 'TREETOPCITY']


In [18]:
# Log the distinct values of the `event_code` column in the training events.
# The message names the column that is actually inspected.
vs = sorted(train_raw['event_code'].unique())
_log(f'{len(vs)} train_raw event_code={vs}')

42 train_raw type=[2000, 2010, 2020, 2025, 2030, 2035, 2040, 2050, 2060, 2070, 2075, 2080, 2081, 2083, 3010, 3020, 3021, 3110, 3120, 3121, 4010, 4020, 4021, 4022, 4025, 4030, 4031, 4035, 4040, 4045, 4050, 4070, 4080, 4090, 4095, 4100, 4110, 4220, 4230, 4235, 5000, 5010]


In [19]:
# Log the distinct titles that occur in the training events.
vs = sorted(train_raw['title'].unique())
_log(f'{len(vs)} train_raw titles={vs}')

44 train_raw titles=['12 Monkeys', 'Air Show', 'All Star Sorting', 'Balancing Act', 'Bird Measurer (Assessment)', 'Bottle Filler (Activity)', 'Bubble Bath', 'Bug Measurer (Activity)', 'Cart Balancer (Assessment)', 'Cauldron Filler (Assessment)', 'Chest Sorter (Assessment)', 'Chicken Balancer (Activity)', 'Chow Time', 'Costume Box', 'Crystal Caves - Level 1', 'Crystal Caves - Level 2', 'Crystal Caves - Level 3', 'Crystals Rule', 'Dino Dive', 'Dino Drink', 'Egg Dropper (Activity)', 'Fireworks (Activity)', 'Flower Waterer (Activity)', 'Happy Camel', 'Heavy, Heavier, Heaviest', 'Honey Cake', 'Leaf Leader', 'Lifting Heavy Things', 'Magma Peak - Level 1', 'Magma Peak - Level 2', 'Mushroom Sorter (Assessment)', 'Ordering Spheres', 'Pan Balance', "Pirate's Tale", 'Rulers', 'Sandcastle Builder (Activity)', 'Scrub-A-Dub', 'Slop Problem', 'Treasure Map', 'Tree Top City - Level 1', 'Tree Top City - Level 2', 'Tree Top City - Level 3', 'Watering Hole (Activity)', 'Welcome to Lost Lagoon!']


In [20]:
# Log the distinct titles that occur in the test events.
vs = sorted(test_raw['title'].unique())
_log(f'{len(vs)} test titles={vs}')

44 test titles=['12 Monkeys', 'Air Show', 'All Star Sorting', 'Balancing Act', 'Bird Measurer (Assessment)', 'Bottle Filler (Activity)', 'Bubble Bath', 'Bug Measurer (Activity)', 'Cart Balancer (Assessment)', 'Cauldron Filler (Assessment)', 'Chest Sorter (Assessment)', 'Chicken Balancer (Activity)', 'Chow Time', 'Costume Box', 'Crystal Caves - Level 1', 'Crystal Caves - Level 2', 'Crystal Caves - Level 3', 'Crystals Rule', 'Dino Dive', 'Dino Drink', 'Egg Dropper (Activity)', 'Fireworks (Activity)', 'Flower Waterer (Activity)', 'Happy Camel', 'Heavy, Heavier, Heaviest', 'Honey Cake', 'Leaf Leader', 'Lifting Heavy Things', 'Magma Peak - Level 1', 'Magma Peak - Level 2', 'Mushroom Sorter (Assessment)', 'Ordering Spheres', 'Pan Balance', "Pirate's Tale", 'Rulers', 'Sandcastle Builder (Activity)', 'Scrub-A-Dub', 'Slop Problem', 'Treasure Map', 'Tree Top City - Level 1', 'Tree Top City - Level 2', 'Tree Top City - Level 3', 'Watering Hole (Activity)', 'Welcome to Lost Lagoon!']


In [21]:
# Log the distinct titles that occur in the label table.
vs = sorted(train_labels['title'].unique())
_log(f'{len(vs)} train_labels titles={vs}')

5 train_labels titles=['Bird Measurer (Assessment)', 'Cart Balancer (Assessment)', 'Cauldron Filler (Assessment)', 'Chest Sorter (Assessment)', 'Mushroom Sorter (Assessment)']


# All event ids in test set also exist in train set

In [44]:
# Count event_ids that occur in the test events but never in the training events.
test_set = set(test_raw['event_id'])
train_set = set(train_raw['event_id'])
vs = test_set - train_set
_log(f'{len(vs)} event_ids exist in test set but not train set.')

0 event_ids exist in test set but not train set.


In [22]:
def _remove_unlabelled_data(train_raw, train_labels):
    return train_raw[train_raw['installation_id'].isin(train_labels['installation_id'].unique())]


# Rebinds train_raw to the filtered frame; the unfiltered events are only
# recoverable afterwards by re-running the load cell.
train_raw = _remove_unlabelled_data(train_raw, train_labels)

In [23]:
%%time
def _add_labels(train_raw, train_labels, on):
    return pd.merge(train_raw, train_labels, on=on, how='left')


# Rebinds train_raw to the merged frame, which now carries the TARGET column.
# NOTE(review): re-running this cell merges the labels a second time onto a
# frame that already has TARGET — confirm it is executed once per kernel session.
train_raw = _add_labels(train_raw, train_labels, on=JOIN_KEY)

#_log(f'train[\'{TARGET}\'] count null={train[TARGET].isna().sum()}')

Wall time: 4.02 s


In [24]:
def _transform_timestamp(df):
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    return df


# Parse the timestamp strings of both event tables into datetimes.
train_raw = _transform_timestamp(train_raw)
test_raw = _transform_timestamp(test_raw)

In [25]:
def _drop_unused_columns(df):
    cols = ['game_session', 'type']
    return df.drop(columns=cols)

    
#train = _drop_unused_columns(train)
#test = _drop_unused_columns(test)

In [26]:
%%time
def _set_string_type(df, cols):
    df[cols] = df[cols].fillna(NAN).astype(str)
    return df


# Cast every categorical column (plus the target, for train) to str.
# NOTE(review): 'timestamp' is listed in CATEGORICAL_FEATURES, so the datetimes
# produced by _transform_timestamp are turned back into strings here — confirm
# that this is intended.
cols = list(CATEGORICAL_FEATURES)
train_raw = _set_string_type(train_raw, cols=cols + [TARGET])
test_raw = _set_string_type(test_raw, cols=cols)

Wall time: 1min 35s


In [27]:
def _sort_it(df):
    return df.sort_values(by=['installation_id', 'timestamp'])


# Order both event tables by installation and then by timestamp.
train_raw = _sort_it(train_raw)
test_raw = _sort_it(test_raw)

# Multiple accuracy groups per installation id
In the train set, there are multiple accuracy groups per installation id. The task is to predict the accuracy group of the **last** assessment for a given installation id.

In [28]:
# Number of distinct accuracy groups per installation_id, ignoring rows whose
# target is the NAN placeholder.
vs = train_raw[train_raw[TARGET] != NAN].groupby('installation_id')[TARGET].nunique()
vs

installation_id
0006a69f    3
0006c192    3
00129856    1
001d0ed0    3
00225f67    1
           ..
ff9305d7    2
ff9715db    4
ffc90c32    2
ffd2871d    1
ffeb0b1b    2
Name: accuracy_group, Length: 3614, dtype: int64

In [29]:
# Re-check dtypes and memory use of the training events after the string cast.
train_raw.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7734558 entries, 0 to 7734557
Data columns (total 11 columns):
event_id           object
game_session       object
timestamp          object
installation_id    object
event_count        int64
event_code         object
game_time          int64
title              object
type               object
world              object
accuracy_group     object
dtypes: int64(2), object(9)
memory usage: 708.1+ MB


In [30]:
def _log_smoothing(df, cols):
    for col in cols:
        df[col] = np.log(df[col] + 1)
    return df


#cols = ['event_count', 'game_time']
#train = _log_smoothing(train, cols)
#test = _log_smoothing(test, cols)

In [31]:
# Show the first 40 training events.
train_raw.head(40)

Unnamed: 0,event_id,game_session,timestamp,installation_id,event_count,event_code,game_time,title,type,world,accuracy_group
0,27253bdc,34ba1a28d02ba8ba,2019-08-06 04:57:18.904000+00:00,0006a69f,1,2000,0,Welcome to Lost Lagoon!,Clip,NONE,__NAN__
1,27253bdc,4b57c9a59474a1b9,2019-08-06 04:57:45.301000+00:00,0006a69f,1,2000,0,Magma Peak - Level 1,Clip,MAGMAPEAK,__NAN__
2,77261ab5,2b9d5af79bcdb79f,2019-08-06 04:58:14.538000+00:00,0006a69f,1,2000,0,Sandcastle Builder (Activity),Activity,MAGMAPEAK,__NAN__
3,b2dba42b,2b9d5af79bcdb79f,2019-08-06 04:58:14.615000+00:00,0006a69f,2,3010,29,Sandcastle Builder (Activity),Activity,MAGMAPEAK,__NAN__
4,1325467d,2b9d5af79bcdb79f,2019-08-06 04:58:16.680000+00:00,0006a69f,3,4070,2137,Sandcastle Builder (Activity),Activity,MAGMAPEAK,__NAN__
5,1325467d,2b9d5af79bcdb79f,2019-08-06 04:58:18.474000+00:00,0006a69f,4,4070,3937,Sandcastle Builder (Activity),Activity,MAGMAPEAK,__NAN__
6,1325467d,2b9d5af79bcdb79f,2019-08-06 04:58:19.365000+00:00,0006a69f,5,4070,4820,Sandcastle Builder (Activity),Activity,MAGMAPEAK,__NAN__
7,1bb5fbdb,2b9d5af79bcdb79f,2019-08-06 04:58:21.490000+00:00,0006a69f,6,3110,6954,Sandcastle Builder (Activity),Activity,MAGMAPEAK,__NAN__
8,1325467d,2b9d5af79bcdb79f,2019-08-06 04:58:22.732000+00:00,0006a69f,7,4070,8187,Sandcastle Builder (Activity),Activity,MAGMAPEAK,__NAN__
9,5e812b27,2b9d5af79bcdb79f,2019-08-06 04:58:23.295000+00:00,0006a69f,8,4030,8745,Sandcastle Builder (Activity),Activity,MAGMAPEAK,__NAN__


In [32]:
# Summary statistics for every column of the training events.
train_raw.describe(include='all')

Unnamed: 0,event_id,game_session,timestamp,installation_id,event_count,event_code,game_time,title,type,world,accuracy_group
count,7734558,7734558,7734558,7734558,7734558.0,7734558.0,7734558.0,7734558,7734558,7734558,7734558
unique,379,175467,7347901,3614,,42.0,,44,4,4,5
top,bb3e370b,bb1f09ec062b6660,2019-09-03 17:03:39.779000+00:00,f1c21eda,,4070.0,,Bottle Filler (Activity),Game,MAGMAPEAK,__NAN__
freq,168955,3182,10,58988,,1521039.0,,661294,3834750,3208242,6869111
mean,,,,,97.75152,,167704.6,,,,
std,,,,,131.1293,,1068495.0,,,,
min,,,,,1.0,,0.0,,,,
25%,,,,,24.0,,29271.0,,,,
50%,,,,,57.0,,69412.0,,,,
75%,,,,,121.0,,147911.0,,,,


In [33]:
# Show the first few test events.
test_raw.head()

Unnamed: 0,event_id,game_session,timestamp,installation_id,event_count,event_code,game_time,title,type,world
0,27253bdc,0ea9ecc81a565215,2019-09-10 16:50:24.910000+00:00,00abaee7,1,2000,0,Welcome to Lost Lagoon!,Clip,NONE
1,27253bdc,c1ea43d8b8261d27,2019-09-10 16:50:55.503000+00:00,00abaee7,1,2000,0,Magma Peak - Level 1,Clip,MAGMAPEAK
2,27253bdc,7ed86c6b72e725e2,2019-09-10 16:51:51.805000+00:00,00abaee7,1,2000,0,Magma Peak - Level 2,Clip,MAGMAPEAK
3,27253bdc,7e516ace50e7fe67,2019-09-10 16:53:12.825000+00:00,00abaee7,1,2000,0,Crystal Caves - Level 1,Clip,CRYSTALCAVES
4,7d093bf9,a022c3f60ba547e7,2019-09-10 16:54:12.115000+00:00,00abaee7,1,2000,0,Chow Time,Game,CRYSTALCAVES


In [34]:
# Summary statistics for every column of the test events.
test_raw.describe(include='all')

Unnamed: 0,event_id,game_session,timestamp,installation_id,event_count,event_code,game_time,title,type,world
count,1156414,1156414,1156414,1156414,1156414.0,1156414.0,1156414.0,1156414,1156414,1156414
unique,365,28445,1103166,1000,,42.0,,44,4,4
top,bb3e370b,b246933e8ac6a4aa,2019-09-13 18:38:38.848000+00:00,7b728c89,,4070.0,,Bottle Filler (Activity),Game,MAGMAPEAK
freq,28667,4456,9,21237,,234260.0,,112223,572260,511291
mean,,,,,110.7075,,186964.6,,,
std,,,,,219.3408,,1670162.0,,,
min,,,,,1.0,,0.0,,,
25%,,,,,25.0,,30561.25,,,
50%,,,,,60.0,,71028.0,,,
75%,,,,,124.0,,147691.0,,,


# Train/test split is not time-based
The train and test events cover almost the same date range, so the split was not made by time.

In [35]:
# Log the earliest and latest timestamp found in each event table.
_log(f'train[timestamp] is from {train_raw.timestamp.min()} to {train_raw.timestamp.max()}')
_log(f'test[timestamp] is from {test_raw.timestamp.min()} to {test_raw.timestamp.max()}')

train[timestamp] is from 2019-07-23 14:38:25.256000+00:00 to 2019-10-14 21:57:26.930000+00:00
test[timestamp] is from 2019-07-24 00:04:25.361000+00:00 to 2019-10-14 21:00:34.858000+00:00


# Feature Engineering

In [51]:
def _cutoff(df, TARGET):
    return df[df[TARGET] != NAN]['timestamp'].max()


def _trim_events_after_last_assessment(df, cutoff):
    res = df[df['timestamp'] <= cutoff]
    #_log(f'cutoff: before={df.shape}, after={res.shape}')
    return res
    
    
def _target_variable(df, cutoff, TARGET):
    vs = df[df['timestamp'] == cutoff][TARGET].values
    assert len(set(vs)) == 1
    return str(int(float(vs[0])))
    
    
def _game_time_median(df):
    return df.groupby(['game_session'])['game_time'].transform('max').median()
    
    
def _event_count_median(df):
    return df.groupby(['game_session'])['event_count'].transform('max').median()


def _features(df, installation_id):
    res = {}
    iid = df[df['installation_id'] == installation_id]
    if TARGET in df.columns:
        cutoff = _cutoff(iid, TARGET)
        iid = _trim_events_after_last_assessment(iid, cutoff)
        res[TARGET] = _target_variable(iid, cutoff, TARGET)
    res['installation_id'] = [installation_id]
    res['type_activity'] = np.int32([sum(iid['type'] == 'Activity')])
    res['type_assessment'] = np.int32([sum(iid['type'] == 'Assessment')])
    res['type_clip'] = np.int32([sum(iid['type'] == 'Clip')])
    res['type_game'] = np.int32([sum(iid['type'] == 'Game')])
    assert len(iid) == res['type_activity'][0] + res['type_assessment'][0] + res['type_clip'][0] + res['type_game'][0]
    res['world_crystalcaves'] = np.int32([sum(iid['world'] == 'CRYSTALCAVES')])
    res['world_magmapeak'] = np.int32([sum(iid['world'] == 'MAGMAPEAK')])
    res['world_treetopcity'] = np.int32([sum(iid['world'] == 'TREETOPCITY')])
    res['world_none'] = np.int32([sum(iid['world'] == 'NONE')])
    res['title_12_monkeys'] = np.int32([sum(iid['title'] == '12 Monkeys')])
    res['title_air_show'] = np.int32([sum(iid['title'] == 'Air Show')])
    res['title_all_star_sorting'] = np.int32([sum(iid['title'] == 'All Star Sorting')])
    res['title_balancing_act'] = np.int32([sum(iid['title'] == 'Balancing Act')])
    res['title_bird_measurer'] = np.int32([sum(iid['title'] == 'Bird Measurer (Assessment)')])
    res['title_bottle_filler'] = np.int32([sum(iid['title'] == 'Bottle Filler (Activity)')])
    res['title_bubble_bath'] = np.int32([sum(iid['title'] == 'Bubble Bath')])
    res['title_bug_measurer'] = np.int32([sum(iid['title'] == 'Bug Measurer (Activity)')])
    res['title_cart_balancer'] = np.int32([sum(iid['title'] == 'Cart Balancer (Assessment)')])
    res['title_cauldron_filler'] = np.int32([sum(iid['title'] == 'Cauldron Filler (Assessment)')])
    res['title_chest_sorter'] = np.int32([sum(iid['title'] == 'Chest Sorter (Assessment)')])
    res['title_chicken_balancer'] = np.int32([sum(iid['title'] == 'Chicken Balancer (Activity)')])
    res['title_chow_time'] = np.int32([sum(iid['title'] == 'Chow Time')])
    res['title_costume_box'] = np.int32([sum(iid['title'] == 'Costume Box')])
    res['title_crystal_caves_1'] = np.int32([sum(iid['title'] == 'Crystal Caves - Level 1')])
    res['title_crystal_caves_2'] = np.int32([sum(iid['title'] == 'Crystal Caves - Level 2')])
    res['title_crystal_caves_3'] = np.int32([sum(iid['title'] == 'Crystal Caves - Level 3')])
    res['title_crystals_rule'] = np.int32([sum(iid['title'] == 'Crystals Rule')])
    res['title_dino_dive'] = np.int32([sum(iid['title'] == 'Dino Dive')])
    res['title_dino_drink'] = np.int32([sum(iid['title'] == 'Dino Drink')])
    res['title_egg_dropper'] = np.int32([sum(iid['title'] == 'Egg Dropper (Activity)')])
    res['title_fireworks'] = np.int32([sum(iid['title'] == 'Fireworks (Activity)')])
    res['title_flower_waterer'] = np.int32([sum(iid['title'] == 'Flower Waterer (Activity)')])
    res['title_happy_camel'] = np.int32([sum(iid['title'] == 'Happy Camel')])
    res['title_heavy'] = np.int32([sum(iid['title'] == 'Heavy, Heavier, Heaviest')])
    res['title_honey_cake'] = np.int32([sum(iid['title'] == 'Honey Cake')])
    res['title_leaf_leader'] = np.int32([sum(iid['title'] == 'Leaf Leader')])
    res['title_lifting'] = np.int32([sum(iid['title'] == 'Lifting Heavy Things')])
    res['title_magma_peak_1'] = np.int32([sum(iid['title'] == 'Magma Peak - Level 1')])
    res['title_magma_peak_2'] = np.int32([sum(iid['title'] == 'Magma Peak - Level 2')])
    res['title_mushroom_sorter'] = np.int32([sum(iid['title'] == 'Mushroom Sorter (Assessment)')])
    res['title_ordering_spheres'] = np.int32([sum(iid['title'] == 'Ordering Spheres')])
    res['title_pan_balance'] = np.int32([sum(iid['title'] == 'Pan Balance')])
    res['title_pirate_tale'] = np.int32([sum(iid['title'] == "Pirate's Tale")])
    res['title_rulers'] = np.int32([sum(iid['title'] == 'Rulers')])
    res['title_sandcastle'] = np.int32([sum(iid['title'] == 'Sandcastle Builder (Activity)')])
    res['title_scrub'] = np.int32([sum(iid['title'] == 'Scrub-A-Dub')])
    res['title_slop'] = np.int32([sum(iid['title'] == 'Slop Problem')])
    res['title_treasure_map'] = np.int32([sum(iid['title'] == 'Treasure Map')])
    res['title_treetop_city_1'] = np.int32([sum(iid['title'] == 'Tree Top City - Level 1')])
    res['title_treetop_city_2'] = np.int32([sum(iid['title'] == 'Tree Top City - Level 2')])
    res['title_treetop_city_3'] = np.int32([sum(iid['title'] == 'Tree Top City - Level 3')])
    res['title_watering_hole'] = np.int32([sum(iid['title'] == 'Watering Hole (Activity)')])
    res['title_welcome'] = np.int32([sum(iid['title'] == 'Welcome to Lost Lagoon!')])
    res['event_2000'] = np.int32([sum(iid['event_code'] == '2000')])
    res['event_2010'] = np.int32([sum(iid['event_code'] == '2010')])
    res['event_2020'] = np.int32([sum(iid['event_code'] == '2020')])
    res['event_2025'] = np.int32([sum(iid['event_code'] == '2025')])
    res['event_2030'] = np.int32([sum(iid['event_code'] == '2030')])
    res['event_2035'] = np.int32([sum(iid['event_code'] == '2035')])
    res['event_2040'] = np.int32([sum(iid['event_code'] == '2040')])
    res['event_2050'] = np.int32([sum(iid['event_code'] == '2050')])
    res['event_2060'] = np.int32([sum(iid['event_code'] == '2060')])
    res['event_2070'] = np.int32([sum(iid['event_code'] == '2070')])
    res['event_2075'] = np.int32([sum(iid['event_code'] == '2075')])
    res['event_2080'] = np.int32([sum(iid['event_code'] == '2080')])
    res['event_2081'] = np.int32([sum(iid['event_code'] == '2081')])
    res['event_2083'] = np.int32([sum(iid['event_code'] == '2083')])
    res['event_3010'] = np.int32([sum(iid['event_code'] == '3010')])
    res['event_3020'] = np.int32([sum(iid['event_code'] == '3020')])
    res['event_3021'] = np.int32([sum(iid['event_code'] == '3021')])
    res['event_3110'] = np.int32([sum(iid['event_code'] == '3110')])
    res['event_3120'] = np.int32([sum(iid['event_code'] == '3120')])
    res['event_3121'] = np.int32([sum(iid['event_code'] == '3121')])
    res['event_4010'] = np.int32([sum(iid['event_code'] == '4010')])
    res['event_4020'] = np.int32([sum(iid['event_code'] == '4020')])
    res['event_4021'] = np.int32([sum(iid['event_code'] == '4021')])
    res['event_4022'] = np.int32([sum(iid['event_code'] == '4022')])
    res['event_4025'] = np.int32([sum(iid['event_code'] == '4025')])
    res['event_4030'] = np.int32([sum(iid['event_code'] == '4030')])
    res['event_4031'] = np.int32([sum(iid['event_code'] == '4031')])
    res['event_4035'] = np.int32([sum(iid['event_code'] == '4035')])
    res['event_4040'] = np.int32([sum(iid['event_code'] == '4040')])
    res['event_4045'] = np.int32([sum(iid['event_code'] == '4045')])
    res['event_4050'] = np.int32([sum(iid['event_code'] == '4050')])
    res['event_4070'] = np.int32([sum(iid['event_code'] == '4070')])
    res['event_4080'] = np.int32([sum(iid['event_code'] == '4080')])
    res['event_4090'] = np.int32([sum(iid['event_code'] == '4090')])
    res['event_4095'] = np.int32([sum(iid['event_code'] == '4095')])
    res['event_4100'] = np.int32([sum(iid['event_code'] == '4100')])
    res['event_4110'] = np.int32([sum(iid['event_code'] == '4110')])
    res['event_4220'] = np.int32([sum(iid['event_code'] == '4220')])
    res['event_4230'] = np.int32([sum(iid['event_code'] == '4230')])
    res['event_4235'] = np.int32([sum(iid['event_code'] == '4235')])
    res['event_5000'] = np.int32([sum(iid['event_code'] == '5000')])
    res['event_5010'] = np.int32([sum(iid['event_code'] == '5010')])
    res['game_time_p50'] = np.int32(_game_time_median(iid))
    res['event_count_p50'] = np.int32(_event_count_median(iid))
    return pd.DataFrame.from_dict(res)


def _preprocess(raw):
    res = pd.DataFrame()
    iids = raw['installation_id'].unique()[:10]
    for iid in tqdm(iids):
        res = pd.concat([res, _features(raw, iid)])
    return res


# Build the per-installation training features and summarise the result.
# NOTE(review): _preprocess only processes the first 10 installation ids, so
# this frame covers a small sample of train_raw — confirm that is intended.
train = _preprocess(train_raw)
train.info()

100%|██████████████████████████████████████████████████| 10/10 [00:05<00:00,  1.93it/s]

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10 entries, 0 to 0
Data columns (total 98 columns):
accuracy_group            10 non-null object
installation_id           10 non-null object
type_activity             10 non-null int32
type_assessment           10 non-null int32
type_clip                 10 non-null int32
type_game                 10 non-null int32
world_crystalcaves        10 non-null int32
world_magmapeak           10 non-null int32
world_treetopcity         10 non-null int32
world_none                10 non-null int32
title_12_monkeys          10 non-null int32
title_air_show            10 non-null int32
title_all_star_sorting    10 non-null int32
title_balancing_act       10 non-null int32
title_bird_measurer       10 non-null int32
title_bottle_filler       10 non-null int32
title_bubble_bath         10 non-null int32
title_bug_measurer        10 non-null int32
title_cart_balancer       10 non-null int32
title_cauldron_filler     10 non-null int32
title_chest_sort




In [52]:
# Show the first few rows of the training feature table.
train.head()

Unnamed: 0,accuracy_group,installation_id,type_activity,type_assessment,type_clip,type_game,world_crystalcaves,world_magmapeak,world_treetopcity,world_none,title_12_monkeys,title_air_show,title_all_star_sorting,title_balancing_act,title_bird_measurer,title_bottle_filler,title_bubble_bath,title_bug_measurer,title_cart_balancer,title_cauldron_filler,title_chest_sorter,title_chicken_balancer,title_chow_time,title_costume_box,title_crystal_caves_1,title_crystal_caves_2,title_crystal_caves_3,title_crystals_rule,title_dino_dive,title_dino_drink,title_egg_dropper,title_fireworks,title_flower_waterer,title_happy_camel,title_heavy,title_honey_cake,title_leaf_leader,title_lifting,title_magma_peak_1,title_magma_peak_2,title_mushroom_sorter,title_ordering_spheres,title_pan_balance,title_pirate_tale,title_rulers,title_sandcastle,title_scrub,title_slop,title_treasure_map,title_treetop_city_1,title_treetop_city_2,title_treetop_city_3,title_watering_hole,title_welcome,event_2000,event_2010,event_2020,event_2025,event_2030,event_2035,event_2040,event_2050,event_2060,event_2070,event_2075,event_2080,event_2081,event_2083,event_3010,event_3020,event_3021,event_3110,event_3120,event_3121,event_4010,event_4020,event_4021,event_4022,event_4025,event_4030,event_4031,event_4035,event_4040,event_4045,event_4050,event_4070,event_4080,event_4090,event_4095,event_4100,event_4110,event_4220,event_4230,event_4235,event_5000,event_5010,game_time_p50,event_count_p50
0,3,0006a69f,1218,261,28,1111,0,728,1887,3,2,295,203,0,119,110,115,319,0,0,0,0,0,3,0,0,0,212,0,91,0,299,278,0,0,0,0,0,2,1,142,2,0,2,2,161,195,2,3,2,2,2,51,3,57,4,66,10,55,6,10,9,3,2,1,8,2,5,467,30,54,461,30,54,12,280,29,45,108,337,6,15,10,2,0,389,0,4,1,13,14,9,0,0,5,5,104816,115
0,2,0006c192,1206,343,30,643,206,1343,669,4,1,0,0,2,200,250,145,134,0,27,0,111,82,0,3,2,1,6,185,0,0,0,201,0,0,0,0,1,2,3,116,1,4,1,3,467,221,0,1,3,1,1,43,4,48,0,52,2,45,2,6,5,1,1,0,4,1,3,251,29,22,245,29,20,7,197,65,37,34,340,0,49,20,1,0,676,0,4,1,6,2,5,6,6,0,0,162878,200
0,3,00129856,0,43,0,0,0,0,43,0,0,0,0,0,40,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,2,1,2,1,2,0,0,0,0,0,0,0,0,0,6,0,1,5,0,1,0,3,0,0,3,7,0,0,0,0,0,7,0,0,0,1,1,0,0,0,0,0,39701,40
0,0,001d0ed0,12,202,19,176,247,0,161,1,0,0,0,1,51,0,0,0,21,0,29,0,67,0,1,1,1,0,0,0,12,0,0,45,1,2,0,2,0,0,101,2,64,1,2,0,0,0,1,1,1,1,0,1,29,3,11,2,5,2,0,0,0,0,0,1,1,0,29,22,11,29,21,10,3,28,0,0,26,65,0,12,6,0,0,79,0,0,0,9,5,0,0,0,0,0,52719,51
0,0,00225f67,211,65,10,355,0,0,640,1,1,37,122,0,37,0,0,3,0,0,0,0,0,1,0,0,0,196,0,0,0,86,122,0,0,0,0,0,0,0,28,1,0,1,1,0,0,0,1,1,1,1,0,1,18,0,11,3,9,0,0,0,1,1,0,0,0,0,82,10,10,82,9,8,3,59,0,15,21,80,0,6,0,0,2,208,0,0,1,0,2,0,0,0,0,0,130717,122


In [53]:
# Build the per-installation test features; test_raw has no target column, so
# no cutoff or label is derived for it.
# NOTE(review): _preprocess only processes the first 10 installation ids —
# confirm that is intended.
test = _preprocess(test_raw)
test.info()

100%|██████████████████████████████████████████████████| 10/10 [00:01<00:00,  8.55it/s]

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10 entries, 0 to 0
Data columns (total 97 columns):
installation_id           10 non-null object
type_activity             10 non-null int32
type_assessment           10 non-null int32
type_clip                 10 non-null int32
type_game                 10 non-null int32
world_crystalcaves        10 non-null int32
world_magmapeak           10 non-null int32
world_treetopcity         10 non-null int32
world_none                10 non-null int32
title_12_monkeys          10 non-null int32
title_air_show            10 non-null int32
title_all_star_sorting    10 non-null int32
title_balancing_act       10 non-null int32
title_bird_measurer       10 non-null int32
title_bottle_filler       10 non-null int32
title_bubble_bath         10 non-null int32
title_bug_measurer        10 non-null int32
title_cart_balancer       10 non-null int32
title_cauldron_filler     10 non-null int32
title_chest_sorter        10 non-null int32
title_chicken_bal




In [54]:
# Show the first few rows of the test feature table.
test.head()

Unnamed: 0,installation_id,type_activity,type_assessment,type_clip,type_game,world_crystalcaves,world_magmapeak,world_treetopcity,world_none,title_12_monkeys,title_air_show,title_all_star_sorting,title_balancing_act,title_bird_measurer,title_bottle_filler,title_bubble_bath,title_bug_measurer,title_cart_balancer,title_cauldron_filler,title_chest_sorter,title_chicken_balancer,title_chow_time,title_costume_box,title_crystal_caves_1,title_crystal_caves_2,title_crystal_caves_3,title_crystals_rule,title_dino_dive,title_dino_drink,title_egg_dropper,title_fireworks,title_flower_waterer,title_happy_camel,title_heavy,title_honey_cake,title_leaf_leader,title_lifting,title_magma_peak_1,title_magma_peak_2,title_mushroom_sorter,title_ordering_spheres,title_pan_balance,title_pirate_tale,title_rulers,title_sandcastle,title_scrub,title_slop,title_treasure_map,title_treetop_city_1,title_treetop_city_2,title_treetop_city_3,title_watering_hole,title_welcome,event_2000,event_2010,event_2020,event_2025,event_2030,event_2035,event_2040,event_2050,event_2060,event_2070,event_2075,event_2080,event_2081,event_2083,event_3010,event_3020,event_3021,event_3110,event_3120,event_3121,event_4010,event_4020,event_4021,event_4022,event_4025,event_4030,event_4031,event_4035,event_4040,event_4045,event_4050,event_4070,event_4080,event_4090,event_4095,event_4100,event_4110,event_4220,event_4230,event_4235,event_5000,event_5010,game_time_p50,event_count_p50
0,00abaee7,454,27,14,373,253,241,373,1,2,0,79,1,0,0,0,26,26,1,0,0,159,1,1,2,1,0,135,0,61,264,0,0,0,0,0,2,1,1,0,0,0,0,0,103,0,0,0,1,0,0,0,1,26,1,11,1,7,0,0,0,1,1,0,0,0,0,111,33,9,108,33,9,3,120,12,0,10,148,0,33,0,0,0,190,0,0,0,1,0,0,0,0,0,0,105916,103
0,01242218,1356,245,29,1089,885,848,984,2,1,72,56,3,61,221,51,146,14,36,83,226,77,1,1,2,1,230,79,111,186,197,160,123,1,2,40,2,2,2,51,1,124,1,1,179,126,0,1,2,2,1,41,2,58,4,85,2,80,2,6,6,4,1,3,10,2,7,418,25,72,416,25,72,12,325,26,36,94,407,11,49,25,6,0,385,0,2,0,23,3,9,0,0,4,4,118071,126
0,017c5718,143,1,6,0,0,0,146,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,72,71,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,4,9,0,0,0,0,0,0,0,0,0,0,0,0,0,24,0,0,24,0,0,0,15,0,20,4,40,0,0,0,0,0,14,0,0,0,0,0,0,0,0,0,0,51907,71
0,01a44906,145,1,10,78,0,0,231,3,1,0,78,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,37,108,0,0,0,0,0,0,0,1,1,0,1,0,0,0,0,0,1,1,0,0,3,14,0,3,2,3,0,0,0,0,0,0,0,0,0,21,2,3,21,2,3,1,31,0,29,9,61,0,0,0,0,0,28,0,1,0,0,0,0,0,0,0,0,77204,78
0,01bc6cb6,226,1,17,708,522,3,424,3,0,0,420,1,0,0,0,0,1,0,0,226,158,0,1,1,1,0,0,0,0,0,0,130,1,1,0,1,2,1,0,3,0,0,0,0,0,0,0,1,0,0,0,3,25,0,28,9,25,0,0,0,0,0,0,1,0,1,75,27,27,73,27,27,6,146,0,3,0,231,0,62,16,4,0,134,0,0,5,0,0,0,0,0,0,0,221698,226


In [55]:
# Write both feature tables to the current directory, without the index
# column, then log the directory listing.
# NOTE(review): these files hold only as many installations as _preprocess
# processed (the first 10 ids of each table) — confirm before using them downstream.
train.to_csv('train.csv', index=False)
test.to_csv('test.csv', index=False)
_log(os.listdir("."))

['.ipynb_checkpoints', 'dsbowl19-preprocess.ipynb', 'test.csv', 'train.csv']
