In [40]:
import os
import random
from math import sqrt
import json
import numpy as np
import pandas as pd
import lightgbm as lgb
import sklearn
from sklearn.pipeline import Pipeline
from sklearn.metrics import cohen_kappa_score, make_scorer, mean_squared_error
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

In [41]:
def _log(str):
    os.system(f'echo \"{str}\"')
    print(str)

In [42]:
NAN = '__NAN__'
INPUT_ROOT = '../input/data-science-bowl-2019'
JOIN_KEY = ['installation_id', 'game_session', 'title']
TARGET = 'accuracy_group'
FEATURES = {
    'event_id', 
    'game_session', 
    'timestamp', 
    'installation_id', 
    'event_count',
    'event_code', 
    'game_time', 
    'title', 
    'type', 
    'world',
    'event_data'
}
EVENT_CODES = ['2000', '2010', '2020', '2025', '2030', '2035', '2040', '2050', '2060', '2070', '2075', '2080', '2081', '2083', '3010', '3020', '3021', '3110', '3120', '3121', '4010', '4020', '4021', '4022', '4025', '4030', '4031', '4035', '4040', '4045', '4050', '4070', '4080', '4090', '4095', '4100', '4110', '4220', '4230', '4235', '5000', '5010']
SEED = 31
FOLDS = 3
ESTIMATORS = 1000

In [43]:
def _init():
    # Characters such as empty strings '' or numpy.inf are considered NA values
    pd.set_option('use_inf_as_na', True)
    pd.set_option('display.max_columns', 999)
    pd.set_option('display.max_rows', 999)
    
    
_init()

In [44]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)


seed_everything(SEED)

In [45]:
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

for dirname, _, filenames in os.walk(INPUT_ROOT):
    for filename in filenames:
        _log(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

../input/data-science-bowl-2019\sample_submission.csv
../input/data-science-bowl-2019\test.csv
../input/data-science-bowl-2019\test.csv.zip
../input/data-science-bowl-2019\train.csv
../input/data-science-bowl-2019\train.csv.zip
../input/data-science-bowl-2019\train_labels.csv
../input/data-science-bowl-2019\train_labels.csv.zip


In [46]:
%%time
train_raw = pd.read_csv(f'{INPUT_ROOT}/train.csv', usecols=FEATURES)
train_labels = pd.read_csv(f'{INPUT_ROOT}/train_labels.csv', usecols=JOIN_KEY + [TARGET])
test_raw = pd.read_csv(f'{INPUT_ROOT}/test.csv', usecols=FEATURES)
train_labels.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17690 entries, 0 to 17689
Data columns (total 4 columns):
game_session       17690 non-null object
installation_id    17690 non-null object
title              17690 non-null object
accuracy_group     17690 non-null int64
dtypes: int64(1), object(3)
memory usage: 552.9+ KB
Wall time: 1min


# Add labels to train data

In [47]:
def _remove_unlabelled_data(train_raw, train_labels):
    return train_raw[train_raw['installation_id'].isin(train_labels['installation_id'].unique())]


train_raw = _remove_unlabelled_data(train_raw, train_labels)

In [48]:
%%time
def _add_labels(train_raw, train_labels, on):
    train_raw = pd.merge(train_raw, train_labels, on=on, how='left')
    train_raw[TARGET] = np.int16(train_raw[TARGET])
    return train_raw


train_raw = _add_labels(train_raw, train_labels, on=JOIN_KEY)
del train_labels

Wall time: 6.24 s


# Extract event data JSON

In [50]:
def _concat_columns(df1, df2):
    """Concatenate the columns of two pandas dataframes in the order of the operands.
    Both dataframes must have the same number of rows.
    """
    assert len(df1) == len(df2)
    res = pd.concat([df1, df2.reindex(df1.index)], axis=1, join='inner')
    assert len(res) == len(df1)
    return res
    

def _extract_event_data(df, keep_cols, chunk_size=100000):
    res = pd.DataFrame()
    _len = len(df)
    all_cols = []
    for i in tqdm(range(0, _len, chunk_size)):
        if i + chunk_size < _len:
            chunk = df[i:i + chunk_size].copy()
        else:
            chunk = df[i:].copy()
        ed = pd.io.json.json_normalize(chunk['event_data'].apply(json.loads)).add_prefix('ed.')
        if i == 0:
            all_cols = ed.columns.values
        ed = ed[keep_cols]
        chunk = _concat_columns(chunk, ed)
        res = pd.concat([res, chunk], ignore_index=True)
    _log(f'{len(all_cols)} event_data={all_cols}')
    assert len(df) == len(res)
    return res


keep_cols = ['ed.identifier', 'ed.duration', 'ed.coordinates.x', 'ed.coordinates.y',
            'ed.coordinates.stage_width', 'ed.coordinates.stage_height',
            'ed.level', 'ed.round', 'ed.correct', 'ed.misses']
train_raw = _extract_event_data(train_raw, keep_cols)
test_raw = _extract_event_data(test_raw, keep_cols)











  0%|                                                           | 0/78 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A[A









  1%|▋                                                  | 1/78 [00:06<08:26,  6.57s/it][A[A[A[A[A[A[A[A[A[A









  3%|█▎                                                 | 2/78 [00:13<08:29,  6.71s/it][A[A[A[A[A[A[A[A[A[A









  4%|█▉                                                 | 3/78 [00:19<08:16,  6.62s/it][A[A[A[A[A[A[A[A[A[A









  5%|██▌                                                | 4/78 [00:26<08:12,  6.65s/it][A[A[A[A[A[A[A[A[A[A









  6%|███▎                                               | 5/78 [00:33<08:05,  6.65s/it][A[A[A[A[A[A[A[A[A[A









  8%|███▉                                               | 6/78 [00:40<08:12,  6.84s/it][A[A[A[A[A[A[A[A[A[A









  9%|████▌                                              | 7/78 [00:47<08:00,  6.76s/it][A[A[A[A[

 82%|█████████████████████████████████████████         | 64/78 [09:29<02:26, 10.45s/it][A[A[A[A[A[A[A[A[A[A









 83%|█████████████████████████████████████████▋        | 65/78 [09:40<02:18, 10.63s/it][A[A[A[A[A[A[A[A[A[A









 85%|██████████████████████████████████████████▎       | 66/78 [09:50<02:06, 10.57s/it][A[A[A[A[A[A[A[A[A[A









 86%|██████████████████████████████████████████▉       | 67/78 [10:00<01:55, 10.52s/it][A[A[A[A[A[A[A[A[A[A









 87%|███████████████████████████████████████████▌      | 68/78 [10:11<01:46, 10.63s/it][A[A[A[A[A[A[A[A[A[A









 88%|████████████████████████████████████████████▏     | 69/78 [10:22<01:36, 10.76s/it][A[A[A[A[A[A[A[A[A[A









 90%|████████████████████████████████████████████▊     | 70/78 [10:33<01:25, 10.72s/it][A[A[A[A[A[A[A[A[A[A









 91%|█████████████████████████████████████████████▌    | 71/78 [10:45<01:16, 11.00s/it][A[A[A[A[A[A[A[A

136 event_data=['ed.event_code' 'ed.event_count' 'ed.version' 'ed.game_time'
 'ed.description' 'ed.identifier' 'ed.media_type' 'ed.total_duration'
 'ed.coordinates.x' 'ed.coordinates.y' 'ed.coordinates.stage_width'
 'ed.coordinates.stage_height' 'ed.duration' 'ed.size' 'ed.sand'
 'ed.filled' 'ed.castles_placed' 'ed.molds' 'ed.level' 'ed.round'
 'ed.movie_id' 'ed.time_played' 'ed.options' 'ed.animals'
 'ed.round_target.size' 'ed.round_target.type' 'ed.round_target.animal'
 'ed.item_type' 'ed.position' 'ed.animal' 'ed.correct' 'ed.misses'
 'ed.houses' 'ed.dinosaurs' 'ed.dinosaur' 'ed.dinosaurs_placed'
 'ed.house.size' 'ed.house.position' 'ed.rocket' 'ed.height' 'ed.launched'
 'ed.flowers' 'ed.flower' 'ed.growth' 'ed.stumps' 'ed.source'
 'ed.destination' 'ed.session_duration' 'ed.exit_type' 'ed.distance'
 'ed.target_distances' 'ed.round_prompt' 'ed.target_size' 'ed.resources'
 'ed.object_type' 'ed.group' 'ed.bug' 'ed.buglength' 'ed.stage_number'
 'ed.hat' 'ed.caterpillar' 'ed.hats' 'ed.ca











  8%|████▎                                              | 1/12 [00:06<01:15,  6.85s/it][A[A[A[A[A[A[A[A[A[A









 17%|████████▌                                          | 2/12 [00:14<01:09,  7.00s/it][A[A[A[A[A[A[A[A[A[A









 25%|████████████▊                                      | 3/12 [00:21<01:04,  7.15s/it][A[A[A[A[A[A[A[A[A[A









 33%|█████████████████                                  | 4/12 [00:28<00:56,  7.07s/it][A[A[A[A[A[A[A[A[A[A









 42%|█████████████████████▎                             | 5/12 [00:36<00:50,  7.22s/it][A[A[A[A[A[A[A[A[A[A









 50%|█████████████████████████▌                         | 6/12 [00:44<00:44,  7.41s/it][A[A[A[A[A[A[A[A[A[A









 58%|█████████████████████████████▊                     | 7/12 [00:51<00:37,  7.50s/it][A[A[A[A[A[A[A[A[A[A









 67%|██████████████████████████████████                 | 8/12 [00:59<00:29,  7.44s/it][A[A[A[A[

134 event_data=['ed.event_code' 'ed.event_count' 'ed.version' 'ed.round' 'ed.game_time'
 'ed.coordinates.x' 'ed.coordinates.y' 'ed.coordinates.stage_width'
 'ed.coordinates.stage_height' 'ed.description' 'ed.identifier'
 'ed.media_type' 'ed.total_duration' 'ed.duration' 'ed.dinosaur' 'ed.diet'
 'ed.target_weight' 'ed.resources' 'ed.weight' 'ed.source' 'ed.correct'
 'ed.scale_weight' 'ed.scale_contents' 'ed.houses' 'ed.dinosaurs'
 'ed.size' 'ed.dinosaurs_placed' 'ed.house.size' 'ed.house.position'
 'ed.misses' 'ed.rocket' 'ed.height' 'ed.launched' 'ed.crystals'
 'ed.crystal_id' 'ed.side' 'ed.left' 'ed.right' 'ed.session_duration'
 'ed.layout.row1' 'ed.layout.row2' 'ed.gate.row' 'ed.gate.column'
 'ed.gate.side' 'ed.nest' 'ed.bug' 'ed.sand' 'ed.filled'
 'ed.castles_placed' 'ed.target_water_level' 'ed.water_level' 'ed.level'
 'ed.movie_id' 'ed.time_played' 'ed.options' 'ed.animals'
 'ed.round_target.size' 'ed.round_target.type' 'ed.round_target.animal'
 'ed.item_type' 'ed.position' 'ed.ani




In [51]:
test_raw.info(max_cols=999)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1156414 entries, 0 to 1156413
Data columns (total 21 columns):
event_id                       1156414 non-null object
game_session                   1156414 non-null object
timestamp                      1156414 non-null object
event_data                     1156414 non-null object
installation_id                1156414 non-null object
event_count                    1156414 non-null int64
event_code                     1156414 non-null int64
game_time                      1156414 non-null int64
title                          1156414 non-null object
type                           1156414 non-null object
world                          1156414 non-null object
ed.identifier                  33835 non-null object
ed.duration                    32850 non-null float64
ed.coordinates.x               51261 non-null float64
ed.coordinates.y               51261 non-null float64
ed.coordinates.stage_width     51261 non-null float64
ed.coordinates.s

In [52]:
train_raw.info(max_cols=999)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7734558 entries, 0 to 7734557
Data columns (total 22 columns):
event_id                       object
game_session                   object
timestamp                      object
event_data                     object
installation_id                object
event_count                    int64
event_code                     int64
game_time                      int64
title                          object
type                           object
world                          object
accuracy_group                 int16
ed.identifier                  object
ed.duration                    float64
ed.coordinates.x               float64
ed.coordinates.y               float64
ed.coordinates.stage_width     float64
ed.coordinates.stage_height    float64
ed.level                       float64
ed.round                       float64
ed.correct                     object
ed.misses                      float64
dtypes: float64(8), int16(1), int64(3), object(

In [53]:
# All event ids in test set also exist in train set
test_set = set(test_raw['event_id'])
train_set = set(train_raw['event_id'])
vs = test_set - train_set
_log(f'{len(vs)} event_ids exist in test set but not train set.')

0 event_ids exist in test set but not train set.


In [54]:
EVENT_IDS = sorted(test_raw['event_id'].unique())
_log(f'{len(vs)} EVENT_IDS={vs}')

0 EVENT_IDS=set()


In [55]:
vs = sorted(train_raw['type'].unique())
_log(f'{len(vs)} train_raw type={vs}')

4 train_raw type=['Activity', 'Assessment', 'Clip', 'Game']


In [56]:
vs = sorted(train_raw['world'].unique())
_log(f'{len(vs)} train_raw type={vs}')

4 train_raw type=['CRYSTALCAVES', 'MAGMAPEAK', 'NONE', 'TREETOPCITY']


In [57]:
vs = sorted(train_raw['event_code'].unique())
_log(f'{len(vs)} train_raw type={vs}')

42 train_raw type=[2000, 2010, 2020, 2025, 2030, 2035, 2040, 2050, 2060, 2070, 2075, 2080, 2081, 2083, 3010, 3020, 3021, 3110, 3120, 3121, 4010, 4020, 4021, 4022, 4025, 4030, 4031, 4035, 4040, 4045, 4050, 4070, 4080, 4090, 4095, 4100, 4110, 4220, 4230, 4235, 5000, 5010]


In [58]:
vs = sorted(train_raw['title'].unique())
_log(f'{len(vs)} train_raw titles={vs}')

44 train_raw titles=['12 Monkeys', 'Air Show', 'All Star Sorting', 'Balancing Act', 'Bird Measurer (Assessment)', 'Bottle Filler (Activity)', 'Bubble Bath', 'Bug Measurer (Activity)', 'Cart Balancer (Assessment)', 'Cauldron Filler (Assessment)', 'Chest Sorter (Assessment)', 'Chicken Balancer (Activity)', 'Chow Time', 'Costume Box', 'Crystal Caves - Level 1', 'Crystal Caves - Level 2', 'Crystal Caves - Level 3', 'Crystals Rule', 'Dino Dive', 'Dino Drink', 'Egg Dropper (Activity)', 'Fireworks (Activity)', 'Flower Waterer (Activity)', 'Happy Camel', 'Heavy, Heavier, Heaviest', 'Honey Cake', 'Leaf Leader', 'Lifting Heavy Things', 'Magma Peak - Level 1', 'Magma Peak - Level 2', 'Mushroom Sorter (Assessment)', 'Ordering Spheres', 'Pan Balance', "Pirate's Tale", 'Rulers', 'Sandcastle Builder (Activity)', 'Scrub-A-Dub', 'Slop Problem', 'Treasure Map', 'Tree Top City - Level 1', 'Tree Top City - Level 2', 'Tree Top City - Level 3', 'Watering Hole (Activity)', 'Welcome to Lost Lagoon!']


In [59]:
vs = sorted(test_raw['title'].unique())
_log(f'{len(vs)} test titles={vs}')

44 test titles=['12 Monkeys', 'Air Show', 'All Star Sorting', 'Balancing Act', 'Bird Measurer (Assessment)', 'Bottle Filler (Activity)', 'Bubble Bath', 'Bug Measurer (Activity)', 'Cart Balancer (Assessment)', 'Cauldron Filler (Assessment)', 'Chest Sorter (Assessment)', 'Chicken Balancer (Activity)', 'Chow Time', 'Costume Box', 'Crystal Caves - Level 1', 'Crystal Caves - Level 2', 'Crystal Caves - Level 3', 'Crystals Rule', 'Dino Dive', 'Dino Drink', 'Egg Dropper (Activity)', 'Fireworks (Activity)', 'Flower Waterer (Activity)', 'Happy Camel', 'Heavy, Heavier, Heaviest', 'Honey Cake', 'Leaf Leader', 'Lifting Heavy Things', 'Magma Peak - Level 1', 'Magma Peak - Level 2', 'Mushroom Sorter (Assessment)', 'Ordering Spheres', 'Pan Balance', "Pirate's Tale", 'Rulers', 'Sandcastle Builder (Activity)', 'Scrub-A-Dub', 'Slop Problem', 'Treasure Map', 'Tree Top City - Level 1', 'Tree Top City - Level 2', 'Tree Top City - Level 3', 'Watering Hole (Activity)', 'Welcome to Lost Lagoon!']


In [60]:
def _transform_timestamp(df):
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    return df


train_raw = _transform_timestamp(train_raw)
test_raw = _transform_timestamp(test_raw)

In [61]:
%%time
def _set_string_type(df, cols):
    df[cols] = df[cols].fillna(NAN).astype(str)
    return df


cols = ['event_code', 'timestamp']
train_raw = _set_string_type(train_raw, cols=cols)
test_raw = _set_string_type(test_raw, cols=cols)

Wall time: 1min 22s


In [62]:
%%time
def _sort_it(df):
    return df.sort_values(by=['installation_id', 'timestamp'])


train_raw = _sort_it(train_raw)
test_raw = _sort_it(test_raw)

Wall time: 36.6 s


# Multiple accuracy groups per installation id
In the train set, there are multiple accuracy groups per installation id. The task is to predict the accuracy group of the **last** assessment for a given installation id.

In [63]:
vs = train_raw[train_raw[TARGET].notna()].groupby('installation_id')[TARGET].nunique()
vs

installation_id
0006a69f    3
0006c192    3
00129856    2
001d0ed0    3
00225f67    1
           ..
ff9305d7    2
ff9715db    4
ffc90c32    3
ffd2871d    2
ffeb0b1b    2
Name: accuracy_group, Length: 3614, dtype: int64

In [64]:
train_raw.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7734558 entries, 0 to 7734557
Data columns (total 22 columns):
event_id                       object
game_session                   object
timestamp                      object
event_data                     object
installation_id                object
event_count                    int64
event_code                     object
game_time                      int64
title                          object
type                           object
world                          object
accuracy_group                 int16
ed.identifier                  object
ed.duration                    float64
ed.coordinates.x               float64
ed.coordinates.y               float64
ed.coordinates.stage_width     float64
ed.coordinates.stage_height    float64
ed.level                       float64
ed.round                       float64
ed.correct                     object
ed.misses                      float64
dtypes: float64(8), int16(1), int64(2), object

In [65]:
def _log_smoothing(df, cols):
    for col in cols:
        df[col] = np.log(df[col] + 1)
    return df


#cols = ['event_count', 'game_time']
#train = _log_smoothing(train, cols)
#test = _log_smoothing(test, cols)

In [66]:
train_raw.head(40)

Unnamed: 0,event_id,game_session,timestamp,event_data,installation_id,event_count,event_code,game_time,title,type,world,accuracy_group,ed.identifier,ed.duration,ed.coordinates.x,ed.coordinates.y,ed.coordinates.stage_width,ed.coordinates.stage_height,ed.level,ed.round,ed.correct,ed.misses
0,27253bdc,34ba1a28d02ba8ba,2019-08-06 04:57:18.904000+00:00,"{""event_code"": 2000, ""event_count"": 1}",0006a69f,1,2000,0,Welcome to Lost Lagoon!,Clip,NONE,0,,,,,,,,,,
1,27253bdc,4b57c9a59474a1b9,2019-08-06 04:57:45.301000+00:00,"{""event_code"": 2000, ""event_count"": 1}",0006a69f,1,2000,0,Magma Peak - Level 1,Clip,MAGMAPEAK,0,,,,,,,,,,
2,77261ab5,2b9d5af79bcdb79f,2019-08-06 04:58:14.538000+00:00,"{""version"":""1.0"",""event_count"":1,""game_time"":0...",0006a69f,1,2000,0,Sandcastle Builder (Activity),Activity,MAGMAPEAK,0,,,,,,,,,,
3,b2dba42b,2b9d5af79bcdb79f,2019-08-06 04:58:14.615000+00:00,"{""description"":""Let's build a sandcastle! Firs...",0006a69f,2,3010,29,Sandcastle Builder (Activity),Activity,MAGMAPEAK,0,"Dot_LetsSandcastle,Dot_FillMold,Dot_MoldShape",,,,,,,,,
4,1325467d,2b9d5af79bcdb79f,2019-08-06 04:58:16.680000+00:00,"{""coordinates"":{""x"":273,""y"":650,""stage_width"":...",0006a69f,3,4070,2137,Sandcastle Builder (Activity),Activity,MAGMAPEAK,0,,,273.0,650.0,1015.0,762.0,,,,
5,1325467d,2b9d5af79bcdb79f,2019-08-06 04:58:18.474000+00:00,"{""coordinates"":{""x"":863,""y"":237,""stage_width"":...",0006a69f,4,4070,3937,Sandcastle Builder (Activity),Activity,MAGMAPEAK,0,,,863.0,237.0,1015.0,762.0,,,,
6,1325467d,2b9d5af79bcdb79f,2019-08-06 04:58:19.365000+00:00,"{""coordinates"":{""x"":817,""y"":617,""stage_width"":...",0006a69f,5,4070,4820,Sandcastle Builder (Activity),Activity,MAGMAPEAK,0,,,817.0,617.0,1015.0,762.0,,,,
7,1bb5fbdb,2b9d5af79bcdb79f,2019-08-06 04:58:21.490000+00:00,"{""description"":""Let's build a sandcastle! Firs...",0006a69f,6,3110,6954,Sandcastle Builder (Activity),Activity,MAGMAPEAK,0,"Dot_LetsSandcastle,Dot_FillMold,Dot_MoldShape",6925.0,,,,,,,,
8,1325467d,2b9d5af79bcdb79f,2019-08-06 04:58:22.732000+00:00,"{""coordinates"":{""x"":809,""y"":180,""stage_width"":...",0006a69f,7,4070,8187,Sandcastle Builder (Activity),Activity,MAGMAPEAK,0,,,809.0,180.0,1015.0,762.0,,,,
9,5e812b27,2b9d5af79bcdb79f,2019-08-06 04:58:23.295000+00:00,"{""size"":0,""coordinates"":{""x"":782,""y"":207,""stag...",0006a69f,8,4030,8745,Sandcastle Builder (Activity),Activity,MAGMAPEAK,0,,,782.0,207.0,1015.0,762.0,,,,


In [67]:
test_raw.head()

Unnamed: 0,event_id,game_session,timestamp,event_data,installation_id,event_count,event_code,game_time,title,type,world,ed.identifier,ed.duration,ed.coordinates.x,ed.coordinates.y,ed.coordinates.stage_width,ed.coordinates.stage_height,ed.level,ed.round,ed.correct,ed.misses
0,27253bdc,0ea9ecc81a565215,2019-09-10 16:50:24.910000+00:00,"{""event_code"": 2000, ""event_count"": 1}",00abaee7,1,2000,0,Welcome to Lost Lagoon!,Clip,NONE,,,,,,,,,,
1,27253bdc,c1ea43d8b8261d27,2019-09-10 16:50:55.503000+00:00,"{""event_code"": 2000, ""event_count"": 1}",00abaee7,1,2000,0,Magma Peak - Level 1,Clip,MAGMAPEAK,,,,,,,,,,
2,27253bdc,7ed86c6b72e725e2,2019-09-10 16:51:51.805000+00:00,"{""event_code"": 2000, ""event_count"": 1}",00abaee7,1,2000,0,Magma Peak - Level 2,Clip,MAGMAPEAK,,,,,,,,,,
3,27253bdc,7e516ace50e7fe67,2019-09-10 16:53:12.825000+00:00,"{""event_code"": 2000, ""event_count"": 1}",00abaee7,1,2000,0,Crystal Caves - Level 1,Clip,CRYSTALCAVES,,,,,,,,,,
4,7d093bf9,a022c3f60ba547e7,2019-09-10 16:54:12.115000+00:00,"{""version"":""1.0"",""round"":0,""event_count"":1,""ga...",00abaee7,1,2000,0,Chow Time,Game,CRYSTALCAVES,,,,,,,,0.0,,


# Train-test split not by time
Both train and test sets span the same time period.

In [68]:
_log(f'train[timestamp] is from {train_raw.timestamp.min()} to {train_raw.timestamp.max()}')
_log(f'test[timestamp] is from {test_raw.timestamp.min()} to {test_raw.timestamp.max()}')

train[timestamp] is from 2019-07-23 14:38:25.256000+00:00 to 2019-10-14 21:57:26.930000+00:00
test[timestamp] is from 2019-07-24 00:04:25.361000+00:00 to 2019-10-14 21:00:34.858000+00:00


# Feature Engineering

In [None]:
def _cutoff(df, TARGET):
    return df[df[TARGET].notna()]['timestamp'].max()


def _trim_events_after_last_assessment(df, cutoff):
    res = df[df['timestamp'] <= cutoff]
    #_log(f'cutoff: before={df.shape}, after={res.shape}')
    return res
    
    
def _target_variable(df, cutoff, TARGET):
    vs = df[df['timestamp'] == cutoff][TARGET].values
    assert len(set(vs)) == 1
    return int(float(vs[0]))
    
    
def _game_session_stats(df, col):
    vs = df.groupby(['game_session'])[col].transform('max')
    return (vs.median(), vs.max())


def _event_code_counts(df, code):
    total = np.int32([sum(df['event_code'] == code)])
    activity = np.int32([sum((df['event_code'] == code) & (df['type'] == 'Activity'))])
    assessment = np.int32([sum((df['event_code'] == code) & (df['type'] == 'Assessment'))])
    clip = np.int32([sum((df['event_code'] == code) & (df['type'] == 'Clip'))])
    game = np.int32([sum((df['event_code'] == code) & (df['type'] == 'Game'))])
    return (total, activity, assessment, clip, game)


def _features_map(df, EVENT_CODES, EVENT_IDS, suffix=''):
    res = {}
    res[f'type_activity{suffix}'] = np.int32([sum(df['type'] == 'Activity')])
    res[f'type_assessment{suffix}'] = np.int32([sum(df['type'] == 'Assessment')])
    res[f'type_clip{suffix}'] = np.int32([sum(df['type'] == 'Clip')])
    res[f'type_game{suffix}'] = np.int32([sum(df['type'] == 'Game')])
    assert len(df) == res[f'type_activity{suffix}'][0] + res[f'type_assessment{suffix}'][0] + res[f'type_clip{suffix}'][0] + res[f'type_game{suffix}'][0]
    res[f'world_crystalcaves{suffix}'] = np.int32([sum(df['world'] == 'CRYSTALCAVES')])
    res[f'world_magmapeak{suffix}'] = np.int32([sum(df['world'] == 'MAGMAPEAK')])
    res[f'world_treetopcity{suffix}'] = np.int32([sum(df['world'] == 'TREETOPCITY')])
    res[f'world_none{suffix}'] = np.int32([sum(df['world'] == 'NONE')])
    res[f'title_12_monkeys{suffix}'] = np.int32([sum(df['title'] == '12 Monkeys')])
    res[f'title_air_show{suffix}'] = np.int32([sum(df['title'] == 'Air Show')])
    res[f'title_all_star_sorting{suffix}'] = np.int32([sum(df['title'] == 'All Star Sorting')])
    res[f'title_balancing_act{suffix}'] = np.int32([sum(df['title'] == 'Balancing Act')])
    res[f'title_bird_measurer{suffix}'] = np.int32([sum(df['title'] == 'Bird Measurer (Assessment)')])
    res[f'title_bottle_filler{suffix}'] = np.int32([sum(df['title'] == 'Bottle Filler (Activity)')])
    res[f'title_bubble_bath{suffix}'] = np.int32([sum(df['title'] == 'Bubble Bath')])
    res[f'title_bug_measurer{suffix}'] = np.int32([sum(df['title'] == 'Bug Measurer (Activity)')])
    res[f'title_cart_balancer{suffix}'] = np.int32([sum(df['title'] == 'Cart Balancer (Assessment)')])
    res[f'title_cauldron_filler{suffix}'] = np.int32([sum(df['title'] == 'Cauldron Filler (Assessment)')])
    res[f'title_chest_sorter{suffix}'] = np.int32([sum(df['title'] == 'Chest Sorter (Assessment)')])
    res[f'title_chicken_balancer{suffix}'] = np.int32([sum(df['title'] == 'Chicken Balancer (Activity)')])
    res[f'title_chow_time{suffix}'] = np.int32([sum(df['title'] == 'Chow Time')])
    res[f'title_costume_box{suffix}'] = np.int32([sum(df['title'] == 'Costume Box')])
    res[f'title_crystal_caves_1{suffix}'] = np.int32([sum(df['title'] == 'Crystal Caves - Level 1')])
    res[f'title_crystal_caves_2{suffix}'] = np.int32([sum(df['title'] == 'Crystal Caves - Level 2')])
    res[f'title_crystal_caves_3{suffix}'] = np.int32([sum(df['title'] == 'Crystal Caves - Level 3')])
    res[f'title_crystals_rule{suffix}'] = np.int32([sum(df['title'] == 'Crystals Rule')])
    res[f'title_dino_dive{suffix}'] = np.int32([sum(df['title'] == 'Dino Dive')])
    res[f'title_dino_drink{suffix}'] = np.int32([sum(df['title'] == 'Dino Drink')])
    res[f'title_egg_dropper{suffix}'] = np.int32([sum(df['title'] == 'Egg Dropper (Activity)')])
    res[f'title_fireworks{suffix}'] = np.int32([sum(df['title'] == 'Fireworks (Activity)')])
    res[f'title_flower_waterer{suffix}'] = np.int32([sum(df['title'] == 'Flower Waterer (Activity)')])
    res[f'title_happy_camel{suffix}'] = np.int32([sum(df['title'] == 'Happy Camel')])
    res[f'title_heavy{suffix}'] = np.int32([sum(df['title'] == 'Heavy, Heavier, Heaviest')])
    res[f'title_honey_cake{suffix}'] = np.int32([sum(df['title'] == 'Honey Cake')])
    res[f'title_leaf_leader{suffix}'] = np.int32([sum(df['title'] == 'Leaf Leader')])
    res[f'title_lifting{suffix}'] = np.int32([sum(df['title'] == 'Lifting Heavy Things')])
    res[f'title_magma_peak_1{suffix}'] = np.int32([sum(df['title'] == 'Magma Peak - Level 1')])
    res[f'title_magma_peak_2{suffix}'] = np.int32([sum(df['title'] == 'Magma Peak - Level 2')])
    res[f'title_mushroom_sorter{suffix}'] = np.int32([sum(df['title'] == 'Mushroom Sorter (Assessment)')])
    res[f'title_ordering_spheres{suffix}'] = np.int32([sum(df['title'] == 'Ordering Spheres')])
    res[f'title_pan_balance{suffix}'] = np.int32([sum(df['title'] == 'Pan Balance')])
    res[f'title_pirate_tale{suffix}'] = np.int32([sum(df['title'] == "Pirate's Tale")])
    res[f'title_rulers{suffix}'] = np.int32([sum(df['title'] == 'Rulers')])
    res[f'title_sandcastle{suffix}'] = np.int32([sum(df['title'] == 'Sandcastle Builder (Activity)')])
    res[f'title_scrub{suffix}'] = np.int32([sum(df['title'] == 'Scrub-A-Dub')])
    res[f'title_slop{suffix}'] = np.int32([sum(df['title'] == 'Slop Problem')])
    res[f'title_treasure_map{suffix}'] = np.int32([sum(df['title'] == 'Treasure Map')])
    res[f'title_treetop_city_1{suffix}'] = np.int32([sum(df['title'] == 'Tree Top City - Level 1')])
    res[f'title_treetop_city_2{suffix}'] = np.int32([sum(df['title'] == 'Tree Top City - Level 2')])
    res[f'title_treetop_city_3{suffix}'] = np.int32([sum(df['title'] == 'Tree Top City - Level 3')])
    res[f'title_watering_hole{suffix}'] = np.int32([sum(df['title'] == 'Watering Hole (Activity)')])
    res[f'title_welcome{suffix}'] = np.int32([sum(df['title'] == 'Welcome to Lost Lagoon!')])
    
    for code in EVENT_CODES:
        (total, activity, assessment, clip, game) = _event_code_counts(df, code)
        res[f'event_{code}{suffix}'] = total
        res[f'event_{code}_activity{suffix}'] = activity
        res[f'event_{code}_assessment{suffix}'] = assessment
        res[f'event_{code}_clip{suffix}'] = clip
        res[f'event_{code}_game{suffix}'] = game
    
    for eid in EVENT_IDS:
        res[f'eid_{eid}{suffix}'] = np.int32([sum(df['event_id'] == eid)])
    
    res[f'game_time{suffix}'] = np.int32(df['game_time'].max())
    res[f'event_count{suffix}'] = np.int32(df['event_count'].max())
    res[f'ed_duration{suffix}'] = np.int32(df['ed.duration'].fillna(0).max())
    res[f'ed_level{suffix}'] = np.int32(df['ed.level'].fillna(0).max())
    res[f'ed_round{suffix}'] = np.int32(df['ed.round'].fillna(0).max())
    res[f'ed_correct{suffix}'] = np.int32(df['ed.correct'].fillna(0).max())
    res[f'ed_misses{suffix}'] = np.int32(df['ed.misses'].fillna(0).max())
    df['ed.coordinates.x'] = np.int32(df['ed.coordinates.x'].fillna(0))
    df['ed.coordinates.y'] = np.int32(df['ed.coordinates.y'].fillna(0))
    df['ed.coordinates.stage_width'] = np.int32(df['ed.coordinates.stage_width'].fillna(0))
    df['ed.coordinates.stage_height'] = np.int32(df['ed.coordinates.stage_height'].fillna(0))
    #res[f'ed_coord_x{suffix}'] = np.float32((df['ed.coordinates.x'] / df['ed.coordinates.stage_width']).fillna(0))
    return res


def _features(df, installation_id, EVENT_CODES, EVENT_IDS):
    res = {}
    iid = df[df['installation_id'] == installation_id]
    if TARGET in df.columns:
        cutoff = _cutoff(iid, TARGET)
        iid = _trim_events_after_last_assessment(iid, cutoff)
        res[TARGET] = _target_variable(iid, cutoff, TARGET)
    res['installation_id'] = [installation_id]
    cols = ['game_time', 'event_count']
    for col in cols:
        (_median, _max) = np.int32(_game_session_stats(iid, col))
        res[f'{col}_p50'] = _median
        res[f'{col}_max'] = _max
    res.update(_features_map(iid, EVENT_CODES, EVENT_IDS))
    return pd.DataFrame.from_dict(res)


def _preprocess(raw, EVENT_CODES, EVENT_IDS):
    res = pd.DataFrame()
    iids = raw['installation_id'].unique()
    for iid in tqdm(iids):
        res = pd.concat([res, _features(raw, iid, EVENT_CODES, EVENT_IDS)])
    return res


train = _preprocess(train_raw, EVENT_CODES, EVENT_IDS)
train.info(max_cols=999)













  0%|                                                         | 0/3614 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A[A[A[A











  0%|                                               | 1/3614 [00:01<1:09:04,  1.15s/it][A[A[A[A[A[A[A[A[A[A[A[A











  0%|                                               | 2/3614 [00:02<1:06:00,  1.10s/it][A[A[A[A[A[A[A[A[A[A[A[A











  0%|                                               | 3/3614 [00:02<1:00:55,  1.01s/it][A[A[A[A[A[A[A[A[A[A[A[A











  0%|                                                 | 4/3614 [00:03<57:44,  1.04it/s][A[A[A[A[A[A[A[A[A[A[A[A











  0%|                                                 | 5/3614 [00:04<55:18,  1.09it/s][A[A[A[A[A[A[A[A[A[A[A[A











  0%|                                               | 6/3614 [00:05<1:02:33,  1.04s/it][A[A[A[A[A[A[A[A[A[A[A[A











  0%|                                      

In [None]:
train.head(10)

In [None]:
test = _preprocess(test_raw, EVENT_CODES, EVENT_IDS)
test.info(max_cols=999)

In [None]:
test.head(20)

In [None]:
train.to_parquet('train.parquet')
test.to_parquet('test.parquet')
_log(os.listdir("."))

# Train Model

In [None]:
train = pd.read_parquet('train.parquet')
test = pd.read_parquet('test.parquet')

In [None]:
PREDICTORS = set(test.columns.values) - {'installation_id'}

In [None]:
def _rmse(y, y_pred):
    return sqrt(mean_squared_error(y, y_pred))


SCORING = make_scorer(_rmse, greater_is_better = False)

In [None]:
%%time
y_train = train[TARGET].astype(int)
x_train = train[PREDICTORS]
model = lgb.LGBMRegressor(n_estimators=ESTIMATORS, reg_alpha=1)
pipe = Pipeline([('model', model)])
param_grid = {
    'model__learning_rate': [0.01],
    'model__num_leaves': [80],
    'model__min_child_samples': [200],
    'model__colsample_bytree': [0.5]
}
cv = GridSearchCV(pipe, cv=FOLDS, param_grid=param_grid, scoring=SCORING)
#cv.fit(x_train, y_train, model__early_stopping_rounds=200, model__verbose=500)
cv.fit(x_train, y_train)
#assert cv.best_estimator_['model'].n_classes_ == 4
_log(f'best_params_={cv.best_params_}\nbest_score_={cv.best_score_:.5f}')

In [None]:
# plot_metric only works with early stopping rounds
#lgb.plot_metric(cv.best_estimator_['model'])

In [None]:
lgb.plot_importance(cv.best_estimator_['model'], max_num_features=100, figsize=(10, 30))

# Predict out of fold

In [None]:
oof = train[['installation_id']].copy()
oof[TARGET] = cv.predict(x_train)
assert oof[TARGET].min() > -0.5
assert oof[TARGET].max() < 3.5
oof[TARGET] = np.round(oof[TARGET]).astype(int)
oof.head()

In [None]:
score = cohen_kappa_score(oof[TARGET], y_train, weights='quadratic')
_log(f'oof score={score:.5f}')

# Predict on Test set

In [None]:
x_test = test[PREDICTORS]
sub = test[['installation_id']].copy()
sub[TARGET] = cv.predict(x_test)
assert sub[TARGET].min() > -0.5
assert sub[TARGET].max() < 3.5
sub[TARGET] = np.round(sub[TARGET]).astype(int)
sub.head()

In [None]:
plt.subplot(1, 3, 1)
plt.title('test predict')
sub[TARGET].plot(kind='hist')
plt.subplot(1, 3, 2)
plt.title('oof predict')
oof[TARGET].plot(kind='hist')
plt.subplot(1, 3, 3)
plt.title('oof truth')
tmp = train[TARGET].copy()
tmp = tmp.astype(int)
tmp.plot(kind='hist')
plt.tight_layout()

In [None]:
sub.to_csv('submission.csv', index=False)
_log(os.listdir("."))