In [1]:
import os
import pandas as pd
import numpy as np
import ujson as json
import matplotlib.pyplot as plt
import lightgbm as lgb

from itertools import combinations

from sklearn.model_selection import cross_val_score, ShuffleSplit, train_test_split
from sklearn.metrics import roc_auc_score

In [2]:
PATH_TO_DATA = './data/'
SEED = 17
SIZE = None

In [3]:
def replace_na(df):
    df[np.isnan(df)] = 0
    df[np.isinf(df)] = 0
    return df

In [4]:
def read_matches(matches_file, size=None):
    with open(os.path.join(PATH_TO_DATA, matches_file)) as f:
        count = 0
        for line in f:
            if size is not None and count >= size:
                return
            count += 1
            yield json.loads(line)

In [5]:
def count_ruined_towers(match):
    radiant_tower_kills = 0
    dire_tower_kills = 0
    for objective in match['objectives']:
        if objective['type'] == 'CHAT_MESSAGE_TOWER_KILL':
            if objective['team'] == 2:
                radiant_tower_kills += 1
            if objective['team'] == 3:
                dire_tower_kills += 1
    return radiant_tower_kills, dire_tower_kills

In [6]:
def count_players_feature(match, feature):
    radiant = 0
    dire = 0
    for slot, player in enumerate(match['players']):
        if slot < 5:
            radiant += player[feature]
        else:
            dire += player[feature]
    return radiant, dire

In [7]:
def combine_numeric_features(df, feature_suffixes):
    for feat_suff in feature_suffixes:
        for team in 'r', 'd':
            players = [f'{team}{i}' for i in range(1, 6)]
            player_col_names = [f'{player}_{feat_suff}' for player in players]

            df[f'{team}_{feat_suff}_std'] = df[player_col_names].std(axis=1)
            df[f'{team}_{feat_suff}_mean'] = df[player_col_names].mean(axis=1)

            df.drop(columns=player_col_names, inplace=True)
        
        df[f'{feat_suff}_std_ratio'] = df[f'r_{feat_suff}_std'] / df[f'd_{feat_suff}_std']
        df[f'{feat_suff}_mean_ratio'] = df[f'r_{feat_suff}_mean'] / df[f'd_{feat_suff}_mean']
        df.drop(
            columns=[f'r_{feat_suff}_std', f'd_{feat_suff}_std', f'r_{feat_suff}_mean',
                     f'd_{feat_suff}_mean'],
            inplace=True
        )
    return df

In [8]:
def make_coordinate_features(df):
    for team in 'r', 'd':
        for player in [f'{team}{i}' for i in range(1, 6)]:
            df[f'{player}_distance'] = np.sqrt(df[f'{player}_x']**2 + df[f'{player}_y']**2)
            df.drop(columns=[f'{player}_x', f'{player}_y'], inplace=True)
    return df

In [9]:
def ohe_hero_ids(df):
    for team in ('r', 'd'):
        for i in range(1, 6):
            col = f'{team}{i}_hero_id'
            df = pd.concat([df, pd.get_dummies(df[col], prefix=col)], axis=1)
            df.drop(columns=[col], inplace=True)
    return df

In [10]:
def add_new_features(df_features, matches_file, size=None):
    df_features = df_features.copy()

    for match in read_matches(matches_file, size=size):
        match_id = match['match_id_hash']

        radiant_tower_kills, dire_tower_kills = count_ruined_towers(match)
        df_features.loc[match_id, 'radiant_tower_kills'] = radiant_tower_kills
        df_features.loc[match_id, 'dire_tower_kills'] = dire_tower_kills
        df_features.loc[match_id, 'diff_tower_kills'] = radiant_tower_kills - dire_tower_kills

        for feature in ('gold', 'xp', ):
            radiant, dire = count_players_feature(match, feature)
            df_features.loc[match_id, f'radiant_{feature}'] = radiant
            df_features.loc[match_id, f'dire_{feature}'] = dire
            df_features.loc[match_id, f'diff_{feature}'] = radiant - dire

    df_features = make_coordinate_features(df_features)

    numeric_features = [
        'kills', 'deaths', 'assists', 'denies', 'gold', 'xp', 'health',
        'max_health', 'max_mana', 'level', 'towers_killed', 'stuns',
        'creeps_stacked', 'camps_stacked', 'lh', 'rune_pickups', 'firstblood_claimed',
        'teamfight_participation', 'roshans_killed', 'obs_placed', 'sen_placed'
    ]
    df_features = replace_na(combine_numeric_features(df_features, numeric_features))
    
    df_features = ohe_hero_ids(df_features)

    return df_features

In [11]:
df_train = pd.read_csv(
    os.path.join(PATH_TO_DATA, 'train_features.csv'), 
    index_col='match_id_hash'
)
df_targets = pd.read_csv(
    os.path.join(PATH_TO_DATA, 'train_targets.csv'), 
    index_col='match_id_hash'
)
df_test = pd.read_csv(
    os.path.join(PATH_TO_DATA, 'test_features.csv'), 
    index_col='match_id_hash'
)

In [12]:
df_train_ext = add_new_features(df_train, 'train_matches.jsonl', size=SIZE)[:SIZE]
df_targets_ext = df_targets[:SIZE]

In [13]:
X = df_train_ext
y = df_targets_ext['radiant_win'].map({True: 1, False: 0})

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3, random_state=SEED)

categorical_features = [
    'objectives_len', 'game_mode',
    'radiant_tower_kills', 'dire_tower_kills', 'diff_tower_kills',
]

In [14]:
X_train.head()

Unnamed: 0_level_0,game_time,game_mode,lobby_type,objectives_len,chat_len,radiant_tower_kills,dire_tower_kills,diff_tower_kills,radiant_gold,dire_gold,...,d5_hero_id_107,d5_hero_id_108,d5_hero_id_109,d5_hero_id_110,d5_hero_id_111,d5_hero_id_112,d5_hero_id_113,d5_hero_id_114,d5_hero_id_119,d5_hero_id_120
match_id_hash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2c18aa48409e75984884f5ed31f28f49,292,22,7,1,19,0.0,0.0,0.0,5443.0,6788.0,...,0,0,0,0,0,0,0,0,0,0
c532424db819b975390c511a8ceb28d3,1504,4,0,12,5,6.0,0.0,6.0,54429.0,29478.0,...,0,0,0,0,0,0,0,0,0,0
57aa998b502506ce93dce3a49b3e8c4a,193,22,7,0,0,0.0,0.0,0.0,4159.0,3701.0,...,0,0,0,0,0,0,0,0,0,0
63323de8f997757cf1db142061f432d4,1449,22,7,6,3,3.0,1.0,2.0,39936.0,36575.0,...,0,0,0,0,0,0,0,0,0,0
9884bac6e6350d06d2df11a644ce2ce5,575,22,7,1,1,0.0,0.0,0.0,10614.0,12668.0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
params = {
    'metric': 'auc',
    'is_unbalance': True,
    'bagging_seed': SEED,
}

lgtrain = lgb.Dataset(X_train, y_train)
lgvalid = lgb.Dataset(X_valid, y_valid)

model = lgb.train(
    params,
    lgtrain,
    num_boost_round=500,
    valid_sets=lgvalid,
    early_stopping_rounds=100,
    verbose_eval=50
)

Training until validation scores don't improve for 100 rounds
[50]	valid_0's auc: 0.826267
[100]	valid_0's auc: 0.826531
[150]	valid_0's auc: 0.825656
Early stopping, best iteration is:
[67]	valid_0's auc: 0.82717


In [20]:
df_test_ext = add_new_features(df_test, 'test_matches.jsonl', size=SIZE)[:SIZE]
model = lgb.train(
    params,
    lgb.Dataset(X[:SIZE], y[:SIZE]),
    num_boost_round=500,
)
y_pred = model.predict(df_test_ext, num_iteration=model.best_iteration)