In [1]:
import os
import pickle
import pandas as pd
import numpy as np
import ujson as json
import matplotlib.pyplot as plt
import seaborn as sns

from itertools import combinations
from functools import partial
from collections import OrderedDict

from sklearn.model_selection import cross_val_score, ShuffleSplit, train_test_split
from sklearn.decomposition import PCA
from catboost import CatBoostClassifier, CatBoostRegressor

In [2]:
PATH_TO_DATA = './data/'
SEED = 17
SIZE = 1000

In [3]:
def get_game_time(match):
    if match['game_time'] == 0:
        return [match['players'][0]['times'][-1]]
    else:
        return [match['game_time']]

In [4]:
def replace_na(df):
    df[np.isnan(df)] = 0
    df[np.isinf(df)] = 0
    return df

In [5]:
def read_matches(matches_file, size=None):
    with open(os.path.join(PATH_TO_DATA, matches_file)) as f:
        count = 0
        for line in f:
            if size is not None and count >= size:
                return
            count += 1
            yield json.loads(line)

In [6]:
def count_ruined_towers(match):
    radiant_tower_kills = 0
    dire_tower_kills = 0
    for objective in match['objectives']:
        if objective['type'] == 'CHAT_MESSAGE_TOWER_KILL':
            if objective['team'] == 2:
                radiant_tower_kills += 1
            if objective['team'] == 3:
                dire_tower_kills += 1
    return radiant_tower_kills, dire_tower_kills, radiant_tower_kills-dire_tower_kills

In [7]:
def ratio_numeric_features(df, features):
    for feature in features:
        for team in 'r', 'd':
            cols = [f'{team}{i}_{feature}' for i in range(1, 6)]
            df[f'{team}_{feature}_std'] = df[cols].std(axis=1)
            df[f'{team}_{feature}_mean'] = df[cols].mean(axis=1)
            df.drop(columns=cols, inplace=True)
        
        df[f'{feature}_std_ratio'] = df[f'r_{feature}_std'] / df[f'd_{feature}_std']
        df[f'{feature}_mean_ratio'] = df[f'r_{feature}_mean'] / df[f'd_{feature}_mean']

        df.drop(columns=[
            f'r_{feature}_std', f'd_{feature}_std',
            f'r_{feature}_mean', f'd_{feature}_mean'
        ], inplace=True)

    return df

In [8]:
def tri_state_hero(df):
    hero_ids = df['r1_hero_id'].unique()
    tmp = pd.DataFrame(
        columns=[f'{i}_hero_id' for i in hero_ids],
        index=df.index
    ).fillna(0)

    for hero_id in hero_ids:
        for team in ('r', 'd'):
            tmp.loc[
                (df[f'{team}1_hero_id'] == hero_id) |
                (df[f'{team}2_hero_id'] == hero_id) |
                (df[f'{team}3_hero_id'] == hero_id) |
                (df[f'{team}4_hero_id'] == hero_id) |
                (df[f'{team}5_hero_id'] == hero_id),
                f'{hero_id}_hero_id'
            ] = 1 if team == 'r' else -1

    drop_cols = [f'{team}{i}_hero_id' for team in ('r', 'd') for i in range(1, 6)]
    df.drop(columns=drop_cols, inplace=True)

    return pd.concat([df, tmp], axis=1)

In [9]:
def make_coordinate_features(df):
    r_y_coord = [f'r{i}_y' for i in range(1,6)]
    r_x_coord = [f'r{i}_x' for i in range(1,6)]
    d_y_coord = [f'd{i}_y' for i in range(1,6)]
    d_x_coord = [f'd{i}_x' for i in range(1,6)]
    for agg in ('mean', 'std'):
        df[f'r_y_{agg}'] = df[r_y_coord].agg(agg, axis=1)
        df[f'r_x_{agg}'] = df[r_x_coord].agg(agg, axis=1)
        df[f'd_y_{agg}'] = df[d_y_coord].agg(agg, axis=1)
        df[f'd_x_{agg}'] = df[d_x_coord].agg(agg, axis=1)
    df.drop(columns=r_y_coord+r_x_coord+d_y_coord+d_x_coord, inplace=True)
    return df

In [10]:
MATCH_FEATURES = [
    (['game_time'], get_game_time),
    (['game_mode'], lambda m: [m['game_mode']]),
    (['lobby_type'], lambda m: [m['lobby_type']]),
    (['objectives_len'], lambda m: [len(m['objectives'])]),
    (['chat_len'], lambda m: [len(m['chat'])]),
    (['radiant_tower_kills', 'dire_tower_kills', 'diff_tower_kills'],
     count_ruined_towers),
]

PLAYER_FIELDS = [
    'hero_id',
    
    'kills',
    'deaths',
    'assists',
    'denies',
    
    'gold',
    'lh',
    'xp',
    'health',
    'max_health',
    'max_mana',
    'level',

    'x',
    'y',

    'stuns',
    'teamfight_participation',
    'rune_pickups',
    'obs_placed',
    'sen_placed',
    'creeps_stacked',
    'camps_stacked',
    'firstblood_claimed',
    'towers_killed',
    'roshans_killed',
]

def extract_features_csv(match):
    row = [('match_id_hash', match['match_id_hash'])]

    for fields, func in MATCH_FEATURES:
        row.extend(zip(fields, func(match)))
        
    for slot, player in enumerate(match['players']):
        player_name = f'r{slot + 1}' if slot < 5 else f'd{slot - 4}'
        for field in PLAYER_FIELDS:
            column_name = f'{player_name}_{field}'
            row.append((column_name, player[field]))

        row.append((f'{player_name}_ability_level', len(player['ability_upgrades'])))
        row.append((f'{player_name}_max_hero_hit', player['max_hero_hit']['value']))
        row.append((f'{player_name}_purchase_count', len(player['purchase_log'])))
        row.append((f'{player_name}_count_ability_use', sum(player['ability_uses'].values())))
        row.append((f'{player_name}_damage_dealt', sum(player['damage'].values())))
        row.append((f'{player_name}_damage_received', sum(player['damage_taken'].values())))

    return OrderedDict(row)

In [11]:
def get_features_df(matches_file, size=SIZE):
    csv_rows = [
        extract_features_csv(match)
        for match in read_matches(matches_file, size=size)
    ]
    df = pd.DataFrame.from_records(csv_rows).set_index('match_id_hash')
    
    df = make_coordinate_features(df)
    
    df = tri_state_hero(df)

    numeric_features = [
        'kills', 'deaths', 'assists', 'denies', 'gold', 'xp', 'health',
        'max_health', 'max_mana', 'level', 'stuns', 'lh', 'rune_pickups',
        'teamfight_participation', 'obs_placed', 'sen_placed', 'ability_level', 
        'max_hero_hit', 'purchase_count', 'count_ability_use', 'damage_dealt', 'damage_received',
        'firstblood_claimed',  'roshans_killed', 'towers_killed', 'creeps_stacked', 'camps_stacked',
    ]
    df = replace_na(ratio_numeric_features(df, numeric_features))

    return df

In [12]:
df_train = pd.read_csv(
    os.path.join(PATH_TO_DATA, 'train_features.csv'), 
    index_col='match_id_hash'
)[:SIZE]
df_targets = pd.read_csv(
    os.path.join(PATH_TO_DATA, 'train_targets.csv'), 
    index_col='match_id_hash'
)[:SIZE]

y_duration = df_targets['duration']
X = get_features_df('train_matches.jsonl')

In [14]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y_duration, test_size=0.3, random_state=SEED)

cat_features = [
    'objectives_len', 'game_mode',
    'radiant_tower_kills', 'dire_tower_kills', 'diff_tower_kills',
]
cat_features_idx = [X.columns.get_loc(i) for i in cat_features]

In [15]:
X_train.head()

Unnamed: 0_level_0,game_time,game_mode,lobby_type,objectives_len,chat_len,radiant_tower_kills,dire_tower_kills,diff_tower_kills,11_hero_id,15_hero_id,...,firstblood_claimed_std_ratio,firstblood_claimed_mean_ratio,roshans_killed_std_ratio,roshans_killed_mean_ratio,towers_killed_std_ratio,towers_killed_mean_ratio,creeps_stacked_std_ratio,creeps_stacked_mean_ratio,camps_stacked_std_ratio,camps_stacked_mean_ratio
match_id_hash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
58e75cf312de926700e05781fed0213e,-1.038019,22,7,1,3,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
32a2f85286a8c08697d149f6f2cb70dc,0.039738,22,7,4,5,0,3,-3,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.5,1.0,1.0
828a929c6a4b1db14f57722ffa20e966,1.78034,22,0,22,75,6,7,-1,0,0,...,0.0,0.0,1.0,1.0,0.601417,0.666667,1.666667,1.666667,2.0,2.0
0d23ca5219fe12f4611edd46f5baf063,-0.439687,22,0,3,0,2,0,2,-1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7ea9773592013072ad989d7e0056f6a8,-1.150602,22,7,1,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
params = {
    'random_state': SEED
}
model = CatBoostRegressor(**params)
model.fit(
    X_train, y_train,
    eval_set=(X_valid, y_valid),
    cat_features=cat_features_idx,
    plot=True,
    verbose=False
);

425728.6702575572

In [17]:
print(model.best_score_)

NameError: name 'model' is not defined

In [None]:
feature_importance = pd.DataFrame({
    'feature': X_train.columns,
    'importance': model.get_feature_importance()
}).sort_values('importance', ascending=False)[:100]

plt.figure(figsize=(12,28))
sns.barplot(x=feature_importance.importance, y=feature_importance.feature);