In [1]:
import numpy as np
import pandas as pd

In [2]:
import json
from tqdm import tqdm_notebook

In [3]:
import os

PATH_TO_DATA = '../input/mlcourse-dota2-win-prediction/'

In [4]:
def read_matches(matches_file):
    
    MATCHES_COUNT = {
        'test_matches.jsonl': 10000,
        'train_matches.jsonl': 39675,
    }
    _, filename = os.path.split(matches_file)
    total_matches = MATCHES_COUNT.get(filename)
    
    with open(matches_file) as fin:
        for line in tqdm_notebook(fin, total=total_matches):
            yield json.loads(line)

In [5]:
import collections

MATCH_FEATURES = [
    ('game_time', lambda m: m['game_time']),
    ('game_mode', lambda m: m['game_mode']),
    ('lobby_type', lambda m: m['lobby_type']),
    ('objectives_len', lambda m: len(m['objectives'])),
    ('chat_len', lambda m: len(m['chat'])),
]

PLAYER_FIELDS = [
    'hero_id',
    
    'kills',
    'deaths',
    'assists',
    'denies',
    
    'gold',
    'lh',
    'xp',
    'health',
    'max_health',
    'max_mana',
    'level',

    'x',
    'y',
    
    'stuns',
    'creeps_stacked',
    'camps_stacked',
    'rune_pickups',
    'firstblood_claimed',
    'teamfight_participation',
    'towers_killed',
    'roshans_killed',
    'obs_placed',
    'sen_placed',
]

In [6]:
def extract_targets_csv(match, targets):
    return collections.OrderedDict([('match_id_hash', match['match_id_hash'])] + [
        (field, targets[field])
        for field in ['game_time', 'radiant_win', 'duration', 'time_remaining', 'next_roshan_team']
    ])

In [7]:
PLAYER_FIELDS_NEW  = [
    'observers_placed',
    'nearby_creep_death_count',
]

In [8]:
PLAYER_FIELDS_T  = [
    'gold_t',
    'lh_t',
    'dn_t',
    'xp_t',
]

In [9]:
PLAYER_FIELDS_N  = [
    'ability_upgrades',
]

In [10]:
PLAYER_FIELDS_SUM  = [
	'purchase',
	'killed',
	'item_uses',
	'ability_uses',
	'hero_hits',
	'damage',
	'damage_taken',
	'damage_inflictor',
	'killed_by',
	'multi_kills',
	'healing',
	'damage_inflictor_received',
]

In [11]:
def extract_features_csv(match):
    row = [
        ('match_id_hash', match['match_id_hash']),
    ]
    
    for field, f in MATCH_FEATURES:
        row.append((field, f(match)))
        
    for slot, player in enumerate(match['players']):
        if slot < 5:
            player_name = 'r%d' % (slot + 1)
        else:
            player_name = 'd%d' % (slot - 4)

        for field in PLAYER_FIELDS:
            column_name = '%s_%s' % (player_name, field)
            row.append((column_name, player[field]))
        
        for field in PLAYER_FIELDS_NEW:
            column_name = '%s_%s' % (player_name, field)
            row.append((column_name, player[field]))
            
        for field in PLAYER_FIELDS_T:
            column_name = '%s_%s' % (player_name, field)
            if (len(player['times']) > 1) and (player['times'][len(player['times']) - 1] > 0):
                ans = float((player[field][len(player[field]) - 1])/player['times'][len(player['times']) - 1])
            else:
                ans = 0
            row.append((column_name, ans))
        for field in PLAYER_FIELDS_N:
            column_name = '%s_%s' % (player_name, field)
            row.append((column_name, len(player[field])))
        for field in PLAYER_FIELDS_SUM:
            column_name = '%s_%s' % (player_name, field)
            ans = []
            for key in player[field].keys():
                ans.append(player[field][key])
            row.append((column_name, sum(ans)))
            
    return collections.OrderedDict(row)

In [12]:
def extractor(path):
    df_features = []
    df_targets  = []

    for match in read_matches(path):
        match_id_hash = match['match_id_hash']
        features = extract_features_csv(match)
        targets = extract_targets_csv(match, match['targets'])

        df_features.append(features)
        df_targets.append(targets)
    return df_features, df_targets

In [13]:
df_train_features, df_train_targets = extractor(PATH_TO_DATA + 'train_matches.jsonl')

HBox(children=(IntProgress(value=0, max=39675), HTML(value='')))




In [14]:
df_test_features = []
for match in read_matches(PATH_TO_DATA + 'test_matches.jsonl'):
    match_id_hash = match['match_id_hash']
    features = extract_features_csv(match)

    df_test_features.append(features)

HBox(children=(IntProgress(value=0, max=10000), HTML(value='')))




In [15]:
df_train_features = pd.DataFrame.from_records(df_train_features).set_index('match_id_hash')
df_train_targets = pd.DataFrame.from_records(df_train_targets).set_index('match_id_hash')
df_test_features = pd.DataFrame.from_records(df_test_features).set_index('match_id_hash')

In [16]:
def add_new_features(df_features, matches_file):
    
    # Process raw data and add new features
    for match in read_matches(matches_file):
        match_id_hash = match['match_id_hash']

        # Counting ruined towers for both teams
        radiant_tower_kills = 0
        dire_tower_kills = 0
        for objective in match['objectives']:
            if objective['type'] == 'CHAT_MESSAGE_TOWER_KILL':
                if objective['team'] == 2:
                    radiant_tower_kills += 1
                if objective['team'] == 3:
                    dire_tower_kills += 1

        # Write new features
        df_features.loc[match_id_hash, 'radiant_tower_kills'] = radiant_tower_kills
        df_features.loc[match_id_hash, 'dire_tower_kills'] = dire_tower_kills
        df_features.loc[match_id_hash, 'diff_tower_kills'] = radiant_tower_kills - dire_tower_kills

In [17]:
# copy the dataframe with features
df_train_features_extended = df_train_features.copy()
df_test_features_extended = df_test_features.copy()

# add new features
add_new_features(df_train_features_extended, PATH_TO_DATA + 'train_matches.jsonl')
add_new_features(df_test_features_extended,  PATH_TO_DATA + 'test_matches.jsonl')

#fill NaN
df_train_features_extended.fillna(0)
df_test_features_extended.fillna(0)

HBox(children=(IntProgress(value=0, max=39675), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10000), HTML(value='')))




Unnamed: 0_level_0,game_time,game_mode,lobby_type,objectives_len,chat_len,r1_hero_id,r1_kills,r1_deaths,r1_assists,r1_denies,...,d5_damage,d5_damage_taken,d5_damage_inflictor,d5_killed_by,d5_multi_kills,d5_healing,d5_damage_inflictor_received,radiant_tower_kills,dire_tower_kills,diff_tower_kills
match_id_hash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
30cc2d778dca82f2edb568ce9b585caa,23,4,0,0,0,79,0,0,0,0,...,185,187,185,0,0,0,187,0.0,0.0,0.0
70e5ba30f367cea48793b9003fab9d38,1044,22,7,12,6,23,3,5,7,1,...,12491,7863,4785,5,0,280,6343,7.0,2.0,5.0
4d9ef74d3a2025d79e9423105fd73d41,1091,22,7,6,1,64,3,1,7,1,...,30441,10133,7722,5,0,1407,7928,4.0,1.0,3.0
2bb79e0c1eaac1608e5a09c8e0c6a555,623,22,7,1,0,41,0,0,1,2,...,16860,9002,4209,4,0,85,3791,0.0,0.0,0.0
bec17f099b01d67edc82dfb5ce735a43,1538,22,7,7,11,40,2,7,12,0,...,77162,23133,9706,2,2,7586,10324,2.0,4.0,-2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9376a283b50779433de829c79529fe2c,1695,22,7,18,1,93,2,6,6,5,...,70957,13068,9707,4,2,470,9109,0.0,8.0,-8.0
bce2bace8b61980d282c9f6a9c69ef9c,2191,22,7,12,9,40,3,5,8,4,...,153285,28212,14360,7,1,5220,20318,6.0,3.0,3.0
dc00c2964363b0344a4891bdde235a44,391,22,7,1,0,21,0,1,0,8,...,4887,2302,910,1,0,400,1874,0.0,0.0,0.0
d75db83f7857720f851a302b00ee6149,1254,22,7,7,3,90,0,4,6,0,...,68463,8263,7300,3,0,4793,5300,4.0,0.0,4.0


In [18]:
col = ['purchase',
 'killed',
 'item_uses',
 'ability_uses',
 'hero_hits',
 'damage',
 'damage_taken',
 'damage_inflictor',
 'killed_by',
 'multi_kills',
 'healing',
 'damage_inflictor_received',
 'ability_upgrades',
 'gold_t',
 'lh_t',
 'dn_t',
 'xp_t',
 'observers_placed',
 'nearby_creep_death_count',
 'kills',
 'deaths',
 'assists',
 'denies',
 'gold',
 'lh',
 'xp',
 'health',
 'max_health',
 'max_mana',
 'level',
 'x',
 'y',
 'stuns',
 'creeps_stacked',
 'camps_stacked',
 'rune_pickups',
 'firstblood_claimed',
 'teamfight_participation',
 'towers_killed',
 'roshans_killed',
 'obs_placed',
 'sen_placed']

In [19]:
def add_calc_features(df, col):
    metrics = [
                    'total_',
                    'std_',
                    'mean_',
                    'min_',
                    'max_'
              ]
    
    for c in col:
         
        r_columns = [f'r{i}_{c}' for i in range(1, 6)]
        d_columns = [f'd{i}_{c}' for i in range(1, 6)]
        
        for m in metrics:
            df['r_' + m + c] = df[r_columns].sum(1)
            df['d_' + m + c] = df[d_columns].sum(1)
            
            ratio_col = m + c + '_ratio'
            df[ratio_col] = 0
            ind = df[df['d_' + m + c] >  0].index
            df.loc[ind, ratio_col] = df.loc[ind, 'r_' + m + c]/df.loc[ind, 'd_' + m + c]

In [20]:
# copy the dataframe with features
df_train_features_extended2 = df_train_features_extended.copy()
df_test_features_extended2  = df_test_features_extended.copy()

# add calc features
add_calc_features(df_train_features_extended2, col)
add_calc_features(df_test_features_extended2,  col)

In [21]:
df_train_features_extended2.to_csv('df_train_features_ext.csv')
df_test_features_extended2.to_csv('df_test_features_ext.csv')
df_train_targets.to_csv('df_train_targets.csv')

In [22]:
df_train_features_extended2['max_kills_ratio']

match_id_hash
a400b8f29dece5f4d266f49f1ae2e98a    0.000000
b9c57c450ce74a2af79c9ce96fac144d    5.333333
6db558535151ea18ca70a6892197db41    0.000000
46a0ddce8f7ed2a8d9bd5edcbb925682    6.000000
b1b35ff97723d9b7ade1c9c3cf48f770    0.500000
                                      ...   
defd0caeed6ea83d7d5fbdec013fe7d1    0.593750
bc7a87ed5f9c2bca55f9f7a93da0b0c5    1.000000
e2ca68ac1a6847f4a37f6c9c8ee8695b    0.222222
47ad6454ede66c1c78fdaa9391dfc556    1.076923
9928dfde50efcbdb2055da23dcdbc101    1.500000
Name: max_kills_ratio, Length: 39675, dtype: float64