In [1]:
!pip install orjson tqdm

Collecting orjson
[?25l  Downloading https://files.pythonhosted.org/packages/63/23/fbe5af5ba2334953b2c0bf6873bc5593da7d3585f36692fe0e2953b6b92e/orjson-2.1.3-cp36-cp36m-manylinux1_x86_64.whl (173kB)
[K     |████████████████████████████████| 174kB 9.5MB/s 
Installing collected packages: orjson
Successfully installed orjson-2.1.3


In [2]:
import numpy as np
import pandas as pd

In [3]:
import orjson as json
import tqdm
import multiprocessing as mp
import gc
import re

In [4]:
from catboost import Pool, CatBoostClassifier

In [5]:
game_mode_map = {
    0: "Unknown",
    1: "All Pick",
    2: "Captains Mode",
    3: "Random Draft",
    4: "Single Draft",
    5: "All Random",
    6: "Intro",
    7: "Diretide",
    8: "Reverse Captains Mode",
    9: "Greeviling",
    10: "Tutorial Mode",
    11: "Mid only",
    12: "Least Played",
    13: "Limited Heroes",
    14: "Compendium Matchmaking",
    15: "Custom Mode",
    16: "Captains Draft",
    17: "Balanced Draft",
    18: "Ability Draft",
    19: "Event",
    20: "Random Death Match",
    21: "1v1 Mid",
    22: "All Draft",
    23: "Turbo Mode",
    24: "Mutation Mode"
}

lobby_type_map = {
-1: "Invalid",
0: "Public matchmaking",
1:"Practice",
2: "Tournament",
3: "Tutorial",
4: "Co-op with bots",
5: "Team match",
6: "Solo Queue",
7: "Ranked",
8: "1v1 Mid",
9: "Battle Cup"
}

slot_radiant_map = {i: f'r{i+1}' for i in range(5)}
slot_dire_map = {i:f'd{j+1}' for j, i in enumerate(range(128, 133))}
# Map slot to hero identifier
slot_hero = {**slot_radiant_map, **slot_dire_map}

team_identifier = {2: 'r', 3: 'd'}

barracks_map = {
    "1": "Dire Bot Melee",
    "2": "Dire Bot Ranged",
    "4": "Dire Mid Melee",
    "8": "Dire Mid Ranged",
    "16": "Dire Top Melee",
    "32": "Dire Top Ranged",
    "64": "Radiant Bot Melee",
    "128": "Radiant Bot Ranged",
    "256": "Radiant Mid Melee",
    "512": "Radiant Mid Ranged",
    "1024": "Radiant Top Melee",
    "2048": "Radiant Top Ranged"
}

In [6]:
def chat_features(game):
    """
    game -> JSON object
    
    returns: dict of chat features:
    1) Total amount of messages
    2) # of messages per team
    3) # of messages per player
    """
    chat = game['chat']
    
    suffix = "_chat_msg"
    
    # Initialize with zeros

    if 'init_dict' not in dir(chat_features):
        team_features = {'r' + suffix: 0, 'd' + suffix: 0}
        hero_features = {(f'r{i+1}' if i < 5 else f'd{i-4}')+suffix:0 for i in range(10)}

        chat_features.init_dict = {'chat_len': 0, **team_features, **hero_features}

    features = chat_features.init_dict.copy()
    # Use slot_to_hero mapping to update coutns
    for msg in chat:
        player_slot = msg['player_slot']
        if player_slot is None:
            continue
        hero_id = slot_hero[msg['player_slot']]
        if hero_id.startswith('r'):
            features['r' + suffix] += 1
        else:
            features['d' + suffix] += 1

        features[hero_id + suffix] += 1
        features['chat_len'] += 1
    
    return features

def objectives_features(game):
    if 'init_features' not in dir(objectives_features):
        objectives_features.init_features = {'r_tower_killed': 0, 'd_tower_killed': 0,
                'r_barracks_killed': 0, 'd_barracks_killed': 0,
                'r_roshan_killed': 0, 'd_roshan_killed': 0,
                'r_first_blood': False, 'd_first_blood': False,
                'r_had_aegis': 0, 'd_had_aegis': 0,
                'r_tower_denied': 0, 'd_tower_denied': 0,
                'obj_len': 0, 'r_obj': 0, 'd_obj': 0}
    c = objectives_features.init_features.copy()
    for obj in game['objectives']:
        event = obj['type']
        if event == 'CHAT_MESSAGE_TOWER_KILL':
            team = team_identifier[obj['team']]
            key =  team + '_tower_killed'
            c[key] += 1
            c[team + '_obj'] += 1
        elif event == "CHAT_MESSAGE_TOWER_DENY":
            team = slot_hero[obj['player_slot']][0]
            key = team + '_tower_denied'
            c[key] += 1
            c[team + '_obj'] += 1
        elif event == "CHAT_MESSAGE_ROSHAN_KILL":
            team = team_identifier[obj['team']]
            key = team + '_roshan_killed'
            c[key] += 1
            c[team + '_obj'] += 1
        elif event == "CHAT_MESSAGE_BARRACKS_KILL":
            barrack = barracks_map[obj['key']]
            team = ('d' if barrack.startswith("Dire") else 'r')
            c[team + '_barracks_killed'] += 1
            c[team + '_obj'] += 1
        elif event == "CHAT_MESSAGE_FIRSTBLOOD":
            team = slot_hero[obj['player_slot']][0]
            c[team + '_obj'] += 1
            c[team + '_first_blood'] = True
        elif event == "CHAT_MESSAGE_AEGIS":
            team = slot_hero[obj['player_slot']][0]
            key = team + '_had_aegis'
            c[key] += 1
            c[team + '_obj'] += 1
        else:  # ignore other events, like aegis steal and deny
            continue
        c['obj_len'] += 1
    return c

def general_features(game):
    if 'init_dict' not in dir(general_features):
        general_features.init_dict = {'game_time': 0, 'game_mode': None,
                                      'lobby_type': None, 'game_phase': None}
    c = general_features.init_dict.copy()
    c['game_time'] = game['game_time']
    c['lobby_type'] = lobby_type_map[game['lobby_type']]
    gm = game_mode_map[game['game_mode']]
    if gm == "Captains Draft":
        gm = "Captains Mode"
    elif gm == "Least Played":
        gm = "Random Draft"
    elif gm == 'All Random':
        gm = "Random Draft"
    c['game_mode'] = gm
    
    game_mins = game['game_time'] / 60.0
    game_phase = 'early_game'
    if game_mins >= 26:
        game_phase = 'late_game'
    elif game_mins >= 13:
        game_phase = 'mid_game'
    c['game_phase'] = game_phase
    
    return c

def teamfights_features(game):
    if 'init_dict' not in dir(teamfights_features):
        teamfights_features.init_dict = {'n_fights': 0, 'mean_time': 0, 'mean_deaths': 0}
    c = teamfights_features.init_dict.copy()
    c['n_fights'] = len(game['teamfights'])
    for fight in game['teamfights']:
        c['mean_time'] += fight['end'] - fight['start']
        c['mean_deaths'] += fight['deaths']
    if c['n_fights']:
        c['mean_time'] /= c['n_fights']
        c['mean_deaths'] /= c['n_fights']
    return c

def log_features(player):
    if 'init_dict' not in dir(log_features):
        log_features.init_dict = {'len_obs_left': 0, 'len_sen_left': 0, 'len_purchase': 0,
                                  'n_buybacks': 0}
    c = log_features.init_dict.copy()
    c['len_obs_left'] = len(player['obs_left_log'])
    c['len_sen_left'] =  len(player['sen_left_log'])
    c['len_purchase'] = len(player['purchase_log'])
    c['n_buybacks'] = len(player['buyback_log'])
    return c

def dt_features(game, player, beta=.6, corrected=True):
    if 'init_dict' not in dir(dt_features):
        dt_features.init_dict = {'xp_sec': 0, 'gold_sec': 0}
    c = dt_features.init_dict.copy()
    
    
    times = player['times'].copy()
    gold_t = player['gold_t'].copy()
    xp_t = player['xp_t'].copy()
    
    if times and game['game_time'] > times[-1]:
        times.append(game['game_time'])
        gold_t.append(player['gold'])
        xp_t.append(player['xp'])
    n = len(times) - 1
    dt = [times[i + 1] - times[i] for i in range(n)]
    dgold = [gold_t[i + 1] - gold_t[i] for i in range(n)]
    dxp = [xp_t[i + 1] - xp_t[i] for i in range(n)]

    
    gold_sec = [gold / time for gold,time in zip(dgold, dt)]
    avg_gold = 0
    for gs in gold_sec:
        avg_gold = beta * avg_gold + (1 - beta) * gs
    
    xp_sec = [xp / time for xp, time in zip(dxp, dt)]
    avg_xp = 0
    for xs in xp_sec:
        avg_xp = beta * avg_xp + (1 - beta) * xs
    
    if corrected and n > 0:
        avg_gold = avg_gold / (1 - beta ** (n + 1))
        avg_xp = avg_xp / (1 - beta ** (n + 1))
    
    c['xp_sec'] = avg_xp
    c['gold_sec'] = avg_gold
    return c

def normalize_coords(x, y, xmin=66.0, xmax=200.0, ymin=66.0, ymax=200.0):
    return (x - xmin) / (xmax - xmin) , (y - ymin) / (ymax - ymin)

def pos_features(player):
    if 'init' not in dir(pos_features):
        pos_features.init = {'x': None, 'y': None, 'lane': None,
                             'my_base_d': 0, 'enemy_base_d': 0}
        
        pos_features.radiant_base = (0.1, 0.1)
        pos_features.dire_base = (.9, .9)
        pos_features.dist = lambda x, y, b: ((x - b[0]) ** 2 + (y - b[1]) ** 2) ** .5
        
        pos_features.is_top_lane = lambda x, y: x < .175 or y < .175
        pos_features.is_mid_lane = lambda x, y: abs(x - y) < .15
        pos_features.is_bot_lane = lambda x, y: x > .85 or y > .85
        
    c = pos_features.init.copy()
    x, y = player['x'], player['y']
    x, y = normalize_coords(x, y)
    if pos_features.is_top_lane(x, y):
        c['lane'] = 'top'
    elif pos_features.is_mid_lane(x, y):
        c['lane'] = 'mid'
    elif pos_features.is_bot_lane(x, y):
        c['lane'] = 'bot'
    else:
        c['lane'] = 'jungle'
    
    hero_identifier = slot_hero[player['player_slot']]
    team = hero_identifier[0]
    if team == 'r':
        c['my_base_d'] = pos_features.dist(x, y, pos_features.radiant_base)
        c['enemy_base_d'] = pos_features.dist(x, y, pos_features.dire_base)
    else:
        c['my_base_d'] = pos_features.dist(x, y, pos_features.dire_base)
        c['enemy_base_d'] = pos_features.dist(x, y, pos_features.radiant_base)
    
    c['x'] = x
    c['y'] = y
    return c

def action_features(player):
    if 'init' not in dir(action_features):
        action_features.actions = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
                                   17, 18, 19, 20, 21, 23, 24, 25, 26, 27, 28, 29, 30, 31,
                                   32, 33, 36]
        action_features.init = {f'action_{a}': 0 for a in action_features.actions}
    c = action_features.init.copy()
    for a, v in player['actions'].items():
        c[f'action_{a}'] = v
    return c

def life_state_features(game, player):
    if 'init' not in dir(life_state_features):
        life_state_features.init = {'sec_dead': 0, 'sec_magic': 0,
                                    'frac_dead': 0, 'frac_magic': 0}
    c = life_state_features.init.copy()
    
    time = game['game_time']
    
    keys = player['life_state'].keys()
    if '1' in keys:
        c['sec_dead'] = player['life_state']['1'] / 2.0
        if time:
            c['frac_dead'] = c['sec_dead'] / time
    if '2' in keys:
        c['sec_magic'] = player['life_state']['2'] / 2.0
        if time:
            c['frac_magic'] = c['sec_magic'] / time
    return c

def gold_features(player):
    if 'init' not in dir(gold_features):
        gold_features.gold_reasons = [0, 1, 2, 5, 6, 11, 12, 13, 14, 15]
        gold_features.init = {'net_gold': 0}
        gold_features.init.update({f'gold_reason_{i}': 0 for i in gold_features.gold_reasons})
        gold_features.init.update({f'gold_reason_{i}_frac': 0 for i in gold_features.gold_reasons})

    c = gold_features.init.copy()
    for reason, value in player['gold_reasons'].items():
        if reason == '0':
            value -= 600  # base value
        c['net_gold'] += value
        c['gold_reason_' + reason] = value
    for reason, value in player['gold_reasons'].items():
        if c['net_gold']:
            c['gold_reason_' + reason + '_frac'] = c['gold_reason_' + reason] / c['net_gold']
    return c

def xp_features(player):
    if 'init' not in dir(xp_features):
        xp_features.xp_reasons = [0, 1, 2, 3]
        xp_features.init = {'net_xp': 0}
        xp_features.init.update({f'xp_reason_{i}': 0 for i in xp_features.xp_reasons})
        xp_features.init.update({f'xp_reason_{i}_frac': 0 for i in xp_features.xp_reasons})

    c = xp_features.init.copy()
    c['net_xp'] = player['xp']
    for reason, value in player['xp_reasons'].items():
        c['xp_reason_' + reason] = value
        if player['xp']:
            c['xp_reason_' + reason + '_frac'] = value / player['xp']
    return c

def purchase_features(player):
    if 'init' not in dir(purchase_features):
        purchase_features.categories = ['Consumables', 'Attributes', 'Armaments', 'Arcane', 'Common', 'Support',
                      'Caster', 'Weapons', 'Armor', 'Artifacts', 'Secret shop']
        purchase_features.init = {'pch_total_cost': 0}
        purchase_features.init.update({f'pch_{cat}': 0 for cat in purchase_features.categories})

    c = purchase_features.init.copy()
    for p, amount in player['purchase'].items():
        if p not in items_df.index:
            continue
        item = items_df.loc[p]
        c['pch_total_cost'] += item.cost
        c[f'pch_{item.category}'] += 1
    return c

def hero_features(player):
    if 'init' not in dir(hero_features):
        hero_features.roles = ['Initiator', 'Disabler', 'Durable', 'Nuker',
                               'Carry', 'Pusher', 'Escape', 'Jungler', 'Support']
        hero_features.rolelevels = [1, 2, 3]
        hero_features.init = {f'role_{a}_lvl': 0 for a in hero_features.roles}
        hero_features.init.update({'complexity': 0, 'attack_type': None, 'attribute': None,
                                 'hero_tag': 0})
        
    c = hero_features.init.copy()
    
    hero = heroes_df.loc[player['hero_id']]
    c['hero_tag'] = hero.tag
    c['attribute'] = hero.AttributePrimary
    c['attack_type'] = hero.AttackCapabilities
    c['complexity'] = hero.Complexity
    for role, level in zip(hero.Role, hero.Rolelevels):
        c[f'role_{role}_lvl'] = level
    return c

item_index = ['tango', 'wraith_band', 'enchanted_mango', 'clarity', 'tpscroll', 'magic_stick', 'wind_lace', 'magic_wand', 'stout_shield', 'quelling_blade', 'branches', 'faerie_fire', 'flask', 'orb_of_venom', 'boots', 'null_talisman', 'phase_boots', 'blade_of_alacrity', 'ancient_janggo', 'dust', 'soul_ring', 'tranquil_boots', 'chainmail', 'pers', 'blight_stone', 'ring_of_aquila', 'power_treads', 'smoke_of_deceit', 'ward_sentry', 'ring_of_health', 'ring_of_basilius', 'ring_of_regen', 'gauntlets', 'circlet', 'ward_observer', 'bottle', 'helm_of_iron_will', 'void_stone', 'bracer', 'ring_of_protection', 'gloves', 'arcane_boots', 'blades_of_attack', 'lifesteal', 'slippers', 'robe', 'urn_of_shadows', 'bfury', 'mekansm', 'broadsword', 'ogre_axe', 'blink', 'mithril_hammer', 'infused_raindrop', 'belt_of_strength', 'tome_of_knowledge', 'vanguard', 'boots_of_elves', 'staff_of_wizardry', 'invis_sword', 'energy_booster', 'hand_of_midas', 'shadow_amulet', 'blade_mail', 'medallion_of_courage', 'cloak', 'helm_of_the_dominator', 'kaya', 'dragon_lance', 'black_king_bar', 'desolator', 'heavens_halberd', 'armlet', 'travel_boots', 'ultimate_scepter', 'sange_and_yasha', 'radiance', 'reaver', 'lesser_crit', 'monkey_king_bar', 'quarterstaff', 'maelstrom', 'mask_of_madness', 'echo_sabre', 'javelin', 'glimmer_cape', 'solar_crest', 'vladmir', 'diffusal_blade', 'cyclone', 'mantle', 'sphere', 'oblivion_staff', 'ultimate_orb', 'shivas_guard', 'spirit_vessel', 'silver_edge', 'butterfly', 'yasha', 'abyssal_blade', 'platemail', 'manta', 'greater_crit', 'force_staff', 'aether_lens', 'point_booster', 'buckler', 'headdress', 'ghost', 'nullifier', 'meteor_hammer', 'veil_of_discord', 'hurricane_pike', 'crimson_guard', 'heart', 'basher', 'gem', 'mystic_staff', 'hyperstone', 'ethereal_blade', 'claymore', 'sobi_mask', 'guardian_greaves', 'sheepstick', 'orchid', 'aeon_disk', 'bloodstone', 'lotus_orb', 'hood_of_defiance', 'vitality_booster', 'demon_edge', 'pipe', 'refresher', 'skadi', 'mjollnir', 'rod_of_atos', 'aegis', 'relic', 'assault', 'talisman_of_evasion', 'soul_booster', 'eagle', 'sange', 'satanic', 'bloodthorn', 'necronomicon', 'octarine_core', 'rapier', 'courier', 'moon_shard', 'dagon']

def strange_items_mapper(item):
    if item == 'ward_dispenser':
        return 'ward_observer'
    elif 'tango' in item:
        return 'tango'
    elif 'dagon' in item:
        return 'dagon'
    elif 'necronomicon' in item:
        return 'necronomicon'
    elif 'travel_boots' in item:
        return 'travel_boots'
    else:
        return item

def team_items_features(game):
    if 'init' not in dir(team_items_features):
        radiant = {f'r_{item}': 0 for item in item_index}
        dire = {f'd_{item}': 0 for item in item_index}
        team_items_features.init = {**radiant, **dire}
    
    c = team_items_features.init.copy()
    for i, p in enumerate(game['players']):
        team = 'r' if i < 5 else 'd'
        for source in ['hero_inventory', 'hero_stash']:
            for thing in p[source]:
                name = thing['id'][5:]
                if name in item_index:
                    c[f'{team}_{name}'] += 1
                else:
                    name2 = strange_items_mapper(name)
                    if name2 != name:
                        c[f'{team}_{name2}'] += 1
    return c

def player_kill_streaks(player):
    
    ks_cum = {'3-4': 0, '5-6': 0, '>6': 0}

    ks = {int(k):v for k, v in player['kill_streaks'].items()}
    sorted_ks = sorted(ks.keys(), reverse=True)

    for i, k in enumerate(sorted_ks):
        if ks[k] == 0:
            continue
        if k < 5:
            ks_cum['3-4'] += ks[k]
        elif k < 7:
            ks_cum['5-6'] += ks[k]
        else:
            ks_cum['>6'] += ks[k]
        v = ks[k]
        for ki in sorted_ks[i:]:
            ks[ki] -= v
    return ks_cum

def player_multi_kills(player):
    
    mk_cum = {'2': 0, '>2': 0}

    mk = {int(k):v for k, v in player['multi_kills'].items()}
    sorted_mk = sorted(mk.keys(), reverse=True)

    for i, k in enumerate(sorted_mk):
        if mk[k] == 0:
            continue
        if k == 2:
            mk_cum['2'] += mk[k]
        else:
            mk_cum['>2'] += mk[k]
        v = mk[k]
        for ki in sorted_mk[i:]:
            mk[ki] -= v
    return mk_cum

def team_kill_streaks_features(game):
    if 'init' not in dir(team_kill_streaks_features):
        multi_kills = ['2', '>2']
        kill_streaks = ['3-4', '5-6', '>6']
        team_kill_streaks_features.init = {'r_kill_streaks': 0, 'r_multi_kills': 0,
                                           'd_kill_streaks': 0, 'd_multi_kills': 0}
        team_kill_streaks_features.init.update({f'r{i}_multi_kills_{mk}': 0 for mk in multi_kills for i in range(1, 6)})
        team_kill_streaks_features.init.update({f'd{i}_multi_kills_{mk}': 0 for mk in multi_kills for i in range(1, 6)})
    
        team_kill_streaks_features.init.update({f'r{i}_kill_streaks_{mk}': 0 for mk in kill_streaks for i in range(1, 6)})
        team_kill_streaks_features.init.update({f'd{i}_kill_streaks_{mk}': 0 for mk in kill_streaks for i in range(1, 6)})

    
    c = team_kill_streaks_features.init.copy()
    
    for i, player in enumerate(game['players']):
        team = 'r' if i < 5 else 'd'
        idx = i % 5 + 1
        
        mk_cum = player_multi_kills(player)
        for k, v in mk_cum.items():
            c[f'{team}{idx}_multi_kills_{k}'] = v
        c[f'{team}_multi_kills'] += sum(mk_cum.values())

        ks_cum = player_kill_streaks(player)
        for k, v in ks_cum.items():
            c[f'{team}{idx}_kill_streaks_{k}'] = v
        c[f'{team}_kill_streaks'] += sum(ks_cum.values())
    return c

def main_player_features(player):
    if 'init' not in dir(main_player_features):
        main_player_features.init = {'obs_placed': 0, 'sen_placed': 0, 'creeps_stacked': 0,
                                    'camps_stacked': 0, 'rune_pickups': 0,
                                     'teamfight_participation': 0, 'towers_killed': 0,
                                     'pings': 0, 'gold': 0, 'lh': 0, 'health': 0,
                                    'max_health': 0, 'level': 0, 'kills': 0, 'deaths': 0,
                                    'assists': 0, 'denies': 0, 'nearby_creep_death_count': 0}
    
    c = main_player_features.init.copy()
    for k in c.keys():
        if k == 'pings':
            if player[k]:
                c[k] = player[k]['0']
        else:
            c[k] = player[k]
    c['health_frac'] = c['health'] / c['max_health']
    return c


heroes_tags = ['nevermore', 'brewmaster', 'pudge', 'huskar', 'lycan', 'phantom_lancer', 'windrunner', 'night_stalker', 'ogre_magi',
          'tinker', 'razor', 'centaur', 'shadow_shaman', 'weaver', 'naga_siren', 'enchantress', 'antimage', 'clinkz', 'visage',
          'skywrath_mage', 'rattletrap', 'phantom_assassin', 'dragon_knight', 'furion', 'sven', 'spectre', 'viper', 'venomancer',
          'storm_spirit', 'bristleback', 'lion', 'faceless_void', 'shredder', 'juggernaut', 'doom_bringer', 'rubick', 'skeleton_king',
          'legion_commander', 'batrider', 'kunkka', 'zuus', 'sniper', 'gyrocopter', 'omniknight', 'morphling', 'chaos_knight', 'dark_willow',
          'luna', 'ancient_apparition', 'abaddon', 'spirit_breaker', 'abyssal_underlord', 'keeper_of_the_light', 'wisp', 'monkey_king', 'dazzle',
          'invoker', 'slark', 'silencer', 'obsidian_destroyer', 'bane', 'bounty_hunter', 'witch_doctor', 'lone_druid', 'bloodseeker', 'drow_ranger',
          'slardar', 'troll_warlord', 'broodmother', 'axe', 'elder_titan', 'medusa', 'disruptor', 'phoenix', 'tidehunter', 'lina', 'techies', 'mirana',
          'life_stealer', 'riki', 'oracle', 'pangolier', 'ursa', 'tiny', 'queenofpain', 'alchemist', 'winter_wyvern', 'treant', 'puck', 'shadow_demon',
          'lich', 'necrolyte', 'crystal_maiden', 'tusk', 'jakiro', 'ember_spirit', 'sand_king', 'nyx_assassin', 'terrorblade', 'earthshaker', 'vengefulspirit',
          'magnataur', 'warlock', 'pugna', 'earth_spirit', 'arc_warden', 'meepo', 'death_prophet', 'templar_assassin', 'enigma',
          'undying', 'leshrac', 'dark_seer', 'beastmaster', 'chen']
    
def team_lineups(game):
    if 'init' not in dir(team_lineups):
        dire = {f'r_{hero}': 0 for hero in heroes_tags}
        radiant = {f'd_{hero}': 0 for hero in heroes_tags}
        team_lineups.init = {**dire, **radiant}
    
    c = team_lineups.init.copy()
    
    for i, p in enumerate(game['players']):
        team = 'r' if i <  5 else 'd'
        name = '_'.join(p['hero_name'].split('_')[3:])
        c[f'{team}_{name}'] = 1
    return c


def player_features(game, player):
    prefix = slot_hero[player['player_slot']]
    features = {**log_features(player), **dt_features(game, player),
            **pos_features(player), **action_features(player),
            **life_state_features(game, player), **gold_features(player),
           **xp_features(player), **main_player_features(player)}
    
    return {f'{prefix}_{key}': value for key, value in features.items()}

def game_features(game):
    features = {**chat_features(game), **objectives_features(game),
                **general_features(game), **teamfights_features(game),
                **team_kill_streaks_features(game),
               **team_lineups(game)}
    for p in game['players']:
        features.update(player_features(game, p))
    return features


categorical_columns = ['r_first_blood', 'd_first_blood', 'game_mode', 'lobby_type', 'game_phase', 'r_nevermore', 'r_brewmaster',
                       'r_pudge', 'r_huskar', 'r_lycan', 'r_phantom_lancer', 'r_windrunner', 'r_night_stalker', 'r_ogre_magi',
                       'r_tinker', 'r_razor', 'r_centaur', 'r_shadow_shaman', 'r_weaver', 'r_naga_siren', 'r_enchantress',
                       'r_antimage', 'r_clinkz', 'r_visage', 'r_skywrath_mage', 'r_rattletrap', 'r_phantom_assassin',
                       'r_dragon_knight', 'r_furion', 'r_sven', 'r_spectre', 'r_viper', 'r_venomancer', 'r_storm_spirit',
                       'r_bristleback', 'r_lion', 'r_faceless_void', 'r_shredder', 'r_juggernaut', 'r_doom_bringer', 'r_rubick',
                       'r_skeleton_king', 'r_legion_commander', 'r_batrider', 'r_kunkka', 'r_zuus', 'r_sniper', 'r_gyrocopter',
                       'r_omniknight', 'r_morphling', 'r_chaos_knight', 'r_dark_willow', 'r_luna', 'r_ancient_apparition',
                       'r_abaddon', 'r_spirit_breaker', 'r_abyssal_underlord', 'r_keeper_of_the_light', 'r_wisp', 'r_monkey_king',
                       'r_dazzle', 'r_invoker', 'r_slark', 'r_silencer', 'r_obsidian_destroyer', 'r_bane', 'r_bounty_hunter',
                       'r_witch_doctor', 'r_lone_druid', 'r_bloodseeker', 'r_drow_ranger', 'r_slardar', 'r_troll_warlord',
                       'r_broodmother', 'r_axe', 'r_elder_titan', 'r_medusa', 'r_disruptor', 'r_phoenix', 'r_tidehunter',
                       'r_lina', 'r_techies', 'r_mirana', 'r_life_stealer', 'r_riki', 'r_oracle', 'r_pangolier', 'r_ursa',
                       'r_tiny', 'r_queenofpain', 'r_alchemist', 'r_winter_wyvern', 'r_treant', 'r_puck', 'r_shadow_demon',
                       'r_lich', 'r_necrolyte', 'r_crystal_maiden', 'r_tusk', 'r_jakiro', 'r_ember_spirit', 'r_sand_king',
                       'r_nyx_assassin', 'r_terrorblade', 'r_earthshaker', 'r_vengefulspirit', 'r_magnataur', 'r_warlock',
                       'r_pugna', 'r_earth_spirit', 'r_arc_warden', 'r_meepo', 'r_death_prophet', 'r_templar_assassin', 'r_enigma',
                       'r_undying', 'r_leshrac', 'r_dark_seer', 'r_beastmaster', 'r_chen', 'd_nevermore', 'd_brewmaster', 'd_pudge',
                       'd_huskar', 'd_lycan', 'd_phantom_lancer', 'd_windrunner', 'd_night_stalker', 'd_ogre_magi', 'd_tinker',
                       'd_razor', 'd_centaur', 'd_shadow_shaman', 'd_weaver', 'd_naga_siren', 'd_enchantress', 'd_antimage',
                       'd_clinkz', 'd_visage', 'd_skywrath_mage', 'd_rattletrap', 'd_phantom_assassin', 'd_dragon_knight',
                       'd_furion', 'd_sven', 'd_spectre', 'd_viper', 'd_venomancer', 'd_storm_spirit', 'd_bristleback', 'd_lion',
                       'd_faceless_void', 'd_shredder', 'd_juggernaut', 'd_doom_bringer', 'd_rubick', 'd_skeleton_king',
                       'd_legion_commander', 'd_batrider', 'd_kunkka', 'd_zuus', 'd_sniper', 'd_gyrocopter', 'd_omniknight',
                       'd_morphling', 'd_chaos_knight', 'd_dark_willow', 'd_luna', 'd_ancient_apparition', 'd_abaddon',
                       'd_spirit_breaker', 'd_abyssal_underlord', 'd_keeper_of_the_light', 'd_wisp', 'd_monkey_king', 'd_dazzle',
                       'd_invoker', 'd_slark', 'd_silencer', 'd_obsidian_destroyer', 'd_bane', 'd_bounty_hunter', 'd_witch_doctor',
                       'd_lone_druid', 'd_bloodseeker', 'd_drow_ranger', 'd_slardar', 'd_troll_warlord', 'd_broodmother', 'd_axe',
                       'd_elder_titan', 'd_medusa', 'd_disruptor', 'd_phoenix', 'd_tidehunter', 'd_lina', 'd_techies', 'd_mirana',
                       'd_life_stealer', 'd_riki', 'd_oracle', 'd_pangolier', 'd_ursa', 'd_tiny', 'd_queenofpain', 'd_alchemist',
                       'd_winter_wyvern', 'd_treant', 'd_puck', 'd_shadow_demon', 'd_lich', 'd_necrolyte', 'd_crystal_maiden',
                       'd_tusk', 'd_jakiro', 'd_ember_spirit', 'd_sand_king', 'd_nyx_assassin', 'd_terrorblade', 'd_earthshaker',
                       'd_vengefulspirit', 'd_magnataur', 'd_warlock', 'd_pugna', 'd_earth_spirit', 'd_arc_warden', 'd_meepo',
                       'd_death_prophet', 'd_templar_assassin', 'd_enigma', 'd_undying', 'd_leshrac', 'd_dark_seer', 'd_beastmaster',
                       'd_chen', 'r1_lane',  'r2_lane', 'r3_lane', 'r4_lane', 'r5_lane', 'd1_lane', 'd2_lane', 'd3_lane', 'd4_lane', 'd5_lane']

In [7]:
def team_features(df, agg=None, remove_hero_features=False,
                  calculate_diff=True,
                  calculate_ratios=True,
                  remove_team_features=False):
    hero_features = [c for c in df.columns if re.search(r'^[rd][0-9]_.*', c)]
    team_features = [c for c in df.columns if re.search(r'^[rd]_.*', c)]

    single_player_features = [c for c in train_df.columns.tolist() if c.startswith('r1')]
    
    df_cpy = df.copy()
    if agg is None:
        agg = [np.sum]
    else:
        agg = [np.sum] + agg
    for feature in tqdm.tqdm(single_player_features):
        if feature.endswith('chat_msg') or feature.endswith('hero_tag') or feature.endswith('lane') or \
            feature.endswith('attack_type') or feature.endswith('attribute'):
            continue
        name = feature[3:]
        subdf = df_cpy[[c for c in df_cpy.columns if c.endswith(name)]]
        rad = subdf[[c for c in subdf.columns if c.startswith('r')]]
        dire = subdf[[c for c in subdf.columns if c.startswith('d')]]
        for f in agg:
            fname = str(f).split(' ')[1]
            for team in ['r', 'd']:
                team_df = subdf[[c for c in subdf.columns if c.startswith(team)]]
                
                df_cpy[f'{team}_{name}_{fname}'] = f(team_df, axis=1)
    eps = 1e-4
    if remove_hero_features:
        df_cpy.drop(hero_features, axis=1, inplace=True)
        if calculate_diff:
            for feature in team_features:
                if feature.startswith('d'):
                    continue
                name = feature[2:]
                df_cpy[f'{name}_diff'] = df_cpy[feature].astype(np.float32) - df_cpy[f'd_{name}'].astype(np.float32)
                if calculate_ratios:
                    df_cpy[f'{name}_ratio'] = df_cpy[feature].astype(np.float32) / (df_cpy[f'd_{name}'].astype(np.float32) + eps)
            for feature in single_player_features:
                if feature.endswith('chat_msg') or feature.endswith('hero_tag') or feature.endswith('lane') or feature.endswith('attack_type') or feature.endswith('attribute'):
                    continue
                feature = feature[0] + feature[2:] + '_sum'
                name = feature[2:]
                df_cpy[f'{name}_diff'] = df_cpy[feature].astype(np.float32) - df_cpy[f'd_{name}'].astype(np.float32)
                if calculate_ratios:
                    df_cpy[f'{name}_ratio'] = df_cpy[feature].astype(np.float32) / (df_cpy[f'd_{name}'].astype(np.float32) + eps)
            if remove_team_features:
                df_cpy.drop(team_features, axis=1, inplace=True)
    return df_cpy

In [8]:
def permute_players(X, Y, permute_more=False):
    general_features = []
    hero_features = []
    for c in X.columns:
        if re.search(r'^[rd][0-9]_.*', c):
            hero_features.append(c)
        else:
            general_features.append(c)
    
    general_features = X[general_features]
    permutations = [(2, 1, 4, 5, 3), (3, 4, 5, 1, 2), (4, 5, 2, 3, 1), (5, 3, 1, 2, 4)]
    if permute_more:
        permutations = permutations + [(1, 2, 5, 4, 3), (2, 3, 1, 5, 4), (3, 5, 4, 1, 2), (4, 1, 2, 3, 5), (5, 4, 3, 2, 1)]
    enh_train = []

    for perm_rad, perm_dire in zip(permutations, permutations[1:] + [permutations[0]]):
        new_r_team = []
        new_d_team = []
        for i, prad, pdire in zip(range(1, 6), perm_rad, perm_dire):
            p = X[[c for c in hero_features if c.startswith('r'+str(prad))]]
            p.columns = [c[0] + str(i) + c[2:] for c in p.columns]
            new_r_team.append(p)

            p = X[[c for c in hero_features if c.startswith('d'+str(pdire))]]
            p.columns = [c[0] + str(i) + c[2:] for c in p.columns]
            new_d_team.append(p)
        combined_df = pd.concat([general_features] + new_r_team + new_d_team, axis=1)
        enh_train.append(combined_df)
        gc.collect()
    X_enh = pd.concat([X] + enh_train)
    y_enh = pd.DataFrame(pd.concat([Y.radiant_win] * (1 + len(enh_train))), columns=['radiant_win'])
    return X_enh, y_enh

def permute_teams(X, Y):
    hero_features = [c for c in X.columns if re.search(r'^[rd][0-9]_.*', c)]
    team_features = [c for c in X.columns if re.search(r'^[rd]_.*', c)]
    
    rad_cols = []
    dire_cols = []
    neutral_cols = [c for c in X.columns if c not in hero_features and c not in team_features]
    for c in (hero_features + team_features):
        if c.startswith('r'):
            rad_cols.append(c)
        elif c.startswith('d'):
            dire_cols.append(c)
    neutral_data = X[neutral_cols]
    
    radiant_team = X[rad_cols]
    radiant_team.columns = ['d' + c[1:] for c in radiant_team.columns]
    
    dire_team = X[dire_cols]
    dire_team.columns = ['r' + c[1:] for c in dire_team.columns]
    
    target = Y.radiant_win
    inv_target = ~ target
    inv_target.index = inv_target.index.map(lambda x: 'inv_' + str(x))
    target_comb = pd.concat([target, inv_target])
    
    y_inv = pd.DataFrame(target_comb, columns=['radiant_win'])
    del target, inv_target, target_comb
    gc.collect()    
    X_inv = pd.concat([neutral_data, dire_team, radiant_team], axis=1)
    X_inv.index = X_inv.index.map(lambda x: 'inv_' + str(x))
    X_inv[[c for c in X_inv.columns if c.endswith('ratio')]] = 1 / X_inv[[c for c in X_inv.columns if c.endswith('ratio')]]
    X_comb = pd.concat([X, X_inv])
    gc.collect()
    return X_comb, y_inv

In [9]:
def job(line):
    game = json.loads(line)
    return game_features(game)

def extr_game_features(path_frm, path_to):
    features = []
    for line in tqdm.tqdm(open(path_frm)):
        game = json.loads(line)
        features.append(game_features(game))
    df = pd.DataFrame(features)
    df.to_csv(path_to, index=False)
    return df

def mp_game_features(path_frm, path_to, first_n=-1):
    pool = mp.Pool(mp.cpu_count())
    features = []
    file_content = [line for line in open(path_frm)]
    if first_n > 0:
        file_content = file_content[:first_n]
    for _ in tqdm.tqdm(pool.imap(job, file_content, chunksize=100)):
        features.append(_)
#     features = pool.map(job, open(path_frm))
    del file_content
    df = pd.DataFrame(features)
    del features
    df.to_csv(path_to, index=False)
    pool.close()
    return df

In [10]:
train_df = mp_game_features('../input/mlcourse-dota2-win-prediction/train_matches.jsonl', 'train_features_total.csv')

39675it [02:17, 289.42it/s]


In [11]:
test_df = mp_game_features('../input/mlcourse-dota2-win-prediction/test_matches.jsonl', 'test_features_total.csv')

10000it [00:34, 292.65it/s]


In [12]:
train_team_df = team_features(train_df, agg=[np.min, np.median, np.max, np.std],
                  remove_hero_features=True, calculate_diff=True,
                  remove_team_features=True, calculate_ratios=True)

train_team_df.to_csv('train_team_features.csv', index=False)

100%|██████████| 104/104 [00:44<00:00,  2.36it/s]


In [13]:
test_team_df = team_features(test_df, agg=[np.min, np.median, np.max, np.std],
                  remove_hero_features=True, calculate_diff=True,
                  remove_team_features=True, calculate_ratios=True)

test_team_df.to_csv('test_team_features.csv', index=False)

100%|██████████| 104/104 [00:11<00:00,  9.17it/s]


In [14]:
object_cols = train_df.select_dtypes(include=[np.object]).columns
int_cols = train_df.select_dtypes(include=[np.int64]).columns
float_cols = train_df.select_dtypes(include=[np.float64]).columns
dtype_map = {col: np.int16 for col in int_cols}
dtype_map.update({col: np.float16 for col in float_cols})

In [15]:
X = pd.read_csv('train_features_total.csv', dtype=dtype_map)
X_test = pd.read_csv('test_features_total.csv', dtype=dtype_map)
Y = pd.read_csv('../input/mlcourse-dota2-win-prediction/train_targets.csv')
submission_df = pd.read_csv('../input//mlcourse-dota2-win-prediction/sample_submission.csv')

In [16]:
object_cols = train_team_df.select_dtypes(include=[np.object]).columns
int_cols = train_team_df.select_dtypes(include=[np.int64]).columns
float_cols = train_team_df.select_dtypes(include=[np.float64]).columns
dtype_map = {col: np.int16 for col in int_cols}
dtype_map.update({col: np.float16 for col in float_cols})

In [17]:
train_team_df = pd.read_csv('train_team_features.csv', dtype=dtype_map)
test_team_df = pd.read_csv('test_team_features.csv', dtype=dtype_map)

In [18]:
X_team_unique = train_team_df[[c for c in train_team_df.columns if c not in X.columns]]
X_team_test = test_team_df[[c for c in test_team_df.columns if c not in X.columns]]

In [19]:
X_combined = pd.concat([X, X_team_unique], axis=1)
X_test_combined = pd.concat([X_test, X_team_test], axis=1)

In [20]:
variances = np.array([X_combined[c].var() if X_combined[c].dtype != object else -1 for c in X_combined.columns])

In [21]:
zero_var_cols = [c for i, c in enumerate(X_combined.columns) if variances[i] == 0]

In [22]:
X_combined.drop(zero_var_cols, axis=1, inplace=True)
X_test_combined.drop(zero_var_cols, axis=1, inplace=True)

In [23]:
del X_team_test, X_test, train_df, test_df, test_team_df, train_team_df

In [24]:
del X, X_team_unique
gc.collect()

0

In [25]:
X_enh, Y_enh = permute_players(X_combined, Y)
del X_combined, Y
X_enh2, Y_enh2 = permute_teams(X_enh, Y_enh)
del X_enh, Y_enh

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




In [26]:
X_enh2.drop([c for c in zero_var_cols if c in X_enh2.columns], axis=1, inplace=True)
# X_test_combined.drop([c for c in X_test_combined.columns if c in X_enh2.columns], axis=1, inplace=True)

In [27]:
print(X_enh2.shape, Y_enh2.shape, X_test_combined.shape)

(396750, 2728) (396750, 1) (10000, 2728)


In [28]:
def train_and_predict(n_iterations, path_format='total_team_combined_{}.csv', first_n=-1):
    params = {
          'random_seed': 42,
          'n_estimators': n_iterations,
          'task_type': 'GPU',
          'verbose': int(n_iterations // 5),
          'one_hot_max_size': 130,
          }
    model = CatBoostClassifier(**params)
    model.fit(X_enh2, Y_enh2, verbose=int(n_iterations // 5), cat_features=categorical_columns)
    test_pool = Pool(X_test_combined, cat_features=categorical_columns)
    if first_n > 0:
         submission_df.radiant_win_prob[:first_n] = model.predict_proba(test_pool)[:, 1]
    else:
        submission_df.radiant_win_prob = model.predict_proba(test_pool)[:, 1]
    submission_df.to_csv(path_format.format(n_iterations), index=False)
    return submission_df.copy()

In [29]:
Y_enh2.radiant_win = Y_enh2.radiant_win.astype(np.float32)

In [30]:
def temperature_sharpen(*args, t=.5):
    """
    This works only for binary classification
    """
    n = len(args)
    
    accumulator = pd.read_csv(args[0], index_col=0)
    for arg in args[1:]:
        accumulator.radiant_win_prob += pd.read_csv(arg, index_col=0).radiant_win_prob ** t
    accumulator.radiant_win_prob /= n
    return accumulator

In [31]:
categorical_columns = [c for c in categorical_columns if c in X_enh2.columns and X_enh2[c].dtype in (np.object, np.int, np.int8, np.int16, np.int32, np.int64)]

In [32]:
names = []
for n_iter in [5000, 7000, 9000, 11000]:
    train_and_predict(n_iter)
    names.append('total_team_combined_{}.csv'.format(n_iter))

Learning rate set to 0.021713
0:	learn: 0.6847298	total: 66.6ms	remaining: 5m 33s
1000:	learn: 0.4445187	total: 42.3s	remaining: 2m 49s
2000:	learn: 0.4067499	total: 1m 24s	remaining: 2m 6s
3000:	learn: 0.3747680	total: 2m 7s	remaining: 1m 24s
4000:	learn: 0.3462010	total: 2m 50s	remaining: 42.6s
4999:	learn: 0.3204602	total: 3m 34s	remaining: 0us
Learning rate set to 0.016089
0:	learn: 0.6868849	total: 70ms	remaining: 8m 9s
1400:	learn: 0.4424586	total: 59.5s	remaining: 3m 57s
2800:	learn: 0.4037690	total: 1m 59s	remaining: 2m 58s
4200:	learn: 0.3708889	total: 3m	remaining: 2m
5600:	learn: 0.3414999	total: 4m 1s	remaining: 1m
6999:	learn: 0.3154017	total: 5m 3s	remaining: 0us
Learning rate set to 0.012861
0:	learn: 0.6881295	total: 54.8ms	remaining: 8m 13s
1800:	learn: 0.4413080	total: 1m 16s	remaining: 5m 6s
3600:	learn: 0.4019648	total: 2m 34s	remaining: 3m 51s
5400:	learn: 0.3683520	total: 3m 52s	remaining: 2m 35s
7200:	learn: 0.3385800	total: 5m 10s	remaining: 1m 17s
8999:	learn: 

In [33]:
acc = temperature_sharpen(*names)
acc.to_csv('reproduce.csv')