In [1]:
import pandas as pd
import torch.nn as nn
import torch.nn.functional as F
import torch
from tqdm import tqdm_notebook as tqdm
import matplotlib.pyplot as plt

In [2]:
# abilities = pd.read_csv('./sberbank_data/final/dota2_abilities.csv')
# items = pd.read_csv('./sberbank_data/data_2019/dota2_items.csv')

<br><br><br>
# LOAD DATA
----

## Preprocess skill_train

In [3]:
# Raw train / test tables; both CSV files are indexed by their 'id' column.
skill_train = pd.read_csv(
    './sberbank_data/final/academy2019_final_train.csv',
    index_col='id',
)
skill_test = pd.read_csv(
    './sberbank_data/final/academy2019_final_test.csv',
    index_col='id',
)

In [4]:
# Keep only the rows whose 'winner_team' value is not 'other'.
# NOTE(review): the same filter removes rows from the test table as well -
# confirm that dropping test rows is intended.
skill_train = skill_train.loc[skill_train['winner_team'] != 'other']
skill_test = skill_test.loc[skill_test['winner_team'] != 'other']

In [5]:
# Work on independent copies so the filtered source tables stay untouched.
data_train, data_test = skill_train.copy(), skill_test.copy()

In [6]:
# One-hot encode 'player_team' and 'winner_team', then swap the original
# columns for their indicator columns ('hero_id' is encoded in a later cell).

# train
player_team = pd.get_dummies(data_train['player_team'], prefix='player_team')
winner_team = pd.get_dummies(data_train['winner_team'], prefix='winner_team')
data_train = pd.concat(
    [data_train.drop(['player_team', 'winner_team'], axis=1),
     player_team,
     winner_team],
    axis=1,
)

# test
player_team_test = pd.get_dummies(data_test['player_team'], prefix='player_team')
winner_team_test = pd.get_dummies(data_test['winner_team'], prefix='winner_team')
data_test = pd.concat(
    [data_test.drop(['player_team', 'winner_team'], axis=1),
     player_team_test,
     winner_team_test],
    axis=1,
)

## Preprocess heroes DataFrame

In [7]:
# Hero reference table, indexed by 'hero_id' so it can be joined onto the match rows.
heroes = pd.read_csv(
    './sberbank_data/data_2019/dota2_heroes.csv',
    index_col='hero_id',
)

#### roles ====> one_hot

In [8]:
def _parse_roles(raw):
    """Convert a stringified list into a list of strings.

    Removes all spaces, strips surrounding square brackets, drops single
    quotes and splits on commas, e.g. "['a', 'b']" -> ['a', 'b'].
    """
    without_spaces = raw.replace(" ", '')
    without_brackets = without_spaces.strip('[]')
    without_quotes = without_brackets.replace("'", '')
    return without_quotes.split(',')


heroes['roles'] = heroes['roles'].apply(_parse_roles)

In [9]:
from sklearn.preprocessing import MultiLabelBinarizer

# Multi-hot encode the role lists: one indicator column per distinct role.
mlb = MultiLabelBinarizer()
# pop() removes 'roles' from heroes, so the join below cannot clash with it.
role_matrix = mlb.fit_transform(heroes.pop('roles'))
role_frame = pd.DataFrame(role_matrix, columns=mlb.classes_, index=heroes.index)
heroes = heroes.join(role_frame)

#### attack_type, cm_enabled, primary_attr ====> one_hot

In [10]:
# One-hot encode the remaining categorical hero columns.
# pandas joins prefix and value with '_' by default, so the trailing underscore
# in these prefixes gives column names with a double underscore ('cm__<value>').
attack_type_dummies = pd.get_dummies(heroes['attack_type'], prefix='att_type_')
captain_mode_dummies = pd.get_dummies(heroes['cm_enabled'], prefix='cm_')
prim_attr_dummies = pd.get_dummies(heroes['primary_attr'], prefix='pr_attr_')

# Swap the three source columns for their indicator columns.
heroes = pd.concat(
    [heroes.drop(['attack_type', 'cm_enabled', 'primary_attr'], axis=1),
     attack_type_dummies,
     captain_mode_dummies,
     prim_attr_dummies],
    axis=1,
)

## Join heroes to data_train and data_test

In [11]:
# The two name columns are removed before the hero table is joined onto the match rows.
heroes = heroes.drop(['name', 'localized_name'], axis=1)

In [12]:
# Attach the per-hero columns to every row, matching on 'hero_id'.
data_train = data_train.join(heroes, on='hero_id')
data_test = data_test.join(heroes, on='hero_id')

# Replace 'hero_id' with one indicator column per hero id.
# NOTE(review): train and test are encoded independently, so their indicator
# columns only line up if both splits contain the same set of hero ids - confirm.

# train
hero_dummies = pd.get_dummies(data_train['hero_id'], prefix='hero_id')
data_train = pd.concat(
    [data_train.drop(['hero_id'], axis=1), hero_dummies],
    axis=1,
)

# test
hero_dummies_test = pd.get_dummies(data_test['hero_id'], prefix='hero_id')
data_test = pd.concat(
    [data_test.drop(['hero_id'], axis=1), hero_dummies_test],
    axis=1,
)

In [13]:
# Sanity check: (rows, columns) of both frames after the heroes join and the
# 'hero_id' one-hot encoding.
data_train.shape, data_test.shape

((40395, 211), (15835, 210))

## Parse JSON (extract extra features)

In [14]:
from source.functions_v2 import *

  from pandas.core import datetools


In [15]:
# Paths of the JSON-lines files that are passed to extract_data below.
json_train = './sberbank_data/final/academy2019_final_train.jsonlines'
json_test = './sberbank_data/final/academy2019_final_test.jsonlines'

In [16]:
def _extract_batch_features(records):
    """Run every feature extractor on one batch and join the results column-wise.

    The extractors are called in a fixed order (gold, items, damage, level
    times, abilities, item values, target values, gold deviation); that order
    defines the column layout of the returned block.
    """
    extractors = (
        get_gold_features,
        get_items_features,
        get_damage_features,
        get_lvl_times,
        get_abilities,
        get_items_value,
        get_targets_value,
        gold_deviation,
    )
    return np.concatenate([extract(records) for extract in extractors], axis=1)


def extract_data(path_to_json, batch_size=10000):
    """Parse a JSON-lines file and build the extra feature matrix.

    Records are read one line at a time and processed in batches of at most
    `batch_size` records, so only one batch of parsed JSON is held in memory
    at any moment.

    Parameters
    ----------
    path_to_json : str
        Path to a file containing one JSON record per line; every record
        must have an 'id' key.
    batch_size : int, optional
        Maximum number of records handed to the feature extractors at once.

    Returns
    -------
    extra_data : numpy.ndarray
        One row per record (in file order); columns are the outputs of all
        feature extractors joined side by side.
    ids : list
        The 'id' value of every record, in file order.

    Raises
    ------
    ValueError
        If `batch_size` is smaller than 1, or (raised by numpy) if the file
        contains no records.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    ids = []
    pending = []          # records waiting for feature extraction
    feature_blocks = []   # one 2-D block of features per processed batch

    # Progress-bar hint only; the recorded run of this notebook shows that the
    # files hold slightly more lines than these numbers.
    total = 40395 if 'train' in path_to_json else 15835

    with open(path_to_json) as inp:
        for line in tqdm(inp, total=total):
            record = json.loads(line)
            pending.append(record)
            ids.append(record['id'])

            # Flush every time the buffer fills up. The previous check
            # (`i + 1 == batch_size`) fired only for the very first batch, so
            # all remaining records were accumulated into one huge batch.
            if len(pending) >= batch_size:
                feature_blocks.append(_extract_batch_features(pending))
                pending = []

    # Process the trailing partial batch; skip it when the record count is an
    # exact multiple of batch_size and nothing is left over.
    if pending:
        feature_blocks.append(_extract_batch_features(pending))

    # Stack the per-batch blocks back into a single matrix in file order.
    extra_data = np.concatenate(feature_blocks)

    return extra_data, ids

In [17]:
# Build the extra feature matrix and the matching record ids for each
# JSON-lines file (train first, then test).
extra_data, ids = extract_data(json_train)
extra_data_test, ids_test = extract_data(json_test)

A Jupyter Widget




A Jupyter Widget




In [21]:
import sys

# Size of the train feature matrix as reported by sys.getsizeof, in mebibytes.
size_in_bytes = sys.getsizeof(extra_data)
print("% 8.2f Mb" % (size_in_bytes / 1024 / 1024))

  184.03 Mb


In [22]:
# (rows, columns) of the feature matrices returned by extract_data.
extra_data.shape, extra_data_test.shape

((40403, 597), (15836, 597))

In [23]:
# Column labels for the extracted features. The groups are listed in the same
# order in which extract_data concatenates its feature groups, so that labels
# and columns line up.
# NOTE(review): the columns_* names are not defined in this notebook;
# presumably they come from the star import of source.functions_v2 - confirm.
columns = columns_gold + columns_items + columns_damage + columns_lvl
columns += columns_abililies + columns_items_value + columns_tar_value + columns_deviation

In [24]:
# Wrap the raw feature matrices in DataFrames: rows are labelled with the
# record ids, columns with the feature names.
extra_data = pd.DataFrame(extra_data, index=ids, columns=columns)
extra_data_test = pd.DataFrame(extra_data_test, index=ids_test, columns=columns)

In [25]:
# Index-on-index left join: every row of data_train / data_test is kept and
# receives the extracted features that share its id.
data_train = data_train.join(extra_data, how='left')
data_test = data_test.join(extra_data_test, how='left')

In [26]:
# Base file names (without extension) used when the frames are written to CSV.
train_name = 'train_extracted_final'
test_name = 'test_extracted_final'

In [27]:
# Final (rows, columns) of both frames after the extra features were joined in.
data_train.shape, data_test.shape

((40395, 808), (15835, 807))

In [28]:
# Persist both feature tables as CSV files in the data_2019 folder.
output_dir = './sberbank_data/data_2019/'
for out_frame, out_name in ((data_train, train_name), (data_test, test_name)):
    out_frame.to_csv(output_dir + out_name + '.csv')

In [None]:
df = 