In [23]:
import pandas as pd
import numpy as np
import feature_transform as ftr
import random
import copy
import json

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from sklearn.cross_validation import KFold
from sklearn.decomposition import PCA
from sklearn.ensemble import VotingClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.preprocessing import StandardScaler

from scipy.sparse import csr_matrix
from scipy.sparse import hstack
CNT_HEROES = 113
EPS = 1e-5

%matplotlib inline 

In [24]:
def test_model(clf, X, y):
    if not isinstance(y, np.ndarray):
        y = np.array(y)
        
    kf = KFold(n = X.shape[0], n_folds=5, random_state=42)
    errors = []
    params_list = []
    cnt = 0
    for train_index, test_index in kf:
        print(cnt)
        cnt += 1
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        clf.fit(X_train, y_train)
        err = log_loss(y_test, clf.predict_proba(X_test)[:, 1])
        errors.append(err)
        params_list.append(clf.get_params())
    avg_err = np.average(errors)
    return avg_err, np.array(errors), params_list

In [25]:
features = pd.read_csv("./features.csv", index_col="match_id")
features_test = pd.read_csv("./features_test.csv", index_col="match_id")

# Добавляем инфу о винрейтах героев

In [26]:
heroes_wr = pd.read_csv("./data/heroes_wr.csv", delimiter=';')

heroes = pd.read_csv("./data/dictionaries/heroes.csv")

heroes = pd.merge(heroes, heroes_wr, on=['localized_name'], how='left')

heroes = heroes.set_index('id')

In [27]:
features['r1_hero_wr'] = heroes.ix[features.r1_hero.values]['wr'].values
features['r2_hero_wr'] = heroes.ix[features.r2_hero.values]['wr'].values
features['r3_hero_wr'] = heroes.ix[features.r3_hero.values]['wr'].values
features['r4_hero_wr'] = heroes.ix[features.r4_hero.values]['wr'].values
features['r5_hero_wr'] = heroes.ix[features.r5_hero.values]['wr'].values


features['d1_hero_wr'] = heroes.ix[features.d1_hero.values]['wr'].values
features['d2_hero_wr'] = heroes.ix[features.d2_hero.values]['wr'].values
features['d3_hero_wr'] = heroes.ix[features.d3_hero.values]['wr'].values
features['d4_hero_wr'] = heroes.ix[features.d4_hero.values]['wr'].values
features['d5_hero_wr'] = heroes.ix[features.d5_hero.values]['wr'].values

radiant_sorted = features[['r1_hero_wr','r2_hero_wr', 'r3_hero_wr', 'r4_hero_wr', 'r5_hero_wr']].values.T
radiant_sorted.sort(axis=0)
radiant_sorted = radiant_sorted.T
features[['r1_hero_wr','r2_hero_wr', 'r3_hero_wr', 'r4_hero_wr', 'r5_hero_wr']] = radiant_sorted

dire_sorted = features[['d1_hero_wr','d2_hero_wr', 'd3_hero_wr', 'd4_hero_wr', 'd5_hero_wr']].values.T
dire_sorted.sort(axis=0)
dire_sorted = dire_sorted.T
features[['d1_hero_wr','d2_hero_wr', 'd3_hero_wr', 'd4_hero_wr', 'd5_hero_wr']] = dire_sorted

features_test['r1_hero_wr'] = heroes.ix[features_test.r1_hero.values]['wr'].values
features_test['r2_hero_wr'] = heroes.ix[features_test.r2_hero.values]['wr'].values
features_test['r3_hero_wr'] = heroes.ix[features_test.r3_hero.values]['wr'].values
features_test['r4_hero_wr'] = heroes.ix[features_test.r4_hero.values]['wr'].values
features_test['r5_hero_wr'] = heroes.ix[features_test.r5_hero.values]['wr'].values


features_test['d1_hero_wr'] = heroes.ix[features_test.d1_hero.values]['wr'].values
features_test['d2_hero_wr'] = heroes.ix[features_test.d2_hero.values]['wr'].values
features_test['d3_hero_wr'] = heroes.ix[features_test.d3_hero.values]['wr'].values
features_test['d4_hero_wr'] = heroes.ix[features_test.d4_hero.values]['wr'].values
features_test['d5_hero_wr'] = heroes.ix[features_test.d5_hero.values]['wr'].values

radiant_sorted = features_test[['r1_hero_wr','r2_hero_wr', 'r3_hero_wr', 'r4_hero_wr', 'r5_hero_wr']].values.T
radiant_sorted.sort(axis=0)
radiant_sorted = radiant_sorted.T
features_test[['r1_hero_wr','r2_hero_wr', 'r3_hero_wr', 'r4_hero_wr', 'r5_hero_wr']] = radiant_sorted

dire_sorted = features_test[['d1_hero_wr','d2_hero_wr', 'd3_hero_wr', 'd4_hero_wr', 'd5_hero_wr']].values.T
dire_sorted.sort(axis=0)
dire_sorted = dire_sorted.T
features_test[['d1_hero_wr','d2_hero_wr', 'd3_hero_wr', 'd4_hero_wr', 'd5_hero_wr']] = dire_sorted

# Здесь тип фичи добавляем

In [28]:
def create_team_feature(features):
    features['diff_gold'] = features['r1_gold']+features['r2_gold']+features['r3_gold']+features['r4_gold']+features['r5_gold']\
                            -features['d1_gold']-features['d2_gold']-features['d3_gold']-features['d4_gold']-features['d5_gold']

    features['diff_xp'] = features['r1_xp']+features['r2_xp']+features['r3_xp']+features['r4_xp']+features['r5_xp']\
                            -features['d1_xp']-features['d2_xp']-features['d3_xp']-features['d4_xp']-features['d5_xp']

    features['diff_kills'] = features['r1_kills']+features['r2_kills']+features['r3_kills']+features['r4_kills']+features['r5_kills']\
                            -features['d1_kills']-features['d2_kills']-features['d3_kills']-features['d4_kills']-features['d5_kills']

    
    for what in ['gold', 'xp', 'kills', 'deaths', 'level', 'items', 'lh']:
        sum_r = np.ones(features.shape[0]) * 0
        sum_d = np.ones(features.shape[0]) * 0
        min_r = np.ones(features.shape[0]) * 10000000
        min_d = np.ones(features.shape[0]) * 10000000
        max_r = np.ones(features.shape[0]) * (-10000000)
        max_d = np.ones(features.shape[0]) * (-10000000)
        median_d = np.ndarray((features.shape[0], 5))
        median_r = np.ndarray((features.shape[0], 5))
        
        for j in range(5):
            sum_r += features['r' + str(j + 1) + '_' + str(what)]
            sum_d += features['d' + str(j + 1) + '_' + str(what)]
            min_r = np.minimum(min_r, np.array(features['r' + str(j + 1) + '_' + str(what)]))
            min_d = np.minimum(min_d, np.array(features['d' + str(j + 1) + '_' + str(what)]))
            max_r = np.maximum(max_r, np.array(features['r' + str(j + 1) + '_' + str(what)]))
            max_d = np.maximum(max_d, np.array(features['d' + str(j + 1) + '_' + str(what)]))
            median_d[:, j] = np.array(features['d' + str(j + 1) + '_' + str(what)])
            median_r[:, j] = np.array(features['r' + str(j + 1) + '_' + str(what)])
            
        var_r = np.ones(features.shape[0]) * 0
        var_d = np.ones(features.shape[0]) * 0
        
        for j in range(5):
            var_r += (np.array(features['r' + str(j + 1) + '_' + str(what)]) - sum_r / 5) ** 2
            var_d += (np.array(features['d' + str(j + 1) + '_' + str(what)]) - sum_d / 5) ** 2
        
        var_r = var_r ** (0.5)
        var_d = var_d ** (0.5)
        
        all_sum = sum_r + sum_d
        all_sum[all_sum == 0] = 1
        features['summ_r_' + str(what)] = sum_r
        features['summ_d_' + str(what)] = sum_d
        features['mean_' + str(what)] = (sum_r - sum_d) / all_sum
        features['min_r_' + str(what)] = min_r
        features['min_d_' + str(what)] = min_d
        features['max_r_' + str(what)] = max_r
        features['max_d_' + str(what)] = max_d
        features['var_d_' + str(what)] = var_d
        features['var_r_' + str(what)] = var_r
        features['median_r_' + str(what)] = np.median(median_r, axis = 1)
        features['median_d_' + str(what)] = np.median(median_d, axis = 1)
        features['ratio' + str(what)] = sum_r / np.maximum(sum_d, 1)
        features['ratio_mean' + str(what)] = var_r / np.maximum(var_d, 1)
        
    rd = ["r1_gold", "r2_gold", "r3_gold", "r4_gold", "r5_gold",
            "d1_gold","d2_gold", "d3_gold", "d4_gold", "d5_gold",
     "r1_xp", "r2_xp", "r3_xp", "r4_xp", "r5_xp",
            "d1_xp","d2_xp", "d3_xp", "d4_xp", "d5_xp",
      "r1_kills", "r2_kills", "r3_kills", "r4_kills", "r5_kills",
            "d1_kills","d2_kills", "d3_kills", "d4_kills", "d5_kills",
      "r1_deaths", "r2_deaths", "r3_deaths", "r4_deaths", "r5_deaths",
            "d1_deaths","d2_deaths", "d3_deaths", "d4_deaths", "d5_deaths",   
      "r1_level", "r2_level", "r3_level", "r4_level", "r5_level",
            "d1_level","d2_level", "d3_level", "d4_level", "d5_level"
      ,"r1_items", "r2_items", "r3_items", "r4_items", "r5_items",
            "d1_items","d2_items", "d3_items", "d4_items", "d5_items"
        ,"r1_lh", "r2_lh", "r3_lh", "r4_lh", "r5_lh",
            "d1_lh","d2_lh", "d3_lh", "d4_lh", "d5_lh"
     ]
    features = features.drop(rd, axis=1)
    
    return features

In [29]:
def start_time_to_patch(start_time):
    start_time[start_time>1443052800] = 1
    start_time[start_time > 1] = 2
    return start_time

In [30]:
features = create_team_feature(features)
features_test = create_team_feature(features_test)

In [38]:
buf = np.array(features['summ_d_gold'])

In [39]:
stdscaler = StandardScaler()
buf = stdscaler.fit_transform(buf)
print(buf)

[-0.61672628 -0.04291252  0.29023141 ..., -0.17095361 -0.2610566
 -1.04352991]




In [18]:
features.start_time = start_time_to_patch(features.start_time.values)
features_test.start_time = start_time_to_patch(features_test.start_time.values)

# Добавляем melee или range герой

In [19]:
dist_features = pd.read_csv('./data/dictionaries/HeroesFeatures.csv', index_col='index')

melee_hero = np.ndarray(features.shape[0])
range_hero = np.ndarray(features.shape[0])

for i, index in enumerate(features.index.values):

    cnt_melee = 0
    cnt_range = 0
    for s in ['r', 'd']:
        for j in range(5):
            cnt_melee += dist_features[" melee"].ix[features[str(s) + str(j + 1) + "_hero"][index]]
            cnt_range += dist_features[" range"].ix[features[str(s) + str(j + 1) + "_hero"][index]]
    melee_hero[i] = cnt_melee
    range_hero[i] = cnt_range

KeyboardInterrupt: 

In [20]:
features['cnt_melee'] = np.asarray(melee_hero, dtype=float)
features['cnt_range'] = np.asarray(range_hero, dtype=float)

In [21]:
c = features.columns.intersection(features_test.columns)
X_train, y_train = features[c].copy(), features['radiant_win'].copy()
X_test = features_test[c].copy()

KeyboardInterrupt: 

заполняем nan-ки по смыслу: фичи с временем -- на макс+1 (тип сделано было после 5 минут игры),
                            остальное на -1

In [22]:
X_train.first_blood_team = X_train.first_blood_team.fillna(-1)
X_train.first_blood_player1 = X_train.first_blood_player1.fillna(-1)
X_train.first_blood_player2 = X_train.first_blood_player2.fillna(-1)

X_test.first_blood_team = X_test.first_blood_team.fillna(-1)
X_test.first_blood_player1 = X_test.first_blood_player1.fillna(-1)
X_test.first_blood_player2 =X_test.first_blood_player2.fillna(-1)

NameError: name 'X_train' is not defined

In [None]:
X_train = X_train.fillna(300 + 1) # 300 -- 5 minutes
X_test = X_test.fillna(300 + 1)
X_train['is_cour_bought_radiant'] = (X_train.radiant_courier_time < 300)

X_train['is_cour_bought_dire'] = (X_train.dire_courier_time < 300)

X_test['is_cour_bought_radiant'] = (X_test.radiant_courier_time < 300)

X_test['is_cour_bought_dire'] = (X_test.dire_courier_time < 300)

тут вставил код Сережи

Heroes

In [16]:
to_delete = ["start_time"]

X_train = X_train.drop(to_delete, axis=1)
X_test = X_test.drop(to_delete, axis=1)

In [17]:
X_train = pd.get_dummies(X_train, columns=['lobby_type', 'first_blood_team', 'first_blood_player1', 'first_blood_player2']) #бинаризуем 
X_test = pd.get_dummies(X_test, columns=['lobby_type', 'first_blood_team', 'first_blood_player1', 'first_blood_player2']) #бинаризуем 

In [18]:
heroes = pd.read_csv("./data/dictionaries/heroes.csv")

heroes_names = heroes["localized_name"]

In [19]:
X_train_sack = ftr.sack_of_heroes(X_train, heroes_names)
X_test_sack = ftr.sack_of_heroes(X_test, heroes_names)

(97230, 112)
(17177, 112)


In [20]:
rdheroes = ["r1_hero", "r2_hero", "r3_hero", "r4_hero", "r5_hero", "d1_hero",
            "d2_hero", "d3_hero", "d4_hero", "d5_hero"]
X_train_sack = X_train_sack.drop(rdheroes, axis=1)
X_test_sack = X_test_sack.drop(rdheroes, axis=1)

In [21]:
X_train_sack = X_train_sack.fillna(0)
X_test_sack = X_test_sack.fillna(0)

In [22]:
X_train_matrix = np.asarray(X_train_sack.as_matrix(), dtype=np.float64)
X_test_matrix = np.asarray(X_test_sack.as_matrix(), dtype=np.float64)
print(X_train_matrix.shape)

(97230, 263)


# Кто с кем стоит

In [23]:
heroes_role = pd.read_csv('./dictionaries/HeroesFeatures.csv', index_col='index')

list_features = ["carrie", "support", "jungler", "mider", "melee", "range"]
prob_of_heroes_role = [{j:0 for j in list_features} for i in range(heroes_role.shape[0] + 1)]
for i in range(heroes_role.shape[0]):
    for (j, cur_features) in enumerate(list_features):
        prob_of_heroes_role[i + 1][cur_features] = heroes_role.iloc[i][j] + random.random() / 20.

In [24]:
def get_support_index(ind, pos, prob_of_heroes_role):
    prob = np.ndarray((len(ind)))
    for i in range(len(ind)):
        prob[i] = prob_of_heroes_role[ind[i]]['support']
        
    buf = copy.deepcopy(ind)
    buf_pos = copy.deepcopy(pos)
    buf.pop(np.argmax(prob))
    buf_pos.pop(np.argmax(prob))
    
    return (ind[np.argmax(prob)], pos[np.argmax(prob)], buf, buf_pos)

def get_support_or_jungler_index(ind, pos, prob_of_heroes_role):
    prob_sup = np.ndarray((len(ind)))
    prob_jun = np.ndarray((len(ind)))
    
    for i in range(len(ind)):
        prob_sup[i] = prob_of_heroes_role[ind[i]]['support']
        prob_jun[i] = prob_of_heroes_role[ind[i]]['jungler']
    
    buf = copy.deepcopy(ind)
    buf_pos = copy.deepcopy(pos)
    
    if (np.max(prob_sup) > np.max(prob_jun)):
        buf.pop(np.argmax(prob_sup))
        buf_pos.pop(np.argmax(prob_sup))
        return (ind[np.argmax(prob_sup)], pos[np.argmax(prob_sup)], buf, buf_pos)
    else:
        buf.pop(np.argmax(prob_jun))
        buf_pos.pop(np.argmax(prob_jun))
        return (ind[np.argmax(prob_jun)], pos[np.argmax(prob_jun)], buf, buf_pos)

def get_carrie_index(ind, pos, prob_of_heroes_role):
    prob = np.ndarray((len(ind)))
    for i in range(len(ind)):
        prob[i] = prob_of_heroes_role[ind[i]]['carrie']
        
    buf = copy.deepcopy(ind)
    buf_pos = copy.deepcopy(pos)
    buf.pop(np.argmax(prob))
    buf_pos.pop(np.argmax(prob))
    
    return (ind[np.argmax(prob)], pos[np.argmax(prob)], buf, buf_pos)

def get_mider_index(ind, pos, prob_of_heroes_role):
    prob = np.ndarray((len(ind)))
    for i in range(len(ind)):
        prob[i] = prob_of_heroes_role[ind[i]]['mider']
        
    buf = copy.deepcopy(ind)
    buf_pos = copy.deepcopy(pos)
    buf.pop(np.argmax(prob))
    buf_pos.pop(np.argmax(prob))
    
    return (ind[np.argmax(prob)], pos[np.argmax(prob)], buf, buf_pos)


def get_role_for_single_match(X, prob_of_heroes_role, team):
    cnt_features = len(prob_of_heroes_role)
    ind = [0 for i in range(5)]
    pos = [i for i in range(5)]
    for i in range(5): # cnt heroes
        ind[i] = int(X[team + str(i + 1) + '_hero'])
    
    
    buf = copy.deepcopy(ind)
    buf_pos = copy.deepcopy(pos)
    (ind_sup, pos_sup, ind, pos) = get_support_index(ind, pos, prob_of_heroes_role);
    (ind_sup_or_jun, pos_sup_or_jun, ind, pos) = get_support_or_jungler_index(ind, pos, prob_of_heroes_role);
    (ind_carry, pos_carry, ind, pos) = get_carrie_index(ind, pos, prob_of_heroes_role)
    (ind_mider, pos_mider, ind, pos) = get_mider_index(ind, pos, prob_of_heroes_role)
    ind_hard = ind[0]
    pos_hard = pos[0]
    
    assert(np.array_equal(np.sort(np.array([ind_sup, ind_sup_or_jun, ind_carry, ind_mider, ind_hard])), np.sort(np.array(buf))))
    assert(np.array_equal(np.sort(np.array([pos_sup, pos_sup_or_jun, pos_carry, pos_mider, pos_hard])), np.sort(np.array(buf_pos))))
    
    return (np.array([ind_sup, ind_sup_or_jun, ind_carry, ind_mider, ind_hard]), 
            np.array([pos_sup, pos_sup_or_jun, pos_carry, pos_mider, pos_hard]))


def get_role(X, prob_of_heroes_role, team):
    X_role = np.ndarray((X.shape[0], 5), dtype=int)
    pos = np.ndarray((X.shape[0], 5), dtype=int)
    for i in range(X.shape[0]):
        (X_role[i], pos[i]) = get_role_for_single_match(X.iloc[i], prob_of_heroes_role, team)
    return (X_role, pos)

In [25]:
(train_role_r_X, train_pos_r_X) = get_role(features, prob_of_heroes_role, 'r')
(train_role_d_X, train_pos_d_X) = get_role(features, prob_of_heroes_role, 'd')

(test_role_r_X, test_pos_r_X) = get_role(features_test, prob_of_heroes_role, 'r')
(test_role_d_X, test_pos_d_X) = get_role(features_test, prob_of_heroes_role, 'd')

In [26]:
def get_prob_of_win(train_role_r_X, train_role_d_X, prob_radiant_win):
    X = np.zeros(train_role_r_X.shape)
    for i in range(train_role_r_X.shape[0]):
        for j in range(train_role_r_X.shape[1]):
            X[i, j] = prob_radiant_win[train_role_r_X[i, j]][train_role_d_X[i, j]]
    return X

In [27]:
def get_prob_of_win_with_folds(features, train_role_r_X, train_role_d_X, n_folds = 4):
    kf = KFold(n = features.shape[0], n_folds=n_folds, random_state=42)
    ans = np.ndarray((features.shape[0], 5))
    
    for train_index, test_index in kf:
        cur_features = features.ix[train_index] 
        cur_train_role_r_X = train_role_r_X[train_index]
        cur_train_role_d_X = train_role_d_X[train_index]
        
        prob_radiant_win = np.zeros((CNT_HEROES, CNT_HEROES))
        cnt_radiant_win = np.zeros((CNT_HEROES, CNT_HEROES))
        for i in range(cur_train_role_r_X.shape[0]):
            radiant_win = cur_features['radiant_win'].iloc[i]
            for j in range(0, 5):
                if (radiant_win):
                    prob_radiant_win[cur_train_role_r_X[i][j], cur_train_role_d_X[i][j]] += 1
                cnt_radiant_win[cur_train_role_r_X[i][j], cur_train_role_d_X[i][j]] += 1

        cnt_radiant_win[cnt_radiant_win == 0] = 1
        prob_radiant_win /= cnt_radiant_win

        
        ans[test_index] = get_prob_of_win(train_role_r_X[test_index], train_role_d_X[train_index], prob_radiant_win)
    
    return ans

In [28]:
X_train_prob = get_prob_of_win_with_folds(features, train_role_r_X, train_role_d_X)

In [29]:
def get(X, pos_r, pos_d, what):
    gold = np.ndarray((X.shape[0], 12))
    cnt_bad = 0
    for j in range(X.shape[0]):
        rad_sum = 0
        dire_sum = 0
        for i in range(5):
            rad_gold = X['r' + str(pos_r[j, i] + 1) +  '_' + str(what)].iloc[j]
            dire_gold = X['d' + str(pos_d[j, i] + 1) +  '_' + str(what)].iloc[j]
            gold[j, 2 * i] = rad_gold - dire_gold
            if (rad_gold + dire_gold < EPS):
                gold[j, 2 * i + 1] = 0
            else:
                gold[j, 2 * i + 1] = (rad_gold - dire_gold) / (rad_gold + dire_gold)
            rad_sum += rad_gold
            dire_sum += dire_gold
        
        gold[j, 10] = rad_sum - dire_sum
        if (rad_sum + dire_sum == 0):
            gold[j, 11] = 0
            cnt_bad += 1
        else:
            gold[j, 11] = (rad_sum - dire_sum) / (rad_sum + dire_sum)
    print(cnt_bad)
    
    gold_in_degre = np.ndarray((gold.shape[0], gold.shape[1] * 3))
    for k in range(gold.shape[1]):
        for i in range(3):
            gold_in_degre[:, k * 3 + i] = gold[:, k] ** (i + 1)
    return gold


In [30]:
train_features = pd.read_csv('./features.csv', index_col='match_id')
test_features = pd.read_csv('./features_test.csv', index_col='match_id')

X_train_gold = get(train_features, train_pos_r_X, train_pos_d_X, "gold")
X_test_gold = get(test_features, test_pos_r_X, test_pos_d_X, "gold")
X_train_xp = get(train_features, train_pos_r_X, train_pos_d_X, "xp")
X_test_xp = get(test_features, test_pos_r_X, test_pos_d_X, "xp")
X_train_lh = get(train_features, train_pos_r_X, train_pos_d_X, "lh")
X_test_lh = get(test_features, test_pos_r_X, test_pos_d_X, "lh")

287
52
287
52
287
52


In [31]:

prob_radiant_win = np.zeros((CNT_HEROES, CNT_HEROES))
cnt_radiant_win = np.zeros((CNT_HEROES, CNT_HEROES))
for i in range(train_role_r_X.shape[0]):
    radiant_win = features['radiant_win'].iloc[i]
    for j in range(2, 5):
        if (radiant_win):
            prob_radiant_win[train_role_r_X[i][j], train_role_d_X[i][j]] += 1
        cnt_radiant_win[train_role_r_X[i][j], train_role_d_X[i][j]] += 1
        
cnt_radiant_win[cnt_radiant_win == 0] = 1
prob_radiant_win /= cnt_radiant_win

print(np.count_nonzero(cnt_radiant_win[cnt_radiant_win < 10]))

X_test_prob = get_prob_of_win(test_role_r_X, test_role_d_X, prob_radiant_win)

8975


# Добавляем синергию

In [32]:
def get_ind_by_name(name):
    for i in range(heroes.shape[0]):
        if ((heroes.iloc[i][1] == name) | (heroes.iloc[i][2] == name)):
            return heroes.iloc[i][0]
    print(name)
    return None

In [33]:
with open('./data/synergy.json') as data_file:
    data = json.load(data_file)

name = data['heronames']
cnt_heroes = len(name)

ind_by_name = {}
for cur_name in name:
    ind_by_name[cur_name] = get_ind_by_name(cur_name)
    
synergy_low_skill = np.zeros((CNT_HEROES + 1, CNT_HEROES + 1))
synergy_medium_skill = np.zeros((CNT_HEROES + 1, CNT_HEROES + 1))
synergy_high_skill = np.zeros((CNT_HEROES + 1, CNT_HEROES + 1))

for i in range(cnt_heroes):
    for j in range(cnt_heroes):
        if (i != j):
            ind_i = ind_by_name[name[i]]
            ind_j = ind_by_name[name[j]]
            assert(~(ind_i is None))
            assert(~(ind_j is None))
            synergy_low_skill[ind_i, ind_j] = data['win_rates'][i][j][0]
            synergy_medium_skill[ind_i, ind_j] = data['win_rates'][i][j][1]
            synergy_high_skill[ind_i, ind_j] = data['win_rates'][i][j][2]

stdscaler = StandardScaler(with_mean=False)
synergy_low_skill = stdscaler.fit_transform(synergy_low_skill)
synergy_medium_skill = stdscaler.fit_transform(synergy_medium_skill)
synergy_high_skill = stdscaler.fit_transform(synergy_high_skill)

In [34]:
with open('./data/antisynergy.json') as data_file:
    data = json.load(data_file)

name = data['heronames']
cnt_heroes = len(name)

    
antisynergy_low_skill = np.zeros((CNT_HEROES + 1, CNT_HEROES + 1))
antisynergy_medium_skill = np.zeros((CNT_HEROES + 1, CNT_HEROES + 1))
antisynergy_high_skill = np.zeros((CNT_HEROES + 1, CNT_HEROES + 1))

for i in range(cnt_heroes):
    for j in range(cnt_heroes):
        if (i != j):
            ind_i = ind_by_name[name[i]]
            ind_j = ind_by_name[name[j]]
            assert(~(ind_i is None))
            assert(~(ind_j is None))
            antisynergy_low_skill[ind_i, ind_j] = data['adv_rates'][i][j][0]
            antisynergy_medium_skill[ind_i, ind_j] = data['adv_rates'][i][j][1]
            antisynergy_high_skill[ind_i, ind_j] = data['adv_rates'][i][j][2]
            
stdscaler = StandardScaler(with_mean=False)
antisynergy_low_skill = stdscaler.fit_transform(antisynergy_low_skill)
antisynergy_medium_skill = stdscaler.fit_transform(antisynergy_medium_skill)
antisynergy_high_skill = stdscaler.fit_transform(antisynergy_high_skill)

In [35]:
def get_synergy(synergy_cur_skill, team_r, team_d, ind):
    row_ind = np.ndarray(2 * len(team_r) * (len(team_r) - 1))
    data = np.ndarray(2 * len(team_r) * (len(team_r) - 1))
    col_ind = np.ndarray(2 * len(team_r) * (len(team_r) - 1))
    cnt = 0
    for team in [team_r, team_d]:
        for i in team:
            for j in team:
                if (i != j):
                    row_ind[cnt] = ind
                    data[cnt] = synergy_cur_skill[i, j]
                    col_ind[cnt] = i * CNT_HEROES + j
                    cnt += 1
    assert(cnt == row_ind.shape[0])
    return (data, row_ind, col_ind)

def get_antisynergy(antisynergy_cur_skill, team_r, team_d, ind):
    row_ind = np.ndarray(len(team_r) ** 2)
    data = np.ndarray(len(team_r) ** 2)
    col_ind = np.ndarray(len(team_r) ** 2)
    cnt = 0
    for i in team_r:
        for j in team_d:
            row_ind[cnt] = ind
            data[cnt] = antisynergy_cur_skill[i, j]
            col_ind[cnt] = i * CNT_HEROES + j
            cnt += 1
    assert(cnt == row_ind.shape[0])
    return (data, row_ind, col_ind)

def get_synergy_for_all_data(features, synergy_per_skill, get_features, len_array):
    all_synergy = None
    
    for synergy_cur_skill in synergy_per_skill:
    
        data = np.ndarray((features.shape[0], len_array))
        row_ind = np.ndarray((features.shape[0], len_array))
        col_ind = np.ndarray((features.shape[0], len_array))
        
        for i, index in enumerate(features.index.values):
            team_r = [0,0,0,0,0]
            team_d = [0,0,0,0,0]
            for j in range(5):
                team_r[j] = features['r' + str(j + 1) + "_hero"][index]
                team_d[j] = features['d' + str(j + 1) + "_hero"][index]
            data[i], row_ind[i], col_ind[i] =  get_features(synergy_cur_skill, team_r, team_d, i)
            if (i % 10000 == 0):
                print(i)
        synergy = csr_matrix((data.reshape(-1), (row_ind.reshape(-1), col_ind.reshape(-1))), (features.shape[0], CNT_HEROES * CNT_HEROES))
        if (all_synergy is None):
            all_synergy = synergy
        else:
            all_synergy = hstack([all_synergy, synergy]).toarray()
            
    return all_synergy

#add all levels skill
#sparce_synergy_train = get_synergy_for_all_data(features, [synergy_medium_skill, synergy_high_skill], get_synergy, 2 * 5 * 4) 
#sparce_antisynergy_train = get_synergy_for_all_data(features, [antisynergy_medium_skill, antisynergy_high_skill], get_antisynergy, 5 * 5)

#sparce_synergy_test = get_synergy_for_all_data(features_test, [synergy_medium_skill, synergy_high_skill], get_synergy, 2 * 5 * 4)
#sparce_antisynergy_test = get_synergy_for_all_data(features_test, [antisynergy_medium_skill, antisynergy_high_skill], get_antisynergy, 5 * 5)

In [36]:
def get_sum_synergy(synergy_cur_skill, team_r, team_d, ind):
    cnt = 0
    sum = [0, 0]
    for team in [team_r, team_d]:
        for i in team:
            for j in team:
                if (i != j):
                    sum[cnt] += synergy_cur_skill[i, j]

        cnt += 1
        
    disp = [0, 0]
    cnt = 0
    for team in [team_r, team_d]:
        for i in team:
            for j in team:
                if (i != j):
                    disp[cnt] += (sum[cnt] / (5 * 4) - synergy_cur_skill[i, j]) ** 2
                    
        disp[cnt] = disp[cnt] ** 0.5
        cnt += 1
     
    return np.array((sum[0], 
                    sum[1], 
                    abs(sum[0] - sum[1]) / max(1, abs(sum[0] + sum[1])),
                    disp[0],
                    disp[1],
                   abs(disp[1] - disp[0]) / max(1, abs(disp[0] + disp[1]))))

def get_sum_antisynergy(antisynergy_cur_skill, team_r, team_d, ind):
    cnt = 0
    sum = 0
    for i in team_r:
        for j in team_d:
            if (i != j):
                sum += antisynergy_cur_skill[i, j]

        
    disp = 0
    cnt = 0
    for i in team_r:
        for j in team_d:
            if (i != j):
                disp += (sum / (5 * 5) - antisynergy_cur_skill[i, j]) ** 2
                    
        disp = disp ** 0.5
        cnt += 1
     
    return np.array((sum, 
                    disp))

def get_synergy_for_all_data(features, synergy_per_skill, get_features, CNT_FEATURES):
    all_synergy = np.ndarray((features.shape[0], CNT_FEATURES * len(synergy_per_skill)))
    
    
    for ind, synergy_cur_skill in enumerate(synergy_per_skill):
        
        for i, index in enumerate(features.index.values):
            team_r = [0,0,0,0,0]
            team_d = [0,0,0,0,0]
            for j in range(5):
                team_r[j] = features['r' + str(j + 1) + "_hero"][index]
                team_d[j] = features['d' + str(j + 1) + "_hero"][index]
            
            all_synergy[i, ind * CNT_FEATURES:(ind + 1) * CNT_FEATURES] = get_features(synergy_cur_skill, team_r, team_d, i)
            if (i % 10000 == 0):
                print(i)
        #synergy = csr_matrix((data.reshape(-1), (row_ind.reshape(-1), col_ind.reshape(-1))), (features.shape[0], CNT_HEROES * CNT_HEROES))
        #if (all_synergy is None):
        #    all_synergy = synergy
        #else:
        #    all_synergy = hstack([all_synergy, synergy]).toarray()
            
    return all_synergy

#add all levels skill
#synergy_train = get_synergy_for_all_data(features, [synergy_low_skill, synergy_medium_skill, synergy_high_skill], get_sum_synergy, 6) 
#antisynergy_train = get_synergy_for_all_data(features, [antisynergy_low_skill, antisynergy_medium_skill, antisynergy_high_skill], get_sum_antisynergy, 2)

#synergy_test = get_synergy_for_all_data(features_test, [synergy_low_skill, synergy_medium_skill, synergy_high_skill], get_sum_synergy, 6)
#antisynergy_test = get_synergy_for_all_data(features_test, [antisynergy_low_skill, antisynergy_medium_skill, antisynergy_high_skill], get_sum_antisynergy, 2)

In [37]:
print(X_train_matrix.shape)

(97230, 263)


In [38]:
# they are good features
X_train_matrix = np.concatenate((X_train_matrix, X_train_gold, X_train_xp, X_train_lh), axis = 1)
X_test_matrix = np.concatenate((X_test_matrix, X_test_gold, X_test_xp, X_test_lh), axis = 1)

In [39]:

print(X_test_matrix.shape)

(17177, 299)


In [40]:
#X_train_matrix_buf = np.concatenate((X_train_matrix, synergy_train), axis=1)
#X_test_matrix_buf = np.concatenate((X_test_matrix, synergy_test), axis=1)

#X_test_matrix_buf = csr_matrix(hstack((csr_matrix(X_test_matrix_buf), sparce_synergy_test, sparce_antisynergy_test)))
#_train_matrix_buf = csr_matrix(hstack((csr_matrix(X_train_matrix_buf), sparce_synergy_train, sparce_antisynergy_train)))
#print(X_train_matrix_buf.shape)

In [48]:
stdscaler = StandardScaler()
X_train_matrix_buf = stdscaler.fit_transform(X_train_matrix)
X_test_matrix_buf = stdscaler.fit_transform(X_test_matrix)

In [50]:
print(X_train_matrix_buf.shape)

(97230, 299)


#  Тестируем модель и предсказываем

In [51]:
test_clf1 = LogisticRegression(penalty="l1", C=0.077426368268112694, n_jobs=-1)
test_clf2 = RandomForestClassifier(n_estimators=10, criterion="entropy", n_jobs=-1)

In [52]:
clb_clf2 = CalibratedClassifierCV(test_clf2, method="sigmoid", cv=5)

In [53]:
test_clf = VotingClassifier([("LR", test_clf1), ("RF", test_clf2)], voting="soft",
                            weights=[0.85, 0.15])

In [54]:
avg_error, errors, params_list = test_model(test_clf2, (X_train_matrix_buf), (y_train))

print(avg_error, errors)

0
1
2
3
4
0.871565420797 [ 0.83349601  0.86358901  0.88737556  0.90182508  0.87154144]


In [63]:
print(X_train_matrix_buf.shape)

(97230, 51393)


In [51]:
test_clf.fit(X_train_matrix_buf, y_train)

VotingClassifier(estimators=[('LR', LogisticRegression(C=0.0774263682681127, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=-1, penalty='l1', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False))..._jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))],
         voting='soft', weights=[0.85, 0.15])

In [52]:
ans_df = pd.DataFrame(test_clf.predict_proba(X_test_matrix_buf)[:, 1], index=features_test.index, columns=["radiant_win"])

ans_df.to_csv("ans.csv")

(97230, 283)
