# Dota 2 Winner Prediction
## 1. Предобработка данных 

In [90]:
import pandas as pd
import numpy as np
from sklearn import model_selection, metrics, ensemble

%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [91]:
# Load the training features, using the match_id_hash column as the row index.
X_train = pd.read_csv('train_features.csv', index_col='match_id_hash')
# Load the targets table; unlike X_train it keeps the default integer index.
# NOTE(review): X_train and y are paired purely by row order - presumably both
# files list matches in the same order; confirm before relying on it.
y = pd.read_csv('train_targets.csv')
# Target vector: the radiant_win column of the targets table.
y_train = y['radiant_win']

In [1]:
# Preview the first rows of the training feature table.
# NOTE(review): the saved output of this cell is a NameError and its execution
# count is In [1], i.e. it was last run on a fresh kernel before the loading
# cell; re-run the notebook top to bottom to refresh the output.
X_train.head()

NameError: name 'X_train' is not defined

In [93]:
# Preview the first values of the radiant_win target.
y_train.head()

0    False
1     True
2     True
3     True
4    False
Name: radiant_win, dtype: bool

Проверим данные на пропуски: посчитаем общее число пропущенных значений (NaN) во всех столбцах `X_train`.

In [94]:
# Total number of missing values: per-column NaN counts summed over all columns.
X_train.isna().sum().sum()

0

## Подход 1: градиентный бустинг "в лоб"

In [None]:
%%time

number_of_trees = [10, 20, 30, 50, 100]

kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=241)

scoring = []

for num in number_of_trees:
    classifier = ensemble.GradientBoostingClassifier(n_estimators=num, 
                                                     random_state=241)
    score = model_selection.cross_val_score(X=X_train, y=y_train, cv=kf, 
                                            estimator=classifier, scoring='roc_auc')
    scoring += [np.mean(score)]

In [None]:
# Mean cross-validated ROC AUC, one value per entry of number_of_trees.
scoring

In [None]:
# Plot the mean cross-validated score against the number of boosting trees.
pylab.plot(number_of_trees, scoring,
           marker='.', label='GradientBoost')
pylab.grid(True)
pylab.xlabel('n_trees')
# The values in `scoring` were computed with scoring='roc_auc', so the axis and
# title are labelled as ROC AUC rather than accuracy.
pylab.ylabel('mean ROC AUC')
pylab.title('ROC AUC score')
# Trailing semicolon hides the Legend object repr in the cell output.
pylab.legend(loc='lower right');

## Подход 2: логистическая регрессия

In [None]:
%%time

from sklearn import linear_model

log_reg = linear_model.LogisticRegression(solver='saga')
cv = model_selection.KFold(n_splits=5, shuffle=True, random_state=241)

y_train = y_train.astype('float')

score = model_selection.cross_val_score(X=X_train, y=y_train, cv=cv, 
                                            estimator=log_reg, scoring='roc_auc')

In [None]:
# Per-fold ROC AUC from the preceding cross_val_score call.
score

In [None]:
# Collect the ten hero id columns (r1..r5 for one team, d1..d5 for the other).
hero_id_columns = []
for x in range(1, 6):
    hero_id_columns.append('r%d_hero_id' % x)
    hero_id_columns.append('d%d_hero_id' % x)

# New frame without the hero id columns; X_train itself is left untouched.
X_copy_t = X_train.drop(columns=hero_id_columns)

In [None]:
from sklearn import preprocessing

# Replace the hero-less feature frame with its normalized version (a NumPy array).
# NOTE(review): preprocessing.normalize rescales each ROW to unit L2 norm; it
# does not standardize each feature column. Confirm that per-sample
# normalization (and not StandardScaler) is what is intended here.
X_copy_t = preprocessing.normalize(X_copy_t)

# Same shuffled 5-fold split as before, this time with the liblinear solver.
cv = model_selection.KFold(n_splits=5, shuffle=True, random_state=241)
log_reg = linear_model.LogisticRegression(solver='liblinear')

# Per-fold ROC AUC on the normalized features without hero ids.
score = model_selection.cross_val_score(log_reg, X_copy_t, y_train,
                                        scoring='roc_auc', cv=cv)

In [None]:
# Per-fold ROC AUC for the model trained without the hero id columns.
score

In [None]:
# All ten hero id columns: r1..r5 and d1..d5.
hero_columns = ['%s%d_hero_id' % (team, slot)
                for team in ('r', 'd') for slot in range(1, 6)]

# Width of the bag-of-heroes matrix built in the next cell, where hero id h is
# written to column h - 1. The maximum therefore has to be taken over every
# hero column: using a single column would raise an IndexError as soon as any
# other column contains a larger id. int() makes the value a plain Python
# integer that is safe to use as an array dimension.
N_hero = int(X_train[hero_columns].max().max())
N_hero

In [None]:
# Bag-of-heroes encoding: one row per match, one column per hero id.
# Column (hero_id - 1) is set to 1 for an r* pick and to -1 for a d* pick.
X_pick = np.zeros((X_train.shape[0], N_hero))
row_idx = np.arange(X_train.shape[0])

# Fill the matrix one player slot at a time using whole columns instead of
# looking up every match with X_train.loc[...]: same result, but without
# building a row Series ten times per match, and it also works when the index
# contains duplicate labels.
for p in range(5):
    radiant_idx = (X_train['r%d_hero_id' % (p+1)].values - 1).astype(int)
    dire_idx = (X_train['d%d_hero_id' % (p+1)].values - 1).astype(int)
    # Same write order as a per-row loop: r* slot first, then d* slot.
    X_pick[row_idx, radiant_idx] = 1
    X_pick[row_idx, dire_idx] = -1

In [None]:
# Final training matrix: columns of X_copy_t followed by the hero pick columns.
X_train_new = np.hstack([X_copy_t, X_pick])

In [None]:
# Sanity check: the combined matrix must have one row per training match.
X_train.shape[0] == X_train_new.shape[0]

In [None]:
# Cross-validate liblinear logistic regression on the combined matrix
# (X_copy_t columns plus hero picks) with the same shuffled 5-fold split.
cv = model_selection.KFold(n_splits=5, shuffle=True, random_state=241)
log_reg = linear_model.LogisticRegression(solver='liblinear')

# Per-fold ROC AUC.
score = model_selection.cross_val_score(log_reg, X_train_new, y_train,
                                        scoring='roc_auc', cv=cv)

In [None]:
# Per-fold ROC AUC for the model that also uses the hero pick columns.
score

In [None]:
# Load the test features, indexed by match_id_hash like the training features.
X_test = pd.read_csv('test_features.csv', index_col='match_id_hash')

In [None]:
from sklearn import preprocessing

# Build the test matrix with the same steps that produced X_train_new.
# Test-specific variable names are used so that the training-side X_copy_t,
# X_pick and N_hero are not overwritten by this cell.
hero_columns = ['%s%d_hero_id' % (team, slot)
                for team in ('r', 'd') for slot in range(1, 6)]

# 1) Non-hero features: drop the hero id columns and apply the same row-wise
#    normalization that was applied to the training features. Without it the
#    model would be fitted on normalized rows but asked to predict on raw ones.
X_test_numeric = preprocessing.normalize(X_test.drop(columns=hero_columns))

# 2) Hero picks: the number of pick columns must match the training matrix,
#    otherwise the test matrix has a different column count than the one the
#    model is fitted on. Derive it from X_train_new instead of recomputing a
#    maximum hero id from the test data.
n_pick_columns = X_train_new.shape[1] - X_test_numeric.shape[1]
assert n_pick_columns > 0, "X_train_new must contain hero pick columns"

# Diagnostic: range of hero ids present in the test set.
print(X_test[hero_columns].max().max(), X_test[hero_columns].min().min())

X_test_pick = np.zeros((X_test.shape[0], n_pick_columns))
row_idx = np.arange(X_test.shape[0])

for p in range(1, 6):
    # Same write order as for training: r* slot (+1) first, then d* slot (-1).
    for team, value in (('r', 1), ('d', -1)):
        hero_idx = (X_test['%s%d_hero_id' % (team, p)].values - 1).astype(int)
        # Hero ids outside the training range have no column in the trained
        # model, so they are ignored instead of raising or wrapping around.
        known = (hero_idx >= 0) & (hero_idx < n_pick_columns)
        X_test_pick[row_idx[known], hero_idx[known]] = value

X_test_new = np.concatenate((X_test_numeric, X_test_pick), axis=1)

In [None]:
# Fit the most recently defined log_reg on the full training matrix and
# predict class labels for the test matrix (fit returns the estimator itself).
# NOTE(review): predict yields hard labels; if the task is scored by ROC AUC,
# predict_proba may be the intended call - confirm.
y_pred = log_reg.fit(X_train_new, y_train).predict(X_test_new)

In [None]:
# Show the predicted label for the first test match.
y_pred[0]