In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
import time
import datetime

In [3]:
features = pd.read_csv('./features.csv', index_col='match_id')
features_test = pd.read_csv('features_test.csv', index_col = 'match_id')

In [4]:
features.head()

Unnamed: 0_level_0,start_time,lobby_type,r1_hero,r1_level,r1_xp,r1_gold,r1_lh,r1_kills,r1_deaths,r1_items,...,dire_boots_count,dire_ward_observer_count,dire_ward_sentry_count,dire_first_ward_time,duration,radiant_win,tower_status_radiant,tower_status_dire,barracks_status_radiant,barracks_status_dire
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1430198770,7,11,5,2098,1489,20,0,0,7,...,4,2,2,-52.0,2874,1,1796,0,51,0
1,1430220345,0,42,4,1188,1033,9,0,1,12,...,4,3,1,-5.0,2463,1,1974,0,63,1
2,1430227081,7,33,4,1319,1270,22,0,0,12,...,4,3,1,13.0,2130,0,0,1830,0,63
3,1430263531,1,29,4,1779,1056,14,0,0,5,...,4,2,0,27.0,1459,0,1920,2047,50,63
4,1430282290,7,13,4,1431,1090,8,1,0,8,...,3,3,0,-16.0,2449,0,4,1974,3,63


In [413]:
y = features['radiant_win'] 
features.drop(['duration', 'radiant_win', 'tower_status_radiant', 'tower_status_dire', 'barracks_status_radiant', 'barracks_status_dire'], axis='columns', inplace=True)

In [414]:
features.count()

start_time                  97230
lobby_type                  97230
r1_hero                     97230
r1_level                    97230
r1_xp                       97230
                            ...  
dire_tpscroll_count         97230
dire_boots_count            97230
dire_ward_observer_count    97230
dire_ward_sentry_count      97230
dire_first_ward_time        95404
Length: 102, dtype: int64

In [415]:
X = features.fillna(0.)
X_test = features_test.fillna(0.)

# Градиентный бустинг

In [416]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)
coeffs = []
times = []
for i in range(10, 31, 10):
    start_time = datetime.datetime.now()
    clf = GradientBoostingClassifier(n_estimators=i, verbose=True, random_state=42, learning_rate=0.2)
    clf.fit(X, y)
    times.append(datetime.datetime.now() - start_time)
    coeffs.append(cross_val_score(clf, X, y, scoring = 'roc_auc', cv=kf))

      Iter       Train Loss   Remaining Time 
         1           1.3729           10.17s
         2           1.3637            8.80s
         3           1.3543            7.68s
         4           1.3466            6.59s
         5           1.3387            5.39s
         6           1.3322            4.28s
         7           1.3257            3.23s
         8           1.3194            2.14s
         9           1.3141            1.06s
        10           1.3089            0.00s
      Iter       Train Loss   Remaining Time 
         1           1.3728            8.05s
         2           1.3632            7.00s
         3           1.3535            5.95s
         4           1.3446            5.03s
         5           1.3372            4.26s
         6           1.3302            3.38s
         7           1.3236            2.52s
         8           1.3176            1.67s
         9           1.3123            0.83s
        10           1.3072            0.00s
      It

         5           1.3368           20.27s
         6           1.3298           19.34s
         7           1.3232           18.45s
         8           1.3174           17.59s
         9           1.3117           16.76s
        10           1.3070           16.01s
        20           1.2722            8.02s
        30           1.2517            0.00s
      Iter       Train Loss   Remaining Time 
         1           1.3728           24.41s
         2           1.3632           23.37s
         3           1.3538           22.35s
         4           1.3449           21.81s
         5           1.3373           20.88s
         6           1.3304           20.04s
         7           1.3245           19.17s
         8           1.3184           18.30s
         9           1.3128           17.49s
        10           1.3078           16.67s
        20           1.2736            8.26s
        30           1.2543            0.00s
      Iter       Train Loss   Remaining Time 
        

In [417]:
print(times)
for coeff in coeffs:
    print(np.mean(coeff))

[datetime.timedelta(seconds=11, microseconds=251128), datetime.timedelta(seconds=23, microseconds=522712), datetime.timedelta(seconds=37, microseconds=130228)]
0.6771099027716354
0.6912158873482319
0.6981202288645131


# Логистическая регрессия

In [418]:
scaler = StandardScaler()
X_for_log = X.copy()
X_for_log.drop(['lobby_type', 'r1_hero', 'r2_hero', 'r3_hero', 'r4_hero', 'r5_hero', 'd1_hero', 'd2_hero', 'd3_hero', 'd4_hero', 'd5_hero'], axis='columns', inplace=True)
scaler.fit(X)
X_fit = scaler.transform(X)
scaler.fit(X_for_log)
X_for_log_fit = scaler.transform(X_for_log)

In [419]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)
clf = LogisticRegression(penalty = 'l2')
clf.fit(X_fit, y)
print(np.mean(cross_val_score(clf, X_fit, y, scoring = 'roc_auc', cv=kf)))

0.7165221087939807


In [420]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)
clf = LogisticRegression(penalty = 'l2')
clf.fit(X_for_log_fit, y)
print(np.mean(cross_val_score(clf, X_for_log_fit, y, scoring = 'roc_auc', cv=kf)))

0.7165303443778955


In [421]:
N = len(np.unique(np.array(X['r1_hero'])))
heroes = np.unique(np.array(X['r1_hero']))

X_pick = np.zeros((X.shape[0], 112))

for i, match_id in enumerate(X.index):
    for p in range(5):
        X_pick[i, X.loc[match_id, 'r%d_hero' % (p+1)] - 1] = 1
        X_pick[i, X.loc[match_id, 'd%d_hero' % (p+1)] - 1] = -1

In [422]:
X_pick = X_pick.T

In [423]:
l = 0

for i in range(112):
    if i + 1 not in heroes:
        X_pick = np.delete(X_pick, i - l, 0)
        l += 1

In [424]:
X_pick = X_pick.T

In [425]:
col = []
for i in range(108):
    col.append('hero_' + str(i))
col = np.array(col)
lkey = X_for_log.index

In [426]:
X_pick = pd.DataFrame(data = X_pick,
                      index = lkey,
                      columns = col)

In [427]:
train = X_for_log.merge(X_pick, left_on = 'match_id', right_on = 'match_id')

In [428]:
grid = {'C': np.power(10.0, np.arange(-5, 5))}
clf = LogisticRegression(penalty = 'l2')
gs = GridSearchCV(clf, grid, scoring='accuracy', cv=kf)
gs.fit(X_for_log_fit, y)
for a in gs.best_params_.keys():
    C1 = gs.best_params_[a]

In [429]:
scaler.fit(train)
train_fit = scaler.transform(train)
kf = KFold(n_splits=5, shuffle=True, random_state=42)
clf = LogisticRegression(penalty = 'l2', C = C1)
clf.fit(train_fit, y)
print(np.mean(cross_val_score(clf, train_fit, y, scoring = 'roc_auc', cv=kf)))

0.7516116548089558


# Указание вероятности победы Radiant для тестовой выборки с помощью логистической регрессии

In [430]:
X_for_log_test = X_test.copy()
X_for_log_test.drop(['lobby_type', 'r1_hero', 'r2_hero', 'r3_hero', 'r4_hero', 'r5_hero', 'd1_hero', 'd2_hero', 'd3_hero', 'd4_hero', 'd5_hero'], axis='columns', inplace=True)

In [431]:
N = len(np.unique(np.array(X_test['r1_hero'])))
heroes = np.unique(np.array(X_test['r1_hero']))

X_pick_test = np.zeros((X_test.shape[0], 112))

for i, match_id in enumerate(X_test.index):
    for p in range(5):
        X_pick_test[i, X_test.loc[match_id, 'r%d_hero' % (p+1)] - 1] = 1
        X_pick_test[i, X_test.loc[match_id, 'd%d_hero' % (p+1)] - 1] = -1


In [432]:
X_pick_test = X_pick_test.T

l = 0

for i in range(112):
    if i + 1 not in heroes:
        X_pick_test = np.delete(X_pick_test, i - l, 0)
        l += 1

X_pick_test = X_pick_test.T


In [433]:
col = []
for i in range(108):
    col.append('hero_' + str(i))
col = np.array(col)
lkey = X_for_log_test.index

X_pick_test = pd.DataFrame(data = X_pick_test,
                      index = lkey,
                      columns = col)


In [434]:
train_test = X_for_log_test.merge(X_pick_test, left_on = 'match_id', right_on = 'match_id')

In [435]:
scaler.fit(train_test)
train_test_fit = scaler.transform(train_test)
pred = clf.predict_proba(train_test_fit)[:, 1]

In [436]:
indexes = np.array(X_test.index)
result = []

In [437]:
for i in range(len(indexes)):
    result.append([indexes[i], pred[i]])

In [438]:
result = np.array(result)

In [441]:
min(result[:,1]), max(result[:,1])

(0.009937096275635118, 0.994870327312102)