In [70]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, cross_val_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
import time
import datetime

In [32]:
# Load the training and test feature tables; both use 'match_id' as the index.
features, features_test = (
    pd.read_csv(csv_path, index_col='match_id')
    for csv_path in ('./features.csv', 'features_test.csv')
)

In [33]:
# Split off the target: `pop` returns the 'radiant_win' column and removes it
# from `features` in the same step.
y = features.pop('radiant_win')

# Drop the remaining columns in place.
# NOTE(review): presumably these describe the match outcome and are absent
# from the test table -- confirm against features_test.csv.
features.drop(
    columns=[
        'duration',
        'tower_status_radiant',
        'tower_status_dire',
        'barracks_status_radiant',
        'barracks_status_dire',
    ],
    inplace=True,
)

In [34]:
# Number of non-missing values in each column; a column whose count is lower
# than the others contains missing values.
features.count()

start_time                  97230
lobby_type                  97230
r1_hero                     97230
r1_level                    97230
r1_xp                       97230
                            ...  
dire_tpscroll_count         97230
dire_boots_count            97230
dire_ward_observer_count    97230
dire_ward_sentry_count      97230
dire_first_ward_time        95404
Length: 102, dtype: int64

In [63]:
# Replace every missing value with 0.0; `fillna` returns new frames, so
# `features` and `features_test` themselves are left unchanged.
X = features.fillna(value=0.)
X_test = features_test.fillna(value=0.)

# Градиентный бустинг (в лоб)

In [23]:
# Shuffled 5-fold split with a fixed seed, shared by every model size below.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
coeffs, times = [], []

for n_trees in (10, 20, 30):
    fit_started = datetime.datetime.now()
    clf = GradientBoostingClassifier(
        n_estimators=n_trees,
        learning_rate=0.2,
        random_state=42,
        verbose=True,
    )
    clf.fit(X, y)
    # Only the single fit on the full data is timed; the cross-validation
    # below runs after the elapsed time has been recorded.
    times.append(datetime.datetime.now() - fit_started)

    # Per-fold ROC-AUC scores for this number of trees.
    fold_auc = cross_val_score(clf, X, y, scoring='roc_auc', cv=kf)
    coeffs.append(fold_auc)

      Iter       Train Loss   Remaining Time 
         1           1.3729           11.26s
         2           1.3637           10.00s
         3           1.3543            8.74s
         4           1.3466            7.53s
         5           1.3387            6.33s
         6           1.3322            5.10s
         7           1.3257            3.83s
         8           1.3194            2.57s
         9           1.3141            1.29s
        10           1.3089            0.00s
      Iter       Train Loss   Remaining Time 
         1           1.3728            9.59s
         2           1.3632            8.37s
         3           1.3535            7.45s
         4           1.3446            6.30s
         5           1.3372            5.42s
         6           1.3302            4.28s
         7           1.3236            3.17s
         8           1.3176            2.09s
         9           1.3123            1.04s
        10           1.3072            0.00s
      It

         5           1.3368           23.50s
         6           1.3298           22.46s
         7           1.3232           21.46s
         8           1.3174           20.53s
         9           1.3117           19.66s
        10           1.3070           18.74s
        20           1.2722            9.35s
        30           1.2517            0.00s
      Iter       Train Loss   Remaining Time 
         1           1.3728           28.14s
         2           1.3632           26.39s
         3           1.3538           25.34s
         4           1.3449           24.30s
         5           1.3373           23.06s
         6           1.3304           22.22s
         7           1.3245           21.18s
         8           1.3184           20.13s
         9           1.3128           19.21s
        10           1.3078           18.25s
        20           1.2736            9.32s
        30           1.2543            0.00s
      Iter       Train Loss   Remaining Time 
        

In [24]:
# Show the recorded fit durations, then the mean ROC-AUC over the folds for
# each entry of `coeffs`, one value per line.
print(times)
for fold_scores in coeffs:
    print(fold_scores.mean())

[datetime.timedelta(seconds=13, microseconds=122004), datetime.timedelta(seconds=27, microseconds=273535), datetime.timedelta(seconds=32, microseconds=995397)]
0.6771099027716354
0.6912158873482319
0.6981202288645131


# Логистическая регрессия

In [78]:
# Copy of X without the lobby type and the ten hero columns.
# NOTE(review): presumably these are categorical identifiers -- confirm.
X_for_log = X.drop(
    columns=[
        'lobby_type',
        'r1_hero', 'r2_hero', 'r3_hero', 'r4_hero', 'r5_hero',
        'd1_hero', 'd2_hero', 'd3_hero', 'd4_hero', 'd5_hero',
    ]
)

# One scaler object is fitted twice: first on the full feature set, then on
# the reduced one. After this cell it holds the statistics of `X_for_log`.
scaler = StandardScaler()
X_fit = scaler.fit_transform(X)
X_for_log_fit = scaler.fit_transform(X_for_log)

In [79]:
# L2-penalised logistic regression on the scaled full feature set.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
clf = LogisticRegression(penalty='l2')
clf.fit(X_fit, y)

# Mean ROC-AUC over the five folds.
auc_per_fold = cross_val_score(clf, X_fit, y, scoring='roc_auc', cv=kf)
print(auc_per_fold.mean())

0.7165221087939807


In [80]:
# Same model and split as for the full feature set, but trained on the scaled
# features with the lobby-type and hero columns removed.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
clf = LogisticRegression(penalty='l2')
clf.fit(X_for_log_fit, y)

# Mean ROC-AUC over the five folds.
reduced_auc_per_fold = cross_val_score(
    clf, X_for_log_fit, y, scoring='roc_auc', cv=kf
)
print(reduced_auc_per_fold.mean())

0.7165303443778955


In [151]:
# NOTE(review): this cell contains only disabled scratch code (per-column
# unique-value counts and ad-hoc column inspection); nothing here executes.
# Consider deleting it or turning it into a real, documented cell.
#for column in X.columns:
    #print(len(np.unique(np.array(X[column]))))
    #if len(np.unique(np.array(X[column]))) == 108:
    #    print (column)#.get_loc('r1_hero')
#kl = np.array(X_fit[[ 'r1_level', 'r1_lh', 'r1_items', 'r1_deaths', 'r1_kills']])
#print(np.unique(kl))
#print(np.array(X[['r1_xp', 'r1_level', 'r1_gold', 'r1_lh', 'r1_kills', 'r1_kills', 'r1_deaths']]))
#rang = np.unique(np.array(X['d4_hero']))
#len(rang)

In [59]:
# Collect the sorted distinct values of every feature column of `X_for_log`,
# in column order: `unique[k]` is an array of the values seen in the k-th column.
#
# The frame is indexed by match_id, so indexing its transpose with an integer
# (`X_for_log.T[i]`) selects the match whose id is `i` rather than the i-th
# feature, and raises KeyError when no such match exists. Iterating over the
# columns directly avoids that and does not re-transpose the frame each time.
unique = [
    np.unique(column_values.to_numpy())
    for _, column_values in X_for_log.items()
]