https://inclass.kaggle.com/c/dota-2-win-probability-prediction

## Градиентный бустинг в лоб

In [1]:
%matplotlib inline
import datetime
import pandas as pd
import numpy as np

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, roc_curve, auc
from sklearn.preprocessing import StandardScaler
import seaborn as sns

In [2]:
test = pd.read_csv('data/features_test.csv.zip', index_col='match_id')
train = pd.read_csv('data/features.csv.zip', index_col='match_id')
train.head()

Unnamed: 0_level_0,start_time,lobby_type,r1_hero,r1_level,r1_xp,r1_gold,r1_lh,r1_kills,r1_deaths,r1_items,...,dire_boots_count,dire_ward_observer_count,dire_ward_sentry_count,dire_first_ward_time,duration,radiant_win,tower_status_radiant,tower_status_dire,barracks_status_radiant,barracks_status_dire
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1430198770,7,11,5,2098,1489,20,0,0,7,...,4,2,2,-52.0,2874,1,1796,0,51,0
1,1430220345,0,42,4,1188,1033,9,0,1,12,...,4,3,1,-5.0,2463,1,1974,0,63,1
2,1430227081,7,33,4,1319,1270,22,0,0,12,...,4,3,1,13.0,2130,0,0,1830,0,63
3,1430263531,1,29,4,1779,1056,14,0,0,5,...,4,2,0,27.0,1459,0,1920,2047,50,63
4,1430282290,7,13,4,1431,1090,8,1,0,8,...,3,3,0,-16.0,2449,0,4,1974,3,63


In [3]:
[i for i, v in train.count().iteritems() if v < 97230]

['first_blood_time',
 'first_blood_team',
 'first_blood_player1',
 'first_blood_player2',
 'radiant_bottle_time',
 'radiant_courier_time',
 'radiant_flying_courier_time',
 'radiant_first_ward_time',
 'dire_bottle_time',
 'dire_courier_time',
 'dire_flying_courier_time',
 'dire_first_ward_time']

In [4]:
train.fillna(0, inplace=True)
y = train['radiant_win']
X = train.drop(['start_time', 'duration', 'radiant_win', 'tower_status_radiant', 'tower_status_dire',
                           'barracks_status_radiant', 'barracks_status_dire'], axis=1)

test.fillna(0, inplace=True)
X_test = test.drop(['start_time'], axis=1)

In [5]:
def testGBC(X, y, kfold, num_tree):
    clf = GradientBoostingClassifier(n_estimators=num_tree, random_state=42, verbose=False)
    start_time = datetime.datetime.now()
    score = cross_val_score(clf, X, y, cv=kfold, scoring='roc_auc')
    end_time = datetime.datetime.now()
    print("%s) time to fit = %s; ROC_AUC = %s\n" %
          (num_tree, end_time-start_time, score.mean()))
    
def describeImportance(clf, X):
    indices = np.argsort(clf.feature_importances_)[::-1]
    for f in range(X.shape[1]):
        print('%d. feature %d %s (%f)' % (f + 1, indices[f], X.columns[indices[f]],
                                          clf.feature_importances_[indices[f]]))

def train(X, y):
    clf = GradientBoostingClassifier(n_estimators=50, random_state=42, verbose=False)
    start_time = datetime.datetime.now()
    clf.fit(X, y)
    end_time = datetime.datetime.now()
    print("Testing data) time to fit = %s;", end_time-start_time)
    return clf

In [6]:
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
for num_tree in [10,20,30,40,50]:
    testGBC(X, y, kfold, num_tree)

10) time to fit = 0:00:32.763168; ROC_AUC = 0.664850687975

20) time to fit = 0:00:59.053800; ROC_AUC = 0.682461876804

30) time to fit = 0:01:27.755503; ROC_AUC = 0.690006471039

40) time to fit = 0:01:54.582916; ROC_AUC = 0.693941559672

50) time to fit = 0:02:22.777606; ROC_AUC = 0.697493744081



In [7]:
gbc = train(X, y)
describeImportance(gbc, X)
pred = gbc.predict_proba(X_test)[:, 1]

Testing data) time to fit = %s; 0:00:38.200458
1. feature 76 d5_gold (0.092144)
2. feature 52 d2_gold (0.091675)
3. feature 12 r2_gold (0.085039)
4. feature 44 d1_gold (0.084979)
5. feature 28 r4_gold (0.084723)
6. feature 4 r1_gold (0.083765)
7. feature 68 d4_gold (0.079843)
8. feature 20 r3_gold (0.079053)
9. feature 36 r5_gold (0.077562)
10. feature 60 d3_gold (0.071788)
11. feature 83 first_blood_player1 (0.033225)
12. feature 89 radiant_boots_count (0.028435)
13. feature 97 dire_boots_count (0.021370)
14. feature 87 radiant_flying_courier_time (0.012335)
15. feature 37 r5_lh (0.010165)
16. feature 13 r2_lh (0.009575)
17. feature 96 dire_tpscroll_count (0.007485)
18. feature 61 d3_lh (0.006924)
19. feature 69 d4_lh (0.006900)
20. feature 45 d1_lh (0.006542)
21. feature 29 r4_lh (0.006232)
22. feature 5 r1_lh (0.003429)
23. feature 100 dire_first_ward_time (0.002416)
24. feature 31 r4_deaths (0.001953)
25. feature 88 radiant_tpscroll_count (0.001472)
26. feature 71 d4_deaths (0.0014

In [8]:
def prepareData(X):
    X_ = pd.DataFrame()
    
    #X_['first_blood_team'] = X['first_blood_team']
    
    dire_gold = X['d5_gold'] + X['d4_gold'] + X['d3_gold'] + X['d2_gold'] + X['d1_gold']
    radiant_gold = X['r5_gold'] + X['r4_gold'] + X['r3_gold'] + X['r2_gold'] + X['r1_gold']
    X_['gold_delta'] = dire_gold - radiant_gold
    
    dire_lh = X['d5_lh'] + X['d4_lh'] + X['d3_lh']+ X['d2_lh'] +X['d1_lh']
    radiant_lh = X['r5_lh'] + X['r4_lh'] + X['r3_lh']+ X['r2_lh'] +X['r1_lh']
    X_['lh_delta'] = dire_lh - radiant_lh
    
    dire_items = X['d5_items'] + X['d4_items'] + X['d3_items'] + X['d2_items'] + X['d1_items'] 
    radiant_items = X['r5_items'] + X['r4_items'] + X['r3_items'] + X['r2_items'] + X['r1_items']
    X_['items_delta'] = dire_items - radiant_items
    
    dire_boots_count = X['dire_boots_count']
    radiant_boots_count = X['radiant_boots_count']
    X_['boots_count_delta'] = dire_boots_count - radiant_boots_count
    
    dire_xp = X['d5_xp'] + X['d4_xp'] + X['d3_xp'] + X['d2_xp'] + X['d1_xp']
    radiant_xp = X['r5_xp'] + X['r4_xp'] + X['r3_xp'] + X['r2_xp'] + X['r1_xp']
    X_['xp_delta'] = dire_xp - radiant_xp
    
    dire_kills = X['d5_kills'] + X['d4_kills'] + X['d3_kills'] + X['d2_kills'] + X['d1_kills']
    radiant_kills = X['r5_kills'] + X['r4_kills'] + X['r3_kills'] + X['r2_kills'] + X['r1_kills']
    X_['kills_delta'] = dire_kills - radiant_kills
    
    X_['tpscroll_count_delta'] = X['dire_tpscroll_count'] - X['radiant_tpscroll_count']

#     X_['flying_courier_time_delta'] = 0 + (X['dire_flying_courier_time'] > X['radiant_flying_courier_time'])
#     X_['courier_time_delta'] = 0 + (X['dire_courier_time'] > X['radiant_courier_time'])
#     X_['first_ward_time_delta'] = 0 + (X['dire_first_ward_time'] > X['radiant_first_ward_time'])
#     X_['bottle_time_delta'] = 0 + (X['dire_bottle_time'] > X['radiant_bottle_time'])
    X_['dire_bottle_time'] = X['dire_bottle_time']
    X_['radiant_bottle_time'] = X['radiant_bottle_time']
    
    X_['dire_flying_courier_time'] = X['dire_flying_courier_time']
    X_['radiant_flying_courier_time'] = X['radiant_flying_courier_time']

    X_['dire_courier_time'] = X['dire_courier_time']
    X_['radiant_courier_time'] = X['radiant_courier_time']
    
    X_['dire_first_ward_time'] = X['dire_first_ward_time']
    X_['radiant_first_ward_time'] = X['radiant_first_ward_time']
    
    return X_

In [9]:
def visualize(X, y):
    data = prepareData(X)
    data['radiant_win'] = y
    
    sns.set()
    return sns.pairplot(data, hue='radiant_win')

In [10]:
def hypothesis_drop_unused(X, y):
    kfold = KFold(n_splits=5, shuffle=True, random_state=42)
    X_ = prepareData(X)
    testGBC(X_, y, kfold, num_tree=50)
    gbc = train(X_, y)
    describeImportance(gbc, X_)
    pred = gbc.predict_proba(prepareData(X_test))[:, 1]
    return pred

In [11]:
prepareData(X).head()

Unnamed: 0_level_0,gold_delta,lh_delta,items_delta,boots_count_delta,xp_delta,kills_delta,tpscroll_count_delta,dire_bottle_time,radiant_bottle_time,dire_flying_courier_time,radiant_flying_courier_time,dire_courier_time,radiant_courier_time,dire_first_ward_time,radiant_first_ward_time
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0,-1,2,-4,2,-443,1,1,103.0,134.0,221.0,244.0,-84.0,-80.0,-52.0,35.0
1,807,-2,3,4,-144,1,3,149.0,173.0,195.0,0.0,-84.0,-80.0,-5.0,-20.0
2,-891,-11,3,-1,-932,-1,1,45.0,63.0,221.0,0.0,-77.0,-82.0,13.0,-39.0
3,421,9,8,1,-183,0,0,124.0,208.0,184.0,0.0,-80.0,-75.0,27.0,-30.0
4,180,15,-2,-1,511,-2,5,182.0,166.0,225.0,181.0,-80.0,-81.0,-16.0,46.0


In [12]:
pred = hypothesis_drop_unused(X, y)

50) time to fit = 0:00:29.079769; ROC_AUC = 0.716334574053

Testing data) time to fit = %s; 0:00:08.092733
1. feature 0 gold_delta (0.337329)
2. feature 1 lh_delta (0.152366)
3. feature 4 xp_delta (0.120645)
4. feature 2 items_delta (0.085024)
5. feature 14 radiant_first_ward_time (0.051578)
6. feature 13 dire_first_ward_time (0.048811)
7. feature 10 radiant_flying_courier_time (0.035342)
8. feature 7 dire_bottle_time (0.033010)
9. feature 12 radiant_courier_time (0.030342)
10. feature 6 tpscroll_count_delta (0.029594)
11. feature 11 dire_courier_time (0.026516)
12. feature 3 boots_count_delta (0.020239)
13. feature 9 dire_flying_courier_time (0.014102)
14. feature 8 radiant_bottle_time (0.010991)
15. feature 5 kills_delta (0.004111)


In [13]:
def save(fname, pred, test):
    res1 = pd.DataFrame()
    res1['match_id'] = test.index
    res1['radiant_win'] = pred
    res1.to_csv("submissions/{0}.csv".format(fname), index=False)
    res1

In [14]:
save("gb_without_unused", pred, X_test)

## Логистическая регрессия

In [15]:
def testLR(X, y):
    clf = LogisticRegression(random_state=42)
    XX = StandardScaler().fit_transform(X)
    kfold = KFold(n_splits=5, shuffle=True, random_state=42)
    grid = {'C': np.linspace(0.001,1,10)}
    grid_search = GridSearchCV(clf, grid, scoring='roc_auc', cv=kfold)
    grid_search.fit(XX, y)

    print('Best score: {}'.format(grid_search.best_score_))
    print('Best parameters: {}'.format(grid_search.best_params_))

In [16]:
def trainLR(X, y, X_test, C):
    clf = LogisticRegression(random_state=42, C=C)
    XX = StandardScaler().fit_transform(X)
    clf.fit(XX, y)
    XX_test = StandardScaler().fit_transform(X_test)
    return clf.predict_proba(XX_test)[:, 1]

In [17]:
testLR(X, y)

Best score: 0.7164096882227959
Best parameters: {'C': 0.112}


In [18]:
testLR(prepareData(X), y)

Best score: 0.7168014373768751
Best parameters: {'C': 1.0}


In [19]:
def countHeroes(X):
    heroes = ['{}{}_hero'.format(team, i + 1) for i in range(5) for team in ['r', 'd']]
    return np.unique(np.concatenate([X[h].unique() for h in heroes])).size

In [20]:
def wordsBag(X, N):
    # N — количество различных героев в выборке
    X_pick = np.zeros((X.shape[0], N))

    for i, match_id in enumerate(X.index):
        for p in range(5):
            X_pick[i, X.ix[match_id, 'r%d_hero' % (p+1)]-1] = 1
            X_pick[i, X.ix[match_id, 'd%d_hero' % (p+1)]-1] = -1

    return X_pick

In [21]:
bag = wordsBag(X, 113)

In [22]:
testLR(pd.DataFrame(bag, index=X.index), y)

Best score: 0.607769564074281
Best parameters: {'C': 1.0}


In [23]:
superX = pd.concat([pd.DataFrame(bag, index=X.index), prepareData(X)], axis=1)

In [24]:
testLR(superX, y)

Best score: 0.752073851548298
Best parameters: {'C': 0.112}


In [25]:
bag_test = wordsBag(X_test, 113)
superX_test = pd.concat([pd.DataFrame(bag_test, index=X_test.index), prepareData(X_test)], axis=1)
pred = trainLR(superX, y, superX_test, 0.112)
save("lr_with_bag", pred, X_test) # 0.75490

# Случайный лес

In [26]:
def findParamsRFC(X, Y):
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=42)
    clf = RandomForestClassifier(random_state=42)
    parameter_grid = {'n_estimators' : [20, 100],
                     'max_depth' : [20, 100],
                     'min_samples_split' : [2, 5],
                     'max_leaf_nodes' : [40, 60]
                    }
    grid_search = GridSearchCV(clf, param_grid=parameter_grid, cv=StratifiedKFold(5))
    grid_search.fit(X_train, y_train)
    print('Best score: {}'.format(grid_search.best_score_))
    print('Best parameters: {}'.format(grid_search.best_params_))

In [27]:
findParamsRFC(superX.head(50), y.head(50))

Best score: 0.6285714285714286
Best parameters: {'max_depth': 20, 'min_samples_split': 5, 'n_estimators': 100, 'max_leaf_nodes': 40}


In [28]:
def trainRFC(X, y, X_test):
    clf = RandomForestClassifier(random_state=42, n_estimators=20)
    clf.fit(X, y)
    return clf.predict_proba(X_test)[:, 1]

In [29]:
pred = trainRFC(superX, y, superX_test)
save("rfc_with_bag", pred, X_test) # 0.70690