In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import cross_val_score
from sklearn.ensemble import RandomForestClassifier

%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
train_matches = pd.read_csv('train.csv')
test_matches = pd.read_csv('test.csv')

In [5]:
gold = pd.read_csv('gold.csv', index_col='mid')
gold = gold[gold.times == 600]
gold.drop('times', 1, inplace=True)

gold.head()

Unnamed: 0_level_0,player_0,player_1,player_2,player_3,player_4,player_5,player_6,player_7,player_8,player_9
mid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,3454,5206,2613,4426,5755,4072,3997,5917,1725,6384
1,2477,5760,3816,4353,5759,7659,5066,2748,4440,4623
2,3604,1948,8581,4390,2869,3096,2301,5130,2530,2491
3,3457,5464,4432,2961,4314,3345,4791,1906,5328,2247
4,3675,4103,5154,3030,2076,3920,3494,3392,4458,2220


In [6]:
radiant_gold = gold[['player_0', 'player_1', 'player_2', 'player_3', 'player_4']].sum(axis=1)
dire_gold = gold[['player_5', 'player_6', 'player_7', 'player_8', 'player_9']].sum(axis=1)

gold['radiant_gold'] = radiant_gold
gold['dire_gold'] = dire_gold
gold.head()

Unnamed: 0_level_0,player_0,player_1,player_2,player_3,player_4,player_5,player_6,player_7,player_8,player_9,radiant_gold,dire_gold
mid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,3454,5206,2613,4426,5755,4072,3997,5917,1725,6384,21454,22095
1,2477,5760,3816,4353,5759,7659,5066,2748,4440,4623,22165,24536
2,3604,1948,8581,4390,2869,3096,2301,5130,2530,2491,21392,15548
3,3457,5464,4432,2961,4314,3345,4791,1906,5328,2247,20628,17617
4,3675,4103,5154,3030,2076,3920,3494,3392,4458,2220,18038,17484


In [7]:
gold['diff_gold'] = gold['radiant_gold'] - gold['dire_gold']
gold['ratio_gold'] = gold['radiant_gold'] / gold['dire_gold']

gold.head()

Unnamed: 0_level_0,player_0,player_1,player_2,player_3,player_4,player_5,player_6,player_7,player_8,player_9,radiant_gold,dire_gold,diff_gold,ratio_gold
mid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,3454,5206,2613,4426,5755,4072,3997,5917,1725,6384,21454,22095,-641,0.970989
1,2477,5760,3816,4353,5759,7659,5066,2748,4440,4623,22165,24536,-2371,0.903366
2,3604,1948,8581,4390,2869,3096,2301,5130,2530,2491,21392,15548,5844,1.375868
3,3457,5464,4432,2961,4314,3345,4791,1906,5328,2247,20628,17617,3011,1.170914
4,3675,4103,5154,3030,2076,3920,3494,3392,4458,2220,18038,17484,554,1.031686


In [3]:
def normalizedata(X):
    return pd.DataFrame(
        StandardScaler().fit_transform(X),
        index = X.index, 
        columns=X.columns
    )

In [10]:
gold = normalizedata(gold)
gold.head()

Unnamed: 0_level_0,player_0,player_1,player_2,player_3,player_4,player_5,player_6,player_7,player_8,player_9,radiant_gold,dire_gold,diff_gold,ratio_gold
mid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,-0.460819,0.88362,-1.022316,0.302363,1.318719,0.005361,0.009184,1.420892,-1.704481,1.827721,0.524044,0.801772,-0.191463,-0.25978
1,-1.176482,1.291373,-0.146754,0.248927,1.321671,2.659065,0.808769,-0.933944,0.328328,0.498696,0.796655,1.744066,-0.657847,-0.615188
2,-0.350943,-1.514322,3.321283,0.276011,-0.811136,-0.716695,-1.25938,0.836084,-1.101751,-1.110322,0.500272,-1.725554,1.556804,1.868171
3,-0.458622,1.073512,0.301579,-0.770018,0.255267,-0.532482,0.603076,-1.559622,0.993203,-1.294469,0.20734,-0.926862,0.793066,0.790982
4,-0.298934,0.071793,0.827062,-0.71951,-1.396367,-0.10709,-0.367047,-0.455398,0.341805,-1.314846,-0.785714,-0.978204,0.130693,0.05923


In [12]:
gold = gold.reset_index()
gold.head()

Unnamed: 0,mid,player_0,player_1,player_2,player_3,player_4,player_5,player_6,player_7,player_8,player_9,radiant_gold,dire_gold,diff_gold,ratio_gold
0,0,-0.460819,0.88362,-1.022316,0.302363,1.318719,0.005361,0.009184,1.420892,-1.704481,1.827721,0.524044,0.801772,-0.191463,-0.25978
1,1,-1.176482,1.291373,-0.146754,0.248927,1.321671,2.659065,0.808769,-0.933944,0.328328,0.498696,0.796655,1.744066,-0.657847,-0.615188
2,2,-0.350943,-1.514322,3.321283,0.276011,-0.811136,-0.716695,-1.25938,0.836084,-1.101751,-1.110322,0.500272,-1.725554,1.556804,1.868171
3,3,-0.458622,1.073512,0.301579,-0.770018,0.255267,-0.532482,0.603076,-1.559622,0.993203,-1.294469,0.20734,-0.926862,0.793066,0.790982
4,4,-0.298934,0.071793,0.827062,-0.71951,-1.396367,-0.10709,-0.367047,-0.455398,0.341805,-1.314846,-0.785714,-0.978204,0.130693,0.05923


In [13]:
train = pd.merge(train_matches, gold, on='mid', how='left')
test = pd.merge(test_matches, gold, on='mid', how='left')

In [14]:
x_train = train.drop('radiant_won', 1)
y_train = train.radiant_won
clf = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=1234)
np.mean(cross_val_score(clf, x_train, y_train, cv=5, scoring='roc_auc'))

0.67818140337565713

In [4]:
def logEstimation(X, y):
    grid = {'C': np.power(10.0, np.arange(-5, 1))}
    kf=KFold(y.size, n_folds=5, shuffle=True, random_state=241)
    clf=LogisticRegression(random_state=241)
    gs = GridSearchCV(clf, grid, scoring='roc_auc', cv=kf)
    gs.fit(X, y)
    return gs

In [16]:
gs = logEstimation(X_train, y_train)
print gs.grid_scores_
print gs.best_params_
print('best score: {}'.format(gs.best_score_))

[mean: 0.69435, std: 0.00516, params: {'C': 1.0000000000000001e-05}, mean: 0.70492, std: 0.00517, params: {'C': 0.0001}, mean: 0.70525, std: 0.00520, params: {'C': 0.001}, mean: 0.70528, std: 0.00521, params: {'C': 0.01}, mean: 0.70528, std: 0.00522, params: {'C': 0.10000000000000001}, mean: 0.70528, std: 0.00522, params: {'C': 1.0}]
{'C': 1.0}
best score: 0.7052801463


In [17]:
clf=LogisticRegression(random_state=241, C=1)
np.mean(cross_val_score(clf, x_train, y_train, cv=5, scoring='roc_auc'))

0.7055723388145857

In [19]:
test.head()

Unnamed: 0,mid,player_0,player_1,player_2,player_3,player_4,player_5,player_6,player_7,player_8,player_9,radiant_gold,dire_gold,diff_gold,ratio_gold
0,3,-0.458622,1.073512,0.301579,-0.770018,0.255267,-0.532482,0.603076,-1.559622,0.993203,-1.294469,0.20734,-0.926862,0.793066,0.790982
1,7,1.031305,0.980038,-0.043405,1.081212,1.177024,0.788822,0.805029,1.217286,-0.344783,-0.542789,2.205336,1.00405,0.849409,0.640897
2,9,-1.426268,-0.151956,-0.449525,0.393863,-0.567598,0.035693,-0.098524,-0.471745,-0.123158,1.65414,-1.151112,0.505302,-1.162243,-1.152692
3,10,0.660655,-0.865892,-0.266844,0.345551,0.875184,-0.859479,-1.049199,0.254248,2.086352,-0.335246,0.389848,0.046315,0.241762,0.144809
4,12,-1.217503,1.130185,0.61454,0.019811,-0.749883,0.339756,0.746687,-0.541595,0.334318,0.76284,-0.103996,0.843849,-0.66243,-0.672553


In [21]:
clf=LogisticRegression(random_state=241, C=1)
clf.fit(x_train, y_train)
test_matches['radiant_won'] = clf.predict_proba(test)[:, 1]

In [22]:
test_matches.head()

Unnamed: 0,mid,radiant_won
0,3,0.654745
1,7,0.656384
2,9,0.282823
3,10,0.548258
4,12,0.369824


In [23]:
test_matches.to_csv('base_log_solve.csv', index=None)

In [24]:
heroes = pd.read_csv('heroes.csv')
heroes.tail()

Unnamed: 0,mid,player_0,player_1,player_2,player_3,player_4,player_5,player_6,player_7,player_8,player_9
49943,49943,3,89,75,81,29,40,94,47,56,68
49944,49944,64,92,69,82,18,104,81,29,5,28
49945,49945,99,109,96,15,2,69,91,53,77,87
49946,49946,69,32,64,65,8,21,22,40,75,59
49947,49947,8,41,74,10,96,44,66,67,13,48


In [25]:
players = {'player_{0}'.format(i) for i in range(10)}
print players

set(['player_6', 'player_7', 'player_4', 'player_5', 'player_2', 'player_3', 'player_0', 'player_1', 'player_8', 'player_9'])


In [26]:
heroes.player_0.unique()

array([ 91,  69,  17,  80,  25,  65,  64,  23,  59,  33,  81,   1,  26,
        15,   6,  36,  55,  77,   8,  94,  50,  92,  48,  93,  32,  88,
        67,  39, 101,  57,  49, 110,  19,  82,  20,  30,  11,  31,  22,
        60,  35,  29,  54,  75,  74,  96,  46, 104,  70,  44,  84,  16,
        27, 103,  34,  95,  86,  97,   4,  47, 108,  90,  72,  13, 109,
        43,  24,  71,   2,  89,  40,  87,   9,  78,  76, 106,  68,  99,
         3,  66,  58,  41,  42,  85,  53, 107,  79, 105,  52,  56,   5,
        14,  45, 102,  18,  37,  63,  61,  12,  10,  98,  83,  21,  28,
        38,  73,  51, 100,   0,   7,  62], dtype=int64)

In [27]:
heroes_num = heroes.player_0.unique().max()
print 'Heroes num: ', heroes_num
X_pick = np.zeros((heroes.shape[0], heroes_num + 1))
print X_pick

Heroes num:  110
[[ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 ..., 
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]]


In [28]:
for i, match_id in enumerate(heroes.mid.values):
    for p in xrange(5):
        X_pick[i, heroes.ix[match_id, 'player_{0}'.format(p)]] = 1
        X_pick[i, heroes.ix[match_id, 'player_{0}'.format(5 + p)]] = -1
        
xpick_df = pd.DataFrame(X_pick)
xpick_df = xpick_df.astype(int)
xpick_df.index = heroes.index   

In [29]:
xpick_df['mid'] = xpick_df.index
xpick_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,102,103,104,105,106,107,108,109,110,mid
0,0,0,0,0,0,0,-1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,-1,0,0,...,0,0,-1,0,0,0,0,0,0,2
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3
4,0,0,0,-1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4


In [30]:
train = pd.merge(train, xpick_df, on='mid', how='left')
test = pd.merge(test, xpick_df, on='mid', how='left')

In [90]:
train.to_csv('train_with_heroes.csv')
test.to_csv('test_with_heroes.csv')

In [31]:
X_train = train.drop('radiant_won', 1)
y_train = train.radiant_won

In [32]:
# X_ext = X_norm.join(xpick_df)
gs = logEstimation(X_train, y_train)
print gs.grid_scores_
print gs.best_params_
print('best score: {}'.format(gs.best_score_))

[mean: 0.69775, std: 0.00544, params: {'C': 1.0000000000000001e-05}, mean: 0.71413, std: 0.00580, params: {'C': 0.0001}, mean: 0.72989, std: 0.01089, params: {'C': 0.001}, mean: 0.75073, std: 0.01082, params: {'C': 0.01}, mean: 0.75089, std: 0.01085, params: {'C': 0.10000000000000001}, mean: 0.75093, std: 0.01090, params: {'C': 1.0}]
{'C': 1.0}
best score: 0.75092996587


In [33]:
clf=LogisticRegression(random_state=241, C=1)
clf.fit(X_train, y_train)
test_matches = pd.read_csv('test.csv')
test_matches['radiant_won'] = clf.predict_proba(test)[:, 1]

In [35]:
test_matches.to_csv('bag_of_words.csv', index=None)

In [76]:
items = pd.read_csv('items.csv')
items.head()

Unnamed: 0,mid,player,item_0,item_1,item_2,item_3,item_4,item_5,item_6,item_7,...,item_111,item_112,item_113,item_114,item_115,item_116,item_117,item_118,item_119,item_120
0,0,0,,,,,,,,,...,,,,,,,,,,
1,0,1,,,,,,,,,...,,,,,,,,,,
2,0,2,,,,,,,,,...,,,,,,,,,,
3,0,3,,,,1.0,,,,,...,,,,,,,,,,
4,0,4,,,,,,,,,...,,,,,1.0,,,,,


In [84]:
items.fillna(0, inplace=True)
radiant_items = items.drop('player', 1).loc[items.player < 5].groupby('mid').sum()
radiant_items.head()

Unnamed: 0_level_0,item_0,item_1,item_2,item_3,item_4,item_5,item_6,item_7,item_8,item_9,...,item_111,item_112,item_113,item_114,item_115,item_116,item_117,item_118,item_119,item_120
mid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
4,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [85]:
norm_radiant_items = pd.DataFrame(index=radiant_items.index)

In [86]:
for col in radiant_items.columns:
    norm_radiant_items['rad_' + col] = radiant_items[col].astype(int32)

In [87]:
norm_radiant_items.head()

Unnamed: 0_level_0,rad_item_0,rad_item_1,rad_item_2,rad_item_3,rad_item_4,rad_item_5,rad_item_6,rad_item_7,rad_item_8,rad_item_9,...,rad_item_111,rad_item_112,rad_item_113,rad_item_114,rad_item_115,rad_item_116,rad_item_117,rad_item_118,rad_item_119,rad_item_120
mid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,0,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,2,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,1,0,0,0,0
4,0,5,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


### Возможно стоило бы отмасштабировать, но не буду

In [88]:
norm_radiant_items.reset_index(level=0, inplace=True)
norm_radiant_items.head()

Unnamed: 0,mid,rad_item_0,rad_item_1,rad_item_2,rad_item_3,rad_item_4,rad_item_5,rad_item_6,rad_item_7,rad_item_8,...,rad_item_111,rad_item_112,rad_item_113,rad_item_114,rad_item_115,rad_item_116,rad_item_117,rad_item_118,rad_item_119,rad_item_120
0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,1,0,2,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,0,2,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,1,0,0,0,0
4,4,0,5,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [75]:
dire_items = items.drop('player', 1).loc[items.player > 4].groupby('mid').sum()
norm_dire_items = pd.DataFrame(index=dire_items.index)
for col in dire_items.columns:
    norm_dire_items['dire_' + col] = dire_items[col].astype(int32)
norm_dire_items.reset_index(level=0, inplace=True)
norm_dire_items.head()

Unnamed: 0,mid,dire_item_0,dire_item_1,dire_item_2,dire_item_3,dire_item_4,dire_item_5,dire_item_6,dire_item_7,dire_item_8,...,dire_item_111,dire_item_112,dire_item_113,dire_item_114,dire_item_115,dire_item_116,dire_item_117,dire_item_118,dire_item_119,dire_item_120
0,0,0,2,0,0,0,0,0,0,0,...,0,0,0,0,2,0,0,0,0,0
1,1,0,2,0,0,0,0,0,0,0,...,0,1,0,0,2,0,0,0,0,0
2,2,0,0,0,0,0,0,0,0,0,...,0,1,1,0,0,0,0,0,0,0
3,3,0,2,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,4,0,4,0,0,0,0,0,0,0,...,0,0,1,0,2,0,0,0,0,0


In [83]:
norm_radiant_items.head()

Unnamed: 0_level_0,rad_item_0,rad_item_1,rad_item_2,rad_item_3,rad_item_4,rad_item_5,rad_item_6,rad_item_7,rad_item_8,rad_item_9,...,rad_item_111,rad_item_112,rad_item_113,rad_item_114,rad_item_115,rad_item_116,rad_item_117,rad_item_118,rad_item_119,rad_item_120
mid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,0,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,2,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,1,0,0,0,0
4,0,5,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [91]:
train = pd.merge(train, norm_radiant_items, on='mid', how='left')
train = pd.merge(train, norm_dire_items, on='mid', how='left')
test = pd.merge(test, norm_radiant_items, on='mid', how='left')
test = pd.merge(test, norm_dire_items, on='mid', how='left')

In [93]:
X_train = train.drop('radiant_won', 1)
y_train = train.radiant_won
gs = logEstimation(X_train, y_train)
print gs.grid_scores_
print gs.best_params_
print('best score: {}'.format(gs.best_score_))

[mean: 0.69718, std: 0.00388, params: {'C': 1.0000000000000001e-05}, mean: 0.71885, std: 0.00505, params: {'C': 0.0001}, mean: 0.74191, std: 0.00684, params: {'C': 0.001}, mean: 0.75933, std: 0.01094, params: {'C': 0.01}, mean: 0.75993, std: 0.01162, params: {'C': 0.10000000000000001}, mean: 0.75991, std: 0.01183, params: {'C': 1.0}]
{'C': 0.10000000000000001}
best score: 0.759925879058


In [94]:
clf=LogisticRegression(random_state=241, C=0.1)
clf.fit(X_train, y_train)
test_matches = pd.read_csv('test.csv')
test_matches['radiant_won'] = clf.predict_proba(test)[:, 1]

In [95]:
test_matches.to_csv('heroes+items_predict.csv', index=None)

In [6]:
train.to_csv('train_with_heroes_items.csv', index=None)
test.to_csv('test_with_heroes_items.csv', index=None)

In [8]:
train = pd.read_csv('train_with_heroes_items.csv')
test = pd.read_csv('test_with_heroes_items.csv')

In [8]:
train.head()

Unnamed: 0,mid,radiant_won,player_0,player_1,player_2,player_3,player_4,player_5,player_6,player_7,...,dire_item_111,dire_item_112,dire_item_113,dire_item_114,dire_item_115,dire_item_116,dire_item_117,dire_item_118,dire_item_119,dire_item_120
0,0,1,-0.460819,0.88362,-1.022316,0.302363,1.318719,0.005361,0.009184,1.420892,...,0,0,0,0,2,0,0,0,0,0
1,1,0,-1.176482,1.291373,-0.146754,0.248927,1.321671,2.659065,0.808769,-0.933944,...,0,1,0,0,2,0,0,0,0,0
2,2,1,-0.350943,-1.514322,3.321283,0.276011,-0.811136,-0.716695,-1.25938,0.836084,...,0,1,1,0,0,0,0,0,0,0
3,4,1,-0.298934,0.071793,0.827062,-0.71951,-1.396367,-0.10709,-0.367047,-0.455398,...,0,0,1,0,2,0,0,0,0,0
4,5,1,0.123724,-1.172811,-1.071807,0.183779,-1.050985,0.508433,1.050364,0.705301,...,0,0,1,0,0,0,0,0,0,0


In [9]:
xp = pd.read_csv('xp.csv', index_col='mid')
xp.head()

Unnamed: 0_level_0,times,player_0,player_1,player_2,player_3,player_4,player_5,player_6,player_7,player_8,player_9
mid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,60,79,214,147,222,147,94,78,396,94,147
0,120,321,719,423,777,421,490,607,895,241,365
0,180,356,1333,424,1300,638,922,937,1259,242,590
0,240,544,1752,441,1782,1348,1460,1163,2037,242,658
0,300,724,2002,565,2087,1807,2102,1498,2389,276,1020


In [10]:
xp = xp.loc[xp.times == 600].drop('times', 1)
xp.head()

Unnamed: 0_level_0,player_0,player_1,player_2,player_3,player_4,player_5,player_6,player_7,player_8,player_9
mid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,1696,4304,1606,4429,3821,3745,3635,4846,357,2347
1,1542,4481,1961,2437,4810,4514,3510,2248,3403,3662
2,2851,1986,6234,2667,4265,2402,1541,3212,2864,2426
3,1145,4640,4202,1769,3786,3529,2962,1400,4512,1530
4,2197,3525,4855,3239,1518,3373,1935,2563,3281,1924


In [11]:
radiant_xp = xp.player_0 + xp.player_1 + xp.player_2 + xp.player_3 + xp.player_4
dire_xp = xp.player_5 + xp.player_6 + xp.player_7 + xp.player_8 + xp.player_9

In [12]:
norm_xp = pd.DataFrame(index=xp.index)
for col in xp.columns:
    norm_xp['exp_' + col] = xp[col] / (radiant_xp + dire_xp)

In [13]:
norm_xp.head()

Unnamed: 0_level_0,exp_player_0,exp_player_1,exp_player_2,exp_player_3,exp_player_4,exp_player_5,exp_player_6,exp_player_7,exp_player_8,exp_player_9
mid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,0.05509,0.139804,0.052167,0.143864,0.124115,0.121646,0.118073,0.157409,0.011596,0.076236
1,0.047347,0.137589,0.060212,0.074828,0.147691,0.138602,0.107775,0.069025,0.104489,0.112442
2,0.093635,0.065226,0.204743,0.087592,0.140075,0.078889,0.050611,0.105491,0.094062,0.079677
3,0.038846,0.157422,0.142561,0.060017,0.128448,0.119729,0.100492,0.047498,0.153079,0.051908
4,0.077332,0.124076,0.170891,0.114009,0.053432,0.118726,0.06811,0.090215,0.115488,0.067723


In [14]:
norm_xp['exp_ratio'] = radiant_xp / dire_xp
norm_xp.reset_index(level=0, inplace=True)
norm_xp.head()

Unnamed: 0,mid,exp_player_0,exp_player_1,exp_player_2,exp_player_3,exp_player_4,exp_player_5,exp_player_6,exp_player_7,exp_player_8,exp_player_9,exp_ratio
0,0,0.05509,0.139804,0.052167,0.143864,0.124115,0.121646,0.118073,0.157409,0.011596,0.076236,1.062023
1,1,0.047347,0.137589,0.060212,0.074828,0.147691,0.138602,0.107775,0.069025,0.104489,0.112442,0.878526
2,2,0.093635,0.065226,0.204743,0.087592,0.140075,0.078889,0.050611,0.105491,0.094062,0.079677,1.446605
3,3,0.038846,0.157422,0.142561,0.060017,0.128448,0.119729,0.100492,0.047498,0.153079,0.051908,1.115481
4,4,0.077332,0.124076,0.170891,0.114009,0.053432,0.118726,0.06811,0.090215,0.115488,0.067723,1.172683


In [15]:
norm_xp['radiant_xp'] = StandardScaler().fit_transform(radiant_xp.astype(float).reshape(-1, 1))
norm_xp['dire_xp'] = StandardScaler().fit_transform(dire_xp.astype(float).reshape(-1, 1))
norm_xp['exp_diff'] = StandardScaler().fit_transform((radiant_xp - dire_xp).astype(float).reshape(-1, 1))
norm_xp.head()

Unnamed: 0,mid,exp_player_0,exp_player_1,exp_player_2,exp_player_3,exp_player_4,exp_player_5,exp_player_6,exp_player_7,exp_player_8,exp_player_9,exp_ratio,radiant_xp,dire_xp,exp_diff
0,0,0.05509,0.139804,0.052167,0.143864,0.124115,0.121646,0.118073,0.157409,0.011596,0.076236,1.062023,0.370217,-0.184408,0.377645
1,1,0.047347,0.137589,0.060212,0.074828,0.147691,0.138602,0.107775,0.069025,0.104489,0.112442,0.878526,-0.003502,1.245843,-0.854239
2,2,0.093635,0.065226,0.204743,0.087592,0.140075,0.078889,0.050611,0.105491,0.094062,0.079677,1.446605,1.654017,-1.661006,2.259601
3,3,0.038846,0.157422,0.142561,0.060017,0.128448,0.119729,0.100492,0.047498,0.153079,0.051908,1.115481,0.182461,-0.77683,0.655144
4,4,0.077332,0.124076,0.170891,0.114009,0.053432,0.118726,0.06811,0.090215,0.115488,0.067723,1.172683,0.058087,-1.286063,0.918829


In [16]:
train = pd.merge(train, norm_xp, on='mid', how='left')
test = pd.merge(test, norm_xp, on='mid', how='left')

In [17]:
X_train = train.drop(['radiant_won'], 1)
y_train = train.radiant_won
gs = logEstimation(X_train, y_train)
print gs.grid_scores_
print gs.best_params_
print('best score: {}'.format(gs.best_score_))

[mean: 0.70386, std: 0.00390, params: {'C': 1.0000000000000001e-05}, mean: 0.72085, std: 0.00480, params: {'C': 0.0001}, mean: 0.74189, std: 0.00645, params: {'C': 0.001}, mean: 0.75971, std: 0.01100, params: {'C': 0.01}, mean: 0.76152, std: 0.01143, params: {'C': 0.10000000000000001}, mean: 0.76161, std: 0.01175, params: {'C': 1.0}]
{'C': 1.0}
best score: 0.761606136515


In [18]:
X_train = train.drop(['radiant_won', 'radiant_xp', 'dire_xp'], 1)
y_train = train.radiant_won
gs = logEstimation(X_train, y_train)
print gs.grid_scores_
print gs.best_params_
print('best score: {}'.format(gs.best_score_))

[mean: 0.70147, std: 0.00392, params: {'C': 1.0000000000000001e-05}, mean: 0.72069, std: 0.00494, params: {'C': 0.0001}, mean: 0.74329, std: 0.00597, params: {'C': 0.001}, mean: 0.76051, std: 0.01137, params: {'C': 0.01}, mean: 0.76182, std: 0.01166, params: {'C': 0.10000000000000001}, mean: 0.76156, std: 0.01193, params: {'C': 1.0}]
{'C': 0.10000000000000001}
best score: 0.76182087041


In [19]:
X_train = train.drop(['radiant_won', 'radiant_xp', 'dire_xp', 'radiant_gold', 'dire_gold'], 1)
y_train = train.radiant_won
gs = logEstimation(X_train, y_train)
print gs.grid_scores_
print gs.best_params_
print('best score: {}'.format(gs.best_score_))

[mean: 0.69697, std: 0.00389, params: {'C': 1.0000000000000001e-05}, mean: 0.71976, std: 0.00491, params: {'C': 0.0001}, mean: 0.74322, std: 0.00645, params: {'C': 0.001}, mean: 0.76070, std: 0.01115, params: {'C': 0.01}, mean: 0.76128, std: 0.01140, params: {'C': 0.10000000000000001}, mean: 0.76126, std: 0.01150, params: {'C': 1.0}]
{'C': 0.10000000000000001}
best score: 0.761276866255


In [19]:
from sklearn.linear_model import Lasso, Ridge

In [40]:
clf = LogisticRegression(random_state=241, C=0.1)
clf.fit(X_train, y_train)
np.mean(cross_val_score(clf, X_train, y_train, cv=5, scoring='roc_auc'))

0.76122172913729569

In [21]:
clf = Ridge(alpha=0.0001)
clf.fit(X_train, y_train)
np.mean(cross_val_score(clf, X_train, y_train, cv=5, scoring='roc_auc'))

0.76012867861426003

In [20]:
clf = Lasso(alpha=0.0001)
clf.fit(X_train, y_train)
np.mean(cross_val_score(clf, X_train, y_train, cv=5, scoring='roc_auc'))

0.76190861887287353

In [35]:
X_dropped = pd.DataFrame(X_train)
for i, val in enumerate(clf.coef_):
    if val < 0.00001 and i>0:
        print X_train.columns[i]
        X_dropped.drop(X_train.columns[i], 1, inplace=True)

player_1
player_2
player_3
player_4
player_5
player_6
player_7
player_8
player_9
radiant_gold
63
64
72
104
105
rad_item_3
rad_item_4
rad_item_10
rad_item_11
rad_item_12
rad_item_13
rad_item_14
rad_item_15
rad_item_16
rad_item_17
rad_item_18
rad_item_19


In [28]:
clf = Lasso(alpha=0.0001)
clf.fit(X_dropped, y_train)
np.mean(cross_val_score(clf, X_train, y_train, cv=5, scoring='roc_auc'))

0.76190861887287353

In [39]:
clf = LogisticRegression(random_state=241, C=0.1)
clf.fit(X_dropped, y_train)
np.mean(cross_val_score(clf, X_train, y_train, cv=5, scoring='roc_auc'))

0.76122172913729569

In [36]:
gs = logEstimation(X_dropped, y_train)
print gs.grid_scores_
print gs.best_params_
print('best score: {}'.format(gs.best_score_))

[mean: 0.69489, std: 0.00308, params: {'C': 1.0000000000000001e-05}, mean: 0.71801, std: 0.00459, params: {'C': 0.0001}, mean: 0.74282, std: 0.00666, params: {'C': 0.001}, mean: 0.76085, std: 0.01151, params: {'C': 0.01}, mean: 0.76164, std: 0.01278, params: {'C': 0.10000000000000001}, mean: 0.76152, std: 0.01289, params: {'C': 1.0}]
{'C': 0.10000000000000001}
best score: 0.761643112473


In [153]:
x_train = X_train.drop(['radiant_gold', 'dire_gold'], 1)
y_train = train.radiant_won
gs = logEstimation(x_train, y_train)
print gs.grid_scores_
print gs.best_params_
print('best score: {}'.format(gs.best_score_))

[mean: 0.69697, std: 0.00389, params: {'C': 1.0000000000000001e-05}, mean: 0.71976, std: 0.00491, params: {'C': 0.0001}, mean: 0.74322, std: 0.00645, params: {'C': 0.001}, mean: 0.76065, std: 0.01110, params: {'C': 0.01}, mean: 0.76128, std: 0.01141, params: {'C': 0.10000000000000001}, mean: 0.76125, std: 0.01147, params: {'C': 1.0}]
{'C': 0.10000000000000001}
best score: 0.761284636319


In [133]:
clf=LogisticRegression(random_state=241, C=0.1)
clf.fit(X_train, y_train)
test_matches = pd.read_csv('test.csv')
test_matches['radiant_won'] = clf.predict_proba(test)[:, 1]
test_matches.head()

Unnamed: 0,mid,radiant_won
0,3,0.772986
1,7,0.706327
2,9,0.204504
3,10,0.419095
4,12,0.366389


In [134]:
test_matches.to_csv('heroes+items+xp+gold_predict.csv', index=None)

In [143]:
lh = pd.read_csv('lh.csv', index_col='mid')
lh = lh.loc[lh.times == 600].drop('times', 1)
lh.head()

Unnamed: 0_level_0,player_0,player_1,player_2,player_3,player_4,player_5,player_6,player_7,player_8,player_9
mid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,4,43,3,57,41,34,35,75,2,46
1,5,63,14,28,47,49,23,3,30,39
2,34,6,69,42,23,6,12,56,13,12
3,1,49,48,3,31,28,51,6,50,8
4,30,41,45,23,4,26,9,23,37,6


In [144]:
radiant_lh = lh.player_0 + lh.player_1 + lh.player_2 + lh.player_3 + lh.player_4
dire_lh = lh.player_5 + lh.player_6 + lh.player_7 + lh.player_8 + lh.player_9