In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Lasso, Ridge

%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
def normalize_data(X):
    return pd.DataFrame(
        StandardScaler().fit_transform(X),
        index = X.index, 
        columns=X.columns
    )

def logEstimation(X, y):
#     grid = {'C': np.power(10.0, np.arange(-5, 1))}
    grid = {'C': [0.05, 0.01, 0.05, 0.1, 0.5, 0.75, 1]}
    kf=KFold(y.size, n_folds=5, shuffle=True, random_state=241)
    clf=LogisticRegression(random_state=241)
    gs = GridSearchCV(clf, grid, scoring='roc_auc', cv=kf)
    gs.fit(X, y)
    return gs

In [3]:
lh = pd.read_csv('data\lh.csv', index_col='mid')
lh = lh.loc[lh.times == 600].drop('times', 1)
lh.head()

Unnamed: 0_level_0,player_0,player_1,player_2,player_3,player_4,player_5,player_6,player_7,player_8,player_9
mid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,4,43,3,57,41,34,35,75,2,46
1,5,63,14,28,47,49,23,3,30,39
2,34,6,69,42,23,6,12,56,13,12
3,1,49,48,3,31,28,51,6,50,8
4,30,41,45,23,4,26,9,23,37,6


In [4]:
radiant_lh = lh.player_0 + lh.player_1 + lh.player_2 + lh.player_3 + lh.player_4
dire_lh = lh.player_5 + lh.player_6 + lh.player_7 + lh.player_8 + lh.player_9
radiant_carry_cs = lh[['player_0', 'player_1', 'player_2', 'player_3', 'player_4']].max(1)
dire_carry_cs = lh[['player_5', 'player_6', 'player_7', 'player_8', 'player_9']].max(1)
creeps = pd.DataFrame(index=lh.index)
for col in lh.columns:
    creeps['creeps_' + col] = lh[col]
creeps['radiant_creeps'] = radiant_lh
creeps['dire_creeps'] = dire_lh
creeps['radiant_carry_creeps'] = radiant_carry_cs
creeps['dire_carry_creeps'] = dire_carry_cs
creeps['creeps_diff'] = radiant_lh - dire_lh
creeps.head()

Unnamed: 0_level_0,creeps_player_0,creeps_player_1,creeps_player_2,creeps_player_3,creeps_player_4,creeps_player_5,creeps_player_6,creeps_player_7,creeps_player_8,creeps_player_9,radiant_creeps,dire_creeps,radiant_carry_creeps,dire_carry_creeps,creeps_diff
mid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0,4,43,3,57,41,34,35,75,2,46,148,192,57,75,-44
1,5,63,14,28,47,49,23,3,30,39,157,144,63,49,13
2,34,6,69,42,23,6,12,56,13,12,174,99,69,56,75
3,1,49,48,3,31,28,51,6,50,8,132,143,49,51,-11
4,30,41,45,23,4,26,9,23,37,6,143,101,45,37,42


In [5]:
creeps = normalize_data(creeps)
creeps['creeps_rel'] = radiant_lh / dire_lh
creeps.reset_index(level=0, inplace=True)

creeps.head()

Unnamed: 0,mid,creeps_player_0,creeps_player_1,creeps_player_2,creeps_player_3,creeps_player_4,creeps_player_5,creeps_player_6,creeps_player_7,creeps_player_8,creeps_player_9,radiant_creeps,dire_creeps,radiant_carry_creeps,dire_carry_creeps,creeps_diff,creeps_rel
0,0,-1.243959,0.839247,-1.237188,1.563027,0.759367,0.323709,0.435091,2.505723,-1.301075,1.028664,0.451955,1.926818,0.400137,1.788257,-1.183,0.770833
1,1,-1.191912,1.878427,-0.668118,0.060755,1.072314,1.106572,-0.191645,-1.244911,0.167004,0.660455,0.763204,0.314862,0.860007,-0.189175,0.338896,1.090278
2,2,0.317441,-1.083236,2.177236,0.78599,-0.179471,-1.137635,-0.766154,1.515972,-0.72433,-0.759781,1.35112,-1.196346,1.319876,0.34321,1.994292,1.757576
3,3,-1.400099,1.151001,1.090828,-1.234307,0.23779,0.010564,1.27074,-1.088635,1.215632,-0.970186,-0.101378,0.28128,-0.213023,-0.037065,-0.301902,0.923077
4,4,0.109254,0.735329,0.935627,-0.198257,-1.170468,-0.093818,-0.922838,-0.203068,0.534024,-1.075389,0.279038,-1.129181,-0.519603,-1.101837,1.113194,1.415842


In [None]:
creeps.to_csv('processing_tables/creeps.csv', index=None)

In [15]:
creeps = pd.read_csv('data\lh.csv', index_col='mid')
creeps = creeps.loc[creeps.times == 600].drop('times', 1)
creeps.reset_index(inplace=True)
radiant_rank_creep = pd.DataFrame(data = np.sort(creeps.values[:,1:6]),
                                       index=creeps.mid,
                                       columns = ['radiant_creep_rank_{0}'.format(i) for i in range(5)]) 
radiant_rank_creep = normalize_data(radiant_rank_creep)
dire_rank_creep = pd.DataFrame(data = np.sort(creeps.values[:,6:11]),
                                       index=creeps.mid,
                                       columns = ['dire_creep_rank_{0}'.format(i) for i in range(5)]) 
dire_rank_creep = normalize_data(dire_rank_creep)

cropped_creeps = pd.DataFrame(index = creeps.mid)
cropped_creeps['rad_best_creeps'] = radiant_rank_creep.radiant_creep_rank_4
cropped_creeps['rad_second_creeps'] = radiant_rank_creep.radiant_creep_rank_3
cropped_creeps['dire_best_creeps'] = dire_rank_creep.dire_creep_rank_4
cropped_creeps['dire_second_creeps'] = dire_rank_creep.dire_creep_rank_3
cropped_creeps['rad_creeps_sum'] = radiant_lh
cropped_creeps['dire_creeps_sum'] = dire_lh
cropped_creeps = normalize_data(cropped_creeps)
cropped_creeps.reset_index(inplace=True)

cropped_creeps.head()

Unnamed: 0,mid,rad_best_creeps,rad_second_creeps,dire_best_creeps,dire_second_creeps,rad_creeps_sum,dire_creeps_sum
0,0,0.400137,0.413201,1.788257,0.714331,0.451955,1.926818
1,1,0.860007,0.782767,-0.189175,0.067585,0.763204,0.314862
2,2,1.319876,0.320809,0.34321,-2.334614,1.35112,-1.196346
3,3,-0.213023,0.875159,-0.037065,1.0839,-0.101378,0.28128
4,4,-0.519603,0.228417,-1.101837,-1.133515,0.279038,-1.129181


In [16]:
cropped_creeps.to_csv('processing_tables/cropped_creeps2.csv', index=None)

In [8]:
creeps = pd.read_csv('data\lh.csv', index_col='mid')
creeps = creeps.loc[creeps.times == 600].drop('times', 1)
creeps.reset_index(inplace=True)
radiant_rank_creep = pd.DataFrame(data = np.sort(creeps.values[:,1:6]),
                                       index=creeps.mid,
                                       columns = ['radiant_creep_rank_{0}'.format(i) for i in range(5)]) 
# radiant_rank_creep = normalize_data(radiant_rank_creep)
dire_rank_creep = pd.DataFrame(data = np.sort(creeps.values[:,6:11]),
                                       index=creeps.mid,
                                       columns = ['dire_creep_rank_{0}'.format(i) for i in range(5)]) 
# dire_rank_creep = normalize_data(dire_rank_creep)

cropped_creeps = pd.DataFrame(index = creeps.mid)
cropped_creeps['rad_best_creeps'] = radiant_rank_creep.radiant_creep_rank_4
cropped_creeps['rad_second_creeps'] = radiant_rank_creep.radiant_creep_rank_3
cropped_creeps['dire_best_creeps'] = dire_rank_creep.dire_creep_rank_4
cropped_creeps['dire_second_creeps'] = dire_rank_creep.dire_creep_rank_3
cropped_creeps['rad_creeps_sum'] = creeps.ix[:, 0:5].sum(1)
cropped_creeps['dire_creeps_sum'] = creeps.ix[:, 5:10].sum(1)
# cropped_creeps = normalize_data(cropped_creeps)
cropped_creeps.reset_index(inplace=True)

creeps_dif = pd.DataFrame(index = creeps.mid)
creeps_dif['best_creeps_dif'] = cropped_creeps.rad_best_creeps - cropped_creeps.dire_best_creeps
creeps_dif['second_creeps_dif'] = cropped_creeps.rad_second_creeps - cropped_creeps.dire_second_creeps
creeps_dif['sum_creeps_dif'] = cropped_creeps.rad_creeps_sum - cropped_creeps.dire_creeps_sum
# creeps_dif = normalize_data(creeps_dif)
creeps_dif.reset_index(inplace=True)
creeps_dif.head()

Unnamed: 0,mid,best_creeps_dif,second_creeps_dif,sum_creeps_dif
0,0,-18,-3,-80
1,1,14,8,-41
2,2,13,29,43
3,3,-2,-2,-62
4,4,8,15,44


In [20]:
creeps_dif.to_csv('processing_tables/cropped_creeps_dif.csv', index=None)

# Merge & Save

In [55]:
train = pd.read_csv('processing_tables/train_gold_heroes_items_exp.csv')
test = pd.read_csv('processing_tables/test_gold_heroes_items_exp.csv')

In [56]:
train = pd.merge(train, creeps, on='mid', how='left')
test = pd.merge(test, creeps, on='mid', how='left')

In [57]:
creeps.to_csv('processing_tables/processed_creeps.csv', index=None)
train.to_csv('processing_tables/train_gold_heroes_items_exp_creeps.csv', index=None)
test.to_csv('processing_tables/test_gold_heroes_items_exp_creeps.csv', index=None)

# Modeling

In [58]:
X_train = train.drop('radiant_won', 1)
y_train = train.radiant_won

In [61]:
gs = logEstimation(X_train, y_train)
print gs.grid_scores_
print gs.best_params_
print('best score: {}'.format(gs.best_score_))

[mean: 0.76416, std: 0.01157, params: {'C': 0.05}, mean: 0.76272, std: 0.01137, params: {'C': 0.01}, mean: 0.76416, std: 0.01157, params: {'C': 0.05}, mean: 0.76394, std: 0.01148, params: {'C': 0.1}, mean: 0.76455, std: 0.01234, params: {'C': 0.5}, mean: 0.76380, std: 0.01222, params: {'C': 0.75}, mean: 0.76395, std: 0.01240, params: {'C': 1}]
{'C': 0.5}
best score: 0.764554949915


In [62]:
clf = LogisticRegression(C=0.5)
np.mean(cross_val_score(clf, X_train, y_train, cv=5, scoring='roc_auc'))

0.76368533535964866

In [64]:
clf = LogisticRegression(C=0.1)
np.mean(cross_val_score(clf, X_train, y_train, cv=5, scoring='roc_auc'))

0.76421604771695362

In [65]:
clf = Ridge(alpha=0.0001)
np.mean(cross_val_score(clf, X_train, y_train, cv=5, scoring='roc_auc'))

0.7626876696064695

In [67]:
X_train = train.drop(['radiant_won', 'radiant_xp', 'dire_xp'], 1)
clf = Lasso(alpha=0.0001)
clf.fit(X_train, y_train)
np.mean(cross_val_score(clf, X_train, y_train, cv=5, scoring='roc_auc'))

0.76457487009072733

In [43]:
X_train = train.drop(['radiant_won', 'radiant_xp', 'dire_xp'], 1)
clf = Lasso(alpha=0.0001)
clf.fit(X_train, y_train)

Lasso(alpha=0.0001, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [48]:
X_test = test.drop(['radiant_xp', 'dire_xp'], 1)
test_matches = pd.read_csv('data/test.csv')
test_matches['radiant_won'] = clf.predict(X_test)

In [49]:
test_matches.head()

Unnamed: 0,mid,radiant_won
0,3,0.723102
1,7,0.654376
2,9,0.218321
3,10,0.443448
4,12,0.419936


In [50]:
test_matches.to_csv('submissions/with_creeps.csv', index=None)

# With gold score

In [68]:
train = pd.read_csv('processing_tables/train_goldScore_heroes_items_exp.csv')
test = pd.read_csv('processing_tables/test_goldScore_heroes_items_exp.csv')

train = pd.merge(train, creeps, on='mid', how='left')
test = pd.merge(test, creeps, on='mid', how='left')

X_train = train.drop('radiant_won', 1)
y_train = train.radiant_won

In [69]:
gs = logEstimation(X_train, y_train)
print gs.grid_scores_
print gs.best_params_
print('best score: {}'.format(gs.best_score_))

[mean: 0.76355, std: 0.01032, params: {'C': 0.05}, mean: 0.76317, std: 0.01104, params: {'C': 0.01}, mean: 0.76355, std: 0.01032, params: {'C': 0.05}, mean: 0.76357, std: 0.01055, params: {'C': 0.1}, mean: 0.76383, std: 0.01219, params: {'C': 0.5}, mean: 0.76388, std: 0.01217, params: {'C': 0.75}, mean: 0.76351, std: 0.01189, params: {'C': 1}]
{'C': 0.75}
best score: 0.763883589923


In [70]:
clf = LogisticRegression(C=0.75)
np.mean(cross_val_score(clf, X_train, y_train, cv=5, scoring='roc_auc'))

0.7629120502255835

In [71]:
clf = Ridge(alpha=0.0001)
np.mean(cross_val_score(clf, X_train, y_train, cv=5, scoring='roc_auc'))

0.76424141684420055

In [73]:
clf = Lasso(alpha=0.0001, max_iter=6000)
np.mean(cross_val_score(clf, X_train, y_train, cv=5, scoring='roc_auc'))

0.765595130335467

In [74]:
creeps.to_csv('processing_tables/processed_creeps.csv', index=None)
train.to_csv('processing_tables/train_goldScore_heroes_items_exp_creeps.csv', index=None)
test.to_csv('processing_tables/test_goldScore_heroes_items_exp_creeps.csv', index=None)

# Creep_score

In [75]:
new_lh = pd.read_csv('data\lh.csv', index_col='mid')
new_lh = new_lh.loc[new_lh.times == 600].drop('times', 1)
new_lh.head()

Unnamed: 0_level_0,player_0,player_1,player_2,player_3,player_4,player_5,player_6,player_7,player_8,player_9
mid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,4,43,3,57,41,34,35,75,2,46
1,5,63,14,28,47,49,23,3,30,39
2,34,6,69,42,23,6,12,56,13,12
3,1,49,48,3,31,28,51,6,50,8
4,30,41,45,23,4,26,9,23,37,6


In [76]:
heroes = pd.read_csv('data/heroes.csv', index_col='mid')
heroes_num = heroes.max()[0] + 1
print heroes_num

111


In [81]:
mean_creep_stat = np.zeros(111)
for hero_num in range(111):
    hero_values = []
    for player_num in range(10):
        colname = 'player_{}'.format(player_num)
        hero_index = heroes.loc[heroes[colname] == hero_num].index
        hero_values.extend(new_lh.loc[hero_index][colname].values)
    mean_creep_stat[hero_num] = int(np.mean(hero_values))
mean_creep_stat[0] = 100000
mean_creep_stat = np.array(map(int, mean_creep_stat))
print mean_creep_stat

[100000     46     31     14      8     22     16     15     43     56
      8     37     12     35     38     54     16     32      8      6
     23     12     40     25     34     38     15     40     28     40
     32     37     13     15     43     35     35     38     34      8
      8     22     21     21     38     35     45     36     34     38
      9     34     33     33     20     31     39      7     30     40
      8      8     24     35     35     37      7     34     26     10
     37     11     42     28     45     35     14     35     46     29
      9     25     45     42     31     43      6      9      7     18
      8      8      6     16     25     29     13     12      6     20
      9     36     39      7      7     37     22     29     25      7
     28]
