In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Lasso, Ridge

%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
def normalize_data(X):
    """Standardize the columns of a DataFrame.

    A new ``StandardScaler`` is fitted on ``X`` and applied to it; the scaled
    values are wrapped back into a DataFrame that carries the index and the
    column labels of the input.
    """
    scaled_values = StandardScaler().fit_transform(X)
    return pd.DataFrame(scaled_values, index=X.index, columns=X.columns)

def logEstimation(X, y):
    """Grid-search the regularization strength ``C`` of a logistic regression.

    The candidates are the powers of ten from 1e-5 to 1e0. Every candidate is
    scored by ROC AUC on a shuffled 5-fold split (seed 241). The fitted
    ``GridSearchCV`` object is returned.
    """
    c_candidates = np.power(10.0, np.arange(-5, 1))
    folds = KFold(y.size, n_folds=5, shuffle=True, random_state=241)
    search = GridSearchCV(
        LogisticRegression(random_state=241),
        {'C': c_candidates},
        scoring='roc_auc',
        cv=folds,
    )
    search.fit(X, y)
    return search

In [3]:
# Load the raw match tables. Both are later merged with feature tables on
# `mid`; the train table is the one whose `radiant_won` column is the target.
train_matches = pd.read_csv('data/train.csv')
test_matches = pd.read_csv('data/test.csv')

In [20]:
# Pick table: one row per match (`mid`) with one value per player slot
# (`player_0` .. `player_9`); those values are used below as hero column ids.
heroes = pd.read_csv('data/heroes.csv')
heroes.head()

Unnamed: 0,mid,player_0,player_1,player_2,player_3,player_4,player_5,player_6,player_7,player_8,player_9
0,0,91,42,87,15,65,11,6,34,69,74
1,1,69,85,71,24,64,74,68,39,65,11
2,2,17,40,31,67,99,32,7,72,48,104
3,3,80,43,101,71,94,69,70,98,24,39
4,4,25,15,75,29,95,3,32,55,64,86


In [5]:
# Names of the ten player-slot columns. This is a set, so the printed order
# is arbitrary.
players = {'player_{0}'.format(i) for i in range(10)}
# Call form of print: works on Python 2 and 3 and matches the
# print('best score: ...') calls used elsewhere in this notebook.
print(players)

set(['player_6', 'player_7', 'player_4', 'player_5', 'player_2', 'player_3', 'player_0', 'player_1', 'player_8', 'player_9'])


In [6]:
# Distinct values that occur in the first player slot.
heroes.player_0.unique()

array([ 91,  69,  17,  80,  25,  65,  64,  23,  59,  33,  81,   1,  26,
        15,   6,  36,  55,  77,   8,  94,  50,  92,  48,  93,  32,  88,
        67,  39, 101,  57,  49, 110,  19,  82,  20,  30,  11,  31,  22,
        60,  35,  29,  54,  75,  74,  96,  46, 104,  70,  44,  84,  16,
        27, 103,  34,  95,  86,  97,   4,  47, 108,  90,  72,  13, 109,
        43,  24,  71,   2,  89,  40,  87,   9,  78,  76, 106,  68,  99,
         3,  66,  58,  41,  42,  85,  53, 107,  79, 105,  52,  56,   5,
        14,  45, 102,  18,  37,  63,  61,  12,  10,  98,  83,  21,  28,
        38,  73,  51, 100,   0,   7,  62], dtype=int64)

In [7]:
# Hero ids are used directly as column positions of the pick matrix, so it
# needs (largest id + 1) columns. The maximum is taken over all ten player
# slots: looking at player_0 alone would produce a matrix that is too narrow
# whenever the largest id only ever appears in another slot.
player_columns = ['player_{0}'.format(i) for i in range(10)]
heroes_num = int(heroes[player_columns].values.max())
print('Heroes num:  {0}'.format(heroes_num))

# One row per match, one column per hero id, all zeros until the picks are
# encoded.
X_pick = np.zeros((heroes.shape[0], heroes_num + 1))
print(X_pick)

Heroes num:  110
[[ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 ..., 
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]]


In [8]:
# Encode the picks of every match: +1 in the column of each hero picked by
# player slots 0-4, -1 in the column of each hero picked by slots 5-9.
#
# Rows are addressed by position on both sides. Looking picks up through the
# `mid` value as an index label would only be correct while the index of
# `heroes` happens to equal `mid`. Assigning a whole slot column at once also
# avoids one scalar lookup per cell.
row_positions = np.arange(heroes.shape[0])
for p in range(5):
    X_pick[row_positions, heroes['player_{0}'.format(p)].values] = 1
    X_pick[row_positions, heroes['player_{0}'.format(5 + p)].values] = -1

In [9]:
# Show the encoded pick matrix. The call form of print runs on Python 2 and 3
# and matches the print(...) calls used elsewhere in this notebook.
print(X_pick)

[[ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 ..., 
 [ 0.  0.  1. ...,  0.  1.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]]


In [10]:
# Wrap the pick matrix in a DataFrame with one `hero_<id>` column per matrix
# column. The number of columns is taken from X_pick itself instead of a
# hard-coded 111, so the labels always match the matrix width.
xpick_df = pd.DataFrame(
    X_pick,
    index=heroes.index,
    columns=['hero_{0}'.format(i) for i in range(X_pick.shape[1])],
).astype(int)

In [11]:
# Attach the match id used as the merge key further down. Row i of xpick_df
# was built from row i of `heroes`, so the id is copied positionally from
# `heroes.mid`; the row index is only a row label and equals the match id
# merely by coincidence of how heroes.csv is laid out.
xpick_df['mid'] = heroes.mid.values
xpick_df.head()

Unnamed: 0,hero_0,hero_1,hero_2,hero_3,hero_4,hero_5,hero_6,hero_7,hero_8,hero_9,...,hero_102,hero_103,hero_104,hero_105,hero_106,hero_107,hero_108,hero_109,hero_110,mid
0,0,0,0,0,0,0,-1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,-1,0,0,...,0,0,-1,0,0,0,0,0,0,2
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3
4,0,0,0,-1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4


In [19]:
# Inspect the encoded picks of the row labelled 0. `.loc` states the
# label-based lookup explicitly; `.ix` is deprecated because it silently
# switches between label and positional semantics.
xpick_df.loc[0, :]

hero_0      0
hero_1      0
hero_2      0
hero_3      0
hero_4      0
hero_5      0
hero_6     -1
hero_7      0
hero_8      0
hero_9      0
hero_10     0
hero_11    -1
hero_12     0
hero_13     0
hero_14     0
hero_15     1
hero_16     0
hero_17     0
hero_18     0
hero_19     0
hero_20     0
hero_21     0
hero_22     0
hero_23     0
hero_24     0
hero_25     0
hero_26     0
hero_27     0
hero_28     0
hero_29     0
           ..
hero_82     0
hero_83     0
hero_84     0
hero_85     0
hero_86     0
hero_87     1
hero_88     0
hero_89     0
hero_90     0
hero_91     1
hero_92     0
hero_93     0
hero_94     0
hero_95     0
hero_96     0
hero_97     0
hero_98     0
hero_99     0
hero_100    0
hero_101    0
hero_102    0
hero_103    0
hero_104    0
hero_105    0
hero_106    0
hero_107    0
hero_108    0
hero_109    0
hero_110    0
mid         0
Name: 0, dtype: int64

# Add hero labels

In [21]:
# Load the hero labels and flatten them into a 1-D array; further down it is
# indexed by hero id (`labels[hero]`).
labels = pd.read_csv('processing_tables/heroes_labels.csv')
labels = labels.values.ravel()
# Call form of print: runs on Python 2 and 3 and matches the print(...) calls
# used elsewhere in this notebook.
print(labels)

[0 1 3 3 2 3 2 2 1 1 2 1 2 1 1 1 3 3 2 2 3 2 1 3 1 1 2 1 1 1 1 1 3 2 1 1 1
 1 1 2 2 2 3 3 1 1 1 1 1 1 2 3 1 3 3 1 1 2 1 1 2 2 3 3 1 1 2 1 3 2 1 2 1 3
 1 1 2 1 1 3 2 3 1 1 3 1 2 2 2 3 2 2 2 3 3 3 2 2 2 3 2 1 1 2 2 1 3 3 3 2 3]


In [23]:
from collections import Counter

# Build per-match composition flags from the labels of the heroes picked in
# the first five player slots (player_0 .. player_4).
# NOTE(review): judging by the flag names, label 3 marks a carry and label 1
# a support -- confirm against heroes_labels.csv.
first_team_slots = ['player_{0}'.format(i) for i in range(5)]

have_carry = []
have_support = []
to_many_carry = []
all_supps = []
# The slots are selected by name instead of the positional `.ix[:, 1:6]`, so
# the result does not depend on the column order of `heroes` or on the
# deprecated `.ix` indexer.
for pick in heroes[first_team_slots].values:
    current_game_labels = Counter(labels[hero] for hero in pick)
    have_carry.append(current_game_labels[3] > 0)
    have_support.append(current_game_labels[1] > 0)
    to_many_carry.append(current_game_labels[3] > 3)
    all_supps.append(current_game_labels[1] == 5)

dummy = pd.DataFrame(index=heroes.mid)
dummy['Have_carry'] = have_carry
dummy['Have_support'] = have_support
dummy['To_many_carry'] = to_many_carry
dummy['All_supps'] = all_supps

# Cast every boolean flag to 0/1 in one step and turn `mid` into a regular
# column so the frame can be merged on it.
dummy = dummy.astype(int).reset_index()

# NOTE(review): unlike the other exports in this notebook, this file is
# written without index=None, so it also contains the row index as an unnamed
# first column. Left unchanged in case a consumer relies on that layout.
dummy.to_csv('processing_tables/dummy_heroes.csv')

In [28]:
# Peek at the first values of the all-supports flag.
dummy.All_supps.head()

0    0
1    0
2    0
3    0
4    0
Name: All_supps, dtype: int32

# Merge and save

In [16]:
# Persist the pick encoding on its own, then attach the composition flags to
# it by match id (left join keeps every row of xpick_df).
xpick_df.to_csv('processing_tables/only_heroes.csv', index=None)
xpick_df_with_dummy = xpick_df.merge(dummy, on='mid', how='left')
xpick_df_with_dummy.head()

Unnamed: 0,hero_0,hero_1,hero_2,hero_3,hero_4,hero_5,hero_6,hero_7,hero_8,hero_9,...,hero_105,hero_106,hero_107,hero_108,hero_109,hero_110,mid,Have_carry,Have_support,To_many_carry
0,0,0,0,0,0,0,-1,0,0,0,...,0,0,0,0,0,0,0,1,1,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
2,0,0,0,0,0,0,0,-1,0,0,...,0,0,0,0,0,0,2,1,1,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,3,1,1,0
4,0,0,0,-1,0,0,0,0,0,0,...,0,0,0,0,0,0,4,1,1,0


In [17]:
# Save the pick encoding together with the composition flags (no index column).
xpick_df_with_dummy.to_csv('processing_tables/heroes_with_dummy.csv', index=None)

In [45]:
# Feature set: raw match table + gold features + pick encoding, all joined on
# the match id with left joins so no match row is dropped.
gold = pd.read_csv('processing_tables/goldScoreWithoutCarry.csv')
train = (
    train_matches
    .merge(gold, on='mid', how='left')
    .merge(xpick_df, on='mid', how='left')
)
test = (
    test_matches
    .merge(gold, on='mid', how='left')
    .merge(xpick_df, on='mid', how='left')
)

In [71]:
# NOTE(review): this writes the raw pick table to the same path that the
# "Merge and save" cell uses for the encoded `xpick_df`, so whichever cell
# runs last decides what 'only_heroes.csv' contains. Confirm which content
# downstream readers expect and give the other export its own file name.
heroes.to_csv('processing_tables/only_heroes.csv', index=None)

In [46]:
# Save the current train/test feature tables (no index column).
train.to_csv('processing_tables/train_gold_heroes.csv', index=None)
test.to_csv('processing_tables/test_gold_heroes.csv', index=None)

# Modeling

In [47]:
# Split the target from the features.
# NOTE(review): only `radiant_won` is removed, so the merge key `mid` remains
# a column of X_train -- confirm that the match id is meant to be a feature.
y_train = train.radiant_won
X_train = train.drop('radiant_won', axis=1)

# Mean ROC AUC over 5 cross-validation folds for a random forest.
clf = RandomForestClassifier(
    n_estimators=500,
    max_depth=15,
    max_features=2,
    random_state=228,
    n_jobs=8,
)
cross_val_score(clf, X_train, y_train, scoring='roc_auc', cv=5).mean()

0.72600174674994766

In [32]:
# Grid-search C for logistic regression on the current feature set and report
# the per-candidate scores, the best parameters and the best score.
gs = logEstimation(X_train, y_train)
# All three reports use the call form of print, which runs on Python 2 and 3;
# the cell previously mixed print statements with a print(...) call.
print(gs.grid_scores_)
print(gs.best_params_)
print('best score: {}'.format(gs.best_score_))

[mean: 0.69775, std: 0.00544, params: {'C': 1.0000000000000001e-05}, mean: 0.71413, std: 0.00580, params: {'C': 0.0001}, mean: 0.72989, std: 0.01089, params: {'C': 0.001}, mean: 0.75073, std: 0.01082, params: {'C': 0.01}, mean: 0.75089, std: 0.01085, params: {'C': 0.10000000000000001}, mean: 0.75093, std: 0.01090, params: {'C': 1.0}]
{'C': 1.0}
best score: 0.75092996587


In [50]:
# Mean ROC AUC over 5 cross-validation folds for logistic regression (C=1).
# No separate clf.fit(...) on the full training set is needed:
# cross_val_score fits its own clone of the estimator on every fold, and
# other evaluation cells in this notebook already omit the extra fit.
clf = LogisticRegression(C=1)
np.mean(cross_val_score(clf, X_train, y_train, cv=5, scoring='roc_auc'))

0.7580958759010592

In [51]:
# Mean ROC AUC over 5 cross-validation folds for Ridge (alpha=1e-4).
# No separate clf.fit(...) on the full training set is needed:
# cross_val_score fits its own clone of the estimator on every fold, and
# other evaluation cells in this notebook already omit the extra fit.
clf = Ridge(alpha=0.0001)
np.mean(cross_val_score(clf, X_train, y_train, cv=5, scoring='roc_auc'))

0.76133928961936892

In [53]:
# Mean ROC AUC over 5 cross-validation folds for Lasso (alpha=1e-4).
# No separate clf.fit(...) on the full training set is needed:
# cross_val_score fits its own clone of the estimator on every fold, and
# other evaluation cells in this notebook already omit the extra fit.
clf = Lasso(alpha=0.0001, max_iter=6000)
np.mean(cross_val_score(clf, X_train, y_train, cv=5, scoring='roc_auc'))

0.76107113339945687

In [66]:
# Feature set: match table + gold features + pick encoding + composition
# flags, all left-joined on the match id.
gold = pd.read_csv('processing_tables/goldScoreWithoutCarry.csv')
train = (
    train_matches
    .merge(gold, on='mid', how='left')
    .merge(xpick_df, on='mid', how='left')
    .merge(dummy, on='mid', how='left')
)
test = (
    test_matches
    .merge(gold, on='mid', how='left')
    .merge(xpick_df, on='mid', how='left')
    .merge(dummy, on='mid', how='left')
)

# Features are everything except the target and the `To_many_carry` flag.
y_train = train.radiant_won
X_train = train.drop(['radiant_won', 'To_many_carry'], axis=1)

In [67]:
# Mean ROC AUC over 5 cross-validation folds for logistic regression (C=1).
clf = LogisticRegression(C=1)
cross_val_score(clf, X_train, y_train, scoring='roc_auc', cv=5).mean()

0.75898662501875536

In [68]:
# Mean ROC AUC over 5 cross-validation folds for Ridge (alpha=1e-4).
# No separate clf.fit(...) on the full training set is needed:
# cross_val_score fits its own clone of the estimator on every fold, and
# other evaluation cells in this notebook already omit the extra fit.
clf = Ridge(alpha=0.0001)
np.mean(cross_val_score(clf, X_train, y_train, cv=5, scoring='roc_auc'))

0.7613496538873733

In [69]:
# Mean ROC AUC over 5 cross-validation folds for Lasso (alpha=1e-4).
# No separate clf.fit(...) on the full training set is needed:
# cross_val_score fits its own clone of the estimator on every fold, and
# other evaluation cells in this notebook already omit the extra fit.
clf = Lasso(alpha=0.0001, max_iter=6000)
np.mean(cross_val_score(clf, X_train, y_train, cv=5, scoring='roc_auc'))

0.76107034784193373

# With gold score 

In [80]:
# Feature set: precomputed score/gold tables joined with the pick encoding on
# the match id (left join).
train = pd.read_csv('processing_tables/train_score_gold.csv').merge(
    xpick_df, on='mid', how='left')
test = pd.read_csv('processing_tables/test_score_gold.csv').merge(
    xpick_df, on='mid', how='left')

# Target and features (everything except the target).
y_train = train.radiant_won
X_train = train.drop('radiant_won', axis=1)

In [75]:
# Save the current train/test feature tables (no index column).
train.to_csv('processing_tables/train_goldScore_heroes.csv', index=None)
test.to_csv('processing_tables/test_goldScore_heroes.csv', index=None)

In [76]:
# Mean ROC AUC over 5 cross-validation folds for a 100-tree random forest.
clf = RandomForestClassifier(n_estimators=100, random_state=228, n_jobs=-1)
cross_val_score(clf, X_train, y_train, scoring='roc_auc', cv=5).mean()

0.71539742051466038

In [49]:
# Mean ROC AUC over 5 cross-validation folds for a 500-tree random forest
# with depth capped at 15 and 2 candidate features per split.
clf = RandomForestClassifier(
    n_estimators=500,
    max_depth=15,
    max_features=2,
    random_state=228,
    n_jobs=8,
)
cross_val_score(clf, X_train, y_train, scoring='roc_auc', cv=5).mean()

0.7268397582311632

In [51]:
# Grid-search C for logistic regression on the current feature set and report
# the per-candidate scores, the best parameters and the best score.
gs = logEstimation(X_train, y_train)
# All three reports use the call form of print, which runs on Python 2 and 3;
# the cell previously mixed print statements with a print(...) call.
print(gs.grid_scores_)
print(gs.best_params_)
print('best score: {}'.format(gs.best_score_))

[mean: 0.69528, std: 0.00766, params: {'C': 1.0000000000000001e-05}, mean: 0.72065, std: 0.00669, params: {'C': 0.0001}, mean: 0.73958, std: 0.01040, params: {'C': 0.001}, mean: 0.75598, std: 0.01253, params: {'C': 0.01}, mean: 0.75865, std: 0.01320, params: {'C': 0.10000000000000001}, mean: 0.75835, std: 0.01329, params: {'C': 1.0}]
{'C': 0.10000000000000001}
best score: 0.758645377934


In [52]:
# Mean ROC AUC over 5 cross-validation folds for logistic regression (C=0.1).
clf = LogisticRegression(C=0.1)
cross_val_score(clf, X_train, y_train, scoring='roc_auc', cv=5).mean()

0.75818852493699951

In [78]:
# Mean ROC AUC over 5 cross-validation folds for Ridge (alpha=1e-4).
clf = Ridge(alpha=0.0001)
cross_val_score(clf, X_train, y_train, scoring='roc_auc', cv=5).mean()

0.76117230821541371

In [79]:
# Mean ROC AUC over 5 cross-validation folds for Lasso (alpha=1e-4) with the
# solver limited to 4000 iterations.
clf = Lasso(alpha=0.0001, max_iter=4000)
cross_val_score(clf, X_train, y_train, scoring='roc_auc', cv=5).mean()

0.76095259485290812

In [81]:
# Same Lasso evaluation with the iteration limit raised to 6000.
clf = Lasso(alpha=0.0001, max_iter=6000)
cross_val_score(clf, X_train, y_train, scoring='roc_auc', cv=5).mean()

0.7610657057487904

# With score rank gold

In [82]:
# Feature set: precomputed score/rank/gold tables joined with the pick
# encoding on the match id (left join).
train = pd.read_csv('processing_tables/train_scoreRankGold.csv').merge(
    xpick_df, on='mid', how='left')
test = pd.read_csv('processing_tables/test_scoreRankGold.csv').merge(
    xpick_df, on='mid', how='left')

# Target and features (everything except the target).
y_train = train.radiant_won
X_train = train.drop('radiant_won', axis=1)

In [83]:
# Mean ROC AUC over 5 cross-validation folds for a 100-tree random forest.
clf = RandomForestClassifier(n_estimators=100, random_state=228, n_jobs=-1)
cross_val_score(clf, X_train, y_train, scoring='roc_auc', cv=5).mean()

0.7138418514032927

In [84]:
# Grid-search C for logistic regression on the current feature set and report
# the per-candidate scores, the best parameters and the best score.
gs = logEstimation(X_train, y_train)
# All three reports use the call form of print, which runs on Python 2 and 3;
# the cell previously mixed print statements with a print(...) call.
print(gs.grid_scores_)
print(gs.best_params_)
print('best score: {}'.format(gs.best_score_))

[mean: 0.70440, std: 0.00578, params: {'C': 1.0000000000000001e-05}, mean: 0.71755, std: 0.00642, params: {'C': 0.0001}, mean: 0.74032, std: 0.00837, params: {'C': 0.001}, mean: 0.75853, std: 0.01259, params: {'C': 0.01}, mean: 0.75957, std: 0.01325, params: {'C': 0.10000000000000001}, mean: 0.75939, std: 0.01347, params: {'C': 1.0}]
{'C': 0.10000000000000001}
best score: 0.759568180325


In [85]:
# Mean ROC AUC over 5 cross-validation folds for logistic regression (C=0.1).
clf = LogisticRegression(C=0.1)
cross_val_score(clf, X_train, y_train, scoring='roc_auc', cv=5).mean()

0.75964782691047805

In [86]:
# Mean ROC AUC over 5 cross-validation folds for Ridge (alpha=1e-4).
clf = Ridge(alpha=0.0001)
cross_val_score(clf, X_train, y_train, scoring='roc_auc', cv=5).mean()

0.76126429022575492

In [87]:
# Mean ROC AUC over 5 cross-validation folds for Lasso (alpha=1e-4).
clf = Lasso(alpha=0.0001, max_iter=6000)
cross_val_score(clf, X_train, y_train, scoring='roc_auc', cv=5).mean()

0.76096374322755056

In [88]:
# Save the current train/test feature tables (no index column).
train.to_csv('processing_tables/train_goldRankScore_heroes.csv', index=None)
test.to_csv('processing_tables/test_goldRankScore_heroes.csv', index=None)

# With normalized gold and scores

In [56]:
# Feature set: precomputed `ns_gold` tables joined with the pick encoding on
# the match id (left join).
train = pd.read_csv('processing_tables/train_ns_gold.csv').merge(
    xpick_df, on='mid', how='left')
test = pd.read_csv('processing_tables/test_ns_gold.csv').merge(
    xpick_df, on='mid', how='left')

# Target and features (everything except the target).
y_train = train.radiant_won
X_train = train.drop('radiant_won', axis=1)

In [57]:
# Grid-search C for logistic regression on the current feature set and report
# the per-candidate scores, the best parameters and the best score.
gs = logEstimation(X_train, y_train)
# All three reports use the call form of print, which runs on Python 2 and 3;
# the cell previously mixed print statements with a print(...) call.
print(gs.grid_scores_)
print(gs.best_params_)
print('best score: {}'.format(gs.best_score_))

[mean: 0.70706, std: 0.00621, params: {'C': 1.0000000000000001e-05}, mean: 0.71932, std: 0.00641, params: {'C': 0.0001}, mean: 0.74053, std: 0.00886, params: {'C': 0.001}, mean: 0.75480, std: 0.01156, params: {'C': 0.01}, mean: 0.75802, std: 0.01320, params: {'C': 0.10000000000000001}, mean: 0.75779, std: 0.01357, params: {'C': 1.0}]
{'C': 0.10000000000000001}
best score: 0.758023993849


In [58]:
# Mean ROC AUC over 5 cross-validation folds for logistic regression.
# NOTE(review): the grid search output just above reports C=0.1 as the best
# value for this feature set, yet C=1 is evaluated here. The value is kept as
# it was -- confirm whether C=0.1 was intended.
# No separate clf.fit(...) on the full training set is needed:
# cross_val_score fits its own clone of the estimator on every fold, and
# other evaluation cells in this notebook already omit the extra fit.
clf = LogisticRegression(C=1)
np.mean(cross_val_score(clf, X_train, y_train, cv=5, scoring='roc_auc'))

0.75787616643120415

In [59]:
# Mean ROC AUC over 5 cross-validation folds for Ridge (alpha=1e-4).
# No separate clf.fit(...) on the full training set is needed:
# cross_val_score fits its own clone of the estimator on every fold, and
# other evaluation cells in this notebook already omit the extra fit.
clf = Ridge(alpha=0.0001)
np.mean(cross_val_score(clf, X_train, y_train, cv=5, scoring='roc_auc'))

0.7610980369989182

In [60]:
# Mean ROC AUC over 5 cross-validation folds for Lasso (alpha=1e-4).
# No separate clf.fit(...) on the full training set is needed:
# cross_val_score fits its own clone of the estimator on every fold, and
# other evaluation cells in this notebook already omit the extra fit.
clf = Lasso(alpha=0.0001, max_iter=4000)
np.mean(cross_val_score(clf, X_train, y_train, cv=5, scoring='roc_auc'))

0.76089173776232444

In [62]:
# Feature matrix without the target and without the two gold features.
# It is rebuilt from `train` rather than stripped from X_train in place, so
# the cell is idempotent: an in-place drop fails on a second run because the
# columns are already gone.
X_train = train.drop(['radiant_won', 'diff_gold', 'ratio_gold'], axis=1)

In [63]:
# Grid-search C for logistic regression on the current feature set and report
# the per-candidate scores, the best parameters and the best score.
gs = logEstimation(X_train, y_train)
# All three reports use the call form of print, which runs on Python 2 and 3;
# the cell previously mixed print statements with a print(...) call.
print(gs.grid_scores_)
print(gs.best_params_)
print('best score: {}'.format(gs.best_score_))

[mean: 0.70202, std: 0.00636, params: {'C': 1.0000000000000001e-05}, mean: 0.72062, std: 0.00656, params: {'C': 0.0001}, mean: 0.73894, std: 0.01061, params: {'C': 0.001}, mean: 0.75586, std: 0.01267, params: {'C': 0.01}, mean: 0.75808, std: 0.01320, params: {'C': 0.10000000000000001}, mean: 0.75786, std: 0.01330, params: {'C': 1.0}]
{'C': 0.10000000000000001}
best score: 0.758084203985


In [64]:
# Mean ROC AUC over 5 cross-validation folds for logistic regression (C=0.1).
# No separate clf.fit(...) on the full training set is needed:
# cross_val_score fits its own clone of the estimator on every fold, and
# other evaluation cells in this notebook already omit the extra fit.
clf = LogisticRegression(C=0.1)
np.mean(cross_val_score(clf, X_train, y_train, cv=5, scoring='roc_auc'))

0.75863719891860515

In [65]:
# Mean ROC AUC over 5 cross-validation folds for Ridge (alpha=1e-4).
# No separate clf.fit(...) on the full training set is needed:
# cross_val_score fits its own clone of the estimator on every fold, and
# other evaluation cells in this notebook already omit the extra fit.
clf = Ridge(alpha=0.0001)
np.mean(cross_val_score(clf, X_train, y_train, cv=5, scoring='roc_auc'))

0.76110828286893872

In [66]:
# Fit Lasso (alpha=1e-4) on the full training set, then report its mean ROC
# AUC over 5 cross-validation folds.
# NOTE(review): the full-data fit does not influence the cross-validated
# score; it is kept in case later cells use the fitted `clf`.
clf = Lasso(max_iter=4000, alpha=0.0001)
clf.fit(X_train, y_train)
cross_val_score(clf, X_train, y_train, scoring='roc_auc', cv=5).mean()

0.76086964031378523