In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import KFold

from sklearn.cross_validation import cross_val_score

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Lasso, Ridge

%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
def normalize_data(X):
    return pd.DataFrame(
        StandardScaler().fit_transform(X),
        index = X.index, 
        columns=X.columns
    )

def logEstimation(X, y):
    grid = {'C': np.power(10.0, np.arange(-5, 1))}
    kf=KFold(y.size, n_folds=5, shuffle=True, random_state=241)
    clf=LogisticRegression(random_state=241)
    gs = GridSearchCV(clf, grid, scoring='roc_auc', cv=kf)
    gs.fit(X, y)
    return gs

In [3]:
train_matches = pd.read_csv('data/train.csv')
test_matches = pd.read_csv('data/test.csv')

In [4]:
gold = pd.read_csv('data/gold.csv', index_col='mid')
gold = gold[gold.times == 600]
gold.drop('times', 1, inplace=True)

gold.head()

Unnamed: 0_level_0,player_0,player_1,player_2,player_3,player_4,player_5,player_6,player_7,player_8,player_9
mid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,3454,5206,2613,4426,5755,4072,3997,5917,1725,6384
1,2477,5760,3816,4353,5759,7659,5066,2748,4440,4623
2,3604,1948,8581,4390,2869,3096,2301,5130,2530,2491
3,3457,5464,4432,2961,4314,3345,4791,1906,5328,2247
4,3675,4103,5154,3030,2076,3920,3494,3392,4458,2220


# Preprocessing

In [5]:
norm_gold = pd.DataFrame(index=gold.index)
for col in gold.columns:
    norm_gold['gold_' + col] = gold[col]
norm_gold.head()

Unnamed: 0_level_0,gold_player_0,gold_player_1,gold_player_2,gold_player_3,gold_player_4,gold_player_5,gold_player_6,gold_player_7,gold_player_8,gold_player_9
mid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,3454,5206,2613,4426,5755,4072,3997,5917,1725,6384
1,2477,5760,3816,4353,5759,7659,5066,2748,4440,4623
2,3604,1948,8581,4390,2869,3096,2301,5130,2530,2491
3,3457,5464,4432,2961,4314,3345,4791,1906,5328,2247
4,3675,4103,5154,3030,2076,3920,3494,3392,4458,2220


In [6]:
radiant_gold = gold[['player_0', 'player_1', 'player_2', 'player_3', 'player_4']].sum(axis=1)
dire_gold = gold[['player_5', 'player_6', 'player_7', 'player_8', 'player_9']].sum(axis=1)
radiant_carry_gold = gold[['player_0', 'player_1', 'player_2', 'player_3', 'player_4']].max(axis=1)
dire_carry_gold = gold[['player_5', 'player_6', 'player_7', 'player_8', 'player_9']].max(axis=1)

In [7]:
norm_gold['radiant_gold'] = radiant_gold
norm_gold['dire_gold'] = dire_gold
norm_gold['radiant_carry_gold'] = radiant_carry_gold
norm_gold['dire_carry_gold'] = dire_carry_gold
# norm_gold['diff_gold'] = radiant_gold - dire_gold
norm_gold.head()

Unnamed: 0_level_0,gold_player_0,gold_player_1,gold_player_2,gold_player_3,gold_player_4,gold_player_5,gold_player_6,gold_player_7,gold_player_8,gold_player_9,radiant_gold,dire_gold,radiant_carry_gold,dire_carry_gold
mid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,3454,5206,2613,4426,5755,4072,3997,5917,1725,6384,21454,22095,5755,6384
1,2477,5760,3816,4353,5759,7659,5066,2748,4440,4623,22165,24536,5760,7659
2,3604,1948,8581,4390,2869,3096,2301,5130,2530,2491,21392,15548,8581,5130
3,3457,5464,4432,2961,4314,3345,4791,1906,5328,2247,20628,17617,5464,5328
4,3675,4103,5154,3030,2076,3920,3494,3392,4458,2220,18038,17484,5154,4458


In [8]:
norm_gold = normalize_data(norm_gold)
norm_gold.head()

Unnamed: 0_level_0,gold_player_0,gold_player_1,gold_player_2,gold_player_3,gold_player_4,gold_player_5,gold_player_6,gold_player_7,gold_player_8,gold_player_9,radiant_gold,dire_gold,radiant_carry_gold,dire_carry_gold
mid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,-0.460819,0.88362,-1.022316,0.302363,1.318719,0.005361,0.009184,1.420892,-1.704481,1.827721,0.524044,0.801772,0.007513,0.665735
1,-1.176482,1.291373,-0.146754,0.248927,1.321671,2.659065,0.808769,-0.933944,0.328328,0.498696,0.796655,1.744066,0.012183,1.882619
2,-0.350943,-1.514322,3.321283,0.276011,-0.811136,-0.716695,-1.25938,0.836084,-1.101751,-1.110322,0.500272,-1.725554,2.64668,-0.531107
3,-0.458622,1.073512,0.301579,-0.770018,0.255267,-0.532482,0.603076,-1.559622,0.993203,-1.294469,0.20734,-0.926862,-0.264248,-0.342132
4,-0.298934,0.071793,0.827062,-0.71951,-1.396367,-0.10709,-0.367047,-0.455398,0.341805,-1.314846,-0.785714,-0.978204,-0.553753,-1.172477


In [9]:
norm_gold['ratio_gold'] = radiant_gold / dire_gold
norm_gold.head()

Unnamed: 0_level_0,gold_player_0,gold_player_1,gold_player_2,gold_player_3,gold_player_4,gold_player_5,gold_player_6,gold_player_7,gold_player_8,gold_player_9,radiant_gold,dire_gold,radiant_carry_gold,dire_carry_gold,ratio_gold
mid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0,-0.460819,0.88362,-1.022316,0.302363,1.318719,0.005361,0.009184,1.420892,-1.704481,1.827721,0.524044,0.801772,0.007513,0.665735,0.970989
1,-1.176482,1.291373,-0.146754,0.248927,1.321671,2.659065,0.808769,-0.933944,0.328328,0.498696,0.796655,1.744066,0.012183,1.882619,0.903366
2,-0.350943,-1.514322,3.321283,0.276011,-0.811136,-0.716695,-1.25938,0.836084,-1.101751,-1.110322,0.500272,-1.725554,2.64668,-0.531107,1.375868
3,-0.458622,1.073512,0.301579,-0.770018,0.255267,-0.532482,0.603076,-1.559622,0.993203,-1.294469,0.20734,-0.926862,-0.264248,-0.342132,1.170914
4,-0.298934,0.071793,0.827062,-0.71951,-1.396367,-0.10709,-0.367047,-0.455398,0.341805,-1.314846,-0.785714,-0.978204,-0.553753,-1.172477,1.031686


In [10]:
norm_gold = norm_gold.reset_index()
norm_gold.head()

Unnamed: 0,mid,gold_player_0,gold_player_1,gold_player_2,gold_player_3,gold_player_4,gold_player_5,gold_player_6,gold_player_7,gold_player_8,gold_player_9,radiant_gold,dire_gold,radiant_carry_gold,dire_carry_gold,ratio_gold
0,0,-0.460819,0.88362,-1.022316,0.302363,1.318719,0.005361,0.009184,1.420892,-1.704481,1.827721,0.524044,0.801772,0.007513,0.665735,0.970989
1,1,-1.176482,1.291373,-0.146754,0.248927,1.321671,2.659065,0.808769,-0.933944,0.328328,0.498696,0.796655,1.744066,0.012183,1.882619,0.903366
2,2,-0.350943,-1.514322,3.321283,0.276011,-0.811136,-0.716695,-1.25938,0.836084,-1.101751,-1.110322,0.500272,-1.725554,2.64668,-0.531107,1.375868
3,3,-0.458622,1.073512,0.301579,-0.770018,0.255267,-0.532482,0.603076,-1.559622,0.993203,-1.294469,0.20734,-0.926862,-0.264248,-0.342132,1.170914
4,4,-0.298934,0.071793,0.827062,-0.71951,-1.396367,-0.10709,-0.367047,-0.455398,0.341805,-1.314846,-0.785714,-0.978204,-0.553753,-1.172477,1.031686


In [11]:
norm_gold.to_csv('processing_tables/norm_gold.csv', index=None)

# New features

In [40]:
new_gold = pd.read_csv('data/gold.csv', index_col='mid')
new_gold = new_gold[new_gold.times == 600]
new_gold.drop('times', 1, inplace=True)
new_gold.head()

Unnamed: 0_level_0,player_0,player_1,player_2,player_3,player_4,player_5,player_6,player_7,player_8,player_9
mid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,3454,5206,2613,4426,5755,4072,3997,5917,1725,6384
1,2477,5760,3816,4353,5759,7659,5066,2748,4440,4623
2,3604,1948,8581,4390,2869,3096,2301,5130,2530,2491
3,3457,5464,4432,2961,4314,3345,4791,1906,5328,2247
4,3675,4103,5154,3030,2076,3920,3494,3392,4458,2220


In [41]:
heroes = pd.read_csv('data/heroes.csv', index_col='mid')
heroes.head()

Unnamed: 0_level_0,player_0,player_1,player_2,player_3,player_4,player_5,player_6,player_7,player_8,player_9
mid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,91,42,87,15,65,11,6,34,69,74
1,69,85,71,24,64,74,68,39,65,11
2,17,40,31,67,99,32,7,72,48,104
3,80,43,101,71,94,69,70,98,24,39
4,25,15,75,29,95,3,32,55,64,86


In [42]:
heroes.max()[0]

110

In [48]:
mean_gold = np.zeros(111)
for hero_num in range(111):
    hero_values = []
    for player_num in range(10):
        colname = 'player_{}'.format(player_num)
        hero_index = heroes.loc[heroes[colname] == hero_num].index
        hero_values.extend(new_gold.loc[hero_index][colname].values)
    mean_gold[hero_num] = int(np.mean(hero_values))

mean_gold[0] = 100000
print mean_gold

[ 100000.    4793.    4149.    3651.    3124.    3789.    3245.    3224.
    4364.    4589.    3101.    4808.    3217.    4635.    4535.    5610.
    3314.    4150.    3062.    2610.    3785.    3000.    4603.    3873.
    4939.    4383.    3343.    4516.    4352.    4605.    4443.    4447.
    3635.    3305.    5088.    4666.    4527.    4603.    4285.    2838.
    2793.    3345.    3678.    3807.    4264.    4556.    4426.    4225.
    4485.    4493.    2736.    4022.    4648.    4187.    4117.    4461.
    4843.    3359.    4398.    4403.    2970.    2825.    3845.    3786.
    4562.    4364.    2684.    4646.    4162.    2895.    4601.    3486.
    4865.    4075.    6446.    4589.    3293.    4323.    4425.    4224.
    2965.    4139.    4968.    4354.    4374.    4719.    2680.    2754.
    2921.    3712.    2940.    2838.    2795.    3290.    3938.    3838.
    3374.    3016.    2959.    3370.    3129.    4483.    4430.    2908.
    2873.    4453.    3710.    4110.    3927.    28

mean_gold == 0 - соответствует вышедшему из игры человеку

In [49]:
mean_predict_gold = heroes.apply(lambda x: mean_gold[x])
mean_predict_gold.head()

Unnamed: 0_level_0,player_0,player_1,player_2,player_3,player_4,player_5,player_6,player_7,player_8,player_9
mid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,2838.0,3678.0,2754.0,5610.0,4364.0,4808.0,3245.0,5088.0,2895.0,6446.0
1,2895.0,4719.0,3486.0,4939.0,4562.0,6446.0,4162.0,2838.0,4364.0,4808.0
2,4150.0,2793.0,4447.0,4646.0,3370.0,3635.0,3224.0,4865.0,4485.0,2873.0
3,2965.0,3807.0,4483.0,3486.0,3938.0,2895.0,4601.0,2959.0,4939.0,2838.0
4,4383.0,5610.0,4589.0,4605.0,3838.0,3651.0,3635.0,4461.0,4562.0,2680.0


In [50]:
temp_gold_score = new_gold / mean_predict_gold
gold_score_by_mean = pd.DataFrame(index=temp_gold_score.index)
for col in temp_gold_score.columns:
#     gold_score_by_mean['gold_score_' + col] = StandardScaler().fit_transform(temp_gold_score[col].reshape(-1, 1))
    gold_score_by_mean['gold_score_' + col] = temp_gold_score[col]

# gold_score_by_mean['radiant_norm_gold'] = norm_gold.radiant_gold
# gold_score_by_mean['dire_norm_gold'] = norm_gold.dire_gold
# gold_score_by_mean['radiant_carry_norm_gold'] = norm_gold.radiant_carry_gold
# gold_score_by_mean['dire_carry_norm_gold'] = norm_gold.dire_carry_gold

gold_score_by_mean['radiant_best_gold_score'] = gold_score_by_mean.iloc[:,1:6].max(1)
gold_score_by_mean['dire_best_gold_score'] = gold_score_by_mean.iloc[:,6:11].max(1)
gold_score_by_mean['radiant_sum_gold_score'] = StandardScaler().fit_transform(gold_score_by_mean.iloc[:,1:6].sum(1).reshape(-1, 1))
gold_score_by_mean['dire_sum_gold_score'] = StandardScaler().fit_transform(gold_score_by_mean.iloc[:,6:11].sum(1).reshape(-1, 1))

# gold_score_by_mean = normalize_data(gold_score_by_mean)
gold_score_by_mean.reset_index(level=0, inplace=True)
gold_score_by_mean.head()

Unnamed: 0,mid,gold_score_player_0,gold_score_player_1,gold_score_player_2,gold_score_player_3,gold_score_player_4,gold_score_player_5,gold_score_player_6,gold_score_player_7,gold_score_player_8,gold_score_player_9,radiant_best_gold_score,dire_best_gold_score,radiant_sum_gold_score,dire_sum_gold_score
0,0,1.217054,1.415443,0.948802,0.788948,1.318744,0.846922,1.231741,1.162932,0.595855,0.990382,1.415443,1.415443,0.499345,0.146006
1,1,0.855613,1.220598,1.094664,0.881353,1.262385,1.188179,1.217203,0.968288,1.017415,0.961522,1.262385,1.262385,1.015678,0.193635
2,2,0.868434,0.697458,1.929615,0.944899,0.851335,0.851719,0.71371,1.054471,0.564103,0.867038,1.929615,1.929615,0.430412,-0.27214
3,3,1.165936,1.435251,0.988624,0.849398,1.09548,1.15544,1.041295,0.644137,1.078761,0.791755,1.435251,1.435251,0.822264,-0.487514
4,4,0.838467,0.731373,1.123121,0.65798,0.540907,1.073678,0.96121,0.760368,0.977203,0.828358,1.123121,1.123121,-1.374948,-1.020622


In [51]:
norm_gold.head()

Unnamed: 0,mid,gold_player_0,gold_player_1,gold_player_2,gold_player_3,gold_player_4,gold_player_5,gold_player_6,gold_player_7,gold_player_8,gold_player_9,radiant_gold,dire_gold,radiant_carry_gold,dire_carry_gold,ratio_gold
0,0,-0.460819,0.88362,-1.022316,0.302363,1.318719,0.005361,0.009184,1.420892,-1.704481,1.827721,0.524044,0.801772,0.007513,0.665735,0.970989
1,1,-1.176482,1.291373,-0.146754,0.248927,1.321671,2.659065,0.808769,-0.933944,0.328328,0.498696,0.796655,1.744066,0.012183,1.882619,0.903366
2,2,-0.350943,-1.514322,3.321283,0.276011,-0.811136,-0.716695,-1.25938,0.836084,-1.101751,-1.110322,0.500272,-1.725554,2.64668,-0.531107,1.375868
3,3,-0.458622,1.073512,0.301579,-0.770018,0.255267,-0.532482,0.603076,-1.559622,0.993203,-1.294469,0.20734,-0.926862,-0.264248,-0.342132,1.170914
4,4,-0.298934,0.071793,0.827062,-0.71951,-1.396367,-0.10709,-0.367047,-0.455398,0.341805,-1.314846,-0.785714,-0.978204,-0.553753,-1.172477,1.031686


In [47]:
pd.merge(gold_score_by_mean, norm_gold, on='mid').to_csv('processing_tables/old_gold.csv', index=None)

In [18]:
gold_score_by_mean.to_csv('processing_tables/goldScoreWithCarry.csv', index=None)

In [31]:
cropped_gold_score = gold_score_by_mean.copy()
cropped_gold_score.drop(['radiant_carry_norm_gold', 'dire_carry_norm_gold'], 1)
cropped_gold_score.head()
cropped_gold_score.to_csv('processing_tables/goldScoreWithoutCarry.csv', index=None)

# Test only norm gold

In [19]:
train = pd.merge(train_matches, norm_gold, on='mid', how='left')
test = pd.merge(test_matches, norm_gold, on='mid', how='left')
X_train = train.drop('radiant_won', 1)
y_train = train.radiant_won

In [20]:
norm_gold.to_csv('processing_tables/norm_gold.csv', index=None)
train.to_csv('processing_tables/train_carry_gold.csv', index=None)
test.to_csv('processing_tables/test_carry_gold.csv', index=None)

In [21]:
clf = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=228)
np.mean(cross_val_score(clf, X_train, y_train, cv=5, scoring='roc_auc'))

0.68262955795920532

In [46]:
gs = logEstimation(X_train, y_train)
print gs.grid_scores_
print gs.best_params_
print('best score: {}'.format(gs.best_score_))

[mean: 0.68967, std: 0.00681, params: {'C': 1.0000000000000001e-05}, mean: 0.70764, std: 0.00645, params: {'C': 0.0001}, mean: 0.70784, std: 0.00654, params: {'C': 0.001}, mean: 0.70785, std: 0.00655, params: {'C': 0.01}, mean: 0.70785, std: 0.00655, params: {'C': 0.10000000000000001}, mean: 0.70785, std: 0.00656, params: {'C': 1.0}]
{'C': 0.01}
best score: 0.707850412385


In [48]:
clf=LogisticRegression(random_state=241, C=0.01)
np.mean(cross_val_score(clf, X_train, y_train, cv=5, scoring='roc_auc'))

0.70773748356589905

In [49]:
clf=Ridge(alpha=0.0001)
np.mean(cross_val_score(clf, X_train, y_train, cv=5, scoring='roc_auc'))

0.70800015376238057

In [50]:
clf=Lasso(alpha=0.0001, max_iter=6000)
np.mean(cross_val_score(clf, X_train, y_train, cv=5, scoring='roc_auc'))

0.70802949628560896

# Test only gold score

In [51]:
train = pd.merge(train_matches, gold_score_by_mean, on='mid', how='left')
test = pd.merge(test_matches, gold_score_by_mean, on='mid', how='left')
X_train = train.drop('radiant_won', 1)
y_train = train.radiant_won

In [52]:
clf = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=228)
np.mean(cross_val_score(clf, X_train, y_train, cv=5, scoring='roc_auc'))

0.69685786733527155

In [53]:
gs = logEstimation(X_train, y_train)
print gs.grid_scores_
print gs.best_params_
print('best score: {}'.format(gs.best_score_))

[mean: 0.67999, std: 0.00792, params: {'C': 1.0000000000000001e-05}, mean: 0.70581, std: 0.00620, params: {'C': 0.0001}, mean: 0.71012, std: 0.00592, params: {'C': 0.001}, mean: 0.71094, std: 0.00598, params: {'C': 0.01}, mean: 0.71096, std: 0.00597, params: {'C': 0.10000000000000001}, mean: 0.71095, std: 0.00597, params: {'C': 1.0}]
{'C': 0.10000000000000001}
best score: 0.710962365258


In [54]:
clf=LogisticRegression(random_state=241, C=0.1)
np.mean(cross_val_score(clf, X_train, y_train, cv=5, scoring='roc_auc'))

0.71081699986728908

In [55]:
clf=Ridge(alpha=0.0001)
np.mean(cross_val_score(clf, X_train, y_train, cv=5, scoring='roc_auc'))

0.71334095613179094

In [56]:
clf=Lasso(alpha=0.0001, max_iter=4000)
np.mean(cross_val_score(clf, X_train, y_train, cv=5, scoring='roc_auc'))

0.7133690808550458

In [57]:
train.to_csv('processing_tables/train_score_gold.csv', index=None)
test.to_csv('processing_tables/test_score_gold.csv', index=None)
gold_score_by_mean.to_csv('processing_tables/gold_scores.csv', index=None)

# RANKS

## NormByRowsRanks

In [19]:
gold = pd.read_csv('data/gold.csv', index_col='mid')
gold = gold[gold.times == 600]
gold.drop('times', 1, inplace=True)

In [20]:
radiant_rank_gold = pd.DataFrame(data = np.sort(gold.values[:,:5]),
                                 index=gold.index,
                                 columns = ['radiant_gold_rank_{0}'.format(i) for i in range(5)]) 
# radiant_rank_gold = normalize_data(radiant_rank_gold)

radiant_rank_gold.reset_index(inplace=True)
radiant_rank_gold.head()

Unnamed: 0,mid,radiant_gold_rank_0,radiant_gold_rank_1,radiant_gold_rank_2,radiant_gold_rank_3,radiant_gold_rank_4
0,0,2613,3454,4426,5206,5755
1,1,2477,3816,4353,5759,5760
2,2,1948,2869,3604,4390,8581
3,3,2961,3457,4314,4432,5464
4,4,2076,3030,3675,4103,5154


In [21]:
dire_rank_gold = pd.DataFrame(data = np.sort(gold.values[:,5:10]),
                                 index=gold.index,
                                 columns = ['dire_gold_rank_{0}'.format(i) for i in range(5)]) 
# dire_rank_gold = normalize_data(dire_rank_gold)

dire_rank_gold.reset_index(inplace=True)

dire_rank_gold.head()

Unnamed: 0,mid,dire_gold_rank_0,dire_gold_rank_1,dire_gold_rank_2,dire_gold_rank_3,dire_gold_rank_4
0,0,1725,3997,4072,5917,6384
1,1,2748,4440,4623,5066,7659
2,2,2301,2491,2530,3096,5130
3,3,1906,2247,3345,4791,5328
4,4,2220,3392,3494,3920,4458


In [22]:
rank_gold = pd.merge(radiant_rank_gold, dire_rank_gold, on='mid', how='left')
rank_gold.head()

Unnamed: 0,mid,radiant_gold_rank_0,radiant_gold_rank_1,radiant_gold_rank_2,radiant_gold_rank_3,radiant_gold_rank_4,dire_gold_rank_0,dire_gold_rank_1,dire_gold_rank_2,dire_gold_rank_3,dire_gold_rank_4
0,0,2613,3454,4426,5206,5755,1725,3997,4072,5917,6384
1,1,2477,3816,4353,5759,5760,2748,4440,4623,5066,7659
2,2,1948,2869,3604,4390,8581,2301,2491,2530,3096,5130
3,3,2961,3457,4314,4432,5464,1906,2247,3345,4791,5328
4,4,2076,3030,3675,4103,5154,2220,3392,3494,3920,4458


In [23]:
norm_rank_gold = pd.DataFrame(rank_gold.mid)
row_sum = rank_gold.drop('mid', 1).sum(1)
for col in rank_gold.columns[1:]:
    norm_rank_gold[col] = rank_gold[col] / row_sum
norm_rank_gold.ix[:,1:] = normalize_data(norm_rank_gold.drop('mid', 1))
norm_rank_gold.head()

Unnamed: 0,mid,radiant_gold_rank_0,radiant_gold_rank_1,radiant_gold_rank_2,radiant_gold_rank_3,radiant_gold_rank_4,dire_gold_rank_0,dire_gold_rank_1,dire_gold_rank_2,dire_gold_rank_3,dire_gold_rank_4
0,0,-0.264298,0.003371,0.286861,0.114283,-0.486818,-1.982586,0.917339,-0.275545,1.161705,0.214201
1,1,-0.846838,0.180427,-0.292264,0.347176,-0.872878,-0.374487,1.161269,0.106051,-0.5411,0.989521
2,2,-0.872392,-0.118177,0.007082,0.070942,3.900556,-0.086196,-0.887209,-2.014996,-2.074395,-0.129632
3,3,1.193386,0.821186,1.054641,-0.111582,-0.017247,-1.127523,-1.530593,-0.695655,0.502861,-0.110102
4,4,-0.394726,0.445302,0.412301,-0.134949,0.080223,-0.068948,1.19219,0.06225,-0.424432,-0.725317


In [24]:
norm_rank_gold.to_csv('processing_tables/normByRowsRankGold.csv', index=None)

In [57]:
train = pd.merge(norm_rank_gold, train_matches, on='mid', how='right')
test = pd.merge(norm_rank_gold, test_matches, on='mid', how='right')
X_train = train.drop('radiant_won', 1)
y_train = train.radiant_won

In [58]:
clf = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=228)
np.mean(cross_val_score(clf, X_train, y_train, cv=5, scoring='roc_auc'))

0.68176935801531036

In [59]:
gs = logEstimation(X_train, y_train)
print gs.grid_scores_
print gs.best_params_
print('best score: {}'.format(gs.best_score_))

[mean: 0.69120, std: 0.00482, params: {'C': 1.0000000000000001e-05}, mean: 0.70635, std: 0.00514, params: {'C': 0.0001}, mean: 0.70820, std: 0.00582, params: {'C': 0.001}, mean: 0.70832, std: 0.00599, params: {'C': 0.01}, mean: 0.70832, std: 0.00601, params: {'C': 0.10000000000000001}, mean: 0.70832, std: 0.00601, params: {'C': 1.0}]
{'C': 1.0}
best score: 0.708323548422


In [60]:
clf=LogisticRegression(random_state=241, C=0.001)
np.mean(cross_val_score(clf, X_train, y_train, cv=5, scoring='roc_auc'))

0.70845337984152956

In [61]:
clf=Ridge(alpha=0.0001)
np.mean(cross_val_score(clf, X_train, y_train, cv=5, scoring='roc_auc'))

0.70850717633608307

# NormByRowsRanks and scores

In [62]:
train = pd.merge(norm_rank_gold, train_matches, on='mid', how='right')
test = pd.merge(norm_rank_gold, test_matches, on='mid', how='right')

train = pd.merge(train, gold_score_by_mean, on='mid', how='left')
test = pd.merge(test, gold_score_by_mean, on='mid', how='left')

X_train = train.drop(['radiant_carry_norm_gold', 'dire_carry_norm_gold'], 1)
X_test = test.drop(['radiant_carry_norm_gold', 'dire_carry_norm_gold'], 1)

X_train = train.drop('radiant_won', 1)
y_train = train.radiant_won

In [63]:
clf = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=228)
np.mean(cross_val_score(clf, X_train, y_train, cv=5, scoring='roc_auc'))

0.69825666646363904

In [64]:
gs = logEstimation(X_train, y_train)
print gs.grid_scores_
print gs.best_params_
print('best score: {}'.format(gs.best_score_))

[mean: 0.70311, std: 0.00541, params: {'C': 1.0000000000000001e-05}, mean: 0.70920, std: 0.00572, params: {'C': 0.0001}, mean: 0.70985, std: 0.00570, params: {'C': 0.001}, mean: 0.71021, std: 0.00548, params: {'C': 0.01}, mean: 0.71004, std: 0.00562, params: {'C': 0.10000000000000001}, mean: 0.71003, std: 0.00563, params: {'C': 1.0}]
{'C': 0.01}
best score: 0.710213914303


In [65]:
clf=LogisticRegression(random_state=241, C=0.01)
np.mean(cross_val_score(clf, X_train, y_train, cv=5, scoring='roc_auc'))

0.71098463805857992

In [66]:
clf=Ridge(alpha=0.0001)
np.mean(cross_val_score(clf, X_train, y_train, cv=5, scoring='roc_auc'))

0.71404182454961085

In [67]:
clf=Lasso(alpha=0.0001, max_iter=6000)
np.mean(cross_val_score(clf, X_train, y_train, cv=5, scoring='roc_auc'))

0.7141212821701044

## Just rank gold and gold scores

In [32]:
rank_gold.head()

Unnamed: 0,mid,radiant_gold_rank_0,radiant_gold_rank_1,radiant_gold_rank_2,radiant_gold_rank_3,radiant_gold_rank_4,dire_gold_rank_0,dire_gold_rank_1,dire_gold_rank_2,dire_gold_rank_3,dire_gold_rank_4
0,0,2613,3454,4426,5206,5755,1725,3997,4072,5917,6384
1,1,2477,3816,4353,5759,5760,2748,4440,4623,5066,7659
2,2,1948,2869,3604,4390,8581,2301,2491,2530,3096,5130
3,3,2961,3457,4314,4432,5464,1906,2247,3345,4791,5328
4,4,2076,3030,3675,4103,5154,2220,3392,3494,3920,4458


In [33]:
stupid_norm_rank_gold = rank_gold.copy()
stupid_norm_rank_gold.ix[:,1:] = normalize_data(rank_gold.drop('mid', 1))
stupid_norm_rank_gold.head()

Unnamed: 0,mid,radiant_gold_rank_0,radiant_gold_rank_1,radiant_gold_rank_2,radiant_gold_rank_3,radiant_gold_rank_4,dire_gold_rank_0,dire_gold_rank_1,dire_gold_rank_2,dire_gold_rank_3,dire_gold_rank_4
0,0,0.155574,0.432631,0.74358,0.631293,0.007513,-1.529347,1.287882,0.236226,1.596343,0.665735
1,1,-0.101736,1.006005,0.638412,1.347826,0.012183,0.396231,1.991675,1.038963,0.482048,1.882619
2,2,-1.102595,-0.493955,-0.440642,-0.426014,2.64668,-0.445151,-1.104698,-2.01027,-2.097459,-0.531107
3,3,0.813984,0.437382,0.582226,-0.371594,-0.264248,-1.188654,-1.49234,-0.822919,0.121965,-0.342132
4,4,-0.860421,-0.238946,-0.338355,-0.797886,-0.553753,-0.597616,0.326719,-0.605846,-1.018518,-1.172477


In [39]:
stupid_norm_rank_gold.to_csv('processing_tables/normByColsRankGold.csv', index=None)

In [78]:
train = pd.merge(train_matches, stupid_norm_rank_gold, on='mid', how='left')
test = pd.merge(test_matches, stupid_norm_rank_gold, on='mid', how='left')
train = pd.merge(train, gold_score_by_mean, on='mid', how='left')
test = pd.merge(test, gold_score_by_mean, on='mid', how='left')
X_train = train.drop(['radiant_carry_norm_gold', 'dire_carry_norm_gold'], 1)
X_test = test.drop(['radiant_carry_norm_gold', 'dire_carry_norm_gold'], 1)

X_train = train.drop('radiant_won', 1)
y_train = train.radiant_won

In [79]:
clf = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=228)
np.mean(cross_val_score(clf, X_train, y_train, cv=5, scoring='roc_auc'))

0.69758600239845026

In [82]:
gs = logEstimation(X_train, y_train)
print gs.grid_scores_
print gs.best_params_
print('best score: {}'.format(gs.best_score_))

[mean: 0.70089, std: 0.00624, params: {'C': 1.0000000000000001e-05}, mean: 0.70929, std: 0.00584, params: {'C': 0.0001}, mean: 0.70982, std: 0.00569, params: {'C': 0.001}, mean: 0.71015, std: 0.00625, params: {'C': 0.01}, mean: 0.71044, std: 0.00605, params: {'C': 0.10000000000000001}, mean: 0.71043, std: 0.00605, params: {'C': 1.0}]
{'C': 0.10000000000000001}
best score: 0.710437947905


In [83]:
clf=LogisticRegression(random_state=241, C=0.1)
np.mean(cross_val_score(clf, X_train, y_train, cv=5, scoring='roc_auc'))

0.71045225358876185

In [80]:
clf=Ridge(alpha=0.0001)
np.mean(cross_val_score(clf, X_train, y_train, cv=5, scoring='roc_auc'))

0.714064953004098

In [81]:
clf=Lasso(alpha=0.0001, max_iter=4000)
np.mean(cross_val_score(clf, X_train, y_train, cv=5, scoring='roc_auc'))

0.71410844283760944

In [None]:
train.to_csv('processing_tables/train_rank_score_gold.csv', index=None)
test.to_csv('processing_tables/test_rank_score_gold.csv', index=None)
gold_score_by_mean.to_csv('processing_tables/stupidRank+gold_scores.csv', index=None)

# Diff rank

In [35]:
gold_rank_dif = pd.DataFrame(data = np.sort(gold.values[:,:5]) - np.sort(gold.values[:,5:10]),
                                 index=gold.index,
                                 columns = ['gold_diff_rank_{0}'.format(i) for i in range(5)]) 
gold_rank_dif = normalize_data(gold_rank_dif)

gold_rank_dif.reset_index(inplace=True)
gold_rank_dif.head()

Unnamed: 0,mid,gold_diff_rank_0,gold_diff_rank_1,gold_diff_rank_2,gold_diff_rank_3,gold_diff_rank_4
0,0,1.2354,-0.636307,0.361294,-0.665978,-0.463171
1,1,-0.364904,-0.732195,-0.275577,0.611509,-1.316314
2,2,-0.478126,0.453976,1.096144,1.158354,2.277633
3,3,1.465987,1.4389,0.988979,-0.345696,0.05073
4,4,-0.189547,-0.422038,0.184725,0.147465,0.426918


In [36]:
gold_rank_dif.to_csv('processing_tables/goldRankDif.csv', index=None)

In [85]:
train = pd.merge(train_matches, gold_rank_dif, on='mid', how='left')
test = pd.merge(test_matches, gold_rank_dif, on='mid', how='left')
train = pd.merge(train, gold_score_by_mean, on='mid', how='left')
test = pd.merge(test, gold_score_by_mean, on='mid', how='left')

X_train = train.drop(['radiant_carry_norm_gold', 'dire_carry_norm_gold'], 1)
X_test = test.drop(['radiant_carry_norm_gold', 'dire_carry_norm_gold'], 1)

X_train = train.drop(['radiant_won'], 1)
y_train = train.radiant_won

X_train.head()

Unnamed: 0,mid,gold_diff_rank_0,gold_diff_rank_1,gold_diff_rank_2,gold_diff_rank_3,gold_diff_rank_4,gold_score_player_0,gold_score_player_1,gold_score_player_2,gold_score_player_3,...,gold_score_player_8,gold_score_player_9,radiant_norm_gold,dire_norm_gold,radiant_carry_norm_gold,dire_carry_norm_gold,radiant_best_gold_score,dire_best_gold_score,radiant_sum_gold_score,dire_sum_gold_score
0,0,1.2354,-0.636307,0.361294,-0.665978,-0.463171,1.217054,1.415443,0.948802,0.788948,...,0.595855,0.990382,0.524044,0.801772,0.007513,0.665735,1.415443,1.231741,0.499268,0.44876
1,1,-0.364904,-0.732195,-0.275577,0.611509,-1.316314,0.855613,1.220598,1.094664,0.881353,...,1.017415,0.961522,0.796655,1.744066,0.012183,1.882619,1.262385,1.217203,1.01561,0.846631
2,2,-0.478126,0.453976,1.096144,1.158354,2.277633,0.868434,0.697458,1.929615,0.944899,...,0.564103,0.867038,0.500272,-1.725554,2.64668,-0.531107,1.929615,1.054471,0.430333,-0.253739
3,4,-0.189547,-0.422038,0.184725,0.147465,0.426918,0.838467,0.731373,1.123121,0.65798,...,0.977203,0.828358,-0.785714,-0.978204,-0.553753,-1.172477,1.123121,0.977203,-1.37506,-1.089529
4,5,-0.79156,-1.662664,-2.25253,-0.657789,-0.796367,0.974335,0.743297,0.614883,0.97285,...,1.171903,1.045423,-1.560604,0.780926,-1.384913,-0.283913,1.03552,1.465198,-1.423393,-0.665986


In [87]:
train.to_csv('processing_tables/train_scoreDiffRankGold.csv', index=None)
test.to_csv('processing_tables/test_scoreDiffRankGold.csv', index=None)

In [88]:
clf = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=228)
np.mean(cross_val_score(clf, X_train, y_train, cv=5, scoring='roc_auc'))

0.69860716177691806

In [89]:
gs = logEstimation(X_train, y_train)
print gs.grid_scores_
print gs.best_params_
print('best score: {}'.format(gs.best_score_))

[mean: 0.70078, std: 0.00607, params: {'C': 1.0000000000000001e-05}, mean: 0.70919, std: 0.00576, params: {'C': 0.0001}, mean: 0.70991, std: 0.00560, params: {'C': 0.001}, mean: 0.71092, std: 0.00484, params: {'C': 0.01}, mean: 0.71066, std: 0.00502, params: {'C': 0.10000000000000001}, mean: 0.71066, std: 0.00505, params: {'C': 1.0}]
{'C': 0.01}
best score: 0.710915002029


In [90]:
clf=LogisticRegression(random_state=241, C=0.01)
np.mean(cross_val_score(clf, X_train, y_train, cv=5, scoring='roc_auc'))

0.71037668327284587

In [91]:
clf=Ridge(alpha=0.0001)
np.mean(cross_val_score(clf, X_train, y_train, cv=5, scoring='roc_auc'))

0.71424622599317356

In [92]:
clf=Lasso(alpha=0.0001, max_iter=4000)
np.mean(cross_val_score(clf, X_train, y_train, cv=5, scoring='roc_auc'))

0.71428920162133758

## Diff rank rates

In [93]:
rank_gold.head()

Unnamed: 0,mid,radiant_gold_rank_0,radiant_gold_rank_1,radiant_gold_rank_2,radiant_gold_rank_3,radiant_gold_rank_4,dire_gold_rank_0,dire_gold_rank_1,dire_gold_rank_2,dire_gold_rank_3,dire_gold_rank_4
0,0,2613,3454,4426,5206,5755,1725,3997,4072,5917,6384
1,1,2477,3816,4353,5759,5760,2748,4440,4623,5066,7659
2,2,1948,2869,3604,4390,8581,2301,2491,2530,3096,5130
3,3,2961,3457,4314,4432,5464,1906,2247,3345,4791,5328
4,4,2076,3030,3675,4103,5154,2220,3392,3494,3920,4458


In [96]:
norm_by_row_gold_rank_dif = pd.DataFrame(data = np.sort(gold.values[:,:5]) - np.sort(gold.values[:,5:10]),
                                 index=gold.index,
                                 columns = ['gold_diff_rank_{0}'.format(i) for i in range(5)]) 
norm_by_row_gold_rank_dif.head()

Unnamed: 0_level_0,gold_diff_rank_0,gold_diff_rank_1,gold_diff_rank_2,gold_diff_rank_3,gold_diff_rank_4
mid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,888,-543,354,-711,-629
1,-271,-624,-270,693,-1899
2,-353,378,1074,1294,3451
3,1055,1210,969,-359,136
4,-144,-362,181,183,696


In [None]:
norm_by_row_dif_gold = 