In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Lasso, Ridge

%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
def normalize_data(X):
    return pd.DataFrame(
        StandardScaler().fit_transform(X),
        index = X.index, 
        columns=X.columns
    )

def logEstimation(X, y):
    grid = {'C': np.power(10.0, np.arange(-5, 1))}
    kf=KFold(y.size, n_folds=5, shuffle=True, random_state=241)
    clf=LogisticRegression(random_state=241)
    gs = GridSearchCV(clf, grid, scoring='roc_auc', cv=kf)
    gs.fit(X, y)
    return gs

# Norm xp

In [25]:
xp = pd.read_csv('data/xp.csv', index_col='mid')
xp = xp.loc[xp.times == 600].drop('times', 1)

radiant_xp = xp[['player_0', 'player_1', 'player_2', 'player_3', 'player_4']].sum(axis=1)
dire_xp = xp[['player_5', 'player_6', 'player_7', 'player_8', 'player_9']].sum(axis=1)
radiant_carry_xp = xp[['player_0', 'player_1', 'player_2', 'player_3', 'player_4']].max(axis=1)
dire_carry_xp = xp[['player_5', 'player_6', 'player_7', 'player_8', 'player_9']].max(axis=1)
norm_xp = pd.DataFrame(index=xp.index)
for col in xp.columns:
    norm_xp['exp_' + col] = xp[col]
    
norm_xp['radiant_xp'] = radiant_xp
norm_xp['dire_xp'] = dire_xp
norm_xp['radiant_carry_xp'] = radiant_carry_xp
norm_xp['dire_carry_xp'] = dire_carry_xp

norm_xp = normalize_data(norm_xp)
norm_xp.reset_index(level=0, inplace=True)
norm_xp.head()

Unnamed: 0,mid,exp_player_0,exp_player_1,exp_player_2,exp_player_3,exp_player_4,exp_player_5,exp_player_6,exp_player_7,exp_player_8,exp_player_9,radiant_xp,dire_xp,radiant_carry_xp,dire_carry_xp
0,0,-1.2745,1.162958,-1.302607,1.270762,0.733066,0.597751,0.555076,1.665819,-2.495594,-0.628572,0.370217,-0.184408,-0.143615,0.547836
1,1,-1.414502,1.325397,-0.980861,-0.54998,1.638381,1.30684,0.439235,-0.733371,0.335357,0.593712,-0.003502,1.245843,0.420384,0.047362
2,2,-0.224488,-0.964361,2.891867,-0.339754,1.139497,-0.640619,-1.385491,0.156859,-0.165589,-0.555142,1.654017,-1.661006,2.528345,-1.915341
3,3,-1.775415,1.471318,1.050214,-1.16055,0.701027,0.398579,-0.068611,-1.516479,1.366061,-1.387969,0.182461,-0.77683,0.168731,0.044347
4,4,-0.81904,0.448039,1.642045,0.18307,-1.375065,0.254733,-1.02036,-0.442476,0.22197,-1.021748,0.058087,-1.286063,0.486998,-1.672641


In [7]:
norm_xp.to_csv('processing_tables/only_exp.csv', index=None)

In [27]:
xp = pd.read_csv('data/xp.csv', index_col='mid')
xp = xp[xp.times == 600]
xp.drop('times', 1, inplace=True)
radiant_rank_xp = pd.DataFrame(data = np.sort(xp.values[:,:5]),
                                 index=xp.index,
                                 columns = ['radiant_xp_rank_{0}'.format(i) for i in range(5)]) 
radiant_rank_xp.reset_index(inplace=True)

dire_rank_xp = pd.DataFrame(data = np.sort(xp.values[:,5:10]),
                                 index=xp.index,
                                 columns = ['dire_xp_rank_{0}'.format(i) for i in range(5)]) 

dire_rank_xp.reset_index(inplace=True)
dire_rank_xp.head()

xp_stat = pd.DataFrame(index=xp.index)
xp_stat['rad_best_xp'] = radiant_rank_xp.radiant_xp_rank_4
xp_stat['rad_second_xp'] = radiant_rank_xp.radiant_xp_rank_3

xp_stat['dire_best_xp'] = dire_rank_xp.dire_xp_rank_4
xp_stat['dire_second_xp'] = dire_rank_xp.dire_xp_rank_3

xp_stat['rad_sum_xp'] = xp[['player_0', 'player_1', 'player_2', 'player_3', 'player_4']].sum(axis=1)
xp_stat['dire_sum_xp'] = xp[['player_5', 'player_6', 'player_7', 'player_8', 'player_9']].sum(axis=1)

# xp_stat = normalize_data(xp_stat)
xp_stat.reset_index(inplace=True)
# xp_stat.to_csv('processing_tables/xp_stat.csv', index=None)
xp_stat.head()


Unnamed: 0,mid,rad_best_xp,rad_second_xp,dire_best_xp,dire_second_xp,rad_sum_xp,dire_sum_xp
0,0,4429,4304,4846,3745,15856,14930
1,1,4810,4481,4514,3662,15231,17337
2,2,6234,4265,3212,2864,18003,12445
3,3,4640,4202,4512,3529,15542,13933
4,4,4855,3525,3373,3281,15334,13076


In [29]:
xp_cropped_stats = pd.DataFrame(index=xp.index)
xp_cropped_stats['xp_best_dif'] = xp_stat.rad_best_xp - xp_stat.dire_best_xp
xp_cropped_stats['xp_second_dif'] = xp_stat.rad_second_xp - xp_stat.dire_second_xp
xp_cropped_stats['xp_sum_dif'] = xp_stat.rad_sum_xp - xp_stat.dire_sum_xp
xp_cropped_stats = normalize_data(xp_cropped_stats)
xp_cropped_stats.reset_index(inplace=True)
xp_cropped_stats.to_csv('processing_tables/xp_cropped_stats.csv', index=None)
xp_cropped_stats.head()

Unnamed: 0,mid,xp_best_dif,xp_second_dif,xp_sum_dif
0,0,-0.451679,0.70071,0.377645
1,1,0.247762,1.02712,-0.854239
2,2,2.921918,1.757776,2.259601
3,3,0.082957,0.843828,0.655144
4,4,1.411206,0.305251,0.918829


# Exp Score

In [10]:
new_xp = pd.read_csv('data/xp.csv', index_col='mid')
new_xp = new_xp[new_xp.times == 600]
new_xp.drop('times', 1, inplace=True)
new_xp.head()

Unnamed: 0_level_0,player_0,player_1,player_2,player_3,player_4,player_5,player_6,player_7,player_8,player_9
mid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,1696,4304,1606,4429,3821,3745,3635,4846,357,2347
1,1542,4481,1961,2437,4810,4514,3510,2248,3403,3662
2,2851,1986,6234,2667,4265,2402,1541,3212,2864,2426
3,1145,4640,4202,1769,3786,3529,2962,1400,4512,1530
4,2197,3525,4855,3239,1518,3373,1935,2563,3281,1924


In [11]:
heroes = pd.read_csv('data/heroes.csv', index_col='mid')
heroes_num = heroes.max()[0] + 1
print heroes_num

111


In [12]:
mean_xp = np.zeros(111)
for hero_num in range(111):
    hero_values = []
    for player_num in range(10):
        colname = 'player_{}'.format(player_num)
        hero_index = heroes.loc[heroes[colname] == hero_num].index
        hero_values.extend(new_xp.loc[hero_index][colname].values)
    mean_xp[hero_num] = int(np.mean(hero_values))
mean_xp[0] = 100000
print mean_xp

[ 100000.    3304.    3227.    2648.    2291.    3101.    2451.    2569.
    3222.    3474.    1877.    3825.    2544.    4052.    3245.    4601.
    2939.    3329.    2284.    1981.    3028.    2410.    3258.    3114.
    4019.    3122.    2506.    3141.    3705.    3180.    3763.    3222.
    2846.    2522.    4383.    3710.    3628.    3322.    3465.    2109.
    2084.    2410.    3017.    3052.    2863.    3847.    3136.    3298.
    3526.    3106.    2204.    3015.    3455.    3192.    2971.    3348.
    4104.    2170.    3439.    3159.    2189.    2241.    3055.    2454.
    3720.    3051.    2021.    3132.    3442.    2160.    3354.    2369.
    3434.    3188.    3842.    4055.    2606.    3812.    3098.    3352.
    2094.    3282.    3433.    3426.    3029.    4083.    2086.    2080.
    2221.    2631.    2166.    2079.    2057.    2837.    2998.    2820.
    2383.    2264.    2151.    2916.    2347.    3342.    3464.    2158.
    2134.    3263.    2981.    3452.    3376.    27

Hero = 0 - соответствует вышедшему из игры человеку, поэтому сделаем соответствующее значение побольше

In [13]:
mean_predict_xp = heroes.apply(lambda x: mean_xp[x])
mean_predict_xp.head()

Unnamed: 0_level_0,player_0,player_1,player_2,player_3,player_4,player_5,player_6,player_7,player_8,player_9
mid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,2079.0,3017.0,2080.0,4601.0,3051.0,3825.0,2451.0,4383.0,2160.0,3842.0
1,2160.0,4083.0,2369.0,4019.0,3720.0,3842.0,3442.0,2109.0,3051.0,3825.0
2,3329.0,2084.0,3222.0,3132.0,2916.0,2846.0,2569.0,3434.0,3526.0,2134.0
3,2094.0,3052.0,3342.0,2369.0,2998.0,2160.0,3354.0,2151.0,4019.0,2109.0
4,3122.0,4601.0,4055.0,3180.0,2820.0,2648.0,2846.0,3348.0,3720.0,2086.0


In [14]:
temp_xp_score = new_xp / mean_predict_xp
xp_score = pd.DataFrame(index=temp_xp_score.index)
for col in temp_xp_score.columns:
    xp_score['xp_score_' + col] = temp_xp_score[col]

# xp_score['radiant_norm_xp'] = norm_xp.radiant_xp
# xp_score['dire_norm_xp'] = norm_xp.dire_xp
# xp_score['radiant_carry_norm_xp'] = norm_xp.radiant_carry_xp
# xp_score['dire_carry_norm_xp'] = norm_xp.dire_carry_xp

xp_score['radiant_best_xp_score'] = xp_score.iloc[:,1:6].max(1)
xp_score['dire_best_xp_score'] = xp_score.iloc[:,6:11].max(1)
xp_score['radiant_sum_xp_score'] = StandardScaler().fit_transform(xp_score.iloc[:,1:6].sum(1).reshape(-1, 1))
xp_score['dire_sum_xp_score'] = StandardScaler().fit_transform(xp_score.iloc[:,6:11].sum(1).reshape(-1, 1))

xp_score.reset_index(level=0, inplace=True)
xp_score.head()

Unnamed: 0,mid,xp_score_player_0,xp_score_player_1,xp_score_player_2,xp_score_player_3,xp_score_player_4,xp_score_player_5,xp_score_player_6,xp_score_player_7,xp_score_player_8,xp_score_player_9,radiant_best_xp_score,dire_best_xp_score,radiant_sum_xp_score,dire_sum_xp_score
0,0,0.815777,1.426583,0.772115,0.962617,1.252376,0.979085,1.483068,1.105635,0.165278,0.61088,1.426583,1.483068,0.644328,-0.929431
1,1,0.713889,1.097477,0.827775,0.60637,1.293011,1.174909,1.019756,1.065908,1.115372,0.957386,1.293011,1.293011,0.003595,0.179923
2,2,0.856413,0.952975,1.934823,0.851533,1.46262,0.843992,0.599844,0.935352,0.812252,1.136832,1.934823,1.934823,1.708594,0.125583
3,3,0.5468,1.520315,1.257331,0.746729,1.262842,1.633796,0.883125,0.65086,1.122667,0.725462,1.633796,1.633796,2.319731,-0.552131
4,4,0.703716,0.766138,1.197287,1.018553,0.538298,1.273792,0.679902,0.765532,0.881989,0.922339,1.273792,1.273792,-0.331204,-1.379719


In [15]:
xp_score.to_csv('processing_tables/xp_score.csv', index=None)

In [16]:
pd.merge(xp_score, norm_xp, on='mid').head()

Unnamed: 0,mid,xp_score_player_0,xp_score_player_1,xp_score_player_2,xp_score_player_3,xp_score_player_4,xp_score_player_5,xp_score_player_6,xp_score_player_7,xp_score_player_8,...,exp_player_4,exp_player_5,exp_player_6,exp_player_7,exp_player_8,exp_player_9,radiant_xp,dire_xp,radiant_carry_xp,dire_carry_xp
0,0,0.815777,1.426583,0.772115,0.962617,1.252376,0.979085,1.483068,1.105635,0.165278,...,0.733066,0.597751,0.555076,1.665819,-2.495594,-0.628572,0.370217,-0.184408,-0.143615,0.547836
1,1,0.713889,1.097477,0.827775,0.60637,1.293011,1.174909,1.019756,1.065908,1.115372,...,1.638381,1.30684,0.439235,-0.733371,0.335357,0.593712,-0.003502,1.245843,0.420384,0.047362
2,2,0.856413,0.952975,1.934823,0.851533,1.46262,0.843992,0.599844,0.935352,0.812252,...,1.139497,-0.640619,-1.385491,0.156859,-0.165589,-0.555142,1.654017,-1.661006,2.528345,-1.915341
3,3,0.5468,1.520315,1.257331,0.746729,1.262842,1.633796,0.883125,0.65086,1.122667,...,0.701027,0.398579,-0.068611,-1.516479,1.366061,-1.387969,0.182461,-0.77683,0.168731,0.044347
4,4,0.703716,0.766138,1.197287,1.018553,0.538298,1.273792,0.679902,0.765532,0.881989,...,-1.375065,0.254733,-1.02036,-0.442476,0.22197,-1.021748,0.058087,-1.286063,0.486998,-1.672641


# rank xp dif

In [22]:
xp = pd.read_csv('data/xp.csv', index_col='mid')
xp = xp[xp.times == 600]
xp.drop('times', 1, inplace=True)
xp.head()

Unnamed: 0_level_0,player_0,player_1,player_2,player_3,player_4,player_5,player_6,player_7,player_8,player_9
mid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,1696,4304,1606,4429,3821,3745,3635,4846,357,2347
1,1542,4481,1961,2437,4810,4514,3510,2248,3403,3662
2,2851,1986,6234,2667,4265,2402,1541,3212,2864,2426
3,1145,4640,4202,1769,3786,3529,2962,1400,4512,1530
4,2197,3525,4855,3239,1518,3373,1935,2563,3281,1924


In [23]:
rank_xp_dif = pd.DataFrame(data = np.sort(xp.values[:,:5]) - np.sort(xp.values[:,5:10]),
                                 index=xp.index,
                                 columns = ['xp_dif_{0}'.format(i) for i in range(5)]) 
rank_xp_dif = normalize_data(rank_xp_dif)
rank_xp_dif.reset_index(inplace=True)

rank_xp_dif.head()

Unnamed: 0,mid,xp_dif_0,xp_dif_1,xp_dif_2,xp_dif_3,xp_dif_4
0,0,1.950787,-0.99829,0.305137,0.70071,-0.451679
1,1,-1.095529,-2.244083,-1.492623,1.02712,0.247762
2,2,0.69798,0.444373,0.646412,1.757776,2.921918
3,3,-0.392773,0.403424,1.216154,0.843828,0.082957
4,4,-0.628064,0.439648,1.004821,0.305251,1.411206


In [46]:
rank_xp_dif.to_csv('processing_tables/xpDif.csv', index=None)