In [2]:
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import KFold

from sklearn.cross_validation import cross_val_score

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Lasso, Ridge

%pylab inline

pd.set_option("display.max_rows", 15)
pd.set_option("display.max_columns", 150)
sns.set_style('whitegrid')
%config InlineBackend.figure_format = 'svg'


plt.rcParams['figure.figsize'] = (12.0, 5.0)

Populating the interactive namespace from numpy and matplotlib


In [3]:
def normalize_data(X):
    """Standardize every column of ``X`` with a freshly fitted ``StandardScaler``.

    Parameters
    ----------
    X : pandas.DataFrame
        Numeric feature table.

    Returns
    -------
    pandas.DataFrame
        The scaled values, carrying the same index and column labels as ``X``.
    """
    scaled_values = StandardScaler().fit_transform(X)
    return pd.DataFrame(scaled_values, index=X.index, columns=X.columns)

In [4]:
def my_clf_cross_val(X_train, y_train, with_lasso = True): 
    clf=LogisticRegression(random_state=241, C=0.1)
    score = cross_val_score(clf, X_train, y_train, cv=5, scoring='roc_auc')
    print 'logreg: ', np.mean(score), np.std(score)
    clf=Ridge(alpha=0.0001)
    score = cross_val_score(clf, X_train, y_train, cv=5, scoring='roc_auc')
    print 'Ridge: ', np.mean(score), np.std(score)
    clf=Lasso(alpha=0.0001, max_iter=6000)
    if with_lasso:
        score = cross_val_score(clf, X_train, y_train, cv=5, scoring='roc_auc')
        print 'Lasso: ', np.mean(score), np.std(score)

In [6]:
def total_merge(gold, heroes, items, xp, creeps, events,
                train_path='data/train.csv', test_path='data/test.csv'):
    """Left-join every per-match feature table onto the train and test match lists.

    Parameters
    ----------
    gold, heroes, items, xp, creeps, events : pandas.DataFrame
        Feature tables, each with a ``mid`` column. They are merged in this
        order, so the resulting column order follows the argument order.
    train_path, test_path : str
        CSV files listing the matches. Both need a ``mid`` column and the
        train file also needs ``radiant_won``. The defaults are the paths
        that used to be hard-coded in the function body.

    Returns
    -------
    X_train : pandas.DataFrame
        Merged training table without the ``radiant_won`` column.
    y_train : pandas.Series
        The ``radiant_won`` column of the training table.
    test : pandas.DataFrame
        Merged test table.
    """
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)

    # how='left' keeps every listed match even when a feature table has no
    # row for it (the missing features become NaN).
    for features in (gold, heroes, items, xp, creeps, events):
        train = pd.merge(train, features, on='mid', how='left')
        test = pd.merge(test, features, on='mid', how='left')

    # axis is passed by keyword: the positional form is rejected by current pandas.
    X_train = train.drop(['radiant_won'], axis=1)
    y_train = train.radiant_won

    return X_train, y_train, test

## GOLD

In [10]:
# Keep only the rows recorded at times == 600 and discard the timestamp
# column, leaving one gold value per player column for each match id.
gold_raw = pd.read_csv('data/gold.csv', index_col='mid')
new_gold = gold_raw.loc[gold_raw.times == 600].drop('times', axis=1)
new_gold.head()

Unnamed: 0_level_0,player_0,player_1,player_2,player_3,player_4,player_5,player_6,player_7,player_8,player_9
mid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,3454,5206,2613,4426,5755,4072,3997,5917,1725,6384
1,2477,5760,3816,4353,5759,7659,5066,2748,4440,4623
2,3604,1948,8581,4390,2869,3096,2301,5130,2530,2491
3,3457,5464,4432,2961,4314,3345,4791,1906,5328,2247
4,3675,4103,5154,3030,2076,3920,3494,3392,4458,2220


Посчитаем среднее число золота, которое зарабатывает каждый герой за первые 10 минут и нормируем на него. Полученную величину будем называть gold_score

Отдельно зададим это значение для героя с номером 0 — очевидно, что так отмечаются случаи «ливнувшего» персонажа. В каждой игре он набирает одинаковое очень малое количество золота и поэтому стабильно получает gold_score, равный единице. Чтобы этого избежать, искусственно зададим для него очень большое среднее значение, например 100000.

In [12]:
heroes = pd.read_csv('data/heroes.csv', index_col='mid')
mean_gold = np.zeros(111)
for hero_num in range(111):
    hero_values = []
    for player_num in range(10):
        colname = 'player_{}'.format(player_num)
        hero_index = heroes.loc[heroes[colname] == hero_num].index
        hero_values.extend(new_gold.loc[hero_index][colname].values)
    mean_gold[hero_num] = int(np.mean(hero_values))

mean_gold[0] = 100000
print 'Source mean gold by heroes:\n', mean_gold

print 'Sorted mean gold by heroes:\n', np.sort(mean_gold)

Source mean gold by heroes:
[ 100000.    4793.    4149.    3651.    3124.    3789.    3245.    3224.
    4364.    4589.    3101.    4808.    3217.    4635.    4535.    5610.
    3314.    4150.    3062.    2610.    3785.    3000.    4603.    3873.
    4939.    4383.    3343.    4516.    4352.    4605.    4443.    4447.
    3635.    3305.    5088.    4666.    4527.    4603.    4285.    2838.
    2793.    3345.    3678.    3807.    4264.    4556.    4426.    4225.
    4485.    4493.    2736.    4022.    4648.    4187.    4117.    4461.
    4843.    3359.    4398.    4403.    2970.    2825.    3845.    3786.
    4562.    4364.    2684.    4646.    4162.    2895.    4601.    3486.
    4865.    4075.    6446.    4589.    3293.    4323.    4425.    4224.
    2965.    4139.    4968.    4354.    4374.    4719.    2680.    2754.
    2921.    3712.    2940.    2838.    2795.    3290.    3938.    3838.
    3374.    3016.    2959.    3370.    3129.    4483.    4430.    2908.
    2873.    4453.    3

Еще одна особенность: герои, которые обычно мало набирают золота, могут в конкретных играх набирать намного больше среднего, а потому получать неоправданно высокий gold_score. Поэтому, аналогично тому, как мы делали с «ливерами», зададим им очень высокую планку. Забегая вперед, скажу, что в решение как признак пойдут только самые высокие gold_score с каждой команды, поэтому ничего страшного произойти не должно — просто герои, которые обычно набирают мало золота, вряд ли попадут в нашу оценку.

In [13]:
new_mean_gold = np.array(map(lambda x: x if x > 4400 else 10000, mean_gold))
mean_gold = new_mean_gold
print 'After threshold:\n', mean_gold

After threshold:
[ 100000.    4793.   10000.   10000.   10000.   10000.   10000.   10000.
   10000.    4589.   10000.    4808.   10000.    4635.    4535.    5610.
   10000.   10000.   10000.   10000.   10000.   10000.    4603.   10000.
    4939.   10000.   10000.    4516.   10000.    4605.    4443.    4447.
   10000.   10000.    5088.    4666.    4527.    4603.   10000.   10000.
   10000.   10000.   10000.   10000.   10000.    4556.    4426.   10000.
    4485.    4493.   10000.   10000.    4648.   10000.   10000.    4461.
    4843.   10000.   10000.    4403.   10000.   10000.   10000.   10000.
    4562.   10000.   10000.    4646.   10000.   10000.    4601.   10000.
    4865.   10000.    6446.    4589.   10000.   10000.    4425.   10000.
   10000.   10000.    4968.   10000.   10000.    4719.   10000.   10000.
   10000.   10000.   10000.   10000.   10000.   10000.   10000.   10000.
   10000.   10000.   10000.   10000.   10000.    4483.    4430.   10000.
   10000.    4453.   10000.   1000

Отмечу, что то, что получилось выше я называл threshold_gold_score за некоторую пороговость оценки. 
Я пробовал другие варианты, например что-то вроде

new_mean_gold = np.array(map(lambda x: x if x > 4500 else 4500 + 5*(4500 - x), mean_gold))

new_mean_gold = np.array(map(lambda x: x if x > 4500 else 4500 + int((4500 - x)*2 / 1000), mean_gold))

или другие значения порога, однако на кросс-валидации и 30% тестовой выборки лучше себя показывал пороговый gold_score.

Возможно стоило устроить полный перебор порога, однако я ограничился несколькими вариантами. В целом где-то здесь возможно усиление алгоритма.

Вот так выглядит что-то вроде предсказанного количества золота в матче, вычисленное чисто по threshold_gold_score. Нормируем на него реальные данные

In [14]:
# Replace each hero id in the pick table by that hero's reference gold value.
mean_predict_gold = pd.DataFrame(
    mean_gold[heroes.values],
    index=heroes.index,
    columns=heroes.columns,
)
mean_predict_gold.head()

Unnamed: 0_level_0,player_0,player_1,player_2,player_3,player_4,player_5,player_6,player_7,player_8,player_9
mid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,10000.0,10000.0,10000.0,5610.0,10000.0,4808.0,10000.0,5088.0,10000.0,6446.0
1,10000.0,4719.0,10000.0,4939.0,4562.0,6446.0,10000.0,10000.0,10000.0,4808.0
2,10000.0,10000.0,4447.0,4646.0,10000.0,10000.0,10000.0,4865.0,4485.0,10000.0
3,10000.0,10000.0,4483.0,10000.0,10000.0,10000.0,4601.0,10000.0,4939.0,10000.0
4,10000.0,5610.0,4589.0,4605.0,10000.0,10000.0,10000.0,4461.0,4562.0,10000.0


In [17]:
# Element-wise ratio of the gold actually earned to the per-hero reference value.
gold_score = new_gold.div(mean_predict_gold)
gold_score.head()

Unnamed: 0_level_0,player_0,player_1,player_2,player_3,player_4,player_5,player_6,player_7,player_8,player_9
mid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,0.3454,0.5206,0.2613,0.788948,0.5755,0.846922,0.3997,1.162932,0.1725,0.990382
1,0.2477,1.220598,0.3816,0.881353,1.262385,1.188179,0.5066,0.2748,0.444,0.961522
2,0.3604,0.1948,1.929615,0.944899,0.2869,0.3096,0.2301,1.054471,0.564103,0.2491
3,0.3457,0.5464,0.988624,0.2961,0.4314,0.3345,1.041295,0.1906,1.078761,0.2247
4,0.3675,0.731373,1.123121,0.65798,0.2076,0.392,0.3494,0.760368,0.977203,0.222


Посортируем значения gold_score в каждой команде и найдем соответствующую разность:

In [21]:
# Sort each team's gold_score within the row and subtract dire (columns 5-9)
# from radiant (columns 0-4) rank by rank: rank_0 compares the two lowest
# scores, rank_4 the two highest.
# The index comes from gold_score itself. The `gold` frame referenced here
# before is not defined anywhere in the notebook, so the cell raised a
# NameError on a fresh kernel.
gold_score_rank_dif = pd.DataFrame(
    data=np.sort(gold_score.values[:, 0:5]) - np.sort(gold_score.values[:, 5:10]),
    index=gold_score.index,
    columns=['gold_score_rank_{}'.format(i) for i in range(5)])

gold_score_rank_dif.head()

Unnamed: 0,gold_score_rank_0,gold_score_rank_1,gold_score_rank_2,gold_score_rank_3,gold_score_rank_4
0,0.0888,-0.0543,-0.326322,-0.414882,-0.373984
1,-0.0271,-0.0624,0.374753,0.259075,0.074206
2,-0.0353,0.0378,0.0508,0.380796,0.875145
3,0.1055,0.121,0.0969,-0.494895,-0.090137
4,-0.0144,0.0181,0.26598,-0.028995,0.145918


Аналогичную разность посчитаем для просто заработанного золота

In [23]:
# Same rank-by-rank radiant-minus-dire difference, computed on raw gold.
# The index comes from new_gold: the `gold` frame referenced here before is
# not defined anywhere in the notebook (NameError on a fresh kernel).
gold_rank_dif = pd.DataFrame(
    data=np.sort(new_gold.values[:, 0:5]) - np.sort(new_gold.values[:, 5:10]),
    index=new_gold.index,
    columns=['gold_rank_{}'.format(i) for i in range(5)])
gold_rank_dif.head()

Unnamed: 0,gold_rank_0,gold_rank_1,gold_rank_2,gold_rank_3,gold_rank_4
0,888,-543,354,-711,-629
1,-271,-624,-270,693,-1899
2,-353,378,1074,1294,3451
3,1055,1210,969,-359,136
4,-144,-362,181,183,696


Составим некоторую выжимку из этих таблиц. В том, что именно лучше выкидывать, я разбирался по результатам многочисленных экспериментов по итогам кроссвалидации, или что зануляло Лассо. Каждый признак нормируется стандартным образом(на среднее и стандартное отклонение). В итоге выжимка получилась такой:

In [29]:
# Hand-picked summary of the two rank-difference tables. The column order
# below is the order in which the features reach the model.
selected_features = [
    ('gold_dif', gold_rank_dif.sum(axis=1)),
    ('top_gold_dif', gold_rank_dif.gold_rank_4),
    ('threshold_score_sum_dif', gold_score_rank_dif.sum(axis=1)),
    ('top_threshold_gold_score_dif', gold_score_rank_dif.gold_score_rank_4),
    ('second_threshold_gold_score_dif', gold_score_rank_dif.gold_score_rank_3),
    ('third_threshold_gold_score_dif', gold_score_rank_dif.gold_score_rank_2),
]

gold_stats = pd.DataFrame(index=new_gold.index)
for feature_name, feature_values in selected_features:
    gold_stats[feature_name] = feature_values

# Standardise every column, then expose `mid` as a regular column for merging.
gold_stats = normalize_data(gold_stats).reset_index()
gold_stats.head()

Unnamed: 0,mid,gold_dif,top_gold_dif,threshold_score_sum_dif,top_threshold_gold_score_dif,second_threshold_gold_score_dif,third_threshold_gold_score_dif
0,0,-0.191463,-0.463171,-1.251202,-1.140037,-1.157989,-1.121482
1,1,-0.657847,-1.316314,0.702919,0.197841,0.712788,1.299354
2,2,1.556804,2.277633,1.497239,2.588696,1.050663,0.180734
3,3,0.793066,0.05073,-0.309281,-0.292735,-1.380092,0.339919
4,4,0.130693,0.426918,0.436196,0.411904,-0.086841,0.92376


## Heroes

In [32]:
# Reload the hero picks without index_col: `mid` stays an ordinary column and
# the rows get the default integer index. This rebinds the `heroes` name that
# an earlier cell loaded with index_col='mid'.
heroes = pd.read_csv('data/heroes.csv')

In [35]:
heroes_num = heroes.player_0.unique().max()
print 'Heroes num: ', heroes_num

Heroes num:  110


Составим матрицу со 111 колонками (по одной на каждый идентификатор героя от 0 до 110), в которой

- будет отмечаться 1, если соответствующий герой играл за команду radiant,
- будет отмечаться -1, если за команду dire
- будет отмечаться 0, если герой в данном матче представлен не был

In [36]:
X_pick = np.zeros((heroes.shape[0], heroes_num + 1))
for i, match_id in enumerate(heroes.mid.values):
    for p in xrange(5):
        X_pick[i, heroes.ix[match_id, 'player_{0}'.format(p)]] = 1
        X_pick[i, heroes.ix[match_id, 'player_{0}'.format(5 + p)]] = -1  
        
print X_pick

[[ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 ..., 
 [ 0.  0.  1. ...,  0.  1.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]]


In [38]:
# Wrap the pick matrix in a frame with one hero_<id> column per matrix column.
# The column count is read from X_pick instead of the hard-coded 111, and
# `mid` is copied from the real match-id column rather than from the row
# index, so the merge key stays right even if mid differs from the row number.
heroes_pick = pd.DataFrame(
    X_pick,
    index=heroes.index,
    columns=['hero_{0}'.format(i) for i in range(X_pick.shape[1])])
heroes_pick = heroes_pick.astype(int)
heroes_pick['mid'] = heroes.mid.values
heroes_pick.head()

Unnamed: 0,hero_0,hero_1,hero_2,hero_3,hero_4,hero_5,hero_6,hero_7,hero_8,hero_9,hero_10,hero_11,hero_12,hero_13,hero_14,hero_15,hero_16,hero_17,hero_18,hero_19,hero_20,hero_21,hero_22,hero_23,hero_24,hero_25,hero_26,hero_27,hero_28,hero_29,hero_30,hero_31,hero_32,hero_33,hero_34,hero_35,hero_36,hero_37,hero_38,hero_39,hero_40,hero_41,hero_42,hero_43,hero_44,hero_45,hero_46,hero_47,hero_48,hero_49,hero_50,hero_51,hero_52,hero_53,hero_54,hero_55,hero_56,hero_57,hero_58,hero_59,hero_60,hero_61,hero_62,hero_63,hero_64,hero_65,hero_66,hero_67,hero_68,hero_69,hero_70,hero_71,hero_72,hero_73,hero_74,hero_75,hero_76,hero_77,hero_78,hero_79,hero_80,hero_81,hero_82,hero_83,hero_84,hero_85,hero_86,hero_87,hero_88,hero_89,hero_90,hero_91,hero_92,hero_93,hero_94,hero_95,hero_96,hero_97,hero_98,hero_99,hero_100,hero_101,hero_102,hero_103,hero_104,hero_105,hero_106,hero_107,hero_108,hero_109,hero_110,mid
0,0,0,0,0,0,0,-1,0,0,0,0,-1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,-1,0,0,0,0,-1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,-1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,-1,0,0,-1,1,0,1,0,0,-1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,-1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,-1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,-1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,-1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,-1,0,0,0,0,0,0,2
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-1,-1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,-1,0,0,1,0,0,0,0,0,0,0,0,0,3
4,0,0,0,-1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,-1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-1,0,0,0,0,0,0,0,0,-1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,-1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4


## Items

In [40]:
# Load the per-player item table; missing entries are replaced by zero.
items = pd.read_csv('data/items.csv').fillna(0)
items.head()

Unnamed: 0,mid,player,item_0,item_1,item_2,item_3,item_4,item_5,item_6,item_7,item_8,item_9,item_10,item_11,item_12,item_13,item_14,item_15,item_16,item_17,item_18,item_19,item_20,item_21,item_22,item_23,item_24,item_25,item_26,item_27,item_28,item_29,item_30,item_31,item_32,item_33,item_34,item_35,item_36,item_37,item_38,item_39,item_40,item_41,item_42,item_43,item_44,item_45,item_46,item_47,item_48,item_49,item_50,item_51,item_52,item_53,item_54,item_55,item_56,item_57,item_58,item_59,item_60,item_61,item_62,item_63,item_64,item_65,item_66,item_67,item_68,item_69,item_70,item_71,item_72,item_73,item_74,item_75,item_76,item_77,item_78,item_79,item_80,item_81,item_82,item_83,item_84,item_85,item_86,item_87,item_88,item_89,item_90,item_91,item_92,item_93,item_94,item_95,item_96,item_97,item_98,item_99,item_100,item_101,item_102,item_103,item_104,item_105,item_106,item_107,item_108,item_109,item_110,item_111,item_112,item_113,item_114,item_115,item_116,item_117,item_118,item_119,item_120
0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,3.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,4.0,0.0,0.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0,3.0,0.0,3.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


Просто возьмем сложим все купленные предметы по командам и вычислим разность по каждому предмету

In [41]:
# Per-match item totals for each side: players 0-4 go to radiant_items,
# players 5-9 to dire_items.
item_counts = items.drop('player', axis=1)
radiant_items = item_counts[items.player < 5].groupby('mid').sum()
dire_items = item_counts[items.player > 4].groupby('mid').sum()

# Radiant minus dire for every item column, cast to int32, laid out on the
# dire index and renamed with a "_dif" suffix.
items_dif = (
    radiant_items[dire_items.columns]
    .sub(dire_items)
    .astype(np.int32)
    .reindex(dire_items.index)
    .add_suffix('_dif')
)
items_dif = items_dif.reset_index(level=0)
# items_dif.to_csv('processing_tables/items_dif.csv', index=None)
items_dif.head()

Unnamed: 0,mid,item_0_dif,item_1_dif,item_2_dif,item_3_dif,item_4_dif,item_5_dif,item_6_dif,item_7_dif,item_8_dif,item_9_dif,item_10_dif,item_11_dif,item_12_dif,item_13_dif,item_14_dif,item_15_dif,item_16_dif,item_17_dif,item_18_dif,item_19_dif,item_20_dif,item_21_dif,item_22_dif,item_23_dif,item_24_dif,item_25_dif,item_26_dif,item_27_dif,item_28_dif,item_29_dif,item_30_dif,item_31_dif,item_32_dif,item_33_dif,item_34_dif,item_35_dif,item_36_dif,item_37_dif,item_38_dif,item_39_dif,item_40_dif,item_41_dif,item_42_dif,item_43_dif,item_44_dif,item_45_dif,item_46_dif,item_47_dif,item_48_dif,item_49_dif,item_50_dif,item_51_dif,item_52_dif,item_53_dif,item_54_dif,item_55_dif,item_56_dif,item_57_dif,item_58_dif,item_59_dif,item_60_dif,item_61_dif,item_62_dif,item_63_dif,item_64_dif,item_65_dif,item_66_dif,item_67_dif,item_68_dif,item_69_dif,item_70_dif,item_71_dif,item_72_dif,item_73_dif,item_74_dif,item_75_dif,item_76_dif,item_77_dif,item_78_dif,item_79_dif,item_80_dif,item_81_dif,item_82_dif,item_83_dif,item_84_dif,item_85_dif,item_86_dif,item_87_dif,item_88_dif,item_89_dif,item_90_dif,item_91_dif,item_92_dif,item_93_dif,item_94_dif,item_95_dif,item_96_dif,item_97_dif,item_98_dif,item_99_dif,item_100_dif,item_101_dif,item_102_dif,item_103_dif,item_104_dif,item_105_dif,item_106_dif,item_107_dif,item_108_dif,item_109_dif,item_110_dif,item_111_dif,item_112_dif,item_113_dif,item_114_dif,item_115_dif,item_116_dif,item_117_dif,item_118_dif,item_119_dif,item_120_dif
0,0,0,-2,0,1,0,0,0,0,0,0,-1,0,2,2,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,2,0,0,0,1,1,0,0,-2,0,-1,0,0,0,0,0,0,0,2,0,0,1,-1,0,0,1,0,-1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,-1,-1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-1,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,0,-1,-1,-2,-3,1,0,2,0,0,-1,1,0,0,0,2,0,-2,-4,0,0,0,0,0,-1,0,2,0,0,1,3,0,-2,0,4,1,0,0,0,0,0,0,0,0,-1,0,0,2,0,0,0,-1,1,-1,1,0,0,0,0,-1,-1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-2,-1,1,-3,0,0,0,0,0,0,0,0,0,0,0,-1,0,0,-2,0,0,0,0,0
2,2,0,2,0,0,0,0,0,0,0,0,2,0,1,0,0,-1,0,0,-1,0,0,0,0,0,1,0,-3,-1,-1,0,0,0,-1,0,0,4,1,0,0,0,1,1,0,-5,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,-1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-1,0,0,0,0,0,0,0
3,3,0,-2,0,0,0,0,0,0,0,0,1,-1,0,3,0,-2,1,0,0,1,0,0,0,0,1,-1,0,1,1,0,0,0,-1,0,0,4,1,0,-2,-1,1,2,0,-2,0,-1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,2,0,0,-1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
4,4,0,1,0,0,0,0,0,0,0,0,2,-1,-2,1,1,3,-1,0,0,0,0,0,0,0,1,0,-1,0,0,0,0,0,1,-1,0,-1,1,0,-1,-1,0,0,0,-4,0,0,0,0,0,0,0,0,0,0,0,0,-1,0,0,0,0,-1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,-1,0,-2,0,0,0,0,0


## Experience

In [43]:
# Keep only the rows recorded at times == 600 and discard the timestamp column.
xp_raw = pd.read_csv('data/xp.csv', index_col='mid')
xp = xp_raw.loc[xp_raw.times == 600].drop('times', axis=1)
xp.head()

Unnamed: 0_level_0,player_0,player_1,player_2,player_3,player_4,player_5,player_6,player_7,player_8,player_9
mid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,1696,4304,1606,4429,3821,3745,3635,4846,357,2347
1,1542,4481,1961,2437,4810,4514,3510,2248,3403,3662
2,2851,1986,6234,2667,4265,2402,1541,3212,2864,2426
3,1145,4640,4202,1769,3786,3529,2962,1400,4512,1530
4,2197,3525,4855,3239,1518,3373,1935,2563,3281,1924


Просто всевозможные разности упорядоченных значений опыта игроков

In [44]:
# Sort the experience of each team within the row, then subtract the second
# five columns from the first five rank by rank.
first_team_sorted = np.sort(xp.values[:, :5])
second_team_sorted = np.sort(xp.values[:, 5:10])

rank_xp_dif = pd.DataFrame(
    first_team_sorted - second_team_sorted,
    index=xp.index,
    columns=['xp_dif_{0}'.format(i) for i in range(5)],
)
# Standardise each column and expose `mid` as a regular column for merging.
rank_xp_dif = normalize_data(rank_xp_dif).reset_index()

rank_xp_dif.head()

Unnamed: 0,mid,xp_dif_0,xp_dif_1,xp_dif_2,xp_dif_3,xp_dif_4
0,0,1.950787,-0.99829,0.305137,0.70071,-0.451679
1,1,-1.095529,-2.244083,-1.492623,1.02712,0.247762
2,2,0.69798,0.444373,0.646412,1.757776,2.921918
3,3,-0.392773,0.403424,1.216154,0.843828,0.082957
4,4,-0.628064,0.439648,1.004821,0.305251,1.411206


## Creeps

Небольшая выжимка: разность первого и второго игрока, а также всей команды по крипам. 

In [59]:
# Forward slash in the path: the former 'data\lh.csv' only resolved on Windows
# (and "\l" is an invalid escape sequence), unlike every other path in the
# notebook.
creeps = pd.read_csv('data/lh.csv', index_col='mid')
creeps = creeps.loc[creeps.times == 600].drop('times', axis=1)
creeps.reset_index(inplace=True)

# After reset_index, column 0 is `mid`; columns 1-5 and 6-10 hold the two
# teams. Sort each team within the row and subtract rank by rank.
creeps_dif = pd.DataFrame(
    data=np.sort(creeps.values[:, 1:6]) - np.sort(creeps.values[:, 6:11]),
    index=creeps.mid,
    columns=['creep_dif_{0}'.format(i) for i in range(5)])

# The team total is computed while all five rank columns are still present.
creeps_dif['sum_creeps_dif'] = creeps_dif.sum(axis=1)

# Only the two top ranks and the total are kept. Standardisation works column
# by column, so selecting before normalising gives the same values as the
# earlier drop-normalise-select sequence.
creeps_dif = normalize_data(creeps_dif[['creep_dif_4', 'creep_dif_3', 'sum_creeps_dif']])
creeps_dif.columns = [u'best_creeps_dif', u'second_creeps_dif', u'sum_creeps_dif']
creeps_dif.reset_index(inplace=True)
creeps_dif.head()

Unnamed: 0,mid,best_creeps_dif,second_creeps_dif,sum_creeps_dif
0,0,-1.134062,-0.235939,-1.183
1,1,0.849864,0.560363,0.338896
2,2,0.787866,2.080576,1.994292
3,3,-0.142099,-0.163548,-0.301902
4,4,0.477878,1.0671,1.113194


## Events 

In [71]:
# Raw event log; the preview below shows the columns mid, event_type,
# from_team and time.
events = pd.read_csv('data/events.csv')
events.head()

Unnamed: 0,mid,event_type,from_team,time
0,0,3,radiant,1
1,1,3,radiant,222
2,2,3,dire,143
3,3,3,radiant,143
4,4,3,dire,53


Хитро перевожу все типы событий в dummy-признаки. Использоваться не будут take_aegis, steal_aegis, соответствующие первым двум типам

In [76]:
events = pd.read_csv('data/events.csv')

# Number of matches. The `gold` frame used here before is not defined anywhere
# in the notebook (NameError on a fresh kernel); new_gold has one row per match.
# NOTE(review): np.arange below assumes match ids run 0..n-1 without gaps - confirm.
n_matches = new_gold.shape[0]

# Event names by event_type code; only codes 2-6 are turned into features.
eventname = ['take_aegis', 'steal_aegis', 'destroy_barracks', 'make_fb', 'kill_roshan', 'denay_tower', 'destroy_tower']

dummy_events = pd.DataFrame(data=np.arange(n_matches), columns=['mid'])

for event_index in range(2, 7):
    event_label = eventname[event_index]
    # .copy() so that the new columns are added to an independent frame and
    # not to a slice of `events` (avoids SettingWithCopyWarning).
    current_event = events.loc[events.event_type == event_index, ['mid', 'from_team']].copy()
    current_event['radiant_' + event_label] = (current_event.from_team == 'radiant').astype(int)
    current_event['dire_' + event_label] = (current_event.from_team == 'dire').astype(int)
    # Count the events of this type per match and per team.
    current_event = current_event.drop('from_team', axis=1).groupby('mid').sum().reset_index()
    dummy_events = pd.merge(dummy_events, current_event, on='mid', how='left')

# Matches without a given event get 0; the left merges turned the counts into
# floats, so cast everything back to int.
dummy_events = dummy_events.fillna(0).astype(int)
dummy_events.head()

Unnamed: 0,mid,radiant_destroy_barracks,dire_destroy_barracks,radiant_make_fb,dire_make_fb,radiant_kill_roshan,dire_kill_roshan,radiant_denay_tower,dire_denay_tower,radiant_destroy_tower,dire_destroy_tower
0,0,0,0,1,0,0,0,0,0,0,0
1,1,0,0,1,0,0,0,0,0,0,0
2,2,0,0,0,1,0,0,0,0,0,0
3,3,0,0,1,0,0,0,0,0,0,0
4,4,0,0,0,1,0,0,0,0,0,0


Смержим все полученные таблицы и проверим на них модели логистической регрессии, Ridge и Lasso на кросс-валидации по 5 фолдам

In [82]:
# NOTE(review): `model_events` is not defined in any cell shown in this
# notebook; the events table built above is named `dummy_events`. Confirm
# which table is meant before re-running on a fresh kernel.
X_train, y_train, test= total_merge(gold_stats, heroes_pick, items_dif, rank_xp_dif, creeps_dif, model_events)
# Index the training features by match id and remove the id from the feature columns.
X_train.index = X_train.mid
X_train.drop('mid', 1,inplace=True)
my_clf_cross_val(X_train, y_train)

logreg:  0.769098493374 0.00599311804189
Ridge:  0.768202455192 0.00574751213618
Lasso:  0.768747992879 0.00607401194039


Обучим логистическую регрессию по всем тренировочным данным и получим предсказание на тестовых.

In [81]:
# Index the test features by match id (removing the id from the columns),
# then fit logistic regression on the whole training set.
X_test = test.set_index('mid')
clf = LogisticRegression(random_state=241, C=0.1)
clf.fit(X_train, y_train)

# Store the second column of predict_proba as the prediction for each test match.
test_matches = pd.read_csv('data/test.csv')
test_matches['radiant_won'] = clf.predict_proba(X_test)[:, 1]

Небольшой хак для крохоборов: в полученной табличке заменим предсказания во всех матчах, где были сломаны бараки, на победу той команды, что сломала больше

In [80]:
midRadWin = dummy_events.loc[dummy_events.radiant_destroy_barracks!=0].mid.values
midDireWin = dummy_events.loc[dummy_events.dire_destroy_barracks!=0].mid.values
print midRadWin, midDireWin

[14337 34802 40699] [ 7341  9348  9512 29462 36614 47375]


In [83]:
# Override the model wherever barracks were destroyed: the side that destroyed
# MORE barracks is declared the winner. The earlier loops wrote 1 for every
# radiant mid and then 0 for every dire mid, so a match in which both sides
# destroyed barracks always ended up as a dire win regardless of the counts.
# With equal counts the model's own prediction is now left untouched.
barracks = dummy_events.set_index('mid')
barracks_gap = barracks.radiant_destroy_barracks - barracks.dire_destroy_barracks
radiant_ahead = barracks_gap.index[(barracks_gap > 0).values]
dire_ahead = barracks_gap.index[(barracks_gap < 0).values]

# Label-based .loc replaces the removed .ix indexer; mids that are absent
# from the test set simply match no row.
test_matches.loc[test_matches.mid.isin(radiant_ahead), 'radiant_won'] = 1
test_matches.loc[test_matches.mid.isin(dire_ahead), 'radiant_won'] = 0

test_matches.to_csv('submissions/final_to_kaggle.csv', index=None)
test_matches.head()

Unnamed: 0,mid,radiant_won
0,3,0.6985
1,7,0.513712
2,9,0.158692
3,10,0.387342
4,12,0.561898
