In [1]:
import numpy as np
import pandas as pd

try:
    # cross_val_score lives in model_selection since scikit-learn 0.18.
    from sklearn.model_selection import cross_val_score
except ImportError:
    # Legacy location, removed in scikit-learn 0.20.
    from sklearn.cross_validation import cross_val_score
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Read the three input tables from the working directory, in this order:
# training matches, test matches, per-player gold over time.
train_matches, test_matches, gold = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'gold.csv')
)

In [3]:
# Preview the first five rows of the training match table.
train_matches.head(n=5)

Unnamed: 0,mid,radiant_won
0,0,1
1,1,0
2,2,1
3,4,1
4,5,1


In [4]:
# Preview up to the first twenty rows of the raw gold table.
gold.head(n=20)

Unnamed: 0,mid,times,player_0,player_1,player_2,player_3,player_4,player_5,player_6,player_7,player_8,player_9
0,0,60,750,350,389,437,428,398,344,654,287,1056
1,0,120,957,1071,633,655,1080,669,1147,1164,438,1360
2,0,180,1161,1527,782,1103,1346,1058,1479,1574,587,2072
3,0,240,1571,2033,932,1515,2058,1760,1767,2387,737,2283
4,0,300,1721,2313,1082,1790,2699,2087,1986,2898,887,3302
5,0,360,1871,2753,1232,2126,3645,2417,2382,3416,1037,4071
6,0,420,2022,3216,1382,2703,4176,2567,2778,4115,1187,4686
7,0,480,2850,3941,2129,3249,5040,2717,3326,4931,1424,5207
8,0,540,3303,4686,2402,3716,5546,3428,3596,5580,1574,5609
9,0,600,3454,5206,2613,4426,5755,4072,3997,5917,1725,6384


В качестве признаков будем использовать количество денег, которое игроки успели получить к 10-й минуте игры (строки с times == 600).

In [5]:
# Keep only the snapshot rows with times == 600 (the 10-minute mark used as
# the feature cut-off). .copy() makes the result an independent frame, so the
# columns added to `gold` in later cells are written to it directly instead of
# to a slice of the original table, which raises SettingWithCopyWarning.
gold = gold[gold.times == 600].copy()

In [6]:
# Preview the first five rows left after the times == 600 filter.
gold.head(n=5)

Unnamed: 0,mid,times,player_0,player_1,player_2,player_3,player_4,player_5,player_6,player_7,player_8,player_9
9,0,600,3454,5206,2613,4426,5755,4072,3997,5917,1725,6384
19,1,600,2477,5760,3816,4353,5759,7659,5066,2748,4440,4623
29,2,600,3604,1948,8581,4390,2869,3096,2301,5130,2530,2491
39,3,600,3457,5464,4432,2961,4314,3345,4791,1906,5328,2247
49,4,600,3675,4103,5154,3030,2076,3920,3494,3392,4458,2220


Просуммируем деньги для каждой команды.

In [7]:
# Columns summed into each team's total: player_0..player_4 go to the Radiant
# total, player_5..player_9 go to the Dire total.
radiant_players = ['player_0', 'player_1', 'player_2', 'player_3', 'player_4']
dire_players = ['player_5', 'player_6', 'player_7', 'player_8', 'player_9']

# Row-wise sums: one team total per row of `gold`.
radiant_gold = gold[radiant_players].sum(axis=1)
dire_gold = gold[dire_players].sum(axis=1)

# Store both totals as new columns (Radiant first, then Dire).
gold['radiant_gold'], gold['dire_gold'] = radiant_gold, dire_gold

In [8]:
# Preview the first five rows, now including the two team-total columns.
gold.head(n=5)

Unnamed: 0,mid,times,player_0,player_1,player_2,player_3,player_4,player_5,player_6,player_7,player_8,player_9,radiant_gold,dire_gold
9,0,600,3454,5206,2613,4426,5755,4072,3997,5917,1725,6384,21454,22095
19,1,600,2477,5760,3816,4353,5759,7659,5066,2748,4440,4623,22165,24536
29,2,600,3604,1948,8581,4390,2869,3096,2301,5130,2530,2491,21392,15548
39,3,600,3457,5464,4432,2961,4314,3345,4791,1906,5328,2247,20628,17617
49,4,600,3675,4103,5154,3030,2076,3920,3494,3392,4458,2220,18038,17484


Сделаем несколько простых признаков: разность и отношение суммарных денег команд.

In [9]:
# Two features derived from the team totals: Radiant's gold minus Dire's
# (difference) and Radiant's gold divided by Dire's (ratio).
radiant_total = gold['radiant_gold']
dire_total = gold['dire_gold']

gold['diff_gold'] = radiant_total - dire_total
gold['ratio_gold'] = radiant_total / dire_total

Склеим всё с нужными играми из train и test.

In [10]:
# Join the times == 600 gold features onto each match id, then drop the join
# key and the `times` column so only model features remain.
# how='left' keeps the rows of the match table in their original order, so
# (assuming one `gold` row per mid) `train` stays row-aligned with
# train_matches, which later supplies the target.
# NOTE(review): a match with no times == 600 row in `gold` would get NaN
# features here -- confirm every match has a 10-minute snapshot.
# `axis` is passed by keyword: the positional form drop(labels, 1) is
# deprecated in pandas 1.x and rejected by pandas 2.x.
train = (
    pd.merge(train_matches[['mid']], gold, on='mid', how='left')
    .drop(['mid', 'times'], axis=1)
)
test = (
    pd.merge(test_matches[['mid']], gold, on='mid', how='left')
    .drop(['mid', 'times'], axis=1)
)

In [11]:
# Preview the first five rows of the training feature table.
train.head(n=5)

Unnamed: 0,player_0,player_1,player_2,player_3,player_4,player_5,player_6,player_7,player_8,player_9,radiant_gold,dire_gold,diff_gold,ratio_gold
0,3454,5206,2613,4426,5755,4072,3997,5917,1725,6384,21454,22095,-641,0.970989
1,2477,5760,3816,4353,5759,7659,5066,2748,4440,4623,22165,24536,-2371,0.903366
2,3604,1948,8581,4390,2869,3096,2301,5130,2530,2491,21392,15548,5844,1.375868
3,3675,4103,5154,3030,2076,3920,3494,3392,4458,2220,18038,17484,554,1.031686
4,4252,2412,2545,4264,2544,4752,5389,4954,3954,2992,16017,22041,-6024,0.726691


Зафиксируем, что является признаками, а что — таргетом.

In [8]:
# Feature matrices are the raw array values of the merged feature frames.
x_train, x_test = train.values, test.values

# Target vector: the radiant_won column of the training match table.
y_train = train_matches['radiant_won'].values

В качестве модели будем использовать RandomForestClassifier со 100 деревьями.

In [9]:
# Random forest with 100 trees; random_state is fixed so repeated runs give
# the same model, and n_jobs=-1 asks scikit-learn to use all available cores.
clf = RandomForestClassifier(random_state=1234, n_jobs=-1, n_estimators=100)

# ROC AUC on each of the 5 cross-validation folds; the cell displays the mean.
fold_auc = cross_val_score(clf, x_train, y_train, scoring='roc_auc', cv=5)
np.mean(fold_auc)

0.6859411256907274

Обучимся на полных данных.

In [10]:
# Refit the same classifier on the entire training set; fit() returns the
# estimator itself, so the cell displays its parameters.
clf.fit(X=x_train, y=y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=False, random_state=1234, verbose=0,
            warm_start=False)

Добавим новую колонку с предсказаниями в таблицу с тестовыми идентификаторами.

In [11]:
# predict_proba returns one column per class; column index 1 is stored as the
# predicted radiant_won value.
# NOTE(review): this assumes the labels are 0/1 so that index 1 is the
# "radiant won" class -- confirm against clf.classes_.
radiant_win_proba = clf.predict_proba(x_test)[:, 1]
test_matches['radiant_won'] = radiant_win_proba

И запишем результат в файл для отправки на Kaggle.

In [12]:
# Write the test table (now including the predicted radiant_won column) to
# baseline.csv. `index` is documented as a boolean; index=False states
# explicitly that the row index must not be written, instead of relying on
# None being falsy.
test_matches.to_csv('baseline.csv', index=False)