In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import cross_val_score
from sklearn.ensemble import RandomForestClassifier

%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
events = pd.read_csv('events.csv')
events.head()

Unnamed: 0,mid,event_type,from_team,time
0,0,3,radiant,1
1,1,3,radiant,222
2,2,3,dire,143
3,3,3,radiant,143
4,4,3,dire,53


In [3]:
train_matches = pd.read_csv('train.csv')
test_matches = pd.read_csv('test.csv')
print test_matches.shape

(24974, 1)


event_type — тип события

0 — Командой был забран Aegis

1 — Командой был украден Aegis

2 — Командой были разрушены бараки соперника

3 — Командой был сделано первое убийство героя соперника

4 — Командой был убит Roshan.

5 — Командой была разрушена своя башня

6 — Командой была разрушена башня соперника

### 3 FB

In [5]:
temp = events.loc[(events['event_type'] == 3)]
temp.loc[:, 'made_fb'] = temp.from_team.apply(lambda x: 1 if x=='radiant' else 0)
temp = temp[['mid', 'made_fb']]
temp.head()

Unnamed: 0,mid,made_fb
0,0,1
1,1,1
2,2,0
3,3,1
4,4,0


In [6]:
train = pd.merge(train_matches, temp, on='mid', how='left')
test = pd.merge(test_matches, temp, on='mid', how='left')
print test.shape
print train.shape

(24974, 2)
(24974, 3)


In [7]:
train.head()

Unnamed: 0,mid,radiant_won,made_fb
0,0,1,1.0
1,1,0,1.0
2,2,1,0.0
3,4,1,0.0
4,5,1,


In [8]:
train.loc[train.made_fb != 0.5][['radiant_won', 'made_fb']].corr()

Unnamed: 0,radiant_won,made_fb
radiant_won,1.0,0.114045
made_fb,0.114045,1.0


### Roshan killed

In [8]:
temp = events.loc[(events['event_type'] == 4)]
temp.loc[:, 'roshan_killed'] = temp.from_team.apply(lambda x: 1 if x=='radiant' else 0)
temp = temp[['mid', 'roshan_killed']]
temp.head()

Unnamed: 0,mid,roshan_killed
170,128,0
224,166,1
423,309,0
428,311,0
508,370,0


In [9]:
train = pd.merge(train, temp, on='mid', how='left')
test2 = pd.merge(test, temp, on='mid', how='left')
print test2.shape
print test.shape

(24975, 3)
(24974, 2)


In [16]:
temp.groupby('mid').count().max()

roshan_killed    2
dtype: int64

In [12]:
for elem in test2.mid.values:
    if elem not in test.mid.values:
        print elem

In [11]:
train.roshan_killed.sum()

209.0

In [12]:
train.loc[train.roshan_killed != 0.5][['radiant_won', 'roshan_killed']].corr()

Unnamed: 0,radiant_won,roshan_killed
radiant_won,1.0,0.278342
roshan_killed,0.278342,1.0


### Destroy Barracks

In [9]:
temp = events.loc[(events['event_type'] == 2)]
temp.loc[:, 'destroy_barracks'] = temp.from_team.apply(lambda x: 1 if x=='radiant' else 0)
temp = temp[['mid', 'destroy_barracks']]
temp.head()
# train = pd.merge(train, temp, on='mid', how='left')
# test = pd.merge(test, temp, on='mid', how='left')

In [10]:
train.fillna(0.5, inplace=True)
test.fillna(0.5, inplace = True)
train.head(10)

Unnamed: 0,mid,radiant_won,made_fb,destroy_barracks
0,0,1,1.0,0.5
1,1,0,1.0,0.5
2,2,1,0.0,0.5
3,4,1,0.0,0.5
4,5,1,0.5,0.5
5,6,1,1.0,0.5
6,8,0,0.0,0.5
7,11,0,1.0,0.5
8,20,1,1.0,0.5
9,26,1,1.0,0.5


In [15]:
train.loc[train.destroy_barracks != 0.5][['radiant_won', 'destroy_barracks']]

Unnamed: 0,radiant_won,destroy_barracks
4587,0,0.0
4588,0,0.0
7118,1,1.0
7119,1,1.0
7120,1,1.0
18365,0,0.0
18366,0,0.0
20361,1,1.0
20362,1,1.0
23699,0,0.0


In [16]:
train.loc[train.destroy_barracks != 0.5][['radiant_won', 'destroy_barracks']].corr()

Unnamed: 0,radiant_won,destroy_barracks
radiant_won,1.0,1.0
destroy_barracks,1.0,1.0


### Towers

In [8]:
temp = events.loc[(events['event_type'] == 6)]
temp.head()

Unnamed: 0,mid,event_type,from_team,time
6,6,6,radiant,523
8,7,6,radiant,283
9,7,6,radiant,595
11,8,6,dire,523
20,15,6,radiant,495


In [9]:
radiant_push = events.loc[(events['event_type'] == 6) & (events.from_team == 'radiant')]
radiant_push = radiant_push.groupby('mid').count().from_team
radiant_push = radiant_push.reset_index(level=0)
radiant_push.columns = ['mid', 'radiant_destroy_tower']
train = pd.merge(train, radiant_push, on='mid', how='left')
test = pd.merge(test, radiant_push, on='mid', how='left')

In [10]:
dire_push = events.loc[(events['event_type'] == 6) & (events.from_team == 'dire')]
dire_push = dire_push.groupby('mid').count().from_team
dire_push = dire_push.reset_index(level=0)
dire_push.columns = ['mid', 'dire_destroy_tower']
train = pd.merge(train, dire_push, on='mid', how='left')
test = pd.merge(test, dire_push, on='mid', how='left')

In [11]:
radiant_dinay = events.loc[(events['event_type'] == 5) & (events.from_team == 'radiant')]
radiant_dinay = radiant_dinay.groupby('mid').count().from_team
radiant_dinay = radiant_dinay.reset_index(level=0)
radiant_dinay.columns = ['mid', 'radiant_dinay_tower']
train = pd.merge(train, radiant_dinay, on='mid', how='left')
test = pd.merge(test, radiant_dinay, on='mid', how='left')

In [12]:
dire_dinay = events.loc[(events['event_type'] == 5) & (events.from_team == 'dire')]
dire_dinay = dire_dinay.groupby('mid').count().from_team
dire_dinay = dire_dinay.reset_index(level=0)
dire_dinay.columns = ['mid', 'dire_dinay_tower']
train = pd.merge(train, dire_dinay, on='mid', how='left')
test = pd.merge(test, dire_dinay, on='mid', how='left')

In [13]:
train.fillna(0, inplace=True)
test.fillna(0, inplace=True)
train.head(20)

Unnamed: 0,mid,radiant_won,made_fb,radiant_destroy_tower,dire_destroy_tower,radiant_dinay_tower,dire_dinay_tower
0,0,1,1.0,0.0,0.0,0.0,0.0
1,1,0,1.0,0.0,0.0,0.0,0.0
2,2,1,0.0,0.0,0.0,0.0,0.0
3,4,1,0.0,0.0,0.0,0.0,0.0
4,5,1,0.0,0.0,0.0,0.0,0.0
5,6,1,1.0,1.0,0.0,0.0,0.0
6,8,0,0.0,0.0,1.0,0.0,0.0
7,11,0,1.0,0.0,0.0,0.0,0.0
8,20,1,1.0,0.0,0.0,0.0,0.0
9,26,1,1.0,0.0,0.0,0.0,0.0


In [14]:
test.shape

(24974, 6)

In [15]:
train.radiant_destroy_tower = train.radiant_destroy_tower.astype(int32)
train.dire_destroy_tower = train.dire_destroy_tower.astype(int32)
train.radiant_dinay_tower = train.radiant_dinay_tower.astype(int32)
train.dire_dinay_tower = train.dire_dinay_tower.astype(int32)
train.head()

Unnamed: 0,mid,radiant_won,made_fb,radiant_destroy_tower,dire_destroy_tower,radiant_dinay_tower,dire_dinay_tower
0,0,1,1.0,0,0,0,0
1,1,0,1.0,0,0,0,0
2,2,1,0.0,0,0,0,0
3,4,1,0.0,0,0,0,0
4,5,1,0.0,0,0,0,0


In [16]:
train.to_csv('train_events.csv', index=None)
test.to_csv('test_events.csv', index=None)

In [25]:
train = pd.read_csv('train_events.csv')
test = pd.read_csv('test_events.csv')
print test.shape
y_train = train.radiant_won
x_train = train.drop('radiant_won', 1)

(24974, 6)


In [26]:
from sklearn.cross_validation import cross_val_score
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=1234)
np.mean(cross_val_score(clf, x_train, y_train, cv=5, scoring='roc_auc'))

0.4925798599938398

In [20]:
from sklearn.linear_model import LogisticRegression
clf=LogisticRegression(random_state=241, C=0.01)
np.mean(cross_val_score(clf, x_train, y_train, cv=5, scoring='roc_auc'))

0.5872906442241842

## GOLD

In [27]:
gold = pd.read_csv('gold.csv')
gold = gold[gold.times == 600]
gold.drop('times', 1, inplace=True)
gold.head()

Unnamed: 0,mid,player_0,player_1,player_2,player_3,player_4,player_5,player_6,player_7,player_8,player_9
9,0,3454,5206,2613,4426,5755,4072,3997,5917,1725,6384
19,1,2477,5760,3816,4353,5759,7659,5066,2748,4440,4623
29,2,3604,1948,8581,4390,2869,3096,2301,5130,2530,2491
39,3,3457,5464,4432,2961,4314,3345,4791,1906,5328,2247
49,4,3675,4103,5154,3030,2076,3920,3494,3392,4458,2220


In [28]:
radiant_gold = gold[['player_0', 'player_1', 'player_2', 'player_3', 'player_4']].sum(axis=1)
dire_gold = gold[['player_5', 'player_6', 'player_7', 'player_8', 'player_9']].sum(axis=1)

gold['radiant_gold'] = radiant_gold
gold['dire_gold'] = dire_gold
gold.head()

Unnamed: 0,mid,player_0,player_1,player_2,player_3,player_4,player_5,player_6,player_7,player_8,player_9,radiant_gold,dire_gold
9,0,3454,5206,2613,4426,5755,4072,3997,5917,1725,6384,21454,22095
19,1,2477,5760,3816,4353,5759,7659,5066,2748,4440,4623,22165,24536
29,2,3604,1948,8581,4390,2869,3096,2301,5130,2530,2491,21392,15548
39,3,3457,5464,4432,2961,4314,3345,4791,1906,5328,2247,20628,17617
49,4,3675,4103,5154,3030,2076,3920,3494,3392,4458,2220,18038,17484


In [29]:
gold['diff_gold'] = gold['radiant_gold'] - gold['dire_gold']
gold['ratio_gold'] = gold['radiant_gold'] / gold['dire_gold']

gold.head()

Unnamed: 0,mid,player_0,player_1,player_2,player_3,player_4,player_5,player_6,player_7,player_8,player_9,radiant_gold,dire_gold,diff_gold,ratio_gold
9,0,3454,5206,2613,4426,5755,4072,3997,5917,1725,6384,21454,22095,-641,0.970989
19,1,2477,5760,3816,4353,5759,7659,5066,2748,4440,4623,22165,24536,-2371,0.903366
29,2,3604,1948,8581,4390,2869,3096,2301,5130,2530,2491,21392,15548,5844,1.375868
39,3,3457,5464,4432,2961,4314,3345,4791,1906,5328,2247,20628,17617,3011,1.170914
49,4,3675,4103,5154,3030,2076,3920,3494,3392,4458,2220,18038,17484,554,1.031686


In [30]:
train = pd.merge(train, gold, on='mid', how='left')
test = pd.merge(test, gold, on='mid', how='left')

In [31]:
train.head()

Unnamed: 0,mid,radiant_won,made_fb,radiant_destroy_tower,dire_destroy_tower,radiant_dinay_tower,dire_dinay_tower,player_0,player_1,player_2,...,player_4,player_5,player_6,player_7,player_8,player_9,radiant_gold,dire_gold,diff_gold,ratio_gold
0,0,1,1.0,0,0,0,0,3454,5206,2613,...,5755,4072,3997,5917,1725,6384,21454,22095,-641,0.970989
1,1,0,1.0,0,0,0,0,2477,5760,3816,...,5759,7659,5066,2748,4440,4623,22165,24536,-2371,0.903366
2,2,1,0.0,0,0,0,0,3604,1948,8581,...,2869,3096,2301,5130,2530,2491,21392,15548,5844,1.375868
3,4,1,0.0,0,0,0,0,3675,4103,5154,...,2076,3920,3494,3392,4458,2220,18038,17484,554,1.031686
4,5,1,0.0,0,0,0,0,4252,2412,2545,...,2544,4752,5389,4954,3954,2992,16017,22041,-6024,0.726691


In [32]:
x_train = train.drop('radiant_won', 1)
y_train = train.radiant_won
clf = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=1234)
np.mean(cross_val_score(clf, x_train, y_train, cv=5, scoring='roc_auc'))

0.67855996631968751

In [33]:
def normalizedata(X):
    return pd.DataFrame(StandardScaler().fit_transform(X), index = X.index, columns=X.columns)
def logEstimation(X, y):
    grid = {'C': np.power(10.0, np.arange(-5, 1))}
    kf=KFold(y.size, n_folds=5, shuffle=True, random_state=241)
    clf=LogisticRegression(random_state=241)
    gs = GridSearchCV(clf, grid, scoring='roc_auc', cv=kf)
    gs.fit(X, y)
    return gs

In [34]:
X = x_train
X_norm = normalizedata(X)
gs = logEstimation(X_norm, y_train)
print gs.grid_scores_
print gs.best_params_
print('best score: {}'.format(gs.best_score_))

[mean: 0.70224, std: 0.00509, params: {'C': 1.0000000000000001e-05}, mean: 0.70413, std: 0.00512, params: {'C': 0.0001}, mean: 0.70680, std: 0.00514, params: {'C': 0.001}, mean: 0.70704, std: 0.00517, params: {'C': 0.01}, mean: 0.70704, std: 0.00519, params: {'C': 0.10000000000000001}, mean: 0.70703, std: 0.00521, params: {'C': 1.0}]
{'C': 0.10000000000000001}
best score: 0.707041678752


In [39]:
clf=LogisticRegression(random_state=241, C=0.1)
np.mean(cross_val_score(clf, x_train, y_train, cv=5, scoring='roc_auc'))

0.70530212931477165

In [40]:
test.head()

Unnamed: 0,mid,made_fb,radiant_destroy_tower,dire_destroy_tower,radiant_dinay_tower,dire_dinay_tower,player_0,player_1,player_2,player_3,player_4,player_5,player_6,player_7,player_8,player_9,radiant_gold,dire_gold,diff_gold,ratio_gold
0,3,1.0,0.0,0.0,0.0,0.0,3457,5464,4432,2961,4314,3345,4791,1906,5328,2247,20628,17617,3011,1.170914
1,7,0.0,2.0,0.0,0.0,0.0,5491,5337,3958,5490,5563,5131,5061,5643,3541,3243,25839,22619,3220,1.142358
2,9,0.0,0.0,0.0,0.0,0.0,2136,3799,3400,4551,3199,4113,3853,3370,3837,6154,17085,21327,-4242,0.801097
3,10,1.0,0.0,0.0,0.0,0.0,4985,2829,3651,4485,5154,2903,2582,4347,6788,3518,21104,20138,966,1.047969
4,12,0.0,0.0,0.0,0.0,0.0,2421,5541,4862,4040,2952,4524,4983,3276,4448,4973,19816,22204,-2388,0.892452


In [41]:
test_matches = pd.read_csv('test.csv')

In [42]:
clf=LogisticRegression(random_state=241, C=1)
clf.fit(x_train, y_train)
test_matches['radiant_won'] = clf.predict_proba(test)[:, 1]

In [43]:
test_matches.head()

Unnamed: 0,mid,radiant_won
0,3,0.678157
1,7,0.691814
2,9,0.275644
3,10,0.571543
4,12,0.371858


In [45]:
test.head()

Unnamed: 0,mid,made_fb,radiant_destroy_tower,dire_destroy_tower,radiant_dinay_tower,dire_dinay_tower,player_0,player_1,player_2,player_3,player_4,player_5,player_6,player_7,player_8,player_9,radiant_gold,dire_gold,diff_gold,ratio_gold
0,3,1.0,0.0,0.0,0.0,0.0,3457,5464,4432,2961,4314,3345,4791,1906,5328,2247,20628,17617,3011,1.170914
1,7,0.0,2.0,0.0,0.0,0.0,5491,5337,3958,5490,5563,5131,5061,5643,3541,3243,25839,22619,3220,1.142358
2,9,0.0,0.0,0.0,0.0,0.0,2136,3799,3400,4551,3199,4113,3853,3370,3837,6154,17085,21327,-4242,0.801097
3,10,1.0,0.0,0.0,0.0,0.0,4985,2829,3651,4485,5154,2903,2582,4347,6788,3518,21104,20138,966,1.047969
4,12,0.0,0.0,0.0,0.0,0.0,2421,5541,4862,4040,2952,4524,4983,3276,4448,4973,19816,22204,-2388,0.892452


In [54]:
ids = events.loc[(events['event_type'] == 2)].mid.values
events.loc[(events['event_type'] == 2)]

Unnamed: 0,mid,event_type,from_team,time
10232,7341,2,dire,536
10233,7341,2,dire,548
12977,9348,2,dire,568
12978,9348,2,dire,576
13209,9512,2,dire,497
13210,9512,2,dire,502
19879,14337,2,radiant,430
19880,14337,2,radiant,459
19885,14337,2,radiant,595
40779,29462,2,dire,595


In [55]:
l = []
for mid in test_matches.mid.values:
    if mid in ids:
        l.append(mid)
        print mid

7341
9512
29462
34802


In [44]:
test_matches.to_csv('log_solve+gold+towers.csv', index=None)

In [56]:
for mid in l:
    print test_matches.loc[test_matches.mid == mid]

       mid  radiant_won
3729  7341     0.055096
       mid  radiant_won
4848  9512     0.000084
         mid  radiant_won
14669  29462     0.000343
         mid  radiant_won
17331  34802     0.408486
