In [1]:
import pandas as pd
import numpy as np
import random


from sklearn.model_selection import cross_val_score, KFold

### Data reading

In [2]:
df_train = pd.read_csv('../features.csv')
df_test = pd.read_csv('../features_test.csv')
df_all = pd.concat([df_train, df_test]).reset_index(drop=True)

df_train.shape, df_test.shape, df_all.shape

((97230, 109), (17177, 103), (114407, 109))

In [3]:
df_all =df_all[ list(df_test.columns) + ['radiant_win']] 
df_all

Unnamed: 0,match_id,start_time,lobby_type,r1_hero,r1_level,r1_xp,r1_gold,r1_lh,r1_kills,r1_deaths,...,radiant_first_ward_time,dire_bottle_time,dire_courier_time,dire_flying_courier_time,dire_tpscroll_count,dire_boots_count,dire_ward_observer_count,dire_ward_sentry_count,dire_first_ward_time,radiant_win
0,0,1430198770,7,11,5,2098,1489,20,0,0,...,35.0,103.0,-84.0,221.0,3,4,2,2,-52.0,1.0
1,1,1430220345,0,42,4,1188,1033,9,0,1,...,-20.0,149.0,-84.0,195.0,5,4,3,1,-5.0,1.0
2,2,1430227081,7,33,4,1319,1270,22,0,0,...,-39.0,45.0,-77.0,221.0,3,4,3,1,13.0,0.0
3,3,1430263531,1,29,4,1779,1056,14,0,0,...,-30.0,124.0,-80.0,184.0,0,4,2,0,27.0,0.0
4,4,1430282290,7,13,4,1431,1090,8,1,0,...,46.0,182.0,-80.0,225.0,6,3,3,0,-16.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
114402,114369,1450212780,7,11,5,2054,1941,27,0,1,...,8.0,253.0,-87.0,,4,3,2,1,-33.0,
114403,114377,1450222875,1,3,3,748,605,1,0,0,...,-1.0,133.0,-85.0,184.0,2,3,4,1,-18.0,
114404,114378,1450223593,1,85,2,575,499,0,0,0,...,20.0,133.0,-88.0,239.0,4,4,4,0,-36.0,
114405,114393,1450244771,0,7,4,1844,1176,8,1,2,...,-28.0,,-83.0,,1,4,1,0,,


### Gradient Boosting

In [4]:
from sklearn.ensemble import GradientBoostingClassifier

In [5]:
df_train = df_all[df_all['radiant_win'].notnull()].fillna(99999)
df_test = df_all[df_all['radiant_win'].isnull()].fillna(99999)

In [6]:
drop_features = ['radiant_win', 'match_id', 'start_time']
features = [f for f in df_all.columns if f not in drop_features]

In [7]:
def cross_val_GB(df_train, features=None, target='radiant_win', n_est = [10, 20, 30], frac=1):
    if frac < 1:
        df_train = df_train.sample(frac=frac)
    if features is None:
        features = [f for f in df_train.columns if f != target ]
    X_train = df_train[features].values
    y_train = df_train[target].values
    cvs = {}
    for n in n_est:
        clf = GradientBoostingClassifier(n_estimators=n, max_depth=3)
        cv = KFold(n_splits=5, shuffle=True)
        accur = cross_val_score(clf, X_train, y_train, cv=cv)
        cvs[n] = accur.mean()
        # print(n, 'done')
    cvs = pd.Series(cvs)
    return cvs

In [8]:
%%time

cross_val_GB(df_train, features, frac=0.3)

Wall time: 2min


10    0.618362
20    0.629915
30    0.633275
dtype: float64

### Logistic regression

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

In [10]:
df_train = df_all[df_all['radiant_win'].notnull()].fillna(0)
df_test = df_all[df_all['radiant_win'].isnull()].fillna(0)

In [11]:
drop_features = ['radiant_win', 'match_id', 'start_time']
features_all = [f for f in df_all.columns if f not in drop_features]

In [12]:
def cross_val_LR(df_train, features=None, target='radiant_win', C = [0.25, 0.5, 1, 1.5, 2], frac=1, scale=False):
    if frac < 1:
        df_train = df_train.sample(frac=frac)
    if features is None:
        features = [f for f in df_train.columns if f != target ]
    X_train = df_train[features].values
    y_train = df_train[target].values
    
    if scale:
        ss = StandardScaler()
        X_train = ss.fit_transform(X_train)
        
    cvs = {}
    for c in C:
        clf = LogisticRegression(C=c)
        cv = KFold(n_splits=5, shuffle=True)
        accur = cross_val_score(clf, X_train, y_train, cv=cv)
        cvs[c] = accur.mean()
    cvs = pd.Series(cvs)
    return cvs
                   

In [14]:
%%time
features = features_all
stats = cross_val_LR(df_train, features, scale=True)
stats

Wall time: 39.3 s


0.25    0.655394
0.50    0.654623
1.00    0.654716
1.50    0.654520
2.00    0.655034
dtype: float64

In [15]:
categ_features = [f for f in features_all if f.endswith('_hero')] + ['lobby_type']
categ_features

['r1_hero',
 'r2_hero',
 'r3_hero',
 'r4_hero',
 'r5_hero',
 'd1_hero',
 'd2_hero',
 'd3_hero',
 'd4_hero',
 'd5_hero',
 'lobby_type']

In [17]:
%%time 

features = [f for f in features_all if f not in categ_features]
stats = cross_val_LR(df_train, features, C=[stats.idxmax()] ,scale=True)
stats

Wall time: 9.79 s


0.25    0.654767
dtype: float64

In [20]:
heroes = np.unique( df_train[[x for x in df_all.columns if x.endswith('_hero')]].values.reshape(-1) )
heroes

array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
        14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  25,  26,  27,
        28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,  40,
        41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,  53,
        54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,  66,
        67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,  79,
        80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,  92,
        93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103, 104, 105,
       106, 109, 110, 112], dtype=int64)

In [21]:
%%time

X_pick = np.zeros((df_train.shape[0], heroes.max()))

for i, match_id in enumerate(df_train.index):
    for p in range(1,6):
        X_pick[i, int(df_train.loc[i][ 'r{}_hero'.format(p)]-1)] = 1
        X_pick[i, int(df_train.loc[i][ 'd{}_hero'.format(p)]-1)] = -1
    if i % 10000 == 0:
        print(i)
X_pick

0
10000
20000
30000
40000
50000
60000
70000
80000
90000
Wall time: 4min 59s


array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ...,
       [ 0.,  0.,  0., ...,  0.,  0.,  1.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0., -1.]])

In [None]:
%%time
df = pd.concat(df_train, pd.DataFrame(X_pick))
df

In [None]:
%%time


cross_val_LR(df, features