In [None]:
import pandas as pd
import numpy as np
import random

from sklearn.model_selection import cross_val_score, KFold

import time
import datetime

## Data reading

In [None]:
# Read the train and test feature tables.
df_train = pd.read_csv('../data/features.csv')
df_test = pd.read_csv('../data/features_test.csv')
# Stack both tables into one frame with a fresh 0..n-1 index.
df_all = pd.concat([df_train, df_test], ignore_index=True)

# Shapes of the three frames.
df_train.shape, df_test.shape, df_all.shape

#### Drop features from the train sample that are absent from the test sample

In [None]:
# Keep only the columns that exist in the test table, plus the target column.
df_all = df_all.loc[:, list(df_test.columns) + ['radiant_win']]
df_all

#### Columns with missing values

In [None]:
# Names of the train columns that contain at least one missing value.
df_train.columns[df_train.isnull().any().values]

## Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

#### Fill null fields with a large value

In [None]:
def cross_val_GB(df_train, features=None, target='radiant_win', n_est = [10, 20, 30], frac=1, random_state=None):
    """Cross-validate gradient boosting for several ensemble sizes.

    For every value in ``n_est`` a ``GradientBoostingClassifier`` with
    ``max_depth=3`` is scored by shuffled 5-fold cross-validation using ROC AUC.
    The wall-clock time spent on each value is printed.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Frame holding the feature columns and the ``target`` column.
    features : list of str, optional
        Columns used as model input; defaults to every column except ``target``.
    target : str
        Name of the label column.
    n_est : iterable of int
        Values of ``n_estimators`` to evaluate (only iterated, never mutated).
    frac : float
        When below 1, a random sample of that fraction of the rows is used.
    random_state : int, optional
        Seed forwarded to the row sampling, the fold shuffling and the
        classifier so that a run can be reproduced. ``None`` (the default)
        leaves everything unseeded, as before.

    Returns
    -------
    pandas.Series
        Mean ROC AUC over the folds, indexed by the ``n_estimators`` value.
    """
    if frac < 1:
        df_train = df_train.sample(frac=frac, random_state=random_state)
    if features is None:
        features = [f for f in df_train.columns if f != target]
    X_train = df_train[features].values
    y_train = df_train[target].values

    # One splitter for all candidates; with a seed every candidate sees the same folds.
    cv = KFold(n_splits=5, shuffle=True, random_state=random_state)

    cvs = {}
    for n in n_est:
        start_time = datetime.datetime.now()
        clf = GradientBoostingClassifier(n_estimators=n, max_depth=3, random_state=random_state)
        fold_auc = cross_val_score(clf, X_train, y_train, cv=cv, scoring='roc_auc')
        cvs[n] = fold_auc.mean()

        print('time for {0} n_estimators: {1}'.format(n, datetime.datetime.now() - start_time))
    return pd.Series(cvs)

In [None]:
def predict_GB(df_train, df_test, features, n_esti = 20, target='radiant_win', filename='predict'):
    """Fit gradient boosting on the train frame and write test probabilities to CSV.

    A ``GradientBoostingClassifier`` with ``n_esti`` estimators is fitted on
    ``df_train[features]`` against ``df_train[target]``. The second column of
    ``predict_proba`` for ``df_test[features]`` is written, next to the test
    ``match_id`` values, to ``GB_<filename>.csv`` in the working directory.
    Nothing is returned.
    """
    X_fit, y_fit = df_train[features].values, df_train[target].values
    X_new = df_test[features].values

    model = GradientBoostingClassifier(n_estimators=n_esti)
    model.fit(X_fit, y_fit)
    # Column 1 is the probability of the second entry of model.classes_
    # (presumably radiant_win == 1 -- confirm against the label encoding).
    win_proba = model.predict_proba(X_new)[:, 1]

    submission = df_test[['match_id']].assign(radiant_win=win_proba)
    submission.to_csv('GB_{}.csv'.format(filename), index=False)


In [None]:
# Columns excluded from the model input: the target and two per-match fields.
drop_features = ['radiant_win', 'match_id', 'start_time']
# Every remaining column of the combined frame, in its original order.
features_all = df_all.columns[~df_all.columns.isin(drop_features)].tolist()

In [None]:
# Rows with a known target form the train part, rows without it the test part.
# Every missing value is replaced with 99999 (in the test part this also fills
# the absent `radiant_win`).
df_train = df_all.loc[df_all['radiant_win'].notnull()].fillna(value=99999)
df_test = df_all.loc[df_all['radiant_win'].isnull()].fillna(value=99999)

In [None]:
%%time

features = features_all
scores = cross_val_GB(df_train, features, frac=0.5)
scores

In [None]:
# Parameter value with the highest mean score, shown together with that score.
best_param = scores.idxmax()
best_param, scores.loc[best_param]

In [None]:

# Fit on the full train part with the selected ensemble size and write GB_simpleGB.csv.
predict_GB(
    df_train,
    df_test,
    features,
    n_esti=best_param,
    filename='simpleGB',
)

## Logistic regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

In [None]:
def cross_val_LR(df_train, features=None, target='radiant_win', C = [0.25, 0.5, 1, 1.5, 2, 2.5, 3], frac=1, scale=False, random_state=None):
    """Cross-validate logistic regression for several regularisation strengths.

    For every value in ``C`` a ``LogisticRegression`` is scored by shuffled
    5-fold cross-validation using ROC AUC.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Frame holding the feature columns and the ``target`` column.
    features : list of str, optional
        Columns used as model input; defaults to every column except ``target``.
    target : str
        Name of the label column.
    C : iterable of float
        Values of the ``C`` parameter to evaluate (only iterated, never mutated).
    frac : float
        When below 1, a random sample of that fraction of the rows is used.
    scale : bool
        When true, features are standardised with ``StandardScaler``. The scaler
        is part of the cross-validated pipeline, so it is fitted on the training
        folds only and the held-out fold never influences the scaling.
    random_state : int, optional
        Seed forwarded to the row sampling, the fold shuffling and the
        classifier. ``None`` (the default) leaves everything unseeded.

    Returns
    -------
    pandas.Series
        Mean ROC AUC over the folds, indexed by the ``C`` value.
    """
    # Imported here so the block does not depend on a new file-level import.
    from sklearn.pipeline import make_pipeline

    if frac < 1:
        df_train = df_train.sample(frac=frac, random_state=random_state)
    if features is None:
        features = [f for f in df_train.columns if f != target]
    X_train = df_train[features].values
    y_train = df_train[target].values

    # One splitter for all candidates; with a seed every candidate sees the same folds.
    cv = KFold(n_splits=5, shuffle=True, random_state=random_state)

    cvs = {}
    for c in C:
        clf = LogisticRegression(C=c, random_state=random_state)
        if scale:
            # Scaling inside the pipeline avoids fitting the scaler on validation rows.
            clf = make_pipeline(StandardScaler(), clf)
        fold_auc = cross_val_score(clf, X_train, y_train, cv=cv, scoring='roc_auc')
        cvs[c] = fold_auc.mean()
    # The index holds C values and the values are ROC AUC, so label it accordingly.
    return pd.Series(cvs, name='[C] : roc_auc')
                   

In [None]:
def predict_LR(df_train, df_test, features, c=1, target='radiant_win', filename='predict'):
    """Fit scaled logistic regression on the train frame and write test probabilities to CSV.

    A ``StandardScaler`` is fitted on ``df_train[features]`` and applied to both
    the train and the test features. ``LogisticRegression(C=c)`` is then fitted
    on the scaled train data, and the second column of ``predict_proba`` for the
    scaled test data is written, next to the test ``match_id`` values, to
    ``LR_<filename>.csv`` in the working directory. Nothing is returned.
    """
    scaler = StandardScaler()
    # The scaler learns its statistics from the train rows only.
    X_fit = scaler.fit_transform(df_train[features].values)
    y_fit = df_train[target].values
    X_new = scaler.transform(df_test[features].values)

    model = LogisticRegression(C=c)
    model.fit(X_fit, y_fit)
    # Column 1 is the probability of the second entry of model.classes_
    # (presumably radiant_win == 1 -- confirm against the label encoding).
    win_proba = model.predict_proba(X_new)[:, 1]

    submission = df_test[['match_id']].assign(radiant_win=win_proba)
    submission.to_csv('LR_{}.csv'.format(filename), index=False)


In [None]:
# Columns excluded from the model input: the target and two per-match fields.
drop_features = ['radiant_win', 'match_id', 'start_time']
# Every remaining column of the combined frame, in its original order.
features_all = df_all.columns[~df_all.columns.isin(drop_features)].tolist()

#### simple classification

In [None]:
# Rows with a known target form the train part, rows without it the test part.
# Every missing value is replaced with 0 (in the test part this also fills the
# absent `radiant_win`).
df_train = df_all.loc[df_all['radiant_win'].notnull()].fillna(value=0)
df_test = df_all.loc[df_all['radiant_win'].isnull()].fillna(value=0)

In [None]:
%%time

features = features_all
scores = cross_val_LR(df_train, features, scale=True)
scores

In [None]:
# Parameter value with the highest mean score, shown together with that score.
best_param = scores.idxmax()
best_param, scores.loc[best_param]

In [None]:
# Fit on the full train part with the selected C and write LR_simple.csv.
predict_LR(
    df_train,
    df_test,
    features,
    c=best_param,
    filename='simple',
)

#### Drop categorical features

In [None]:
# Treated as categorical here: every `*_hero` column plus `lobby_type`.
categ_features = list(filter(lambda name: name.endswith('_hero'), features_all))
categ_features.append('lobby_type')
categ_features

In [None]:
%%time 

features = [f for f in features_all if f not in categ_features]
scores = cross_val_LR(df_train, features,C=[best_param], scale=True)
scores

In [None]:
# Fit without the categorical columns and write LR_no_categ_feats.csv.
predict_LR(
    df_train,
    df_test,
    features,
    c=best_param,
    filename='no_categ_feats',
)

#### using bag of heroes

In [None]:
# Sorted distinct values found in the `*_hero` columns of the train part.
heroes = np.unique(
    df_train[[col for col in df_all.columns if col.endswith('_hero')]].values.ravel()
)
len(heroes), heroes

In [None]:
def bag_of_heroes(df, heroes=None, target='radiant_win'):
    """Append signed hero-pick indicator columns to ``df`` and split it by target.

    One column is added per hero id from 1 to the largest id in ``heroes``;
    the column for hero id ``h`` is named ``str(h - 1)``. For each row the
    column is 1 when the hero appears in ``r1_hero``..``r5_hero``, -1 when it
    appears in ``d1_hero``..``d5_hero`` and 0 otherwise.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the ten ``r{p}_hero`` / ``d{p}_hero`` columns and the
        ``target`` column.
    heroes : array-like of int, optional
        Known hero ids; only the largest one is used (it fixes the number of
        added columns). Defaults to the distinct values found in every column
        of ``df`` whose name ends with ``_hero``.
    target : str
        Column whose missing values mark the rows of the second returned frame.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The extended rows where ``target`` is not null, then the extended rows
        where it is null. Both keep the index of ``df``.
    """
    if heroes is None:
        hero_cols = [c for c in df.columns if c.endswith('_hero')]
        heroes = np.unique(df[hero_cols].values.reshape(-1))
    # np.max/np.size accept arrays and plain lists alike; no heroes means no new columns.
    n_heroes = int(np.max(heroes)) if np.size(heroes) else 0

    X_pick = np.zeros((df.shape[0], n_heroes))
    rows = np.arange(df.shape[0])
    # Fill one pick slot at a time for all rows at once; within a row the write
    # order (r1, d1, r2, d2, ...) matches a row-by-row loop.
    for p in range(1, 6):
        radiant_idx = (df['r{}_hero'.format(p)].values - 1).astype(int)
        dire_idx = (df['d{}_hero'.format(p)].values - 1).astype(int)
        X_pick[rows, radiant_idx] = 1
        X_pick[rows, dire_idx] = -1

    cols = [str(h) for h in range(n_heroes)]

    # Reuse df's index so the new columns line up row-for-row even when df is
    # not indexed 0..n-1 (concat with axis=1 aligns on index labels).
    picks = pd.DataFrame(X_pick, columns=cols, index=df.index)
    df = pd.concat([df, picks], axis=1)
    df_train = df[df[target].notnull()]
    df_test = df[df[target].isnull()]
    return df_train, df_test

In [None]:
%%time

df_train, df_test = bag_of_heroes(df_all)
df_train = df_train.fillna(0)
df_test = df_test.fillna(0)

df_train.shape, df_test.shape

In [None]:
%%time

features = [f for f in features_all if f not in categ_features] + [str(h) for h in range(max(heroes))]
scores = cross_val_LR(df_train, features,C=[best_param], scale=True)
scores

In [None]:
# Fit with the bag-of-heroes features and write LR_bag_of_heroes.csv.
predict_LR(
    df_train,
    df_test,
    features,
    c=best_param,
    filename='bag_of_heroes',
)