In [0]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
import numpy as np
import pandas as pd

In [0]:
import os

PATH_TO_DATA = '/content/gdrive/My Drive/mlcourse/dota/2019-11-12/'

for dirname, _, filenames in os.walk(PATH_TO_DATA):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/content/gdrive/My Drive/mlcourse/dota/2019-11-12/df_test_features_ext.csv
/content/gdrive/My Drive/mlcourse/dota/2019-11-12/df_train_features_ext.csv
/content/gdrive/My Drive/mlcourse/dota/2019-11-12/df_train_targets.csv
/content/gdrive/My Drive/mlcourse/dota/2019-11-12/submission_2019-11-12_18-31-47.csv
/content/gdrive/My Drive/mlcourse/dota/2019-11-12/dota_SKL_pipe_v4.ipynb
/content/gdrive/My Drive/mlcourse/dota/2019-11-12/skl-json-v6.ipynb
/content/gdrive/My Drive/mlcourse/dota/2019-11-12/dota_SKL_pipe_v5.ipynb
/content/gdrive/My Drive/mlcourse/dota/2019-11-12/submission_2019-11-13_06-31-05.csv


**1. Download data**

In [0]:
df_train_features = pd.read_csv(PATH_TO_DATA + 'df_train_features_ext.csv', index_col = 'match_id_hash')

In [0]:
(len(df_train_features.columns) - 8)/10

85.0

In [0]:
df_train_targets = pd.read_csv(PATH_TO_DATA + 'df_train_targets.csv', index_col='match_id_hash')

In [0]:
#features:
PLAYER_FIELDS = [
#baseline
    'hero_id',
    
    'kills',
    'deaths',
    'assists',
    'denies',
    
    'gold',
    'lh',
    'xp',
    'health',
    'max_health',
    'max_mana',
    'level',

    'x',
    'y',
    
    'stuns',
    'creeps_stacked',
    'camps_stacked',
    'rune_pickups',
    'firstblood_claimed',
    'teamfight_participation',
    'towers_killed',
    'roshans_killed',
    'obs_placed',
    'sen_placed',
#add new
    'observers_placed',
    'nearby_creep_death_count',
#levels
    'ability_upgrades'

#dict sum
    'purchase',
    'killed',
    'item_uses',
    'ability_uses',
    'hero_hits',
    'damage',
    'damage_taken',
    'damage_inflictor',
    'killed_by',
    'multi_kills',
    'healing',
    'damage_inflictor_received',
#time
    'gold_t',
    'lh_t',
    'dn_t',
    'xp_t',    
]

MATCH_FIELDS = [
    'radiant_tower_kills',
    'dire_tower_kills',
    'diff_tower_kills',
]

In [0]:
def make_col(features):
  columns = []
  for c in features:
    columns.append('r_total_' + c)
    columns.append('d_total_' + c)
    columns.append('total_'   + c + '_ratio')

    columns.append('r_std_' + c)
    columns.append('d_std_' + c)
    columns.append('std_'   + c + '_ratio')

    columns.append('r_mean_' + c)
    columns.append('d_mean_' + c)
    columns.append('mean_'   + c + '_ratio')

    columns.append('r_min_' + c)
    columns.append('d_min_' + c)
    columns.append('min_'   + c + '_ratio')

    columns.append('r_max_' + c)
    columns.append('d_max_' + c)
    columns.append('max_'   + c + '_ratio')
  return columns

In [0]:
features = [
            'kills',
            ]
new_col = make_col(features)

In [0]:
new_col

['r_total_kills',
 'd_total_kills',
 'total_kills_ratio',
 'r_std_kills',
 'd_std_kills',
 'std_kills_ratio',
 'r_mean_kills',
 'd_mean_kills',
 'mean_kills_ratio',
 'r_min_kills',
 'd_min_kills',
 'min_kills_ratio',
 'r_max_kills',
 'd_max_kills',
 'max_kills_ratio']

In [0]:
len(df_train_features[df_train_features['d_max_kills'] == 0].index)

3299

In [0]:
len((df_train_features['d_max_kills'] <= 0).index)

39675

In [0]:
df_train = df_train_features[new_col]

**2. Create training data**

In [0]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler(copy=True, with_mean=True, with_std=True)

In [0]:
#X = df_train_features.values
X = scaler.fit_transform(df_train_features)
y = df_train_targets['radiant_win'].values

**3. Create model**

In [0]:
# from sklearn.ensemble import RandomForestClassifier
# model = RandomForestClassifier(n_estimators=100,
#                                n_jobs=-1, random_state=17)

from sklearn.linear_model import LogisticRegression
model = LogisticRegression(random_state=17)

**4. Cross-validation**

In [0]:
from sklearn.model_selection import ShuffleSplit, KFold
cv = ShuffleSplit(n_splits=5, test_size=0.3, random_state=17)

In [0]:
from sklearn.model_selection import cross_val_score

In [0]:
cv_score = cross_val_score(model, X, y, cv=cv, scoring='roc_auc', n_jobs=-1)
print(round(cv_score.mean(), 5))

0.81835


In [0]:
#RF:              0.78552 -> 0.79204;
#RF:              0.78850 -> 0.79255; delta ~0.0030, improve ~ 0.0005
#LR(Scaled data): 0.81273 -> 0.82582; delta ~0.0240, improve ~ 0.0333
#LR             : 0.81316 -> 0.82662; delta ~0.0004, improve ~ 0.0008
#LR             : 0.81835 -> 0.83197; delta ~0.0052, improve ~ 0.0053

**6. GridSearch**

**7. Make submission**

In [0]:
model.fit(X, y)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=17, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [0]:
df_test_features = pd.read_csv(PATH_TO_DATA + 'df_test_features_ext.csv', index_col='match_id_hash')

In [0]:
df_test_features_scaled = scaler.transform(df_test_features)

In [0]:
#X_test = df_test_features.values
X_test = df_test_features_scaled
y_test_pred = model.predict_proba(X_test)[:, 1]

In [0]:
df_submission = pd.DataFrame({'radiant_win_prob': y_test_pred}, index=df_test_features.index)

**8. Export submission**

In [0]:
import datetime
submission_filename = 'submission_{}.csv'.format(
    datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))
df_submission.to_csv(PATH_TO_DATA + submission_filename)
print('Submission saved to {}'.format(submission_filename))

Submission saved to submission_2019-11-13_06-31-05.csv
