In [1]:
import pandas
import numpy as np

RADIANT_WIN = 'radiant_win'
ADDITIONAL_RESULT_FIELDS = ['duration', 'tower_status_radiant', 'tower_status_dire', 'barracks_status_radiant', 'barracks_status_dire']
RESULT_FIELDS = [RADIANT_WIN] + ADDITIONAL_RESULT_FIELDS

#1) read table from csv, remove result fields
#====================================
features_filename = '/home/rk/submissions/final_work/features.csv'
index_col = 'match_id'
features = pandas.read_csv(features_filename, index_col=index_col)

def remove_resulting_columns(_df):
    global RADIANT_WIN
    return _df[list(set(_df.columns.values.tolist()) - set(RESULT_FIELDS))], _df[RADIANT_WIN]

X,Y = remove_resulting_columns(features)
#====================================

In [134]:
#====================================
#2) Get epmty values  

def get_features_with_misses(_df):
    return [_x for _x in _df.columns.values if len(_df[pandas.isnull(_df[_x])]) > 0]

len(get_features_with_misses(X))
# 12 columns with missing data
# first_blood_* <- nan if there was no 'first blood event' during first 5 minutes
# radiant_bottle_time <- wasn't bought
# radiant_courier_time <- wasn't bought
# radiant_flying_courier_time <- wasn't bought
# the same is for dire
#====================================
print ", ".join(get_features_with_misses(remove_resulting_columns(features)[0]))


dire_bottle_time, radiant_first_ward_time, radiant_flying_courier_time, dire_courier_time, first_blood_team, dire_flying_courier_time, first_blood_time, dire_first_ward_time, first_blood_player1, first_blood_player2, radiant_bottle_time, radiant_courier_time


In [3]:
#====================================
#3) Fill empty values
X = X.fillna(0)
#====================================

#====================================
#4) Target variable column is 'radiant_win'
#====================================

array([ 0.66820321,  0.66402863,  0.6670671 ,  0.66105802,  0.66454867])

In [4]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.cross_validation import KFold
from sklearn.cross_validation import cross_val_score
import time
import datetime

TREES_COUNT = 30
TIMINGS = []

cv = KFold(len(X), n_folds=5, shuffle=True)
#calc cross val score for several trees count

classifier_scores = {}
for trees_num in [20, 30, 40, 50]:
    print 'start to calc for %s trees' % trees_num
    classifier = GradientBoostingClassifier(n_estimators=trees_num)
    start_time = datetime.datetime.now()
    
    _scores = cross_val_score(estimator=classifier, X=X, y=Y, cv=cv, scoring='roc_auc', n_jobs=-1)
    
    final_time = datetime.datetime.now() - start_time
    classifier_scores[trees_num] = {
        'scores': _scores,
        'time': final_time
    }


start to calc for 20 trees
start to calc for 30 trees
start to calc for 40 trees
start to calc for 50 trees


In [10]:
from copy import deepcopy
#print results
_scores = deepcopy(classifier_scores)
for tree_num in _scores:
    _scores[tree_num]['scores'] = np.mean(_scores[tree_num]['scores'])
    print "trees: '%s', time: '%s', roc_auc: '%s'" % (tree_num, str(_scores[tree_num]['time']), _scores[tree_num]['scores'])


trees: '40', time: '0:02:08.995060', roc_auc: '0.69402516975'
trees: '50', time: '0:02:40.961570', roc_auc: '0.697246250333'
trees: '20', time: '0:01:08.258663', roc_auc: '0.681777441497'
trees: '30', time: '0:01:38.068877', roc_auc: '0.689439163418'


In [155]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
x_scaled = scaler.fit_transform(X)
regression = LogisticRegression()

C_CONSTANTS = np.power(10.0, range(-4,4))

def search_c(func, c_list):
    def calc_with_time(func, c):
        _start_time = datetime.datetime.now()
        _res = func(c)
        _final_time = str(datetime.datetime.now() - _start_time)
        print "ready func '%s' with c='%s', res='%s', time='%s'" % (func.__name__, c, _res, _final_time)
        return _res, _final_time, c
        
    return max(map(lambda x: calc_with_time(func, x), c_list), key=lambda x: x[0])

#1) logistic regression on all features        
def simple_regression_scaled(c):
    global x_scaled, Y, cv
    regression = LogisticRegression(C=c)
    _res = cross_val_score(estimator=regression, X=x_scaled, y=Y, cv=cv, scoring='roc_auc', n_jobs=-1)
    return np.mean(_res)
    
simple_regression_scaled_res = search_c(simple_regression_scaled, C_CONSTANTS)
simple_regression_scaled_res

ready func 'simple_regression_scaled' with c='0.0001', res='0.711217320974', time='0:00:02.903926'
ready func 'simple_regression_scaled' with c='0.001', res='0.716167292924', time='0:00:05.213528'
ready func 'simple_regression_scaled' with c='0.01', res='0.716344879872', time='0:00:06.721026'
ready func 'simple_regression_scaled' with c='0.1', res='0.71632046854', time='0:00:07.015689'
ready func 'simple_regression_scaled' with c='1.0', res='0.716317570129', time='0:00:07.126337'
ready func 'simple_regression_scaled' with c='10.0', res='0.716317186619', time='0:00:07.024566'
ready func 'simple_regression_scaled' with c='100.0', res='0.716317112405', time='0:00:07.034862'
ready func 'simple_regression_scaled' with c='1000.0', res='0.71631709547', time='0:00:07.225902'


(0.71634487987202744, '0:00:06.721026', 0.01)

In [161]:
_heroes_features = ['%s%s_hero' % (_k, _i) for _i in range(1, 6) for _k in ('r', 'd')]

CATEGORIZED_FEATURES = ['lobby_type'] + _heroes_features

x_wout_categorized = X[list(set(X.columns.values.tolist()) - set(CATEGORIZED_FEATURES))]
x_wout_categorized_scaled = StandardScaler().fit_transform(x_wout_categorized)

#2) logistic regression on all features wout categorized    
def regression_wout_categorized_features(c):
    global x_wout_categorized_scaled, Y, cv
    regression_wout_categorized = LogisticRegression(C=c)
    return np.mean(cross_val_score(estimator=regression_wout_categorized, X=x_wout_categorized_scaled, y=Y, cv=cv,
                           scoring='roc_auc', n_jobs=-1))

regression_wout_categorized_features_res = search_c(regression_wout_categorized_features, C_CONSTANTS)
regression_wout_categorized_features_res

ready func 'regression_wout_categorized_features' with c='0.0001', res='0.711191345793', time='0:00:02.705892'
ready func 'regression_wout_categorized_features' with c='0.001', res='0.716179492122', time='0:00:04.926753'
ready func 'regression_wout_categorized_features' with c='0.01', res='0.716364860708', time='0:00:06.530423'
ready func 'regression_wout_categorized_features' with c='0.1', res='0.71634202932', time='0:00:06.624340'
ready func 'regression_wout_categorized_features' with c='1.0', res='0.716338118433', time='0:00:06.723832'
ready func 'regression_wout_categorized_features' with c='10.0', res='0.71633758659', time='0:00:06.729390'
ready func 'regression_wout_categorized_features' with c='100.0', res='0.716337503947', time='0:00:06.634843'
ready func 'regression_wout_categorized_features' with c='1000.0', res='0.716337537839', time='0:00:06.830869'


(0.71636486070808325, '0:00:06.530423', 0.01)

In [109]:
heroes_dict_filename = '/home/rk/submissions/final_work/data/dictionaries/heroes.csv'
heroes = pandas.read_csv(heroes_dict_filename, index_col='id')
total_heroes_count = heroes.count()[0]
print "total heroes count is:'%s'" % total_heroes_count

def calc_used_heroes(_df):
    _res = set()
    for _col_name in _heroes_features:
        _res = _res.union(set(_df[_col_name].unique().tolist()))
    return _res

#3) heroes count
heroes_count = len(calc_used_heroes(X))
#4) regression with word bag
def calc_heroes_words_bag(_df, heroes_count):
    X_pick = np.zeros((_df.shape[0], heroes_count))
    for i, match_id in enumerate(_df.index):
        for p in xrange(5):
            X_pick[i, _df.ix[match_id, 'r%d_hero' % (p+1)]-1] = 1
            X_pick[i, _df.ix[match_id, 'd%d_hero' % (p+1)]-1] = -1
    return X_pick

word_bag = calc_heroes_words_bag(X, total_heroes_count)

total heroes count is:'112'


In [164]:
x_scaled_with_word_bag = np.concatenate((x_wout_categorized_scaled, word_bag), axis=1)
    
def regression_with_word_bag(c):
    global x_scaled_with_word_bag, cv, Y
    return np.mean(cross_val_score(
        estimator=LogisticRegression(C=c),
        X=x_scaled_with_word_bag, y=Y, cv=cv, scoring='roc_auc', n_jobs=-1))

regression_with_word_bag_res = search_c(regression_with_word_bag, C_CONSTANTS)
regression_with_word_bag_res

ready func 'regression_with_word_bag' with c='0.0001', res='0.724937232498', time='0:00:03.302896'
ready func 'regression_with_word_bag' with c='0.001', res='0.746165535297', time='0:00:06.225842'
ready func 'regression_with_word_bag' with c='0.01', res='0.751573468685', time='0:00:09.929529'
ready func 'regression_with_word_bag' with c='0.1', res='0.751767993492', time='0:00:13.434582'
ready func 'regression_with_word_bag' with c='1.0', res='0.751748739393', time='0:00:15.947667'
ready func 'regression_with_word_bag' with c='10.0', res='0.751745749678', time='0:00:14.941429'
ready func 'regression_with_word_bag' with c='100.0', res='0.751745529351', time='0:00:14.056328'
ready func 'regression_with_word_bag' with c='1000.0', res='0.751745450835', time='0:00:14.757150'


(0.75176799349225965, '0:00:13.434582', 0.10000000000000001)

In [177]:
#5) get prediction on test data
test_filename = '/home/rk/submissions/final_work/features_test.csv'
test_data = pandas.read_csv(test_filename, index_col=index_col)
test_data = test_data.fillna(0)

test_x_wout_categorized = test_data[list(set(test_data.columns.values.tolist()) - set(CATEGORIZED_FEATURES))]
test_data_wout_categorized_scaled = StandardScaler().fit_transform(test_x_wout_categorized)

word_bag_test = calc_heroes_words_bag(test_data, total_heroes_count)
x_scaled_with_word_bag_test = np.concatenate((test_data_wout_categorized_scaled, word_bag_test), axis=1)

best_classifier = LogisticRegression(C=0.1)
best_classifier.fit(X=x_scaled_with_word_bag, y=Y)
prediction = best_classifier.predict_proba(x_scaled_with_word_bag_test)
prediction

array([[ 0.1753393 ,  0.8246607 ],
       [ 0.24283038,  0.75716962],
       [ 0.81221233,  0.18778767],
       ..., 
       [ 0.76600267,  0.23399733],
       [ 0.37529702,  0.62470298],
       [ 0.57289783,  0.42710217]])

In [208]:
print prediction[:,0:1].max()
print prediction[:,0:1].min()

0.991419367389
0.00354077701738
