In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import KFold

from sklearn.cross_validation import cross_val_score

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Lasso, Ridge

%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
def normalize_data(X):
    return pd.DataFrame(
        StandardScaler().fit_transform(X),
        index = X.index, 
        columns=X.columns
    )

def logEstimation(X, y):
    grid = {'C': np.power(10.0, np.arange(-5, 1))}
    kf=KFold(y.size, n_folds=5, shuffle=True, random_state=241)
    clf=LogisticRegression(random_state=241)
    gs = GridSearchCV(clf, grid, scoring='roc_auc', cv=kf)
    gs.fit(X, y)
    return gs

In [3]:
def my_clf_cross_val(X_train, y_train): 
    clf=LogisticRegression(random_state=241, C=0.1)
    print 'logreg: ', np.mean(cross_val_score(clf, X_train, y_train, cv=5, scoring='roc_auc'))
    clf=Ridge(alpha=0.0001)
    print 'Ridge: ', np.mean(cross_val_score(clf, X_train, y_train, cv=5, scoring='roc_auc'))
    clf=Lasso(alpha=0.0001, max_iter=6000)
    print 'Lasso: ', np.mean(cross_val_score(clf, X_train, y_train, cv=5, scoring='roc_auc'))

In [4]:
def easy_test():
    clf=Ridge(alpha=0.0001)
    print 'Ridge: ', np.mean(cross_val_score(clf, X_train, y_train, cv=5, scoring='roc_auc'))
    clf=Lasso(alpha=0.0001, max_iter=6000)
    print 'Lasso: ', np.mean(cross_val_score(clf, X_train, y_train, cv=5, scoring='roc_auc'))

In [6]:
def full_test():
    clf = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=228)
    print 'RF: ', np.mean(cross_val_score(clf, X_train, y_train, cv=5, scoring='roc_auc'))
    gs = logEstimation(X_train, y_train)
#     print gs.grid_scores_
    print gs.best_params_
    print('best score of log grid search: {}'.format(gs.best_score_))
    clf=LogisticRegression(random_state=241, C=gs.best_params_['C'])
#     clf=LogisticRegression(random_state=241, C=0.1)
    print 'logreg: ', np.mean(cross_val_score(clf, X_train, y_train, cv=5, scoring='roc_auc'))
    clf=Ridge(alpha=0.0001)
    print 'Ridge: ', np.mean(cross_val_score(clf, X_train, y_train, cv=5, scoring='roc_auc'))
    clf=Lasso(alpha=0.0001, max_iter=6000)
    print 'Lasso: ', np.mean(cross_val_score(clf, X_train, y_train, cv=5, scoring='roc_auc'))

In [11]:
def total_merge(gold, heroes, items, xp, creeps, events):
    train_matches = pd.read_csv('data/train.csv')
    test_matches = pd.read_csv('data/test.csv')
    
    train = pd.merge(train_matches, gold, on='mid', how='left')
    test = pd.merge(test_matches, gold, on='mid', how='left')
    train = pd.merge(train, heroes, on='mid', how='left')
    test = pd.merge(test, heroes, on='mid', how='left')
    train = pd.merge(train, items, on='mid', how='left')
    test = pd.merge(test, items, on='mid', how='left')
    train = pd.merge(train, xp, on='mid', how='left')
    test = pd.merge(test, xp, on='mid', how='left')
    train = pd.merge(train, creeps, on='mid', how='left')
    test = pd.merge(test, creeps, on='mid', how='left')
    train = pd.merge(train, events, on='mid', how='left')
    test = pd.merge(test, events, on='mid', how='left')

    X_train = train.drop(['radiant_won'], 1)
    y_train = train.radiant_won

    return X_train, y_train, test

In [8]:
gold_tables_path = [
    'norm_gold',
    'goldScoreWithoutCarry',
    'goldScoreWithCarry',
    'normByRowsRankGold',
    'normByColsRankGold',
    'goldRankDif'
]

gold_tables_dict = {path: pd.read_csv('processing_tables/' + path + '.csv') for path in gold_tables_path}

In [14]:
norm_gold = pd.read_csv('processing_tables/norm_gold.csv')
gold_score = pd.read_csv('processing_tables/goldScoreWithoutCarry.csv')
gold_score_with_carry = pd.read_csv('processing_tables/goldScoreWithCarry.csv')
norm_by_rows_rank_gold = pd.read_csv('processing_tables/normByRowsRankGold.csv')
norm_by_cols_rank = pd.read_csv('processing_tables/normByColsRankGold.csv')
dif_rank_gold = pd.read_csv('processing_tables/goldRankDif.csv')

gold_tables_dict['scoreNormGold'] = pd.merge(
    norm_gold, 
    gold_score.drop(['radiant_norm_gold', 'dire_norm_gold'], 1),
    on='mid'
)

gold_tables_dict['scoreNormByRowsRank'] = pd.merge(norm_by_rows_rank_gold, gold_score, on='mid', how='left')
gold_tables_dict['scoreNormByColsRank'] = pd.merge(norm_by_cols_rank, gold_score, on='mid', how='left')
gold_tables_dict['scoreDifRankGold'] = pd.merge(dif_rank_gold, gold_score, on='mid', how='left')

In [21]:
only_xp = pd.read_csv('processing_tables/only_exp.csv')
xp_score = pd.read_csv('processing_tables/xp_score.csv')
rank_by_rows_xp = pd.read_csv('processing_tables/rankByRowsXp.csv')
rank_by_rows_xp_score = pd.merge(rank_by_rows_xp, xp_score, on='mid', how='left')

xp_dif = pd.read_csv('processing_tables/xpDif.csv')
norm_score_xp = pd.merge(only_xp, xp_score, on='mid', how='left')
xp_score_dif = pd.merge(xp_score, xp_dif, on='mid', how='left')

heroes = pd.read_csv('processing_tables/only_heroes.csv')
items = pd.read_csv('processing_tables/only_items.csv')

xp_dict = {
    'only_xp': only_xp,
    'norm xp + score': norm_score_xp,
    'xp_score': xp_score,
    'xp_dif': xp_dif,
    'xp_score_dif': xp_score_dif,
    'rank_by_rows_xp': rank_by_rows_xp,
    'rank_by_rows_xp_score': rank_by_rows_xp_score,
}

creeps = pd.read_csv('processing_tables/creeps.csv')
events = pd.read_csv('processing_tables/dummy_events.csv')

for gold_table_name, gold in gold_tables_dict.items():
    for xp_table_name, xp in xp_dict.items():
        print gold_table_name, xp_table_name
        X_train, y_train, test = total_merge(gold, heroes, items, xp, creeps, events)
        X_train.index = X_train.mid
        X_train.drop('mid', 1,inplace=True)
        my_clf_cross_val(X_train, y_train)

goldRankDif norm xp + score
logreg:  0.766173901595
Ridge:  0.764441899903
Lasso:  0.765962116574
goldRankDif rank_by_rows_xp
logreg:  0.766163004419
Ridge:  0.764194817133
Lasso:  0.765875200761
goldRankDif xp_score
logreg:  0.765479990705
Ridge:  0.763445984357
Lasso:  0.765231354801
goldRankDif only_xp
logreg:  0.766214545164
Ridge:  0.764247713985
Lasso:  0.765991038421
goldRankDif rank_by_rows_xp_score
logreg:  0.766100869082
Ridge:  0.764478500457
Lasso:  0.765867695325
goldRankDif xp_dif
logreg:  0.766383583565
Ridge:  0.764408464647
Lasso:  0.766107133354
goldRankDif xp_score_dif
logreg:  0.766296141942
Ridge:  0.764691677351
Lasso:  0.766084907833
scoreNormByColsRank norm xp + score
logreg:  0.76604067885
Ridge:  0.764648292738
Lasso:  0.766073849827
scoreNormByColsRank rank_by_rows_xp
logreg:  0.766054305536
Ridge:  0.764625264931
Lasso:  0.766046681085
scoreNormByColsRank xp_score
logreg:  0.765636655361
Ridge:  0.764314280461
Lasso:  0.765798847252
scoreNormByColsRank only_

MemoryError: 

In [107]:
# gold = pd.read_csv('processing_tables/')
heroes = pd.read_csv('processing_tables/only_heroes.csv')
items = pd.read_csv('processing_tables/only_items.csv')
xp = pd.read_csv('processing_tables/only_exp.csv')
xp_score = pd.read_csv('processing_tables/xp_score.csv')
xp = pd.merge(xp, xp_score, on='mid', how='left')
creeps = pd.read_csv('processing_tables/creeps.csv')
events = pd.read_csv('processing_tables/dummy_events.csv')

for gold_table_name, gold in gold_tables_dict.items():
    print gold_table_name, 
    X_train, y_train, test = total_merge(gold, heroes, items, xp, creeps, events)
    my_clf_cross_val(X_train, y_train)

goldRankDif
logreg:  0.765071725275
Ridge:  0.76444053927
Lasso:  0.765956935043
scoreNormByColsRank
logreg:  0.764455259581
Ridge:  0.764647095789
Lasso:  0.76607724744
goldScoreWithoutCarry
logreg:  0.764024424627
Ridge:  0.764808975599
Lasso:  0.766211588079
scoreNormGold
logreg:  0.763913143579
Ridge:  0.764398556237
Lasso:  0.765984679271
norm_gold
logreg:  0.764802685821
Ridge:  0.763851938059
Lasso:  0.765405592767
scoreNormByRowsRank
logreg:  0.76431752769
Ridge:  0.764663530475
Lasso:  0.76614739602
scoreDifRankGold
logreg:  0.764797522394
Ridge:  0.764737295219
Lasso:  0.766177217117
normByColsRankGold
logreg:  0.764798307096
Ridge:  0.764252874796
Lasso:  0.765758106068
goldScoreWithCarry
logreg:  0.764024424627
Ridge:  0.764808975599
Lasso:  0.766211588079
normByRowsRankGold
logreg:  0.764609923882
Ridge:  0.764537152695
Lasso:  0.766012323891


# GOLD STANDART

In [106]:
gold = pd.read_csv('processing_tables/goldScoreWithCarry.csv')
heroes = pd.read_csv('processing_tables/only_heroes.csv')
items = pd.read_csv('processing_tables/only_items.csv')
xp = pd.read_csv('processing_tables/only_exp.csv')
xp_score = pd.read_csv('processing_tables/xp_score.csv')
xp = pd.merge(xp, xp_score, on='mid', how='left')
creeps = pd.read_csv('processing_tables/creeps.csv')
events = pd.read_csv('processing_tables/dummy_events.csv')

X_train, y_train, test = total_merge(gold, heroes, items, xp, creeps, events)
my_clf_cross_val(X_train, y_train)

logreg:  0.764024424627
Ridge:  0.764808975599
Lasso:  0.766211588079


In [99]:
def logEstimation(X, y):
    grid = {'C': [0.0001, 0.001, 0.1, 0.2, 0.4, 0.6, 0.8, 1, 1.2]}
    kf=KFold(y.size, n_folds=5, shuffle=True, random_state=241)
    clf=LogisticRegression(random_state=241)
    gs = GridSearchCV(clf, grid, scoring='roc_auc', cv=kf)
    gs.fit(X, y)
    return gs

In [100]:
gs = logEstimation(X_train, y_train)
print gs.grid_scores_
print gs.best_params_
print('best score of log grid search: {}'.format(gs.best_score_))
clf=LogisticRegression(random_state=241, C=gs.best_params_['C'])
print 'logreg: ', np.mean(cross_val_score(clf, X_train, y_train, cv=5, scoring='roc_auc'))

[mean: 0.72353, std: 0.00496, params: {'C': 0.0001}, mean: 0.74613, std: 0.00703, params: {'C': 0.001}, mean: 0.76480, std: 0.01168, params: {'C': 0.1}, mean: 0.76484, std: 0.01188, params: {'C': 0.2}, mean: 0.76467, std: 0.01194, params: {'C': 0.4}, mean: 0.76462, std: 0.01195, params: {'C': 0.6}, mean: 0.76474, std: 0.01192, params: {'C': 0.8}, mean: 0.76463, std: 0.01200, params: {'C': 1}, mean: 0.76473, std: 0.01195, params: {'C': 1.2}]
{'C': 0.2}
best score of log grid search: 0.764842857076
