In [4]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Lasso, Ridge

%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [5]:
def normalize_data(X):
    """Standardize the columns of ``X`` and hand them back as a DataFrame.

    A fresh ``StandardScaler`` is fitted on ``X`` and applied to it; the
    resulting values are wrapped in a new DataFrame that reuses ``X``'s
    index and column labels.
    """
    scaled_values = StandardScaler().fit_transform(X)
    return pd.DataFrame(scaled_values, index=X.index, columns=X.columns)

def logEstimation(X, y):
    """Grid-search the ``C`` parameter of a logistic regression on ``(X, y)``.

    The candidates for ``C`` are the powers of ten from 1e-5 up to 1e0.
    Each candidate is scored by ROC AUC over a shuffled 5-fold split with a
    fixed seed (241).

    Returns the fitted ``GridSearchCV`` object.
    """
    seed = 241
    c_candidates = np.power(10.0, np.arange(-5, 1))
    # KFold is called with the sample count as first argument, i.e. the
    # signature of the ``sklearn.cross_validation`` module imported by this file.
    folds = KFold(y.size, n_folds=5, shuffle=True, random_state=seed)
    search = GridSearchCV(
        LogisticRegression(random_state=seed),
        {'C': c_candidates},
        scoring='roc_auc',
        cv=folds,
    )
    search.fit(X, y)
    return search

In [6]:
# Load the per-player item table and replace every missing value with 0.
items = pd.read_csv('data/items.csv').fillna(0)
items.head()

Unnamed: 0,mid,player,item_0,item_1,item_2,item_3,item_4,item_5,item_6,item_7,...,item_111,item_112,item_113,item_114,item_115,item_116,item_117,item_118,item_119,item_120
0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# Sum the item columns of the rows with player < 5 (the Radiant side, going by
# the variable names) into one row per match id ('mid').
# `axis=1` is passed by keyword: the positional form is rejected by pandas >= 2.0.
radiant_items = items.drop('player', axis=1).loc[items.player < 5].groupby('mid').sum()

# Prefix every item column with 'rad_' and cast the sums to int32 in one
# vectorized step (np.int32 instead of the bare `int32` that only exists via
# `%pylab inline`), then turn the 'mid' index back into a regular column.
norm_radiant_items = radiant_items.add_prefix('rad_').astype(np.int32).reset_index()
norm_radiant_items.head()

Unnamed: 0,mid,rad_item_0,rad_item_1,rad_item_2,rad_item_3,rad_item_4,rad_item_5,rad_item_6,rad_item_7,rad_item_8,...,rad_item_111,rad_item_112,rad_item_113,rad_item_114,rad_item_115,rad_item_116,rad_item_117,rad_item_118,rad_item_119,rad_item_120
0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,1,0,2,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,0,2,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,1,0,0,0,0
4,4,0,5,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


### Scaling these counts might have been worthwhile, but it is not done here

In [8]:
# Sum the item columns of the rows with player > 4 (the Dire side, going by
# the variable names) into one row per match id ('mid').
# `axis=1` is passed by keyword: the positional form is rejected by pandas >= 2.0.
dire_items = items.drop('player', axis=1).loc[items.player > 4].groupby('mid').sum()

# Prefix every item column with 'dire_' and cast the sums to int32 in one
# vectorized step (np.int32 instead of the bare `int32` that only exists via
# `%pylab inline`), then turn the 'mid' index back into a regular column.
norm_dire_items = dire_items.add_prefix('dire_').astype(np.int32).reset_index()
norm_dire_items.head()

Unnamed: 0,mid,dire_item_0,dire_item_1,dire_item_2,dire_item_3,dire_item_4,dire_item_5,dire_item_6,dire_item_7,dire_item_8,...,dire_item_111,dire_item_112,dire_item_113,dire_item_114,dire_item_115,dire_item_116,dire_item_117,dire_item_118,dire_item_119,dire_item_120
0,0,0,2,0,0,0,0,0,0,0,...,0,0,0,0,2,0,0,0,0,0
1,1,0,2,0,0,0,0,0,0,0,...,0,1,0,0,2,0,0,0,0,0
2,2,0,0,0,0,0,0,0,0,0,...,0,1,1,0,0,0,0,0,0,0
3,3,0,2,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,4,0,4,0,0,0,0,0,0,0,...,0,0,1,0,2,0,0,0,0,0


In [19]:
# Put both per-match tables side by side, keyed on 'mid'. The left join keeps
# every row of the 'rad_' table.
new_items = norm_radiant_items.merge(norm_dire_items, how='left', on='mid')
new_items.head()

Unnamed: 0,mid,rad_item_0,rad_item_1,rad_item_2,rad_item_3,rad_item_4,rad_item_5,rad_item_6,rad_item_7,rad_item_8,...,dire_item_111,dire_item_112,dire_item_113,dire_item_114,dire_item_115,dire_item_116,dire_item_117,dire_item_118,dire_item_119,dire_item_120
0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,2,0,0,0,0,0
1,1,0,2,0,0,0,0,0,0,0,...,0,1,0,0,2,0,0,0,0,0
2,2,0,2,0,0,0,0,0,0,0,...,0,1,1,0,0,0,0,0,0,0
3,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,4,0,5,0,0,0,0,0,0,0,...,0,0,1,0,2,0,0,0,0,0


In [20]:
# Persist the combined per-match item table. `index=False` (the documented
# boolean form, rather than None) keeps the row index out of the CSV.
new_items.to_csv('processing_tables/only_items.csv', index=False)

# Item differences (radiant minus dire)

In [4]:
# Reload the per-player item table (missing values become 0) and rebuild the
# per-match sums for each side: player < 5 -> radiant, player > 4 -> dire.
# `axis=1` is passed by keyword: the positional form is rejected by pandas >= 2.0.
items = pd.read_csv('data/items.csv').fillna(0)
radiant_items = items.drop('player', axis=1).loc[items.player < 5].groupby('mid').sum()
dire_items = items.drop('player', axis=1).loc[items.player > 4].groupby('mid').sum()

# Per-match difference (radiant minus dire) for every item column, computed in
# one vectorized subtraction instead of inserting columns one at a time.
# NOTE(review): assumes every match id occurs on both sides; a match present on
# only one side produces NaN in the subtraction, before the int32 cast - confirm.
items_dif = (
    (radiant_items - dire_items)
    .astype(np.int32)                 # np.int32: no reliance on `%pylab` globals
    .reindex(dire_items.index)        # keep exactly the dire-side match ids
    .add_suffix('_dif')
    .reset_index()                    # 'mid' becomes a regular column again
)
# `index=False` (documented boolean form) keeps the row index out of the CSV.
items_dif.to_csv('processing_tables/items_dif.csv', index=False)
items_dif.head()

Unnamed: 0,mid,item_0_dif,item_1_dif,item_2_dif,item_3_dif,item_4_dif,item_5_dif,item_6_dif,item_7_dif,item_8_dif,...,item_111_dif,item_112_dif,item_113_dif,item_114_dif,item_115_dif,item_116_dif,item_117_dif,item_118_dif,item_119_dif,item_120_dif
0,0,0,-2,0,1,0,0,0,0,0,...,0,0,0,0,-1,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,-1,0,0,-2,0,0,0,0,0
2,2,0,2,0,0,0,0,0,0,0,...,0,0,-1,0,0,0,0,0,0,0
3,3,0,-2,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,4,0,1,0,0,0,0,0,0,0,...,0,1,-1,0,-2,0,0,0,0,0


# Merge and save

In [33]:
# Load the previously prepared train / test feature tables.
train, test = (
    pd.read_csv('processing_tables/train_gold_heroes.csv'),
    pd.read_csv('processing_tables/test_gold_heroes.csv'),
)

In [34]:
# Attach the per-match item columns to both feature tables via a left join on
# 'mid'. NOTE: this rebinds `train` / `test`, so running the cell a second time
# without reloading them merges the item columns in again.
train = train.merge(new_items, how='left', on='mid')
test = test.merge(new_items, how='left', on='mid')

In [35]:
# Write the item table and the enriched train / test tables to disk.
# `index=False` (the documented boolean form, rather than None) keeps the row
# index out of the CSV files.
new_items.to_csv('processing_tables/processed_items.csv', index=False)
train.to_csv('processing_tables/train_gold_heroes_items.csv', index=False)
test.to_csv('processing_tables/test_gold_heroes_items.csv', index=False)