In [2]:
XBOOST_LOCATION = '/Users/siim/Lib/xgboost/wrapper'
import sys
sys.path.append(XBOOST_LOCATION)
import xgboost as xgb

from sklearn.cross_validation import train_test_split

import pandas as pd
import numpy as np
import itertools

# Prepare Data

In [5]:
def proportional_train_test_split(df, label_col, test_size=0.25, random_state=None):
    '''
    Split a dataframe into training and test set so that label proportions stay the same.

    Every class is split on its own with train_test_split and the per-class
    pieces are concatenated afterwards, so each class contributes the same
    fraction of its rows to the test set.

    @param df DataFrame input data
    @param label_col String name of column that holds class labels
    @param test_size float proportion of test set in range (0,1)
    @param random_state optional seed forwarded to train_test_split; pass a
           fixed value to make the split reproducible (default None = unseeded)
    @return tuple (train DataFrame, test DataFrame), both with the columns of
            df and a fresh 0..n-1 row index
    '''
    def _as_frame(part):
        # Depending on the sklearn version train_test_split hands back either
        # DataFrames or plain arrays; only rebuild a frame in the latter case
        # so that column dtypes survive whenever they can.
        if isinstance(part, pd.DataFrame):
            return part
        return pd.DataFrame(part, columns=df.columns)

    train_parts, test_parts = [], []
    for label in df[label_col].unique():
        class_rows = df[df[label_col] == label]
        part_train, part_test = train_test_split(
            class_rows, test_size=test_size, random_state=random_state)
        train_parts.append(_as_frame(part_train))
        test_parts.append(_as_frame(part_test))

    # pd.concat (unlike np.concatenate) keeps per-column dtypes instead of
    # collapsing the mixed numeric/string frame into one object array.
    return pd.concat(train_parts, ignore_index=True), \
            pd.concat(test_parts, ignore_index=True)
    
    
# Load the labelled data (first CSV column becomes the row index) and carve
# out a control set holding 20% of each class.
df = pd.read_csv('./train.csv', index_col=0)
df_train, df_control = proportional_train_test_split(df, 'target', test_size=0.2)


def _to_class_ids(raw_labels):
    '''Turn label strings of the form "<prefix>_<k>" into zero-based ints (k - 1).'''
    return [int(raw.split('_')[1]) - 1 for raw in raw_labels]


# Separate the label column from the feature columns for each frame.
label_train = df_train['target'].values
df_train = df_train.drop('target', axis=1)

label_control = df_control['target'].values
df_control = df_control.drop('target', axis=1)

label_all = df['target'].values
df = df.drop('target', axis=1)

# Unlabelled data to predict on.
df_test = pd.read_csv('./test.csv', index_col=0)


# Wrap everything in xgboost's DMatrix containers; `labels` is rebound for
# each matrix and ends up holding the encoded labels of the full data set.
labels = _to_class_ids(label_train)
dtrain = xgb.DMatrix(df_train.values, label=labels)

labels = _to_class_ids(label_control)
dcontrol = xgb.DMatrix(df_control.values, label=labels)

labels = _to_class_ids(label_all)
dall = xgb.DMatrix(df.values, label=labels)

dtest = xgb.DMatrix(df_test.values)

# Parameter Tuning (Grid Search)

In [47]:
%%capture --no-stdout
'''
Grid search. Define param_options below; the for-loop then runs
cross-validation for every possible combination of the listed values,
printing the best mean test logloss score (and its round) for each.

When running through many options, put the output into a DataFrame,
then group by the different columns and aggregate to find the best settings.
'''

from operator import itemgetter
def best_cv_result(cv_results):
    '''
    Find the round with the lowest score in a list of cv result lines.

    From each line the text between the first ':' and the following '+' is
    read as a float. Returns a tuple (round index, score) for the smallest
    value; on ties the earliest round wins.
    '''
    scores = []
    for line in cv_results:
        after_colon = line.split(':')[1]
        mean_text = after_colon.split('+')[0]
        scores.append(float(mean_text))

    best_round = min(range(len(scores)), key=scores.__getitem__)
    return best_round, scores[best_round]

param_initial = {
        'silent': 0,
        'objective': 'multi:softprob', 
        'num_class': 9,
        'eval_metric': 'mlogloss'
    }

param_options = (
        ('subsample', [0.1]),
        ('colsample_bytree', [0.1]),
        ('min_child_weight', [1]),
        ('max_depth', [10]),
        ('eta', [0.01]),
        ('gamma', [False, 0.2])
    )

num_round = 5
nfold = 5

for param_vals in itertools.product(*[aux[1] for aux in param_options]):
        
    param = param_initial
    for i, val in enumerate(param_vals):
        if val is not False:
            param[param_options[i][0]] = val
        
    cv_results = xgb.cv(param, dtrain, num_round, nfold=nfold)
    print param_vals, best_cv_result(cv_results)
    sys.stdout.flush()

(0.1, 0.1, 1, 10, 0.01, False) (4, 2.144435)
(0.1, 0.1, 1, 10, 0.01, 0.2) (4, 2.14441)


# Predict

In [46]:
'''
use parameters found from grid search above
'''
with_train_test_split = False
run_prediction = False

param = {
    'silent': 0,
    'objective': 'multi:softprob', 
    'num_class': 9,
    'eval_metric': 'mlogloss',
    'subsample': 0.1,
    'colsample_bytree': 0.1,
    'min_child_weight': 1,
    'bst:max_depth':10,
    'bst:eta': 0.1,
    'gamma': 0.1
}
num_round = 4200

print num_round, param

plst = param.items()

if with_train_test_split:
    evallist = [(dcontrol,'eval'), (dtrain,'train')]
    bst = xgb.train( plst, dtrain, num_round, evallist )
else: 
    evallist = [(dall,'train')]
    bst = xgb.train( plst, dall, num_round, evallist )
    
if run_prediction:
    pred = bst.predict(dtest)

4200 {'bst:max_depth': 10, 'subsample': 0.1, 'bst:eta': 0.1, 'gamma': 0.1, 'num_class': 9, 'silent': 0, 'eval_metric': 'mlogloss', 'objective': 'multi:softprob', 'colsample_bytree': 0.1, 'min_child_weight': 1}


In [123]:
# Predict on the test matrix with the booster trained above. This cell runs
# unconditionally, whereas the same call in the training cell is guarded by
# `run_prediction`.
pred = bst.predict(dtest)

# Output

In [124]:
import csv

# Header names for the nine per-class columns: Class_1 .. Class_9
label_columns = []
for class_no in range(1, 10):
    label_columns.append('Class_' + str(class_no))

with open('./output.csv', 'w') as fh:
    writer = csv.writer(fh)
    writer.writerow(['id'] + label_columns)
    # One line per prediction row: a 1-based running id followed by the
    # row's values rounded to three decimals.
    # NOTE(review): ids are generated by position, not taken from
    # df_test's index - confirm the test ids really are 1..n.
    for row_id, values in enumerate(pred, 1):
        writer.writerow([row_id] + [round(v, 3) for v in values])