In [137]:
XBOOST_LOCATION = '/Users/siim/Lib/xgboost/wrapper'
import sys
sys.path.append(XBOOST_LOCATION)
import xgboost as xgb

import numpy as np
import pandas as pd
import scipy.stats
import sklearn as skl

from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.preprocessing import scale, OneHotEncoder
from sklearn.svm import SVC
from sklearn.decomposition import PCA

from sklearn.cross_validation import train_test_split

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.svm import SVC
import sklearn.linear_model as lm
from sklearn.metrics import log_loss

import scipy as sp


from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import StandardScaler
#Neural Nets
from lasagne.layers import DenseLayer
from lasagne.layers import InputLayer
from lasagne.layers import DropoutLayer
from lasagne.nonlinearities import softmax
from lasagne.updates import nesterov_momentum
from nolearn.lasagne import NeuralNet

import time
import csv

In [3]:
def proportional_train_test_split(df, label_col, test_size=0.25):
    '''
    Split a dataframe into training and test set so that label proportions stay the same.

    Every class is split on its own with train_test_split and the per-class
    pieces are concatenated, so the returned rows are grouped by class
    (shuffle afterwards if a random row order is needed).

    @param df DataFrame input data
    @param label_col String name of column that holds class labels
    @param test_size float proportion of test set in range (0,1)
    @return tuple (train, test) of DataFrames that carry the columns of df
    '''
    # label_col used to be overwritten with the literal 'target' at this point,
    # which silently ignored the caller's argument.
    train_parts, test_parts = [], []
    for label in df[label_col].unique():
        part_train, part_test = train_test_split(df[df[label_col] == label],
                                                 test_size=test_size)
        train_parts.append(part_train)
        test_parts.append(part_test)

    # np.concatenate works on plain arrays, so the frames are rebuilt with the
    # original column names and a default index.
    return pd.DataFrame(np.concatenate(train_parts), columns=df.columns), \
            pd.DataFrame(np.concatenate(test_parts), columns=df.columns)
    
   
def llfun(act, pred):
    '''log loss function from https://www.kaggle.com/wiki/LogarithmicLoss
    Differently from sklearn.metrics.log_loss does not require all classes
    to be present in pred

    @param act array-like of 0/1 indicators (1-D, or 2-D with one column per class)
    @param pred array-like of predicted probabilities, same shape as act
    @return the mean log loss: a scalar for 1-D input, one value per column for 2-D input
    '''
    epsilon = 1e-15
    act = np.asarray(act, dtype=float)
    # clip away exact 0 and 1 so that the logarithms below stay finite
    pred = np.clip(np.asarray(pred, dtype=float), epsilon, 1 - epsilon)
    # numpy is used directly: the same functions re-exported under the scipy
    # namespace (sp.log, sp.maximum, ...) are deprecated aliases.
    ll = np.sum(act * np.log(pred) + (1 - act) * np.log(1 - pred), axis=0)
    return ll * -1.0 / len(act)

def get_mean_prob(arr):
    '''Element-wise average of a sequence of equally shaped prediction arrays.

    The arrays are stacked depth-wise (along a third axis) and that axis is
    then averaged away.
    '''
    stacked = np.dstack(arr)
    return stacked.mean(axis=2)

def preprocess_data(X, scaler=None):
    '''Scale X and return the scaled data together with the scaler used.

    @param X data to transform
    @param scaler an already fitted scaler exposing transform(); when None a
           new StandardScaler is fitted on X first
    @return tuple (transformed X, scaler)
    '''
    # Compare against None explicitly: a truthiness test would refit (and
    # replace) any scaler object that happens to evaluate as False.
    if scaler is None:
        scaler = StandardScaler()
        scaler.fit(X)
    X = scaler.transform(X)
    return X, scaler

# Prepare Data

In [14]:
# read data from csv to DataFrames
def _class_index(target):
    '''Turn a label of the form "<prefix>_<number>" into the zero-based integer <number> - 1.'''
    return int(target.split('_')[1]) - 1


df_all = pd.read_csv('./train.csv', index_col=0)
df_train, df_control = proportional_train_test_split(df_all, 'target', test_size=0.2)
df_test = pd.read_csv('./test.csv', index_col=0)

# shuffle the rows of every labelled set (for Neural Nets) and derive the
# integer class labels in the shuffled order
df_all = df_all.reindex(np.random.permutation(df_all.index))
labels_all = df_all.target.map(_class_index)

df_train = df_train.reindex(np.random.permutation(df_train.index))
labels_train = df_train.target.map(_class_index)

df_control = df_control.reindex(np.random.permutation(df_control.index))
labels_control = df_control.target.map(_class_index)

# create xgboost data structures; the labels are the ones computed above,
# passed as plain lists of ints
dtrain = xgb.DMatrix(df_train.drop('target', axis=1).values,
                     label=labels_train.tolist())

dcontrol = xgb.DMatrix(df_control.drop('target', axis=1).values,
                       label=labels_control.tolist())

dall = xgb.DMatrix(df_all.drop('target', axis=1).values,
                   label=labels_all.tolist())

dtest = xgb.DMatrix(df_test.values)

# dimensions handed to the neural nets
num_classes = 9
num_features = 93

# 1. Gradient Boost model

In [180]:
%%capture --no-stdout

#predictors_xgb, predictions_xgb = [], []

sys.stdout.flush()
param = {'bst:max_depth': 8, 'subsample': 0.9, 
         'bst:eta': 0.075, 'num_class': 9, 
         'silent': 0, 'eval_metric': 'mlogloss', 
         'objective': 'multi:softprob', 
         'colsample_bytree': 0.6, 'min_child_weight': 4}

num_round = 550
evallist = [(dcontrol,'eval'), (dtrain,'train')]

for i in range(30):  
    
    time_start = time.time()
    param['seed'] = np.random.randint(10000)
    plst = param.items()
    tmp1 = xgb.train( plst, dtrain, num_round, evallist )
    predictors_xgb.append( tmp1 )
    time_train = time.time() - time_start
    
    tmp2 = tmp1.predict(dcontrol) 
    predictions_xgb.append( tmp2 )
    time_predict = time.time() - time_start - time_train
    print '{}) {}, (time: {}, {})'.format(i, 
                                          log_loss(labels_control, tmp2),
                                          time_train, time_predict)    
    sys.stdout.flush()

# 2. Neural Nets

In [17]:
#parameters
def _two_hidden_layers(num_units, dropout_p=0.5):
    '''Build the (layers, params) pair of the basic net used most often below:
    input -> dense0 -> dropout0 -> dense1 -> output, both dense layers
    num_units wide.'''
    layers = [('input', InputLayer),
              ('dense0', DenseLayer),
              ('dropout0', DropoutLayer),
              ('dense1', DenseLayer),
              ('output', DenseLayer)]
    params = {'dense0_num_units': num_units,
              'dropout0_p': dropout_p,
              'dense1_num_units': num_units,
             }
    return layers, params


# Each entry is a (layers, params) tuple: the layer list and the per-layer
# keyword arguments that are unpacked into NeuralNet by the training cell.
iteration_params = [
    _two_hidden_layers(200),
    _two_hidden_layers(600),
    _two_hidden_layers(800),
    _two_hidden_layers(512),
    _two_hidden_layers(1024),
    # dropout on the input as well as between the dense layers
    (
        [('input', InputLayer),
         ('dropout0', DropoutLayer),
         ('dense0', DenseLayer),
         ('dropout1', DropoutLayer),
         ('dense1', DenseLayer),
         ('output', DenseLayer)],
        # the dropout keys used to be 'dropout0' / 'dropout1', which does not
        # follow the '<layer>_p' naming of every other configuration
        {'dropout0_p': 0.5,
         'dense0_num_units': 1024,
         'dropout1_p': 0.5,
         'dense1_num_units': 1024,
        }
    ),
    # three dense layers, dropout only after the first one
    (
        [('input', InputLayer),
         ('dense0', DenseLayer),
         ('dropout0', DropoutLayer),
         ('dense1', DenseLayer),
         ('dense2', DenseLayer),
         ('output', DenseLayer)],
        {'dense0_num_units': 900,
         'dropout0_p': 0.75,
         'dense1_num_units': 900,
         'dense2_num_units': 200,
        }
    ),
    # three dense layers of equal width, dropout after each
    (
        [('input', InputLayer),
         ('dense0', DenseLayer),
         ('dropout0', DropoutLayer),
         ('dense1', DenseLayer),
         ('dropout1', DropoutLayer),
         ('dense2', DenseLayer),
         ('dropout2', DropoutLayer),
         ('output', DenseLayer)],
        {'dense0_num_units': 900,
         'dropout0_p': 0.5,
         'dense1_num_units': 900,
         'dropout1_p': 0.5,
         'dense2_num_units': 900,
         'dropout2_p': 0.75,
        }
    ),
    # three dense layers of decreasing width, dropout after each
    (
        [('input', InputLayer),
         ('dense0', DenseLayer),
         ('dropout0', DropoutLayer),
         ('dense1', DenseLayer),
         ('dropout1', DropoutLayer),
         ('dense2', DenseLayer),
         ('dropout2', DropoutLayer),
         ('output', DenseLayer)],
        {'dense0_num_units': 1024,
         'dropout0_p': 0.5,
         'dense1_num_units': 512,
         'dropout1_p': 0.5,
         'dense2_num_units': 256,
         'dropout2_p': 0.5,
        }
    ),
    #####
    # the last three entries are the ones the training cell iterates over
    _two_hidden_layers(512),
    _two_hidden_layers(600),
    _two_hidden_layers(700),
    ]

In [181]:
# Features are selected by dropping the label column by name (as the xgboost
# matrices do) instead of assuming that it is the last column.
X, scaler = preprocess_data( df_train.drop('target', axis=1).values.astype('float') )
y = labels_train.values.astype('int32')

# Nets trained by earlier runs of this cell are kept, but on a fresh kernel the
# list has to be created first - otherwise nets.append raises NameError.
if 'nets' not in globals():
    nets = []

# train the last three configurations ten times each
for layers, params in iteration_params[-3:] * 10:
    tmpNet = NeuralNet(layers=layers,
                 input_shape=(None, num_features),
                 output_num_units=num_classes,
                 output_nonlinearity=softmax,

                 update=nesterov_momentum,
                 update_learning_rate=0.01,
                 update_momentum=0.9,

                 eval_size=0.2,
                 verbose=0,
                 max_epochs=35,

                 **params
                 )
    tmpNet.fit(X, y)
    # report the epoch with the lowest validation loss of this net
    best_epoch = min(tmpNet.train_history_, key=lambda x: x['valid_loss'])
    print('{}) best @ epoch {}: {}'.format(len(nets), best_epoch['epoch'], best_epoch['valid_loss']))
    sys.stdout.flush()
    nets.append(tmpNet)

In [182]:
# Validate results
# nets whose control-set log loss is above this value are left out of the ensemble
MAX_NN_LOGLOSS = 0.51

X, _ = preprocess_data(df_control.drop('target', axis=1).values.astype('float'), scaler)
predictions_nn = []
for i, net in enumerate(nets):
    pred = net.predict_proba( X )
    ll = log_loss(labels_control, pred)

    if ll > MAX_NN_LOGLOSS:
        continue
    print('{}) {}'.format(i, ll))
    predictions_nn.append(pred)

# The summary used to print np.shape(predictions), a name that this notebook
# never defines; the list built above is what is being summarised.
if predictions_nn:
    print('NN ({}) ensemble mean {}'.format(np.shape(predictions_nn),
        log_loss(labels_control, get_mean_prob(predictions_nn))))
else:
    # averaging an empty list would fail, so say explicitly that nothing passed
    print('NN: no net scored at or below {} on the control set'.format(MAX_NN_LOGLOSS))

# 3. sklearn models

In [69]:
%%capture --no-stdout

skl_restart = False


clfs = [
    RandomForestClassifier(**params_rf),
#    ExtraTreesClassifier(n_estimators=100),
#    GradientBoostingClassifier(n_estimators=1000, learning_rate=0.5),
#    SVC(C=10000, probability=True)
]

params_rf = {'bootstrap': False, 'min_samples_leaf': 1, 'n_estimators': 1000, 
            'min_samples_split': 3, 'criterion': 'gini', 'max_features': 10}
if skl_restart:
    predictors_skl = []
    predictions_skl = []
    
print 'FIT'
sys.stdout.flush()
for clf in clfs*2:
    predictors_skl.append( 
            clf.fit(df_train[df_train.columns[:-1]], labels_train) # convert classes to int 0...8                
            )    

print 'PREDICT'
sys.stdout.flush()
for clf in predictors_skl:
    predictions_skl.append(clf.predict_proba( df_control[df_control.columns[:-1]] ))
    
pass

FIT
PREDICT


# Validate & Analyze

In [157]:
# review predictors and find ensemble combo

# weights of the model families in the combined ensemble (they sum to 1)
ENS_WEIGHT_NN = 0.3
ENS_WEIGHT_SKL = 0.0
ENS_WEIGHT_XGB = 0.7

nn_ens = get_mean_prob(predictions_nn)
skl_ens = get_mean_prob(predictions_skl)
xgb_ens = get_mean_prob(predictions_xgb)

print('SK ens: {}'.format(log_loss(labels_control, skl_ens)))
print('NN ens: {}'.format(log_loss(labels_control, nn_ens)))
print('GB ens: {}'.format(log_loss(labels_control, xgb_ens)))

for prediction in predictions_xgb:
    print('-- : {}'.format(log_loss(labels_control, prediction)))

# Weighted sum of the three family averages. The weighted arrays used to be
# passed through get_mean_prob, which additionally divided them by 3, so the
# rows of the combined prediction no longer summed to 1.
ens_control = (ENS_WEIGHT_NN * nn_ens +
               ENS_WEIGHT_SKL * skl_ens +
               ENS_WEIGHT_XGB * xgb_ens)
print('ENS {}'.format(log_loss(labels_control, ens_control)))

SK ens: 0.537128993152
NN ens: 0.49128332543
GB ens: 0.445642994198
-- : 0.450854697643
-- : 0.448594597719
-- : 0.44981782105
-- : 0.452437168303
-- : 0.451266495907
ENS 0.438570688446


# OUTPUT

In [168]:
# PREDICTIONS

# Neural nets
X, _ = preprocess_data( df_test.values.astype('float'), scaler)
tests_nn = [net.predict_proba( X ) for net in nets]

# XGB
# NOTE(review): the first five boosters are skipped here, while the validation
# cell averages all of predictions_xgb - confirm that this is intended.
tests_xgb = [bst.predict( dtest ) for bst in predictors_xgb[5:]]
#test_xgb = bst.predict(dtest)

# SKL, RF
tests_rf = [clf.predict_proba( df_test.values ) for clf in predictors_skl]

# Ensemble
# Average each family first and then combine the two averages with the
# weights. Averaging one list of pre-weighted arrays made the effective weights
# depend on how many models each family has and left rows that do not sum to 1.
test_ens = 0.3 * get_mean_prob(tests_nn) + 0.7 * get_mean_prob(tests_xgb)

In [179]:
label_columns = ['Class_' + str(i) for i in range(1,10)]
with open('./output_ens.csv', 'w') as fh:
    writer = csv.writer(fh)
    writer.writerow(['id'] + label_columns)
    # The ids are taken from the index of the test frame (the first column of
    # test.csv) instead of a running counter; the rows of test_ens are
    # predictions for df_test in its original order.
    for sample_id, r in zip(df_test.index, test_ens):
        # a list comprehension keeps the concatenation valid where map()
        # does not return a list
        writer.writerow([sample_id] + [round(x, 3) for x in r])

## Top predictions from each model

Process:
- In the first layer, calculate class probabilities with all classifiers.
- From each first-layer predictor, pick the top N classes (those whose probability is above a threshold).
- Use those classes as features (N * number of predictors in total) for the second layer, which predicts class probabilities.

Result: the initial try gave 0.86 on the leaderboard (LB).

In [174]:
print('Generate parameters:')
TOP_N = 3        # number of highest-probability classes kept per model family
MIN_PROB = 0.1   # a kept class below this probability is replaced by -1

# one averaged probability matrix per model family, computed once and reused
# for both the shape report and the feature extraction
tmp_preds = [get_mean_prob(tests_nn),
             get_mean_prob(tests_rf),
             get_mean_prob(tests_xgb)]

for family_pred in tmp_preds:
    print(np.shape(family_pred))

# NOTE(review): these features come from the test-set predictions; the
# second-layer model needs the same features built from the control-set
# ensembles (nn_ens, skl_ens, xgb_ens) for training - confirm against the
# training cell.
X = []
for rows in zip(*tmp_preds):
    # rows holds the predictions of every family for one sample
    args = []
    for row in rows:
        # indices of the TOP_N most probable classes, least probable first
        for idx in np.argsort(row)[-TOP_N:]:
            args.append(-1 if row[idx] < MIN_PROB else idx)
    X.append( args )
print(np.shape( X ))

Generate parameters:
(144368, 9)
(144368, 9)
(144368, 9)
(144368, 9)


In [183]:
%capture --no-stdout
dtrain2 = xgb.DMatrix(X, labels_control)

sys.stdout.flush()
param = {'bst:max_depth': 8, 'subsample': 0.9, 
         'bst:eta': 0.075, 'num_class': 9, 
         'silent': 0, 'eval_metric': 'mlogloss', 
         'objective': 'multi:softprob', 
         'colsample_bytree': 0.6, 'min_child_weight': 4}
plst = param.items()
num_round = 600
evallist = [(dtrain2,'train')]
bst = xgb.train( plst, dtrain2, num_round, evallist )

pass

ERROR: Line magic function `%capture` not found (But cell magic `%%capture` exists, did you mean that instead?).
[0]	train-mlogloss:2.029082


In [178]:
# The test features get a matrix of their own, so that the name dtrain2 keeps
# referring to the second-layer training data.
dtest2 = xgb.DMatrix( X )
# NOTE: this replaces the weighted-average test_ens computed in the
# predictions cell with the output of the second-layer model.
test_ens = bst.predict(dtest2)

# Analysis over classes

Here we compare the results by class and by predictor.

- Find the best predictors for each class and use them for the final decision.
- Alternatively, just run a new classifier over the results of the initial predictions.
- Try the first layer with both class predictions and probability predictions.

In [None]:
'''
Compare loglosses by class for each predictor
'''
# NOTE(review): `labels` and `predictions` are not defined anywhere in the
# visible notebook, and `aux` is only bound to other things by earlier cells.
# Presumably `aux` is an array of integer class labels, `labels` the values the
# encoder should know and `predictions` a list of probability arrays aligned
# with `aux` - confirm before running this cell.
enc = OneHotEncoder()
enc.fit([[x] for x in labels])

for cls in range(9):
    # one log loss per predictor, restricted to the samples whose label is cls
    lls = []
    for prediction in predictions:
        # one-hot rows for the samples of this class
        act = enc.transform([[x] for x in aux[aux == cls]]).toarray()
        # llfun sums over rows, so for 2-D input it yields one value per
        # column; [cls] picks the column of the class under review
        lls.append( llfun(act, prediction[aux == cls])[cls] ) 
    print 'llfun({})'.format(cls), [round(x, 3) for x in lls]

In [98]:
# NOTE(review): `df` is not defined in the visible notebook. Presumably columns
# '0'..'3' hold the classes predicted by four predictors and column '4' the
# true class - confirm against the cell that builds it.

# value, share of all rows and count for every distinct value of column '4'
print 'TOTAL:'
for a, b in df['4'].value_counts().iteritems():
    print a, round(b/float(len(df)), 3), b

print 


# mistakes[column][value] -> mismatches with that value as a share of all rows
mistakes = {}

for i in '0123':
    print 'class, % of class among wrong, % of mistake in class'
    mistakes[i], s = {}, []
    # rows where column i differs from column '4', counted by the value of column i
    for a, b in df[df[i] != df['4']][i].value_counts().iteritems():
        # a - value of column i (class)
        # b - number of mismatching rows with that value
        s.append(b)
        # the trailing comma keeps the next print on the same output line
        print '{}, {} ({})'.format(a, round(b/float(sum(df[i] != df['4'])), 3), b),        
        # the divisor is the number of rows of the whole column, so this share
        # is relative to all rows rather than to the size of the class
        print round(b/float(len(df[i])), 3) # % of mistake in class
        mistakes[i][a] = round(b/float(len(df[i])), 3)
    print 'total mistake', sum(s)
    print 
    
print 
print
print "best predictors per class"
# mistakes[i] only has keys for values that occur among the mismatches of
# column i, so a class without any mismatch raises KeyError here
for cls in range(9):
    print cls, [mistakes[i][cls] for i in '0123']

TOTAL:
1 0.261 3225
5 0.228 2827
7 0.137 1693
2 0.129 1601
8 0.08 991
6 0.046 568
4 0.044 548
3 0.044 539
0 0.031 386

class, % of class among wrong, % of mistake in class
1, 0.48 (1069) 0.086
2, 0.2 (445) 0.036
7, 0.097 (216) 0.017
5, 0.075 (168) 0.014
8, 0.067 (150) 0.012
6, 0.036 (80) 0.006
3, 0.026 (58) 0.005
0, 0.012 (26) 0.002
4, 0.006 (14) 0.001
total mistake 2226

class, % of class among wrong, % of mistake in class
1, 0.514 (1196) 0.097
2, 0.177 (413) 0.033
7, 0.099 (231) 0.019
5, 0.089 (207) 0.017
8, 0.061 (142) 0.011
6, 0.031 (72) 0.006
3, 0.014 (32) 0.003
0, 0.01 (23) 0.002
4, 0.006 (13) 0.001
total mistake 2329

class, % of class among wrong, % of mistake in class
1, 0.47 (1228) 0.099
7, 0.191 (498) 0.04
2, 0.152 (397) 0.032
5, 0.055 (143) 0.012
8, 0.048 (125) 0.01
6, 0.042 (110) 0.009
0, 0.027 (70) 0.006
3, 0.011 (28) 0.002
4, 0.006 (15) 0.001
total mistake 2614

class, % of class among wrong, % of mistake in class
1, 0.442 (949) 0.077
2, 0.223 (479) 0.039
7, 0.081 (174) 