In [4]:
import pandas as pd
import numpy as np
import csv
import random

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier, AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.cross_validation import train_test_split
from sklearn.metrics import log_loss
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.base import BaseEstimator
from scipy.optimize import minimize
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.calibration import CalibratedClassifierCV
from xgboost import XGBClassifier
from sknn.mlp import Classifier, Layer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

print('Load data...')


###################################################
########     TO CHANGE    #########################
###################################################

#Folder holding train.csv and test.csv; point this at your local copy.
DATA_DIR = "/Desktop/Kaggle/BNP_Paribas/"

###################################################
########     END OF CHANGE     ####################
###################################################

train = pd.read_csv(DATA_DIR + "train.csv")
test = pd.read_csv(DATA_DIR + "test.csv")

#Keep the label vector and the test row ids as plain arrays before the
#columns they live in are removed from the feature frames.
target = train['target'].values
id_test = test['ID'].values

#What remains in train/test after this are the feature columns only.
train = train.drop(['ID', 'target'], axis=1)
test = test.drop(['ID'], axis=1)

print('Clearing...')

#Value written over missing entries in numeric columns.
MISSING_SENTINEL = -9999

#Walk the train and test columns pairwise, in positional order.
for train_name, test_name in zip(list(train.columns), list(test.columns)):
    if train[train_name].dtype == 'O':
        #Object column: replace the values with integer codes learned from
        #train, then look the test values up in the same set of categories.
        #-1 marks missing values (and, for test, values not found in the
        #categories seen in train).
        codes, categories = pd.factorize(train[train_name])
        train[train_name] = codes
        test[test_name] = categories.get_indexer(test[test_name])
        continue

    #Int or float column: overwrite NaN with the sentinel, train first.
    train_missing = train[train_name].isnull()
    if train_missing.any():
        train.loc[train_missing, train_name] = MISSING_SENTINEL

    #...and the same for the matching test column.
    #TODO: filling with the train mean was considered as an alternative.
    test_missing = test[test_name].isnull()
    if test_missing.any():
        test.loc[test_missing, test_name] = MISSING_SENTINEL
            

            
n_classes = 2
for_real = True

#Local evaluation split: carve 20% of the labelled data off as a hold-out
#"test" set, then split the remaining 80% into 75% training / 25% validation.
#(This is the part to change for a full run, where train and test already
#come as separate files.)
X, X_test, y, y_test = train_test_split(train, target, test_size=0.2)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.25)

print('Data shape:')
print('X_train: %s, X_valid: %s, X_test: %s \n' % (X_train.shape,
                                                   X_valid.shape,
                                                   X_test.shape))

if for_real:
    #Submission setup: fit on the complete labelled data and predict the real
    #test file; a 25% validation set is still split off from train.
    X_real, y_real, X_test_real = train, target, test

    X_train_real, X_valid_real, y_train_real, y_valid_real = train_test_split(
        X_real, y_real, test_size=0.25)
    

    
    
#Defining the classifiers (name -> estimator).
#Idea: jitter the random state and average the resulting predictions;
#probably only worthwhile for extra trees, RF?, native XGB and NN.
#
#Candidates that are currently switched off:
#   'LR'  : LogisticRegression()
#   'SVM' : SVC(probability=True, random_state=random_state)
#   'RF'  : RandomForestClassifier(n_estimators=100, n_jobs=-1)
#   'GBM' : GradientBoostingClassifier(n_estimators=50)
#   'KNN' : KNeighborsClassifier(n_neighbors=30)
#   'NN'  : Pipeline([('min/max scaler', MinMaxScaler(feature_range=(-1.0, 1.0))),
#                     ('neural network', Classifier(layers=[Layer("Rectifier", units=10),
#                                                           Layer("Tanh", units=10),
#                                                           Layer("Softmax")],
#                                                   n_iter=5))])

#Hyper-parameters for the XGBoost model, kept together in one place.
_xgb_settings = dict(
    objective='binary:logistic',
    colsample_bytree=0.77638333498678636,
    learning_rate=0.030567867858705199,
    max_delta_step=4.6626180513766657,
    min_child_weight=57.354121041109124,
    n_estimators=478,
    subsample=0.8069399976204783,
    max_depth=6,
    gamma=0.2966938071810209,
)

clfs = {
    'ETC': ExtraTreesClassifier(n_estimators=108, max_features=130,
                                max_depth=12, n_jobs=-1),
    'XGBc': XGBClassifier(**_xgb_settings),
}


print('------------------------------------------------------------------------')
print('MEAN MODELS: Performance of individual classifiers (1st layer) on X_test')
print('------------------------------------------------------------------------')

#Number of differently-seeded fits that are averaged for each classifier.
N_SEED_RUNS = 250


def _reseed(estimator, seed):
    """Set the estimator's own random-state parameter(s) to `seed`.

    Seeding Python's `random` module does not influence scikit-learn or
    XGBoost estimators; they only honour the seed parameter they expose
    themselves. Parameters named `random_state` are preferred (including
    nested ones such as `step__random_state` in a Pipeline); if there are
    none, parameters named `seed` are used instead.

    Returns True when at least one parameter was set, False otherwise.
    """
    names = list(estimator.get_params())
    for suffix in ('random_state', 'seed'):
        hits = [name for name in names if name.split('__')[-1] == suffix]
        if hits:
            estimator.set_params(**{name: seed for name in hits})
            return True
    return False


def _fit_predict(estimator, fit_X, fit_y, pred_X, use_matrix):
    """Fit `estimator` on (fit_X, fit_y) and return predict_proba(pred_X).

    With `use_matrix` set, both frames are passed as plain arrays via
    `.as_matrix()` (this is how the 'NN' pipeline is fed).
    """
    if use_matrix:
        fit_X, pred_X = fit_X.as_matrix(), pred_X.as_matrix()
    estimator.fit(fit_X, fit_y)
    return estimator.predict_proba(pred_X)


p_valid_mean = []
p_test_mean = []

p_valid_mean_real = []
p_test_mean_real = []

#sorted() pins the classifier order (a plain dict gives no ordering guarantee
#on older Pythons), so position i in the p_*_mean lists always refers to the
#same classifier from run to run.
for nm, clf in sorted(clfs.items()):
    p_valid_clf = []
    p_test_clf = []
    p_valid_clf_real = []
    p_test_clf_real = []

    use_matrix = (nm == 'NN')

    #Distinct seeds from 1..10000, one per run.
    seeds = random.sample(range(1, 10001), N_SEED_RUNS)

    for i, seed in enumerate(seeds):
        print("Iteration: " + str(i+1))
        #Hand the seed to the estimator itself so every run really differs.
        _reseed(clf, seed)

        #First run. Training on (X_train, y_train) and predicting on X_valid.
        p_valid_clf.append(_fit_predict(clf, X_train, y_train, X_valid, use_matrix))

        #Second run. Training on (X, y) and predicting on X_test.
        p_test_clf.append(_fit_predict(clf, X, y, X_test, use_matrix))

        if for_real:
            #Same two runs on the full data: validation split first...
            p_valid_clf_real.append(
                _fit_predict(clf, X_train_real, y_train_real, X_valid_real, use_matrix))

            #...then train on everything and predict the real test set.
            p_test_clf_real.append(
                _fit_predict(clf, X_real, y_real, X_test_real, use_matrix))

    #Average the per-seed probability matrices element-wise.
    mean_pred_cv = np.mean(p_valid_clf, axis=0)
    mean_pred_test = np.mean(p_test_clf, axis=0)
    p_valid_mean.append(mean_pred_cv)
    p_test_mean.append(mean_pred_test)

    #NOTE: with for_real switched off these lists are empty and the means
    #come out as nan.
    mean_real_pred_cv = np.mean(p_valid_clf_real, axis=0)
    mean_real_pred_test = np.mean(p_test_clf_real, axis=0)
    p_valid_mean_real.append(mean_real_pred_cv)
    p_test_mean_real.append(mean_real_pred_test)

    #Printing out the performance of the averaged classifier on the hold-out set.
    print('{:10s} {:2s} {:1.7f}'.format('%s - mean: ' %(nm), 'logloss  =>', log_loss(y_test, mean_pred_test)))

print('')

#Next steps: also try different parameter settings for the XGB model and add a NN to the mix.
#Either tune each classifier with bayes_opt and put those parameters into this model, -or-
#randomize both the parameters and the random_state. That could be done with several loops:
#for one random_state, run a series of different parameter sets and take the average result
#for that random_state, then run the same parameter sets on the next random_state
#(random x random is not wanted, as it is hard to replicate). That costs a lot of compute
#time and there are too many combinations... so let's do bayes_opt.






#Write the averaged probabilities to disk, one CSV per prediction set.
#Each entry: (column-name prefix, list of per-classifier mean predictions,
#output file). Columns are named <prefix>_<classifier index>_<class index>.
_mean_pred_dumps = [
    ("p_valid_mean", p_valid_mean,
     'p_valid_mean_preds_for_EXT_and_XGBc_mean_models.csv'),
    ("p_test_mean", p_test_mean,
     'p_test_mean_preds_for_EXT_and_XGBc_mean_models.csv'),
    ("p_valid_mean_real", p_valid_mean_real,
     'p_valid_mean_real_preds_for_EXT_and_XGBc_mean_models.csv'),
    ("p_test_mean_real", p_test_mean_real,
     'p_test_mean_real_preds_for_EXT_and_XGBc_mean_models.csv'),
]

for prefix, preds, out_csv in _mean_pred_dumps:
    columns = {}
    #Exactly the first two classifiers and both class-probability columns.
    for model_idx in (0, 1):
        for class_idx in (0, 1):
            col_name = "%s_%d_%d" % (prefix, model_idx, class_idx)
            columns[col_name] = preds[model_idx][:, class_idx]
    pd.DataFrame(columns).to_csv(out_csv, index=False)


Load data...
Clearing...
Data shape:
X_train: (68592, 131), X_valid: (22864, 131), X_test: (22865, 131) 

------------------------------------------------------------------------
MEAN MODELS: Performance of individual classifiers (1st layer) on X_test
------------------------------------------------------------------------
Iteration: 0
Iteration: 1


KeyboardInterrupt: 