In [1]:
def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn

In [2]:
import pandas as pd
import numpy as np
import imp
import matplotlib.pyplot as plt
import xgboost
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import *
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
%matplotlib inline
from HelperClass.DataProcessing import *

In [3]:
trainDataFile = 'Data/train.csv'
testDataFile = 'Data/test.csv'
dataTypes = {
    'PassengerId' : 'int64',
    'Survived' : 'int64',
    'Pclass' : 'int64',
    'Name' : 'object',
    'Sex' : 'object',
    'Age' : 'float64',
    'SibSp' : 'int64',
    'Parch' : 'int64',
    'Ticket' : 'object',
    'Fare' : 'float64',
    'Cabin' : 'object',
    'Embarked' : 'object'
}
sep = ','
target = 'Survived'
randomSeed = 0
testRatio = 0.3
numCores = 10

In [4]:
dataProc = DataProcessing(trainDataFile, dataTypes, sep)
dataProc.ReadFile()
dataProc.PopulateFeatureAndLabel(target)

In [5]:
dataProc.GetSummaryOfNull()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [6]:
catCols = dataProc.GetColumnsByType('object')
numCols = dataProc.GetColumnsByType(['int64', 'float64'])

In [7]:
columnsToIgnore = ['Name', 'PassengerId', 'Ticket']

In [8]:
catCols = [i for i in catCols if i not in columnsToIgnore]
numCols = [i for i in numCols if i not in columnsToIgnore]

In [9]:
dataProc.FillNullCatColWithNA(catCols)
dataProc.FillNullNumColWithMean(numCols)

In [10]:
dataProc.GetSummaryOfNull()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

In [11]:
dataProc.ShowDistinctValues(catCols)

{'Cabin': {'NA': 687,
  'C23 C25 C27': 4,
  'B96 B98': 4,
  'G6': 4,
  'E101': 3,
  'C22 C26': 3,
  'D': 3,
  'F2': 3,
  'F33': 3,
  'C124': 2,
  'B51 B53 B55': 2,
  'E33': 2,
  'E67': 2,
  'F4': 2,
  'E24': 2,
  'E44': 2,
  'C2': 2,
  'B58 B60': 2,
  'C83': 2,
  'E8': 2,
  'D17': 2,
  'C65': 2,
  'B20': 2,
  'B28': 2,
  'D36': 2,
  'C126': 2,
  'C93': 2,
  'C123': 2,
  'D26': 2,
  'D20': 2,
  'F G73': 2,
  'C52': 2,
  'C125': 2,
  'B35': 2,
  'B22': 2,
  'C92': 2,
  'B49': 2,
  'D33': 2,
  'B57 B59 B63 B66': 2,
  'B18': 2,
  'D35': 2,
  'E25': 2,
  'C68': 2,
  'B5': 2,
  'B77': 2,
  'C78': 2,
  'E121': 2,
  'F E69': 1,
  'D6': 1,
  'C49': 1,
  'D50': 1,
  'C46': 1,
  'B94': 1,
  'B50': 1,
  'E46': 1,
  'C70': 1,
  'B82 B84': 1,
  'E40': 1,
  'E68': 1,
  'T': 1,
  'B38': 1,
  'C82': 1,
  'A14': 1,
  'A19': 1,
  'C32': 1,
  'B73': 1,
  'D7': 1,
  'E31': 1,
  'A5': 1,
  'C106': 1,
  'E34': 1,
  'C148': 1,
  'A20': 1,
  'C118': 1,
  'C99': 1,
  'C90': 1,
  'E17': 1,
  'C128': 1,
  'A10': 

In [12]:
dataProc.PopulateFeatureColumns(catCols + numCols)

In [14]:
dataProc.RandomSplitTrainTestData(testRatio, randomSeed)

## Random forest model

In [15]:
preProc = ColumnTransformer([
    ('StdScaler', StandardScaler(), numCols),
    ('OneHot', OneHotEncoder(handle_unknown='ignore'), catCols)
])
rf = RandomForestClassifier(random_state = randomSeed)
rfpipeline = Pipeline([('preProcessing', preProc),
                       ('rf', rf)])
params = {
    'rf__n_estimators' : range(5, 41, 5),
    'rf__max_depth' : range(2, 21, 2),
    'rf__max_features' : ['sqrt', 'log2', None]
}

In [16]:
rfcv = RandomizedSearchCV(rfpipeline, n_iter= 300, scoring = 'accuracy', 
                          param_distributions=params, verbose=1, cv = 20, n_jobs = numCores, random_state=randomSeed)

In [17]:
rfcv.fit(dataProc.X_train, dataProc.y_train)

Fitting 20 folds for each of 240 candidates, totalling 4800 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:    2.4s
[Parallel(n_jobs=10)]: Done 180 tasks      | elapsed:    4.2s
[Parallel(n_jobs=10)]: Done 1119 tasks      | elapsed:   14.3s
[Parallel(n_jobs=10)]: Done 2519 tasks      | elapsed:   31.3s
[Parallel(n_jobs=10)]: Done 4319 tasks      | elapsed:   56.2s
[Parallel(n_jobs=10)]: Done 4800 out of 4800 | elapsed:  1.1min finished


RandomizedSearchCV(cv=20, error_score='raise-deprecating',
          estimator=Pipeline(memory=None,
     steps=[('preProcessing', ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
         transformer_weights=None,
         transformers=[('StdScaler', StandardScaler(copy=True, with_mean=True, with_std=True), ['Age', 'Fare', 'Parch', 'Pclass', 'SibSp']), ('OneHot', OneHotEncoder(cat...ors='warn', n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False))]),
          fit_params=None, iid='warn', n_iter=300, n_jobs=10,
          param_distributions={'rf__n_estimators': range(5, 41, 5), 'rf__max_depth': range(2, 21, 2), 'rf__max_features': ['sqrt', 'log2', None]},
          pre_dispatch='2*n_jobs', random_state=0, refit=True,
          return_train_score='warn', scoring='accuracy', verbose=1)

## XGB model

In [19]:
param_grid = {
    'xgb__max_depth' : [3, 5, 7, 9],
    'xgb__learning_rate' : [0.05, 0.1, 0.2, 0.4],
    'xgb__n_estimators' : [300, 500, 700],
    'xgb__subsample' : [0.5, 0.7, 0.9],
    'xgb__colsample_bytree' : [0.5, 0.7, 0.9],
    'xgb__reg_alpha' : [0.1, 0.3, 0.5, 0.7, 0.9],
    'xgb__reg_lambda' : [0.1, 0.3, 0.5, 0.7, 0.9],
}

xgb = xgboost.XGBClassifier(seed=randomSeed)
xgbpipeline = Pipeline([('preProcessing', preProc),
                       ('xgb', xgb)])
xgbcv = RandomizedSearchCV(xgbpipeline, n_iter = 50, cv=20, param_distributions =param_grid,
                        scoring='neg_mean_absolute_error', verbose = 2, n_jobs = numCores, random_state=randomSeed)

xgbcv.fit(dataProc.X_train, dataProc.y_train)

Fitting 20 folds for each of 50 candidates, totalling 1000 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  21 tasks      | elapsed:    3.9s
[Parallel(n_jobs=10)]: Done 142 tasks      | elapsed:   23.0s
[Parallel(n_jobs=10)]: Done 345 tasks      | elapsed:   53.6s
[Parallel(n_jobs=10)]: Done 628 tasks      | elapsed:  1.6min
[Parallel(n_jobs=10)]: Done 1000 out of 1000 | elapsed:  2.4min finished


RandomizedSearchCV(cv=20, error_score='raise-deprecating',
          estimator=Pipeline(memory=None,
     steps=[('preProcessing', ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
         transformer_weights=None,
         transformers=[('StdScaler', StandardScaler(copy=True, with_mean=True, with_std=True), ['Age', 'Fare', 'Parch', 'Pclass', 'SibSp']), ('OneHot', OneHotEncoder(cat...=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=0, silent=True,
       subsample=1))]),
          fit_params=None, iid='warn', n_iter=50, n_jobs=10,
          param_distributions={'xgb__max_depth': [3, 5, 7, 9], 'xgb__learning_rate': [0.05, 0.1, 0.2, 0.4], 'xgb__n_estimators': [300, 500, 700], 'xgb__subsample': [0.5, 0.7, 0.9], 'xgb__colsample_bytree': [0.5, 0.7, 0.9], 'xgb__reg_alpha': [0.1, 0.3, 0.5, 0.7, 0.9], 'xgb__reg_lambda': [0.1, 0.3, 0.5, 0.7, 0.9]},
          pre_dispatch='2*n_jobs', random_state=0, refit=True,
          return_train_score='warn', scoring='neg

In [20]:
y_test_rf_pred = rfcv.predict(dataProc.X_test)
y_test_xgb_pred = xgbcv.predict(dataProc.X_test)

In [21]:
print('Random Forest Model %.3f ' % accuracy_score(dataProc.y_test, y_test_rf_pred))
print('XGB Model %.3f ' % accuracy_score(dataProc.y_test, y_test_xgb_pred))

Random Forest Model 0.813 
XGB Model 0.836 


## Model stacking

In [28]:
from mlens.ensemble import SuperLearner
ensemble = SuperLearner(scorer=accuracy_score, random_state=randomSeed)
ensemble.add([rfcv, xgbcv])
ensemble.add_meta(LogisticRegression())
ensemble.fit(dataProc.X_train, dataProc.y_train)
y_test_stacking_pred = ensemble.predict(dataProc.X_test)

Fitting 5 folds for each of 240 candidates, totalling 1200 fits
Fitting 2 folds for each of 50 candidates, totalling 100 fits
Fitting 2 folds for each of 50 candidates, totalling 100 fits
Fitting 5 folds for each of 240 candidates, totalling 1200 fitsFitting 2 folds for each of 50 candidates, totalling 100 fits
Fitting 5 folds for each of 240 candidates, totalling 1200 fits

[CV] xgb__subsample=0.9, xgb__reg_lambda=0.1, xgb__reg_alpha=0.5, xgb__n_estimators=300, xgb__max_depth=3, xgb__learning_rate=0.4, xgb__colsample_bytree=0.5 
[CV] xgb__subsample=0.9, xgb__reg_lambda=0.1, xgb__reg_alpha=0.5, xgb__n_estimators=300, xgb__max_depth=3, xgb__learning_rate=0.4, xgb__colsample_bytree=0.5 
[CV] xgb__subsample=0.9, xgb__reg_lambda=0.1, xgb__reg_alpha=0.5, xgb__n_estimators=300, xgb__max_depth=3, xgb__learning_rate=0.4, xgb__colsample_bytree=0.5 


[Parallel(n_jobs=10)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=10)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=10)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=10)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=10)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=10)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  xgb__subsample=0.9, xgb__reg_lambda=0.1, xgb__reg_alpha=0.5, xgb__n_estimators=300, xgb__max_depth=3, xgb__learning_rate=0.4, xgb__colsample_bytree=0.5, total=   0.5s
[CV]  xgb__subsample=0.9, xgb__reg_lambda=0.1, xgb__reg_alpha=0.5, xgb__n_estimators=300, xgb__max_depth=3, xgb__learning_rate=0.4, xgb__colsample_bytree=0.5, total=   0.6s
[CV] xgb__subsample=0.9, xgb__reg_lambda=0.1, xgb__reg_alpha=0.5, xgb__n_estimators=300, xgb__max_depth=3, xgb__learning_rate=0.4, xgb__colsample_bytree=0.5 
[CV] xgb__subsample=0.9, xgb__reg_lambda=0.1, xgb__reg_alpha=0.5, xgb__n_estimators=300, xgb__max_depth=3, xgb__learning_rate=0.4, xgb__colsample_bytree=0.5 
[CV]  xgb__subsample=0.9, xgb__reg_lambda=0.1, xgb__reg_alpha=0.5, xgb__n_estimators=300, xgb__max_depth=3, xgb__learning_rate=0.4, xgb__colsample_bytree=0.5, total=   0.6s
[CV] xgb__subsample=0.9, xgb__reg_lambda=0.1, xgb__reg_alpha=0.5, xgb__n_estimators=300, xgb__max_depth=3, xgb__learning_rate=0.4, xgb__colsample_bytree=0.5 


[Parallel(n_jobs=10)]: Done   1 out of   1 | elapsed:    0.6s remaining:    0.0s
[Parallel(n_jobs=10)]: Done   1 out of   1 | elapsed:    0.6s remaining:    0.0s
[Parallel(n_jobs=10)]: Done   1 out of   1 | elapsed:    0.7s remaining:    0.0s


[CV]  xgb__subsample=0.9, xgb__reg_lambda=0.1, xgb__reg_alpha=0.5, xgb__n_estimators=300, xgb__max_depth=3, xgb__learning_rate=0.4, xgb__colsample_bytree=0.5, total=   0.6s
[CV] xgb__subsample=0.9, xgb__reg_lambda=0.9, xgb__reg_alpha=0.9, xgb__n_estimators=700, xgb__max_depth=9, xgb__learning_rate=0.4, xgb__colsample_bytree=0.9 
[CV]  xgb__subsample=0.9, xgb__reg_lambda=0.1, xgb__reg_alpha=0.5, xgb__n_estimators=300, xgb__max_depth=3, xgb__learning_rate=0.4, xgb__colsample_bytree=0.5, total=   0.6s
[CV] xgb__subsample=0.9, xgb__reg_lambda=0.9, xgb__reg_alpha=0.9, xgb__n_estimators=700, xgb__max_depth=9, xgb__learning_rate=0.4, xgb__colsample_bytree=0.9 
[CV]  xgb__subsample=0.9, xgb__reg_lambda=0.1, xgb__reg_alpha=0.5, xgb__n_estimators=300, xgb__max_depth=3, xgb__learning_rate=0.4, xgb__colsample_bytree=0.5, total=   0.6s
[CV] xgb__subsample=0.9, xgb__reg_lambda=0.9, xgb__reg_alpha=0.9, xgb__n_estimators=700, xgb__max_depth=9, xgb__learning_rate=0.4, xgb__colsample_bytree=0.9 
[CV]  x

[CV]  xgb__subsample=0.5, xgb__reg_lambda=0.1, xgb__reg_alpha=0.1, xgb__n_estimators=300, xgb__max_depth=5, xgb__learning_rate=0.2, xgb__colsample_bytree=0.9, total=   0.5s
[CV] xgb__subsample=0.5, xgb__reg_lambda=0.1, xgb__reg_alpha=0.1, xgb__n_estimators=300, xgb__max_depth=5, xgb__learning_rate=0.2, xgb__colsample_bytree=0.9 
[CV]  xgb__subsample=0.5, xgb__reg_lambda=0.1, xgb__reg_alpha=0.1, xgb__n_estimators=300, xgb__max_depth=5, xgb__learning_rate=0.2, xgb__colsample_bytree=0.9, total=   0.5s
[CV] xgb__subsample=0.5, xgb__reg_lambda=0.1, xgb__reg_alpha=0.1, xgb__n_estimators=300, xgb__max_depth=5, xgb__learning_rate=0.2, xgb__colsample_bytree=0.9 
[CV]  xgb__subsample=0.9, xgb__reg_lambda=0.9, xgb__reg_alpha=0.7, xgb__n_estimators=500, xgb__max_depth=5, xgb__learning_rate=0.1, xgb__colsample_bytree=0.7, total=   1.0s
[CV] xgb__subsample=0.9, xgb__reg_lambda=0.9, xgb__reg_alpha=0.7, xgb__n_estimators=500, xgb__max_depth=5, xgb__learning_rate=0.1, xgb__colsample_bytree=0.7 
[CV]  x

[CV]  xgb__subsample=0.9, xgb__reg_lambda=0.5, xgb__reg_alpha=0.3, xgb__n_estimators=500, xgb__max_depth=9, xgb__learning_rate=0.05, xgb__colsample_bytree=0.7, total=   1.1s
[CV] xgb__subsample=0.5, xgb__reg_lambda=0.7, xgb__reg_alpha=0.3, xgb__n_estimators=300, xgb__max_depth=7, xgb__learning_rate=0.2, xgb__colsample_bytree=0.7 
[CV]  xgb__subsample=0.5, xgb__reg_lambda=0.7, xgb__reg_alpha=0.3, xgb__n_estimators=300, xgb__max_depth=7, xgb__learning_rate=0.2, xgb__colsample_bytree=0.7, total=   0.6s
[CV] xgb__subsample=0.5, xgb__reg_lambda=0.7, xgb__reg_alpha=0.3, xgb__n_estimators=300, xgb__max_depth=7, xgb__learning_rate=0.2, xgb__colsample_bytree=0.7 
[CV]  xgb__subsample=0.5, xgb__reg_lambda=0.7, xgb__reg_alpha=0.9, xgb__n_estimators=700, xgb__max_depth=5, xgb__learning_rate=0.4, xgb__colsample_bytree=0.7, total=   1.2s
[CV] xgb__subsample=0.5, xgb__reg_lambda=0.3, xgb__reg_alpha=0.3, xgb__n_estimators=500, xgb__max_depth=9, xgb__learning_rate=0.4, xgb__colsample_bytree=0.5 
[CV]  

[CV]  xgb__subsample=0.7, xgb__reg_lambda=0.5, xgb__reg_alpha=0.7, xgb__n_estimators=700, xgb__max_depth=5, xgb__learning_rate=0.4, xgb__colsample_bytree=0.9, total=   1.1s
[CV] xgb__subsample=0.9, xgb__reg_lambda=0.1, xgb__reg_alpha=0.7, xgb__n_estimators=700, xgb__max_depth=5, xgb__learning_rate=0.2, xgb__colsample_bytree=0.5 
[CV]  xgb__subsample=0.7, xgb__reg_lambda=0.5, xgb__reg_alpha=0.7, xgb__n_estimators=700, xgb__max_depth=5, xgb__learning_rate=0.4, xgb__colsample_bytree=0.9, total=   1.1s
[CV] xgb__subsample=0.9, xgb__reg_lambda=0.1, xgb__reg_alpha=0.7, xgb__n_estimators=700, xgb__max_depth=5, xgb__learning_rate=0.2, xgb__colsample_bytree=0.5 
[CV]  xgb__subsample=0.7, xgb__reg_lambda=0.3, xgb__reg_alpha=0.7, xgb__n_estimators=500, xgb__max_depth=9, xgb__learning_rate=0.2, xgb__colsample_bytree=0.5, total=   0.9s
[CV] xgb__subsample=0.7, xgb__reg_lambda=0.3, xgb__reg_alpha=0.7, xgb__n_estimators=500, xgb__max_depth=9, xgb__learning_rate=0.2, xgb__colsample_bytree=0.5 
[CV]  x

[CV]  xgb__subsample=0.7, xgb__reg_lambda=0.9, xgb__reg_alpha=0.5, xgb__n_estimators=500, xgb__max_depth=7, xgb__learning_rate=0.05, xgb__colsample_bytree=0.9, total=   1.1s
[CV] xgb__subsample=0.9, xgb__reg_lambda=0.1, xgb__reg_alpha=0.7, xgb__n_estimators=700, xgb__max_depth=3, xgb__learning_rate=0.4, xgb__colsample_bytree=0.5 
[CV]  xgb__subsample=0.5, xgb__reg_lambda=0.9, xgb__reg_alpha=0.1, xgb__n_estimators=500, xgb__max_depth=7, xgb__learning_rate=0.05, xgb__colsample_bytree=0.5, total=   0.8s
[CV] xgb__subsample=0.5, xgb__reg_lambda=0.9, xgb__reg_alpha=0.1, xgb__n_estimators=500, xgb__max_depth=7, xgb__learning_rate=0.05, xgb__colsample_bytree=0.5 
[CV]  xgb__subsample=0.5, xgb__reg_lambda=0.9, xgb__reg_alpha=0.1, xgb__n_estimators=500, xgb__max_depth=7, xgb__learning_rate=0.05, xgb__colsample_bytree=0.5, total=   0.8s
[CV] xgb__subsample=0.5, xgb__reg_lambda=0.9, xgb__reg_alpha=0.1, xgb__n_estimators=500, xgb__max_depth=7, xgb__learning_rate=0.05, xgb__colsample_bytree=0.5 
[C

[CV]  xgb__subsample=0.5, xgb__reg_lambda=0.3, xgb__reg_alpha=0.9, xgb__n_estimators=500, xgb__max_depth=5, xgb__learning_rate=0.2, xgb__colsample_bytree=0.5, total=   0.8s
[CV] xgb__subsample=0.5, xgb__reg_lambda=0.3, xgb__reg_alpha=0.9, xgb__n_estimators=500, xgb__max_depth=5, xgb__learning_rate=0.2, xgb__colsample_bytree=0.5 
[CV]  xgb__subsample=0.5, xgb__reg_lambda=0.3, xgb__reg_alpha=0.9, xgb__n_estimators=500, xgb__max_depth=5, xgb__learning_rate=0.2, xgb__colsample_bytree=0.5, total=   0.9s
[CV] xgb__subsample=0.9, xgb__reg_lambda=0.1, xgb__reg_alpha=0.7, xgb__n_estimators=500, xgb__max_depth=7, xgb__learning_rate=0.1, xgb__colsample_bytree=0.7 
[CV]  xgb__subsample=0.5, xgb__reg_lambda=0.5, xgb__reg_alpha=0.9, xgb__n_estimators=500, xgb__max_depth=9, xgb__learning_rate=0.2, xgb__colsample_bytree=0.7, total=   1.0s
[CV] xgb__subsample=0.5, xgb__reg_lambda=0.5, xgb__reg_alpha=0.3, xgb__n_estimators=700, xgb__max_depth=7, xgb__learning_rate=0.4, xgb__colsample_bytree=0.7 
[CV]  x

[CV]  xgb__subsample=0.9, xgb__reg_lambda=0.7, xgb__reg_alpha=0.9, xgb__n_estimators=300, xgb__max_depth=3, xgb__learning_rate=0.2, xgb__colsample_bytree=0.5, total=   0.5s
[CV] xgb__subsample=0.9, xgb__reg_lambda=0.7, xgb__reg_alpha=0.9, xgb__n_estimators=300, xgb__max_depth=3, xgb__learning_rate=0.2, xgb__colsample_bytree=0.5 
[CV]  xgb__subsample=0.5, xgb__reg_lambda=0.3, xgb__reg_alpha=0.9, xgb__n_estimators=500, xgb__max_depth=5, xgb__learning_rate=0.2, xgb__colsample_bytree=0.5, total=   0.9s
[CV] xgb__subsample=0.5, xgb__reg_lambda=0.3, xgb__reg_alpha=0.9, xgb__n_estimators=500, xgb__max_depth=5, xgb__learning_rate=0.2, xgb__colsample_bytree=0.5 
[CV]  xgb__subsample=0.9, xgb__reg_lambda=0.7, xgb__reg_alpha=0.9, xgb__n_estimators=300, xgb__max_depth=3, xgb__learning_rate=0.2, xgb__colsample_bytree=0.5, total=   0.4s
[CV] xgb__subsample=0.5, xgb__reg_lambda=0.7, xgb__reg_alpha=0.3, xgb__n_estimators=700, xgb__max_depth=5, xgb__learning_rate=0.05, xgb__colsample_bytree=0.9 
[CV]  

[CV]  xgb__subsample=0.5, xgb__reg_lambda=0.1, xgb__reg_alpha=0.1, xgb__n_estimators=500, xgb__max_depth=5, xgb__learning_rate=0.4, xgb__colsample_bytree=0.9, total=   0.8s
[CV] xgb__subsample=0.9, xgb__reg_lambda=0.3, xgb__reg_alpha=0.1, xgb__n_estimators=500, xgb__max_depth=9, xgb__learning_rate=0.05, xgb__colsample_bytree=0.5 
[CV]  xgb__subsample=0.5, xgb__reg_lambda=0.5, xgb__reg_alpha=0.3, xgb__n_estimators=300, xgb__max_depth=5, xgb__learning_rate=0.2, xgb__colsample_bytree=0.5, total=   0.5s
[CV] xgb__subsample=0.9, xgb__reg_lambda=0.7, xgb__reg_alpha=0.9, xgb__n_estimators=300, xgb__max_depth=3, xgb__learning_rate=0.2, xgb__colsample_bytree=0.5 
[CV]  xgb__subsample=0.5, xgb__reg_lambda=0.1, xgb__reg_alpha=0.1, xgb__n_estimators=500, xgb__max_depth=5, xgb__learning_rate=0.4, xgb__colsample_bytree=0.9, total=   0.7s
[CV] xgb__subsample=0.9, xgb__reg_lambda=0.3, xgb__reg_alpha=0.1, xgb__n_estimators=500, xgb__max_depth=9, xgb__learning_rate=0.05, xgb__colsample_bytree=0.5 
[CV] 

[CV]  xgb__subsample=0.5, xgb__reg_lambda=0.7, xgb__reg_alpha=0.9, xgb__n_estimators=300, xgb__max_depth=7, xgb__learning_rate=0.4, xgb__colsample_bytree=0.5, total=   0.4s
[CV] xgb__subsample=0.9, xgb__reg_lambda=0.3, xgb__reg_alpha=0.9, xgb__n_estimators=300, xgb__max_depth=7, xgb__learning_rate=0.1, xgb__colsample_bytree=0.9 
[CV]  xgb__subsample=0.9, xgb__reg_lambda=0.3, xgb__reg_alpha=0.9, xgb__n_estimators=300, xgb__max_depth=7, xgb__learning_rate=0.1, xgb__colsample_bytree=0.9, total=   0.5s
[CV] xgb__subsample=0.9, xgb__reg_lambda=0.3, xgb__reg_alpha=0.9, xgb__n_estimators=300, xgb__max_depth=7, xgb__learning_rate=0.1, xgb__colsample_bytree=0.9 
[CV]  xgb__subsample=0.9, xgb__reg_lambda=0.3, xgb__reg_alpha=0.9, xgb__n_estimators=300, xgb__max_depth=7, xgb__learning_rate=0.1, xgb__colsample_bytree=0.9, total=   0.5s
[CV] xgb__subsample=0.9, xgb__reg_lambda=0.3, xgb__reg_alpha=0.9, xgb__n_estimators=300, xgb__max_depth=7, xgb__learning_rate=0.1, xgb__colsample_bytree=0.9 
[CV]  x

[CV]  xgb__subsample=0.7, xgb__reg_lambda=0.7, xgb__reg_alpha=0.1, xgb__n_estimators=300, xgb__max_depth=5, xgb__learning_rate=0.1, xgb__colsample_bytree=0.7, total=   0.5s
[CV] xgb__subsample=0.5, xgb__reg_lambda=0.5, xgb__reg_alpha=0.5, xgb__n_estimators=700, xgb__max_depth=7, xgb__learning_rate=0.1, xgb__colsample_bytree=0.9 
[CV]  xgb__subsample=0.7, xgb__reg_lambda=0.7, xgb__reg_alpha=0.1, xgb__n_estimators=300, xgb__max_depth=5, xgb__learning_rate=0.1, xgb__colsample_bytree=0.7, total=   0.5s
[CV] xgb__subsample=0.5, xgb__reg_lambda=0.5, xgb__reg_alpha=0.5, xgb__n_estimators=700, xgb__max_depth=7, xgb__learning_rate=0.1, xgb__colsample_bytree=0.9 
[CV]  xgb__subsample=0.9, xgb__reg_lambda=0.1, xgb__reg_alpha=0.7, xgb__n_estimators=500, xgb__max_depth=9, xgb__learning_rate=0.05, xgb__colsample_bytree=0.5, total=   0.9s
[CV] xgb__subsample=0.9, xgb__reg_lambda=0.9, xgb__reg_alpha=0.7, xgb__n_estimators=700, xgb__max_depth=7, xgb__learning_rate=0.05, xgb__colsample_bytree=0.5 
[CV] 

[CV]  xgb__subsample=0.5, xgb__reg_lambda=0.9, xgb__reg_alpha=0.5, xgb__n_estimators=300, xgb__max_depth=7, xgb__learning_rate=0.2, xgb__colsample_bytree=0.5, total=   0.4s
[CV] xgb__subsample=0.5, xgb__reg_lambda=0.3, xgb__reg_alpha=0.3, xgb__n_estimators=300, xgb__max_depth=5, xgb__learning_rate=0.1, xgb__colsample_bytree=0.9 
[CV]  xgb__subsample=0.5, xgb__reg_lambda=0.3, xgb__reg_alpha=0.3, xgb__n_estimators=300, xgb__max_depth=5, xgb__learning_rate=0.1, xgb__colsample_bytree=0.9, total=   0.5s
[CV] xgb__subsample=0.5, xgb__reg_lambda=0.3, xgb__reg_alpha=0.3, xgb__n_estimators=300, xgb__max_depth=5, xgb__learning_rate=0.1, xgb__colsample_bytree=0.9 
[CV]  xgb__subsample=0.7, xgb__reg_lambda=0.1, xgb__reg_alpha=0.5, xgb__n_estimators=300, xgb__max_depth=5, xgb__learning_rate=0.05, xgb__colsample_bytree=0.9, total=   0.6s
[CV] xgb__subsample=0.7, xgb__reg_lambda=0.5, xgb__reg_alpha=0.5, xgb__n_estimators=700, xgb__max_depth=7, xgb__learning_rate=0.4, xgb__colsample_bytree=0.5 
[CV]  

[CV]  xgb__subsample=0.5, xgb__reg_lambda=0.3, xgb__reg_alpha=0.3, xgb__n_estimators=300, xgb__max_depth=7, xgb__learning_rate=0.4, xgb__colsample_bytree=0.9, total=   0.4s
[CV] xgb__subsample=0.5, xgb__reg_lambda=0.3, xgb__reg_alpha=0.3, xgb__n_estimators=300, xgb__max_depth=7, xgb__learning_rate=0.4, xgb__colsample_bytree=0.9 
[CV]  xgb__subsample=0.5, xgb__reg_lambda=0.3, xgb__reg_alpha=0.3, xgb__n_estimators=300, xgb__max_depth=7, xgb__learning_rate=0.4, xgb__colsample_bytree=0.9, total=   0.4s
[CV] xgb__subsample=0.5, xgb__reg_lambda=0.3, xgb__reg_alpha=0.3, xgb__n_estimators=300, xgb__max_depth=7, xgb__learning_rate=0.4, xgb__colsample_bytree=0.9 
[CV]  xgb__subsample=0.5, xgb__reg_lambda=0.3, xgb__reg_alpha=0.3, xgb__n_estimators=300, xgb__max_depth=7, xgb__learning_rate=0.4, xgb__colsample_bytree=0.9, total=   0.5s


[Parallel(n_jobs=10)]: Done 100 out of 100 | elapsed:  1.5min finished


[CV]  xgb__subsample=0.5, xgb__reg_lambda=0.3, xgb__reg_alpha=0.3, xgb__n_estimators=300, xgb__max_depth=7, xgb__learning_rate=0.4, xgb__colsample_bytree=0.9, total=   0.4s


[Parallel(n_jobs=10)]: Done 100 out of 100 | elapsed:  1.5min finished


[CV]  xgb__subsample=0.5, xgb__reg_lambda=0.5, xgb__reg_alpha=0.5, xgb__n_estimators=700, xgb__max_depth=7, xgb__learning_rate=0.1, xgb__colsample_bytree=0.9, total=   1.4s
[CV] xgb__subsample=0.5, xgb__reg_lambda=0.9, xgb__reg_alpha=0.1, xgb__n_estimators=700, xgb__max_depth=5, xgb__learning_rate=0.4, xgb__colsample_bytree=0.7 
[CV]  xgb__subsample=0.5, xgb__reg_lambda=0.9, xgb__reg_alpha=0.1, xgb__n_estimators=700, xgb__max_depth=5, xgb__learning_rate=0.4, xgb__colsample_bytree=0.7, total=   1.0s
[CV] xgb__subsample=0.5, xgb__reg_lambda=0.9, xgb__reg_alpha=0.1, xgb__n_estimators=700, xgb__max_depth=5, xgb__learning_rate=0.4, xgb__colsample_bytree=0.7 
[CV]  xgb__subsample=0.5, xgb__reg_lambda=0.9, xgb__reg_alpha=0.1, xgb__n_estimators=700, xgb__max_depth=5, xgb__learning_rate=0.4, xgb__colsample_bytree=0.7, total=   1.0s
[CV] xgb__subsample=0.5, xgb__reg_lambda=0.7, xgb__reg_alpha=0.5, xgb__n_estimators=300, xgb__max_depth=9, xgb__learning_rate=0.05, xgb__colsample_bytree=0.5 
[CV]  

[Parallel(n_jobs=10)]: Done 100 out of 100 | elapsed:  1.7min finished
[Parallel(n_jobs=10)]: Done 1200 out of 1200 | elapsed:  2.5min finished
[Parallel(n_jobs=10)]: Done 1200 out of 1200 | elapsed:  2.6min finished
[Parallel(n_jobs=10)]: Done 1200 out of 1200 | elapsed:  2.8min finished


In [18]:
print('Stacking Random Forest and XGB %.3f ' % accuracy_score(dataProc.y_test, y_test_stacking_pred))

NameError: name 'y_test_stacking_pred' is not defined

In [25]:
testDataProc = DataProcessing(testDataFile, dataTypes, sep)
testDataProc.ReadFile()
testDataProc.PopulateFeatureColumns(catCols + numCols)

In [27]:
testDataProc.AllData.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,418.0,418.0,332.0,418.0,418.0,417.0
mean,1100.5,2.26555,30.27259,0.447368,0.392344,35.627188
std,120.810458,0.841838,14.181209,0.89676,0.981429,55.907576
min,892.0,1.0,0.17,0.0,0.0,0.0
25%,996.25,1.0,21.0,0.0,0.0,7.8958
50%,1100.5,3.0,27.0,0.0,0.0,14.4542
75%,1204.75,3.0,39.0,1.0,0.0,31.5
max,1309.0,3.0,76.0,8.0,9.0,512.3292


In [28]:
testDataProc.FillNullCatColWithNA(catCols)
testDataProc.FillNullNumColWithMean(numCols)

In [29]:
testDataProc.AllData.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,418.0,418.0,418.0,418.0,418.0,418.0
mean,1100.5,2.26555,30.27259,0.447368,0.392344,35.627188
std,120.810458,0.841838,12.634534,0.89676,0.981429,55.8405
min,892.0,1.0,0.17,0.0,0.0,0.0
25%,996.25,1.0,23.0,0.0,0.0,7.8958
50%,1100.5,3.0,30.27259,0.0,0.0,14.4542
75%,1204.75,3.0,35.75,1.0,0.0,31.5
max,1309.0,3.0,76.0,8.0,9.0,512.3292


In [None]:
testDataProc.AllData

In [35]:
testDataProc.AllData['Survived'] = rfcv.predict(testDataProc.AllData)

In [36]:
testDataProc.AllData[['PassengerId', 'Survived']].to_csv('Output/submit.txt', index=False)