# Packages and Data Imports

In [10]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

#Stats and other tools
from sklearn import metrics
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, cross_val_score, RandomizedSearchCV
from sklearn.metrics import classification_report,confusion_matrix, precision_score, recall_score, accuracy_score, f1_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
import scipy.stats as stats
from scipy.stats import uniform
from scipy.stats import randint as sp_randint
import time
import logging
logging.getLogger().setLevel(logging.INFO)

#Models we will test and try
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
import tensorflow as tf
import tensorflow.contrib.learn as learn

#Tensorflow abstractions (in case we decide on a proper DNN implementation)
from tensorflow import estimator
#from keras import layers
#from tensorflow.contrib.layers import fully_connected

#from keras import models
from keras.wrappers.scikit_learn import KerasClassifier
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras import regularizers
#from hyperas.distributions import choice, uniform, conditional
#from hyperas import optim
#from hyperopt import Trials, STATUS_OK, tpe


cv_k_global = 11  # number of folds (k) used by every cross-validation run in this notebook

# Raw training table, indexed by passenger id.
train_full = pd.read_csv('train.csv').set_index('PassengerId')

# Split the column names by dtype: anything that is not `object` is treated as quantitative.
# PassengerId is already the index, so only the target label 'Survived' has to be removed.
quantitative = [name for name, kind in train_full.dtypes.items() if kind != 'object']
quantitative.remove('Survived')
qualitative = [name for name, kind in train_full.dtypes.items() if kind == 'object']

# Cross-validation and hyper-parameter searches

Loading the data sets

In [15]:
# Which family of pre-processed training files to load: 'old', 'new' or 'deck'.
dataset='deck'

# Test sets: one file per data type, with the Missing_Embark / Missing_Deck columns dropped.
# test_3 used to re-read data_test_2.csv, which made it a silent duplicate of test_2.
test_1 = pd.read_csv('data_test_1.csv',index_col='PassengerId').drop(['Missing_Embark','Missing_Deck'],axis=1)
test_2 = pd.read_csv('data_test_2.csv',index_col='PassengerId').drop(['Missing_Embark','Missing_Deck'],axis=1)
test_3 = pd.read_csv('data_test_3.csv',index_col='PassengerId').drop(['Missing_Embark','Missing_Deck'],axis=1)

# Training sets: file-name pattern and columns to drop for each supported `dataset` value.
train_sources = {
    'old':  ('data_train_{}.csv',           ['Missing_Embark','Missing_Deck']),
    'new':  ('data_train_{}_new.csv',       ['Missing_Embark','Missing_Deck']),
    'deck': ('data_train_deck_pred_{}.csv', ['Missing_Embark','T_deck']),
}
if dataset not in train_sources:
    # Fail here with a clear message instead of a NameError on data_1 in a later cell.
    raise ValueError("dataset must be one of %s, got %r" % (sorted(train_sources), dataset))

train_pattern, train_drop_cols = train_sources[dataset]
data_1, data_2, data_3 = [
    pd.read_csv(train_pattern.format(n),index_col='PassengerId').drop(train_drop_cols,axis=1)
    for n in (1,2,3)]

## Logistic Regression

### Data Type 1

In [3]:
# Grid-search logistic-regression hyper-parameters on data_1 and display the ten best candidates.
# `.values` replaces `DataFrame.as_matrix()`, which was deprecated in pandas 0.23 and removed in 1.0.
X=data_1.drop('Survived',axis=1).values
Y=data_1['Survived'].values
logmodel = LogisticRegression(n_jobs=-1)

# Two sub-grids: the l1 penalty is tried with liblinear/saga, the l2 penalty with newton-cg/lbfgs/sag.
param_grid = [{
    'C': [10**x for x in range(-3,4)],
    'penalty':['l1'],
    'solver' :['liblinear','saga']},{
    'C': [10**x for x in range(-3,4)],
    'penalty':['l2'],
    'solver':['newton-cg','lbfgs','sag']}]
# return_train_score is requested explicitly: newer scikit-learn releases default it to False,
# which would remove the train-score columns (mean_train_score) from cv_results_.
gscv=GridSearchCV(logmodel,param_grid,scoring='accuracy',cv=cv_k_global,n_jobs=-1,return_train_score=True)
gscv.fit(X,Y)
results_df=pd.DataFrame(gscv.cv_results_)

# Alternative kept for reference: randomized search over a continuous range of C.
# param_distributions ={'C':uniform(0.001,1000),
#                       'penalty':['l2'],
#                       'solver':['newton-cg','lbfgs','sag']
#                      }
# rscv=RandomizedSearchCV(estimator=logmodel,
#                   param_distributions=param_distributions,
#                   scoring='accuracy',
#                   cv=cv_k_global,
#                   verbose=1,
#                   n_jobs=-1,
#                   n_iter=1000)
# rscv.fit(X,Y)
# results_df=pd.DataFrame(rscv.cv_results_)

# Persist the full CV table; the file name depends on the active `dataset`.
if(dataset=='new'):
    results_df.to_csv('./CV_LR_new_data1.csv')
if(dataset=='old'):
    results_df.to_csv('./CV_LR_data1.csv')
if(dataset=='deck'):
    results_df.to_csv('./CV_LR_deck_data1.csv')

# 'error' is half of the across-fold standard deviation of the test score.
results_df['error']=results_df['std_test_score']/2
# Hide per-split scores and timing/bookkeeping columns. The split columns are discovered from
# the frame itself, so this keeps working when cv_k_global is changed from 11.
hidden_cols = [c for c in results_df.columns
               if c.startswith('split')
               or c in ('mean_score_time','params','std_fit_time','std_score_time','std_train_score')]
results_df.sort_values('rank_test_score',axis=0).drop(hidden_cols,axis=1).head(10)

Unnamed: 0,mean_fit_time,mean_test_score,mean_train_score,param_C,param_penalty,param_solver,rank_test_score,std_test_score,error
34,0.104525,0.803591,0.811111,1000,l2,sag,1,0.034551,0.017276
25,0.104566,0.803591,0.814254,1,l2,sag,1,0.032995,0.016497
33,0.023015,0.803591,0.811561,1000,l2,lbfgs,1,0.034551,0.017276
29,0.020502,0.803591,0.811448,100,l2,newton-cg,1,0.034551,0.017276
13,0.106931,0.803591,0.810887,1000,l1,saga,1,0.034551,0.017276
23,0.011643,0.803591,0.814254,1,l2,newton-cg,1,0.032995,0.016497
11,0.106557,0.803591,0.811111,100,l1,saga,1,0.034551,0.017276
10,0.083559,0.803591,0.811448,100,l1,liblinear,1,0.034551,0.017276
12,0.150345,0.803591,0.811336,1000,l1,liblinear,1,0.034551,0.017276
31,0.104402,0.803591,0.811111,100,l2,sag,1,0.034551,0.017276


### Data Type 2

In [4]:
# Grid-search logistic-regression hyper-parameters on data_2 and display the ten best candidates.
# `.values` replaces `DataFrame.as_matrix()`, which was deprecated in pandas 0.23 and removed in 1.0.
X=data_2.drop('Survived',axis=1).values
Y=data_2['Survived'].values
logmodel = LogisticRegression(n_jobs=-1)

# Two sub-grids: the l1 penalty is tried with liblinear/saga, the l2 penalty with newton-cg/lbfgs/sag.
param_grid = [{
    'C': [10**x for x in range(-3,4)],
    'penalty':['l1'],
    'solver' :['liblinear','saga']},{
    'C': [10**x for x in range(-3,4)],
    'penalty':['l2'],
    'solver':['newton-cg','lbfgs','sag']}]
# return_train_score is requested explicitly: newer scikit-learn releases default it to False,
# which would remove the train-score columns (mean_train_score) from cv_results_.
gscv=GridSearchCV(logmodel,param_grid,scoring='accuracy',cv=cv_k_global,n_jobs=-1,return_train_score=True)
gscv.fit(X,Y)
results_df=pd.DataFrame(gscv.cv_results_)

# Alternative kept for reference: randomized search over a continuous range of C.
# param_distributions ={'C':uniform(0.001,1000),
#                       'penalty':['l2'],
#                       'solver':['newton-cg','lbfgs','sag']
#                      }
# rscv=RandomizedSearchCV(estimator=logmodel,
#                   param_distributions=param_distributions,
#                   scoring='accuracy',
#                   cv=cv_k_global,
#                   verbose=1,
#                   n_jobs=-1,
#                   n_iter=1000)
# rscv.fit(X,Y)
# results_df=pd.DataFrame(rscv.cv_results_)

# Persist the full CV table; the file name depends on the active `dataset`.
if(dataset=='new'):
    results_df.to_csv('./CV_LR_new_data2.csv')
if(dataset=='old'):
    results_df.to_csv('./CV_LR_data2.csv')
if(dataset=='deck'):
    results_df.to_csv('./CV_LR_deck_data2.csv')

# 'error' is half of the across-fold standard deviation of the test score.
results_df['error']=results_df['std_test_score']/2
# Hide per-split scores and timing/bookkeeping columns. The split columns are discovered from
# the frame itself, so this keeps working when cv_k_global is changed from 11.
hidden_cols = [c for c in results_df.columns
               if c.startswith('split')
               or c in ('mean_score_time','params','std_fit_time','std_score_time','std_train_score')]
results_df.sort_values('rank_test_score',axis=0).drop(hidden_cols,axis=1).head(10)

Unnamed: 0,mean_fit_time,mean_test_score,mean_train_score,param_C,param_penalty,param_solver,rank_test_score,std_test_score,error
24,0.024789,0.803591,0.81504,1.0,l2,lbfgs,1,0.038271,0.019135
22,0.108634,0.803591,0.810664,0.1,l2,sag,1,0.034915,0.017458
21,0.012597,0.803591,0.810664,0.1,l2,lbfgs,1,0.034915,0.017458
20,0.010962,0.803591,0.810664,0.1,l2,newton-cg,1,0.034915,0.017458
25,0.105233,0.803591,0.815152,1.0,l2,sag,1,0.038271,0.019135
6,0.041556,0.803591,0.814815,1.0,l1,liblinear,1,0.042252,0.021126
23,0.01776,0.803591,0.81504,1.0,l2,newton-cg,1,0.038271,0.019135
33,0.026223,0.802469,0.814815,1000.0,l2,lbfgs,8,0.036025,0.018012
29,0.02356,0.802469,0.815601,100.0,l2,newton-cg,8,0.040452,0.020226
12,0.137919,0.802469,0.815489,1000.0,l1,liblinear,8,0.036025,0.018012


### Data Type 3

In [5]:
# Grid-search logistic-regression hyper-parameters on data_3 and display the ten best candidates.
# `.values` replaces `DataFrame.as_matrix()`, which was deprecated in pandas 0.23 and removed in 1.0.
X=data_3.drop('Survived',axis=1).values
Y=data_3['Survived'].values
logmodel = LogisticRegression(n_jobs=-1)

# Two sub-grids: the l1 penalty is tried with liblinear/saga, the l2 penalty with newton-cg/lbfgs/sag.
param_grid = [{
    'C': [10**x for x in range(-3,4)],
    'penalty':['l1'],
    'solver' :['liblinear','saga']},{
    'C': [10**x for x in range(-3,4)],
    'penalty':['l2'],
    'solver':['newton-cg','lbfgs','sag']}]
# return_train_score is requested explicitly: newer scikit-learn releases default it to False,
# which would remove the train-score columns (mean_train_score) from cv_results_.
gscv=GridSearchCV(logmodel,param_grid,scoring='accuracy',cv=cv_k_global,n_jobs=-1,verbose=1,return_train_score=True)
gscv.fit(X,Y)
results_df=pd.DataFrame(gscv.cv_results_)


# Alternative kept for reference: randomized search over the same discrete C values.
# param_distributions ={'C':[10**x for x in range(-3,4)],
#      'penalty':['l2'],
#      'solver':['newton-cg','lbfgs','sag']}

# rscv=RandomizedSearchCV(estimator=logmodel,
#                   param_distributions=param_distributions,
#                   scoring='accuracy',
#                   cv=cv_k_global,
#                   verbose=1,
#                   n_jobs=-1,
#                   n_iter=21)
# rscv.fit(X,Y)
# results_df=pd.DataFrame(rscv.cv_results_)

# Persist the full CV table; the file name depends on the active `dataset`.
if(dataset=='new'):
    results_df.to_csv('./CV_LR_new_data3.csv')
if(dataset=='old'):
    results_df.to_csv('./CV_LR_data3.csv')
if(dataset=='deck'):
    results_df.to_csv('./CV_LR_deck_data3.csv')

# 'error' is half of the across-fold standard deviation of the test score.
results_df['error']=results_df['std_test_score']/2
# Hide per-split scores and timing/bookkeeping columns. The split columns are discovered from
# the frame itself, so this keeps working when cv_k_global is changed from 11.
hidden_cols = [c for c in results_df.columns
               if c.startswith('split')
               or c in ('mean_score_time','params','std_fit_time','std_score_time','std_train_score')]
results_df.sort_values('rank_test_score',axis=0).drop(hidden_cols,axis=1).head(10)

Fitting 11 folds for each of 35 candidates, totalling 385 fits


[Parallel(n_jobs=-1)]: Done 385 out of 385 | elapsed:    5.4s finished


Unnamed: 0,mean_fit_time,mean_test_score,mean_train_score,param_C,param_penalty,param_solver,rank_test_score,std_test_score,error
24,0.021187,0.829405,0.836139,1.0,l2,lbfgs,1,0.024098,0.012049
25,0.103973,0.829405,0.836139,1.0,l2,sag,1,0.024098,0.012049
6,0.031642,0.829405,0.836251,1.0,l1,liblinear,1,0.022548,0.011274
23,0.018178,0.829405,0.836139,1.0,l2,newton-cg,1,0.024098,0.012049
7,0.104542,0.828283,0.835017,1.0,l1,saga,5,0.023532,0.011766
9,0.123022,0.82716,0.839619,10.0,l1,saga,6,0.029936,0.014968
11,0.10433,0.823793,0.838946,100.0,l1,saga,7,0.030437,0.015218
28,0.104275,0.822671,0.838945,10.0,l2,sag,8,0.026116,0.013058
13,0.102849,0.822671,0.838946,1000.0,l1,saga,8,0.031415,0.015707
22,0.102849,0.821549,0.82806,0.1,l2,sag,10,0.0271,0.01355


In [6]:
# X=data_3.drop('Survived',axis=1).as_matrix()
# Y=data_3['Survived'].as_matrix()
# logmodel = LogisticRegression(n_jobs=-1)
# param_grid = [{
#     #'C': [2,4,6,8,10,12,14],
#     #'penalty':['l1'],
#     #'solver' :['liblinear','saga']},{
#     'C': range(1,16),
#     'penalty':['l2'],
#     'solver':['newton-cg','lbfgs','sag']}]
# gscv=GridSearchCV(logmodel,param_grid,scoring='accuracy',cv=cv_k_global,n_jobs=-1)
# gscv.fit(X,Y)
# results_df=pd.DataFrame(gscv.cv_results_)
# results_df.to_csv('./CV_LR_new_data3_further.csv')
# results_df.sort_values('rank_test_score',axis=0).head(10)

## LR with polynomial features

### Data Type 1

In [None]:
# Grid-search logistic regression on degree-4 polynomial features of data_1
# and display the ten best candidates.
# `.values` replaces `DataFrame.as_matrix()`, which was deprecated in pandas 0.23 and removed in 1.0.
X=data_1.drop('Survived',axis=1).values
Y=data_1['Survived'].values

# Expand the raw features into all polynomial terms up to degree 4.
poly = PolynomialFeatures(4)
X_poly = poly.fit_transform(X)

logmodel = LogisticRegression(n_jobs=-1)

# Two sub-grids: the l1 penalty is tried with liblinear/saga, the l2 penalty with newton-cg/lbfgs/sag.
param_grid = [{
    'C': [10**x for x in range(-3,4)],
    'penalty':['l1'],
    'solver' :['liblinear','saga']},{
    'C': [10**x for x in range(-3,4)],
    'penalty':['l2'],
    'solver':['newton-cg','lbfgs','sag']}]
# return_train_score is requested explicitly: newer scikit-learn releases default it to False,
# which would remove the train-score columns (mean_train_score) from cv_results_.
gscv=GridSearchCV(logmodel,param_grid,scoring='accuracy',cv=cv_k_global,verbose=1,n_jobs=-1,return_train_score=True)
gscv.fit(X_poly,Y)
results_df=pd.DataFrame(gscv.cv_results_)

# Alternative kept for reference: randomized search over a continuous range of C.
# param_distributions ={'C':uniform(0.001,1000),
#                       #'penalty':['l2'],
#                       'solver':['lbfgs','sag']#'newton-cg',
#                      }
# rscv=RandomizedSearchCV(estimator=logmodel,
#                   param_distributions=param_distributions,
#                   scoring='accuracy',
#                   cv=cv_k_global,
#                   verbose=1,
#                   n_jobs=-1,
#                   n_iter=1000)
# rscv.fit(X,Y)
# results_df=pd.DataFrame(rscv.cv_results_)

# Persist the full CV table; the file name depends on the active `dataset`.
if(dataset=='new'):
    results_df.to_csv('./CV_LR_X_new_data1.csv')
if(dataset=='old'):
    results_df.to_csv('./CV_LR_X_data1.csv')
if(dataset=='deck'):
    results_df.to_csv('./CV_LR_X_deck_data1.csv')


# 'error' is half of the across-fold standard deviation of the test score.
results_df['error']=results_df['std_test_score']/2
# Hide per-split scores and timing/bookkeeping columns. The split columns are discovered from
# the frame itself, so this keeps working when cv_k_global is changed from 11.
hidden_cols = [c for c in results_df.columns
               if c.startswith('split')
               or c in ('mean_score_time','params','std_fit_time','std_score_time','std_train_score')]
results_df.sort_values('rank_test_score',axis=0).drop(hidden_cols,axis=1).head(10)

### Data Type 2

In [None]:
# Randomized search for logistic regression on degree-4 polynomial features of data_2;
# displays the ten best candidates.
# `.values` replaces `DataFrame.as_matrix()`, which was deprecated in pandas 0.23 and removed in 1.0.
X=data_2.drop('Survived',axis=1).values
Y=data_2['Survived'].values

# Expand the raw features into all polynomial terms up to degree 4.
poly = PolynomialFeatures(4)
X_poly = poly.fit_transform(X)

logmodel = LogisticRegression(n_jobs=-1)

# Alternative kept for reference: exhaustive grid search on the polynomial features.
# param_grid = [{
#     'C': [0.01,0.1,1,10,100],
#     'penalty':['l1'],
#     'solver' :['liblinear','saga']},{
#     'C': [0.01,0.1,1,10,100],
#     'penalty':['l2'],
#     'solver':['lbfgs','sag']}]#'newton-cg'
# gscv=GridSearchCV(logmodel,param_grid,scoring='accuracy',cv=cv_k_global,verbose=1,n_jobs=-1)
# gscv.fit(X_poly,Y)
# results_df=pd.DataFrame(gscv.cv_results_)

# scipy's uniform(loc, scale) samples from [loc, loc + scale], i.e. C in [0.001, 1.001] here.
param_distributions ={'C':uniform(0.001,1),
                      #'penalty':['l2'],
                      'solver':['lbfgs','sag']#'newton-cg',
                     }
# return_train_score is requested explicitly: newer scikit-learn releases default it to False,
# which would remove the train-score columns (mean_train_score) from cv_results_.
rscv=RandomizedSearchCV(estimator=logmodel,
                  param_distributions=param_distributions,
                  scoring='accuracy',
                  cv=cv_k_global,
                  verbose=1,
                  n_jobs=-1,
                  n_iter=1000,
                  return_train_score=True)
# Fit on the polynomial expansion. The search used to be fitted on the raw X, which left
# X_poly unused and made this cell a plain logistic-regression search.
rscv.fit(X_poly,Y)
results_df=pd.DataFrame(rscv.cv_results_)

# if(dataset=='new'):
#     results_df.to_csv('./CV_LR_X_new_data2.csv')
# if(dataset=='old'):
#     results_df.to_csv('./CV_LR_X_data2.csv')

# 'error' is half of the across-fold standard deviation of the test score.
results_df['error']=results_df['std_test_score']/2
# Hide per-split scores and timing/bookkeeping columns. The split columns are discovered from
# the frame itself, so this keeps working when cv_k_global is changed from 11.
hidden_cols = [c for c in results_df.columns
               if c.startswith('split')
               or c in ('mean_score_time','params','std_fit_time','std_score_time','std_train_score')]
results_df.sort_values('rank_test_score',axis=0).drop(hidden_cols,axis=1).head(10)

### Data Type 3

In [None]:
# Randomized search for logistic regression on degree-4 polynomial features of data_3;
# displays the ten best candidates.
# `.values` replaces `DataFrame.as_matrix()`, which was deprecated in pandas 0.23 and removed in 1.0.
X=data_3.drop('Survived',axis=1).values
Y=data_3['Survived'].values

# Expand the raw features into all polynomial terms up to degree 4.
poly = PolynomialFeatures(4)
X_poly = poly.fit_transform(X)

logmodel = LogisticRegression(n_jobs=-1)
# Alternative kept for reference: exhaustive grid search on the polynomial features.
# param_grid = [{
#     'C': [0.1,1,10],
#     'penalty':['l1'],
#     'solver' :['liblinear','saga']},{
#     'C': [0.1,1,10],
#     'penalty':['l2'],
#     'solver':['newton-cg','lbfgs','sag']}]
# gscv=GridSearchCV(logmodel,param_grid,scoring='accuracy',cv=cv_k_global,verbose=1,n_jobs=-1)
# gscv.fit(X_poly,Y)
# results_df=pd.DataFrame(gscv.cv_results_)

# scipy's uniform(loc, scale) samples from [loc, loc + scale], i.e. C in [0.001, 1000.001] here.
param_distributions ={'C':uniform(0.001,1000),
                      #'penalty':['l2'],
                      'solver':['newton-cg','lbfgs','sag']
                     }

# return_train_score is requested explicitly: newer scikit-learn releases default it to False,
# which would remove the train-score columns (mean_train_score) from cv_results_.
rscv=RandomizedSearchCV(estimator=logmodel,
                  param_distributions=param_distributions,
                  scoring='accuracy',
                  cv=cv_k_global,
                  verbose=1,
                  n_jobs=-1,
                  n_iter=1000,
                  return_train_score=True)
# Fit on the polynomial expansion. The search used to be fitted on the raw X, which left
# X_poly unused and made this cell a plain logistic-regression search.
rscv.fit(X_poly,Y)
results_df=pd.DataFrame(rscv.cv_results_)

# if(dataset=='new'):
#     results_df.to_csv('./CV_LR_X_new_data3.csv')
# if(dataset=='old'):
#     results_df.to_csv('./CV_LR_X_data3.csv')

# 'error' is half of the across-fold standard deviation of the test score.
results_df['error']=results_df['std_test_score']/2
# Hide per-split scores and timing/bookkeeping columns. The split columns are discovered from
# the frame itself, so this keeps working when cv_k_global is changed from 11.
hidden_cols = [c for c in results_df.columns
               if c.startswith('split')
               or c in ('mean_score_time','params','std_fit_time','std_score_time','std_train_score')]
results_df.sort_values('rank_test_score',axis=0).drop(hidden_cols,axis=1).head(10)

## AdaBC

### Data Type 1

In [16]:
# Grid-search AdaBoost hyper-parameters on data_1 and display the ten best candidates.
# `.values` replaces `DataFrame.as_matrix()`, which was deprecated in pandas 0.23 and removed in 1.0.
X=data_1.drop('Survived',axis=1).values
Y=data_1['Survived'].values

classif_ABC = AdaBoostClassifier()

# n_estimators: 10..200 in steps of 10; learning_rate: powers of ten from 1e-4 to 1e4.
param_grid ={'n_estimators':[10*x for x in range(1,21)],
             'learning_rate':[10**x for x in range(-4,5)],
             'algorithm':['SAMME','SAMME.R']}
# return_train_score is requested explicitly: newer scikit-learn releases default it to False,
# which would remove the train-score columns (mean_train_score) from cv_results_.
gscv=GridSearchCV(classif_ABC,param_grid,scoring='accuracy',cv=cv_k_global,n_jobs=-1,verbose=1,return_train_score=True)
gscv.fit(X,Y)
results_df=pd.DataFrame(gscv.cv_results_)

# Persist the full CV table; the file name depends on the active `dataset`.
if(dataset=='new'):
    results_df.to_csv('./CV_ABC_new_data1.csv')
if(dataset=='old'):
    results_df.to_csv('./CV_ABC_data1.csv')
if(dataset=='deck'):
    results_df.to_csv('./CV_ABC_deck_data1.csv')

# 'error' is half of the across-fold standard deviation of the test score.
results_df['error']=results_df['std_test_score']/2
# Hide per-split scores and timing/bookkeeping columns. The split columns are discovered from
# the frame itself, so this keeps working when cv_k_global is changed from 11.
hidden_cols = [c for c in results_df.columns
               if c.startswith('split')
               or c in ('mean_score_time','params','std_fit_time','std_score_time','std_train_score')]
results_df.sort_values('rank_test_score',axis=0).drop(hidden_cols,axis=1).head(10)

Fitting 11 folds for each of 360 candidates, totalling 3960 fits


[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 863 tasks      | elapsed:   13.6s
[Parallel(n_jobs=-1)]: Done 1113 tasks      | elapsed:   17.2s
[Parallel(n_jobs=-1)]: Done 1463 tasks      | elapsed:   22.1s
[Parallel(n_jobs=-1)]: Done 2241 tasks      | elapsed:   34.5s
[Parallel(n_jobs=-1)]: Done 3341 tasks      | elapsed:   53.9s
[Parallel(n_jobs=-1)]: Done 3929 out of 3960 | elapsed:  1.1min remaining:    0.5s
[Parallel(n_jobs=-1)]: Done 3960 out of 3960 | elapsed:  1.1min finished


Unnamed: 0,mean_fit_time,mean_test_score,mean_train_score,param_algorithm,param_learning_rate,param_n_estimators,rank_test_score,std_test_score,error
94,0.322884,0.824916,0.833446,SAMME,1,150,1,0.025163,0.012581
96,0.385134,0.823793,0.833333,SAMME,1,170,2,0.027733,0.013867
91,0.231216,0.822671,0.831425,SAMME,1,120,3,0.027507,0.013753
93,0.28596,0.822671,0.832211,SAMME,1,140,3,0.024861,0.01243
89,0.197664,0.821549,0.830415,SAMME,1,100,5,0.03042,0.01521
95,0.338949,0.821549,0.833109,SAMME,1,160,5,0.027368,0.013684
263,0.090426,0.821549,0.83771,SAMME.R,1,40,5,0.033649,0.016825
90,0.219,0.820426,0.830415,SAMME,1,110,8,0.029022,0.014511
92,0.278898,0.820426,0.83165,SAMME,1,130,8,0.024911,0.012456
88,0.184079,0.819304,0.828732,SAMME,1,90,10,0.028978,0.014489


### Data Type 2

In [17]:
# Grid-search AdaBoost hyper-parameters on data_2 and display the ten best candidates.
# `.values` replaces `DataFrame.as_matrix()`, which was deprecated in pandas 0.23 and removed in 1.0.
X=data_2.drop('Survived',axis=1).values
Y=data_2['Survived'].values

classif_ABC = AdaBoostClassifier()

# n_estimators: 10..200 in steps of 10; learning_rate: powers of ten from 1e-4 to 1e4.
param_grid ={'n_estimators':[10*x for x in range(1,21)],
             'learning_rate':[10**x for x in range(-4,5)],
             'algorithm':['SAMME','SAMME.R']}
# return_train_score is requested explicitly: newer scikit-learn releases default it to False,
# which would remove the train-score columns (mean_train_score) from cv_results_.
gscv=GridSearchCV(classif_ABC,param_grid,scoring='accuracy',cv=cv_k_global,n_jobs=-1,verbose=1,return_train_score=True)
gscv.fit(X,Y)
results_df=pd.DataFrame(gscv.cv_results_)

# Persist the full CV table; the file name depends on the active `dataset`.
if(dataset=='new'):
    results_df.to_csv('./CV_ABC_new_data2.csv')
if(dataset=='old'):
    results_df.to_csv('./CV_ABC_data2.csv')
if(dataset=='deck'):
    results_df.to_csv('./CV_ABC_deck_data2.csv')

# 'error' is half of the across-fold standard deviation of the test score.
results_df['error']=results_df['std_test_score']/2
# Hide per-split scores and timing/bookkeeping columns. The split columns are discovered from
# the frame itself, so this keeps working when cv_k_global is changed from 11.
hidden_cols = [c for c in results_df.columns
               if c.startswith('split')
               or c in ('mean_score_time','params','std_fit_time','std_score_time','std_train_score')]
results_df.sort_values('rank_test_score',axis=0).drop(hidden_cols,axis=1).head(10)

Fitting 11 folds for each of 360 candidates, totalling 3960 fits


[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 755 tasks      | elapsed:   11.7s
[Parallel(n_jobs=-1)]: Done 1005 tasks      | elapsed:   15.7s
[Parallel(n_jobs=-1)]: Done 1355 tasks      | elapsed:   21.6s
[Parallel(n_jobs=-1)]: Done 2219 tasks      | elapsed:   35.9s
[Parallel(n_jobs=-1)]: Done 3319 tasks      | elapsed:   55.8s
[Parallel(n_jobs=-1)]: Done 3929 out of 3960 | elapsed:  1.1min remaining:    0.5s
[Parallel(n_jobs=-1)]: Done 3960 out of 3960 | elapsed:  1.1min finished


Unnamed: 0,mean_fit_time,mean_test_score,mean_train_score,param_algorithm,param_learning_rate,param_n_estimators,rank_test_score,std_test_score,error
81,0.052673,0.818182,0.818743,SAMME,1,20,1,0.020648,0.010324
278,0.453855,0.815937,0.869808,SAMME.R,1,190,2,0.037931,0.018965
82,0.070246,0.815937,0.820651,SAMME,1,30,2,0.021544,0.010772
279,0.461172,0.814815,0.872053,SAMME.R,1,200,4,0.035495,0.017747
83,0.103674,0.813692,0.820875,SAMME,1,40,5,0.017259,0.008629
88,0.190438,0.81257,0.823456,SAMME,1,90,6,0.018448,0.009224
277,0.413999,0.81257,0.86891,SAMME.R,1,180,6,0.038414,0.019207
271,0.295359,0.811448,0.857912,SAMME.R,1,120,8,0.035679,0.017839
275,0.371859,0.811448,0.86588,SAMME.R,1,160,8,0.031554,0.015777
272,0.310211,0.810325,0.860942,SAMME.R,1,130,10,0.032352,0.016176


### Data Type 3

In [18]:
# Grid-search AdaBoost hyper-parameters on data_3 and display the ten best candidates.
# `.values` replaces `DataFrame.as_matrix()`, which was deprecated in pandas 0.23 and removed in 1.0.
X=data_3.drop('Survived',axis=1).values
Y=data_3['Survived'].values

classif_ABC = AdaBoostClassifier()

# n_estimators: 10..200 in steps of 10; learning_rate: powers of ten from 1e-4 to 1e4.
param_grid ={'n_estimators':[10*x for x in range(1,21)],
             'learning_rate':[10**x for x in range(-4,5)],
             'algorithm':['SAMME','SAMME.R']}
# return_train_score is requested explicitly: newer scikit-learn releases default it to False,
# which would remove the train-score columns (mean_train_score) from cv_results_.
gscv=GridSearchCV(classif_ABC,param_grid,scoring='accuracy',cv=cv_k_global,n_jobs=-1,verbose=1,return_train_score=True)
gscv.fit(X,Y)
results_df=pd.DataFrame(gscv.cv_results_)

# Persist the full CV table; the file name depends on the active `dataset`.
if(dataset=='new'):
    results_df.to_csv('./CV_ABC_new_data3.csv')
if(dataset=='old'):
    results_df.to_csv('./CV_ABC_data3.csv')
if(dataset=='deck'):
    results_df.to_csv('./CV_ABC_deck_data3.csv')

# 'error' is half of the across-fold standard deviation of the test score.
results_df['error']=results_df['std_test_score']/2
# Hide per-split scores and timing/bookkeeping columns. The split columns are discovered from
# the frame itself, so this keeps working when cv_k_global is changed from 11.
hidden_cols = [c for c in results_df.columns
               if c.startswith('split')
               or c in ('mean_score_time','params','std_fit_time','std_score_time','std_train_score')]
results_df.sort_values('rank_test_score',axis=0).drop(hidden_cols,axis=1).head(10)

Fitting 11 folds for each of 360 candidates, totalling 3960 fits


[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 649 tasks      | elapsed:   10.8s
[Parallel(n_jobs=-1)]: Done 899 tasks      | elapsed:   14.6s
[Parallel(n_jobs=-1)]: Done 1249 tasks      | elapsed:   20.1s
[Parallel(n_jobs=-1)]: Done 1699 tasks      | elapsed:   28.2s
[Parallel(n_jobs=-1)]: Done 2249 tasks      | elapsed:   38.6s
[Parallel(n_jobs=-1)]: Done 2899 tasks      | elapsed:   51.3s
[Parallel(n_jobs=-1)]: Done 3649 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 3960 out of 3960 | elapsed:  1.2min finished


Unnamed: 0,mean_fit_time,mean_test_score,mean_train_score,param_algorithm,param_learning_rate,param_n_estimators,rank_test_score,std_test_score,error
81,0.046443,0.822671,0.828396,SAMME,1,20,1,0.024299,0.01215
80,0.021183,0.821549,0.827722,SAMME,1,10,2,0.025815,0.012908
278,0.510516,0.820426,0.873849,SAMME.R,1,190,3,0.03953,0.019765
279,0.515215,0.819304,0.875533,SAMME.R,1,200,4,0.044319,0.022159
277,0.476716,0.818182,0.871717,SAMME.R,1,180,5,0.038859,0.019429
276,0.424892,0.818182,0.871605,SAMME.R,1,170,5,0.038806,0.019403
89,0.240704,0.818182,0.82862,SAMME,1,100,5,0.023509,0.011755
275,0.39148,0.817059,0.869361,SAMME.R,1,160,8,0.036875,0.018437
88,0.213899,0.817059,0.828508,SAMME,1,90,8,0.023824,0.011912
84,0.112505,0.817059,0.829181,SAMME,1,50,8,0.018614,0.009307


## GBC

### Data Type 1

In [None]:
# Grid-search gradient-boosting hyper-parameters on data_1 and display the ten best candidates.
# `.values` replaces `DataFrame.as_matrix()`, which was deprecated in pandas 0.23 and removed in 1.0.
X=data_1.drop('Survived',axis=1).values
Y=data_1['Survived'].values

classif_GBC = GradientBoostingClassifier()

# 2 losses x 3 criteria x 20 ensemble sizes x 10 subsample fractions x 4 max_features settings
# = 4800 candidates, each fitted cv_k_global times -- this cell is long-running.
param_grid ={'loss' : ['deviance', 'exponential'],
             'criterion':['friedman_mse','mse','mae'],
             'n_estimators':[10*x for x in range(1,21)],
             'subsample':[x*0.1 for x in range(1,11)],
             'max_features':['auto','log2','sqrt',None]}
# return_train_score is requested explicitly: newer scikit-learn releases default it to False,
# which would remove the train-score columns (mean_train_score) from cv_results_.
gscv=GridSearchCV(classif_GBC,param_grid,scoring='accuracy',cv=cv_k_global,n_jobs=-1,verbose=1,return_train_score=True)
gscv.fit(X,Y)
results_df=pd.DataFrame(gscv.cv_results_)

# Persist the full CV table; the file name depends on the active `dataset`.
if(dataset=='new'):
    results_df.to_csv('./CV_GBC_new_data1.csv')
if(dataset=='old'):
    results_df.to_csv('./CV_GBC_data1.csv')
if(dataset=='deck'):
    results_df.to_csv('./CV_GBC_deck_data1.csv')

# 'error' is half of the across-fold standard deviation of the test score.
results_df['error']=results_df['std_test_score']/2
# Hide per-split scores and timing/bookkeeping columns. The split columns are discovered from
# the frame itself, so this keeps working when cv_k_global is changed from 11.
hidden_cols = [c for c in results_df.columns
               if c.startswith('split')
               or c in ('mean_score_time','params','std_fit_time','std_score_time','std_train_score')]
results_df.sort_values('rank_test_score',axis=0).drop(hidden_cols,axis=1).head(10)

# scores=cross_val_score(classif_GBC,X=X,y=Y,cv=10)
# print("%0.4f (+/- %0.4f)" % (scores.mean(), scores.std() * 2))

Fitting 11 folds for each of 4800 candidates, totalling 52800 fits


[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 1322 tasks      | elapsed:    8.0s
[Parallel(n_jobs=-1)]: Done 2072 tasks      | elapsed:   18.2s
[Parallel(n_jobs=-1)]: Done 3824 tasks      | elapsed:   29.5s
[Parallel(n_jobs=-1)]: Done 5990 tasks      | elapsed:   44.9s
[Parallel(n_jobs=-1)]: Done 8405 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 10355 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 13409 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 16781 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 19929 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 24759 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 28657 tasks      | elapsed:  4.0min
[Parallel(n_jobs=-1)]: Done 34213 tasks      | elapsed:  4.7min


### Data Type 2

In [None]:
# Grid-search gradient-boosting hyper-parameters on data_2 and display the ten best candidates.
# `.values` replaces `DataFrame.as_matrix()`, which was deprecated in pandas 0.23 and removed in 1.0.
X=data_2.drop('Survived',axis=1).values
Y=data_2['Survived'].values

classif_GBC = GradientBoostingClassifier()

# 2 losses x 3 criteria x 20 ensemble sizes x 10 subsample fractions x 4 max_features settings
# = 4800 candidates, each fitted cv_k_global times -- this cell is long-running.
param_grid ={'loss' : ['deviance', 'exponential'],
             'criterion':['friedman_mse','mse','mae'],
             'n_estimators':[10*x for x in range(1,21)],
             'subsample':[x*0.1 for x in range(1,11)],
             'max_features':['auto','log2','sqrt',None]}
# return_train_score is requested explicitly: newer scikit-learn releases default it to False,
# which would remove the train-score columns (mean_train_score) from cv_results_.
gscv=GridSearchCV(classif_GBC,param_grid,scoring='accuracy',cv=cv_k_global,n_jobs=-1,verbose=1,return_train_score=True)
gscv.fit(X,Y)
results_df=pd.DataFrame(gscv.cv_results_)

# Persist the full CV table; the file name depends on the active `dataset`.
if(dataset=='new'):
    results_df.to_csv('./CV_GBC_new_data2.csv')
if(dataset=='old'):
    results_df.to_csv('./CV_GBC_data2.csv')
if(dataset=='deck'):
    results_df.to_csv('./CV_GBC_deck_data2.csv')

# 'error' is half of the across-fold standard deviation of the test score.
results_df['error']=results_df['std_test_score']/2
# Hide per-split scores and timing/bookkeeping columns. The split columns are discovered from
# the frame itself, so this keeps working when cv_k_global is changed from 11.
hidden_cols = [c for c in results_df.columns
               if c.startswith('split')
               or c in ('mean_score_time','params','std_fit_time','std_score_time','std_train_score')]
results_df.sort_values('rank_test_score',axis=0).drop(hidden_cols,axis=1).head(10)

# scores=cross_val_score(classif_GBC,X=X,y=Y,cv=10)
# print("%0.4f (+/- %0.4f)" % (scores.mean(), scores.std() * 2))

### Data Type 3

In [None]:
# Grid-search gradient-boosting hyper-parameters on data_3 and display the ten best candidates.
# `.values` replaces `DataFrame.as_matrix()`, which was deprecated in pandas 0.23 and removed in 1.0.
X=data_3.drop('Survived',axis=1).values
Y=data_3['Survived'].values

classif_GBC = GradientBoostingClassifier()

# 2 losses x 3 criteria x 20 ensemble sizes x 10 subsample fractions x 4 max_features settings
# = 4800 candidates, each fitted cv_k_global times -- this cell is long-running.
param_grid ={'loss' : ['deviance', 'exponential'],
             'criterion':['friedman_mse','mse','mae'],
             'n_estimators':[10*x for x in range(1,21)],
             'subsample':[x*0.1 for x in range(1,11)],
             'max_features':['auto','log2','sqrt',None]}
# return_train_score is requested explicitly: newer scikit-learn releases default it to False,
# which would remove the train-score columns (mean_train_score) from cv_results_.
gscv=GridSearchCV(classif_GBC,param_grid,scoring='accuracy',cv=cv_k_global,n_jobs=-1,verbose=1,return_train_score=True)
gscv.fit(X,Y)
results_df=pd.DataFrame(gscv.cv_results_)

# Persist the full CV table; the file name depends on the active `dataset`.
if(dataset=='new'):
    results_df.to_csv('./CV_GBC_new_data3.csv')
if(dataset=='old'):
    results_df.to_csv('./CV_GBC_data3.csv')
if(dataset=='deck'):
    results_df.to_csv('./CV_GBC_deck_data3.csv')

# 'error' is half of the across-fold standard deviation of the test score.
results_df['error']=results_df['std_test_score']/2
# Hide per-split scores and timing/bookkeeping columns. The split columns are discovered from
# the frame itself, so this keeps working when cv_k_global is changed from 11.
hidden_cols = [c for c in results_df.columns
               if c.startswith('split')
               or c in ('mean_score_time','params','std_fit_time','std_score_time','std_train_score')]
results_df.sort_values('rank_test_score',axis=0).drop(hidden_cols,axis=1).head(10)

# scores=cross_val_score(classif_GBC,X=X,y=Y,cv=10)
# print("%0.4f (+/- %0.4f)" % (scores.mean(), scores.std() * 2))

## Random Forests

### Data Type 1

In [7]:
# Exhaustive grid search of a RandomForestClassifier on the "data type 1" features.
# `.values` replaces `.as_matrix()`, which is deprecated and was removed in pandas 1.0.
X = data_1.drop('Survived', axis=1).values
Y = data_1['Survived'].values
rfc = RandomForestClassifier(n_jobs=-1)

param_grid = {
    'n_estimators': [10*x for x in range(1, 21)],
    'criterion': ['gini', 'entropy'],
    'max_features': ['auto', 'log2', 'sqrt', None]}
gscv = GridSearchCV(rfc, param_grid, scoring='accuracy', cv=cv_k_global, verbose=1, n_jobs=-1)
gscv.fit(X, Y)
results_df = pd.DataFrame(gscv.cv_results_)

# Disabled alternative: randomized search over a wider n_estimators range.
# param_distributions = {
#     'n_estimators':sp_randint(1, 1001),
#     'criterion':['gini','entropy'],
#     'max_features':['auto','log2','sqrt',None]
#     }

# rscv=RandomizedSearchCV(estimator=rfc,
#                   param_distributions=param_distributions,
#                   scoring='accuracy',
#                   cv=cv_k_global,
#                   verbose=1,
#                   n_jobs=-1,
#                   n_iter=100)
# rscv.fit(X,Y)
# results_df=pd.DataFrame(rscv.cv_results_)


# Persist the full CV table; the file name depends on the active dataset variant.
csv_by_dataset = {'new': './CV_RF_new_data1.csv',
                  'old': './CV_RF_data1.csv',
                  'deck': './CV_RF_deck_data1.csv'}
if dataset in csv_by_dataset:
    results_df.to_csv(csv_by_dataset[dataset])

results_df['error'] = results_df['std_test_score']/2

# Hide timing and per-fold columns for display. The per-fold columns are
# selected by pattern rather than hard-coded as split0..split10, so this keeps
# working when cv_k_global changes or when train scores are not recorded.
hidden_cols = ['mean_score_time', 'params', 'std_fit_time', 'std_score_time', 'std_train_score']
hidden_cols += list(results_df.filter(regex=r'^split\d+_').columns)
results_df.sort_values('rank_test_score', axis=0).drop(hidden_cols, axis=1, errors='ignore').head(10)

Fitting 11 folds for each of 160 candidates, totalling 1760 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   12.8s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:   32.7s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:   57.0s
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 1760 out of 1760 | elapsed:  2.4min finished


Unnamed: 0,mean_fit_time,mean_test_score,mean_train_score,param_criterion,param_max_features,param_n_estimators,rank_test_score,std_test_score,error
149,0.463449,0.83165,0.986195,entropy,,100,1,0.044548,0.022274
155,0.840625,0.82716,0.986308,entropy,,160,2,0.050943,0.025472
152,0.714194,0.82716,0.986308,entropy,,130,2,0.050638,0.025319
151,0.646103,0.82716,0.986308,entropy,,120,2,0.045891,0.022945
141,0.118171,0.82716,0.979237,entropy,,20,2,0.04908,0.02454
153,0.746805,0.826038,0.986308,entropy,,140,6,0.055091,0.027545
142,0.131536,0.826038,0.98339,entropy,,30,6,0.056854,0.028427
148,0.452426,0.826038,0.986195,entropy,,90,6,0.054249,0.027125
66,0.276267,0.824916,0.986083,gini,,70,9,0.053894,0.026947
156,0.874148,0.824916,0.986308,entropy,,170,9,0.057124,0.028562


### Data Type 2

In [8]:
# Exhaustive grid search of a RandomForestClassifier on the "data type 2" features.
# `.values` replaces `.as_matrix()`, which is deprecated and was removed in pandas 1.0.
X = data_2.drop('Survived', axis=1).values
Y = data_2['Survived'].values
rfc = RandomForestClassifier(n_jobs=-1)

param_grid = {
    'n_estimators': [10*x for x in range(1, 21)],
    'criterion': ['gini', 'entropy'],
    'max_features': ['auto', 'log2', 'sqrt', None]}
gscv = GridSearchCV(rfc, param_grid, scoring='accuracy', cv=cv_k_global, verbose=1, n_jobs=-1)
gscv.fit(X, Y)
results_df = pd.DataFrame(gscv.cv_results_)

# Disabled alternative: randomized search over a wider n_estimators range.
# param_distributions = {
#     'n_estimators':sp_randint(1, 1001),
#     'criterion':['gini','entropy'],
#     'max_features':['auto','log2','sqrt',None]
#     }

# rscv=RandomizedSearchCV(estimator=rfc,
#                   param_distributions=param_distributions,
#                   scoring='accuracy',
#                   cv=cv_k_global,
#                   verbose=1,
#                   n_jobs=-1,
#                   n_iter=100)
# rscv.fit(X,Y)
# results_df=pd.DataFrame(rscv.cv_results_)


# Persist the full CV table; the file name depends on the active dataset variant.
csv_by_dataset = {'new': './CV_RF_new_data2.csv',
                  'old': './CV_RF_data2.csv',
                  'deck': './CV_RF_deck_data2.csv'}
if dataset in csv_by_dataset:
    results_df.to_csv(csv_by_dataset[dataset])

results_df['error'] = results_df['std_test_score']/2

# Hide timing and per-fold columns for display. The per-fold columns are
# selected by pattern rather than hard-coded as split0..split10, so this keeps
# working when cv_k_global changes or when train scores are not recorded.
hidden_cols = ['mean_score_time', 'params', 'std_fit_time', 'std_score_time', 'std_train_score']
hidden_cols += list(results_df.filter(regex=r'^split\d+_').columns)
results_df.sort_values('rank_test_score', axis=0).drop(hidden_cols, axis=1, errors='ignore').head(10)

Fitting 11 folds for each of 160 candidates, totalling 1760 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   14.2s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:   36.1s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 1760 out of 1760 | elapsed:  2.6min finished


Unnamed: 0,mean_fit_time,mean_test_score,mean_train_score,param_criterion,param_max_features,param_n_estimators,rank_test_score,std_test_score,error
141,0.126278,0.828283,0.98339,entropy,,20,1,0.049788,0.024894
72,0.656556,0.82716,0.988552,gini,,130,2,0.053607,0.026803
66,0.298519,0.826038,0.988215,gini,,70,3,0.047024,0.023512
156,1.034242,0.826038,0.988552,entropy,,170,3,0.055238,0.027619
67,0.378763,0.826038,0.988215,gini,,80,3,0.050928,0.025464
151,0.746471,0.826038,0.988552,entropy,,120,3,0.051966,0.025983
146,0.389372,0.824916,0.987991,entropy,,70,7,0.058471,0.029236
154,0.958299,0.824916,0.988552,entropy,,150,7,0.052272,0.026136
153,0.909869,0.823793,0.988552,entropy,,140,9,0.048842,0.024421
142,0.138439,0.823793,0.985409,entropy,,30,9,0.041977,0.020989


### Data Type 3

In [9]:
# Exhaustive grid search of a RandomForestClassifier on the "data type 3" features.
# `.values` replaces `.as_matrix()`, which is deprecated and was removed in pandas 1.0.
X = data_3.drop('Survived', axis=1).values
Y = data_3['Survived'].values
rfc = RandomForestClassifier(n_jobs=-1)

param_grid = {
    'n_estimators': [10*x for x in range(1, 21)],
    'criterion': ['gini', 'entropy'],
    'max_features': ['auto', 'log2', 'sqrt', None]}
gscv = GridSearchCV(rfc, param_grid, scoring='accuracy', cv=cv_k_global, verbose=1, n_jobs=-1)
gscv.fit(X, Y)
results_df = pd.DataFrame(gscv.cv_results_)

# Disabled alternative: randomized search over a wider n_estimators range.
# param_distributions = {
#     'n_estimators':sp_randint(1, 1001),
#     'criterion':['gini','entropy'],
#     'max_features':['auto','log2','sqrt',None]
#     }

# rscv=RandomizedSearchCV(estimator=rfc,
#                   param_distributions=param_distributions,
#                   scoring='accuracy',
#                   cv=cv_k_global,
#                   verbose=1,
#                   n_jobs=-1,
#                   n_iter=100)
# rscv.fit(X,Y)
# results_df=pd.DataFrame(rscv.cv_results_)


# Persist the full CV table; the file name depends on the active dataset variant.
csv_by_dataset = {'new': './CV_RF_new_data3.csv',
                  'old': './CV_RF_data3.csv',
                  'deck': './CV_RF_deck_data3.csv'}
if dataset in csv_by_dataset:
    results_df.to_csv(csv_by_dataset[dataset])

results_df['error'] = results_df['std_test_score']/2

# Hide timing and per-fold columns for display. The per-fold columns are
# selected by pattern rather than hard-coded as split0..split10, so this keeps
# working when cv_k_global changes or when train scores are not recorded.
hidden_cols = ['mean_score_time', 'params', 'std_fit_time', 'std_score_time', 'std_train_score']
hidden_cols += list(results_df.filter(regex=r'^split\d+_').columns)
results_df.sort_values('rank_test_score', axis=0).drop(hidden_cols, axis=1, errors='ignore').head(10)

Fitting 11 folds for each of 160 candidates, totalling 1760 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   13.8s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:   35.7s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 1760 out of 1760 | elapsed:  2.6min finished


Unnamed: 0,mean_fit_time,mean_test_score,mean_train_score,param_criterion,param_max_features,param_n_estimators,rank_test_score,std_test_score,error
153,0.839676,0.833895,0.989562,entropy,,140,1,0.039624,0.019812
140,0.117257,0.83165,0.974748,entropy,,10,2,0.04506,0.02253
71,0.619085,0.830527,0.98945,gini,,120,3,0.040282,0.020141
74,0.7779,0.830527,0.989562,gini,,150,3,0.043199,0.021599
67,0.385714,0.829405,0.989113,gini,,80,5,0.040815,0.020407
79,0.967594,0.828283,0.989562,gini,,200,6,0.048686,0.024343
148,0.556565,0.828283,0.989562,entropy,,90,6,0.042359,0.02118
75,0.798757,0.82716,0.989562,gini,,160,8,0.041616,0.020808
77,0.926408,0.82716,0.989562,gini,,180,8,0.043489,0.021745
144,0.278927,0.82716,0.98844,entropy,,50,8,0.04658,0.02329


## KNN

### Data Type 1

In [10]:
# Exhaustive grid search of a KNeighborsClassifier on the "data type 1" features.
# `.values` replaces `.as_matrix()`, which is deprecated and was removed in pandas 1.0.
X = data_1.drop('Survived', axis=1).values
Y = data_1['Survived'].values
knn = KNeighborsClassifier()

# NOTE(review): 3 algorithms x 50 leaf sizes x 50 k x 2 p x 2 weights = 30000
# candidates. `algorithm` and `leaf_size` are presumably speed/memory knobs
# rather than model hyper-parameters (confirm against the sklearn docs); if so
# they could be dropped from the grid to cut the search ~150x.
param_grid = [{
    'n_neighbors': [2*x+1 for x in range(1, 51)],
    'algorithm': ['ball_tree', 'kd_tree', 'brute'],
    'p': [1, 2],
    'weights': ['uniform', 'distance'],
    'leaf_size': [2*x+1 for x in range(1, 51)]}]
gscv = GridSearchCV(knn, param_grid, scoring='accuracy', cv=cv_k_global, verbose=1, n_jobs=-1)
gscv.fit(X, Y)
results_df = pd.DataFrame(gscv.cv_results_)

# Disabled alternative: randomized search over the same space.
# param_distributions ={'n_neighbors':sp_randint(1, 101),
#                       'algorithm':[ 'ball_tree', 'kd_tree','brute'],
#                       'p':[1,2],
#                       'weights':['uniform','distance'],
#                       'leaf_size':sp_randint(1, 101)}
# rscv=RandomizedSearchCV(estimator=knn,
#                   param_distributions=param_distributions,
#                   scoring='accuracy',
#                   cv=cv_k_global,
#                   verbose=1,
#                   n_jobs=-1,
#                   n_iter=1000)
# rscv.fit(X,Y)
# results_df=pd.DataFrame(rscv.cv_results_)

# Persist the full CV table; the file name depends on the active dataset variant.
csv_by_dataset = {'new': './CV_KNN_new_data1.csv',
                  'old': './CV_KNN_data1.csv',
                  'deck': './CV_KNN_deck_data1.csv'}
if dataset in csv_by_dataset:
    results_df.to_csv(csv_by_dataset[dataset])

results_df['error'] = results_df['std_test_score']/2

# Hide timing and per-fold columns for display. The per-fold columns are
# selected by pattern rather than hard-coded as split0..split10, so this keeps
# working when cv_k_global changes or when train scores are not recorded.
hidden_cols = ['mean_score_time', 'params', 'std_fit_time', 'std_score_time', 'std_train_score']
hidden_cols += list(results_df.filter(regex=r'^split\d+_').columns)
results_df.sort_values('rank_test_score', axis=0).drop(hidden_cols, axis=1, errors='ignore').head(10)

Fitting 11 folds for each of 30000 candidates, totalling 330000 fits


[Parallel(n_jobs=-1)]: Done  68 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 1568 tasks      | elapsed:   13.7s
[Parallel(n_jobs=-1)]: Done 4068 tasks      | elapsed:   35.5s
[Parallel(n_jobs=-1)]: Done 7568 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 12068 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 17568 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 24068 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done 31568 tasks      | elapsed:  4.1min
[Parallel(n_jobs=-1)]: Done 40068 tasks      | elapsed:  5.2min
[Parallel(n_jobs=-1)]: Done 49568 tasks      | elapsed:  6.4min
[Parallel(n_jobs=-1)]: Done 60068 tasks      | elapsed:  7.8min
[Parallel(n_jobs=-1)]: Done 71568 tasks      | elapsed:  9.3min
[Parallel(n_jobs=-1)]: Done 84068 tasks      | elapsed: 11.0min
[Parallel(n_jobs=-1)]: Done 97568 tasks      | elapsed: 12.8min
[Parallel(n_jobs=-1)]: Done 111508 tasks      | elapsed: 14.8min
[Parallel(n_jobs=-1)]: Done 119258 tasks    

Unnamed: 0,mean_fit_time,mean_test_score,mean_train_score,param_algorithm,param_leaf_size,param_n_neighbors,param_p,param_weights,rank_test_score,std_test_score,error
5636,0.001284,0.818182,0.828846,ball_tree,59,21,1,uniform,1,0.022182,0.011091
18436,0.001179,0.818182,0.828846,kd_tree,87,21,1,uniform,1,0.022182,0.011091
8636,0.00121,0.818182,0.828846,ball_tree,89,21,1,uniform,1,0.022182,0.011091
19636,0.001192,0.818182,0.828846,kd_tree,99,21,1,uniform,1,0.022182,0.011091
19436,0.00127,0.818182,0.828846,kd_tree,97,21,1,uniform,1,0.022182,0.011091
2636,0.001327,0.818182,0.828846,ball_tree,29,21,1,uniform,1,0.022182,0.011091
14436,0.001264,0.818182,0.828733,kd_tree,47,21,1,uniform,1,0.022182,0.011091
4636,0.001352,0.818182,0.828846,ball_tree,49,21,1,uniform,1,0.022182,0.011091
9636,0.001225,0.818182,0.828846,ball_tree,99,21,1,uniform,1,0.022182,0.011091
6436,0.001221,0.818182,0.828846,ball_tree,67,21,1,uniform,1,0.022182,0.011091


### Data Type 2

In [11]:
# Exhaustive grid search of a KNeighborsClassifier on the "data type 2" features.
# `.values` replaces `.as_matrix()`, which is deprecated and was removed in pandas 1.0.
X = data_2.drop('Survived', axis=1).values
Y = data_2['Survived'].values
knn = KNeighborsClassifier()

# NOTE(review): 3 algorithms x 50 leaf sizes x 50 k x 2 p x 2 weights = 30000
# candidates. `algorithm` and `leaf_size` are presumably speed/memory knobs
# rather than model hyper-parameters (confirm against the sklearn docs); if so
# they could be dropped from the grid to cut the search ~150x.
param_grid = [{
    'n_neighbors': [2*x+1 for x in range(1, 51)],
    'algorithm': ['ball_tree', 'kd_tree', 'brute'],
    'p': [1, 2],
    'weights': ['uniform', 'distance'],
    'leaf_size': [2*x+1 for x in range(1, 51)]}]
gscv = GridSearchCV(knn, param_grid, scoring='accuracy', cv=cv_k_global, verbose=1, n_jobs=-1)
gscv.fit(X, Y)
results_df = pd.DataFrame(gscv.cv_results_)

# Disabled alternative: randomized search over the same space.
# param_distributions ={'n_neighbors':sp_randint(1, 101),
#                       'algorithm':[ 'ball_tree', 'kd_tree','brute'],
#                       'p':[1,2],
#                       'weights':['uniform','distance'],
#                       'leaf_size':sp_randint(1, 101)}
# rscv=RandomizedSearchCV(estimator=knn,
#                   param_distributions=param_distributions,
#                   scoring='accuracy',
#                   cv=cv_k_global,
#                   verbose=1,
#                   n_jobs=-1,
#                   n_iter=1000)
# rscv.fit(X,Y)
# results_df=pd.DataFrame(rscv.cv_results_)

# Persist the full CV table; the file name depends on the active dataset variant.
csv_by_dataset = {'new': './CV_KNN_new_data2.csv',
                  'old': './CV_KNN_data2.csv',
                  'deck': './CV_KNN_deck_data2.csv'}
if dataset in csv_by_dataset:
    results_df.to_csv(csv_by_dataset[dataset])

results_df['error'] = results_df['std_test_score']/2

# Hide timing and per-fold columns for display. The per-fold columns are
# selected by pattern rather than hard-coded as split0..split10, so this keeps
# working when cv_k_global changes or when train scores are not recorded.
hidden_cols = ['mean_score_time', 'params', 'std_fit_time', 'std_score_time', 'std_train_score']
hidden_cols += list(results_df.filter(regex=r'^split\d+_').columns)
results_df.sort_values('rank_test_score', axis=0).drop(hidden_cols, axis=1, errors='ignore').head(10)

Fitting 11 folds for each of 30000 candidates, totalling 330000 fits


[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done 1681 tasks      | elapsed:   17.3s
[Parallel(n_jobs=-1)]: Done 2931 tasks      | elapsed:   29.8s
[Parallel(n_jobs=-1)]: Done 4681 tasks      | elapsed:   47.5s
[Parallel(n_jobs=-1)]: Done 6931 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 9681 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 12931 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 16681 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 20931 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done 25681 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 30931 tasks      | elapsed:  4.7min
[Parallel(n_jobs=-1)]: Done 36681 tasks      | elapsed:  5.6min
[Parallel(n_jobs=-1)]: Done 42931 tasks      | elapsed:  6.5min
[Parallel(n_jobs=-1)]: Done 49681 tasks      | elapsed:  7.5min
[Parallel(n_jobs=-1)]: Done 56931 tasks      | elapsed:  8.7min
[Parallel(n_jobs=-1)]: Done 64681 tasks      | 

Unnamed: 0,mean_fit_time,mean_test_score,mean_train_score,param_algorithm,param_leaf_size,param_n_neighbors,param_p,param_weights,rank_test_score,std_test_score,error
216,0.001971,0.806958,0.837712,ball_tree,5,11,1,uniform,1,0.043325,0.021662
14016,0.0014,0.806958,0.8376,kd_tree,43,11,1,uniform,1,0.043325,0.021662
12016,0.0015,0.806958,0.8376,kd_tree,23,11,1,uniform,1,0.043325,0.021662
11416,0.001525,0.806958,0.8376,kd_tree,17,11,1,uniform,1,0.043325,0.021662
4216,0.001522,0.806958,0.837375,ball_tree,45,11,1,uniform,1,0.043325,0.021662
4016,0.001533,0.806958,0.837375,ball_tree,43,11,1,uniform,1,0.043325,0.021662
3416,0.001521,0.806958,0.837375,ball_tree,37,11,1,uniform,1,0.043325,0.021662
3216,0.001559,0.806958,0.837375,ball_tree,35,11,1,uniform,1,0.043325,0.021662
11816,0.001554,0.806958,0.8376,kd_tree,21,11,1,uniform,1,0.043325,0.021662
2616,0.001506,0.806958,0.837375,ball_tree,29,11,1,uniform,1,0.043325,0.021662


### Data Type 3

In [12]:
# Exhaustive grid search of a KNeighborsClassifier on the "data type 3" features.
# `.values` replaces `.as_matrix()`, which is deprecated and was removed in pandas 1.0.
X = data_3.drop('Survived', axis=1).values
Y = data_3['Survived'].values
knn = KNeighborsClassifier()

# NOTE(review): 3 algorithms x 50 leaf sizes x 50 k x 2 p x 2 weights = 30000
# candidates. `algorithm` and `leaf_size` are presumably speed/memory knobs
# rather than model hyper-parameters (confirm against the sklearn docs); if so
# they could be dropped from the grid to cut the search ~150x.
param_grid = [{
    'n_neighbors': [2*x+1 for x in range(1, 51)],
    'algorithm': ['ball_tree', 'kd_tree', 'brute'],
    'p': [1, 2],
    'weights': ['uniform', 'distance'],
    'leaf_size': [2*x+1 for x in range(1, 51)]}]
gscv = GridSearchCV(knn, param_grid, scoring='accuracy', cv=cv_k_global, verbose=1, n_jobs=-1)
gscv.fit(X, Y)
results_df = pd.DataFrame(gscv.cv_results_)

# Disabled alternative: randomized search over the same space.
# param_distributions ={'n_neighbors':sp_randint(1, 101),
#                       'algorithm':[ 'ball_tree', 'kd_tree','brute'],
#                       'p':[1,2],
#                       'weights':['uniform','distance'],
#                       'leaf_size':sp_randint(1, 101)}
# rscv=RandomizedSearchCV(estimator=knn,
#                   param_distributions=param_distributions,
#                   scoring='accuracy',
#                   cv=cv_k_global,
#                   verbose=1,
#                   n_jobs=-1,
#                   n_iter=1000)
# rscv.fit(X,Y)
# results_df=pd.DataFrame(rscv.cv_results_)

# Persist the full CV table; the file name depends on the active dataset variant.
csv_by_dataset = {'new': './CV_KNN_new_data3.csv',
                  'old': './CV_KNN_data3.csv',
                  'deck': './CV_KNN_deck_data3.csv'}
if dataset in csv_by_dataset:
    results_df.to_csv(csv_by_dataset[dataset])

results_df['error'] = results_df['std_test_score']/2

# Hide timing and per-fold columns for display. The per-fold columns are
# selected by pattern rather than hard-coded as split0..split10, so this keeps
# working when cv_k_global changes or when train scores are not recorded.
hidden_cols = ['mean_score_time', 'params', 'std_fit_time', 'std_score_time', 'std_train_score']
hidden_cols += list(results_df.filter(regex=r'^split\d+_').columns)
results_df.sort_values('rank_test_score', axis=0).drop(hidden_cols, axis=1, errors='ignore').head(10)

Fitting 11 folds for each of 30000 candidates, totalling 330000 fits


[Parallel(n_jobs=-1)]: Done 160 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done 1360 tasks      | elapsed:   15.2s
[Parallel(n_jobs=-1)]: Done 2796 tasks      | elapsed:   32.3s
[Parallel(n_jobs=-1)]: Done 4196 tasks      | elapsed:   48.9s
[Parallel(n_jobs=-1)]: Done 5996 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 8196 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 10796 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 13796 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 17196 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done 20996 tasks      | elapsed:  3.6min
[Parallel(n_jobs=-1)]: Done 25196 tasks      | elapsed:  4.3min
[Parallel(n_jobs=-1)]: Done 29796 tasks      | elapsed:  5.1min
[Parallel(n_jobs=-1)]: Done 34796 tasks      | elapsed:  5.9min
[Parallel(n_jobs=-1)]: Done 40196 tasks      | elapsed:  6.8min
[Parallel(n_jobs=-1)]: Done 45996 tasks      | elapsed:  7.8min
[Parallel(n_jobs=-1)]: Done 52196 tasks      | 

Unnamed: 0,mean_fit_time,mean_test_score,mean_train_score,param_algorithm,param_leaf_size,param_n_neighbors,param_p,param_weights,rank_test_score,std_test_score,error
13230,0.00164,0.828283,0.839395,kd_tree,35,17,2,uniform,1,0.031321,0.01566
17630,0.001352,0.828283,0.839395,kd_tree,79,17,2,uniform,1,0.031321,0.01566
28630,0.000992,0.828283,0.839395,brute,89,17,2,uniform,1,0.031321,0.01566
10030,0.002596,0.828283,0.839395,kd_tree,3,17,2,uniform,1,0.031321,0.01566
23030,0.001049,0.828283,0.839395,brute,33,17,2,uniform,1,0.031321,0.01566
21030,0.000769,0.828283,0.839395,brute,13,17,2,uniform,1,0.031321,0.01566
10230,0.001961,0.828283,0.839395,kd_tree,5,17,2,uniform,1,0.031321,0.01566
8030,0.001383,0.828283,0.839395,ball_tree,83,17,2,uniform,1,0.031321,0.01566
430,0.002902,0.828283,0.839395,ball_tree,7,17,2,uniform,1,0.031321,0.01566
4430,0.001641,0.828283,0.839395,ball_tree,47,17,2,uniform,1,0.031321,0.01566


## SVM

### Data Type 1

In [13]:
# Exhaustive grid search of an SVC on the "data type 1" features.
# `.values` replaces `.as_matrix()`, which is deprecated and was removed in pandas 1.0.
X = data_1.drop('Survived', axis=1).values
Y = data_1['Survived'].values
svm_class = svm.SVC()

# Two sub-grids: `degree` is only searched together with the polynomial kernel.
# NOTE(review): if 'Survived' is a binary label, `decision_function_shape`
# presumably has no effect on predictions and only doubles the number of
# candidates -- confirm against the SVC docs before removing it.
C_values = np.linspace(0.001, 1000, 21)
param_grid = [
    {'C': C_values,
     'kernel': ['poly'],
     'degree': [2, 3, 4, 5],
     'decision_function_shape': ['ovo', 'ovr']},
    {'C': C_values,
     'kernel': ['rbf', 'linear', 'sigmoid'],
     'decision_function_shape': ['ovo', 'ovr']}
    ]
gscv = GridSearchCV(svm_class, param_grid, scoring='accuracy', cv=cv_k_global, verbose=1, n_jobs=-1)
gscv.fit(X, Y)
results_df = pd.DataFrame(gscv.cv_results_)

# Disabled alternative: randomized search with C drawn from a uniform distribution.
# param_distributions =[
#     {'C':uniform(0,1000),
#     'kernel':['poly'],
#     'degree':[2,3,4,5],
#     'decision_function_shape':['ovo','ovr']},
#      {'C':uniform(0,1000),
#     'kernel':['rbf','linear','sigmoid'],
#     'decision_function_shape':['ovo','ovr']}
#     ]

# param_distributions ={'C':uniform(0.001,1000),
#     'kernel':['poly'],
#     'degree':[2,3,4,5,6],
#     'decision_function_shape':['ovo','ovr']}

# rscv=RandomizedSearchCV(estimator=svm_class,
#                   param_distributions=param_distributions,
#                   scoring='accuracy',
#                   cv=cv_k_global,
#                   verbose=1,
#                   n_jobs=-1,
#                   n_iter=1000)
# rscv.fit(X,Y)
# results_df=pd.DataFrame(rscv.cv_results_)

# Persist the full CV table; the file name depends on the active dataset variant.
csv_by_dataset = {'new': './CV_SVM_new_data1.csv',
                  'old': './CV_SVM_data1.csv',
                  'deck': './CV_SVM_deck_data1.csv'}
if dataset in csv_by_dataset:
    results_df.to_csv(csv_by_dataset[dataset])

results_df['error'] = results_df['std_test_score']/2

# Hide timing and per-fold columns for display. The per-fold columns are
# selected by pattern rather than hard-coded as split0..split10, so this keeps
# working when cv_k_global changes or when train scores are not recorded.
hidden_cols = ['mean_score_time', 'params', 'std_fit_time', 'std_score_time', 'std_train_score']
hidden_cols += list(results_df.filter(regex=r'^split\d+_').columns)
results_df.sort_values('rank_test_score', axis=0).drop(hidden_cols, axis=1, errors='ignore').head(10)

Fitting 11 folds for each of 294 candidates, totalling 3234 fits


[Parallel(n_jobs=-1)]: Done 124 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done 799 tasks      | elapsed:   16.8s
[Parallel(n_jobs=-1)]: Done 1207 tasks      | elapsed:   33.8s
[Parallel(n_jobs=-1)]: Done 1557 tasks      | elapsed:   53.6s
[Parallel(n_jobs=-1)]: Done 2075 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 2625 tasks      | elapsed: 23.4min
[Parallel(n_jobs=-1)]: Done 3234 out of 3234 | elapsed: 53.8min finished


Unnamed: 0,mean_fit_time,mean_test_score,mean_train_score,param_C,param_decision_function_shape,param_degree,param_kernel,rank_test_score,std_test_score,error
115,0.179323,0.840629,0.889226,700,ovo,5,poly,1,0.031999,0.016
119,0.172937,0.840629,0.889226,700,ovr,5,poly,1,0.031999,0.016
131,0.244906,0.839506,0.890124,800,ovo,5,poly,3,0.030918,0.015459
127,0.209144,0.839506,0.889787,750,ovr,5,poly,3,0.032662,0.016331
123,0.212245,0.839506,0.889787,750,ovo,5,poly,3,0.032662,0.016331
135,0.216812,0.839506,0.890124,800,ovr,5,poly,3,0.030918,0.015459
103,0.177069,0.839506,0.889338,600,ovr,5,poly,3,0.030008,0.015004
99,0.158536,0.839506,0.889338,600,ovo,5,poly,3,0.030008,0.015004
95,0.168348,0.839506,0.889339,550,ovr,5,poly,3,0.031363,0.015682
91,0.16866,0.839506,0.889339,550,ovo,5,poly,3,0.031363,0.015682


### Data Type 2

In [14]:
# Exhaustive grid search of an SVC on the "data type 2" features.
# `.values` replaces `.as_matrix()`, which is deprecated and was removed in pandas 1.0.
X = data_2.drop('Survived', axis=1).values
Y = data_2['Survived'].values
svm_class = svm.SVC()

# Two sub-grids: `degree` is only searched together with the polynomial kernel.
# NOTE(review): if 'Survived' is a binary label, `decision_function_shape`
# presumably has no effect on predictions and only doubles the number of
# candidates -- confirm against the SVC docs before removing it.
C_values = np.linspace(0.001, 1000, 21)
param_grid = [
    {'C': C_values,
     'kernel': ['poly'],
     'degree': [2, 3, 4, 5],
     'decision_function_shape': ['ovo', 'ovr']},
    {'C': C_values,
     'kernel': ['rbf', 'linear', 'sigmoid'],
     'decision_function_shape': ['ovo', 'ovr']}
    ]
gscv = GridSearchCV(svm_class, param_grid, scoring='accuracy', cv=cv_k_global, verbose=1, n_jobs=-1)
gscv.fit(X, Y)
results_df = pd.DataFrame(gscv.cv_results_)

# Disabled alternative: randomized search with C drawn from a uniform distribution.
# param_distributions =[
#     {'C':uniform(0,1000),
#     'kernel':['poly'],
#     'degree':[2,3,4,5],
#     'decision_function_shape':['ovo','ovr']},
#      {'C':uniform(0,1000),
#     'kernel':['rbf','linear','sigmoid'],
#     'decision_function_shape':['ovo','ovr']}
#     ]

# param_distributions ={'C':uniform(0.001,1000),
#     'kernel':['poly'],
#     'degree':[2,3,4,5,6],
#     'decision_function_shape':['ovo','ovr']}

# rscv=RandomizedSearchCV(estimator=svm_class,
#                   param_distributions=param_distributions,
#                   scoring='accuracy',
#                   cv=cv_k_global,
#                   verbose=1,
#                   n_jobs=-1,
#                   n_iter=1000)
# rscv.fit(X,Y)
# results_df=pd.DataFrame(rscv.cv_results_)

# Persist the full CV table; the file name depends on the active dataset variant.
csv_by_dataset = {'new': './CV_SVM_new_data2.csv',
                  'old': './CV_SVM_data2.csv',
                  'deck': './CV_SVM_deck_data2.csv'}
if dataset in csv_by_dataset:
    results_df.to_csv(csv_by_dataset[dataset])

results_df['error'] = results_df['std_test_score']/2

# Hide timing and per-fold columns for display. The per-fold columns are
# selected by pattern rather than hard-coded as split0..split10, so this keeps
# working when cv_k_global changes or when train scores are not recorded.
hidden_cols = ['mean_score_time', 'params', 'std_fit_time', 'std_score_time', 'std_train_score']
hidden_cols += list(results_df.filter(regex=r'^split\d+_').columns)
results_df.sort_values('rank_test_score', axis=0).drop(hidden_cols, axis=1, errors='ignore').head(10)

Fitting 11 folds for each of 294 candidates, totalling 3234 fits


[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done 688 tasks      | elapsed:   10.8s
[Parallel(n_jobs=-1)]: Done 1418 tasks      | elapsed:   33.5s
[Parallel(n_jobs=-1)]: Done 2049 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 2499 tasks      | elapsed: 16.2min
[Parallel(n_jobs=-1)]: Done 3049 tasks      | elapsed: 41.8min
[Parallel(n_jobs=-1)]: Done 3234 out of 3234 | elapsed: 52.3min finished


Unnamed: 0,mean_fit_time,mean_test_score,mean_train_score,param_C,param_decision_function_shape,param_degree,param_kernel,rank_test_score,std_test_score,error
21,0.086145,0.83165,0.875533,100.001,ovr,3,poly,1,0.032675,0.016337
17,0.077205,0.83165,0.875533,100.001,ovo,3,poly,1,0.032675,0.016337
9,0.057089,0.830527,0.863749,50.0009,ovo,3,poly,3,0.034634,0.017317
13,0.056349,0.830527,0.863749,50.0009,ovr,3,poly,3,0.034634,0.017317
10,0.048519,0.828283,0.868912,50.0009,ovo,4,poly,5,0.027863,0.013932
14,0.057305,0.828283,0.868912,50.0009,ovr,4,poly,5,0.027863,0.013932
22,0.073787,0.826038,0.880696,100.001,ovr,4,poly,7,0.030304,0.015152
20,0.109101,0.826038,0.857464,100.001,ovr,2,poly,7,0.036028,0.018014
18,0.072026,0.826038,0.880696,100.001,ovo,4,poly,7,0.030304,0.015152
8,0.062137,0.826038,0.848822,50.0009,ovo,2,poly,7,0.032807,0.016404


### Data Type 3

In [15]:
# Exhaustive grid search of an SVC on the "data type 3" features.
# `.values` replaces `.as_matrix()`, which is deprecated and was removed in pandas 1.0.
X = data_3.drop('Survived', axis=1).values
Y = data_3['Survived'].values
svm_class = svm.SVC()

# Two sub-grids: `degree` is only searched together with the polynomial kernel.
# NOTE(review): if 'Survived' is a binary label, `decision_function_shape`
# presumably has no effect on predictions and only doubles the number of
# candidates -- confirm against the SVC docs before removing it.
C_values = np.linspace(0.001, 1000, 21)
param_grid = [
    {'C': C_values,
     'kernel': ['poly'],
     'degree': [2, 3, 4, 5],
     'decision_function_shape': ['ovo', 'ovr']},
    {'C': C_values,
     'kernel': ['rbf', 'linear', 'sigmoid'],
     'decision_function_shape': ['ovo', 'ovr']}
    ]
gscv = GridSearchCV(svm_class, param_grid, scoring='accuracy', cv=cv_k_global, verbose=1, n_jobs=-1)
gscv.fit(X, Y)
results_df = pd.DataFrame(gscv.cv_results_)

# Disabled alternative: randomized search with C drawn from a uniform distribution.
# param_distributions =[
#     {'C':uniform(0,1000),
#     'kernel':['poly'],
#     'degree':[2,3,4,5],
#     'decision_function_shape':['ovo','ovr']},
#      {'C':uniform(0,1000),
#     'kernel':['rbf','linear','sigmoid'],
#     'decision_function_shape':['ovo','ovr']}
#     ]

# param_distributions ={'C':uniform(0.001,1000),
#     'kernel':['poly'],
#     'degree':[2,3,4,5,6],
#     'decision_function_shape':['ovo','ovr']}

# rscv=RandomizedSearchCV(estimator=svm_class,
#                   param_distributions=param_distributions,
#                   scoring='accuracy',
#                   cv=cv_k_global,
#                   verbose=1,
#                   n_jobs=-1,
#                   n_iter=1000)
# rscv.fit(X,Y)
# results_df=pd.DataFrame(rscv.cv_results_)

# Persist the full CV table; the file name depends on the active dataset variant.
csv_by_dataset = {'new': './CV_SVM_new_data3.csv',
                  'old': './CV_SVM_data3.csv',
                  'deck': './CV_SVM_deck_data3.csv'}
if dataset in csv_by_dataset:
    results_df.to_csv(csv_by_dataset[dataset])

results_df['error'] = results_df['std_test_score']/2

# Hide timing and per-fold columns for display. The per-fold columns are
# selected by pattern rather than hard-coded as split0..split10, so this keeps
# working when cv_k_global changes or when train scores are not recorded.
hidden_cols = ['mean_score_time', 'params', 'std_fit_time', 'std_score_time', 'std_train_score']
hidden_cols += list(results_df.filter(regex=r'^split\d+_').columns)
results_df.sort_values('rank_test_score', axis=0).drop(hidden_cols, axis=1, errors='ignore').head(10)

Fitting 11 folds for each of 294 candidates, totalling 3234 fits


[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done 688 tasks      | elapsed:    9.6s
[Parallel(n_jobs=-1)]: Done 1496 tasks      | elapsed:   30.9s
[Parallel(n_jobs=-1)]: Done 2092 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 2542 tasks      | elapsed:  6.6min
[Parallel(n_jobs=-1)]: Done 3092 tasks      | elapsed: 19.3min
[Parallel(n_jobs=-1)]: Done 3234 out of 3234 | elapsed: 23.7min finished


Unnamed: 0,mean_fit_time,mean_test_score,mean_train_score,param_C,param_decision_function_shape,param_degree,param_kernel,rank_test_score,std_test_score,error
12,0.060453,0.832772,0.856454,50.0009,ovr,2,poly,1,0.026188,0.013094
20,0.089885,0.832772,0.865882,100.001,ovr,2,poly,1,0.027299,0.013649
16,0.092751,0.832772,0.865882,100.001,ovo,2,poly,1,0.027299,0.013649
8,0.064571,0.832772,0.856454,50.0009,ovo,2,poly,1,0.026188,0.013094
33,0.095593,0.830527,0.89798,200.001,ovo,3,poly,5,0.037721,0.01886
38,0.076528,0.830527,0.899776,200.001,ovr,4,poly,5,0.041878,0.020939
37,0.089588,0.830527,0.89798,200.001,ovr,3,poly,5,0.037721,0.01886
34,0.083811,0.830527,0.899776,200.001,ovo,4,poly,5,0.041878,0.020939
46,0.103822,0.829405,0.901347,250.001,ovr,4,poly,9,0.035709,0.017854
51,0.080288,0.829405,0.901235,300.001,ovo,5,poly,9,0.035631,0.017815


## DNN in Keras

In [3]:
def my_DNN_classifier(in_layer,optimizer='adam',neurons=64,dropout=0.0,activation='relu',loss='binary_crossentropy',activation_final='sigmoid',shape='one'):
    """Build and compile a fully connected Keras ``Sequential`` model.

    The network always starts with one ``Dense`` layer (fed by ``in_layer``
    inputs) followed by ``Dropout``, and always ends with a single-unit
    ``Dense`` output layer. ``shape`` selects how many additional
    ``Dense`` + ``Dropout`` pairs are stacked in between.

    Parameters
    ----------
    in_layer : int
        Passed to the first ``Dense`` layer as ``input_dim``.
    optimizer : optional
        Forwarded to ``model.compile(optimizer=...)``.
    neurons : int, optional
        Number of units in every non-output ``Dense`` layer.
    dropout : float, optional
        Rate passed to every ``Dropout`` layer.
    activation : str, optional
        Activation used by every non-output ``Dense`` layer.
    loss : str, optional
        Forwarded to ``model.compile(loss=...)``.
    activation_final : str, optional
        Activation of the single-unit output layer.
    shape : {'none', 'one', 'two'}, optional
        Number of additional ``Dense`` + ``Dropout`` pairs: 0, 1 or 2.

    Returns
    -------
    The compiled ``Sequential`` model (compiled with ``metrics=['accuracy']``).

    Raises
    ------
    ValueError
        If ``shape`` is not one of ``'none'``, ``'one'`` or ``'two'``.
        An unknown value used to be silently treated like ``'none'``, which
        let a typo in a hyper-parameter grid go unnoticed.
    """
    extra_pairs_by_shape = {'none': 0, 'one': 1, 'two': 2}
    # Validate before building anything so a bad grid value fails fast.
    if shape not in extra_pairs_by_shape:
        raise ValueError(
            "Unknown shape {!r}; expected one of {}".format(shape, sorted(extra_pairs_by_shape)))

    model = Sequential()

    # Input-facing block: the only layer that needs to know the feature count.
    model.add(Dense(neurons, input_dim=in_layer, kernel_initializer='normal',activation=activation))
    model.add(Dropout(dropout))

    # Optional additional hidden blocks, identical to each other.
    for _ in range(extra_pairs_by_shape[shape]):
        model.add(Dense(neurons, kernel_initializer='normal',activation=activation))
        model.add(Dropout(dropout))

    # Single-unit output layer.
    model.add(Dense(1, kernel_initializer='normal', activation=activation_final))
    model.compile(loss=loss, metrics=['accuracy'],optimizer=optimizer)
    return model

### Data Type 1

In [17]:
# Split data_1 into the feature matrix and the target vector.
# `.values` replaces `.as_matrix()`, which pandas deprecated in 0.23 and removed in 1.0.
X=data_1.drop('Survived',axis=1).values
Y=data_1['Survived'].values

# scikit-learn compatible wrapper around the Keras model builder, so it can be
# tuned with GridSearchCV like any other estimator.
classifier = KerasClassifier(build_fn=my_DNN_classifier,verbose=0)

# Only the optimizer (5 values) and the dropout rate (11 values) vary, giving
# 55 candidates; every other hyper-parameter is pinned to a single value.
# The trailing comments list alternative value sets for each entry.
# NOTE(review): np.linspace(0,1,11) includes dropout=1.0 as its last value —
# confirm that a rate of 1.0 is intended for the Dropout layers.
param_grid ={'batch_size': [1024],
                      'epochs': [1000],
                      'in_layer' :[X.shape[1]],
                      'optimizer': ['adam','adagrad','rmsprop','adadelta','nadam'],# ['adam','adagrad']
                      'dropout' : np.linspace(0,1,11),#[0.0,0.1,0.2,0.3,0.4,0.5]
                      'neurons' : [32],#sp_randint(1, 101)
                      'activation' : ['relu'],#['relu','sigmoid','tanh']
                      'activation_final' : ['sigmoid'],#['relu','sigmoid','tanh']
                      'loss' : ['binary_crossentropy'],#['sparse_categorical_crossentropy','binary_crossentropy']
                      'shape':['two']
                     }

# return_train_score is requested explicitly: the summary shown further down
# in this cell drops the *_train_score columns, which are only present in
# cv_results_ when train scores are computed.
gscv=GridSearchCV(estimator=classifier,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=5,
                  return_train_score=True,
                  verbose=1)
gscv.fit(X,Y)
results_df=pd.DataFrame(gscv.cv_results_)


# param_distributions ={'batch_size': [1024],
#                       'epochs': [1000],
#                       'in_layer' :[X.shape[1]],
#                       'optimizer': ['adadelta'],# ['adam','adagrad']
#                       'dropout' : np.linspace(0.3,0.7,5),#[0.0,0.1,0.2,0.3,0.4,0.5]
#                       'neurons' : [32],#sp_randint(1, 101)
#                       'activation' : ['relu'],#['relu','sigmoid','tanh']
#                       'activation_final' : ['sigmoid'],#['relu','sigmoid','tanh']
#                       'loss' : ['binary_crossentropy'],#['sparse_categorical_crossentropy','binary_crossentropy'],)
#                       'shape':['two']
#                      }
# rscv=RandomizedSearchCV(estimator=classifier,
#                   param_distributions=param_distributions,
#                   scoring='accuracy',
#                   cv=5,
#                   verbose=1,
#                   #n_jobs=-1,
#                   n_iter=5)
# rscv.fit(X,Y)
# results_df=pd.DataFrame(rscv.cv_results_)

# Extra column: half of the across-fold standard deviation of the test score.
results_df['error'] = results_df['std_test_score'] / 2

# Write the full CV table to the file that belongs to the active dataset
# variant; nothing is written when `dataset` matches none of the tags.
for variant_tag, csv_path in (('new', './CV_DNN_new_data1.csv'),
                              ('old', './CV_DNN_data1.csv'),
                              ('deck', './CV_DNN_deck_data1.csv')):
    if dataset == variant_tag:
        results_df.to_csv(csv_path)

# Columns left out of the summary view: the parameters that are constant
# across the grid, the raw params dict, per-fold scores and spread statistics.
dnn_hidden_columns = [
    'param_epochs', 'param_loss', 'param_batch_size', 'param_in_layer',
    'mean_score_time', 'params',
    'split0_test_score', 'split0_train_score',
    'split1_test_score', 'split1_train_score',
    'split2_test_score', 'split2_train_score',
    'split3_test_score', 'split3_train_score',
    'split4_test_score', 'split4_train_score',
    'std_fit_time', 'std_score_time', 'std_train_score',
]

# Best-ranked candidates first; the last expression is rendered as the cell output.
dnn_ranked_results = results_df.sort_values('rank_test_score', axis=0)
dnn_ranked_results.drop(dnn_hidden_columns, axis=1).head(10)

Fitting 5 folds for each of 55 candidates, totalling 275 fits


[Parallel(n_jobs=1)]: Done 275 out of 275 | elapsed: 64.3min finished


Unnamed: 0,mean_fit_time,mean_test_score,mean_train_score,param_activation,param_activation_final,param_dropout,param_neurons,param_optimizer,param_shape,rank_test_score,std_test_score,error
19,10.354967,0.830527,0.914144,relu,sigmoid,0.3,32,nadam,two,1,0.029399,0.014699
24,11.564893,0.829405,0.903199,relu,sigmoid,0.4,32,nadam,two,2,0.03215,0.016075
15,8.887065,0.829405,0.888328,relu,sigmoid,0.3,32,adam,two,2,0.041589,0.020795
8,7.516866,0.828283,0.87542,relu,sigmoid,0.1,32,adadelta,two,4,0.028228,0.014114
27,11.08298,0.826038,0.877948,relu,sigmoid,0.5,32,rmsprop,two,5,0.028554,0.014277
10,7.90234,0.824916,0.892536,relu,sigmoid,0.2,32,adam,two,6,0.031705,0.015852
22,9.770357,0.823793,0.883559,relu,sigmoid,0.4,32,rmsprop,two,7,0.026413,0.013207
23,11.194319,0.823793,0.879351,relu,sigmoid,0.4,32,adadelta,two,7,0.021748,0.010874
12,7.658233,0.823793,0.891133,relu,sigmoid,0.2,32,rmsprop,two,7,0.03582,0.01791
14,9.461951,0.823793,0.914982,relu,sigmoid,0.2,32,nadam,two,7,0.034566,0.017283


### Data Type 2

In [5]:
# Split data_2 into the feature matrix and the target vector.
# `.values` replaces `.as_matrix()`, which pandas deprecated in 0.23 and removed in 1.0.
X=data_2.drop('Survived',axis=1).values
Y=data_2['Survived'].values

# scikit-learn compatible wrapper around the Keras model builder, so it can be
# tuned with GridSearchCV like any other estimator.
classifier = KerasClassifier(build_fn=my_DNN_classifier,verbose=0)

# Only the optimizer (5 values) and the dropout rate (11 values) vary, giving
# 55 candidates; every other hyper-parameter is pinned to a single value.
# The trailing comments list alternative value sets for each entry.
# NOTE(review): np.linspace(0,1,11) includes dropout=1.0 as its last value —
# confirm that a rate of 1.0 is intended for the Dropout layers.
param_grid ={'batch_size': [1024],
                      'epochs': [1000],
                      'in_layer' :[X.shape[1]],
                      'optimizer': ['adam','adagrad','rmsprop','adadelta','nadam'],# ['adam','adagrad']
                      'dropout' : np.linspace(0,1,11),#[0.0,0.1,0.2,0.3,0.4,0.5]
                      'neurons' : [32],#sp_randint(1, 101)
                      'activation' : ['relu'],#['relu','sigmoid','tanh']
                      'activation_final' : ['sigmoid'],#['relu','sigmoid','tanh']
                      'loss' : ['binary_crossentropy'],#['sparse_categorical_crossentropy','binary_crossentropy']
                      'shape':['two']
                     }

# return_train_score is requested explicitly: the summary shown further down
# in this cell drops the *_train_score columns, which are only present in
# cv_results_ when train scores are computed.
gscv=GridSearchCV(estimator=classifier,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=5,
                  return_train_score=True,
                  verbose=1)
gscv.fit(X,Y)
results_df=pd.DataFrame(gscv.cv_results_)


# param_distributions ={'batch_size': [1024],
#                       'epochs': [1000],
#                       'in_layer' :[X.shape[1]],
#                       'optimizer': ['adadelta'],# ['adam','adagrad']
#                       'dropout' : np.linspace(0.3,0.7,5),#[0.0,0.1,0.2,0.3,0.4,0.5]
#                       'neurons' : [32],#sp_randint(1, 101)
#                       'activation' : ['relu'],#['relu','sigmoid','tanh']
#                       'activation_final' : ['sigmoid'],#['relu','sigmoid','tanh']
#                       'loss' : ['binary_crossentropy'],#['sparse_categorical_crossentropy','binary_crossentropy'],)
#                       'shape':['two']
#                      }
# rscv=RandomizedSearchCV(estimator=classifier,
#                   param_distributions=param_distributions,
#                   scoring='accuracy',
#                   cv=5,
#                   verbose=1,
#                   #n_jobs=-1,
#                   n_iter=5)
# rscv.fit(X,Y)
# results_df=pd.DataFrame(rscv.cv_results_)

# Extra column: half of the across-fold standard deviation of the test score.
results_df['error'] = results_df['std_test_score'] / 2

# Write the full CV table to the file that belongs to the active dataset
# variant; nothing is written when `dataset` matches none of the tags.
for variant_tag, csv_path in (('new', './CV_DNN_new_data2.csv'),
                              ('old', './CV_DNN_data2.csv'),
                              ('deck', './CV_DNN_deck_data2.csv')):
    if dataset == variant_tag:
        results_df.to_csv(csv_path)

# Columns left out of the summary view: the parameters that are constant
# across the grid, the raw params dict, per-fold scores and spread statistics.
dnn_hidden_columns = [
    'param_epochs', 'param_loss', 'param_batch_size', 'param_in_layer',
    'mean_score_time', 'params',
    'split0_test_score', 'split0_train_score',
    'split1_test_score', 'split1_train_score',
    'split2_test_score', 'split2_train_score',
    'split3_test_score', 'split3_train_score',
    'split4_test_score', 'split4_train_score',
    'std_fit_time', 'std_score_time', 'std_train_score',
]

# Best-ranked candidates first; the last expression is rendered as the cell output.
dnn_ranked_results = results_df.sort_values('rank_test_score', axis=0)
dnn_ranked_results.drop(dnn_hidden_columns, axis=1).head(10)

Fitting 5 folds for each of 55 candidates, totalling 275 fits


[Parallel(n_jobs=1)]: Done 275 out of 275 | elapsed: 64.6min finished


Unnamed: 0,mean_fit_time,mean_test_score,mean_train_score,param_activation,param_activation_final,param_dropout,param_neurons,param_optimizer,param_shape,rank_test_score,std_test_score,error
25,11.18473,0.829405,0.899274,relu,sigmoid,0.5,32,adam,two,1,0.031478,0.015739
17,8.828118,0.82716,0.904322,relu,sigmoid,0.3,32,rmsprop,two,2,0.039962,0.019981
10,8.002001,0.824916,0.918632,relu,sigmoid,0.2,32,adam,two,3,0.027108,0.013554
28,12.622214,0.824916,0.894505,relu,sigmoid,0.5,32,adadelta,two,3,0.036056,0.018028
18,9.863065,0.820426,0.904604,relu,sigmoid,0.3,32,adadelta,two,5,0.031453,0.015727
27,11.110935,0.819304,0.900114,relu,sigmoid,0.5,32,rmsprop,two,6,0.034111,0.017055
11,7.158175,0.819304,0.888889,relu,sigmoid,0.2,32,adagrad,two,6,0.04062,0.02031
33,13.959143,0.819304,0.881311,relu,sigmoid,0.6,32,adadelta,two,6,0.032562,0.016281
6,6.19076,0.819304,0.893939,relu,sigmoid,0.1,32,adagrad,two,6,0.029043,0.014521
5,6.906888,0.818182,0.921437,relu,sigmoid,0.1,32,adam,two,10,0.02735,0.013675


### Data Type 3

In [None]:
# Split data_3 into the feature matrix and the target vector.
# `.values` replaces `.as_matrix()`, which pandas deprecated in 0.23 and removed in 1.0.
X=data_3.drop('Survived',axis=1).values
Y=data_3['Survived'].values

# scikit-learn compatible wrapper around the Keras model builder, so it can be
# tuned with GridSearchCV like any other estimator.
classifier = KerasClassifier(build_fn=my_DNN_classifier,verbose=0)

# Only the optimizer (5 values) and the dropout rate (11 values) vary, giving
# 55 candidates; every other hyper-parameter is pinned to a single value.
# The trailing comments list alternative value sets for each entry.
# NOTE(review): np.linspace(0,1,11) includes dropout=1.0 as its last value —
# confirm that a rate of 1.0 is intended for the Dropout layers.
param_grid ={'batch_size': [1024],
                      'epochs': [1000],
                      'in_layer' :[X.shape[1]],
                      'optimizer': ['adam','adagrad','rmsprop','adadelta','nadam'],# ['adam','adagrad']
                      'dropout' : np.linspace(0,1,11),#[0.0,0.1,0.2,0.3,0.4,0.5]
                      'neurons' : [32],#sp_randint(1, 101)
                      'activation' : ['relu'],#['relu','sigmoid','tanh']
                      'activation_final' : ['sigmoid'],#['relu','sigmoid','tanh']
                      'loss' : ['binary_crossentropy'],#['sparse_categorical_crossentropy','binary_crossentropy']
                      'shape':['two']
                     }

# return_train_score is requested explicitly: the summary shown further down
# in this cell drops the *_train_score columns, which are only present in
# cv_results_ when train scores are computed.
gscv=GridSearchCV(estimator=classifier,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=5,
                  return_train_score=True,
                  verbose=1)
gscv.fit(X,Y)
results_df=pd.DataFrame(gscv.cv_results_)


# param_distributions ={'batch_size': [1024],
#                       'epochs': [1000],
#                       'in_layer' :[X.shape[1]],
#                       'optimizer': ['adadelta'],# ['adam','adagrad']
#                       'dropout' : np.linspace(0.3,0.7,5),#[0.0,0.1,0.2,0.3,0.4,0.5]
#                       'neurons' : [32],#sp_randint(1, 101)
#                       'activation' : ['relu'],#['relu','sigmoid','tanh']
#                       'activation_final' : ['sigmoid'],#['relu','sigmoid','tanh']
#                       'loss' : ['binary_crossentropy'],#['sparse_categorical_crossentropy','binary_crossentropy'],)
#                       'shape':['two']
#                      }
# rscv=RandomizedSearchCV(estimator=classifier,
#                   param_distributions=param_distributions,
#                   scoring='accuracy',
#                   cv=5,
#                   verbose=1,
#                   #n_jobs=-1,
#                   n_iter=5)
# rscv.fit(X,Y)
# results_df=pd.DataFrame(rscv.cv_results_)

# Extra column: half of the across-fold standard deviation of the test score.
results_df['error'] = results_df['std_test_score'] / 2

# Write the full CV table to the file that belongs to the active dataset
# variant; nothing is written when `dataset` matches none of the tags.
for variant_tag, csv_path in (('new', './CV_DNN_new_data3.csv'),
                              ('old', './CV_DNN_data3.csv'),
                              ('deck', './CV_DNN_deck_data3.csv')):
    if dataset == variant_tag:
        results_df.to_csv(csv_path)

# Columns left out of the summary view: the parameters that are constant
# across the grid, the raw params dict, per-fold scores and spread statistics.
dnn_hidden_columns = [
    'param_epochs', 'param_loss', 'param_batch_size', 'param_in_layer',
    'mean_score_time', 'params',
    'split0_test_score', 'split0_train_score',
    'split1_test_score', 'split1_train_score',
    'split2_test_score', 'split2_train_score',
    'split3_test_score', 'split3_train_score',
    'split4_test_score', 'split4_train_score',
    'std_fit_time', 'std_score_time', 'std_train_score',
]

# Best-ranked candidates first; the last expression is rendered as the cell output.
dnn_ranked_results = results_df.sort_values('rank_test_score', axis=0)
dnn_ranked_results.drop(dnn_hidden_columns, axis=1).head(10)

Fitting 5 folds for each of 55 candidates, totalling 275 fits
