# Intro

In this notebook, we take the features we created in "Features.ipynb" and use them to train a series of models. 

# Load packages

In [1]:
# Saving models
import joblib

# Data manipulation
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectPercentile
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PolynomialFeatures
from sklearn.decomposition import PCA

# Model building tools
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedShuffleSplit, RandomizedSearchCV, GridSearchCV
from scipy import stats

# Classifiers
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier#,StackingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
import warnings
warnings.filterwarnings('ignore',category=DeprecationWarning)
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.wrappers.scikit_learn import KerasClassifier

# Load data

Loading the whole dataset processed in "Features.ipynb"

In [2]:
# Read the processed feature table, using the passenger id as the row index
df = pd.read_csv(
    './data/df_full.csv',
    index_col='PassengerId',
)

Recovering the training set: the rows with a non-null `Survived` label

In [3]:
# Training mask: True for rows that have a 'Survived' label
tr = df['Survived'].notnull()

# Training data
# .loc picks rows and column in a single step instead of chained indexing
y = df.loc[tr, 'Survived'].to_numpy().astype(int)
# `columns=` replaces the positional axis argument (`drop('Survived', 1)`),
# which pandas deprecated and no longer accepts from 2.0 onwards
X = df[tr].drop(columns='Survived')

# Model fitting setup

First, we define a series of data-scaling and feature selection pipelines to search over

In [4]:
# Only the numerical columns are rescaled; the remaining columns have
# already been one-hot encoded and are passed through unchanged
num_cols = ['Age','SibSp','Parch','Pclass','TicketFreq','FareBin']

# Candidate scaling steps (None = leave the data unscaled)
minmax_sc = ColumnTransformer(
    transformers=[('num', MinMaxScaler(), num_cols)],
    remainder='passthrough',
)
standard_sc = ColumnTransformer(
    transformers=[('num', StandardScaler(), num_cols)],
    remainder='passthrough',
)
scalers = [None, minmax_sc, standard_sc]

# Candidate feature selection/transformation steps
feature_transforms = [
    # original features
    None,
    # keeps highest % based on univariate statistical tests
    SelectPercentile(percentile=75),
    # first 75% PCA components
    PCA(n_components=round(X.shape[1]*0.75)),
    # 2-deg and 3-deg polynomial interaction terms
    *[PolynomialFeatures(degree=deg, interaction_only=True) for deg in (2, 3)],
]

# Default CV scheme: 10 stratified shuffle splits, 80% of the rows for training
cross_validator = StratifiedShuffleSplit(n_splits=10, train_size=0.8, random_state=49)

Create some utility functions for performing the CV

In [5]:
def print_results(best_model_pipeline, X,y):
    '''
    Report the winning configuration of a fitted hyper-parameter search.

    Prints, in order, the first three steps of the search's `best_estimator_`
    (data pipeline, feature engineering, classifier), the value returned by
    `best_model_pipeline.score(X, y)`, and `best_model_pipeline.best_score_`.
    Nothing is returned.
    '''
    winner = best_model_pipeline.best_estimator_

    # (banner, getter) pairs; the getters are lazy so each value is only
    # computed right before its own section is printed
    sections = (
        ("#------------ Best Data Pipeline found in RandomSearchCV  ------------#\n\n",
         lambda: winner[0]),
        ("\n\n#------------ Best Feature Engineering technique found in RandomSearchCV  ------------#\n\n",
         lambda: winner[1]),
        ("\n\n#------------ Best Classifier found in RandomSearchCV  ------------#\n\n",
         lambda: winner[2]),
        ("\n\n#------------ Best Estimator's Accuracy Score on training set ------------#\n\n",
         lambda: best_model_pipeline.score(X,y)),
        ("\n\n#------------ Best Estimator's average Accuracy Score on CV (validation set) -------------#\n\n",
         lambda: best_model_pipeline.best_score_),
    )
    for banner, fetch in sections:
        print(banner, fetch())
    
def my_CV(clf_params, scalers = scalers, features = feature_transforms,
         cv = None,n_iter=50,n_jobs=32,verbose=False, search_func=RandomizedSearchCV):
    '''
    Builds and tunes a scaler -> feature transform -> classifier pipeline
    with a cross-validated hyper-parameter search.

    Data are first scaled by one of the column transformers in "scalers", then
    transformed by one of the methods in "features", and finally classified by
    a classifier defined in clf_params. The search is fitted on the
    notebook-level training data X and y (they are NOT parameters of this
    function, so they must be defined before it is called).

    Parameters
    ----------
    clf_params : dict
        Search space for the classifier step: a 'clf' entry with the candidate
        classifiers plus any 'clf__<param>' entries.
    scalers : list
        Candidate values for the 'scaler' step.
    features : list
        Candidate values for the 'feature_transf' step.
    cv : cross-validation splitter, optional
        Defaults to StratifiedShuffleSplit(n_splits=10, train_size=0.8,
        random_state=49).
    n_iter : int or None
        Number of sampled configurations for a randomized search. None requests
        an exhaustive grid search.
    n_jobs, verbose
        Passed through to the search object.
    search_func : class
        Search class to instantiate (RandomizedSearchCV or GridSearchCV).

    Returns
    -------
    The fitted search object. Chosen parameters and results are printed.
    '''

    # Initializing default pipeline with a scaler step, feature transform step
    # and classifier. These specific values are just placeholders that will
    # be replaced with values in params_grid
    pipe = Pipeline(steps = [
    ('scaler', standard_sc),
    ('feature_transf', PCA()),
    ('clf', LogisticRegression())
    ])

    # Parameter space to search over
    params_grid = {
     'scaler' : scalers,
     'feature_transf': features
    }
    params_grid.update(clf_params)

    # Default CV scheme
    if cv is None:
        cv = StratifiedShuffleSplit(n_splits=10, train_size=0.8, random_state=49)

    # The two search classes take different keywords (param_grid vs
    # param_distributions/n_iter/random_state), so the keyword set and the
    # class must agree. Previously n_iter=None with the default
    # RandomizedSearchCV, or GridSearchCV with an integer n_iter, raised a
    # TypeError for an unexpected keyword argument.
    exhaustive = n_iter is None or search_func is GridSearchCV
    if exhaustive:
        if search_func is RandomizedSearchCV:
            # n_iter=None asks for a full grid, which RandomizedSearchCV cannot do
            search_func = GridSearchCV
        kwargs = {'param_grid' : params_grid}
    else:
        kwargs = {'param_distributions' : params_grid,
                  'n_iter' : n_iter,
                  'random_state' : 7}

    # Search parameters and choose the pipeline with the highest CV accuracy
    best_model_pipeline = search_func(estimator=pipe,
                                      scoring='accuracy',
                                      refit='accuracy',
                                      n_jobs=n_jobs,
                                      cv=cv,
                                      verbose=verbose,
                                      **kwargs)
    # X and y are the notebook-level training features and labels
    best_model_pipeline.fit(X, y)
    print_results(best_model_pipeline,X,y)

    return best_model_pipeline

# Models

Here, we fit the actual models using the utility functions we built

## Logistic Regression

In [6]:
# Randomized search over both penalty types and the regularization parameter C
logreg_pipe = my_CV(
    clf_params=dict(
        # Need to specify different solver for l1 vs l2
        clf=[
            LogisticRegression(solver='saga', penalty='l1'),
            LogisticRegression(solver='lbfgs', penalty='l2'),
        ],
        clf__C=stats.loguniform(0.005, 1),
    ),
    n_iter=500,
)

#------------ Best Data Pipeline found in RandomSearchCV  ------------#

 ColumnTransformer(n_jobs=None, remainder='passthrough', sparse_threshold=0.3,
                  transformer_weights=None,
                  transformers=[('num',
                                 StandardScaler(copy=True, with_mean=True,
                                                with_std=True),
                                 ['Age', 'SibSp', 'Parch', 'Pclass',
                                  'TicketFreq', 'FareBin'])],
                  verbose=False)


#------------ Best Feature Engineering technique found in RandomSearchCV  ------------#

 PolynomialFeatures(degree=2, include_bias=True, interaction_only=True,
                   order='C')


#------------ Best Classifier found in RandomSearchCV  ------------#

 LogisticRegression(C=0.4344110413285817, class_weight=None, dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='wa

In [7]:
# Persist the refit best logistic-regression pipeline to disk
joblib.dump(
    value=logreg_pipe.best_estimator_,
    filename='models/best_logreg.joblib',
);

## K-Nearest Neighbors

In [8]:
# Randomized search over neighborhood size, vote weighting and distance metric
# NOTE(review): with KNeighborsClassifier's default p=2, 'minkowski' is the same
# distance as 'euclidean', so the two metric options look redundant — confirm
knn_pipe = my_CV(
    clf_params=dict(
        clf=[KNeighborsClassifier()],
        clf__n_neighbors=stats.randint(1, 50),
        clf__weights=['uniform', 'distance'],
        clf__metric=['minkowski', 'euclidean'],
    ),
    n_iter=500,
)

#------------ Best Data Pipeline found in RandomSearchCV  ------------#

 ColumnTransformer(n_jobs=None, remainder='passthrough', sparse_threshold=0.3,
                  transformer_weights=None,
                  transformers=[('num',
                                 MinMaxScaler(copy=True, feature_range=(0, 1)),
                                 ['Age', 'SibSp', 'Parch', 'Pclass',
                                  'TicketFreq', 'FareBin'])],
                  verbose=False)


#------------ Best Feature Engineering technique found in RandomSearchCV  ------------#

 PCA(copy=True, iterated_power='auto', n_components=19, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False)


#------------ Best Classifier found in RandomSearchCV  ------------#

 KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='euclidean',
                     metric_params=None, n_jobs=None, n_neighbors=31, p=2,
                     weights='distance')


#------------ Best Estimator's Accuracy Scor

In [9]:
# Persist the refit best KNN pipeline to disk
joblib.dump(
    value=knn_pipe.best_estimator_,
    filename='models/best_knn.joblib',
);

## SVC

In [10]:
# Randomized search over the regularization parameter C and the kernel type;
# the SVC is created with probability=True
svc_pipe = my_CV(
    clf_params=dict(
        clf=[SVC(probability=True)],
        clf__C=stats.loguniform(0.01, 10),
        clf__kernel=['linear', 'poly', 'rbf'],
    ),
    n_iter=500,
)

#------------ Best Data Pipeline found in RandomSearchCV  ------------#

 ColumnTransformer(n_jobs=None, remainder='passthrough', sparse_threshold=0.3,
                  transformer_weights=None,
                  transformers=[('num',
                                 StandardScaler(copy=True, with_mean=True,
                                                with_std=True),
                                 ['Age', 'SibSp', 'Parch', 'Pclass',
                                  'TicketFreq', 'FareBin'])],
                  verbose=False)


#------------ Best Feature Engineering technique found in RandomSearchCV  ------------#

 PolynomialFeatures(degree=2, include_bias=True, interaction_only=True,
                   order='C')


#------------ Best Classifier found in RandomSearchCV  ------------#

 SVC(C=0.012781635090469897, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=True,

In [11]:
# Persist the refit best SVC pipeline to disk
joblib.dump(
    value=svc_pipe.best_estimator_,
    filename='models/best_svc.joblib',
);

## Random forest

In [12]:
# Randomized search over forest size, split criterion, number of features
# considered per split and tree depth
rf_pipe = my_CV(
    clf_params = {
        'clf' : [RandomForestClassifier(random_state=7)],
        'clf__n_estimators': stats.randint(10, 175),
        'clf__criterion' : ['gini', 'entropy'],
        # "sqrt" replaces "auto": for classifiers "auto" meant sqrt(n_features),
        # and recent scikit-learn releases no longer accept "auto"
        'clf__max_features': [None, "sqrt", "log2"],
        # Depths 1-5, plus None for unrestricted depth
        'clf__max_depth': list(np.arange(1,6,dtype=int)) + [None]
    },
    n_iter=500)

#------------ Best Data Pipeline found in RandomSearchCV  ------------#

 ColumnTransformer(n_jobs=None, remainder='passthrough', sparse_threshold=0.3,
                  transformer_weights=None,
                  transformers=[('num',
                                 StandardScaler(copy=True, with_mean=True,
                                                with_std=True),
                                 ['Age', 'SibSp', 'Parch', 'Pclass',
                                  'TicketFreq', 'FareBin'])],
                  verbose=False)


#------------ Best Feature Engineering technique found in RandomSearchCV  ------------#

 None


#------------ Best Classifier found in RandomSearchCV  ------------#

 RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=5, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,


In [13]:
# Persist the refit best random-forest pipeline to disk
joblib.dump(
    value=rf_pipe.best_estimator_,
    filename='models/best_rf.joblib',
);

## XGBoost

In [14]:
# Randomized search over the number of boosting rounds, eta, tree depth and gamma
xgb_pipe = my_CV(
    clf_params=dict(
        clf=[XGBClassifier()],
        clf__n_estimators=stats.randint(5, 125),
        clf__eta=stats.loguniform(0.01, 1),
        # None leaves max_depth unset; otherwise depths 1-6
        clf__max_depth=[None] + list(np.arange(1, 7, dtype=int)),
        clf__gamma=stats.loguniform(0.01, 1),
    ),
    n_iter=500,
)

#------------ Best Data Pipeline found in RandomSearchCV  ------------#

 ColumnTransformer(n_jobs=None, remainder='passthrough', sparse_threshold=0.3,
                  transformer_weights=None,
                  transformers=[('num',
                                 StandardScaler(copy=True, with_mean=True,
                                                with_std=True),
                                 ['Age', 'SibSp', 'Parch', 'Pclass',
                                  'TicketFreq', 'FareBin'])],
                  verbose=False)


#------------ Best Feature Engineering technique found in RandomSearchCV  ------------#

 None


#------------ Best Classifier found in RandomSearchCV  ------------#

 XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eta=0.1008355757194972,
              gamma=0.7001745261557822, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.100835577,


In [15]:
# Persist the refit best XGBoost pipeline to disk
joblib.dump(
    value=xgb_pipe.best_estimator_,
    filename='models/best_xgb.joblib',
);