Load necessary libraries.

In [51]:
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder

from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import StackingClassifier

from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

from statistics import mean

# I. Model Testing on All Data

Create explanatory and response variables.

In [52]:
# Read the full training set; `outcome` is the target, all other columns are features.
df = pd.read_csv('train.csv')
X = df.drop(columns=['outcome'])
# Dummy-encode the target (first level dropped) and flatten it to a 1-D array.
y = np.ravel(pd.get_dummies(df['outcome'], drop_first=True))

Identify numeric and categorical columns.

In [53]:
# Numeric features: the float and int columns of X.
numeric_columns = X.select_dtypes(include=[float, int]).columns

# Every remaining column is treated as categorical. Walk X.columns rather than
# taking a set difference: iteration order of a set of strings changes between
# interpreter sessions (hash randomization), which would reorder the one-hot
# encoded features and make the seeded models give different results after a
# kernel restart.
categorical_columns = [col for col in X.columns if col not in numeric_columns]

Create two preprocessing pipelines: one without and one with scaling of the numeric variables.

In [54]:
# Numeric features: fill missing values with the column median (no scaling).
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
])

# Categorical features: fill missing values with the literal 'missing', then
# one-hot encode; categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('ohe', OneHotEncoder(handle_unknown='ignore')),
])

# Unscaled preprocessor: send each column group through its own transformer.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_columns),
    ('cat', categorical_transformer, categorical_columns),
])

In [55]:
# Numeric features for the scaled variant: median imputation followed by
# min-max scaling.
numeric_transformer_scale = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', MinMaxScaler()),
])

# Scaled preprocessor: same categorical branch as the unscaled one, but the
# numeric branch adds the scaler.
preprocessor_scale = ColumnTransformer(transformers=[
    ('num', numeric_transformer_scale, numeric_columns),
    ('cat', categorical_transformer, categorical_columns),
])

Load models.

In [56]:
# L1-penalised logistic regression on the scaled features.
# random_state is pinned because the liblinear solver shuffles the data
# internally; left unset, the fit (and therefore the cross-validated score)
# can differ from run to run, unlike the MLP and XGBoost models, which are
# seeded with 1.
reg = LogisticRegression(penalty='l1', solver='liblinear', C=1, random_state=1)
reg_pipeline = Pipeline(steps=[('preprocessor', preprocessor_scale), ('model', reg)])

In [57]:
# Multi-layer perceptron (hidden_layer_sizes=5) fed by the scaled preprocessor.
mlp = MLPClassifier(
    hidden_layer_sizes=5,
    alpha=0.15,
    learning_rate_init=0.01,
    max_iter=1000,
    random_state=1,
)
mlp_pipeline = Pipeline(steps=[
    ('preprocessor_scale', preprocessor_scale),
    ('model', mlp),
])

In [58]:
# Gradient-boosted trees on the unscaled preprocessor output.
xgb = XGBClassifier(
    n_estimators=114,
    learning_rate=0.05,
    max_depth=5,
    min_child_weight=5,
    gamma=0.6,
    subsample=1,
    colsample_bytree=0.9,
    reg_alpha=0.00001,
    reg_lambda=1,
    random_state=1,
    n_jobs=-1,
)
xgb_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', xgb),
])

Evaluate the stacking classifier with repeated k-fold cross-validation, scored by negative log loss.

In [59]:
# Outer splitter used to score the ensemble, and the inner splitter that
# StackingClassifier uses to generate the base-model predictions on which the
# meta-learner is trained.
rkf = RepeatedKFold(random_state=1)
kf = KFold(shuffle=True, random_state=1)

# Level-0 (base) learners.
level0 = [
    ('reg', reg_pipeline),
    ('mlp', mlp_pipeline),
    ('xgb', xgb_pipeline),
]
# Level-1 (meta) learner. 'sag' is a stochastic solver, so it is seeded like
# the splitters and the seeded base models; without a seed the reported score
# changes from one run to the next.
level1 = LogisticRegression(solver='sag', random_state=1)

clf = StackingClassifier(estimators=level0, final_estimator=level1, cv=kf, n_jobs=-1)

# Negative log loss per outer fold (closer to 0 is better); report the mean.
cv_scores = cross_val_score(estimator=clf, X=X, y=y, scoring='neg_log_loss', cv=rkf, n_jobs=-1)
print(mean(cv_scores))

-0.570537738682534


# II. Model Testing on Data With No Missing Values

Create explanatory and response variables.

In [60]:
# Read the training set without missing values; `outcome` is the target, all
# other columns are features.
df = pd.read_csv('train_no_na.csv')
X = df.drop(columns=['outcome'])
# Dummy-encode the target (first level dropped) and flatten it to a 1-D array.
y = np.ravel(pd.get_dummies(df['outcome'], drop_first=True))

Identify numeric and categorical columns.

In [61]:
# Numeric features: the float and int columns of X.
numeric_columns = X.select_dtypes(include=[float, int]).columns

# Every remaining column is treated as categorical. Walk X.columns rather than
# taking a set difference: iteration order of a set of strings changes between
# interpreter sessions (hash randomization), which would reorder the one-hot
# encoded features and make the seeded models give different results after a
# kernel restart.
categorical_columns = [col for col in X.columns if col not in numeric_columns]

Create two preprocessing pipelines: one without and one with scaling of the numeric variables.

In [62]:
# Numeric features: median imputation step only (no scaling).
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
])

# Categorical features: constant-fill with the literal 'missing', then one-hot
# encode; categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('ohe', OneHotEncoder(handle_unknown='ignore')),
])

# Unscaled preprocessor: send each column group through its own transformer.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_columns),
    ('cat', categorical_transformer, categorical_columns),
])

In [63]:
# Numeric features for the scaled variant: median imputation followed by
# min-max scaling.
numeric_transformer_scale = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', MinMaxScaler()),
])

# Scaled preprocessor: same categorical branch as the unscaled one, but the
# numeric branch adds the scaler.
preprocessor_scale = ColumnTransformer(transformers=[
    ('num', numeric_transformer_scale, numeric_columns),
    ('cat', categorical_transformer, categorical_columns),
])

Load models.

In [64]:
# Logistic regression with an elastic-net penalty on the scaled features;
# with l1_ratio=1 the penalty is pure L1.
reg = LogisticRegression(
    penalty='elasticnet',
    l1_ratio=1,
    C=1,
    solver='saga',
    random_state=1,
)
reg_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor_scale),
    ('model', reg),
])

In [65]:
# Multi-layer perceptron (hidden_layer_sizes=5) fed by the scaled preprocessor.
mlp = MLPClassifier(
    hidden_layer_sizes=5,
    alpha=0.2,
    learning_rate_init=0.01,
    max_iter=1000,
    random_state=1,
)
mlp_pipeline = Pipeline(steps=[
    ('preprocessor_scale', preprocessor_scale),
    ('model', mlp),
])

In [66]:
# Gradient-boosted trees on the unscaled preprocessor output.
xgb = XGBClassifier(
    n_estimators=481,
    learning_rate=0.01,
    max_depth=5,
    min_child_weight=3,
    gamma=0.2,
    subsample=0.9,
    colsample_bytree=1,
    reg_alpha=0.00001,
    reg_lambda=0.1,
    random_state=1,
    n_jobs=-1,
)
xgb_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', xgb),
])

Evaluate the stacking classifier with repeated k-fold cross-validation, scored by negative log loss.

In [67]:
# Outer splitter used to score the ensemble, and the inner splitter that
# StackingClassifier uses to generate the base-model predictions on which the
# meta-learner is trained.
rkf = RepeatedKFold(random_state=1)
kf = KFold(shuffle=True, random_state=1)

# Level-0 (base) learners.
level0 = [
    ('reg', reg_pipeline),
    ('mlp', mlp_pipeline),
    ('xgb', xgb_pipeline),
]
# Level-1 (meta) learner. 'sag' is a stochastic solver, so it is seeded like
# the splitters and the base models; without a seed the reported score changes
# from one run to the next.
level1 = LogisticRegression(solver='sag', random_state=1)

clf = StackingClassifier(estimators=level0, final_estimator=level1, cv=kf, n_jobs=-1)

# Negative log loss per outer fold (closer to 0 is better); report the mean.
cv_scores = cross_val_score(estimator=clf, X=X, y=y, scoring='neg_log_loss', cv=rkf, n_jobs=-1)
print(mean(cv_scores))

-0.5933075444602756
