Load necessary libraries.

In [70]:
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder

from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import StackingClassifier

from sklearn.model_selection import KFold

from sklearn.metrics import accuracy_score

# I. Training Model on All Data

Load in training and test data.

In [71]:
# Read the training split and separate the features from the target column.
df_train = pd.read_csv('train.csv')
X_train = df_train.drop(['outcome'], axis=1)
# Encode the target as a flat 0/1 integer vector. dtype=int pins the label
# dtype: get_dummies defaults to uint8 on older pandas and bool on pandas >= 2.0.
# NOTE(review): assumes 'outcome' has exactly two classes - with more,
# drop_first=True leaves several columns and ravel() would interleave them.
y_train = pd.get_dummies(df_train["outcome"], drop_first=True, dtype=int).to_numpy().ravel()

In [72]:
# Read the held-out split and separate the features from the target column.
df_test = pd.read_csv('test.csv')
X_test = df_test.drop(['outcome'], axis=1)
# Encode the target as a flat 0/1 integer vector. dtype=int pins the label
# dtype: get_dummies defaults to uint8 on older pandas and bool on pandas >= 2.0.
# NOTE(review): assumes 'outcome' has the same two classes as the training
# file - the dummy column is derived from this file alone.
y_test = pd.get_dummies(df_test["outcome"], drop_first=True, dtype=int).to_numpy().ravel()

Identify numeric and categorical columns.

In [73]:
# Columns whose dtype is float or int are treated as numeric features.
numeric_columns = X_train.select_dtypes(include=[float, int]).columns

# Everything else is treated as categorical. Keep the frame's own column order
# rather than going through set(): set iteration order for strings changes
# between interpreter runs (hash randomisation), which would reorder the
# encoded features and make the seeded models below non-reproducible.
categorical_columns = [col for col in X_train.columns if col not in numeric_columns]

Create two preprocessing pipelines: one without and one with scaling of the numeric variables.

In [74]:
# Numeric features: fill gaps with the column median (no scaling in this variant).
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
])

# Categorical features: replace gaps with the literal 'missing', then one-hot
# encode; categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('ohe', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own transformer.
preprocessor = ColumnTransformer(transformers=[
    ("num", numeric_transformer, numeric_columns),
    ("cat", categorical_transformer, categorical_columns),
])

In [75]:
# Numeric features for the scaled variant: median imputation followed by
# min-max scaling.
numeric_transformer_scale = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', MinMaxScaler()),
])

# Same layout as the unscaled preprocessor; only the numeric branch differs.
preprocessor_scale = ColumnTransformer(transformers=[
    ("num", numeric_transformer_scale, numeric_columns),
    ("cat", categorical_transformer, categorical_columns),
])

Train best models: XGBoost, Stacking.

In [76]:
#XGBoost

xgb = XGBClassifier(learning_rate=0.05, n_estimators=114, max_depth=5, min_child_weight=5, gamma=0.6,
                    colsample_bytree=0.9, subsample=1, reg_alpha=0.00001, reg_lambda=1,
                    random_state=1, n_jobs=-1)

# Unscaled preprocessing followed by the classifier.
xgb_pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', xgb)])

xgb_pipeline.fit(X_train, y_train)

# Score the test set once and reuse the arrays below instead of re-running the
# whole pipeline for the accuracy and for every table column.
xgb_pred = xgb_pipeline.predict(X_test)
xgb_proba = xgb_pipeline.predict_proba(X_test)

print(accuracy_score(y_test, xgb_pred))

# One row per test sample; b_prob / r_prob are predict_proba columns 0 and 1.
predictions_xgb = pd.DataFrame({'true': y_test,
                                'predicted': xgb_pred,
                                'b_prob': xgb_proba[:, 0],
                                'r_prob': xgb_proba[:, 1]})
# Show only the rows where the column-0 probability exceeds 0.6.
predictions_xgb[predictions_xgb['b_prob']>0.6]

0.608433734939759


Unnamed: 0,true,predicted,b_prob,r_prob
5,1,0,0.668595,0.331405
39,0,0,0.670601,0.329399
47,0,0,0.624767,0.375233
103,0,0,0.691658,0.308342
131,0,0,0.61129,0.38871
158,1,0,0.655945,0.344055
164,0,0,0.693185,0.306815


In [77]:
#Stacking

# Base learners. The linear model and the MLP use the scaled preprocessor,
# XGBoost the unscaled one.
# random_state pins liblinear's data shuffling so reruns give the same fit,
# matching the seeding of every other estimator in this cell.
reg = LogisticRegression(penalty='l1', solver='liblinear', C=1, random_state=1)
reg_pipeline = Pipeline(steps=[('preprocessor', preprocessor_scale), ('model', reg)])

mlp = MLPClassifier(alpha=0.15, hidden_layer_sizes=5, learning_rate_init=0.01, max_iter=1000, random_state=1)
mlp_pipeline = Pipeline(steps=[('preprocessor_scale', preprocessor_scale), ('model', mlp)])

xgb = XGBClassifier(learning_rate=0.05, n_estimators=114, max_depth=5, min_child_weight=5, gamma=0.6,
                    colsample_bytree=0.9, subsample=1, reg_alpha=0.00001, reg_lambda=1,
                    random_state=1, n_jobs=-1)
xgb_pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', xgb)])

level0 = [('reg', reg_pipeline), ('mlp', mlp_pipeline), ('xgb', xgb_pipeline)]
# Meta-learner over the base learners' outputs. 'sag' shuffles the data, so it
# needs a seed for the stacked model to be reproducible.
level1 = LogisticRegression(solver='sag', random_state=1)

# Shuffled, seeded folds used to build the meta-learner's training inputs.
kf = KFold(shuffle=True, random_state=1)
stacking = StackingClassifier(estimators=level0, final_estimator=level1, cv=kf, n_jobs=-1)

stacking.fit(X_train, y_train)

# Score the test set once and reuse the arrays below instead of re-running the
# whole stack for the accuracy and for every table column.
stacking_pred = stacking.predict(X_test)
stacking_proba = stacking.predict_proba(X_test)

print(accuracy_score(y_test, stacking_pred))

# One row per test sample; b_prob / r_prob are predict_proba columns 0 and 1.
predictions_stacking = pd.DataFrame({'true': y_test,
                                     'predicted': stacking_pred,
                                     'b_prob': stacking_proba[:, 0],
                                     'r_prob': stacking_proba[:, 1]})
# Show only the rows where the column-0 probability exceeds 0.66.
predictions_stacking[predictions_stacking['b_prob']>0.66]

0.6024096385542169


Unnamed: 0,true,predicted,b_prob,r_prob
5,1,0,0.736504,0.263496
39,0,0,0.749608,0.250392
47,0,0,0.671081,0.328919
103,0,0,0.767466,0.232534
131,0,0,0.668018,0.331982
158,1,0,0.724198,0.275802
164,0,0,0.758289,0.241711


# II. Training Model on Data With No Missing Values

Load in the training data with no missing values. The test data loaded earlier in the notebook is reused.

In [81]:
# Replace the training split with the 'train_no_na.csv' variant; X_test and
# y_test from the earlier cells are left as they are.
df_train = pd.read_csv('train_no_na.csv')
X_train = df_train.drop(['outcome'], axis=1)
# Encode the target as a flat 0/1 integer vector. dtype=int pins the label
# dtype: get_dummies defaults to uint8 on older pandas and bool on pandas >= 2.0.
# NOTE(review): assumes 'outcome' has exactly two classes - with more,
# drop_first=True leaves several columns and ravel() would interleave them.
y_train = pd.get_dummies(df_train["outcome"], drop_first=True, dtype=int).to_numpy().ravel()

In [82]:
#XGBoost

xgb = XGBClassifier(learning_rate=0.01, n_estimators=481, max_depth=5, min_child_weight=3, gamma=0.2,
                    colsample_bytree=1, subsample=0.9, reg_alpha=0.00001, reg_lambda=0.1,
                    random_state=1, n_jobs=-1)

# Unscaled preprocessing followed by the classifier.
xgb_pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', xgb)])

xgb_pipeline.fit(X_train, y_train)

# Score the test set once and reuse the arrays below instead of re-running the
# whole pipeline for the accuracy and for every table column.
xgb_pred = xgb_pipeline.predict(X_test)
xgb_proba = xgb_pipeline.predict_proba(X_test)

print(accuracy_score(y_test, xgb_pred))

# One row per test sample; b_prob / r_prob are predict_proba columns 0 and 1.
predictions_xgb = pd.DataFrame({'true': y_test,
                                'predicted': xgb_pred,
                                'b_prob': xgb_proba[:, 0],
                                'r_prob': xgb_proba[:, 1]})
# Show only the rows predicted as class 0 with a column-0 probability above 0.66.
predictions_xgb.loc[(predictions_xgb['predicted']==0) &
                    (predictions_xgb['b_prob']>0.66), :]

0.6445783132530121


Unnamed: 0,true,predicted,b_prob,r_prob
39,0,0,0.788247,0.211753
63,0,0,0.661966,0.338034
103,0,0,0.704768,0.295232
150,0,0,0.676619,0.323381
157,1,0,0.70484,0.29516
158,1,0,0.669569,0.330431
164,0,0,0.665617,0.334383


In [83]:
#Stacking

# Base learners. The linear model and the MLP use the scaled preprocessor,
# XGBoost the unscaled one.
reg = LogisticRegression(penalty='elasticnet', solver='saga', random_state=1, C=1, l1_ratio=1)
reg_pipeline = Pipeline(steps=[('preprocessor', preprocessor_scale), ('model', reg)])

mlp = MLPClassifier(learning_rate_init=0.01, alpha=0.2, hidden_layer_sizes=5, max_iter=1000, random_state=1)
mlp_pipeline = Pipeline(steps=[('preprocessor_scale', preprocessor_scale), ('model', mlp)])

xgb = XGBClassifier(learning_rate=0.01, n_estimators=481, max_depth=5, min_child_weight=3, gamma=0.2,
                    colsample_bytree=1, subsample=0.9, reg_alpha=0.00001, reg_lambda=0.1,
                    random_state=1, n_jobs=-1)
xgb_pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', xgb)])

level0 = [('reg', reg_pipeline), ('mlp', mlp_pipeline), ('xgb', xgb_pipeline)]
# Meta-learner over the base learners' outputs. 'sag' shuffles the data, so it
# is seeded like the other estimators in this cell to keep the stacked model
# reproducible.
level1 = LogisticRegression(solver='sag', random_state=1)

# Shuffled, seeded folds used to build the meta-learner's training inputs.
kf = KFold(shuffle=True, random_state=1)
stacking = StackingClassifier(estimators=level0, final_estimator=level1, cv=kf, n_jobs=-1)

stacking.fit(X_train, y_train)

# Score the test set once and reuse the arrays below instead of re-running the
# whole stack for the accuracy and for every table column.
stacking_pred = stacking.predict(X_test)
stacking_proba = stacking.predict_proba(X_test)

print(accuracy_score(y_test, stacking_pred))

# One row per test sample; b_prob / r_prob are predict_proba columns 0 and 1.
predictions_stacking = pd.DataFrame({'true': y_test,
                                     'predicted': stacking_pred,
                                     'b_prob': stacking_proba[:, 0],
                                     'r_prob': stacking_proba[:, 1]})
# Show only the rows where the column-0 probability exceeds 0.7.
predictions_stacking[predictions_stacking['b_prob']>0.7]

0.6325301204819277


Unnamed: 0,true,predicted,b_prob,r_prob
39,0,0,0.811701,0.188299
63,0,0,0.711682,0.288318
103,0,0,0.744563,0.255437
150,0,0,0.724386,0.275614
157,1,0,0.718877,0.281123
158,1,0,0.714389,0.285611
164,0,0,0.706407,0.293593
