Load necessary libraries.

In [43]:
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder

from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

# I. Model Testing on All Data

Create explanatory and response variables.

In [44]:
# Read the full training set and split it into features and target.
df = pd.read_csv('train.csv')
X = df.drop(columns=['outcome'])

# Dummy-encode the outcome, dropping the first level, and flatten the
# result to a 1-D array for the classifier.
# NOTE(review): this only yields one label per row if `outcome` has
# exactly two levels -- confirm against the data.
y = pd.get_dummies(df['outcome'], drop_first=True).to_numpy().ravel()

Identify numeric and categorical columns.

In [45]:
# Columns with a numeric dtype are handled by the numeric transformer.
numeric_columns = X.select_dtypes(include=['number']).columns

# Every remaining column is treated as categorical. The list is built in the
# DataFrame's own column order rather than via a set difference: iteration
# order of a set of strings can change between interpreter runs (hash
# randomization), which would shuffle the encoded feature order and make the
# fitted models differ from run to run even with a fixed random_state.
categorical_columns = [col for col in X.columns if col not in numeric_columns]

Create preprocessing pipeline.

In [46]:
# Numeric features: fill missing values with the column median, then rescale
# with MinMaxScaler.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', MinMaxScaler()),
])

# Categorical features: replace missing values with the literal 'missing',
# then one-hot encode; categories unseen during fit are ignored at transform.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('ohe', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own transformer.
preprocessor = ColumnTransformer(transformers=[
    ("num", numeric_transformer, numeric_columns),
    ("cat", categorical_transformer, categorical_columns),
])

# Neural Net

## Randomized Search
Tune model parameters and obtain cross-validated negative log loss estimates.

In [47]:
# Preprocessing and the neural net are chained in a single pipeline, which is
# the estimator handed to the search.
clf = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", MLPClassifier(max_iter=1000, random_state=1))]
)

# Candidate values sampled by the randomized search.
distributions = dict(classifier__alpha = [0.01, 0.1, 0.5],
                     classifier__learning_rate_init=[0.001, 0.01, 0.1, 0.2, 0.3],
                     classifier__hidden_layer_sizes=[3,5,7,10,25])

# Define the CV splitter in this cell: it was previously only created in a
# later cell, so this cell failed with a NameError on a fresh kernel.
rkf = RepeatedKFold(random_state=1)

# Evaluate 10 sampled parameter combinations, scored by negative log loss.
random = RandomizedSearchCV(estimator=clf, param_distributions=distributions,
                            scoring='neg_log_loss', n_iter=10, cv=rkf, random_state=1)
random.fit(X, y)
print("Highest neg_log_loss: ", random.best_score_)
print("Best parameters: ", random.best_params_)

Highest neg_log_loss:  -0.594240160918905
Best parameters:  {'classifier__learning_rate_init': 0.001, 'classifier__hidden_layer_sizes': 25, 'classifier__alpha': 0.5}


## Regular Grid Search
Tune model parameters and obtain cross-validated negative log loss estimates.

In [13]:
# Coarse grid over alpha, initial learning rate and hidden layer size.
param_grid1 = {
    "classifier__alpha": [0.0001, 0.001, 0.01, 0.1],
    "classifier__learning_rate_init": [0.0001, 0.001, 0.01, 0.1],
    "classifier__hidden_layer_sizes": [5, 10, 25, 50],
}

# Preprocessing followed by the neural net, tuned as one estimator.
clf = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", MLPClassifier(max_iter=1000, random_state=1)),
])

# Seeded repeated k-fold splitter so the folds are the same on every run.
rkf = RepeatedKFold(random_state=1)

# Exhaustive search scored by negative log loss, using all available cores.
grid1 = GridSearchCV(
    estimator=clf,
    param_grid=param_grid1,
    scoring='neg_log_loss',
    cv=rkf,
    n_jobs=-1,
)
grid1.fit(X, y)
print(grid1.best_score_)
print(grid1.best_params_)

-0.5904346780100138
{'classifier__alpha': 0.1, 'classifier__hidden_layer_sizes': 5, 'classifier__learning_rate_init': 0.01}


In [17]:
# One row per parameter combination, with its mean CV score alongside.
grid1_table = pd.DataFrame(grid1.cv_results_['params']).assign(
    **{'Negative Log Loss': grid1.cv_results_['mean_test_score']}
)

# Best (highest) negative log loss first.
grid1_table.sort_values(by='Negative Log Loss', ascending=False)

Unnamed: 0,classifier__alpha,classifier__hidden_layer_sizes,classifier__learning_rate_init,Negative Log Loss
50,0.1000,5,0.010,-0.590435
54,0.1000,10,0.010,-0.592835
49,0.1000,5,0.001,-0.593826
58,0.1000,25,0.010,-0.596114
34,0.0100,5,0.010,-0.598353
...,...,...,...,...
45,0.0100,50,0.001,-0.820175
29,0.0010,50,0.001,-0.857151
13,0.0001,50,0.001,-0.878110
30,0.0010,50,0.010,-0.915427


In [14]:
# Narrower grid over alpha and hidden layer size.
param_grid2 = {
    "classifier__alpha": [0.05, 0.1, 0.15],
    "classifier__hidden_layer_sizes": [3, 5, 7],
}

# Same pipeline as before, with the initial learning rate fixed at 0.01.
clf = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", MLPClassifier(learning_rate_init=0.01, max_iter=1000, random_state=1)),
])

# Seeded repeated k-fold splitter so the folds are the same on every run.
rkf = RepeatedKFold(random_state=1)

# Exhaustive search scored by negative log loss, using all available cores.
grid2 = GridSearchCV(
    estimator=clf,
    param_grid=param_grid2,
    scoring='neg_log_loss',
    cv=rkf,
    n_jobs=-1,
)
grid2.fit(X, y)
print(grid2.best_score_)
print(grid2.best_params_)

-0.5934974164533241
{'classifier__alpha': 0.15, 'classifier__hidden_layer_sizes': 5}


In [15]:
# One row per parameter combination, with its mean CV score alongside.
grid2_table = pd.DataFrame(grid2.cv_results_['params']).assign(
    **{'Negative Log Loss': grid2.cv_results_['mean_test_score']}
)

# Best (highest) negative log loss first.
grid2_table.sort_values(by='Negative Log Loss', ascending=False)


Unnamed: 0,classifier__alpha,classifier__hidden_layer_sizes,Negative Log Loss
7,0.15,5,-0.593497
4,0.1,5,-0.593826
8,0.15,7,-0.594468
1,0.05,5,-0.594886
5,0.1,7,-0.596311
2,0.05,7,-0.599567
6,0.15,3,-0.61091
0,0.05,3,-0.614176
3,0.1,3,-0.61587


In [19]:
# Extend the alpha range upwards while keeping hidden layer sizes near 5.
param_grid3 = {
    "classifier__alpha": [0.15, 0.25, 0.5, 0.75, 1],
    "classifier__hidden_layer_sizes": [4, 5, 6],
}

# Same pipeline as before, with the initial learning rate fixed at 0.01.
clf = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", MLPClassifier(learning_rate_init=0.01, max_iter=1000, random_state=1)),
])

# Seeded repeated k-fold splitter so the folds are the same on every run.
rkf = RepeatedKFold(random_state=1)

# Exhaustive search scored by negative log loss, using all available cores.
grid3 = GridSearchCV(
    estimator=clf,
    param_grid=param_grid3,
    scoring='neg_log_loss',
    cv=rkf,
    n_jobs=-1,
)
grid3.fit(X, y)
print(grid3.best_score_)
print(grid3.best_params_)

-0.5876928144722277
{'classifier__alpha': 0.15, 'classifier__hidden_layer_sizes': 5}


In [20]:
# One row per parameter combination, with its mean CV score alongside.
grid3_table = pd.DataFrame(grid3.cv_results_['params']).assign(
    **{'Negative Log Loss': grid3.cv_results_['mean_test_score']}
)

# Best (highest) negative log loss first.
grid3_table.sort_values(by='Negative Log Loss', ascending=False)

Unnamed: 0,classifier__alpha,classifier__hidden_layer_sizes,Negative Log Loss
1,0.15,5,-0.587693
2,0.15,6,-0.59008
4,0.25,5,-0.591792
5,0.25,6,-0.59224
7,0.5,5,-0.595722
8,0.5,6,-0.596341
11,0.75,6,-0.599626
10,0.75,5,-0.602222
14,1.0,6,-0.603643
13,1.0,5,-0.606034


# II. Model Testing on Data With No Missing Values

Create explanatory and response variables.

In [39]:
# Read the training set without missing values and split it into features
# and target.
df = pd.read_csv('train_no_na.csv')
X = df.drop(columns=['outcome'])

# Dummy-encode the outcome, dropping the first level, and flatten the
# result to a 1-D array for the classifier.
# NOTE(review): this only yields one label per row if `outcome` has
# exactly two levels -- confirm against the data.
y = pd.get_dummies(df["outcome"], drop_first=True).to_numpy().ravel()

Identify numeric and categorical columns.

In [40]:
# Columns with a numeric dtype are handled by the numeric transformer.
numeric_columns = X.select_dtypes(include=['number']).columns

# Every remaining column is treated as categorical. The list is built in the
# DataFrame's own column order rather than via a set difference: iteration
# order of a set of strings can change between interpreter runs (hash
# randomization), which would shuffle the encoded feature order and make the
# fitted models differ from run to run even with a fixed random_state.
categorical_columns = [col for col in X.columns if col not in numeric_columns]

Create preprocessing pipeline.

In [41]:
# Numeric features: fill missing values with the column median, then rescale
# with MinMaxScaler.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', MinMaxScaler()),
])

# Categorical features: replace missing values with the literal 'missing',
# then one-hot encode; categories unseen during fit are ignored at transform.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('ohe', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own transformer.
preprocessor = ColumnTransformer(transformers=[
    ("num", numeric_transformer, numeric_columns),
    ("cat", categorical_transformer, categorical_columns),
])

# Neural Net

## Randomized Search
Tune model parameters and obtain cross-validated negative log loss estimates.

In [42]:
# Preprocessing and the neural net are chained in a single pipeline, which is
# the estimator handed to the search.
clf = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", MLPClassifier(max_iter=1000, random_state=1))]
)

# Candidate values sampled by the randomized search.
distributions = dict(classifier__alpha = [0.01, 0.1, 0.5],
                     classifier__learning_rate_init=[0.001, 0.01, 0.1, 0.2, 0.3],
                     classifier__hidden_layer_sizes=[3,5,7,10,25])

# Define the CV splitter in this cell instead of relying on an `rkf` left
# over from another cell, so the cell runs regardless of execution order.
rkf = RepeatedKFold(random_state=1)

# Evaluate 10 sampled parameter combinations, scored by negative log loss.
random = RandomizedSearchCV(estimator=clf, param_distributions=distributions,
                            scoring='neg_log_loss', n_iter=10, cv=rkf, random_state=1)
random.fit(X, y)
print("Highest neg_log_loss: ", random.best_score_)
print("Best parameters: ", random.best_params_)

Highest neg_log_loss:  -0.6171278410952484
Best parameters:  {'classifier__learning_rate_init': 0.001, 'classifier__hidden_layer_sizes': 25, 'classifier__alpha': 0.5}


## Regular Grid Search
Tune model parameters and obtain cross-validated negative log loss estimates.

In [26]:
# Coarse grid over alpha, initial learning rate and hidden layer size.
param_grid1 = {
    "classifier__alpha": [0.0001, 0.001, 0.01, 0.1],
    "classifier__learning_rate_init": [0.0001, 0.001, 0.01, 0.1],
    "classifier__hidden_layer_sizes": [5, 10, 25, 50],
}

# Preprocessing followed by the neural net, tuned as one estimator.
clf = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", MLPClassifier(max_iter=1000, random_state=1)),
])

# Seeded repeated k-fold splitter so the folds are the same on every run.
rkf = RepeatedKFold(random_state=1)

# Exhaustive search scored by negative log loss, using all available cores.
grid1 = GridSearchCV(
    estimator=clf,
    param_grid=param_grid1,
    scoring='neg_log_loss',
    cv=rkf,
    n_jobs=-1,
)
grid1.fit(X, y)
print(grid1.best_score_)
print(grid1.best_params_)

-0.6148472844323163
{'classifier__alpha': 0.1, 'classifier__hidden_layer_sizes': 5, 'classifier__learning_rate_init': 0.01}


In [27]:
# One row per parameter combination, with its mean CV score alongside.
grid1_table = pd.DataFrame(grid1.cv_results_['params']).assign(
    **{'Negative Log Loss': grid1.cv_results_['mean_test_score']}
)

# Best (highest) negative log loss first.
grid1_table.sort_values(by='Negative Log Loss', ascending=False)

Unnamed: 0,classifier__alpha,classifier__hidden_layer_sizes,classifier__learning_rate_init,Negative Log Loss
50,0.1000,5,0.0100,-0.614847
54,0.1000,10,0.0100,-0.619973
49,0.1000,5,0.0010,-0.620016
34,0.0100,5,0.0100,-0.624553
56,0.1000,25,0.0001,-0.625371
...,...,...,...,...
45,0.0100,50,0.0010,-0.904306
29,0.0010,50,0.0010,-0.989431
13,0.0001,50,0.0010,-0.999843
30,0.0010,50,0.0100,-1.110568


In [28]:
# Narrower grid over alpha and hidden layer size.
param_grid2 = {
    "classifier__alpha": [0.05, 0.1, 0.15],
    "classifier__hidden_layer_sizes": [3, 5, 7],
}

# Same pipeline as before, with the initial learning rate fixed at 0.01.
clf = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", MLPClassifier(learning_rate_init=0.01, max_iter=1000, random_state=1)),
])

# Seeded repeated k-fold splitter so the folds are the same on every run.
rkf = RepeatedKFold(random_state=1)

# Exhaustive search scored by negative log loss, using all available cores.
grid2 = GridSearchCV(
    estimator=clf,
    param_grid=param_grid2,
    scoring='neg_log_loss',
    cv=rkf,
    n_jobs=-1,
)
grid2.fit(X, y)
print(grid2.best_score_)
print(grid2.best_params_)

-0.614599972336987
{'classifier__alpha': 0.15, 'classifier__hidden_layer_sizes': 5}


In [29]:
# One row per parameter combination, with its mean CV score alongside.
grid2_table = pd.DataFrame(grid2.cv_results_['params']).assign(
    **{'Negative Log Loss': grid2.cv_results_['mean_test_score']}
)

# Best (highest) negative log loss first.
grid2_table.sort_values(by='Negative Log Loss', ascending=False)


Unnamed: 0,classifier__alpha,classifier__hidden_layer_sizes,Negative Log Loss
7,0.15,5,-0.6146
8,0.15,7,-0.614715
4,0.1,5,-0.614847
6,0.15,3,-0.617069
1,0.05,5,-0.617215
5,0.1,7,-0.617851
0,0.05,3,-0.619415
3,0.1,3,-0.619786
2,0.05,7,-0.619897


In [31]:
# Extend the alpha range upwards, with hidden layer sizes 5 and 6.
param_grid3 = {
    "classifier__alpha": [0.15, 0.25, 0.5, 0.75, 1],
    "classifier__hidden_layer_sizes": [5, 6],
}

# Same pipeline as before, with the initial learning rate fixed at 0.01.
clf = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", MLPClassifier(learning_rate_init=0.01, max_iter=1000, random_state=1)),
])

# Seeded repeated k-fold splitter so the folds are the same on every run.
rkf = RepeatedKFold(random_state=1)

# Exhaustive search scored by negative log loss, using all available cores.
grid3 = GridSearchCV(
    estimator=clf,
    param_grid=param_grid3,
    scoring='neg_log_loss',
    cv=rkf,
    n_jobs=-1,
)
grid3.fit(X, y)
print(grid3.best_score_)
print(grid3.best_params_)

-0.6135942918450586
{'classifier__alpha': 0.25, 'classifier__hidden_layer_sizes': 5}


In [32]:
# One row per parameter combination, with its mean CV score alongside.
grid3_table = pd.DataFrame(grid3.cv_results_['params']).assign(
    **{'Negative Log Loss': grid3.cv_results_['mean_test_score']}
)

# Best (highest) negative log loss first.
grid3_table.sort_values(by='Negative Log Loss', ascending=False)

Unnamed: 0,classifier__alpha,classifier__hidden_layer_sizes,Negative Log Loss
2,0.25,5,-0.613594
0,0.15,5,-0.6146
4,0.5,5,-0.61611
5,0.5,6,-0.617077
3,0.25,6,-0.61885
7,0.75,6,-0.620083
6,0.75,5,-0.620541
9,1.0,6,-0.621336
1,0.15,6,-0.62184
8,1.0,5,-0.623164


In [33]:
# Fine-grained sweep over alpha only.
param_grid4 = {
    "classifier__alpha": [0.2, 0.25, 0.3, 0.35, 0.4, 0.45],
}

# Pipeline with both the initial learning rate (0.01) and the hidden layer
# size (5) fixed on the classifier.
clf = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", MLPClassifier(learning_rate_init=0.01, hidden_layer_sizes=5,
                                 max_iter=1000, random_state=1)),
])

# Seeded repeated k-fold splitter so the folds are the same on every run.
rkf = RepeatedKFold(random_state=1)

# Exhaustive search scored by negative log loss, using all available cores.
grid4 = GridSearchCV(
    estimator=clf,
    param_grid=param_grid4,
    scoring='neg_log_loss',
    cv=rkf,
    n_jobs=-1,
)
grid4.fit(X, y)
print(grid4.best_score_)
print(grid4.best_params_)

-0.6117097612551021
{'classifier__alpha': 0.2}
