In [1]:
import numpy as np
import pandas as pd

def which(self):
    try:
        self = list(iter(self))
    except TypeError as e:
        raise Exception("""'which' method can only be applied to iterables.
        {}""".format(str(e)))
    indices = [i for i, x in enumerate(self) if bool(x) == True]
    return(indices)

cb = pd.read_csv("codebook-reworked.csv")
vartype = cb.vartype
vartype = vartype[which(vartype != 0)]
index = 0
numeric = []
categorical = []
for i in vartype:
    if i in [1,2,4,5]:
        numeric.append(index)
    if i == 3:
        categorical.append(index)
    index = index + 1

In [2]:
df = pd.read_csv("train.csv")
group = df["personid"]
df = df.drop(['uniqueid', 'personid'], axis=1)
df = df.dropna(axis=1, how='all')

X = df.drop(["health"], axis=1)
colnames_train = X.columns

test = pd.read_csv("test.csv")
test = test.dropna(axis=1, how='all')
uniqueid = test['uniqueid']
test = test.drop(['uniqueid', 'personid'], axis=1)
colnames_test = test.columns

rm_train = set(colnames_train) - set(colnames_test)
rm_test = set(colnames_test) - set(colnames_train)

X = X.drop(list(rm_train), axis=1)
test = test.drop(list(rm_test), axis=1)

y = df["health"]

In [3]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline

# creating a pipeline to process data.
# numerical data is imputed using the mean and scaled to be in [0,1].
# categorical data is imputed by treating NA as a seperate category and one hot encoded.

numeric_pipe = Pipeline([('imputer', SimpleImputer(strategy='mean')), ('scaler', MinMaxScaler())])
categorical_pipe = Pipeline([('imputer', SimpleImputer(strategy='constant', fill_value=9999999)), ('ohe', OneHotEncoder())])
t = [('cat', categorical_pipe, categorical), ('num', numeric_pipe, numeric)]
col_transform = ColumnTransformer(transformers=t)
X = col_transform.fit_transform(X)

In [4]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import GroupKFold

# Testing parameters for neural network.

model = MLPClassifier(random_state=20803652, alpha =0.4, hidden_layer_sizes=(1370), max_iter=1000)
param_grid = dict(learning_rate_init=[0.005, 0.01, 0.02]) # parameters to be tested go here. Changed throughout process.
gkf = GroupKFold(n_splits=5).split(X, y, group)
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring="neg_log_loss", cv=gkf, n_jobs=-1, verbose=3)
grid_result = grid.fit(X, y)
print(grid_result.best_score_)
print(grid_result.best_params_)

Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 64 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of  15 | elapsed:  7.3min remaining: 47.3min
[Parallel(n_jobs=-1)]: Done   8 out of  15 | elapsed: 10.5min remaining:  9.2min
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed: 12.4min finished


-1.1852872058425632
{'learning_rate_init': 0.01}


In [5]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GroupKFold
from sklearn.model_selection import GridSearchCV

# Testing parameters for nearest neighbours.

X = col_transform.fit_transform(X)
knn = KNeighborsClassifier(n_jobs=-1)
param_grid = dict(n_neighbors=[250, 260, 270]) # parameters to be tested go here. Changed throughout process.
gkf = GroupKFold(n_splits=5).split(X, y, group)
grid = GridSearchCV(estimator=knn, param_grid=param_grid, scoring="neg_log_loss", cv=gkf, n_jobs=-1, verbose=3)
grid_result = grid.fit(X, y)
print(grid_result.best_score_)
print(grid_result.best_params_)

Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 64 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of  15 | elapsed:  1.1min remaining:  7.0min
[Parallel(n_jobs=-1)]: Done   8 out of  15 | elapsed:  1.1min remaining:   56.4s
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:  1.1min finished


-1.3127061417813286
{'n_neighbors': 260}


In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GroupKFold
from sklearn.model_selection import GridSearchCV

# Testing parameters for random forests.

rf = RandomForestClassifier(random_state=20803652, n_estimators=3600, oob_score=True, n_jobs=-1)
param_grid = dict(max_depth=[20]) # parameters to be tested go here. Changed throughout process.
gkf = GroupKFold(n_splits=5).split(X, y, group)
grid = GridSearchCV(estimator=rf, param_grid=param_grid, scoring="neg_log_loss", cv=gkf, n_jobs=-1, verbose=3)
grid_result = grid.fit(X, y)
print(grid_result.best_score_)
print(grid_result.best_params_)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 64 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:  1.3min remaining:  1.9min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  1.3min finished


-1.291680601072605
{'max_depth': 20}
