In [1]:
import numpy as np
import pandas as pd

def which(self):
    """Return the positions of the truthy elements of an iterable.

    Parameters
    ----------
    self : iterable
        Any iterable (list, tuple, generator, pandas Series, ...). The
        parameter is called ``self`` for historical reasons; this is a plain
        function, not a method.

    Returns
    -------
    list of int
        Zero-based positions, in iteration order, of the elements that are
        truthy.

    Raises
    ------
    TypeError
        If the argument cannot be iterated. ``TypeError`` is a subclass of
        ``Exception``, so callers catching ``Exception`` keep working; the
        underlying error is attached as ``__cause__``.
    """
    try:
        # Materialise inside the ``try`` so a TypeError raised while iterating
        # is reported the same way as a non-iterable argument.
        items = list(iter(self))
    except TypeError as e:
        message = (
            "'which' method can only be applied to iterables.\n"
            "        {}"
        ).format(str(e))
        raise TypeError(message) from e
    return [i for i, x in enumerate(items) if x]

# Load the codebook and keep only the entries whose type code is non-zero.
cb = pd.read_csv("codebook-reworked.csv")
vartype = cb.vartype
vartype = vartype[which(vartype != 0)]

# For every retained codebook entry, record its position in the filtered
# series: codes 1, 2, 4 and 5 go to `numeric`, code 3 goes to `categorical`.
# NOTE(review): these positions are later handed to ColumnTransformer as column
# positions of X -- confirm the filtered codebook order matches X's columns.
numeric = [position for position, code in enumerate(vartype) if code in (1, 2, 4, 5)]
categorical = [position for position, code in enumerate(vartype) if code == 3]

In [2]:
# Training data: remember the person identifier (used as the grouping key for
# cross-validation), then drop the identifier columns and every column that is
# entirely missing.
df = pd.read_csv("train.csv")
group = df["personid"]
df = df.drop(['uniqueid', 'personid'], axis=1)
df = df.dropna(axis=1, how='all')

# Features and target.
X = df.drop(["health"], axis=1)
y = df["health"]
colnames_train = X.columns

# Test data: same clean-up; `uniqueid` is kept aside for the submission file.
test = pd.read_csv("test.csv")
test = test.dropna(axis=1, how='all')
uniqueid = test['uniqueid']
test = test.drop(['uniqueid', 'personid'], axis=1)
colnames_test = test.columns

# Keep only the columns present in both frames.
rm_train = set(colnames_train) - set(colnames_test)
rm_test = set(colnames_test) - set(colnames_train)

X = X.drop(list(rm_train), axis=1)
test = test.drop(list(rm_test), axis=1)

# Both frames now hold the same set of columns, but not necessarily in the same
# order. The preprocessing selects columns by position, so force the test frame
# into the training column order.
test = test[X.columns]

In [3]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline

# Create a pipeline to preprocess the data.
# Numerical data is imputed using the mean and scaled to be in [0, 1].
# Categorical data is imputed by treating NA as a separate category and is then one-hot encoded.

# Numeric branch: mean imputation followed by min-max scaling.
numeric_pipe = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', MinMaxScaler()),
    ]
)

# Categorical branch: missing values are replaced by the sentinel 9999999 so
# that they form their own level, then the column is one-hot encoded.
# Levels not seen during fitting are ignored at transform time.
categorical_pipe = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value=9999999)),
        ('ohe', OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Route each group of column positions to its branch.
t = [
    ('cat', categorical_pipe, categorical),
    ('num', numeric_pipe, numeric),
]
col_transform = ColumnTransformer(transformers=t)

In [4]:
from sklearn.neural_network import MLPClassifier

# Simple neural network (multi-layer perceptron) behind the shared preprocessing.
mlp = MLPClassifier(
    hidden_layer_sizes=1371,
    alpha=0.4,
    learning_rate_init=0.01,
    max_iter=1000,
    random_state=20803652,
)
mlp_pipeline = Pipeline(
    steps=[
        ('prep', col_transform),
        ('m', mlp),
    ]
)

In [5]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest neighbours (k = 261, all available cores) behind the shared preprocessing.
knn = KNeighborsClassifier(
    n_neighbors=261,
    n_jobs=-1,
)
knn_pipeline = Pipeline(
    steps=[
        ('prep', col_transform),
        ('m', knn),
    ]
)

In [6]:
from sklearn.linear_model import RidgeClassifier

# Ridge classifier (regularisation strength alpha = 525) behind the shared preprocessing.
ridge = RidgeClassifier(alpha=525)
ridge_pipeline = Pipeline(
    steps=[
        ('prep', col_transform),
        ('m', ridge),
    ]
)

In [7]:
from sklearn.ensemble import RandomForestClassifier

# Random forest (3600 trees, out-of-bag scoring enabled, all available cores)
# behind the shared preprocessing.
rf = RandomForestClassifier(
    n_estimators=3600,
    oob_score=True,
    n_jobs=-1,
    random_state=20803652,
)
rf_pipeline = Pipeline(
    steps=[
        ('prep', col_transform),
        ('m', rf),
    ]
)

In [8]:
from sklearn.model_selection import GroupKFold
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

# Stack all base models: their out-of-fold outputs from a 5-fold grouped
# cross-validation are fed to a logistic regression meta-learner.

# Folds are grouped by `group`, so all rows sharing a group value land in the
# same fold. The splits are materialised as a list: `.split()` returns a
# one-shot generator, which would be exhausted after the first fit and cannot
# be copied when the model is cloned.
gkf = list(GroupKFold(n_splits=5).split(X, y, group))

# Level 0: the four base pipelines.
level0 = [
    ('mlp', mlp_pipeline),
    ('knn', knn_pipeline),
    ('ridge', ridge_pipeline),
    ('rf', rf_pipeline),
]
# Level 1: meta-learner trained on the base models' outputs.
level1 = LogisticRegression(solver='sag')

model = StackingClassifier(estimators=level0, final_estimator=level1, cv=gkf)
fit = model.fit(X, y)

# Class probabilities for the test rows, one column per class.
predictions = fit.predict_proba(test)

In [9]:
# Create the CSV file for submission.

# One row per test observation, one column per predicted class probability.
# NOTE(review): the probability columns keep pandas' default integer labels
# (0, 1, ...); confirm this is the header the submission format expects.
pred = pd.DataFrame(predictions)

# Put the test identifiers in the first column of the predictions frame.
# The original inserted them into the training frame `df`, so the submission
# had no id column. `.values` avoids any index alignment between the frames.
pred.insert(loc=0, column='uniqueid', value=uniqueid.values)
pred.to_csv("submission.csv", index=False)