## Init: Import packages

In [1]:
# general
import numpy as np
import pandas as pd
#sklearn
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.feature_selection import VarianceThreshold

## Solution 1: Tuning Principles

See sol_nested_resampling.pdf

## Solution 2: AutoML

### a)

In [2]:
# load data
pima = pd.read_csv("../data/pima.csv")
X_pima = pima.copy()
y_pima = X_pima.pop("diabetes")

# encode the target as 0-1 vector
le = LabelEncoder()
y_pima = le.fit_transform(y_pima)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_pima, y_pima, test_size=0.2, stratify=y_pima, random_state=42)

### b)

In [3]:
# Define the column transformer
preprocessor = ColumnTransformer(
    transformers=[
        # One-hot encode categorical columns, like strings
        ('onehot', OneHotEncoder(drop='first', handle_unknown='ignore'), make_column_selector(dtype_include=object)),
        # Impute missing values for numerical columns
        ('imputer', SimpleImputer(strategy = 'median'), make_column_selector(dtype_include=np.number))
    ])

### c)

In [4]:
# Create a pipeline with preprocessing and modeling steps
# for knn
clf_knn = Pipeline([
    ('preprocessor', preprocessor),
    ('constant', VarianceThreshold()), # remove variables with constant values across all observations
    ('scaler', StandardScaler()),  # Data scaling
    ('classifier', KNeighborsClassifier())  # KNN model
])

# for random forest
clf_randomforest = Pipeline([
    ('preprocessor', preprocessor),
    ('constant', VarianceThreshold()), # remove variables with constant values across all observations
    ('classifier', RandomForestClassifier(random_state=42))  # Random Forest
])

### d)

In [5]:
# combine both classifiers with a soft voting ensembling
clf_voting = VotingClassifier(
    estimators=[('knn', clf_knn), ('random_forest', clf_randomforest)], 
    voting = "soft", 
    n_jobs = -1
)

### e)

In [6]:
# define a parameter grid for the tuning process
param_grid_voting = [{"knn__classifier__n_neighbors": list(range(1,11))},
                     {"random_forest__classifier__max_features": list(range(1,6))}]

### f)

In [7]:
# initalize scores with 0
NUM_OUTER_FOLDS = 3
nested_scores_voting = np.zeros(NUM_OUTER_FOLDS)

# Choose cross-validation techniques for the inner and outer loops, independently of the dataset.
# E.g "GroupKFold", "LeaveOneOut", "LeaveOneGroupOut", etc.
inner_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
outer_cv = StratifiedKFold(n_splits=NUM_OUTER_FOLDS, shuffle=True, random_state=42)

In [8]:
for i, (train_index, val_index) in enumerate(outer_cv.split(X_train, y_train)):
    # Nested CV with parameter optimization for ensemble pipeline
    clf_gs_voting = GridSearchCV(
        estimator=clf_voting, 
        param_grid=param_grid_voting, 
        cv=inner_cv, 
        n_jobs=-1
    )
    clf_gs_voting.fit(X_train.iloc[train_index], y_train[train_index])
    nested_scores_voting[i] = clf_gs_voting.score(X_train.iloc[val_index], y_train[val_index])

### g)

In [9]:
# print performance per outer fold
print(nested_scores_voting)

[0.76097561 0.76097561 0.75490196]


In [10]:
# print performance aggregated over all folds
print(nested_scores_voting.mean())

0.7589510600988363


### h)

In [11]:
# evaluate performace on test set with accuracy
test_scores_voting = clf_gs_voting.score(X_test, y_test)
print(test_scores_voting)

0.7597402597402597


Accuracy does not account for imbalances data! Let's check how the test data is distributed:

In [12]:
unique, counts = np.unique(y_test, return_counts=True)
table = pd.DataFrame(data = dict(zip(unique, counts)), index=[0]) #index necassary because only numeric values are in dictionary
table

Unnamed: 0,0,1
0,100,54


In [13]:
pred_test = clf_gs_voting.predict(X_test)
conf_matrix = pd.DataFrame(confusion_matrix(y_test, pred_test))
conf_matrix

Unnamed: 0,0,1
0,86,14
1,23,31


The distribution shows a shift towards 'false' with $2/3$ of all test observations.

In [14]:
# evaluate performace on test set with balanced accuracy to account for imbalances data set
# Balanced accuracy = (Sensitivity + Specificity) / 2
balanced_accuracy = balanced_accuracy_score(y_test, pred_test)
print(balanced_accuracy)

0.717037037037037


The balanced accuracy is lower than a normal accuracy score, as it accounts seperatly for the lower Sensitivity.

## Solution 3: Kaggle Challenge

We do not provide an explicit solution here, but have a look at the [tuning code demo](https://github.com/slds-lmu/lecture_i2ml/blob/master/code-demos-pdf/code_demo_kaggle.pdf), which is written in *mlr3* and covers some parts,
and take inspiration from the public contributions on Kaggle.