# 5 - Regularization in Classification

In [None]:
#@title Run this cell to download the data and helper files. { display-mode: "form" }
!pip install -U wget
!rm -rf data.zip data lib
!mkdir lib

import wget
wget.download('https://github.com/shengpu1126/BDSI2019-ML/raw/master/lib/config.yaml', 'lib/config.yaml')
wget.download('https://github.com/shengpu1126/BDSI2019-ML/raw/master/lib/helper.py', 'lib/helper.py')
wget.download('https://github.com/shengpu1126/BDSI2019-ML/raw/master/data.zip', 'data.zip')

import zipfile
with zipfile.ZipFile("data.zip","r") as zip_ref:
    zip_ref.extractall(".")

In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from lib.helper import load_data, config

In [None]:
#@title Run this cell to define the three preprocessing functions. { display-mode: "form" }
#@markdown - `generate_feature_vector(df)`
#@markdown - `impute_missing_values(X)`
#@markdown - `normalize_feature_matrix(X)`

def generate_feature_vector(df):
    """
    Reads a dataframe containing all measurements for a single patient
    within the first 48 hours of the ICU admission, and convert it into
    a feature vector.

    The first five rows of `df` are treated as the time-invariant variables
    and are copied into the result unchanged. Every remaining row is treated
    as a time-series measurement; each variable listed in
    `config['timeseries']` is summarized by the mean of its measurements.

    Args:
        df: pd.DataFrame, with columns [Time, Variable, Value]

    Returns:
        a python dictionary of format {feature_name: feature_value}
        for example, {'Age': 32, 'Gender': 0, 'mean_HR': 84, ...}
        A feature whose value is unknown (-1) or that has no measurement
        at all is reported as np.nan.
    """
    timeseries_variables = config['timeseries']

    # Replace unknown values (encoded as -1) with NaN
    df = df.replace({-1: np.nan})

    # Split time invariant and time series
    static, timeseries = df.iloc[0:5], df.iloc[5:]
    # Keyword arguments are required here: pandas >= 2.0 no longer accepts
    # positional arguments to DataFrame.pivot.
    static = static.pivot(index='Time', columns='Variable', values='Value')

    feature_dict = static.iloc[0].to_dict()
    for variable in timeseries_variables:
        measurements = timeseries[timeseries['Variable'] == variable].Value
        # The mean of an empty selection is NaN, i.e. "feature missing".
        feature_dict['mean_' + variable] = np.mean(measurements)

    return feature_dict

def impute_missing_values(X):
    """
    Fill in the missing entries (np.nan) of every feature column using the
    mean of that column.

    Args:
        X: np.array, shape (N, d). X could contain missing values
    Returns:
        X: np.array, shape (N, d). X does not contain any missing values
    """
    from sklearn.impute import SimpleImputer

    # 'mean' is SimpleImputer's default strategy; spelled out for readability.
    mean_imputer = SimpleImputer(strategy='mean')
    X_imputed = mean_imputer.fit_transform(X)
    return X_imputed

def normalize_feature_matrix(X):
    """
    Rescale every feature column so that its values lie in the range [0, 1].

    Args:
        X: np.array, shape (N, d).
    Returns:
        X: np.array, shape (N, d). Values are normalized per column.
    """
    from sklearn.preprocessing import MinMaxScaler

    # (0, 1) is MinMaxScaler's default output range; spelled out for readability.
    unit_scaler = MinMaxScaler(feature_range=(0, 1))
    X_scaled = unit_scaler.fit_transform(X)
    return X_scaled

In [None]:
# Load the dataset
# `raw_data` is a dictionary mapping patient ID to the data associated with that patient
raw_data, df_labels = load_data(N=2500)

# Generate features: one feature dictionary per patient, visited in sorted patient-ID order
patients_in_id_order = sorted(raw_data.items())
progress = tqdm(patients_in_id_order, desc='Generating feature vectors')
features = [generate_feature_vector(patient_df) for _patient_id, patient_df in progress]

# One row per patient, with the feature columns in sorted name order
df_features = pd.DataFrame(features)
df_features = df_features.sort_index(axis=1)
feature_names = df_features.columns.tolist()

In [None]:
# Apply imputation and normalization
# NOTE(review): both steps are computed on the full dataset *before* the
# train/test split below, so statistics of the held-out rows influence the
# training features. Confirm this is acceptable for this exercise.
X = df_features.values
y = df_labels['In-hospital_death'].values
X = normalize_feature_matrix(impute_missing_values(X))

# Split data into train and test (stratified on the label, fixed seed)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.20,
    stratify=y,
    random_state=3,
)
# Drop the unsplit arrays; later cells reuse the names X and y for a subset.
del X, y

In [None]:
# Remind yourself what the features represent:
# (feature_names holds the column names of df_features, in sorted order)
print(feature_names)

## Linear Classification - Visualized

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Select an easy subset of patients (row positions within the training set)
sel = [0, 1, 2, 3, 6, 8, 55, 88]
selected_rows = X_train[sel]
# Select age and mean_HR (feature columns 0 and 20) so the data can be drawn in 2-D
X = selected_rows[:, [0, 20]]
y = y_train[sel]

In [None]:
def plot_data(X, y, size=25, xlabel='Age', ylabel='mean_HR'):
    """
    Scatter-plot the first two columns of X, styled by class label.

    Points labelled -1 are drawn as red circles and points labelled 1 as
    green crosses; points carrying any other label are not drawn. Both axes
    are limited to [0, 1].

    Args:
        X: array-like, shape (N, d) with d >= 2. Column 0 is plotted on the
            x-axis and column 1 on the y-axis.
        y: array-like, shape (N,). Class labels, expected to be -1 or 1.
        size: marker size passed to plt.scatter.
        xlabel: label of the x-axis (defaults to 'Age').
        ylabel: label of the y-axis (defaults to 'mean_HR').
    Returns:
        the newly created matplotlib figure.
    """
    X, y = np.asarray(X), np.asarray(y)
    fig = plt.figure(figsize=(5,5))
    # One scatter call per class rather than one per point: creating an
    # artist for every sample is needlessly slow on the full training set.
    for label, colour, marker in ((-1, 'r', 'o'), (1, 'g', 'x')):
        mask = (y == label)
        if mask.any():
            plt.scatter(X[mask, 0], X[mask, 1], c=colour, marker=marker, s=size)
    plt.axis('equal')
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.xlim(0,1)
    plt.ylim(0,1)
    plt.grid(True)
    return fig

In [None]:
# Scatter the selected patients: X holds the two chosen features, y the labels.
fig = plot_data(X, y)
plt.show()

In [None]:
#@title Given these data points, what is your guess for a "good" linear classifier? { display-mode: "form" }
Equation = None #@param {type:"raw"}

In [None]:
# Train a logistic regression classifier with C=1e5
# Exercise: replace the ??? placeholder below; this cell cannot run until it is filled in.
# Later cells read clf.coef_, clf.intercept_ and call clf.predict_proba.
clf = ???

In [None]:
#@title What are the learned model parameters? { display-mode: "form" }
# ASCII names are used because subscript digits (e.g. U+2080) are not allowed
# in Python identifiers and would make this cell raise a SyntaxError.
theta_0 = 0 #@param {type:"number"}
theta_1 = 0 #@param {type:"number"}
theta_2 = 0 #@param {type:"number"}

In [None]:
#@title What is the equation of the classification boundary? { display-mode: "form" }
Equation = None #@param {type:"raw"}

In [None]:
# Visualize the classification boundary in the same plot. Does it match your intuition?
x_clf = np.linspace(0,1,100)
# Exercise: replace the ??? placeholder with the boundary's y-value for every x in x_clf;
# this cell cannot run until it is filled in.
y_clf = ???
fig = plot_data(X, y)
plt.plot(x_clf, y_clf, 'k-')
plt.show()

In [None]:
#@title Run this cell to visualize classification boundary with predicted probabilities. { display-mode: "form" }
def plot_boundary(X, pred):
    """
    Shade the current axes by predicted score and outline the 0.5 contour.

    The scores are evaluated on a 200 x 200 grid spanning the limits of the
    current matplotlib axes, so this is meant to be called after the data
    has been plotted.

    Args:
        X: np.array, shape (N, 2). Only used to pick the plotting range when
            the limits of the current axes cannot be read.
        pred: callable that maps an array of shape (M, 2) to M scores.
            Scores >= 0.5 are treated as one side of the boundary.
    """
    try:
        x_min, x_max = plt.gca().get_xlim()
        y_min, y_max = plt.gca().get_ylim()
    except Exception:
        # Best-effort fallback to the data range plus a small margin.
        # `Exception` (not a bare except) so that KeyboardInterrupt and
        # SystemExit are never swallowed here.
        x_min, x_max = X[:,0].min() - .1, X[:,0].max() + .1
        y_min, y_max = X[:,1].min() - .1, X[:,1].max() + .1
    xs, ys = np.meshgrid(
        np.linspace(x_min, x_max, 200),
        np.linspace(y_min, y_max, 200)
    )
    # Flatten the grid into a list of (x, y) points, score them, then fold
    # the scores back into the grid shape.
    xys = np.column_stack([xs.ravel(), ys.ravel()])
    zs = pred(xys).reshape(xs.shape)
    # The thresholded grid only takes the values 0 and 1, so its contour is
    # the decision boundary itself.
    plt.contour(xs, ys, (zs >= 0.5).astype(int), cmap='Greys')
    plt.imshow(zs, cmap="PiYG", vmin=-.2, vmax=1.2, alpha=0.4, origin='lower', extent=[x_min, x_max, y_min, y_max])

# Boundary line: solve coef_[0,0] * x + coef_[0,1] * y + intercept_ = 0 for y
x_clf = np.linspace(0, 1, num=100)
w_x, w_y = clf.coef_[0, 0], clf.coef_[0, 1]
y_clf = (w_x * x_clf + clf.intercept_) / -w_y

# Data points, the boundary line, then the shaded class-1 probabilities
fig = plot_data(X, y)
plt.plot(x_clf, y_clf, 'k-')
plot_boundary(X, lambda points: clf.predict_proba(points)[:, 1])
plt.show()

## Regularization

In [None]:
# Now, train different logistic regression classifiers with C=1e2, C=1e1 and C=1 (default)

In [None]:
#@title Run this cell to visualize classification boundary with predicted probabilities. { display-mode: "form" }
# Re-defines plot_boundary (also defined in the earlier visualization cell)
# so that this cell can be run on its own.
def plot_boundary(X, pred):
    """
    Shade the current axes by predicted score and outline the 0.5 contour.

    The scores are evaluated on a 200 x 200 grid spanning the limits of the
    current matplotlib axes, so this is meant to be called after the data
    has been plotted.

    Args:
        X: np.array, shape (N, 2). Only used to pick the plotting range when
            the limits of the current axes cannot be read.
        pred: callable that maps an array of shape (M, 2) to M scores.
            Scores >= 0.5 are treated as one side of the boundary.
    """
    try:
        x_min, x_max = plt.gca().get_xlim()
        y_min, y_max = plt.gca().get_ylim()
    except Exception:
        # Best-effort fallback to the data range plus a small margin.
        # `Exception` (not a bare except) so that KeyboardInterrupt and
        # SystemExit are never swallowed here.
        x_min, x_max = X[:,0].min() - .1, X[:,0].max() + .1
        y_min, y_max = X[:,1].min() - .1, X[:,1].max() + .1
    xs, ys = np.meshgrid(
        np.linspace(x_min, x_max, 200),
        np.linspace(y_min, y_max, 200)
    )
    # Flatten the grid into a list of (x, y) points, score them, then fold
    # the scores back into the grid shape.
    xys = np.column_stack([xs.ravel(), ys.ravel()])
    zs = pred(xys).reshape(xs.shape)
    # The thresholded grid only takes the values 0 and 1, so its contour is
    # the decision boundary itself.
    plt.contour(xs, ys, (zs >= 0.5).astype(int), cmap='Greys')
    plt.imshow(zs, cmap="PiYG", vmin=-.2, vmax=1.2, alpha=0.4, origin='lower', extent=[x_min, x_max, y_min, y_max])

# Boundary line of whichever classifier `clf` currently refers to:
# solve coef_[0,0] * x + coef_[0,1] * y + intercept_ = 0 for y
x_clf = np.linspace(0, 1, num=100)
w_x, w_y = clf.coef_[0, 0], clf.coef_[0, 1]
y_clf = (w_x * x_clf + clf.intercept_) / -w_y

# Data points, the boundary line, then the shaded class-1 probabilities
fig = plot_data(X, y)
plt.plot(x_clf, y_clf, 'k-')
plot_boundary(X, lambda points: clf.predict_proba(points)[:, 1])
plt.show()

What do you notice about the decision boundaries of logistic regression (LR) classifiers trained with different values of C?

## Extensions
- Try the same experiment with the entire training set
- Choose two different features as the input features to your model, or use the first two components of PCA

In [None]:
# e.g., use all training data (same two feature columns, smaller markers)
all_train_two_features = X_train[:, [0, 20]]
fig = plot_data(all_train_two_features, y_train, size=16)
plt.show()