Environment: Pytorch

# <font color = 'purple'> Feed Forward Neural Network
Predicting presence of heart disease using feed forward neural network.

In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch import nn
import torch.optim as optim

import optuna
from optuna.trial import TrialState

In [2]:
my_seed = 101

## <font color = 'blue'> Reserve test data
Test data will not be used for model training or hyperparameter tuning, itstead it is reserved for final evaluation of model performance.

In [3]:
def get_train_test_df(fp, label_colname, my_seed=None):
    """
    Function to import raw data, carry out pre-processing, and split into training and test datasets.
    Test data will be reserved for final evaluation of model performance (i.e. not for hyperparameter tuning)
    
    :param fp: filepath
    :param label_colname: name of column containing labels
    :param my_seed: integer to be used to fix random state for train_test_split
    
    :return: tuple of dataframes - training_df, test_df
    """

    # import data
    df = pd.read_csv(fp)

    # one-hot encoding of categorical variables
    df = pd.get_dummies(df, drop_first=True)

    # Standard scaling of features
    # Note that doing this gives a marked improvement in validation accuracy from around 75% to around 90%
    scaler = StandardScaler()
    df[df.drop(columns=label_colname).columns] = scaler.fit_transform(df[df.drop(columns=label_colname).columns])


    # separate into training & test datasets. 
    # Stratification is used to ensure training and test sets have representative proportions of all classes
    training_df, test_df = train_test_split(df, test_size=0.2, random_state=my_seed, stratify=df[label_colname])

    return training_df, test_df

In [4]:
training_df, test_df = get_train_test_df(fp = "heart_clean.csv", label_colname='HeartDisease', my_seed = my_seed)

## <font color = 'blue'> Train Model

**Run Optuna Study to Tune Hyperparameters**

In [5]:
class MyDataset(Dataset): # inherits properties of pytorch Dataset class
    def __init__(self, dataframe, label_colname):
        """
            Class initialisation
            :param dataframe: pandas dataframe including features and labels
            :param label_colname: name of column containing labels
            """
        self.labels = dataframe[label_colname].to_numpy()
        self.features = dataframe.drop(columns=[label_colname]).to_numpy()

    def __len__(self):
        """
        :return: length of dataset
        """
        return len(self.labels)

    def __getitem__(self, idx):
        """
        Fetches features and label(s) at requested index
        :param idx: requested index
        :return: tuple of numpy arrays - batch_features, batch_labels
        """
        batch_features = self.features[idx,:]
        batch_labels = self.labels[idx]

        return batch_features, batch_labels

In [6]:
def get_train_val_dataloader(training_df, my_batchsize, label_colname, my_seed = None):
    """
    Function to split training data into training and validation subsets and format as dataloaders
    Model performance on validation set will be used for hyperparameter tuning.

    :param training_df: dataframe with full set of training data
    :param my_batchsize: batch size for pytorch DataLoader
    :param label_colname: name of column containing labels
    :param my_seed: optional integer to fix train test split random state

    :return: tuple of pytorch DataLoaders - train_dataloader, val_dataloader
    """

    # separate into training & validation datasets
    train_data, val_data = train_test_split(training_df, test_size = 0.2, random_state = my_seed, stratify=training_df[label_colname])

    #format as pytorch dataloader
    train, val = MyDataset(train_data, label_colname), MyDataset(val_data, label_colname)
    train_dataloader = DataLoader(train, batch_size=my_batchsize, shuffle=True)
    val_dataloader = DataLoader(val, batch_size=my_batchsize)

    return train_dataloader, val_dataloader

In [7]:
def set_parameters(trial):
    """
    Set parameters for neural network, optimisation algorithm etc.
    
    :param trial: Optuna trial object
    
    :return: dictionary of parameters:
            - n_layers: number of layers in neural network
            - n_units_l{i}: number of units in layer i
            - dropout_l{i}: dropout probability for layer i
            - lr: learning rate
            - batch_size: batch size
            - n_epochs: number of epochs (i.e. number of passes through training data to optimise weights)
            - optimiser: optimisation algorithm to be used
    """
    trial.suggest_int("n_layers", 1, 3)

    for i in range(trial.params['n_layers']):
        trial.suggest_int(f'n_units_l{i}', 2, 20)
        trial.suggest_float(f"dropout_l{i}", 0.1, 1)

    trial.suggest_float("lr", 1e-5, 1e-1, log=True)
    
    # TODO: try optimising these as well
    trial.suggest_int("batch_size", 10, 10)
    trial.suggest_int("n_epochs", 30, 30)
    trial.suggest_categorical("optimizer",["SGD"])

    return trial.params

In [8]:
n_features = training_df.shape[1]-1  # number of features in feature matrix. 
n_classes = len(training_df['HeartDisease'].unique())  # number of unique classes. 

def define_model(my_params):
    """Defines feed-forward neural network based on set parameters
    
    :param my_params: dictionary of parameters (see set_parameters() for full list)
    :return: nn model
    """

    layers = []

    in_features = n_features  # number of input features for 1st layer = no. of features in feature matrix
    
    for i in range(my_params['n_layers']):
        # n_inputs = n_outputs of previous layer, n_outputs=no. of units in that lyr
        out_features = my_params[f'n_units_l{i}']
        layers.append(nn.Linear(in_features, out_features))

        layers.append(nn.ReLU())  # activation function

        #drop-out regularisation. (note: drop-out works by zeroing some elements of the tensor. tensor shape is unchanged)
        p = my_params[f"dropout_l{i}"]
        layers.append(nn.Dropout(p))

        in_features = out_features  # no. of inputs for next layer = no. of outputs of this layer

    layers.append(nn.Linear(in_features, n_classes))  # output layer. No. of outputs = no. of unique classes in dataset

    return nn.Sequential(*layers)

In [9]:
def count_correct(predictions, y):
    """
    Counts number of correct predictions in a batch
    
    :param predictions: 1D tensor with predictions
    :param y: 1D tensor with true classes
    
    :return: number of correct predictions (pred==y)
    """
    predictions = predictions.numpy()
    y = y.numpy()

    n_correct = (predictions == y).sum()

    return n_correct

In [10]:
def objective(trial):
    """
    Objective for Optuna to optimise
    :param trial: Optuna trial object
    :return: accuracy - fraction of correctly labelled validation points. This is what Optuna seeks to maximise
    """

    #set parameters
    my_params = set_parameters(trial)

    # Instantiate model
    model = define_model(my_params)

    # Instantiate optimizer
    optimizer_name = my_params['optimizer']
    lr = my_params['lr']
    optimizer = getattr(optim, optimizer_name)(model.parameters(), lr=lr)

    # get data
    train_dataloader, val_dataloader = get_train_val_dataloader(training_df,
                                                                my_batchsize=my_params['batch_size'],
                                                                label_colname='HeartDisease')
    # train model
    for epoch in range(my_params['n_epochs']):

        #train
        model.train()
        for batch, (X, y) in enumerate(train_dataloader):
            # X and y are tensors. X.size() = (batch_size,n_features), y.size()=(batch_size,)
            # set datatype for compatibility with nn.
            X = X.float()
            y = y.long()

            # calculate model output and resulting loss
            model_output = model(X)  # tensor. size=(batch_size x n_classes)
            loss_fn = nn.CrossEntropyLoss() # instantiate loss function
            loss = loss_fn(model_output, y)

            # Backpropagation to update model weights
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # validate. We do this at each epoch to facilitate pruning:
        # i.e. early termination of trials which are clearly not going to be optimum
        model.eval()
        correct = 0
        with torch.no_grad():
            for batch, (X, y) in enumerate(val_dataloader):
                X = X.float()
                y = y.long()

                # calculate model output and total number of correct predictions for this batch
                model_output = model(X)
                pred = torch.argmax(model_output, dim=1)  # prediction = class with highest output value
                correct += count_correct(pred, y)

        accuracy = correct / len(val_dataloader.dataset)

        # report accuracy to allow Optuna to decide whether to prune this trial
        trial.report(accuracy, epoch)
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()

    return accuracy # return final validation accuracy after all epochs (unless pruned)

In [11]:
# instantiate optuna study
study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler())
# Optimise hyperparameters will try {n_trials} param combinations or till {timeout} seconds is hit
study.optimize(objective, n_trials=100, timeout=600)

[32m[I 2022-11-17 13:08:45,600][0m A new study created in memory with name: no-name-cc9f1f42-27d9-44f2-991c-246c42dd5f70[0m
[32m[I 2022-11-17 13:08:47,186][0m Trial 0 finished with value: 0.8367346938775511 and parameters: {'n_layers': 3, 'n_units_l0': 8, 'dropout_l0': 0.6825611872363542, 'n_units_l1': 11, 'dropout_l1': 0.3339420064687123, 'n_units_l2': 16, 'dropout_l2': 0.5552103657432551, 'lr': 0.011943411637326744, 'batch_size': 10, 'n_epochs': 30, 'optimizer': 'SGD'}. Best is trial 0 with value: 0.8367346938775511.[0m
[32m[I 2022-11-17 13:08:48,893][0m Trial 1 finished with value: 0.5510204081632653 and parameters: {'n_layers': 2, 'n_units_l0': 8, 'dropout_l0': 0.6027422175579493, 'n_units_l1': 8, 'dropout_l1': 0.2798531293210851, 'lr': 5.3792830068769094e-05, 'batch_size': 10, 'n_epochs': 30, 'optimizer': 'SGD'}. Best is trial 0 with value: 0.8367346938775511.[0m
[32m[I 2022-11-17 13:08:51,152][0m Trial 2 finished with value: 0.5510204081632653 and parameters: {'n_layer

[32m[I 2022-11-17 13:09:23,416][0m Trial 49 pruned. [0m
[32m[I 2022-11-17 13:09:23,490][0m Trial 50 pruned. [0m
[32m[I 2022-11-17 13:09:23,558][0m Trial 51 pruned. [0m
[32m[I 2022-11-17 13:09:23,896][0m Trial 52 pruned. [0m
[32m[I 2022-11-17 13:09:24,042][0m Trial 53 pruned. [0m
[32m[I 2022-11-17 13:09:24,178][0m Trial 54 pruned. [0m
[32m[I 2022-11-17 13:09:24,251][0m Trial 55 pruned. [0m
[32m[I 2022-11-17 13:09:25,769][0m Trial 56 finished with value: 0.9047619047619048 and parameters: {'n_layers': 1, 'n_units_l0': 9, 'dropout_l0': 0.850745135319485, 'lr': 0.06393213110898077, 'batch_size': 10, 'n_epochs': 30, 'optimizer': 'SGD'}. Best is trial 29 with value: 0.9047619047619048.[0m
[32m[I 2022-11-17 13:09:25,885][0m Trial 57 pruned. [0m
[32m[I 2022-11-17 13:09:26,082][0m Trial 58 pruned. [0m
[32m[I 2022-11-17 13:09:26,159][0m Trial 59 pruned. [0m
[32m[I 2022-11-17 13:09:26,210][0m Trial 60 pruned. [0m
[32m[I 2022-11-17 13:09:26,374][0m Trial 61 pr

**Display Study Results & Extract Best Trial**

In [12]:
pruned_trials = study.get_trials(deepcopy=False, states=[TrialState.PRUNED])
complete_trials = study.get_trials(deepcopy=False, states=[TrialState.COMPLETE])

print("Study statistics: ")
print("  Number of finished trials: ", len(study.trials))
print("  Number of pruned trials: ", len(pruned_trials))
print("  Number of complete trials: ", len(complete_trials))

print("\nBest trial:")
best_trial = study.best_trial

print("  Validation Accuracy: ", best_trial.value)

print("  Params: ")
for key, value in best_trial.params.items():
    print(f"    {key}: {value}")

Study statistics: 
  Number of finished trials:  100
  Number of pruned trials:  77
  Number of complete trials:  23

Best trial:
  Validation Accuracy:  0.9047619047619048
  Params: 
    n_layers: 1
    n_units_l0: 8
    dropout_l0: 0.28062255705218997
    lr: 0.00922086440196931
    batch_size: 10
    n_epochs: 30
    optimizer: SGD


**Train Final Model Using Hyperparameters from Best Trial**

In [13]:
def df_to_dataloader(df, my_batchsize):
    """
    Function to format dataframe as dataloader
    :param df: dataframe
    :param my_batchsize: batch size for dataloader
    :return: dataloader
    """
    data = MyDataset(df, 'HeartDisease')
    my_dataloader = DataLoader(data, batch_size=my_batchsize)

    return my_dataloader

In [14]:
def train_final_model(my_params):
    """
    Train final model using tuned hyperparameters from best Optuna trial
    :param my_params: dictionary of parameters from Optuna trial object that had best validation accuracy

    :return: pytorch neural network model
    """

    # Instantiate model
    model = define_model(my_params)

    # Instantiate optimizer
    optimizer_name = my_params['optimizer']
    lr = my_params['lr']
    optimizer = getattr(optim, optimizer_name)(model.parameters(), lr=lr)

    # get data. Entire training dataset is used here, including validation set
    train_dataloader = df_to_dataloader(training_df, my_params['batch_size']) 
    
    # train model
    for epoch in range(my_params['n_epochs']):
        model.train()
        for batch, (X, y) in enumerate(train_dataloader):
            # X and y are tensors. X.size() = (batch_size,n_features), y.size()=(batch_size,)
            # set datatype for compatibility with nn.
            X = X.float()
            y = y.long()

            # calculate model output and resulting loss
            model_output = model(X)  # tensor. size=(batch_size x n_classes)
            loss_fn = nn.CrossEntropyLoss()  # instantiate loss function
            loss = loss_fn(model_output, y)

            # Backpropagation to update model weights
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    return model

In [15]:
def predict_and_evaluate(model, df):
    """
    Function to run trained and tuned model on provided dataframe to obtain predictions and evaluate
    accuracy

    :param model: trained model
    :param df: dataframe including features and target/label
    
    :return: accuracy
    """
    my_dataloader = df_to_dataloader(df, my_batchsize=10)

    model.eval()
    correct = 0
    with torch.no_grad():
        for batch, (X, y) in enumerate(my_dataloader):
            X = X.float()
            y = y.long()

            # calculate model output and total number of correct predictions for this batch
            model_output = model(X)
            pred = torch.argmax(model_output, dim=1)  # prediction = class with highest output value
            correct += count_correct(pred, y)

    accuracy = correct / len(my_dataloader.dataset)

    return accuracy

In [16]:
best_params = best_trial.params
final_model = train_final_model(best_params)

In [17]:
# Compute final training accuracy
train_acc = predict_and_evaluate(final_model, training_df)
print(f"  Final Training Accuracy: {train_acc}")

  Final Training Accuracy: 0.8608458390177354


## Evaluate Accuracy on Test Data

In [18]:
test_acc = predict_and_evaluate(final_model, test_df)
print(f"  Test Accuracy: {test_acc}")

  Test Accuracy: 0.875


Test accuracy is close to training accuracy, indicating that we have not over-fit the model.