In [14]:
from numpy import vstack
from pandas import read_csv
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, average_precision_score
from sklearn.metrics import confusion_matrix, recall_score, f1_score
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.utils.data import random_split
import torch
from torch import Tensor
from torch.nn import Linear
from torch.nn import ReLU
from torch.nn import Sigmoid
from torch.nn import Module
from torch.optim import SGD
from torch.nn import BCELoss
from torch.optim import lr_scheduler
from torch.nn.init import kaiming_uniform_
from torch.nn.init import xavier_uniform_
import time
import copy

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [15]:

thyroid = pd.read_csv('data/thyroid_new.csv').drop('ID', axis=1)
NUM_INPUTS = len(thyroid.columns)
print(f'The original Thyroid data frame contains {len(thyroid.columns)} columns')
y_label = thyroid['ThryroidClass'] 
# Preprocess and get rid of na
thyroid = thyroid.dropna()

# Scale X data
X = thyroid.drop('ThryroidClass', axis=1)

#thyroid.ThryroidClass
from sklearn.preprocessing import StandardScaler
import numpy as np
scaler = StandardScaler()
scaler.fit(X)
X = pd.DataFrame(scaler.transform(X))
print(f'The thyroid X data frame contains {len(X.columns)} columns')
Y = pd.Series(y_label)
thyroid = pd.concat([X,Y], axis=1)
thyroid = thyroid.dropna()
thyroid.to_csv('thyroid_raw.csv', header=None, index=None)

DATASET_SIZE = len(thyroid)



The original Thyroid data frame contains 27 columns
The thyroid X data frame contains 26 columns


In [16]:

# Replicate data loader process
x = thyroid.values[:, :-1]
y = thyroid.values[:,-1]
x = x.astype('float32')
y_encoded = LabelEncoder().fit_transform(y)
y_encoded = y_encoded.astype('float32')
y_arrayed = y_encoded.reshape((len(y_encoded), 1))
print(y_encoded)


[0. 0. 0. ... 0. 0. 0.]


In [33]:
# Create a custom CSVDataset loader

class ThryoidCSVDataset(Dataset):
    #Constructor for initially loading
    def __init__(self,path):
        df = read_csv(path, header=None)
        # Store the inputs and outputs
        self.X = df.values[:, :-1]
        self.y = df.values[:, -1] #Assuming your outcome variable is in the first column
        self.X = self.X.astype('float32')
        # Label encode the target as values 1 and 0 or sick and not sick
        self.y = LabelEncoder().fit_transform(self.y)
        self.y = self.y.astype('float32')
        self.y = self.y.reshape((len(self.y), 1))

    # Get the number of rows in the dataset
    def __len__(self):
        return len(self.X)
    # Get a row at an index
    def __getitem__(self,idx):
        return [self.X[idx], self.y[idx]]

    # Create custom class method - instead of dunder methods
    def split_data(self, split_ratio=0.2):
        test_size = round(split_ratio * len(self.X))
        train_size = len(self.X) - test_size
        return random_split(self, [train_size, test_size])


In [34]:
# Create model
class ThyroidMLP(Module):
    def __init__(self, n_inputs):
        super(ThyroidMLP, self).__init__()
        # First hidden layer
        self.hidden1 = Linear(n_inputs, 20)
        kaiming_uniform_(self.hidden1.weight, nonlinearity='relu')
        self.act1 = ReLU()
        # Second hidden layer
        self.hidden2 = Linear(20, 10)
        kaiming_uniform_(self.hidden2.weight, nonlinearity='relu')
        self.act2 = ReLU()
        # Third hidden layer
        self.hidden3 = Linear(10,1)
        xavier_uniform_(self.hidden3.weight)
        self.act3 = Sigmoid()

    def forward(self, X):
        #Input to the first hidden layer
        X = self.hidden1(X)
        X = self.act1(X)
        # Second hidden layer
        X = self.hidden2(X)
        X = self.act2(X)
        # Third hidden layer
        X = self.hidden3(X)
        X = self.act3(X)
        return X


In [35]:
def prepare_thyroid_dataset(path):
    dataset = ThryoidCSVDataset(path)
    train, test = dataset.split_data(split_ratio=0.1)
    # Prepare data loaders
    train_len = len(train)
    train_dl = DataLoader(train, batch_size=32, shuffle=True)
    test_dl = DataLoader(test, batch_size=1024, shuffle=False)
    return train_dl, test_dl, train_len



In [36]:
# Create training loop based off our custom class
def train_model(train_dl, model, epochs=100, lr=0.01, momentum=0.9, save_path='thyroid_best_model.pth'):
    # Define your optimisation function for reducing loss when weights are calculated 
    # and propogated through the network
    start = time.time()
    total = 0
    criterion = BCELoss()
    optimizer = SGD(model.parameters(), lr=lr, momentum=momentum)
    loss = 0.0
    best_acc = 0.0

    for epoch in range(epochs):
        correct = 0
        print('Epoch {}/{}'.format(epoch+1, epochs))
        print('-' * 10)
        model.train()
        # Iterate through training data loader
        for i, (inputs, targets) in enumerate(train_dl):
            optimizer.zero_grad()
            outputs = model(inputs)
            _, preds = torch.max(outputs.data,1) #Get the class labels
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            total += targets.size(0)
            correct += (preds == targets).sum().item()
            torch.save(model, save_path)
        acc = correct / total
        print(f'Current accuracy is: {acc}')
    time_delta = time.time() - start
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_delta // 60, time_delta % 60
    ))
    
    return model

        



In [38]:
import math
def evaluate_model(test_dl, model, beta=1.0):
    preds = []
    actuals = []

    for (i, (inputs, targets)) in enumerate(test_dl):
        #Evaluate the model on the test set
        yhat = model(inputs)
        #Retrieve a numpy weights array
        yhat = yhat.detach().numpy()
        # Extract the weights using detach to get the numerical values in an ndarray, instead of tensor
        #https://www.tutorialspoint.com/how-to-convert-a-pytorch-tensor-with-gradient-to-a-numpy-array
        actual = targets.numpy()
        actual = actual.reshape((len(actual), 1))
        # Round to get the class value i.e. sick vs not sick
        yhat = yhat.round()
        # Store the predictions in the empty lists initialised at the start of the class
        preds.append(yhat)
        actuals.append(actual)
    
    # Stack the predictions and actual arrays vertically
    preds, actuals = vstack(preds), vstack(actuals)
    #Calculate metrics
    cm = confusion_matrix(actuals, preds)
    # Get descriptions of tp, tn, fp, fn
    tn, fp, fn, tp = cm.ravel()
    total = sum(cm.ravel())
    
    metrics = {
        'accuracy': accuracy_score(actuals, preds),
        'AU_ROC': roc_auc_score(actuals, preds),
        'f1_score': f1_score(actuals, preds),
        'average_precision_score': average_precision_score(actuals, preds),
        'f_beta': ((1+beta**2) * precision_score(actuals, preds) * recall_score(actuals, preds)) / (beta**2 * precision_score(actuals, preds) + recall_score(actuals, preds)),
        'matthews_correlation_coefficient': (tp*tn - fp*fn) / math.sqrt((tp+fp)*(tp+fn)*(tn+fp)*(tn+fn)),
        'precision': precision_score(actuals, preds),
        'recall': recall_score(actuals, preds),
        'true_positive_rate_TPR':recall_score(actuals, preds),
        'false_positive_rate_FPR':fp / (fp + tn) ,
        'false_discovery_rate': fp / (fp +tp),
        'false_negative_rate': fn / (fn + tp) ,
        'negative_predictive_value': tn / (tn+fn),
        'misclassification_error_rate': (fp+fn)/total ,
        'sensitivity': tp / (tp + fn),
        'specificity': tn / (tn + fp),
        #'confusion_matrix': confusion_matrix(actuals, preds), 
        'TP': tp,
        'FP': fp, 
        'FN': fn, 
        'TN': tn
    }
    return metrics, preds, actuals
        

In [22]:
# Create prediction routine
def predict(row, model):
    row = Tensor([row])
    yhat = model(row)
    # Get numpy array
    yhat = yhat.detach().numpy()
    return yhat

# Using the model

In [29]:
train_dl, test_dl, train_len = prepare_thyroid_dataset('https://raw.githubusercontent.com/jbrownlee/Datasets/master/ionosphere.csv')
#train_dl, test_dl, train_len = prepare_thyroid_dataset('https://raw.githubusercontent.com/StatsGary/Data/main/thyroid_raw.csv')

In [30]:
print(len(train_dl.dataset), len(test_dl.dataset))
print(train_dl.dataset)
train_len

316 35
<torch.utils.data.dataset.Subset object at 0x7ff7dc526e50>


316

In [39]:
# Specify the number of input dimensions
model = ThyroidMLP(34)
# Train the model
train_model(train_dl, model, save_path='data/thyroid_model.pth')
# Evaluate the model
results = evaluate_model(test_dl, model, beta=5)
model_metrics = results[0]
preds = results[1]
actuals_gt_labesl = results[2]
print(model_metrics)



Epoch 1/100
----------
Current accuracy is: 11.544303797468354
Epoch 2/100
----------
Current accuracy is: 5.7594936708860756
Epoch 3/100
----------
Current accuracy is: 3.8438818565400843
Epoch 4/100
----------
Current accuracy is: 2.8892405063291138
Epoch 5/100
----------
Current accuracy is: 2.30126582278481
Epoch 6/100
----------
Current accuracy is: 1.9282700421940928
Epoch 7/100
----------
Current accuracy is: 1.6491862567811935
Epoch 8/100
----------
Current accuracy is: 1.4414556962025316
Epoch 9/100
----------
Current accuracy is: 1.279887482419128
Epoch 10/100
----------
Current accuracy is: 1.1544303797468354
Epoch 11/100
----------
Current accuracy is: 1.046029919447641
Epoch 12/100
----------
Current accuracy is: 0.9578059071729957
Epoch 13/100
----------
Current accuracy is: 0.8880233690360273
Epoch 14/100
----------
Current accuracy is: 0.8227848101265823
Epoch 15/100
----------
Current accuracy is: 0.770464135021097
Epoch 16/100
----------
Current accuracy is: 0.7191455