In [1]:
from numpy import vstack
from pandas import read_csv
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, roc_auc_score
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.utils.data import random_split
from torch import Tensor
from torch.nn import Linear
from torch.nn import ReLU
from torch.nn import Sigmoid
from torch.nn import Module
from torch.optim import SGD
from torch.nn import BCELoss
from torch.nn.init import kaiming_uniform_
from torch.nn.init import xavier_uniform_

In [2]:
thyroid = pd.read_csv('data/thyroid_new.csv').drop('ID', axis=1)
y_label = (thyroid['ThryroidClass']=='sick').astype(int)
thyroid['ThryroidClass'] = y_label
# Preprocess and get rid of na
thyroid = thyroid.dropna()

# Scale X data
X = thyroid.drop('ThryroidClass', axis=1)

#thyroid.ThryroidClass
from sklearn.preprocessing import StandardScaler
import numpy as np
scaler = StandardScaler()
scaler.fit(X)
X = pd.DataFrame(scaler.transform(X))
Y = pd.Series(y_label)
thyroid = pd.concat([Y,X], axis=1)
thyroid = thyroid.dropna()
thyroid.to_csv('data/final.csv',header=None, index=False)


In [3]:
# Create a custom CSVDataset loader
# https://machinelearningmastery.com/pytorch-tutorial-develop-deep-learning-models/
class ThryoidCSVDataset(Dataset):
    #Constructor for initially loading
    def __init__(self,path):
        df = read_csv(path, header=None)
        # Store the inputs and outputs
        self.X = df.values[:, :-1]
        self.y = df.values[:, -1] #Assuming your outcome variable is in the first column
        self.X = self.X.astype('float32')
        # Label encode the target as values 1 and 0 or sick and not sick
        self.y = LabelEncoder().fit_transform(self.y)
        self.y = self.y.astype('float32')
        self.y = self.y.reshape((len(self.y), 1))

    # Get the number of rows in the dataset
    def __len__(self):
        return len(self.X)
    # Get a row at an index
    def __getitem__(self,idx):
        return [self.X[idx], self.y[idx]]

    # Create custom class method - instead of dunder methods
    def split_data(self, split_ratio=0.2):
        test_size = round(split_ratio * len(self.X))
        train_size = len(self.X) - test_size
        return random_split(self, [train_size, test_size])


In [4]:
# Create model
class ThyroidMLP(Module):
    def __init__(self, n_inputs):
        super(ThyroidMLP, self).__init__()
        # First hidden layer
        self.hidden1 = Linear(n_inputs, 10)
        kaiming_uniform_(self.hidden1.weight, nonlinearity='relu')
        self.act1 = ReLU()
        # Second hidden layer
        self.hidden2 = Linear(10, 8)
        kaiming_uniform_(self.hidden2.weight, nonlinearity='relu')
        self.act2 = ReLU()
        # Third hidden layer
        self.hidden3 = Linear(8,1)
        xavier_uniform_(self.hidden3.weight)
        self.act3 = Sigmoid()

    def forward(self, X):
        #Input to the first hidden layer
        X = self.hidden1(X)
        X = self.act1(X)
        # Second hidden layer
        X = self.hidden2(X)
        X = self.act2(X)
        # Third hidden layer
        X = self.hidden3(X)
        X = self.act3(X)
        return X


In [5]:
def prepare_thyroid_dataset(path):
    dataset = ThryoidCSVDataset(path)
    train, test = dataset.split_data(split_ratio=0.1)
    # Prepare data loaders
    train_dl = DataLoader(train, batch_size=32, shuffle=True)
    test_dl = DataLoader(test, batch_size=1024, shuffle=False)
    return train_dl, test_dl



In [6]:
# Create training loop based off our custom class
def train_model(train_dl, model):
    # Define your optimisation function for reducing loss when weights are calculated 
    # and propogated through the network
    criterion = BCELoss()
    optimizer = SGD(model.parameters(), lr=0.01, momentum=0.9)
    for epoch in range(100):
        # Create batches
        for i, (inputs, targets) in enumerate(train_dl):
            #Important to clear the gradients
            optimizer.zero_grad()
            # Compute the model output
            yhat = model(inputs)
            # Calculate the loss method
            loss = criterion(yhat, targets)
            # Update model weights through back propogation
            loss.backward()
            optimizer.step()



In [8]:
import tqdm
def evaluate_model(test_dl, model):
    preds = []
    actuals = []

    for (i, (inputs, targets)) in tqdm(enumerate(test_dl)):
        #Evaluate the model on the test set
        yhat = model(inputs)
        #Retrieve a numpy weights array
        yhat = yhat.detach().numpy()
        # Extract the weights using detach to get the numerical values in an ndarray, instead of tensor
        #https://www.tutorialspoint.com/how-to-convert-a-pytorch-tensor-with-gradient-to-a-numpy-array
        actual = targets.numpy()
        actual = actual.reshape((len(actual), 1))
        # Round to get the class value i.e. sick vs not sick
        yhat = yhat.round()
        # Store the predictions in the empty lists initialised at the start of the class
        preds.append(yhat)
        actuals.append(actual)
    
    # Stack the predictions and actual arrays vertically
    preds, actuals = vstack(preds), vstack(actuals)
    #Calculate metrics
    metrics = {
        'accuracy': accuracy_score(actuals, preds),
        'AU_ROC': roc_auc_score(actuals, preds)
    }
    return metrics, preds, actuals
        

In [9]:
# Create prediction routine
def predict(row, model):
    row = Tensor([row])
    yhat = model(row)
    # Get numpy array
    yhat = yhat.detach().numpy()
    return yhat

# Using the model

In [13]:
train_dl, test_dl = prepare_thyroid_dataset('data/final.csv')

In [14]:
print(len(train_dl.dataset), len(test_dl.dataset))

2476 275


In [15]:
model = ThyroidMLP(34)
# Train the model
train_model(train_dl, model)


RuntimeError: mat1 and mat2 shapes cannot be multiplied (32x26 and 34x10)