In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import random

## Training and Testing a basic ANN in Pytorch

This tutorial will cover everything you need to train and test a basic ANN in pytorch. Before we get into that, we will redo all the things we did in the data preparation and ANN class tutorial as a reminder!

# Getting into the neural network
Remember our BIOF050 class from last time? Now we are going to finish it so it can take some data and labels, create a neural network, train it, and test it!

In [2]:
class BIOF050:
    
    '''
        
        Inside this Net class, we can define what we want our neural network to look like!
        We will define the layers and the sizes of the layers here - this is just a basic ANN 
        
        Inputs:
        
        n_features - how many features does one sample of our data have (how many columns does the matrix have)
        
        hidden_dimension - the number of hidden neurons we want?
        
        n_classes - the number of unique labels in our data (i.e. 0,1 for the Breast Cancer dataset)
        
    '''
    
    class Net(nn.Module):
        
        def __init__(self,n_features,hidden_dimension,n_classes):
        
            ##### calling the constructor of the parent class - gets everything we need from Pytorch
            super(BIOF050.Net, self).__init__()
            
            ''' When dealing with nn.Linear, the first input is the size of the input data,
            and the second input is how big you want the next layer to be '''
            
            ### The data enters here, then we make the next layer (hidden neurons)
            self.input_layer = nn.Linear(n_features,hidden_dimension)
            
            ### hidden layer #1
            self.layer1 = nn.Linear(hidden_dimension,hidden_dimension)
            
            ### hidden layer #2
            self.layer2 = nn.Linear(hidden_dimension, hidden_dimension)
            
            ### The output layer, where we end up with a series of nodes corresponding to each of our uniquelabels
            self.output_layer = nn.Linear(hidden_dimension,n_classes)
            
            '''
              After each layer, we will apply nn.ReLU to transform our data into a nonlinear space
              We input our data into this function, and ReLU is applied!
            
            '''
            self.relu = nn.ReLU()
       
    
    
        '''
        Now, we have to define the forward method, which takes a data point, or, in most cases, a batch, and
        feeds it through all the layers of our neural network until assigning it a layer
        
        nn.Linear takes one array as an input, so we will input our data right into each layer, and then input the
        outputs of each layer into the next layer
        
        After each layer, we will apply nn.ReLU to transform our data into a nonlinear space
        
        Finally, after the data has been passed through the output layer, we will convert it into a probaboility
        distribution using the softmax function. 
        
        This probabilty dsistribution will be used to assign a label to our
        data points and to figure out just how well our neural network did, as we learned earlier today
        
        '''
        def forward(self,batch):
            
            ## put the data into the input layer of the neural network
            batch = self.input_layer(batch)
            
            batch = self.relu(batch)
      
            ## put the transformed data into the first hidden layer of the neural network
            batch = self.layer1(batch)
            
            ## apply the ReLU function to the output of the 1st hidden layer
            batch = self.relu(batch)
            
            ## put the transformed data into the second hidden layer of the neural network
            batch = self.layer2(batch)
            
            ## apply the ReLU function to the output of the 1st hidden layer
            batch = self.relu(batch)
            
            ## put the transformed data into the output layer of the neural network
            batch = self.output_layer(batch)
            
            ### return the probability distribution via the softmax function
            return nn.functional.softmax(batch)
            
            

# Defining the Constructor
 
We will add a constructor to our BIOF 050 class that takes a dataset and labels and makes them attributes of the class. We will define the constructor here before adding it into our class.

### Labels for pytorch must be numbers starting at zero and increasing sequentially (i.e. [0,1,2]) so always make sure that is the case! Otherwise, make sure it the case!

In [3]:
class BIOF050:
    
        
    
    '''
        
        Inside this BIOF050 class, we can input our data and labels
    '''
    
    def __init__(self,data,labels,tst_size=0.2,n_epochs=4):
        
        self.data = data
        self.labels

## Now, we will add this constructor into our class-in-progress

In [4]:
class BIOF050:

        
    '''
        
        Inside this BIOF050 class, we can input our data and labels
        
    '''
    
    def __init__(self,data,labels,tst_size=0.2,n_epochs=4):
        
        self.data = data
        self.labels = labels

      
    
    '''
        
        Inside this Net class, we can define what we want our neural network to look like!
        We will define the layers and the sizes of the layers here - this is just a basic ANN 
        
        Inputs:
        
        n_features - how many features does one sample of our data have (how many columns does the matrix have)
        
        hidden_dimension - the number of hidden neurons we want
        
        n_classes - the number of unique labels in our data (i.e. 0,1 for the Breast Cancer dataset)
        
    '''
    
    class Net(nn.Module):
        
        def __init__(self,n_features,hidden_dimension,n_classes):
        
            ##### calling the constructor of the parent class - gets everything we need from Pytorch
            super(BIOF050.Net, self).__init__()
            
            ''' When dealing with nn.Linear, the first input is the size of the input data,
            and the second input is how big you want the next layer to be '''
            
            ### The data enters here, then we make the next layer (hidden neurons)
            self.input_layer = nn.Linear(n_features,hidden_dimension)
            
            ### hidden layer #1
            self.layer1 = nn.Linear(hidden_dimension,hidden_dimension)
            
            ### hidden layer #2
            self.layer2 = nn.Linear(hidden_dimension, hidden_dimension)
            
            ### The output layer, where we end up with a series of nodes corresponding to each of our uniquelabels
            self.output_layer = nn.Linear(hidden_dimension,n_classes)
            
            '''
              After each layer, we will apply nn.ReLU to transform our data into a nonlinear space
              We input our data into this function, and ReLU is applied!
            
            '''
            self.relu = nn.ReLU()
       
    
    
        '''
        Now, we have to define the forward method, which takes a data point, or, in most cases, a batch, and
        feeds it through all the layers of our neural network until assigning it a layer
        
        nn.Linear takes one array as an input, so we will input our data right into each layer, and then input the
        outputs of each layer into the next layer
   
        Finally, after the data has been passed through the output layer, we will convert it into a probaboility
        distribution using the softmax function. 
        
        This probabilty dsistribution will be used to assign a label to our
        data points and to figure out just how well our neural network did, as we learned earlier today
        
        '''
        def forward(self,batch):
            
            ## put the data into the input layer of the neural network
            batch = self.input_layer(batch)
            
            batch = self.relu(batch)
      
            ## put the transformed data into the first hidden layer of the neural network
            batch = self.layer1(batch)
            
            ## apply the ReLU function to the output of the 1st hidden layer
            batch = self.relu(batch)
            
            ## put the transformed data into the second hidden layer of the neural network
            batch = self.layer2(batch)
            
            ## apply the ReLU function to the output of the 1st hidden layer
            batch = self.relu(batch)
            
            ## put the transformed data into the output layer of the neural network
            batch = self.output_layer(batch)
            
            ### return the probability distribution via the softmax function
            return nn.functional.softmax(batch)

## Adding in our batchify function from last time
We know we will need to batch our dataset, so we will add in our "batchify" function from last time

In [5]:
class BIOF050:

        
    '''
        
        Inside this BIOF050 class, we can input our data and labels
        
    '''
    
    def __init__(self,data,labels,tst_size=0.2,n_epochs=4):
        
        self.data = data
        self.labels = labels

      
    
    '''
        
        Inside this Net class, we can define what we want our neural network to look like!
        We will define the layers and the sizes of the layers here - this is just a basic ANN 
        
        Inputs:
        
        n_features - how many features does one sample of our data have (how many columns does the matrix have)
        
        hidden_dimension - the number of hidden neurons we want?
        
        n_classes - the number of unique labels in our data (i.e. 0,1 for the Breast Cancer dataset)
        
    '''
    
    class Net(nn.Module):
        
        def __init__(self,n_features,hidden_dimension,n_classes):
        
            ##### calling the constructor of the parent class - gets everything we need from Pytorch
            super(BIOF050.Net, self).__init__()
            
            ''' When dealing with nn.Linear, the first input is the size of the input data,
            and the second input is how big you want the next layer to be '''
            
            ### The data enters here, then we make the next layer (hidden neurons)
            self.input_layer = nn.Linear(n_features,hidden_dimension)
            
            ### hidden layer #1
            self.layer1 = nn.Linear(hidden_dimension,hidden_dimension)
            
            ### hidden layer #2
            self.layer2 = nn.Linear(hidden_dimension, hidden_dimension)
            
            ### The output layer, where we end up with a series of nodes corresponding to each of our uniquelabels
            self.output_layer = nn.Linear(hidden_dimension,n_classes)
            
            '''
              After each layer, we will apply nn.ReLU to transform our data into a nonlinear space
              We input our data into this function, and ReLU is applied!
            
            '''
            self.relu = nn.ReLU()
       
    
    
        '''
        Now, we have to define the forward method, which takes a data point, or, in most cases, a batch, and
        feeds it through all the layers of our neural network until assigning it a layer
        
        nn.Linear takes one array as an input, so we will input our data right into each layer, and then input the
        outputs of each layer into the next layer
        
        Finally, after the data has been passed through the output layer, we will convert it into a probaboility
        distribution using the softmax function. 
        
        This probabilty dsistribution will be used to assign a label to our
        data points and to figure out just how well our neural network did, as we learned earlier today
        
        '''
        def forward(self,batch):
            
            ## put the data into the input layer of the neural network
            batch = self.input_layer(batch)
            
            batch = self.relu(batch)
      
            ## put the transformed data into the first hidden layer of the neural network
            batch = self.layer1(batch)
            
            ## apply the ReLU function to the output of the 1st hidden layer
            batch = self.relu(batch)
            
            ## put the transformed data into the second hidden layer of the neural network
            batch = self.layer2(batch)
            
            ## apply the ReLU function to the output of the 1st hidden layer
            batch = self.relu(batch)
            
            ## put the transformed data into the output layer of the neural network
            batch = self.output_layer(batch)
            
            ### return the probability distribution via the softmax function
            return nn.functional.softmax(batch)

        
        
''' Utility Function - function to turn the data into batches'''

def batchify(data,labels,batch_size=16):
    
    batches= []
    label_batches = []


    for n in range(0,len(data),batch_size):
        if n+batch_size < len(data):
            batches.append(data[n:n+batch_size])
            label_batches.append(labels[n:n+batch_size])

    if len(data)%batch_size > 0:
        batches.append(data[len(data)-(len(data)%batch_size):len(data)])
        label_batches.append(labels[len(data)-(len(data)%batch_size):len(data)])
        
    return batches,label_batches

# Training the neural network
Now, we need to define a function that splits and batchifies the data, then creates and trains a neural network

So we can easily customize the analysis, our train function will take as input parameters:
1. the size of the testing set we want
2. The number of epochs we want (how many times we move through the dataset)
3. the size of the hidden layers we want (how many hidden neurons)
4. The size of our batches

## Splitting:
In most cases, datasets that need NNs are too large and NNs take too long to do cross-validation, so we will just split the data one time using sklearn's train_test_split.

In [6]:
'''
        
        We can specify the size of the test dataset, and the
        number of epochs (number of times we move through the data), the size of the hidden layers, and the 
        size of the batches here in the train_test function for easy customizations
        
        Inside the function, our data will be broken down into train and test sets (arrays)
        using sklearn's train_test_split function.
        
        From there, it will be batchified using our batchify function from the preparation tutorial
        
        Finally, a neural network will be generated and trained on our dataset!
        
'''

def train_test(self,test_size,n_epochs,hidden_dimensions,batch_size,lr):
            
        ### splitting the data into a training/testing set
        train_data,test_data,train_labels,test_labels = train_test_split(self.data,self.labels, test_size=test_size)
        
        ## creating the batches using the batchify function
        train_batches,train_label_batches = batchify(train_data,train_labels,batch_size=batch_size)
        
        '''
        Here is where we define our neural network model - the Net class is inside BIOF050, so we have to call
        it accordingly 
        
        We use the length of our first data point to set the length of our input data (they are all the same)
        
        The number of class is equal to the number of unique values (the set) of our training labels
        '''
        neural_network = BIOF050_Final.Net(len(train_data[0]),hidden_dimensions,len(set(train_labels)))
        
         
    
        '''
        The train function tells the neural network that it is about to be trained and that it 
        will have to calculate the needed information for optimization 
        
        This function should always be called before training
        '''
        neural_network.train()
        
        
        ''' This loop moves through the data once for each epoch'''
        for i in range(n_epochs):
            
            ### track the number we get correct
            correct = 0
            
            ''' This loop moves through each batch and feeds them into the neural network'''
            for ii in range(len(train_batches)):
                
                batch = train_batches[ii]
                labels = train_label_batches[ii]

            
                ''' 
                Puts our batch into the neural network

                Predictions: For each data point in our batch, we would get something that looks like:
                tensor([0.3,0.7]) where each number corresponds to the probability of a class
                '''
                
                predictions = neural_network(torch.tensor(batch.astype(np.float32)))
                
        return neural_network
        

## THE TRAIN FUNCTION WILL NOT WORK
Why? Because the data type is all wrong. The pytorch data type is called a tensor (kind of like an array), and Pytorch neural networks require tensors as inputs. We must convert our data (either a list or an array) to a tensor with the torch.tensor function! If your data is not a list or an array, you need to make it into one!

In [7]:
 def train_test(self,test_size,n_epochs,hidden_dimensions,batch_size,lr):
            
        ### splitting the data into a training/testing set
        train_data,test_data,train_labels,test_labels = train_test_split(self.data,self.labels, test_size=test_size)
        
        ## creating the batches using the batchify function
        train_batches,train_label_batches = batchify(train_data,train_labels,batch_size=batch_size)
        
        '''
        Here is where we define our neural network model - the Net class is inside BIOF050, so we have to call
        it accordingly 
        
        We use the length of our first data point to set the length of our input data (they are all the same)
        
        The number of class is equal to the number of unique values (the set) of our training labels
        '''
        neural_network = BIOF050_Final.Net(len(train_data[0]),hidden_dimensions,len(set(train_labels)))
              
        
        '''
        The train function tells the neural network that it is about to be trained and that it 
        will have to calculate the needed information for optimization 
        
        This function should always be called before training
        '''
        neural_network.train()
        
        
        ''' This loop moves through the data once for each epoch'''
        for i in range(n_epochs):
            
            ### track the number we get correct
            correct = 0
            
            ''' This loop moves through each batch and feeds into the neural network'''
            for ii in range(len(train_batches)):
                
                batch = train_batches[ii]
                labels = train_label_batches[ii]

                
                ''' 
                Puts our batch into the neural network after converting it to a tensor
                
                Pytorch wants numeric data to be floats, so we will convert to a float as well 
                using np.float32
                
                Predictions: For each data point in our batch, we would get something that looks like:
                tensor([0.3,0.7]) where each number corresponds to the probability of a class
                
                
                '''
                predictions = neural_network(torch.tensor(batch.astype(np.float32)))
                
        return neural_network
        

## THE TRAIN FUNCTION WILL NOT LEARN ANYTHING
Why? Because there is no optimization function or loss function!

1. We will first define a stochastic gradient descent otpimization function using the torch.optim package
    We must choose a learning rate for this function, so that will be a parameter of our train_test function (lr)
    
2. We will then define a cross-entropy loss function through the nn module

3. We will then implement both these functions in our training loop. 

In [8]:
 def train_test(self,test_size,n_epochs,hidden_dimensions,batch_size,lr):
            
        ### splitting the data into a training/testing set
        train_data,test_data,train_labels,test_labels = train_test_split(self.data,self.labels, test_size=test_size)
        
        ## creating the batches using the batchify function
        train_batches,train_label_batches = batchify(train_data,train_labels,batch_size=batch_size)
        
        '''
        Here is where we define our neural network model - the Net class is inside BIOF050, so we have to call
        it accordingly 
        
        We use the length of our first data point to set the length of our input data (they are all the same)
        
        The number of class is equal to the number of unique values (the set) of our training labels
        '''
        neural_network = BIOF050_Final.Net(len(train_data[0]),hidden_dimensions,len(set(train_labels)))
        
        
        '''
        Here, we use the torch.optim package to create our stochastic gradient descent function
        
        neural_network.parameters() reads internal information from our NN 
        (don't worry about that - SGD just requires it)
        
        lr is the learning rate
        '''
        optimizer = optim.SGD(neural_network.parameters(), lr=lr)
        
        
        '''
        Here, we use the nn package to create our cross entropy loss function
        '''
        loss_function = nn.CrossEntropyLoss()
        
                
        '''
        The train function tells the neural network that it is about to be trained and that it 
        will have to calculate the needed information for optimization 
        
        This function should always be called before training
        '''
        neural_network.train()
        
        
        ''' This loop moves through the data once for each epoch'''
        for i in range(n_epochs):
            
            ### track the number we get correct
            correct = 0
            
            ''' This loop moves through each batch and feeds into the neural network'''
            for ii in range(len(train_batches)):
                
                ''' 
                Clears previous gradients from the optimizer - the optimizer,
                in this case, does not need to know what happened last time
                '''
                optimizer.zero_grad()
                
                
                batch = train_batches[ii]
                labels = train_label_batches[ii]

                
                ''' 
                Puts our batch into the neural network after converting it to a tensor
                
                Pytorch wants numeric data to be floats, so we will convert to a float as well 
                using np.float32
                
                Predictions: For each data point in our batch, we would get something that looks like:
                tensor([0.3,0.7]) where each number corresponds to the probability of a class
                
                '''
                predictions = neural_network(torch.tensor(batch.astype(np.float32)))
                
                
                ''' 
                We put our probabilities into the loss function to calculate the error for this batch
                
                '''
                loss = loss_function(predictions,torch.LongTensor(labels))
                
                '''
                loss.backward calculates the partial derivatives that we need to optimize
                '''
                loss.backward()
                
                
                '''
                optimizer step calculates the weight updates so the neural network can update the weights 
                '''
                optimizer.step()
                
           
        return neural_network
        

## Testing the Neural Network
We now have a function to train our dataset, but how do we know if we overfit or not? Our neural network 
needs to learn things that can be applied to blind data, which makes it useful in areas like clinical decision making. After training the network, it needs to work on all the data we put into it from patients in the clinic!

We will now update our train_test function to predict the labels of our test dataset using the weights we already optimized with our SGD function (they will not be updated any further). 

In [9]:
 def train_test(self,test_size,n_epochs,hidden_dimensions,batch_size,lr):
            
        ### splitting the data into a training/testing set
        train_data,test_data,train_labels,test_labels = train_test_split(self.data,self.labels, test_size=test_size)
        
        ## creating the batches using the batchify function
        train_batches,train_label_batches = batchify(train_data,train_labels,batch_size=batch_size)
        
        '''
        Here is where we define our neural network model - the Net class is inside BIOF050, so we have to call
        it accordingly 
        
        We use the length of our first data point to set the length of our input data (they are all the same)
        
        The number of class is equal to the number of unique values (the set) of our training labels
        '''
        neural_network = BIOF050_Final.Net(len(train_data[0]),hidden_dimensions,len(set(train_labels)))
        
        
        '''
        Here, we use the torch.optim package to create our stochastic gradient descent function
        
        neural_network.parameters() reads internal information from our NN 
        (don't worry about that - SGD just requires it)
        
        lr is the learning rate
        '''
        optimizer = optim.SGD(neural_network.parameters(), lr=lr)
        
        
        '''
        Here, we use the nn package to create our cross entropy loss function
        '''
        loss_function = nn.CrossEntropyLoss()
        
                
        '''
        The train function tells the neural network that it is about to be trained and that it 
        will have to calculate the needed information for optimization 
        
        This function should always be called before training
        '''
        neural_network.train()
        
        
        ''' This loop moves through the data once for each epoch'''
        for i in range(n_epochs):
            
            ### track the number we get correct
            correct = 0
            
            ''' This loop moves through each batch and feeds into the neural network'''
            for ii in range(len(train_batches)):
                
                ''' 
                Clears previous gradients from the optimizer - the optimizer,
                in this case, does not need to know what happened last time
                '''
                optimizer.zero_grad()
                
                
                batch = train_batches[ii]
                labels = train_label_batches[ii]

                
                ''' 
                Puts our batch into the neural network after converting it to a tensor
                
                Pytorch wants numeric data to be floats, so we will convert to a float as well 
                using np.float32
                
                Predictions: For each data point in our batch, we would get something that looks like:
                tensor([0.3,0.7]) where each number corresponds to the probability of a class
                '''
                predictions = neural_network(torch.tensor(batch.astype(np.float32)))
                
                
                ''' 
                We put our probabilities into the loss function to calculate the error for this batch
                
                '''
                loss = loss_function(predictions,torch.LongTensor(labels))
                
                '''
                loss.backward calculates the partial derivatives that we need to optimize
                '''
                loss.backward()
                
                
                '''
                optimizer step calculates the weight updates so the neural network can update the weights 
                '''
                optimizer.step()
                       
        '''
        The eval function tells the neural network that it is about to be tested on blind test data
        and shouldn't change any of its internal parameters
        
        This function should always be called before eval
        '''
        neural_network.eval()
        
        test_correct = 0
        
        ''' input our test data into the neural network'''
        predictions = neural_network(torch.tensor(test_data.astype(np.float32)))

        return neural_network
        

## But how will we know? 
We need something in our training function that prints out the train and testing accuracies! So we can track our analysis as it runs

In [10]:
  def train_test(self,test_size,n_epochs,hidden_dimensions,batch_size,lr):
            
        ### splitting the data into a training/testing set
        train_data,test_data,train_labels,test_labels = train_test_split(self.data,self.labels, test_size=test_size)
        
        ## creating the batches using the batchify function
        train_batches,train_label_batches = batchify(train_data,train_labels,batch_size=batch_size)
        
        '''
        Here is where we define our neural network model - the Net class is inside BIOF050, so we have to call
        it accordingly 
        
        We use the length of our first data point to set the length of our input data (they are all the same)
        
        The number of class is equal to the number of unique values (the set) of our training labels
        '''
        neural_network = BIOF050_Final.Net(len(train_data[0]),hidden_dimensions,len(set(train_labels)))
        
        
        '''
        Here, we use the torch.optim package to create our stochastic gradient descent function
        
        neural_network.parameters() reads internal information from our NN 
        (don't worry about that - SGD just requires it)
        
        lr is the learning rate
        '''
        optimizer = optim.SGD(neural_network.parameters(), lr=lr)
        
        
        '''
        Here, we use the nn package to create our cross entropy loss function
        '''
        loss_function = nn.CrossEntropyLoss()
        
                
        '''
        The train function tells the neural network that it is about to be trained and that it 
        will have to calculate the needed information for optimization 
        
        This function should always be called before training
        '''
        neural_network.train()
        
        
        ''' This loop moves through the data once for each epoch'''
        for i in range(n_epochs):
            
            ### track the number we get correct
            correct = 0
            
            ''' This loop moves through each batch and feeds into the neural network'''
            for ii in range(len(train_batches)):
                
                ''' 
                Clears previous gradients from the optimizer - the optimizer,
                in this case, does not need to know what happened last time
                '''
                optimizer.zero_grad()
                
                
                batch = train_batches[ii]
                labels = train_label_batches[ii]

                
                ''' 
                Puts our batch into the neural network after converting it to a tensor
                
                Pytorch wants numeric data to be floats, so we will convert to a float as well 
                using np.float32
                
                Predictions: For each data point in our batch, we would get something that looks like:
                tensor([0.3,0.7]) where each number corresponds to the probability of a class
                '''
                predictions = neural_network(torch.tensor(batch.astype(np.float32)))
                
                
                ''' 
                We put our probabilities into the loss function to calculate the error for this batch
                
                '''
                loss = loss_function(predictions,torch.LongTensor(labels))
                
                '''
                loss.backward calculates the partial derivatives that we need to optimize
                '''
                loss.backward()
                
                
                '''
                optimizer step calculates the weight updates so the neural network can update the weights 
                '''
                optimizer.step()
                
                
                '''
                We extract just the data from our predictions, not other stuff Pytorch includes in that object
                
                We can then use the argmax function to figure out which index corresponds to the highest probability.
                If it is the 0th index, and the label is zero, we add one to correct. 
                If it is the 1st index, and the label is one, we add one to correct.
                
                This is why the labels need to start at zero and increase sequentially!
                
                '''
                for n,pred in enumerate(predictions.data):
                    if labels[n] == torch.argmax(pred):
                        correct += 1
                        
                        
            print("Accuracy for Epoch # " + str(i) + ": " + str(correct/len(train_data)))

        print()
        

                    
        '''
        The eval function tells the neural network that it is about to be tested on blind test data
        and shouldn't change any of its internal parameters
        
        This function should always be called before eval
        '''
        neural_network.eval()
        
        test_correct = 0
        
        ''' input our test data into the neural network'''
        predictions = neural_network(torch.tensor(test_data.astype(np.float32)))
        
        ''' checks how many we got right - very simple!'''
        for n,pred in enumerate(predictions.data):
            if test_labels[n] == torch.argmax(pred):
                    test_correct += 1
                    
        print("Accuracy on test set: " + str(test_correct/len(test_data)))
           
        return neural_network
        

# Final updates
Now, we will add our train_test function to the BIOF050 class, which will now be called the BIOF050_Final class (used in Assignment #1)

In [11]:
class BIOF050_Final:

        
    '''
        
        Inside this BIOF050 class, we can input our data and labels
        
    '''
    
    def __init__(self,data,labels,tst_size=0.2,n_epochs=4):
        
        self.data = data
        self.labels = labels

      
    
    '''
        
        Inside this Net class, we can define what we want our neural network to look like!
        We will define the layers and the sizes of the layers here - this is just a basic ANN 
        
        Inputs:
        
        n_features - how many features does one sample of our data have (how many columns does the matrix have)
        
        hidden_dimension - the number of hidden neurons we want?
        
        n_classes - the number of unique labels in our data (i.e. 0,1 for the Breast Cancer dataset)
        
    '''
    
    class Net(nn.Module):
        
        def __init__(self,n_features,hidden_dimension,n_classes):
        
            ##### calling the constructor of the parent class - gets everything we need from Pytorch
            super(BIOF050_Final.Net, self).__init__()
            
            ''' When dealing with nn.Linear, the first input is the size of the input data,
            and the second input is how big you want the next layer to be '''
            
            ### The data enters here, then we make the next layer (hidden neurons)
            self.input_layer = nn.Linear(n_features,hidden_dimension)
            
            ### hidden layer #1
            self.layer1 = nn.Linear(hidden_dimension,hidden_dimension)
            
            ### hidden layer #2
            self.layer2 = nn.Linear(hidden_dimension, hidden_dimension)
            
            ### The output layer, where we end up with a series of nodes corresponding to each of our uniquelabels
            self.output_layer = nn.Linear(hidden_dimension,n_classes)
            
            '''
              After each layer, we will apply nn.ReLU to transform our data into a nonlinear space
              We input our data into this function, and ReLU is applied!
            
            '''
            self.relu = nn.ReLU()
       
    
    
        '''
        Now, we have to define the forward method, which takes a data point, or, in most cases, a batch, and
        feeds it through all the layers of our neural network until assigning it a layer
        
        nn.Linear takes one array as an input, so we will input our data right into each layer, and then input the
        outputs of each layer into the next layer
        
        After each layer, we will apply nn.ReLU to transform our data into a nonlinear space
        
        Finally, after the data has been passed through the output layer, we will convert it into a probaboility
        distribution using the softmax function. 
        
        This probabilty dsistribution will be used to assign a label to our
        data points and to figure out just how well our neural network did, as we learned earlier today
        
        '''
        def forward(self,batch):
            
            ## put the data into the input layer of the neural network
            batch = self.input_layer(batch)
            
            batch = self.relu(batch)
      
            ## put the transformed data into the first hidden layer of the neural network
            batch = self.layer1(batch)
            
            ## apply the ReLU function to the output of the 1st hidden layer
            batch = self.relu(batch)
            
            ## put the transformed data into the second hidden layer of the neural network
            batch = self.layer2(batch)
            
            ## apply the ReLU function to the output of the 1st hidden layer
            batch = self.relu(batch)
            
            ## put the transformed data into the output layer of the neural network
            batch = self.output_layer(batch)
            
            ### return the probability distribution via the softmax function
            return nn.functional.softmax(batch)
            
    

        
    def train_test(self,test_size,n_epochs,hidden_dimensions,batch_size,lr):
            
        ### splitting the data into a training/testing set
        train_data,test_data,train_labels,test_labels = train_test_split(self.data,self.labels, test_size=test_size)
        
        ## creating the batches using the batchify function
        train_batches,train_label_batches = batchify(train_data,train_labels,batch_size=batch_size)
        
        '''
        Here is where we define our neural network model - the Net class is inside BIOF050, so we have to call
        it accordingly 
        
        We use the length of our first data point to set the length of our input data (they are all the same)
        
        The number of class is equal to the number of unique values (the set) of our training labels
        '''
        neural_network = BIOF050_Final.Net(len(train_data[0]),hidden_dimensions,len(set(train_labels)))
        
        
        '''
        Here, we use the torch.optim package to create our stochastic gradient descent function
        
        neural_network.parameters() reads internal information from our NN 
        (don't worry about that - SGD just requires it)
        
        lr is the learning rate
        '''
        optimizer = optim.SGD(neural_network.parameters(), lr=lr)
        
        
        '''
        Here, we use the nn package to create our cross entropy loss function
        '''
        loss_function = nn.CrossEntropyLoss()
        
                
        '''
        The train function tells the neural network that it is about to be trained and that it 
        will have to calculate the needed information for optimization 
        
        This function should always be called before training
        '''
        neural_network.train()
        
        
        ''' This loop moves through the data once for each epoch'''
        for i in range(n_epochs):
            
            ### track the number we get correct
            correct = 0
            
            ''' This loop moves through each batch and feeds into the neural network'''
            for ii in range(len(train_batches)):
                
                ''' 
                Clears previous gradients from the optimizer - the optimizer,
                in this case, does not need to know what happened last time
                '''
                optimizer.zero_grad()
                
                
                batch = train_batches[ii]
                labels = train_label_batches[ii]

                
                ''' 
                Puts our batch into the neural network after converting it to a tensor
                
                Pytorch wants numeric data to be floats, so we will convert to a float as well 
                using np.float32
                
                Predictions: For each data point in our batch, we would get something that looks like:
                tensor([0.3,0.7]) where each number corresponds to the probability of a class
                '''
                predictions = neural_network(torch.tensor(batch.astype(np.float32)))
                
                
                ''' 
                We put our probabilities into the loss function to calculate the error for this batch
                
                '''
                loss = loss_function(predictions,torch.LongTensor(labels))
                
                '''
                loss.backward calculates the partial derivatives that we need to optimize
                '''
                loss.backward()
                
                
                '''
                optimizer step calculates the weight updates so the neural network can update the weights 
                '''
                optimizer.step()
                
                
                '''
                We extract just the data from our predictions, not other stuff Pytorch includes in that object
                
                We can then use the argmax function to figure out which index corresponds to the highest probability.
                If it is the 0th index, and the label is zero, we add one to correct. 
                If it is the 1st index, and the label is one, we add one to correct.
                
                This is why the labels need to start at zero and increase sequentially!
                '''
                for n,pred in enumerate(predictions.data):
                    if labels[n] == torch.argmax(pred):
                        correct += 1
                        
                        
            print("Accuracy for Epoch # " + str(i) + ": " + str(correct/len(train_data)))

        print()
        

                    
        '''
        The eval function tells the neural network that it is about to be tested on blind test data
        and shouldn't change any of its internal parameters
        
        This function should always be called before eval
        '''
        neural_network.eval()
        
        test_correct = 0
        
        ''' input our test data into the neural network'''
        predictions = neural_network(torch.tensor(test_data.astype(np.float32)))
        
        ''' this checks how many we got right - very simple!'''
        for n,pred in enumerate(predictions.data):
            if test_labels[n] == torch.argmax(pred):
                    test_correct += 1
                    
        print("Accuracy on test set: " + str(test_correct/len(test_data)))
           
        return neural_network
        
   


''' Utility Function - function to turn the data into batches'''

def batchify(data,labels,batch_size=16):
    
    batches= []
    label_batches = []


    for n in range(0,len(data),batch_size):
        if n+batch_size < len(data):
            batches.append(data[n:n+batch_size])
            label_batches.append(labels[n:n+batch_size])

    if len(data)%batch_size > 0:
        batches.append(data[len(data)-(len(data)%batch_size):len(data)])
        label_batches.append(labels[len(data)-(len(data)%batch_size):len(data)])
        
    return batches,label_batches

## Actually using our NN
It's time to actually use our neural network, so we are going to use some real-life data. The dataset I provided (data.csv), needs to be in the same directory as this notebook. I recommend simply downloading them both and keeping them in downloads for simplicity. 

### Data.csv
Data.csv contains protein expression values for 50,000 T-Cells. 25,000 come from patietns who did not respond to immunotherapy (label 1), and 25,000 cells come from patients who did respond to immunotherapy (label 0). We are going to use a neural network to see if we can predict which is which and determine if a patient should receive this immunotherapy treatment or not (precison medicine application!)

In [12]:
#### load the data object
data = pd.read_csv('data.csv')


#### break it down into data and labels (for each index)
labels = data['score'].values
data = data[[col for col in data.columns if col != 'score']].values

print(data[0:5])
print(labels[0:5])

[[0.00000000e+00 5.25400281e-01 0.00000000e+00 0.00000000e+00
  2.16054130e+00 0.00000000e+00 5.98086309e+00 0.00000000e+00
  6.57992315e+00 0.00000000e+00 0.00000000e+00 3.29112959e+00
  2.91778231e+00 0.00000000e+00 1.33822002e+01 0.00000000e+00
  3.16695156e+01 2.46206989e+01 0.00000000e+00 5.61018181e+01
  2.36401653e+01 7.31679738e-01 8.71039927e-02 4.96732101e+01
  0.00000000e+00 4.29527617e+00 1.10507095e+00 7.31976318e+01
  0.00000000e+00 1.07481794e+01 0.00000000e+00 0.00000000e+00
  7.37523746e+00 7.98375666e-01 7.33272791e+00 6.87152803e-01]
 [1.00000000e+00 8.51578772e-01 0.00000000e+00 3.19230288e-01
  4.79022264e-01 0.00000000e+00 1.62928429e+01 6.10527515e+00
  4.30491781e+00 8.44067097e-01 0.00000000e+00 1.89833050e+01
  0.00000000e+00 0.00000000e+00 3.10034084e+01 5.77688789e+00
  5.45427418e+00 1.67073410e+02 0.00000000e+00 2.43213959e+01
  1.17073851e+01 0.00000000e+00 2.32332777e-02 5.58739624e+01
  0.00000000e+00 1.84000225e+01 0.00000000e+00 9.97977829e+01
  1.046

In [13]:
scaler = MinMaxScaler()

scaled_data = MinMaxScaler().fit_transform(data)

print(scaled_data)

[[0.00000000e+00 2.10297586e-04 0.00000000e+00 ... 1.55645581e-03
  2.89419678e-04 9.70955447e-05]
 [2.00004000e-05 3.40854329e-04 0.00000000e+00 ... 0.00000000e+00
  0.00000000e+00 6.24901866e-03]
 [4.00008000e-05 5.65986778e-05 0.00000000e+00 ... 2.01630591e-02
  0.00000000e+00 9.50444485e-04]
 ...
 [9.99959999e-01 2.68773398e-03 0.00000000e+00 ... 9.25861687e-03
  3.40219780e-04 1.48097441e-02]
 [9.99980000e-01 0.00000000e+00 3.31967418e-02 ... 1.41695722e-02
  0.00000000e+00 3.55855501e-03]
 [1.00000000e+00 8.30232735e-02 1.02573030e-02 ... 0.00000000e+00
  0.00000000e+00 8.53994696e-05]]


## Play around with the parameters to see how different values change the training and testing accuracy!

In [None]:
testclass = BIOF050_Final(scaled_data,labels)
model = testclass.train_test(test_size=0.2,n_epochs=4,hidden_dimensions=10,batch_size=16,lr=1)

  return nn.functional.softmax(batch)


Accuracy for Epoch # 0: 0.50865
Accuracy for Epoch # 1: 0.643425
Accuracy for Epoch # 2: 0.6949
