In [2]:
import torch
import torchvision
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import random

In [3]:
class BIOF050_A:
    '''
    First version of our Autoencoder.

    Inside the nested Autoencoder class we define what we want our autoencoder to
    look like. Both halves (the encoder and the decoder) are built with
    nn.Sequential to keep things organized.
    '''

    class Autoencoder(nn.Module):
        '''
        Fully connected autoencoder:
        input -> hidden -> bottleneck -> hidden -> output (same width as the input).
        '''

        def __init__(self,data_length,hidden_dimension,bottleneck_dimension):
            '''
            data_length: number of values in one (flattened) input sample
            hidden_dimension: width of the hidden layer in the encoder and the decoder
            bottleneck_dimension: width of the low-dimensional bottleneck
            '''
            super().__init__()

            ### Encoder - turns the input data into the low-dimensional bottleneck
            encoder_layers = [
                nn.Linear(data_length, hidden_dimension),
                nn.ReLU(),
                nn.Linear(hidden_dimension, bottleneck_dimension),
            ]
            self.encoder = nn.Sequential(*encoder_layers)

            ### Decoder - turns the bottleneck layer back into the original input size
            decoder_layers = [
                nn.Linear(bottleneck_dimension, hidden_dimension),
                nn.ReLU(),
                nn.Linear(hidden_dimension, data_length),
                ### Tanh squashes every reconstructed value into [-1, 1]. Softmax would
                ### turn the output into probabilities, but we compare the output with
                ### the input directly, so we need actual values.
                nn.Tanh(),
            ]
            self.decoder = nn.Sequential(*decoder_layers)

        def forward(self, batch):
            ''' Encode the batch down to the bottleneck, then decode it again. '''
            return self.decoder(self.encoder(batch))


## Using our Autoencoder
Now we will add the `train_test` method and the `batchify` utility from last time so that we can train our Autoencoder on a dataset!

In [15]:
class BIOF050_A_Final:
    '''
    Autoencoder together with the training loop needed to fit it to a dataset.

    data: a sequence of samples (anything that supports len(), indexing and slicing);
          each sample is flattened with np.ravel to determine the input width.

    Training relies on the module-level utility batchify_autoencoder.
    '''

    def __init__(self,data):
        self.data = data

    class Autoencoder(nn.Module):
        '''
        Fully connected autoencoder:
        input -> hidden -> bottleneck -> hidden -> output (same width as the input).
        '''

        def __init__(self,data_length,hidden_dimension,bottleneck_dimension):
            '''
            data_length: number of values in one (flattened) input sample
            hidden_dimension: width of the hidden layer in the encoder and the decoder
            bottleneck_dimension: width of the low-dimensional bottleneck
            '''
            super().__init__()

            ### Encoder - turns the input data into the low-dimensional bottleneck
            self.encoder = nn.Sequential(
                nn.Linear(data_length,hidden_dimension),
                nn.ReLU(),
                nn.Linear(hidden_dimension,bottleneck_dimension))

            ### Decoder - turns the bottleneck layer back into the original input size
            self.decoder = nn.Sequential(
                nn.Linear(bottleneck_dimension,hidden_dimension),
                nn.ReLU(),
                nn.Linear(hidden_dimension,data_length),
                ### Tanh squashes every reconstructed value into [-1, 1]. Softmax would
                ### turn the output into probabilities, but we compare the output with
                ### the input directly, so we need actual values.
                nn.Tanh())

        def forward(self, batch):
            ''' Encode the batch down to the bottleneck, then decode it again. '''
            batch = self.encoder(batch)
            batch = self.decoder(batch)
            return batch

    def train_test(self,hidden_dimension,bottleneck_size,batch_size,n_epochs,lr):
        '''
        Train an Autoencoder to reconstruct self.data and return the trained network.

        hidden_dimension: width of the hidden layer in the encoder and the decoder
        bottleneck_size: width of the bottleneck layer
        batch_size: number of samples per optimizer step
        n_epochs: number of full passes over the data
        lr: learning rate handed to the SGD optimizer

        After every epoch the mean per-batch MSE is printed as 'Error: ...'.
        The returned network is still in training mode.
        '''

        ### We just batchify the whole dataset - no need for train/test here as we
        ### want to reduce our entire dataset
        batches = batchify_autoencoder(self.data,batch_size=batch_size)

        ### the input width of the network is the length of one flattened sample
        data_length = len(np.ravel(self.data[0]))
        neural_network = BIOF050_A_Final.Autoencoder(data_length,hidden_dimension,bottleneck_size)

        ### Same optimizer as before - the learning rate now comes from the caller
        ### instead of being fixed at 0.01
        optimizer = optim.SGD(neural_network.parameters(), lr=lr)

        ### We use the mean squared error here because it works well for tasks that do
        ### not have discrete labels but just compare values (like our autoencoder)
        loss_function = nn.MSELoss()

        ### Pretty much the same training process as before
        neural_network.train()

        for epoch in range(n_epochs):
            error = 0
            for batch in batches:

                optimizer.zero_grad()

                ### convert the batch once; it is both the input and the target
                inputs = torch.tensor(np.asarray(batch).astype(np.float32))
                predictions = neural_network(inputs)

                ### compare the reconstruction with the original data
                loss = loss_function(predictions,inputs)

                loss.backward()

                optimizer.step()

                ### detach so the running total does not hold on to the graph
                error += loss.detach()

            ### average over the real number of batches, whatever batch_size is
            print('Error: ' + str(error/max(len(batches),1)))

        return neural_network
        
   


''' Utility Function - function to turn the data into batches'''

def batchify_autoencoder(data,batch_size=16):
    '''
    Split data into consecutive batches of at most batch_size items.

    data: any sequence that supports len() and slicing
    batch_size: maximum number of items per batch (must be at least 1)

    Returns a list of slices of data. Every item ends up in exactly one batch;
    only the last batch may be shorter than batch_size. Empty data gives [].

    Raises ValueError if batch_size is smaller than 1.
    '''
    if batch_size < 1:
        raise ValueError('batch_size must be a positive integer')

    ### slicing past the end of a sequence is safe, so the final (possibly shorter)
    ### batch needs no special case - and the last full batch is never dropped
    return [data[start:start+batch_size] for start in range(0,len(data),batch_size)]

In [16]:
### We are using torchvision MNIST, just like our CNN - we can compare these results with the
### Autoencoder CNN (next tutorial) to see which one works better!

data = torchvision.datasets.MNIST(
    root = './data/MNIST',
    download = True)


### keep the labels, then replace the dataset object with its raw image data
labels = data.targets
data = data.data
newdata = []

### flatten every image and rescale it so that its brightest value becomes 1
for image in data:
    image = np.ravel(image).astype(np.float64)
    peak = image.max()
    ### an all-zero image has nothing to rescale; dividing by its max of 0 would
    ### fill it with NaNs, so it is left as zeros
    if peak > 0:
        image *= 1/peak
    newdata.append(image)

In [17]:
### wrap the prepared images in our final class ...
testclass = BIOF050_A_Final(newdata)

### ... and train an autoencoder with a 64-unit hidden layer and a 5-unit bottleneck
model = testclass.train_test(
    hidden_dimension=64,
    bottleneck_size=5,
    batch_size=16,
    n_epochs=3,
    lr=0.01,
)

Error: tensor(0.1160)
Error: tensor(0.0975)
Error: tensor(0.0829)
