In [1]:
import pickle

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torch.utils.data import Dataset, DataLoader
from torch.nn import Embedding, RNN
from sklearn.model_selection import train_test_split
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence

# Set any global values

In [2]:
torch.manual_seed(1)

<torch._C.Generator at 0x288bc8edc50>

In [3]:
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using {device} device")

Using cuda device


# Load the training data

In [4]:
with open('training_data.pickle','rb') as f:
    training_data = pickle.load(f)

In [5]:
class_to_ix = {"Found":0, "Unfound":1}
all_letters = "()ab~&|>"

vocab = ['<pad>'] + sorted(set([char for seq in all_letters for char in seq]))


n_classes = len(class_to_ix)
n_letters = len(all_letters)

In [6]:
X = []
Y = []
embed_dim = len(vocab)
embed = Embedding(len(vocab), embed_dim) # embedding_dim = len(vocab)
for element in training_data:
    input = [vocab.index(token) for token in element[0]]
    input_tensor = torch.tensor(input, dtype=torch.int)
    classification = class_to_ix[element[1]]
    X.append(input_tensor)
    output_tensor = torch.tensor(classification, dtype=torch.int)
    Y.append(output_tensor)

In [7]:
class TTPStyleDataset(Dataset):
    def __init__(self, X,Y):
        self.X = X
        self.Y = Y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        sample = [self.X[idx], self.Y[idx]]
        return sample

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

batch_size = 16
input_dataset = TTPStyleDataset(X_train,y_train)

test_dataset = TTPStyleDataset(X_test,y_test)

def my_collate_fn(data):
    (xx,yy) = zip(*data)
    x_lens = [len(x) for x in xx]
    y_lens = [1 for y in yy]

    xx_pad = pad_sequence(xx, batch_first=False, padding_value=0)
    yy_pad = torch.tensor(yy)

    return xx_pad, yy_pad, x_lens, y_lens
    

dataloader = DataLoader(input_dataset, shuffle=True, batch_size=batch_size, num_workers=0, collate_fn=my_collate_fn)
test_dataloader = DataLoader(test_dataset, shuffle=True, batch_size=batch_size, num_workers=0, collate_fn=my_collate_fn)

# Neural Network

In [None]:
class SoftmaxRNN(nn.RNN):
    def __init__(self, input_size, hidden_size, num_layers, nonlinearity, bias, batch_first, dropout, bidirectional, output_size):
        super(SoftmaxRNN, self).__init__(input_size, hidden_size, num_layers, nonlinearity, bias, batch_first, dropout, bidirectional)
        self.fc = nn.Linear(hidden_size, output_size)
        
    
    def forward(self, x, h_0=None):
        output, h = super(SoftmaxRNN, self).forward(x, h_0)
        output, seq_len = pad_packed_sequence(output, batch_first=False)
        output = self.fc(h[-1])
        return output, h

In [None]:
#input_size - The number of expected features in the input x
rnn = SoftmaxRNN(input_size=embed_dim, hidden_size=128, num_layers=1, nonlinearity='relu', bias=True, batch_first=False, dropout=0.0, bidirectional=False, output_size=2)
rnn.to(device)

SoftmaxRNN(
  9, 128
  (fc): Linear(in_features=128, out_features=2, bias=True)
)

# GRU

Referencing "Dive into Deep Learning" by Aston Zhang, et al. pages 376-382.
Referencing https://medium.com/@anishnama20/understanding-gated-recurrent-unit-gru-in-deep-learning-2e54923f3e2

__Below are some notes.__

GRU contains the following.
* Two gates in which are the **Reset** and **Update** gates. 
* Candidate Hidden State - Combines information from the input and the previous hidden state that is used to update the hidden state for the next time step. The Long & Short Term memory are now in this candidate hidden state(?)

Reset Gate - Determines how much of the previous hidden state to forget. 
* Takes hidden state h_t-1 and current word x_t apply
Update Gate - Determines how much of the candidate hidden state to incorprate into the new hidden state.

A bit of recap on Gaussian Distribution: https://stackoverflow.com/questions/12616406/anyone-can-tell-me-why-we-always-use-the-gaussian-distribution-in-machine-learni


In [12]:

'''

#https://d2l.ai/chapter_recurrent-modern/gru.html#implementation-from-scratch
class GRU(nn.GRU):

    
    #Difference between hidden layer and hidden state:
        #Hidden Layers - Layers that are hidden from view on the path from input to output. 
        #Hidden States - Inputs to whatever we do at a given step.
    def __init__(self, input_size, hidden_size, num_layers, bias, batch_first, dropout, bidirectional, output_size):
        super(GRU, self).__init__(input_size, hidden_size, num_layers, bias, batch_first, dropout, bidirectional, output_size) #Initializing parent class of our GRU class (in this case it is torch.nn.GRU)
        self.gru = nn.GRU(input_size, hidden_size) #GRU 
        #Output layer via fully connected
    
    #Inputs: input, h_0
    def forward(self, input, h_0=None):
        output, h = super(GRU, self).forward(input, h_0)
        output, seq_len = pad_packed_sequence(output, batch_first=False)
        output = self.fc(h[-1])
        return output, h
'''

class GRU(nn.GRU):
    #Input size - input_size - Defines the number of features that define each element (time-stamp) of the input sequence.
        #This passes the features into the hidden layers that will perform computations. 
    #hidden_size - Defines the size (amount of features) of the hidden state. Therefore, if hidden_size is set as 4, then the hidden state at each time step is a vector of length 4
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(GRU, self).__init__(input_size, hidden_size, output_size) #Initializing parent class of our GRU class (in this case it is torch.nn.GRU)
        self.gru = nn.GRU(input_size, hidden_size)
        self.fc = nn.Linear(hidden_size, output_size)
        
    def forward(self, input, h_0 = None):
        output, h = super(GRU, self).forward(input, h_0) #Returns packed sequence only
        output, seq_len = pad_packed_sequence(output, batch_first=False)
        output = self.fc(h[-1])
        return output, h
        

In [14]:
'''
#input_size - The number of expected features in the input x
rnn = GRU(input_size=embed_dim, hidden_size=128, num_layers=1, bias='True', batch_first=False, dropout=0.0, bidirectional=False, output_size=2)
rnn.to(device)
'''

rnn = GRU(input_size = embed_dim, hidden_size = 128, num_layers = 2, output_size = 2)
rnn.to(device)

GRU(
  9, 128, num_layers=2
  (gru): GRU(9, 128)
  (fc): Linear(in_features=128, out_features=2, bias=True)
)

# Training

In [11]:
num_epochs = 10
learning_rate = 0.01 # If you set this too high, it might explode. If too low, it might not learn
optimizer = torch.optim.SGD(rnn.parameters(), lr=learning_rate)

criterion = nn.CrossEntropyLoss()

for epoch in range(num_epochs):
    print("epoch: "+str(epoch))
    size = len(dataloader.dataset)
    rnn.train()
    for batch,(x_padded, y_padded, x_lens, y_lens) in enumerate(dataloader):

        
        rnn.zero_grad()
        x_embed = embed(x_padded)
        x_packed = pack_padded_sequence(x_embed, x_lens, batch_first=False, enforce_sorted=False)
        x_packed = x_packed.to(device)
        y_padded = y_padded.to(device)
        output, hidden = rnn(x_packed)
        output = torch.reshape(output, (batch_size,2))
        
        y_padded = torch.reshape(y_padded, (batch_size,))
        y_padded = y_padded.long()
        loss = criterion(output, y_padded)

        loss.backward()
        optimizer.step()
        

        if batch % 500 == 0:
            x_s = np.shape(x_padded)[1]
            loss, current = loss.item(), (batch + 1) * x_s
            print(f"train loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")



    # Set the model to evaluation mode - important for batch normalization and dropout layers
    # Unnecessary in this situation but added for best practices
    rnn.eval()
    size = len(test_dataloader.dataset)
    num_batches = len(test_dataloader)
    test_loss, correct = 0, 0

    # Evaluating the model with torch.no_grad() ensures that no gradients are computed during test mode
    # also serves to reduce unnecessary gradient computations and memory usage for tensors with requires_grad=True
    with torch.no_grad():
        for batch,(x_padded, y_padded, x_lens, y_lens) in enumerate(test_dataloader):
            rnn.zero_grad()
            x_embed = embed(x_padded)
            x_packed = pack_padded_sequence(x_embed, x_lens, batch_first=False, enforce_sorted=False)
            x_packed = x_packed.to(device)
            y_padded = y_padded.to(device)
            output, hidden = rnn(x_packed)
            output = torch.reshape(output, (batch_size,2))
            
            y_padded = torch.reshape(y_padded, (batch_size,))
            y_padded = y_padded.long()
            loss += criterion(output, y_padded)
        
        #loss, current = loss, (batch + 1) * len(x_padded)
        test_loss = loss/num_batches
        print(f"test loss: {test_loss:>7f}")

epoch: 0
train loss: 0.708207  [   16/141120]
train loss: 0.556275  [ 8016/141120]
train loss: 0.569994  [16016/141120]
train loss: 0.507301  [24016/141120]
train loss: 0.788458  [32016/141120]
train loss: 0.816246  [40016/141120]
train loss: 0.715744  [48016/141120]
train loss: 0.812579  [56016/141120]
train loss: 0.749376  [64016/141120]
train loss: 0.583476  [72016/141120]
train loss: 0.566569  [80016/141120]
train loss: 0.772137  [88016/141120]
train loss: 0.718617  [96016/141120]
train loss: 0.725388  [104016/141120]
train loss: 0.637809  [112016/141120]
train loss: 0.571319  [120016/141120]
train loss: 0.531904  [128016/141120]
train loss: 0.723781  [136016/141120]
test loss: 0.596739
epoch: 1
train loss: 0.608466  [   16/141120]
train loss: 0.507929  [ 8016/141120]
train loss: 0.565207  [16016/141120]
train loss: 0.472105  [24016/141120]
train loss: 0.587796  [32016/141120]
train loss: 0.657933  [40016/141120]
train loss: 0.519692  [48016/141120]
train loss: 0.710564  [56016/141