In [281]:
import pickle
import random

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torch.utils.data import Dataset, DataLoader
from torch.nn import Embedding, RNN
from sklearn.model_selection import train_test_split
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence

In [287]:
class_to_ix = {"Found":0, "Unfound":1}
all_letters = "()ab~&|>."

vocab = ['<pad>'] + sorted(set([char for seq in all_letters for char in seq]))


n_classes = len(class_to_ix)
n_letters = len(all_letters)

In [288]:
X = []
Y = []
embed_dim = len(vocab)
embed = Embedding(len(vocab), embed_dim) # embedding_dim = len(vocab)
for element in training_data:
    input = [vocab.index(token) for token in element[0]]
    input_tensor = torch.tensor(input, dtype=torch.int)
    classification = class_to_ix[element[1]]
    X.append(input_tensor)
    output_tensor = torch.tensor(classification, dtype=torch.int)
    Y.append(output_tensor)

new_xtrain = []
new_ytrain = []
new_xtest = []
new_ytest = []
for element in xtrain:
    input = [vocab.index(token) for token in element]
    input_tensor = torch.tensor(input, dtype=torch.int)
    new_xtrain.append(input_tensor)
for element in ytrain:
    classification = class_to_ix[element]
    output_tensor = torch.tensor(classification, dtype=torch.int)
    new_ytrain.append(output_tensor)

for element in xtest:
    input = [vocab.index(token) for token in element]
    input_tensor = torch.tensor(input, dtype=torch.int)
    new_xtest.append(input_tensor)
for element in ytest:
    classification = class_to_ix[element]
    output_tensor = torch.tensor(classification, dtype=torch.int)
    new_ytest.append(output_tensor)


xtrain = new_xtrain
ytrain = new_ytrain
xtest = new_xtest
ytest = new_ytest

In [289]:
class TTPStyleDataset(Dataset):
    def __init__(self, X,Y):
        self.X = X
        self.Y = Y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        sample = [self.X[idx], self.Y[idx]]
        return sample

In [290]:
#X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

batch_size = 32
#input_dataset = TTPStyleDataset(X_train,y_train)

#test_dataset = TTPStyleDataset(X_test,y_test)
print(len(xtrain))
print(len(ytrain))
print(len(xtest))
print(len(ytest))
xtrain, _, ytrain, _ = train_test_split(xtrain, ytrain, test_size=0.001, random_state=42)
_, xtest, _, ytest = train_test_split(xtest, ytest, test_size=.999, random_state=42)
print(len(xtrain))
print(len(ytrain))
print(len(xtest))
print(len(ytest))



input_dataset = TTPStyleDataset(xtrain,ytrain)

test_dataset = TTPStyleDataset(xtest,ytest)

def my_collate_fn(data):
    (xx,yy) = zip(*data)
    x_lens = [len(x) for x in xx]
    y_lens = [1 for y in yy]

    xx_pad = pad_sequence(xx, batch_first=False, padding_value=0)
    yy_pad = torch.tensor(yy)

    return xx_pad, yy_pad, x_lens, y_lens

#dataloader = DataLoader(input_dataset, shuffle=True, batch_size=batch_size, num_workers=0, collate_fn=my_collate_fn, drop_last=True)
#test_dataloader = DataLoader(test_dataset, shuffle=True, batch_size=batch_size, num_workers=0, collate_fn=my_collate_fn, drop_last=True)

52368
52368
62240
62240
52315
52315
62178
62178


# GRU

Referencing "Dive into Deep Learning" by Aston Zhang, et al. pages 376-382.
Referencing https://medium.com/@anishnama20/understanding-gated-recurrent-unit-gru-in-deep-learning-2e54923f3e2

__Below are some notes.__

GRU contains the following.
* Two gates in which are the **Reset** and **Update** gates. 
* Candidate Hidden State - Combines information from the input and the previous hidden state that is used to update the hidden state for the next time step. The Long & Short Term memory are now in this candidate hidden state(?)

Reset Gate - Determines how much of the previous hidden state to forget. 
* Takes hidden state h_t-1 and current word x_t apply
Update Gate - Determines how much of the candidate hidden state to incorprate into the new hidden state.

A bit of recap on Gaussian Distribution: https://stackoverflow.com/questions/12616406/anyone-can-tell-me-why-we-always-use-the-gaussian-distribution-in-machine-learni
<hr>


In [291]:
#https://d2l.ai/chapter_recurrent-modern/gru.html#implementation-from-scratch
class GRU(nn.GRU):
    #Input size - input_size - Defines the number of features that define each element (time-stamp) of the input sequence.
        #This passes the features into the hidden layers that will perform computations. 
    #hidden_size - Defines the size (amount of features) of the hidden state. Therefore, if hidden_size is set as 4, then the hidden state at each time step is a vector of length 4
    def __init__(self, input_size, hidden_size, num_layers, bias, batch_first, dropout, bidirectional, output_size): #Why is it if I have num_layers = 2, we get an error? 
        super(GRU, self).__init__(input_size, hidden_size, num_layers, bias, batch_first, dropout, bidirectional) #Initializing parent class of our GRU class (in this case it is torch.nn.GRU)
        self.gru = nn.GRU(input_size, hidden_size)
        self.fc = nn.Linear(hidden_size, output_size)
        
    def forward(self, input, h_0 = None):
        output, h = super(GRU, self).forward(input, h_0) #Returns packed sequence only
        output, seq_len = pad_packed_sequence(output, batch_first=False)
        output = self.fc(h[-1])
        return output, h

In [292]:

input_size = embed_dim
hidden_size = 128
num_layers = 3
bias = True
batch_first = False
dropout = 0.0
bidirectional = False
output_size = 2


rnn = GRU(input_size, hidden_size, num_layers, bias, batch_first, dropout, bidirectional, output_size)
rnn.to(device)


GRU(
  10, 128, num_layers=3
  (gru): GRU(10, 128)
  (fc): Linear(in_features=128, out_features=2, bias=True)
)

# Output best to worst trials

In [293]:

list_of_params = []
def add_to_list_of_params(entry):
    list_of_params.append(entry)
    list_of_params.sort(key=lambda x: x["accuracy"], reverse=True)
    
def print_all_trials():
    print("=========================\nTrials from Best to Worst\n=========================\n")
    for i, entry in enumerate(list_of_params, 1):
        print(f"{i}: "
              f"Trial #{entry['trial']}, "
              f"Accuracy = {entry['accuracy']}, "
              f"Avg Test Loss = {entry['avg_test_loss']}, "
              f"Hidden Size = {entry['hidden_size']}, "
              f"Num Layers = {entry['num_layers']}, "
              f"Bias = {entry['bias']}, "
              f"Batch First = {entry['batch_first']}, "
              f"Dropout = {entry['dropout']}, "
              f"Bidirectional = {entry['bidirectional']}, "
              f"Num Epochs = {entry['num_epochs']}, "
              f"Learning Rate = {entry['learning_rate']}, "
              f"Batch Size = {entry['batch_size']}")

# Training with Trials

In [294]:
num_trials = 20
for trial in range(num_trials):
    best_accuracy = 0
    best_params = None

    input_size = embed_dim
    hidden_size = random.choice([32, 64, 128, 256])
    num_layers = random.choice([2,3,4,5])
    bias = random.choice([True, False])
    batch_first = random.choice([True, False])
    dropout = 0.0    
    bidirectional = random.choice([True, False])
    output_size = 2
    num_epochs = random.choice([10, 11, 12, 13, 14, 15])
    #num_epochs = random.choice([1])
    
    learning_rate = random.choice([0.01, 0.001]) # If you set this too high, it might explode. If too low, it might not learn

    '''
    input_size = embed_dim
    hidden_size = 128
    num_layers = 3
    bias = True
    batch_first = False
    dropout = 0.0
    bidirectional = False
    output_size = 2
    num_epochs = 10
    learning_rate = 0.01
    '''

    #Loading in data!
    dataloader = DataLoader(input_dataset, shuffle=True, batch_size=batch_size, num_workers=0, collate_fn=my_collate_fn, drop_last = True)
    test_dataloader = DataLoader(test_dataset, shuffle=True, batch_size=batch_size, num_workers=0, collate_fn=my_collate_fn, drop_last = True)

    rnn = GRU(input_size, hidden_size, num_layers, bias, batch_first, dropout, bidirectional, output_size)
    rnn.to(device)

    optimizer = torch.optim.SGD(rnn.parameters(), lr=learning_rate)
    criterion = nn.CrossEntropyLoss()

    print("==============================================================================================",
        "\nTrial", str(trial + 1), "\n"
        "\n", rnn, "\n"
        "\ninput_size: ", input_size, "\n",
        #"\noptimizer: ", optimizer, "\ncriterion: ", criterion,
        "\nhidden_size: ", hidden_size, "\nnum_layers: ", num_layers, 
        "\nbias = ", bias, "\nbatch_first = ", batch_first, 
        "\ndropout = ", dropout, "\nbidirectional = ", bidirectional, 
        "\nnum_epochs = ", num_epochs, "\nlearning_rate = ", learning_rate, 
        "\nbatch_size = ", batch_size,
        "\n=========================")

    for epoch in range(num_epochs):
        print("epoch: "+str(epoch))
        
        size = len(dataloader.dataset)
        rnn.train()
        for batch,(x_padded, y_padded, x_lens, y_lens) in enumerate(dataloader):
            rnn.zero_grad()
            x_embed = embed(x_padded)
            x_packed = pack_padded_sequence(x_embed, x_lens, batch_first=False, enforce_sorted=False)
            x_packed = x_packed.to(device)
            y_padded = y_padded.to(device)
            output, hidden = rnn(x_packed)
            output = torch.reshape(output, (batch_size,2))
            
            y_padded = torch.reshape(y_padded, (batch_size,))
            y_padded = y_padded.long()
            loss = criterion(output, y_padded)

            loss.backward()
            optimizer.step()
            

            if batch % 5000 == 0:
                x_s = np.shape(x_padded)[1]
                loss, current = loss.item(), (batch + 1) * x_s
                print(f"train loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")



        # Set the model to evaluation mode - important for batch normalization and dropout layers
        # Unnecessary in this situation but added for best practices
        rnn.eval()
        size = len(test_dataloader.dataset)
        num_batches = len(test_dataloader)
        test_loss, correct = 0, 0

    # Evaluating the model with torch.no_grad() ensures that no gradients are computed during test mode
        # also serves to reduce unnecessary gradient computations and memory usage for tensors with requires_grad=True
        with torch.no_grad():
            for batch,(x_padded, y_padded, x_lens, y_lens) in enumerate(test_dataloader):
                rnn.zero_grad()
                x_embed = embed(x_padded)
                x_packed = pack_padded_sequence(x_embed, x_lens, batch_first=False, enforce_sorted=False)
                x_packed = x_packed.to(device)
                y_padded = y_padded.to(device)
                output, hidden = rnn(x_packed)
                output = torch.reshape(output, (batch_size,2))
                
                y_padded = torch.reshape(y_padded, (batch_size,))
                y_padded = y_padded.long()
                correct += (output.argmax(1) == y_padded).type(torch.float).sum().item()
                loss += criterion(output, y_padded)
            
            #loss, current = loss, (batch + 1) * len(x_padded)
            test_loss = loss/num_batches
            correct /= size
            accuracy = 100*correct
            print(f"Test Error: \n Accuracy: {accuracy:>0.1f}%, Avg loss: {test_loss:>8f} \n") 
    hyperparams = {
        'trial': (trial+1),
        'accuracy': accuracy,
        'avg_test_loss': test_loss,
        'hidden_size': hidden_size,
        'num_layers': num_layers,
        'bias': bias,
        'batch_first': batch_first,
        'dropout': dropout, 
        'bidirectional': bidirectional,
        'num_epochs': num_epochs,
        'learning_rate': learning_rate, 
        'batch_size': batch_size
    }
    add_to_list_of_params(hyperparams)
    print_all_trials()
    '''
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_params = {
            'hidden_size': hidden_size,
            'num_layers': num_layers,
            'bias': bias,
            'batch_first': batch_first,
            'dropout': dropout, 
            'bidirectional': bidirectional,
            'num_epochs': num_epochs,
            'learning_rate': learning_rate, 
            'batch_size': batch_size
        }
    '''

#print(f"Best Accuracy: {best_accuracy:.2f}%")
#print(f"Best Parameters: {best_params}")

Trial 1 

 GRU(
  10, 64, num_layers=2, bias=False
  (gru): GRU(10, 64)
  (fc): Linear(in_features=64, out_features=2, bias=True)
) 

input_size:  10 
 
hidden_size:  64 
num_layers:  2 
bias =  False 
batch_first =  False 
dropout =  0.0 
bidirectional =  False 
num_epochs =  14 
learning_rate =  0.01 
batch_size =  32 
epoch: 0
train loss: 0.669587  [   32/52315]
Test Error: 
 Accuracy: 0.0%, Avg loss: 5.381236 

epoch: 1
train loss: 0.001899  [   32/52315]
Test Error: 
 Accuracy: 0.0%, Avg loss: 6.286865 

epoch: 2
train loss: 0.000708  [   32/52315]
Test Error: 
 Accuracy: 0.0%, Avg loss: 6.787519 

epoch: 3
train loss: 0.000444  [   32/52315]
Test Error: 
 Accuracy: 0.0%, Avg loss: 7.133251 

epoch: 4
train loss: 0.000313  [   32/52315]
Test Error: 
 Accuracy: 0.0%, Avg loss: 7.396954 

epoch: 5
train loss: 0.000223  [   32/52315]
Test Error: 
 Accuracy: 0.0%, Avg loss: 7.609927 

epoch: 6
train loss: 0.000216  [   32/52315]
Test Error: 
 Accuracy: 0.0%, Avg loss: 7.788416 

epoch