In [1]:
import pickle
import random

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torch.utils.data import Dataset, DataLoader
from torch.nn import Embedding, RNN
from sklearn.model_selection import train_test_split
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence

# Set any global values

In [2]:
torch.manual_seed(1)

<torch._C.Generator at 0x7ff2581f3810>

In [3]:
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using {device} device")

Using cuda device


# Load the training data

In [4]:
with open('theorum.pickle','rb') as f:
    theorum_data = pickle.load(f)

with open('non_theorum.pickle','rb') as f:
    non_theorum_data = pickle.load(f)

a = theorum_data
b = non_theorum_data

training_data = []
xtrain = []
ytrain = []


random.shuffle(a)
random.shuffle(b)

if len(a) < len(b):
    total = len(a)
else:
    total = len(b)

for i in range(total):
    training_data.append(a[i])
    training_data.append(b[i])
    xtrain.append(a[i][0])
    ytrain.append(a[i][1])

In [5]:
print(len(training_data))

104736


In [6]:
with open('theorum_v2.pickle','rb') as f:
    theorum_data2 = pickle.load(f)

with open('non_theorum_v2.pickle','rb') as f:
    non_theorum_data2 = pickle.load(f)

a = theorum_data2
b = non_theorum_data2


random.shuffle(a)
random.shuffle(b)
xtest = []
ytest = []

if len(a) < len(b):
    total = len(a)
else:
    total = len(b)
for i in range(total):
    training_data.append(a[i])
    training_data.append(b[i])
    xtest.append(b[i][0])
    ytest.append(b[i][1])

In [7]:
class_to_ix = {"Found":0, "Unfound":1}
all_letters = "()ab~&|>."

vocab = ['<pad>'] + sorted(set([char for seq in all_letters for char in seq]))


n_classes = len(class_to_ix)
n_letters = len(all_letters)

In [8]:
X = []
Y = []
embed_dim = len(vocab)
embed = Embedding(len(vocab), embed_dim) # embedding_dim = len(vocab)
for element in training_data:
    input = [vocab.index(token) for token in element[0]]
    input_tensor = torch.tensor(input, dtype=torch.int)
    classification = class_to_ix[element[1]]
    X.append(input_tensor)
    output_tensor = torch.tensor(classification, dtype=torch.int)
    Y.append(output_tensor)

new_xtrain = []
new_ytrain = []
new_xtest = []
new_ytest = []
for element in xtrain:
    input = [vocab.index(token) for token in element]
    input_tensor = torch.tensor(input, dtype=torch.int)
    new_xtrain.append(input_tensor)
for element in ytrain:
    classification = class_to_ix[element]
    output_tensor = torch.tensor(classification, dtype=torch.int)
    new_ytrain.append(output_tensor)

for element in xtest:
    input = [vocab.index(token) for token in element]
    input_tensor = torch.tensor(input, dtype=torch.int)
    new_xtest.append(input_tensor)
for element in ytest:
    classification = class_to_ix[element]
    output_tensor = torch.tensor(classification, dtype=torch.int)
    new_ytest.append(output_tensor)


xtrain = new_xtrain
ytrain = new_ytrain
xtest = new_xtest
ytest = new_ytest

In [9]:
class TTPStyleDataset(Dataset):
    def __init__(self, X,Y):
        self.X = X
        self.Y = Y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        sample = [self.X[idx], self.Y[idx]]
        return sample

In [10]:
#X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

batch_size = 32
#input_dataset = TTPStyleDataset(X_train,y_train)

#test_dataset = TTPStyleDataset(X_test,y_test)
print(len(xtrain))
print(len(ytrain))
print(len(xtest))
print(len(ytest))
xtrain, _, ytrain, _ = train_test_split(xtrain, ytrain, test_size=0.001, random_state=42)
_, xtest, _, ytest = train_test_split(xtest, ytest, test_size=.999, random_state=42)
print(len(xtrain))
print(len(ytrain))
print(len(xtest))
print(len(ytest))



input_dataset = TTPStyleDataset(xtrain,ytrain)

test_dataset = TTPStyleDataset(xtest,ytest)

def my_collate_fn(data):
    (xx,yy) = zip(*data)
    x_lens = [len(x) for x in xx]
    y_lens = [1 for y in yy]

    xx_pad = pad_sequence(xx, batch_first=False, padding_value=0)
    yy_pad = torch.tensor(yy)

    return xx_pad, yy_pad, x_lens, y_lens
    

dataloader = DataLoader(input_dataset, shuffle=True, batch_size=batch_size, num_workers=0, collate_fn=my_collate_fn, drop_last=True)
test_dataloader = DataLoader(test_dataset, shuffle=True, batch_size=batch_size, num_workers=0, collate_fn=my_collate_fn, drop_last=True)

52368
52368
62240
62240
52315
52315
62178
62178


# Neural Network

In [11]:
class SoftmaxRNN(nn.RNN):
    def __init__(self, input_size, hidden_size, num_layers, nonlinearity, bias, batch_first, dropout, bidirectional, output_size):
        super(SoftmaxRNN, self).__init__(input_size, hidden_size, num_layers, nonlinearity, bias, batch_first, dropout, bidirectional)
        self.fc = nn.Linear(hidden_size, output_size)
        
    
    def forward(self, x, h_0=None):
        output, h = super(SoftmaxRNN, self).forward(x, h_0)
        output, seq_len = pad_packed_sequence(output, batch_first=False)
        output = self.fc(h[-1])
        return output, h

In [12]:
#input_size - The number of expected features in the input x
rnn = SoftmaxRNN(input_size=embed_dim, hidden_size=128, num_layers=1, nonlinearity='relu', bias=True, batch_first=False, dropout=0.0, bidirectional=False, output_size=2)
rnn.to(device)

SoftmaxRNN(
  10, 128
  (fc): Linear(in_features=128, out_features=2, bias=True)
)

In [13]:
num_epochs = 20
learning_rate = 0.01 # If you set this too high, it might explode. If too low, it might not learn
optimizer = torch.optim.SGD(rnn.parameters(), lr=learning_rate)

criterion = nn.CrossEntropyLoss()

for epoch in range(num_epochs):
    print("epoch: "+str(epoch))
    size = len(dataloader.dataset)
    rnn.train()
    for batch,(x_padded, y_padded, x_lens, y_lens) in enumerate(dataloader):

        
        rnn.zero_grad()
        x_embed = embed(x_padded)
        x_packed = pack_padded_sequence(x_embed, x_lens, batch_first=False, enforce_sorted=False)
        x_packed = x_packed.to(device)
        y_padded = y_padded.to(device)
        output, hidden = rnn(x_packed)
        output = torch.reshape(output, (batch_size,2))

        
        y_padded = torch.reshape(y_padded, (batch_size,))
        y_padded = y_padded.long()
        loss = criterion(output, y_padded)

        loss.backward()
        optimizer.step()
        

        if batch % (64*1000) == 0:
            x_s = np.shape(x_padded)[1]
            loss, current = loss.item(), (batch + 1) * x_s
            #print(f"train loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")



    # Set the model to evaluation mode - important for batch normalization and dropout layers
    # Unnecessary in this situation but added for best practices
    rnn.eval()
    size = len(test_dataloader.dataset)
    num_batches = len(test_dataloader)
    test_loss, correct = 0, 0

    # Evaluating the model with torch.no_grad() ensures that no gradients are computed during test mode
    # also serves to reduce unnecessary gradient computations and memory usage for tensors with requires_grad=True
    with torch.no_grad():
        for batch,(x_padded, y_padded, x_lens, y_lens) in enumerate(test_dataloader):
            rnn.zero_grad()
            x_embed = embed(x_padded)
            x_packed = pack_padded_sequence(x_embed, x_lens, batch_first=False, enforce_sorted=False)
            x_packed = x_packed.to(device)
            y_padded = y_padded.to(device)
            output, hidden = rnn(x_packed)
            output = torch.reshape(output, (batch_size,2))
            
            y_padded = torch.reshape(y_padded, (batch_size,))
            y_padded = y_padded.long()
            correct += (output.argmax(1) == y_padded).type(torch.float).sum().item()
            loss += criterion(output, y_padded)
        
        #loss, current = loss, (batch + 1) * len(x_padded)
        test_loss = loss/num_batches
        correct /= size
        print(f"Test Error: Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f}")

epoch: 0
Test Error: Accuracy: 0.0%, Avg loss: 7.326534
epoch: 1
Test Error: Accuracy: 0.0%, Avg loss: 8.311952
epoch: 2
Test Error: Accuracy: 0.0%, Avg loss: 8.894710
epoch: 3
Test Error: Accuracy: 0.0%, Avg loss: 9.314728
epoch: 4
Test Error: Accuracy: 0.0%, Avg loss: 9.646645
epoch: 5
Test Error: Accuracy: 0.0%, Avg loss: 9.922068
epoch: 6
Test Error: Accuracy: 0.0%, Avg loss: 10.158711
epoch: 7
Test Error: Accuracy: 0.0%, Avg loss: 10.365868
epoch: 8
Test Error: Accuracy: 0.0%, Avg loss: 10.551189
epoch: 9
Test Error: Accuracy: 0.0%, Avg loss: 10.719498
epoch: 10
Test Error: Accuracy: 0.0%, Avg loss: 10.872582
epoch: 11
Test Error: Accuracy: 0.0%, Avg loss: 11.014877
epoch: 12
Test Error: Accuracy: 0.0%, Avg loss: 11.146877
epoch: 13
Test Error: Accuracy: 0.0%, Avg loss: 11.269485
epoch: 14
Test Error: Accuracy: 0.0%, Avg loss: 11.385170
epoch: 15
Test Error: Accuracy: 0.0%, Avg loss: 11.493189
epoch: 16
Test Error: Accuracy: 0.0%, Avg loss: 11.594954
epoch: 17
Test Error: Accuracy

In [14]:
print(size*batch_size)

1989696
