In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import json
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import Dataset, DataLoader, TensorDataset
from torch.nn.utils.rnn import pad_sequence

In [3]:
# https://wandb.ai/sauravmaheshkar/LSTM-PyTorch/reports/Using-LSTM-in-PyTorch-A-Tutorial-With-Examples--VmlldzoxMDA2NTA5

# https://towardsdatascience.com/the-lstm-reference-card-6163ca98ae87

class LSTMModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(LSTMModel, self).__init__()
        self.hidden_dim = hidden_dim
        self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        print(self.fc.weight.dtype)
    
    def forward(self, x, hidden=None):
        if x.dim() == 2:  # Assuming 2D input (no batch dimension)
            x = x.unsqueeze(0)
        # Initialize hidden state if not provided
        if hidden is None:
            batch_size = x.size(0)
            hidden = hidden = (torch.zeros(1, batch_size, self.hidden_dim),  # Hidden state
                      torch.zeros(1, batch_size, self.hidden_dim))
                      
        # Forward pass through LSTM
        # THIS IS THE ERROR LINE BUT WHY -- something wrong in forwarding
        lstm_out, hidden = self.lstm(x, hidden)
        
        # Take the output from the last time step
        last_output = lstm_out[:, -1, :]
        
        # Apply fully connected layer
        output = self.fc(last_output)
        
        return output, hidden


In [4]:
def create_corpus(json_file, x_key, y_key):
    x = []
    y = []
    
    for json_obj in json_file:
        x.append(json_obj[x_key])
        y.append(json_obj[y_key])

    return (x, y)

def create_questions_answers(json_array):
    questions = []
    answers = []
    for json_obj in json_array:
        questions.append(json_obj["text"])
        answers.append(json_obj["answer"])
    
    return (questions, answers)
    

In [5]:
with open('resources/wiki_text_16.json') as f:
    data = json.load(f)

with open('resources/small.guesstrain.json') as f:
    train = json.load(f)

In [6]:
questions, answers = create_questions_answers(train["questions"])

In [7]:
tokenizer = get_tokenizer("basic_english")

tokenized_corpus = [tokenizer(text) for text in corpus]
tokenized_titles = [tokenizer(text) for text in titles]
tokenized_questions = [tokenizer(text) for text in questions]
tokenized_answers = [tokenizer(text) for text in answers]
all_tokenized = [tokenized_corpus, tokenized_titles, tokenized_questions, tokenized_answers]

In [8]:
def combine_tokenized(tokenized_list):
    acc = []
    for list in tokenized_list:
        acc += list
    return acc

# Build vocabulary
combined_lists = combine_tokenized(all_tokenized)
vocab = build_vocab_from_iterator(combined_lists)

50472lines [00:02, 22491.68lines/s]


In [9]:
def tokenize_with_indices(tokenized_list):
    tokenized_texts_with_indices = []
    for list in tokenized_list:
        tokenized_text = []
        for token in list:
            tokenized_text.append(vocab[token])
        tokenized_texts_with_indices.append(tokenized_text)
    return tokenized_texts_with_indices

# tokenized_corpus_updated = tokenize_with_indices(tokenized_corpus)
# tokenized_titles_updated = tokenize_with_indices(tokenized_titles)
tokenized_questions_updated = tokenize_with_indices(tokenized_questions)
tokenized_answers_updated = tokenize_with_indices(tokenized_answers)

In [49]:
#pad questions and answers to be the same length and then stack them for the input tensors

questions_max_len = max(len(question) for question in tokenized_questions_updated)
answers_max_len = max(len(answer) for answer in tokenized_answers_updated)
max_length = max(questions_max_len, answers_max_len)

def pad_lists(lists, max_length):
    padded_lists = []
    for lst in lists:
        padding_size = max_length - len(lst)
        padded_list = lst + [0] * padding_size
        padded_lists.append(padded_list) 
    return padded_lists

padded_questions = pad_lists(tokenized_questions_updated, max_length)
padded_answers = pad_lists(tokenized_answers_updated, max_length)

questions_and_answers_tensors = []

for i in range(len(tokenized_questions_updated)):
    question_tensor = torch.tensor(padded_questions[i])
    answer_tensor = torch.tensor(padded_answers[i])
    padded_question_and_answer = pad_sequence([question_tensor, answer_tensor], batch_first=True, padding_value=0)
    stacked_question_and_answer = torch.stack(tuple(padded_question_and_answer))
    questions_and_answers_tensors.append(stacked_question_and_answer)

print(str(len(questions_and_answers_tensors)))

input_tensors = torch.stack(tuple(questions_and_answers_tensors))


23847


In [50]:
#make the output tensors -- these are all ones since all the answers are correct

answer_tensors = []

for i in range(len(questions_and_answers_tensors)):
    answer_tensor = torch.tensor(1)
    answer_tensors.append(answer_tensor)

output_tensors = torch.stack(tuple(answer_tensors))

print(str(len(output_tensors)))


23847


In [51]:
tensor_dataset = TensorDataset(input_tensors.to(torch.float32), output_tensors.to(torch.float32))

In [37]:
# THIS IS OLDDDDDDDDDDDDDDDDD

# # batch_questions_tensor = torch.tensor(tokenized_questions_updated)
# # turn all the questions into tensors
# questions_tensors = [torch.tensor(question) for question in tokenized_questions_updated]
# answers_tensors = [torch.tensor(answer) for answer in tokenized_answers_updated]

# # make sure all tensors are the same size
# padded_questions = pad_sequence(questions_tensors, batch_first=True, padding_value=-1)
# padded_answers = pad_sequence(answers_tensors, batch_first=True, padding_value=-1)

# # stack all the tensors into a single tensor so that it can be passed into tensor_dataset
# questions_tensors_stack = torch.stack(tuple(padded_questions))
# answers_tensors_stack = torch.stack(tuple(padded_answers))

# tensor_dataset = TensorDataset(questions_tensors_stack.to(torch.float32), output_tensors.to(torch.float32))
# # tensor_dataset = TensorDataset(questions_tensors_stack.to(torch.float32), answers_tensors_stack.to(torch.float32))

In [11]:
# DEBUGGING CODE

# def is_batched(input_tensor):
#     return input_tensor.dim() > 1 and input_tensor.size(0) != 1

# print(is_batched(questions_tensors_stack))
# print(is_batched(answers_tensors_stack))

In [54]:
# Define hyperparameters
input_dim = 337  # Input dimension (e.g., size of vocabulary)
hidden_dim = 128  # Hidden dimension of the LSTM
output_dim = 1  # Output dimension (1 for binary classification)

# Instantiate the model
model = LSTMModel(input_dim, hidden_dim, output_dim)

# Define loss function and optimizer
criterion = nn.BCEWithLogitsLoss()  # Binary cross-entropy loss
optimizer = optim.Adam(model.parameters(), lr=0.001)

batch_size = 50
train_loader = DataLoader(dataset=tensor_dataset,
                          batch_size=batch_size,
                          shuffle=True,
                          num_workers=0)


torch.float32


In [13]:
# DEBUGGING CODE (checking inputs and outputs have the right shape)

# for batch_inputs, batch_outputs in train_loader:
#     print("Batch inputs shape:", batch_inputs.dtype)
# #     print("Batch outputs shape:", batch_outputs.shape)

In [59]:
# Iterations
num_epochs = 10
for epoch in range(num_epochs):
  for ex, (inputs, labels) in enumerate(train_loader):

    #output is the whether the answer is true or false
    y_predicted = model(inputs) 
    output, state = y_predicted 

    print(output)
    print(labels)
    output_collapsed = torch.squeeze(output, dim=1)
    loss = criterion(output_collapsed, labels) #ERROR IS NOW HERE LOL

    # Backward pass
    loss.backward()
    # Updates
    optimizer.step()
    # Zero gradients
    optimizer.zero_grad()

    if (epoch % 10) == 0:
      print(f'epoch: {epoch+1}, loss = {loss.item(): .4f}')

tensor([[ 0.2054],
        [-0.0374],
        [ 0.3475],
        [-0.0750],
        [-0.0881],
        [ 0.4544],
        [ 0.1159],
        [-0.0064],
        [ 0.2578],
        [ 0.1278],
        [ 0.1918],
        [ 0.1346],
        [-0.1503],
        [-0.0166],
        [ 0.2393],
        [ 0.4028],
        [-0.0959],
        [-0.0738],
        [ 0.0420],
        [-0.0584],
        [ 0.1399],
        [-0.0132],
        [ 0.1074],
        [ 0.1577],
        [ 0.3188],
        [ 0.1110],
        [ 0.0032],
        [-0.1912],
        [ 0.0456],
        [ 0.1885],
        [ 0.1351],
        [-0.0351],
        [-0.2241],
        [ 0.1246],
        [-0.1572],
        [-0.0206],
        [-0.5091],
        [ 0.1124],
        [ 0.4596],
        [-0.2306],
        [ 0.1595],
        [-0.0262],
        [-0.1000],
        [ 0.2580],
        [ 0.1527],
        [ 0.1876],
        [-0.2518],
        [-0.1973],
        [ 0.2256],
        [ 0.3090]], grad_fn=<AddmmBackward0>)
tensor([1., 1., 1., 1.,

In [63]:
#prepare test data
with open('resources/small.buzztrain.json') as f:
    test = json.load(f)

test_questions, test_answers = create_questions_answers(test["questions"])

tokenized_test_questions = [tokenizer(text) for text in test_questions]
tokenized_test_answers = [tokenizer(text) for text in test_answers]
all_test_tokenized = [tokenized_test_questions, tokenized_test_answers]

tokenized_test_questions_updated = tokenize_with_indices(tokenized_test_questions)
tokenized_test_answers_updated = tokenize_with_indices(tokenized_test_answers)

test_questions_max_len = max(len(question) for question in tokenized_test_questions_updated)
test_answers_max_len = max(len(answer) for answer in tokenized_test_answers_updated)
max_length_test = max(test_questions_max_len, test_answers_max_len)

padded_test_questions = pad_lists(tokenized_test_questions_updated, max_length_test)
padded_test_answers = pad_lists(tokenized_test_answers_updated, max_length_test)

test_questions_and_answers_tensors = []

for i in range(len(tokenized_test_questions_updated)):
    question_tensor = torch.tensor(padded_test_questions[i])
    answer_tensor = torch.tensor(padded_test_answers[i])
    padded_test_question_and_answer = pad_sequence([question_tensor, answer_tensor], batch_first=True, padding_value=0)
    stacked_test_question_and_answer = torch.stack(tuple(padded_test_question_and_answer))
    test_questions_and_answers_tensors.append(stacked_test_question_and_answer)

test_input_tensors = torch.stack(tuple(test_questions_and_answers_tensors))


In [67]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

test_loader = DataLoader(dataset=test_input_tensors, num_workers=0)

print(str(len(test_input_tensors)))

with torch.no_grad():
  y_pred = [model(inputs).round().cpu().detach().numpy()[0][0] for inputs in test_loader]
  y_true = [labels.cpu().detach().numpy()[0][0] for _, labels in test_loader]

  acc = accuracy_score(y_true, y_pred)
  precision = precision_score(y_true, y_pred)
  recall = recall_score(y_true, y_pred)

  print(f'Accuracy: {acc:.4f}')
  print(f'Precision: {precision:.4f}')
  print(f'Recall: {recall:.4f}')

4594


RuntimeError: input.size(-1) must be equal to input_size. Expected 337, got 254