In [2]:
import torch
import torch.nn as nn
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence

## Encoder

In [3]:
class Encoder(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(Encoder, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size, batch_first=True)

    def forward(self, input_seq):
        embedded = self.embedding(input_seq)
        outputs, (hidden, cell) = self.lstm(embedded)
        return outputs, (hidden, cell)

## Attention

In [4]:
class Attention(nn.Module):
    def __init__(self, hidden_size):
        super(Attention, self).__init__()
        self.attn = nn.Linear(hidden_size * 2, hidden_size)
        self.v = nn.Parameter(torch.rand(hidden_size))

    def forward(self, hidden, encoder_outputs):
        seq_len = encoder_outputs.size(1)
        hidden = hidden.unsqueeze(1).repeat(1, seq_len, 1)
        energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim=2)))
        energy = energy.permute(0, 2, 1)
        v = self.v.repeat(encoder_outputs.size(0), 1).unsqueeze(1)
        attention_weights = torch.bmm(v, energy).squeeze(1)
        return torch.softmax(attention_weights, dim=1)

## Decoder

In [5]:
class Decoder(nn.Module):
    def __init__(self, output_size, hidden_size):
        super(Decoder, self).__init__()
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size * 2, output_size)
        self.attention = Attention(hidden_size)

    def forward(self, input, hidden, cell, encoder_outputs):
        input = input.unsqueeze(1)
        embedded = self.embedding(input)
        lstm_output, (hidden, cell) = self.lstm(embedded, (hidden, cell))
        attention_weights = self.attention(hidden[-1], encoder_outputs)
        context = torch.bmm(attention_weights.unsqueeze(1), encoder_outputs)
        lstm_output = lstm_output.squeeze(1)
        context = context.squeeze(1)
        output = self.fc(torch.cat((lstm_output, context), dim=1))
        return output, hidden, cell, attention_weights

## Tokenization and Vocabulary Setup
First, create a vocabulary and tokenize the input sentence (e.g., "two plus four" etc).

In [6]:
# Tokenization and vocab setup
# Create a vocabulary mapping words to indices
word_to_index = {"<SOS>": 0, "<EOS>": 1, "<PAD>": 2, "two": 3, "plus": 4, "four": 5, "equals": 6, "six": 7, "three": 8, "minus": 9, "one": 10}
index_to_word = {v: k for k, v in word_to_index.items()}  # Reverse mapping for decoding

# Example input and target sequences
input_sentence = "two plus four"
target_sentence = "equals six"

# Tokenize the sentences
input_tokens = [word_to_index[word] for word in input_sentence.split()]
target_tokens = [word_to_index["<SOS>"]] + [word_to_index[word] for word in target_sentence.split()] + [word_to_index["<EOS>"]]

## Generate the math problem dataset from the tokenzied data

In [7]:
class MathWordProblemDataset(Dataset):
    def __init__(self, input_sentences, target_sentences, word_to_index):
        self.input_data = [[word_to_index[word] for word in sentence.split()] for sentence in input_sentences]
        self.target_data = [[word_to_index["<SOS>"]] + [word_to_index[word] for word in sentence.split()] + [word_to_index["<EOS>"]] for sentence in target_sentences]

    def __len__(self):
        return len(self.input_data)

    def __getitem__(self, idx):
        return torch.tensor(self.input_data[idx], dtype=torch.long), torch.tensor(self.target_data[idx], dtype=torch.long)

# Example data
input_sentences = ["two plus four"]
target_sentences = ["equals six"]

# Create dataset and dataloader
dataset = MathWordProblemDataset(input_sentences, target_sentences, word_to_index)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

target_length = len(target_tokens)

## Train the model

In [8]:
# Define the device (CPU or GPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Instantiate the encoder and decoder
input_size = len(word_to_index)  # Total vocabulary size
output_size = len(word_to_index)  # Vocabulary size
hidden_size = 128
encoder = Encoder(input_size, hidden_size).to(device)
decoder = Decoder(output_size, hidden_size).to(device)

criterion = nn.CrossEntropyLoss(ignore_index=word_to_index["<PAD>"])  # Ignore padding tokens
encoder_optimizer = optim.Adam(encoder.parameters(), lr=0.001)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=0.001)

num_epochs = 100  # Define the number of epochs
for epoch in range(num_epochs):
    for input_seq, target_seq in dataloader:
        # Move data to device (CPU or GPU)
        input_seq, target_seq = input_seq.to(device), target_seq.to(device)

        # Zero the gradients
        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()

        # Encoder forward pass
        encoder_outputs, (hidden, cell) = encoder(input_seq)

        # Decoder initialization
        decoder_input = torch.tensor([word_to_index["<SOS>"]]*input_seq.size(0), device=device)
        decoder_hidden, decoder_cell = hidden, cell

        # Iterate over the target sequence
        loss = 0
        for t in range(target_seq.size(1)):
            output, decoder_hidden, decoder_cell, attention_weights = decoder(
                decoder_input, decoder_hidden, decoder_cell, encoder_outputs
            )
            loss += criterion(output, target_seq[:, t])
            decoder_input = target_seq[:, t]  # Teacher forcing

        # Backpropagation and optimization
        loss.backward()
        encoder_optimizer.step()
        decoder_optimizer.step()

    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {loss.item() / target_seq.size(1)}")


torch.save(encoder.state_dict(), "encoder.pth")
torch.save(decoder.state_dict(), "decoder.pth")

Epoch 1/100, Loss: 2.438589572906494
Epoch 2/100, Loss: 2.271918296813965
Epoch 3/100, Loss: 2.1174967288970947
Epoch 4/100, Loss: 1.9624028205871582
Epoch 5/100, Loss: 1.803347110748291
Epoch 6/100, Loss: 1.6440132856369019
Epoch 7/100, Loss: 1.4897698163986206
Epoch 8/100, Loss: 1.343505859375
Epoch 9/100, Loss: 1.2075986862182617
Epoch 10/100, Loss: 1.084038496017456
Epoch 11/100, Loss: 0.9736529588699341
Epoch 12/100, Loss: 0.8760510087013245
Epoch 13/100, Loss: 0.7900800704956055
Epoch 14/100, Loss: 0.7143363952636719
Epoch 15/100, Loss: 0.6474581360816956
Epoch 16/100, Loss: 0.5882113575935364
Epoch 17/100, Loss: 0.5355003476142883
Epoch 18/100, Loss: 0.4883628189563751
Epoch 19/100, Loss: 0.4459781050682068
Epoch 20/100, Loss: 0.40768325328826904
Epoch 21/100, Loss: 0.3729633390903473
Epoch 22/100, Loss: 0.3414159119129181
Epoch 23/100, Loss: 0.31273677945137024
Epoch 24/100, Loss: 0.2867087423801422
Epoch 25/100, Loss: 0.2630557715892792
Epoch 26/100, Loss: 0.24134668707847595


## Test the trained model

In [9]:
# load the trained model
input_size = len(word_to_index)  # Same as during training
output_size = len(word_to_index)
hidden_size = 128  # Same as during training

encoder = Encoder(input_size, hidden_size)
decoder = Decoder(output_size, hidden_size)

# Load the trained weights
encoder.load_state_dict(torch.load("encoder.pth"))
decoder.load_state_dict(torch.load("decoder.pth"))

# Set the models to evaluation mode
encoder.eval()
decoder.eval()

# Tokenize the input sentence
new_input_sentence = "two plus four"
input_tokens = [word_to_index[word] for word in new_input_sentence.split()]
input_seq = torch.tensor(input_tokens, dtype=torch.long).unsqueeze(0)  # Add batch dimension

# generate the output sequence
# Forward pass through the encoder
encoder_outputs, (hidden, cell) = encoder(input_seq)

# Initialize the decoder
decoder_input = torch.tensor([word_to_index["<SOS>"]], dtype=torch.long)  # Start-of-Sequence token
decoder_hidden = hidden
decoder_cell = cell

# Generate output sequence
output_sequence = []
target_length = 10  # Maximum output sequence length

for _ in range(target_length):
    output, decoder_hidden, decoder_cell, _ = decoder(
        decoder_input, decoder_hidden, decoder_cell, encoder_outputs
    )
    predicted_token = output.argmax(1).item()  # Get token with the highest probability
    if predicted_token == word_to_index["<EOS>"]:  # Stop at End-of-Sequence token
        break
    output_sequence.append(predicted_token)
    decoder_input = torch.tensor([predicted_token], dtype=torch.long)

# Convert tokens back to words
output_sentence = " ".join([index_to_word[token] for token in output_sequence])
print("Output Sentence:", output_sentence)

Output Sentence: <SOS> equals six


#############################################################################################################################

# Build with padding for variable lengths

In [10]:
# Define special tokens
word_to_index = {"<SOS>": 0, "<EOS>": 1, "<PAD>": 2}  
 

# Function to tokenize a sentence and update mapping dynamically
def tokenize(sentence, word_to_index):
    tokens = []
    for word in sentence.lower().split():
        if word not in word_to_index:
            word_to_index[word] = len(word_to_index)  # Assign index to new words
        tokens.append(word_to_index[word])
    return tokens

# Load dataset from CSV
def load_sequences_from_csv(csv_file):
    df = pd.read_csv(csv_file)
    return df["Problem"].tolist(), df["Solution"].tolist()

csv_file = "simple_math_problems.csv"
input_sentences, target_sentences = load_sequences_from_csv(csv_file)

# Tokenize input and target sentences
input_data = [tokenize(sentence, word_to_index) for sentence in input_sentences]
target_data = [[word_to_index["<SOS>"]] + tokenize(sentence, word_to_index) + [word_to_index["<EOS>"]]
               for sentence in target_sentences]

index_to_word = {v: k for k, v in word_to_index.items()}

# Convert tokenized sentences into tensors
input_tensors = [torch.tensor(seq) for seq in input_data]
target_tensors = [torch.tensor(seq) for seq in target_data]

# Apply dynamic padding
input_padded = pad_sequence(input_tensors, batch_first=True, padding_value=word_to_index["<PAD>"])
target_padded = pad_sequence(target_tensors, batch_first=True, padding_value=word_to_index["<PAD>"])

class MathWordProblemDataset(Dataset):
    def __init__(self, input_padded, target_padded):
        self.input_data = input_padded
        self.target_data = target_padded

    def __len__(self):
        return len(self.input_data)

    def __getitem__(self, idx):
        return self.input_data[idx], self.target_data[idx]

dataset = MathWordProblemDataset(input_padded, target_padded)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

input_size = len(word_to_index)  # Vocabulary size
output_size = len(word_to_index)
hidden_size = 128

encoder = Encoder(input_size, hidden_size).to(device)
decoder = Decoder(output_size, hidden_size).to(device)

criterion = nn.CrossEntropyLoss(ignore_index=word_to_index["<PAD>"])
encoder_optimizer = optim.Adam(encoder.parameters(), lr=0.001)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=0.001)

num_epochs = 100
for epoch in range(num_epochs):
    for input_seq, target_seq in dataloader:
        input_seq, target_seq = input_seq.to(device), target_seq.to(device)

        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()

        encoder_outputs, (hidden, cell) = encoder(input_seq)

        decoder_input = torch.tensor([word_to_index["<SOS>"]] * input_seq.size(0), device=device)
        decoder_hidden, decoder_cell = hidden, cell

        target_lengths = (target_seq != word_to_index["<PAD>"]).sum(dim=1)

        loss = 0
        max_target_length = target_lengths.max().item()

        for t in range(max_target_length):
            still_active = t < target_lengths
            if not still_active.any():
                break

            output, decoder_hidden, decoder_cell, _ = decoder(
                decoder_input, decoder_hidden, decoder_cell, encoder_outputs
            )
            loss += (criterion(output, target_seq[:, t]) * still_active.float()).sum() / still_active.sum()

            decoder_input = target_seq[:, t]  # Teacher forcing

        loss.backward()
        encoder_optimizer.step()
        decoder_optimizer.step()

    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {loss.item()}")

torch.save(encoder.state_dict(), "encoder-dynamic.pth")
torch.save(decoder.state_dict(), "decoder-dynamic.pth")

Epoch 1/100, Loss: 4.455081462860107
Epoch 2/100, Loss: 3.914167642593384
Epoch 3/100, Loss: 1.9789586067199707
Epoch 4/100, Loss: 1.2735586166381836
Epoch 5/100, Loss: 2.5654962062835693
Epoch 6/100, Loss: 4.185352802276611
Epoch 7/100, Loss: 0.7171554565429688
Epoch 8/100, Loss: 0.5447468757629395
Epoch 9/100, Loss: 3.604170799255371
Epoch 10/100, Loss: 6.16872501373291
Epoch 11/100, Loss: 0.6679847240447998
Epoch 12/100, Loss: 0.09699192643165588
Epoch 13/100, Loss: 0.08766615390777588
Epoch 14/100, Loss: 1.9832141399383545
Epoch 15/100, Loss: 0.25376373529434204
Epoch 16/100, Loss: 2.5221049785614014
Epoch 17/100, Loss: 0.13194917142391205
Epoch 18/100, Loss: 0.06081368774175644
Epoch 19/100, Loss: 1.653520107269287
Epoch 20/100, Loss: 2.3549647331237793
Epoch 21/100, Loss: 2.0650980472564697
Epoch 22/100, Loss: 0.027934523299336433
Epoch 23/100, Loss: 0.17629414796829224
Epoch 24/100, Loss: 1.1658247709274292
Epoch 25/100, Loss: 2.563602924346924
Epoch 26/100, Loss: 0.200281977653

# Test the son bitch

In [11]:
# load the trained model
input_size = len(word_to_index)  # Same as during training
output_size = len(word_to_index)
hidden_size = 128  # Same as during training

encoder = Encoder(input_size, hidden_size)
decoder = Decoder(output_size, hidden_size)

# Load the trained weights
encoder.load_state_dict(torch.load("encoder-dynamic.pth"))
decoder.load_state_dict(torch.load("decoder-dynamic.pth"))

# Set the models to evaluation mode
encoder.eval()
decoder.eval()

# Tokenize the input sentence
new_input_sentence = "twelve minus fourteen" #zero point eight six
input_tokens = [word_to_index[word] for word in new_input_sentence.split()]
input_seq = torch.tensor(input_tokens, dtype=torch.long).unsqueeze(0)  # Add batch dimension


# generate the output sequence
# Forward pass through the encoder
encoder_outputs, (hidden, cell) = encoder(input_seq)

# Initialize the decoder
decoder_input = torch.tensor([word_to_index["<SOS>"]], dtype=torch.long)  # Start-of-Sequence token
decoder_hidden = hidden
decoder_cell = cell

# Generate output sequence
output_sequence = []
target_length = 100  # Maximum output sequence length

for _ in range(target_length):
    output, decoder_hidden, decoder_cell, _ = decoder(
        decoder_input, decoder_hidden, decoder_cell, encoder_outputs
    )
    predicted_token = output.argmax(1).item()  # Get token with the highest probability
    if predicted_token == word_to_index["<EOS>"]:  # Stop at End-of-Sequence token
        break
    output_sequence.append(predicted_token)
    decoder_input = torch.tensor([predicted_token], dtype=torch.long)

print(word_to_index)

print(input_tokens)

print(output_sequence)


# Convert tokens back to words
output_sentence = " ".join([index_to_word[token] for token in output_sequence])
print("Output Sentence:", output_sentence)

{'<SOS>': 0, '<EOS>': 1, '<PAD>': 2, 'seventeen': 3, 'times': 4, 'one': 5, 'hundred': 6, 'twenty': 7, 'plus': 8, 'minus': 9, 'ninety': 10, 'two': 11, 'six': 12, 'divided': 13, 'by': 14, 'sixty': 15, 'nine': 16, 'three': 17, 'forty': 18, 'seven': 19, 'five': 20, 'thirty': 21, 'seventy': 22, 'eighty': 23, 'fourteen': 24, 'fifty': 25, 'sixteen': 26, 'four': 27, 'eight': 28, 'ten': 29, 'thirteen': 30, 'eighteen': 31, 'twelve': 32, 'fifteen': 33, 'nineteen': 34, 'eleven': 35, 'thousand': 36, 'zero': 37, 'point': 38, 'and': 39}
[32, 9, 24]
[0, 35]
Output Sentence: <SOS> eleven
