# LSTM-arithmetic

## Dataset
- [Arithmetic dataset](https://drive.google.com/file/d/1cMuL3hF9jefka9RyF4gEBIGGeFGZYHE-/view?usp=sharing)

In [1]:
# ! pip install seaborn
# ! pip install opencc
# ! pip install -U scikit-learn

import numpy as np
import pandas as pd
import torch
import torch.nn
import torch.nn.utils.rnn
import torch.utils.data
import matplotlib.pyplot as plt
import seaborn as sns
import opencc
import os
from sklearn.model_selection import train_test_split

# data_path = 'Assignments/Assignment2/'

In [2]:
# Read the training and evaluation splits from CSV files in the working directory,
# then preview the first five rows of the training split.
df_train = pd.read_csv(filepath_or_buffer='arithmetic_train.csv')
df_eval = pd.read_csv(filepath_or_buffer='arithmetic_eval.csv')
df_train.head(n=5)

Unnamed: 0.1,Unnamed: 0,src,tgt
0,2285313,14*(43+20)=,882
1,317061,(6+1)*5=,35
2,718770,13+32+29=,74
3,170195,31*(3-11)=,-248
4,2581417,24*49+1=,1177


In [3]:
# Preview the first rows of the evaluation split (same columns as the training split).
df_eval.head()

Unnamed: 0.1,Unnamed: 0,src,tgt
0,2573208,48+43+34=,125
1,1630340,30-(48+13)=,-31
2,549277,(21*31)+10=,661
3,133957,2-27-10=,-35
4,1279828,(15*20)+24=,324


In [4]:
# Turn both splits into full "expression=answer" strings: stringify the answer,
# append it to the prompt stored in `src`, and record the combined length.
# NOTE: `src` is overwritten in place, so re-running this cell appends the answer again.
for split_frame in (df_train, df_eval):
    split_frame['tgt'] = split_frame['tgt'].map(str)
    split_frame['src'] = split_frame['src'] + split_frame['tgt']
    split_frame['len'] = split_frame['src'].map(len)

In [5]:
# Check the result: `src` now holds prompt + answer and `len` its character count.
df_train.head()

Unnamed: 0.1,Unnamed: 0,src,tgt,len
0,2285313,14*(43+20)=882,882,14
1,317061,(6+1)*5=35,35,10
2,718770,13+32+29=74,74,11
3,170195,31*(3-11)=-248,-248,14
4,2581417,24*49+1=1177,1177,12


# Build Dictionary
 - The model cannot perform calculations directly with plain text.
 - Convert all text (numbers/symbols) into numerical representations.
 - Special tokens
    - '&lt;pad&gt;'
        - Each sentence within a batch may have different lengths.
        - The length is padded with '&lt;pad&gt;' to match the longest sentence in the batch.
    - '&lt;eos&gt;'
        - Specifies the end of the generated sequence.
        - Without '&lt;eos&gt;', the model will not know when to stop generating.

In [6]:
# Build the vocabulary from the training split.
# char_to_id converts characters to ids, while id_to_char is the inverse mapping.
# Ids 0 and 1 are reserved for the special tokens <pad> and <eos>; every other
# character receives the next free id in order of first appearance.
char_to_id = {'<eos>': 1, '<pad>': 0}
id_to_char = {1: '<eos>', 0: '<pad>'}

for row in df_train['src']:
    for char in row:
        if char not in char_to_id:
            # The next free id always equals the current vocabulary size, which
            # avoids a counter variable named `id` that would shadow the builtin.
            next_id = len(char_to_id)
            char_to_id[char] = next_id
            id_to_char[next_id] = char


vocab_size = len(char_to_id)
print('Vocab size{}'.format(vocab_size))

Vocab size18


# Data Preprocessing
 - The data is processed into the format required for the model's input and output.
 - Example: 1+2-3=0
     - Model input: 1 + 2 - 3 = 0
     - Model output: / / / / / 0 &lt;eos&gt;  (the '/' can be replaced with &lt;pad&gt;)
     - The key for the model's output is that the model does not need to predict the next character of the previous part. What matters is that once the model sees '=', it should start generating the answer, which is '0'. After generating the answer, it should also generate &lt;eos&gt;.


In [7]:
# Encode every training string as a list of token ids and terminate it with <eos>.

df_train['char_id_list'] = df_train['src'].map(
    lambda text: [char_to_id[ch] for ch in text] + [char_to_id['<eos>']]
)

def convert_tgt_to_label_id_list(row):
    """Derive the label ids for one row from its encoded token ids.

    Every position up to and including the first '=' token is replaced by the
    <pad> id; every position after it keeps its original token id.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with a 'char_id_list' entry.

    Returns:
        A list of ids with the same length as row['char_id_list'].
    """
    labels = []
    answer_started = False
    for token in row['char_id_list']:
        # Before the answer starts the label is <pad>; afterwards it is the token itself.
        labels.append(token if answer_started else char_to_id['<pad>'])
        # The '=' token itself is still labelled <pad>; the answer begins right after it.
        answer_started = answer_started or token == char_to_id['=']

    return labels

# Labels for the training split, derived row by row from its `char_id_list` column.
df_train['label_id_list'] = df_train.apply(func=convert_tgt_to_label_id_list, axis='columns')

# Encode the evaluation split with the same vocabulary, then derive its labels.
df_eval['char_id_list'] = df_eval['src'].map(
    lambda text: [char_to_id[ch] for ch in text] + [char_to_id['<eos>']]
)
df_eval['label_id_list'] = df_eval.apply(func=convert_tgt_to_label_id_list, axis='columns')

df_train.head(n=10)

Unnamed: 0.1,Unnamed: 0,src,tgt,len,char_id_list,label_id_list
0,2285313,14*(43+20)=882,882,14,"[2, 3, 4, 5, 3, 6, 7, 8, 9, 10, 11, 12, 12, 8, 1]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12, 12, 8, 1]"
1,317061,(6+1)*5=35,35,10,"[5, 13, 7, 2, 10, 4, 14, 11, 6, 14, 1]","[0, 0, 0, 0, 0, 0, 0, 0, 6, 14, 1]"
2,718770,13+32+29=74,74,11,"[2, 6, 7, 6, 8, 7, 8, 15, 11, 16, 3, 1]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 16, 3, 1]"
3,170195,31*(3-11)=-248,-248,14,"[6, 2, 4, 5, 6, 17, 2, 2, 10, 11, 17, 8, 3, 12...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 17, 8, 3, 12, 1]"
4,2581417,24*49+1=1177,1177,12,"[8, 3, 4, 3, 15, 7, 2, 11, 2, 2, 16, 16, 1]","[0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 16, 16, 1]"
5,184807,3+(25*25)=628,628,13,"[6, 7, 5, 8, 14, 4, 8, 14, 10, 11, 13, 8, 12, 1]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13, 8, 12, 1]"
6,1590207,8*(30+10)=320,320,13,"[12, 4, 5, 6, 9, 7, 2, 9, 10, 11, 6, 8, 9, 1]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 8, 9, 1]"
7,2052496,9*38+49=391,391,11,"[15, 4, 6, 12, 7, 3, 15, 11, 6, 15, 2, 1]","[0, 0, 0, 0, 0, 0, 0, 0, 6, 15, 2, 1]"
8,1228852,23-17=6,6,7,"[8, 6, 17, 2, 16, 11, 13, 1]","[0, 0, 0, 0, 0, 0, 13, 1]"
9,1238649,23-26*15=-367,-367,13,"[8, 6, 17, 8, 13, 4, 2, 14, 11, 17, 6, 13, 16, 1]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 17, 6, 13, 16, 1]"


# Hyper Parameters

|Hyperparameter|Meaning|Value|
|-|-|-|
|`batch_size`|Number of data samples in a single batch|256|
|`epochs`|Total number of epochs to train|10|
|`embed_dim`|Dimension of the word embeddings|512|
|`hidden_dim`|Dimension of the hidden state in each timestep of the LSTM|512|
|`lr`|Learning Rate|0.001|
|`grad_clip`|To prevent gradient explosion in RNNs, restrict the gradient range|1|

In [9]:
# Training configuration read by the cells below.
batch_size = 256                # samples per mini-batch
epochs = 10                     # full passes over the training set
embed_dim = hidden_dim = 512    # embedding size and LSTM hidden size
lr = 1e-3                       # learning rate handed to the optimizer
grad_clip = 1                   # bound used for gradient value clipping

# Data Batching
- Use `torch.utils.data.Dataset` to create a data generation tool called  `dataset`.
- Then, use `torch.utils.data.DataLoader` to randomly sample from the `dataset` and group the samples into batches.

In [10]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset over a table of encoded sequences.

    Args:
        sequences: Table supporting ``len()`` and ``.iloc[index]`` whose rows
            expose the 'char_id_list' and 'label_id_list' entries.
    """

    def __init__(self, sequences):
        self.sequences = sequences

    def __len__(self):
        """Return the number of samples."""
        return len(self.sequences)

    def __getitem__(self, index):
        """Return the (input, target) pair for one sample.

        The input drops the last token of 'char_id_list' and the target drops the
        first entry of 'label_id_list', so target[t] is the label of the token
        that follows input[t].
        """
        record = self.sequences.iloc[index]
        inputs = record['char_id_list'][:-1]
        targets = record['label_id_list'][1:]
        return inputs, targets

# collate function, used to build the dataloader; it makes sure every sample in a batch has the same length
def collate_fn(batch):
    """Pad a list of (input, target) samples into batch tensors.

    Args:
        batch: Sequence of samples; element 0 of each sample is the input id
            list and element 1 is the target id list.

    Returns:
        ``(pad_batch_x, pad_batch_y, batch_x_lens, batch_y_lens)``: the inputs and
        targets padded with the <pad> id (batch dimension first) and LongTensors
        holding the unpadded length of every input and target.
    """
    pad_id = char_to_id['<pad>']

    def pad_column(position):
        # Tensorise one field of every sample, remember the true lengths, then pad.
        tensors = [torch.tensor(sample[position]) for sample in batch]
        lengths = torch.LongTensor([len(t) for t in tensors])
        padded = torch.nn.utils.rnn.pad_sequence(tensors,
                                                 batch_first=True,
                                                 padding_value=pad_id)
        return padded, lengths

    pad_batch_x, batch_x_lens = pad_column(0)
    pad_batch_y, batch_y_lens = pad_column(1)

    return pad_batch_x, pad_batch_y, batch_x_lens, batch_y_lens

In [11]:
# Wrap the encoded input/label columns of each split in a Dataset.
ds_train, ds_eval = (
    Dataset(split[['char_id_list', 'label_id_list']])
    for split in (df_train, df_eval)
)

In [12]:
# Build the dataloaders of the train set and the eval set; collate_fn pads each batch.
# Training batches are reshuffled every epoch.
dl_train = torch.utils.data.DataLoader(
    ds_train,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collate_fn
)
# Evaluation keeps a fixed order: the exact-match metric does not depend on the
# sample order, so shuffling would only make the evaluation batches non-reproducible.
dl_eval = torch.utils.data.DataLoader(
    ds_eval,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=collate_fn
)

In [13]:
# Sanity check: show the tensor shapes and per-sample lengths of one training batch.
for x, y, x_len, y_len in dl_train:
    print(x.shape, y.shape, x_len, y_len, sep='\n')
    break

torch.Size([256, 16])
torch.Size([256, 16])
tensor([16, 11, 14, 15, 10, 11, 13, 14, 11, 10, 10, 14, 14, 10, 14, 13,  8, 10,
        12, 14, 15, 15, 12,  9, 12, 14, 10, 13, 11, 12, 11, 12, 13, 13, 14, 14,
        10, 14, 15, 12, 13, 12, 12, 12,  9, 11, 14, 13, 13, 12, 13, 15, 12, 13,
        12, 11, 13, 12, 10, 11, 11, 14, 15, 11, 12, 12, 14, 10, 13, 13, 13, 13,
        10, 11, 14, 10, 15, 15, 10, 11, 13, 10, 11, 13, 11, 13, 13, 15, 14, 11,
        15, 13, 14, 12, 14, 13, 14, 15, 15, 13, 13, 11, 14, 12, 14, 14, 12, 13,
        11, 13, 11, 13, 11, 11, 13, 14, 10, 12, 15, 12, 14, 12, 12, 11, 14, 11,
        15, 10, 11, 13, 11, 12, 12, 12, 12, 12, 14, 12, 15, 13, 12,  9, 12, 11,
        11, 12, 14, 12, 10,  8, 12, 15, 11, 13, 10, 13, 12,  8, 12, 14, 12, 14,
        11, 12, 11, 13, 15, 14, 14, 13, 11, 14, 12, 16, 13, 10, 13, 12, 10, 11,
        12, 10, 12, 11, 14, 14, 11, 14, 12, 12, 11, 11, 10, 14, 13, 13,  9, 12,
        14, 11, 10, 11, 13, 13, 11, 15, 11,  9, 12, 12, 10, 13, 13, 14, 12, 

# Model Design

## Execution Flow
1. Convert all characters in the sentence into embeddings.
2. Pass the embeddings through an LSTM sequentially.
3. The output of the LSTM is passed into another LSTM, and additional layers can be added.
4. The output from all time steps of the final LSTM is passed through a Fully Connected layer.
5. The character corresponding to the maximum value across all output dimensions is selected as the next character.

## Loss Function
Since this is a classification task, Cross Entropy is used as the loss function.

## Gradient Update
Adam algorithm is used for gradient updates.

In [18]:
class CharRNN(torch.nn.Module):
    """Character-level LSTM that emits vocabulary logits at every time step.

    The network is an embedding layer, two stacked LSTM layers (batch first) and a
    two-layer fully connected head that maps each hidden state to ``vocab_size`` logits.

    Args:
        vocab_size: Number of token ids (embedding rows and output logits).
        embed_dim: Dimension of the token embeddings.
        hidden_dim: Hidden size of both LSTM layers and of the head's hidden layer.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim):
        super(CharRNN, self).__init__()

        # padding_idx keeps the <pad> embedding fixed at zero (it receives no gradient).
        self.embedding = torch.nn.Embedding(num_embeddings=vocab_size,
                                            embedding_dim=embed_dim,
                                            padding_idx=char_to_id['<pad>'])

        self.rnn_layer1 = torch.nn.LSTM(input_size=embed_dim,
                                        hidden_size=hidden_dim,
                                        batch_first=True)

        self.rnn_layer2 = torch.nn.LSTM(input_size=hidden_dim,
                                        hidden_size=hidden_dim,
                                        batch_first=True)

        self.linear = torch.nn.Sequential(torch.nn.Linear(in_features=hidden_dim,
                                                          out_features=hidden_dim),
                                          torch.nn.ReLU(),
                                          torch.nn.Linear(in_features=hidden_dim,
                                                          out_features=vocab_size))

    def forward(self, batch_x, batch_x_lens):
        """Return the logits for a padded batch; see ``encoder``."""
        return self.encoder(batch_x, batch_x_lens)

    # The forward pass of the model
    def encoder(self, batch_x, batch_x_lens):
        """Run the padded batch through embedding, both LSTMs and the linear head.

        Args:
            batch_x: Padded token ids, batch dimension first.
            batch_x_lens: Unpadded length of every sequence in ``batch_x``.

        Returns:
            Logits with one ``vocab_size`` vector per time step, padded (batch
            first) up to the longest sequence in the batch.
        """
        batch_x = self.embedding(batch_x)

        # Packing lets the LSTMs skip the padded positions; enforce_sorted=False
        # accepts batches that are not ordered by length.
        batch_x = torch.nn.utils.rnn.pack_padded_sequence(batch_x,
                                                          batch_x_lens,
                                                          batch_first=True,
                                                          enforce_sorted=False)

        batch_x, _ = self.rnn_layer1(batch_x)
        batch_x, _ = self.rnn_layer2(batch_x)

        batch_x, _ = torch.nn.utils.rnn.pad_packed_sequence(batch_x,
                                                            batch_first=True)

        batch_x = self.linear(batch_x)

        return batch_x

    def generator(self, start_char, max_len=20):
        """Greedily extend a prompt one token at a time.

        Args:
            start_char: Prompt string; every character must be a key of ``char_to_id``.
            max_len: Upper bound on the total number of tokens, prompt included.

        Returns:
            A list of characters: the prompt followed by the generated tokens.
            Generation stops when <eos> is predicted (it is not part of the
            result) or when ``max_len`` tokens have been reached.
        """
        char_list = [char_to_id[c] for c in start_char]

        # Build the inputs on the device that holds the weights instead of assuming
        # CUDA, so generation also works on CPU-only machines and CPU-resident models.
        device = self.embedding.weight.device

        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            while len(char_list) < max_len:
                # The whole sequence generated so far is fed again at every step.
                inputs = torch.tensor(char_list, dtype=torch.long, device=device).unsqueeze(0)

                hidden = self.embedding(inputs)
                hidden, _ = self.rnn_layer1(hidden)
                hidden, _ = self.rnn_layer2(hidden)

                y = self.linear(hidden).squeeze(0)

                # The logits at the last position predict the next token.
                next_char = torch.argmax(y[-1]).item()

                if next_char == char_to_id['<eos>']:
                    break

                char_list.append(next_char)

        return [id_to_char[ch_id] for ch_id in char_list]

In [19]:
# Seed PyTorch's RNG before the model is instantiated.
torch.manual_seed(2)

# Prefer the GPU when one is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

model = CharRNN(vocab_size, embed_dim, hidden_dim)

In [20]:
# Cross-entropy loss function; targets equal to the <pad> id are ignored.
criterion = torch.nn.CrossEntropyLoss(ignore_index=char_to_id['<pad>'])
# Adam optimizer over all model parameters with the configured learning rate.
optimizer = torch.optim.Adam(params=model.parameters(), lr=lr)

# Training
1. The outer `for` loop controls the `epoch`
    1. The inner `for` loop uses `data_loader` to retrieve batches.
        1. Pass the batch to the `model` for training.
        2. Compare the predicted results `batch_pred_y` with the true labels `batch_y` using Cross Entropy to calculate the loss `loss`
        3. Use `loss.backward` to automatically compute the gradients.
        4. Use `torch.nn.utils.clip_grad_value_` to clamp every gradient value to the range [`-grad_clip`, `grad_clip`].
        5. Use `optimizer.step()` to update the model parameters.
2.  After every `50` batches, refresh the loss shown in the progress bar to monitor whether it is converging.

In [21]:
from tqdm import tqdm
from copy import deepcopy
model = model.to(device)
# Global step counter across all epochs; it only throttles the progress-bar update.
# The validation loop below must not reuse this name.
i = 0
for epoch in range(1, epochs+1):
    # ---- Training ----
    # Validation switches the model to eval mode, so switch back every epoch.
    model.train()
    # The progress bar
    bar = tqdm(dl_train, desc=f"Train epoch {epoch}")
    for batch_x, batch_y, batch_x_lens, batch_y_lens in bar:
        # batch_x size: (batch_size, sequence_length)
        # batch_y size: (batch_size, sequence_length)
        # batch_x_lens size: (batch_size)
        # batch_y_lens size: (batch_size)

        # Clear the gradient
        optimizer.zero_grad()
        batch_x = batch_x.to(device)
        batch_y = batch_y.to(device)

        batch_pred_y = model(batch_x, batch_x_lens)  # one logit vector per input position

        # Packing drops the padded time steps and flattens the remaining ones, so the
        # loss sees (number of real steps, vocab_size) logits against matching labels.
        packed_batch_pred_y = torch.nn.utils.rnn.pack_padded_sequence(batch_pred_y, batch_y_lens, batch_first=True, enforce_sorted=False).data
        packed_y = torch.nn.utils.rnn.pack_padded_sequence(batch_y, batch_y_lens, batch_first=True, enforce_sorted=False).data
        loss = criterion(packed_batch_pred_y, packed_y)

        # Back propagation
        loss.backward()

        torch.nn.utils.clip_grad_value_(model.parameters(), grad_clip)  # gradient clipping

        # Optimize parameters in the model
        optimizer.step()

        i += 1
        if i % 50 == 0:
            bar.set_postfix(loss=loss.item())

    # ---- Validation ----
    # Evaluate your model (much of the code below was written with reference to ChatGPT)
    model.eval()
    bar = tqdm(dl_eval, desc=f"Validation epoch {epoch}")
    matched = 0
    total = 0
    # No gradients are needed while evaluating.
    with torch.no_grad():
        for batch_x, batch_y, batch_x_lens, batch_y_lens in bar:
            batch_x = batch_x.to(device)
            batch_y = batch_y.to(device)

            # Feed batch_x (with its true lengths) and take the arg-max token per position
            predictions = model(batch_x, batch_x_lens)
            pred_labels = torch.argmax(predictions, dim=-1)

            # Labels are <pad> everywhere except on the answer tokens and the closing
            # <eos>, so a row is an exact match when the prediction agrees with the
            # label on every non-<pad> position.
            answer_mask = batch_y != char_to_id['<pad>']
            row_correct = ((pred_labels == batch_y) | ~answer_mask).all(dim=1)

            matched += int(row_correct.sum().item())
            total += batch_y.size(0)

    # Compute Exact Match (EM)
    EM = matched / total if total > 0 else 0
    print(f"Exact Match (EM) accuracy: {EM:.4f}")


Train epoch 1: 100%|██████████| 9255/9255 [02:04<00:00, 74.06it/s, loss=0.271] 
Validation epoch 1: 100%|██████████| 1029/1029 [03:15<00:00,  5.27it/s]


Exact Match (EM) accuracy: 0.6627


Train epoch 2: 100%|██████████| 9255/9255 [02:05<00:00, 73.99it/s, loss=0.158] 
Validation epoch 2: 100%|██████████| 1029/1029 [03:13<00:00,  5.31it/s]


Exact Match (EM) accuracy: 0.8016


Train epoch 3: 100%|██████████| 9255/9255 [02:02<00:00, 75.57it/s, loss=0.109]  
Validation epoch 3: 100%|██████████| 1029/1029 [03:14<00:00,  5.28it/s]


Exact Match (EM) accuracy: 0.8758


Train epoch 4: 100%|██████████| 9255/9255 [02:02<00:00, 75.67it/s, loss=0.0602] 
Validation epoch 4: 100%|██████████| 1029/1029 [03:13<00:00,  5.31it/s]


Exact Match (EM) accuracy: 0.9083


Train epoch 5: 100%|██████████| 9255/9255 [02:03<00:00, 74.92it/s, loss=0.0377] 
Validation epoch 5: 100%|██████████| 1029/1029 [03:05<00:00,  5.56it/s]


Exact Match (EM) accuracy: 0.9152


Train epoch 6: 100%|██████████| 9255/9255 [02:04<00:00, 74.10it/s, loss=0.0585] 
Validation epoch 6: 100%|██████████| 1029/1029 [03:00<00:00,  5.69it/s]


Exact Match (EM) accuracy: 0.9133


Train epoch 7: 100%|██████████| 9255/9255 [02:03<00:00, 74.70it/s, loss=0.0896] 
Validation epoch 7: 100%|██████████| 1029/1029 [03:01<00:00,  5.68it/s]


Exact Match (EM) accuracy: 0.9223


Train epoch 8: 100%|██████████| 9255/9255 [02:01<00:00, 75.99it/s, loss=0.0404] 
Validation epoch 8: 100%|██████████| 1029/1029 [02:58<00:00,  5.76it/s]


Exact Match (EM) accuracy: 0.9226


Train epoch 9: 100%|██████████| 9255/9255 [02:01<00:00, 75.91it/s, loss=0.0449] 
Validation epoch 9: 100%|██████████| 1029/1029 [03:00<00:00,  5.70it/s]


Exact Match (EM) accuracy: 0.9410


Train epoch 10: 100%|██████████| 9255/9255 [02:01<00:00, 76.34it/s, loss=0.0325] 
Validation epoch 10: 100%|██████████| 1029/1029 [02:59<00:00,  5.73it/s]

Exact Match (EM) accuracy: 0.9456





# Generation
Use `model.generator` and provide an initial character to automatically generate a sequence.

In [24]:
# model = model.to("cpu")
# Generate the continuation of the prompt '1+1=' and print prompt + answer as one string.
print("".join(model.generator('1+1=')))

1+1=1


In [36]:
# The return value is a list: the prompt's characters followed by the generated ones.
model.generator('98+82=')

['9', '8', '+', '8', '2', '=', '3', '2']

# 預測沒看過的數字 測試在訓練資料集皆是50以內的數字時，測試資料集都加49來做評估EM。

In [28]:
# Read the evaluation CSV again into a separate frame, leaving the already
# preprocessed `df_eval` untouched.
df_eval2 = pd.read_csv('arithmetic_eval.csv')
df_eval2

Unnamed: 0.1,Unnamed: 0,src,tgt
0,2573208,48+43+34=,125
1,1630340,30-(48+13)=,-31
2,549277,(21*31)+10=,661
3,133957,2-27-10=,-35
4,1279828,(15*20)+24=,324
...,...,...,...
263245,782879,14*43*23=,13846
263246,2533040,48-(5*27)=,-87
263247,2228788,30*42+16=,1276
263248,542747,21*(10-15)=,-105


In [29]:
# 這個cell是Chatgpt生成的 
import re

def add_49_and_evaluate(expression):
    """Shift every integer in an arithmetic prompt by 49 and recompute its value.

    Args:
        expression: Prompt string such as ``'48+43+34='``; every ``'='`` is
            removed before processing.

    Returns:
        ``(modified_expression, result)``: the shifted prompt with a trailing
        ``'='`` and the value of the shifted expression.

    Raises:
        ValueError: If the shifted expression contains anything other than
            digits, ``+ - * /``, parentheses or whitespace.
    """
    # Remove the equals sign "="
    expression = expression.replace("=", "")
    # Use a regular expression to find every number and add 49 to it
    modified_expression = re.sub(r'\d+', lambda x: str(int(x.group()) + 49), expression)
    # SECURITY: eval() executes arbitrary Python and the text comes from a data file.
    # Only plain arithmetic characters are let through, and the evaluated code gets
    # no builtins, so names such as __import__ or open cannot be reached.
    if not re.fullmatch(r'[0-9+\-*/()\s]*', modified_expression):
        raise ValueError(f"unsupported characters in expression: {modified_expression!r}")
    # Compute the value of the modified expression
    result = eval(modified_expression, {"__builtins__": {}}, {})
    # Put the equals sign "=" back
    modified_expression += "="
    return modified_expression, result

# Apply the function to every row and overwrite the original `src` and `tgt` columns directly.
# Each call returns a (prompt, answer) pair; wrapping it in a Series makes `apply`
# produce a two-column result that is assigned to ['src', 'tgt'].
df_eval2[['src', 'tgt']] = df_eval2['src'].apply(lambda x: pd.Series(add_49_and_evaluate(x)))

# Show the result
df_eval2


Unnamed: 0.1,Unnamed: 0,src,tgt
0,2573208,97+92+83=,272
1,1630340,79-(97+62)=,-80
2,549277,(70*80)+59=,5659
3,133957,51-76-59=,-84
4,1279828,(64*69)+73=,4489
...,...,...,...
263245,782879,63*92*72=,417312
263246,2533040,97-(54*76)=,-4007
263247,2228788,79*91+65=,7254
263248,542747,70*(59-64)=,-350


In [30]:

# Build the full "expression=answer" strings for the shifted evaluation set.
df_eval2['tgt'] = df_eval2['tgt'].map(str)
df_eval2['src'] = df_eval2['src'] + df_eval2['tgt']
df_eval2['len'] = df_eval2['src'].map(len)

# Encode with the existing vocabulary (terminated by <eos>) and derive the labels.
df_eval2['char_id_list'] = df_eval2['src'].map(
    lambda text: [char_to_id[ch] for ch in text] + [char_to_id['<eos>']]
)
df_eval2['label_id_list'] = df_eval2.apply(func=convert_tgt_to_label_id_list, axis='columns')
df_eval2.head(n=5)


Unnamed: 0.1,Unnamed: 0,src,tgt,len,char_id_list,label_id_list
0,2573208,97+92+83=272,272,12,"[15, 16, 7, 15, 8, 7, 12, 6, 11, 8, 16, 8, 1]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 16, 8, 1]"
1,1630340,79-(97+62)=-80,-80,14,"[16, 15, 17, 5, 15, 16, 7, 13, 8, 10, 11, 17, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 17, 12, 9, 1]"
2,549277,(70*80)+59=5659,5659,15,"[5, 16, 9, 4, 12, 9, 10, 7, 14, 15, 11, 14, 13...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 13, 14, ..."
3,133957,51-76-59=-84,-84,12,"[14, 2, 17, 16, 13, 17, 14, 15, 11, 17, 12, 3, 1]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 17, 12, 3, 1]"
4,1279828,(64*69)+73=4489,4489,15,"[5, 13, 3, 4, 13, 15, 10, 7, 16, 6, 11, 3, 3, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 12, 15..."


In [32]:
# Dataset and DataLoader for the shifted evaluation set.
ds_eval2 = Dataset(df_eval2[['char_id_list', 'label_id_list']])
# This loader is only used for evaluation, so it keeps a fixed order: the
# exact-match metric does not depend on the sample order and shuffling would
# only make the batches non-reproducible.
dl_eval2 = torch.utils.data.DataLoader(
    ds_eval2,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=collate_fn
)


In [33]:
# Exact-match evaluation on the shifted evaluation set.
bar = tqdm(dl_eval2, desc=f"Validation epoch {epoch}")
matched = 0
total = 0

# Evaluate in eval mode without building autograd graphs, then restore the
# mode the model was in before this cell ran.
was_training = model.training
model.eval()
with torch.no_grad():
    for batch_x, batch_y, batch_x_lens, batch_y_lens in bar:
        batch_x = batch_x.to(device)
        batch_y = batch_y.to(device)

        # Feed batch_x (with its true lengths) and take the arg-max token per position
        predictions = model(batch_x, batch_x_lens)
        pred_labels = torch.argmax(predictions, dim=-1)

        # Labels are <pad> everywhere except on the answer tokens and the closing
        # <eos>, so a row is an exact match when the prediction agrees with the
        # label on every non-<pad> position.
        answer_mask = batch_y != char_to_id['<pad>']
        row_correct = ((pred_labels == batch_y) | ~answer_mask).all(dim=1)

        matched += int(row_correct.sum().item())
        total += batch_y.size(0)
model.train(was_training)

# Compute Exact Match (EM)
EM = matched / total if total > 0 else 0
print(f"Exact Match (EM) accuracy: {EM:.4f}")

Validation epoch 10: 100%|██████████| 1029/1029 [03:14<00:00,  5.29it/s]

Exact Match (EM) accuracy: 0.0016





In [37]:
# Generate the answer for one prompt; the list holds the prompt's characters
# followed by the generated ones.
model.generator('79*39=')

['7', '9', '*', '3', '9', '=', '2', '8', '1', '3']