In [139]:
import math
import random
import torch
import transformers
import numpy as np
from functools import partial
print(transformers.__version__)

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

4.28.1


In [140]:
from experiment import *

In [141]:
model_checkpoint = "facebook/bart-base"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

config = {
    "batch_size": 16,
    "learning_rate": 2e-5,
    "rpn": True,
    "dataset": "mawps",
    "epochs": 2,
    "weight_decay": 0.01,
}

In [142]:
def mwp_to_dict(mwp):
    return {
        "id": mwp.id,
        "question": mwp.question,
        "equation": mwp.equation,
        "answer": mwp.answer,
        "numbers": mwp.numbers,
    }

def train_test_split(data, test_size=0.1):
    random.seed(1)
    random.shuffle(data)

    boundary = math.floor(len(data) * (1 - test_size))
    train = data[:len(data) - boundary - 1]
    test = data[boundary:]
    return { "train": train, "test": test }

In [143]:
class MWPDataset(torch.utils.data.Dataset):
  def __init__(self, inputs, targets):
    self.inputs = inputs
    self.targets = targets

  def __getitem__(self, idx):
    item = {key: torch.tensor(val[idx], device=device) for key, val in self.inputs.items()}
    item['labels'] = torch.tensor(self.targets['input_ids'][idx], device=device)
    return item
  
  def __len__(self):
    return len(self.inputs['input_ids'])

In [144]:
def get_data(config):
    # mwps, _, _ = load_data(config)
    mwps = prepare_training_data(config['dataset'])
    print(f"Num mwps: {len(mwps)}")
    data = list(map(mwp_to_dict, mwps))

    inputs = train_test_split([mwp["question"] for mwp in data])
    targets = train_test_split([mwp["equation"] for mwp in data])
    mwps = train_test_split(data)

    return inputs, targets, mwps

In [145]:
def tokenise_data(tokeniser, inputs, targets):
    max_input_length = 1024
    max_target_length = 64

    tokenised_inputs = {
        "train": tokeniser(inputs["train"], max_length=max_input_length, truncation=True),
        "test": tokeniser(inputs["test"], max_length=max_input_length, truncation=True),
    }

    tokenised_targets = {
        "train": tokeniser(targets["train"], max_length=max_target_length, truncation=True),
        "test": tokeniser(targets["test"], max_length=max_target_length, truncation=True),
    }

    train_dataset = MWPDataset(tokenised_inputs["train"], tokenised_targets["train"])
    test_dataset = MWPDataset(tokenised_inputs["test"], tokenised_targets["test"])

    return train_dataset, test_dataset

In [146]:
def train_model(config, model, tokeniser, train_dataset, test_dataset, test_mwps):
    batch_size = config["batch_size"]
    args = Seq2SeqTrainingArguments(
        f"{model_checkpoint}-finetunes-mawps",
        evaluation_strategy = "epoch",
        logging_strategy="epoch",
        save_strategy="epoch",
        learning_rate=config["learning_rate"],
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        weight_decay=config["weight_decay"],
        save_total_limit=3,
        num_train_epochs=config["epochs"],
        predict_with_generate=True,
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
    )

    data_collator = DataCollatorForSeq2Seq(tokeniser, model=model)

    def compute_metrics(tokeniser, mwps, eval_pred):
        print(eval_pred.predictions)
        print(eval_pred.predictions.shape)
        print(eval_pred.label_ids)
        print(eval_pred.label_ids.shape)
        print(eval_pred.inputs)
        if eval_pred.inputs is not None:
            print(eval_pred.inputs.shape)

        correct = 0

        for i in range(len(eval_pred.predictions)):
            mwp = mwps[i]

            numbers = list(map(float, mwp['numbers'].split(",")))
            answer = mwp["answer"]
            target = mwp["equation"]

            pred_tokens = np.expand_dims(eval_pred.predictions[i], 0)
            pred = [tokeniser.decode(token, skip_special_tokens=True, clean_up_tokenization_spaces=False) for token in pred_tokens]
            
            rpn_exp = infix_to_rpn(pred[0].split(" "))
            output_ans = eval_rpn(rpn_exp, numbers)

            if output_ans is not None and math.isclose(output_ans, answer, rel_tol=1e-4):
                # print("CORRECT:", pred[0], "<>", target)
                correct += 1
            # else:
            #     print("WRONG:", pred[0], " @@@ ", target)
            
        return {
            'accuracy': correct / len(eval_pred.predictions)
        }

    trainer = Seq2SeqTrainer(
        model,
        args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        data_collator=data_collator,
        tokenizer=tokeniser,
        compute_metrics=partial(compute_metrics, tokeniser, test_mwps),
    )

    print("Training now...")

    trainer.train()

    return trainer

In [147]:
def is_correct(model, input_tokens, target, numbers, answer, tokeniser, attempts=3):
    for _ in range(attempts):
        pred_tokens = model.generate(input_tokens['input_ids'], num_beams=4, max_length=32, early_stopping=True)
        pred = [tokeniser.decode(token, skip_special_tokens=True, clean_up_tokenization_spaces=False) for token in pred_tokens]

        rpn_exp = infix_to_rpn(pred[0].split(" "))
        output_ans = eval_rpn(rpn_exp, numbers)

        if output_ans is None:
            print("RETRYING")
            continue

        if math.isclose(output_ans, answer, rel_tol=1e-4):
            print("CORRECT:", pred[0], "<>", target)
            return True
        else:
            print("WRONG:", pred[0], " @@@ ", target)
            return False

        # print(pred[0], "<>", rpn_exp)
        # if rpn_exp is None:
        #     print("Retrying")
        #     continue

        # if pred[0] == target:
        #     print("CORRECT:", pred)
        #     return True
        # else:
        #     print(pred[0], " @@@ ", target)
        #     return False
    return False

def evaluate_accuracy(model, tokeniser, inputs, targets, mwps):
    print("Evaluating...")
    correct = 0
    for i in range(len(inputs)):
        input = inputs[i]
        target = targets[i]
        mwp = mwps[i]

        numbers = list(map(float, mwp['numbers'].split(",")))
        answer = mwp["answer"]

        input_tokens = tokeniser([input], max_length=1024, return_tensors='pt')
        input_tokens['input_ids'].to(device)

        if is_correct(model, input_tokens, target, numbers, answer, tokeniser):
            correct += 1

        # pred_tokens = model.generate(input_tokens['input_ids'], num_beams=4, max_length=32, early_stopping=True)
        # pred = [tokeniser.decode(token, skip_special_tokens=True, clean_up_tokenization_spaces=False) for token in pred_tokens]

        # if pred[0] == target:
        #     print("CORRECT:", pred)
        #     correct += 1
        # else:
        #     print(pred[0], " @@@ ", target)

    return correct / len(inputs)

In [148]:
tokeniser = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint).to(device)

inputs, targets, mwps = get_data(config)
print(f"# train: {len(inputs['train'])}, # test: {len(inputs['test'])}")
train_dataset, test_dataset = tokenise_data(tokeniser, inputs, targets)

# print(evaluate_accuracy(model, tokeniser, inputs['test'], targets['test'], mwps['test']))

trainer = train_model(config, model, tokeniser, train_dataset, test_dataset, mwps['test'])

trainer.save_model('./bart_model_trained')

print(evaluate_accuracy(model, tokeniser, inputs['test'], targets['test'], mwps['test']))

Num mwps: 1954
# train: 195, # test: 196
Training now...




  0%|          | 0/26 [00:00<?, ?it/s]

You're using a BartTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 2.0426, 'learning_rate': 1e-05, 'epoch': 1.0}


  0%|          | 0/13 [00:00<?, ?it/s]

[[   2    0 1640 ...    1    1    1]
 [   2    0 1640 ...    1    1    1]
 [   2    0 1640 ...    1    1    1]
 ...
 [   2    0 1640 ...    1    1    1]
 [   2    0 1640 ...    1    1    1]
 [   2    0 1640 ...    1    1    1]]
(196, 20)
[[    0 10431   134 ...     1  -100  -100]
 [    0 10431   288 ...     1  -100  -100]
 [    0  1640   849 ...     1  -100  -100]
 ...
 [    0  1640   849 ...     1  -100  -100]
 [    0 10431   288 ...     1  -100  -100]
 [    0 10431   288 ...     1  -100  -100]]
(196, 22)
None
{'eval_loss': 0.8050152659416199, 'eval_accuracy': 0.20918367346938777, 'eval_runtime': 47.1142, 'eval_samples_per_second': 4.16, 'eval_steps_per_second': 0.276, 'epoch': 1.0}
{'loss': 0.9254, 'learning_rate': 0.0, 'epoch': 2.0}


  0%|          | 0/13 [00:00<?, ?it/s]

[[    2     0  1640 ...     1     1     1]
 [    2     0  1640 ...     1     1     1]
 [    2     0  1640 ...     1     1     1]
 ...
 [    2     0 10431 ...     1     1     1]
 [    2     0  1640 ...     1     1     1]
 [    2     0  1640 ...     1     1     1]]
(196, 20)
[[    0 10431   134 ...     1  -100  -100]
 [    0 10431   288 ...     1  -100  -100]
 [    0  1640   849 ...     1  -100  -100]
 ...
 [    0  1640   849 ...     1  -100  -100]
 [    0 10431   288 ...     1  -100  -100]
 [    0 10431   288 ...     1  -100  -100]]
(196, 22)
None
{'eval_loss': 0.7482272386550903, 'eval_accuracy': 0.22448979591836735, 'eval_runtime': 53.3795, 'eval_samples_per_second': 3.672, 'eval_steps_per_second': 0.244, 'epoch': 2.0}
{'train_runtime': 255.6662, 'train_samples_per_second': 1.525, 'train_steps_per_second': 0.102, 'train_loss': 1.483998885521522, 'epoch': 2.0}


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Evaluating...
WRONG: ( #0 - #1 )  @@@  #1 / #0
WRONG: ( #0 - #1 )  @@@  #0 + #1
WRONG: ( #0 - #2 )  @@@  ( #2 * ( #0 - #1 ) )
WRONG: ( #0 - #1 )  @@@  #0 + #1
WRONG: ( #0 - #1 )  @@@  #0 + #0
WRONG: ( #0 + #1 )  @@@  ( #2 * ( #0 - #1 ) )


KeyboardInterrupt: 

In [150]:
model2 = AutoModelForSeq2SeqLM.from_pretrained('./bart_model_trained')
tokeniser2 = AutoTokenizer.from_pretrained('./bart_model_trained')

print(evaluate_accuracy(model2, tokeniser2, inputs['test'], targets['test'], mwps['test']))

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Evaluating...
WRONG: ( #0 - #1 )  @@@  #1 / #0
WRONG: ( #0 - #1 )  @@@  #0 + #1
WRONG: ( #0 - #2 )  @@@  ( #2 * ( #0 - #1 ) )
WRONG: ( #0 - #1 )  @@@  #0 + #1
WRONG: ( #0 - #1 )  @@@  #0 + #0
WRONG: ( #0 + #1 )  @@@  ( #2 * ( #0 - #1 ) )
WRONG: ( #0 + #1 )  @@@  ( #0 * #1 )
WRONG: ( #0 + #3 )  @@@  ( #0 * #2 ) + ( #1 * #3 )
WRONG: ( #0 + #1 )  @@@  ( #0 * ( #1 + #2 ) )
WRONG: ( #0 - #2 )  @@@  ( ( #0 + #1 ) / #2 )
WRONG: ( #0 + #1 )  @@@  ( #0 - #2 )
WRONG: ( #0 + #1 )  @@@  ( #2 * ( #0 - #1 ) )
WRONG: ( #0 + #1 )  @@@  #0 - #1
CORRECT: ( #0 + #1 ) <> ( #0 + #1 )
WRONG: ( #0 + #1 )  @@@  #0 + #2
WRONG: ( #0 + #1 )  @@@  ( #0 / #0 )
WRONG: ( #0 - #1 )  @@@  ( #1 / #0 )
WRONG: ( #0 - #1 )  @@@  #0 + #1
WRONG: ( #0 - #1 )  @@@  #0 + #1
WRONG: ( #0 + #1 )  @@@  ( #2 - #1 ) / #0


KeyboardInterrupt: 

In [None]:
# for i in range(len(targets['test'])):
#     t = targets['test'][i]
#     i = inputs['']
#     label_ids = tokeniser(t, max_length=64, truncation=True)
#     pred_tokens = model.generate(input_tokens['input_ids'], num_beams=4, max_length=32, early_stopping=True)
#     print(label_ids)

for i in range(len(test_dataset)):
    t = test_dataset[i]
    # input_  = t['input_ids']
    # print(t['input_ids'])

    input = inputs['test'][i]
    mwp = mwps['test'][i]

    numbers = list(map(float, mwp['numbers'].split(",")))
    answer = mwp["answer"]

    # input_tokens = tokeniser([input], max_length=1024, return_tensors='pt')
    # input_tokens.to(device)

    # print(t['input_ids'])
    # print(input_tokens['input_ids'])

    pred_tokens = model.generate(t['input_ids'].unsqueeze(0), num_beams=4, max_length=32, early_stopping=True)
    pred = [tokeniser.decode(token, skip_special_tokens=True, clean_up_tokenization_spaces=False) for token in pred_tokens]

    rpn_exp = infix_to_rpn(pred[0].split(" "))
    output_ans = eval_rpn(rpn_exp, numbers)


    print(pred_tokens)

tensor([[   2,    0, 1640,  849,  288, 2055,  849,  134, 4839,    2]])
tensor([[    2,     0, 10431,   288,  2055,   849,   134,     2]])
tensor([[   2,    0, 1640,  849,  288, 2055,  849,  134, 4839,    2]])
tensor([[   2,    0, 1640,  849,  288,  111,  849,  176, 4839,    2]])
tensor([[    2,     0, 10431,   288,  2055,   849,   176,     2]])
tensor([[    2,     0, 10431,   288,  2055,   849,   134,     2]])
tensor([[   2,    0, 1640,  849,  288, 2055,  849,  134, 4839,    2]])
tensor([[   2,    0, 1640,  849,  288, 2055,  849,  134, 4839,    2]])
tensor([[   2,    0, 1640,  849,  288,  111,  849,  134, 4839,    2]])
tensor([[   2,    0, 1640,  849,  288, 2055,  849,  134, 4839,    2]])
tensor([[   2,    0, 1640,  849,  288, 2055,  849,  176, 4839,    2]])
tensor([[   2,    0, 1640,  849,  288, 2055,  849,  134, 4839,    2]])
tensor([[   2,    0, 1640,  849,  288, 2055,  849,  134, 4839,    2]])
tensor([[   2,    0, 1640,  849,  288, 2055,  849,  134, 4839,    2]])
tensor([[   2,    

In [None]:
print(evaluate_accuracy(model, tokeniser, inputs['test'], targets['test']))

TypeError: evaluate_accuracy() missing 1 required positional argument: 'mwps'