# Imports

In [1]:
from transformers import GPT2Tokenizer, GPT2Model, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling, AutoModelForCausalLM
import pandas as pd
import torch
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, TaskType, PeftModel
import numpy as np
import random

  from .autonotebook import tqdm as notebook_tqdm


# Enforce Reproducibility

In [2]:
seed = 44

random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(42)  # if using multi-GPU

torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Load Pre-trained GPT-2 Components

In [3]:
# Load pre-trained model (GPT-2)
base_gpt2_model = GPT2LMHeadModel.from_pretrained('gpt2')

# Load pre-trained tokenizer
gpt2_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Add padding token for the tokenizer if not present
gpt2_tokenizer.pad_token = gpt2_tokenizer.eos_token
base_gpt2_model.config.pad_token_id = base_gpt2_model.config.eos_token_id

# Add special tokens to the tokenizer
special_tokens = {
    'bos_token': '<|bos|>',
    'eos_token': '<|eos|>',
    'pad_token': '<|pad|>',
    'sep_token': '<|sep|>'
}
gpt2_tokenizer.add_special_tokens(special_tokens)

# Initialize model with resized token embeddings
base_gpt2_model.resize_token_embeddings(len(gpt2_tokenizer))

# Set the pad_token_id in the model configuration
base_gpt2_model.config.pad_token_id = gpt2_tokenizer.pad_token_id




# Load and Preprocess Data

In [4]:
def prep_e2e_for_gpt2(tokenizer):
    dataset = load_dataset("e2e_nlg")
    # print(dataset["test"]["meaning_representation"])

    # # save test set for metric computation with official script
    # testSet = dataset["test"]
    # df = testSet.to_pandas()
    # df.to_csv("e2e_nlg_test_set.csv", index=False)

    # Add special tokens
    special_tokens = {
        'bos_token': '<|bos|>',
        'eos_token': '<|eos|>',
        'pad_token': '<|pad|>',
        'sep_token': '<|sep|>'
    }
    tokenizer.add_special_tokens(special_tokens)

    def preprocess_train(examples):
        # Format: <|bos|> meaning_representation <|sep|> target <|eos|>
        texts = [
            f"{tokenizer.bos_token} {mr} {tokenizer.sep_token} {ref} {tokenizer.eos_token}"
            for mr, ref in zip(examples['meaning_representation'], examples['human_reference'])
        ]
        
        # Tokenize
        encodings = tokenizer(
            texts,
            truncation=True,
            padding='max_length',
            max_length=128,
        )
        
        # Create labels (same as input_ids initially)
        labels = encodings['input_ids']
    
        # Mask out MR tokens in labels
        for i, (mr, ref) in enumerate(zip(examples['meaning_representation'], examples['human_reference'])):
            # Tokenize MR and find its length
            mr_tokens = tokenizer(f"{tokenizer.bos_token} {mr} {tokenizer.sep_token}", add_special_tokens=False).input_ids
        
            # Mask MR tokens with -100
            num_mr_tokens = len(mr_tokens)
            labels[i][:num_mr_tokens] = [-100] * num_mr_tokens
    
        # Add labels to the encodings
        encodings['labels'] = labels
        return encodings

        # # Create labels (same as input_ids)
        # encodings['labels'] = encodings['input_ids']
        # return encodings
    

    def preprocess_test(examples):
        mr_texts = [
            f"{tokenizer.bos_token} {mr} {tokenizer.sep_token} "
            for mr in examples['meaning_representation']
        ]

        mr_encodings = tokenizer(
            mr_texts,
            truncation=True,
            padding='max_length',
            max_length=64,
        )
    
        ref_texts = [
            f"{ref} {tokenizer.eos_token}"
            for ref in examples['human_reference']
        ]

        ref_encodings = tokenizer(
            ref_texts,
            truncation=True,
            padding='max_length',
            max_length=64,
        )

        mr_encodings['labels'] = ref_encodings['input_ids']
        return mr_encodings


    # Process all splits
    tokenized_train = dataset.map(
        preprocess_train,
        batched=True,
        remove_columns=dataset['train'].column_names
    )

    tokenized_test = dataset.map(
        preprocess_test,
        batched=True,
    )

    # Set format to PyTorch tensors
    tokenized_train.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
    tokenized_test.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
    
    return tokenized_train, tokenized_test
#=====================================================

tokenized_train, tokenized_test = prep_e2e_for_gpt2(gpt2_tokenizer)


# Define function to compute metrics

In [9]:
import evaluate
from pycocoevalcap.cider.cider import Cider
from pycocoevalcap.meteor.meteor import Meteor
from nltk.translate.nist_score import sentence_nist

# Load metrics
bleu = evaluate.load("bleu")
rouge = evaluate.load("rouge")
cider = Cider()
meteor = Meteor()

def compute_metrics(pred):
    predictions = pred["predictions"]
    labels = pred["label_ids"]
    
    print(predictions[0])
    # print(labels[0])
    
    # Decode predictions and labels
    decoded_preds = predictions #gpt2_tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = labels #gpt2_tokenizer.batch_decode(labels, skip_special_tokens=True)
    # decoded_labels = [[label] for label in decoded_labels]  # Format for some metrics

    # Compute BLEU
    bleu_score = bleu.compute(predictions=decoded_preds, references=decoded_labels)["bleu"]
    
    # Compute ROUGE-L
    rouge_result = rouge.compute(predictions=decoded_preds, references=decoded_labels, rouge_types=["rougeL"])
    rougeL_score = rouge_result["rougeL"]
    
    # Compute CIDEr
    # Format the predictions and references into dictionaries with index as key
    # CIDEr expects the input format to be a dictionary with example ID as the key
    refs = {i: [label] for i, label in enumerate(decoded_labels)}
    preds = {i: [pred] for i, pred in enumerate(decoded_preds)}
    cider_score, _ = cider.compute_score(refs, preds)
    
    # Compute METEOR
    meteor_score, _ = meteor.compute_score(preds, refs)
    
    # Compute NIST
    nist_scores = []
    decoded_labels = [label.split() for label in decoded_labels]
    # print(f"decoded Labels: {decoded_labels}")
    for candidate in decoded_preds:
        nist_score = sentence_nist(decoded_labels, candidate.split())  # You can adjust n as needed
        nist_scores.append(nist_score)

    avg_nist_score = sum(nist_scores) / len(nist_scores) if nist_scores else 0  # Average NIST score

    return {
        "bleu": bleu_score,
        "nist": avg_nist_score,
        "meteor": meteor_score,
        "rougeL": rougeL_score,
        "cider": cider_score,
    }



# Define Custom Eval/Test Loop

In [13]:
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding

def evaluate_with_custom_generation(trainer, out_filename, beam_size, length_penalty, no_repeat_ngram_size, batch_size=8):
    model = trainer.model
    tokenizer = gpt2_tokenizer
    model.eval()  # Set the model to evaluation mode

    # DataLoader for evaluation dataset
    eval_dataloader = DataLoader(
        trainer.eval_dataset,
        batch_size=batch_size,
        collate_fn=DataCollatorWithPadding(tokenizer)
    )

    eval_predictions = []
    eval_labels = []

    with torch.no_grad():  # Disable gradient calculations for evaluation
        for batch in eval_dataloader:
            input_ids = batch['input_ids'].to(model.device)
            attention_mask = batch['attention_mask'].to(model.device)

            # print(f"batch: {tokenizer.batch_decode(input_ids, skip_special_tokens=True)}")
            # print(f"input ids: {input_ids}")
            
            # Generate predictions with beam search
            outputs = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                num_beams=beam_size,
                length_penalty=length_penalty,
                no_repeat_ngram_size=no_repeat_ngram_size,
                max_length=input_ids.size(1) + 40,
                eos_token_id=model.config.eos_token_id,  # Ensure eos_token_id is set
                early_stopping=True,
            )

            # print(model.config.eos_token_id)
            # Remove the input_ids from the generated outputs
            # Slice the generated output to remove input part (this assumes `outputs` includes the input tokens)
            generated_ids = outputs[:, input_ids.size(1):]  # Skip the input length

            # Decode predictions
            decoded_outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
            eval_predictions.extend(decoded_outputs)

            # Decode labels for metric comparison
            decoded_labels = tokenizer.batch_decode(batch['labels'], skip_special_tokens=True)
            eval_labels.extend(decoded_labels)
            break  #=====================================================================

    # Prepare predictions and labels for metric computation
    pred = {
        'predictions': eval_predictions,
        'label_ids': eval_labels
    }

    # save preds to file
    with open(out_filename, 'w') as f:
        f.writelines(item + "\n" for item in eval_predictions)

    # Compute custom evaluation metrics
    custom_eval_metrics = trainer.compute_metrics(pred)
    print("Evaluation Metrics:", custom_eval_metrics)

    return custom_eval_metrics 

# Baseline LoRA Config and Model

In [11]:
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
# From the original implementation in https://arxiv.org/abs/2106.09685
# Note that we are using vanilla GPT-2 (i.e. GPT-2 Small), rather than the original GPT-2 Medium
base_lora_config = LoraConfig(
    r=4,
    lora_alpha=32,
    target_modules=["c_attn"],  # only W_q and W_v are used in the benchmark
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",  # TaskType.LM,
    init_lora_weights = True,
)
base_model = get_peft_model(base_gpt2_model, base_lora_config)
# Print trainable parameters
trainable_params = sum(p.numel() for p in base_model.parameters() if p.requires_grad)
print(f"Trainable Parameters: {trainable_params}")

# all batches originally 8, but made smaller bc ran out of memory
base_training_args = TrainingArguments(
    output_dir="./base_results",
    eval_strategy="no",
    learning_rate=2e-4,
    lr_scheduler_type="linear",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    warmup_steps=500,
    label_smoothing_factor=0.1,
    logging_dir='./base_logs',
    logging_steps=500,
    save_total_limit=2,
    fp16=True,
)

base_trainer = Trainer(
    model=base_model,
    args=base_training_args,
    train_dataset=tokenized_train['train'],
    eval_dataset=tokenized_test['test'],
    compute_metrics=compute_metrics,
    data_collator=DataCollatorForLanguageModeling(tokenizer=gpt2_tokenizer, mlm=False),
)


base_trainer.train()
base_trainer.save_model("./base_model")


Trainable Parameters: 147456


# Eval Base Model

In [14]:
# reload model and trainer
reloaded_base_model = PeftModel.from_pretrained(base_gpt2_model, "./base_model")

# reloaded_base_model.resize_token_embeddings(len(gpt2_tokenizer))
base_trainer = Trainer(
    model=reloaded_base_model,
    args=base_training_args,
    train_dataset=tokenized_train['train'],
    eval_dataset=tokenized_test['test'],  #tokenized_train['validation'],  #['validation'],
    compute_metrics=compute_metrics,
    data_collator=DataCollatorForLanguageModeling(tokenizer=gpt2_tokenizer, mlm=False),
)

beam_size = 10
length_penalty = 0.9
no_repeat_ngram_size = 4

print('='*20)
print("BASE METRICS:")
base_metrics = evaluate_with_custom_generation(base_trainer, out_filename="base-output.txt", beam_size=beam_size, length_penalty=length_penalty, no_repeat_ngram_size=no_repeat_ngram_size)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


BASE METRICS:
batch: [' name[Blue Spice], eatType[coffee shop], area[city centre]  ', ' name[Blue Spice], eatType[coffee shop], area[city centre]  ', ' name[Blue Spice], eatType[coffee shop], area[riverside]  ', ' name[Blue Spice], eatType[coffee shop], area[riverside]  ', ' name[Blue Spice], eatType[coffee shop], customer rating[5 out of 5], near[Crowne Plaza Hotel]  ', ' name[Blue Spice], eatType[coffee shop], customer rating[5 out of 5], near[Crowne Plaza Hotel]  ', ' name[Blue Spice], eatType[coffee shop], customer rating[5 out of 5], near[Crowne Plaza Hotel]  ', ' name[Blue Spice], eatType[coffee shop], customer rating[average], near[Burger King]  ']
                   � ................  


ZeroDivisionError: division by zero

# Better LoRA Config and Model

In [32]:
import os
from copy import deepcopy
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

better_gpt2_model = deepcopy(base_gpt2_model)

# My implementation to improve upon baseline
better_lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["c_attn", "c_proj", "c_fc", "wte", "wpe"],  # only W_q and W_v are used in the benchmark
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",  # TaskType.LM,
    init_lora_weights = "gaussian",
)
better_model = get_peft_model(better_gpt2_model, better_lora_config)
# Print trainable parameters
trainable_params = sum(p.numel() for p in better_model.parameters() if p.requires_grad)
print(f"Trainable Parameters: {trainable_params}")

better_training_args = TrainingArguments(
    output_dir="./better_results",
    eval_strategy="no",
    learning_rate=2e-4,
    lr_scheduler_type="linear",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    warmup_steps=500,
    label_smoothing_factor=0.1,
    logging_dir='./better_logs',
    logging_steps=500,
    save_total_limit=2,
    fp16=True,
)

# better_trainer = Trainer(
#     model=better_model,
#     args=better_training_args,
#     train_dataset=tokenized_train['train'],
#     eval_dataset=tokenized_test['test'],
#     compute_metrics=compute_metrics,
#     data_collator=DataCollatorForLanguageModeling(tokenizer=gpt2_tokenizer, mlm=False),
# )


# better_trainer.train()
# better_trainer.save_model("./better_model")


Trainable Parameters: 3204432




# Eval Better Model

In [61]:
# reload model and trainer
reloaded_better_model = PeftModel.from_pretrained(better_gpt2_model, "./better_model")

# reloaded_base_model.resize_token_embeddings(len(gpt2_tokenizer))
better_trainer = Trainer(
    model=reloaded_better_model,
    args=better_training_args,
    train_dataset=tokenized_train['train'],
    eval_dataset=tokenized_test['test'],  #['validation'],
    compute_metrics=compute_metrics,
    data_collator=DataCollatorForLanguageModeling(tokenizer=gpt2_tokenizer, mlm=False),
)

beam_size = 10
length_penalty = 0.9
no_repeat_ngram_size = 4

print('='*20)
print("BETTER METRICS:")
better_metrics = evaluate_with_custom_generation(better_trainer, out_filename="better-output.txt", beam_size=beam_size, length_penalty=length_penalty, no_repeat_ngram_size=no_repeat_ngram_size)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


BETTER METRICS:
Evaluation Metrics: {'bleu': 0.20487219861156433, 'nist': 2.5084701959526274, 'meteor': 0.23775311842805094, 'rougeL': 0.46319725238068893, 'cider': 0.12804611983750253}
