# Imports

In [1]:
from transformers import GPT2Tokenizer, GPT2Model, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling
import pandas as pd
import torch
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, TaskType
import numpy as np
import random

  from .autonotebook import tqdm as notebook_tqdm


# Enforce Reproducibility

In [2]:
seed = 44

random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(42)  # if using multi-GPU

torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Load Pre-trained GPT-2 Components

In [3]:
# Load pre-trained model (GPT-2)
gpt2_model = GPT2LMHeadModel.from_pretrained('gpt2')

# Load pre-trained tokenizer
gpt2_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Add padding token for the tokenizer if not present
gpt2_tokenizer.pad_token = gpt2_tokenizer.eos_token
gpt2_model.config.pad_token_id = gpt2_model.config.eos_token_id

# Add special tokens to the tokenizer
special_tokens = {
    'bos_token': '<|bos|>',
    'eos_token': '<|eos|>',
    'pad_token': '<|pad|>',
    'sep_token': '<|sep|>'
}
gpt2_tokenizer.add_special_tokens(special_tokens)

# Initialize model with resized token embeddings
gpt2_model.resize_token_embeddings(len(gpt2_tokenizer))



Embedding(50261, 768)

# Load and Preprocess Data

In [4]:
def prep_e2e_for_gpt2(tokenizer):
    dataset = load_dataset("e2e_nlg")

    # Add special tokens
    special_tokens = {
        'bos_token': '<|bos|>',
        'eos_token': '<|eos|>',
        'pad_token': '<|pad|>',
        'sep_token': '<|sep|>'
    }
    tokenizer.add_special_tokens(special_tokens)

    def preprocess_function(examples):
        # Format: <|bos|> meaning_representation <|sep|> target <|eos|>
        texts = [
            f"{tokenizer.bos_token} {mr} {tokenizer.sep_token} {ref} {tokenizer.eos_token}"
            for mr, ref in zip(examples['meaning_representation'], examples['human_reference'])
        ]
        
        # Tokenize
        encodings = tokenizer(
            texts,
            truncation=True,
            padding='max_length',
            max_length=128,
        )
        
        # Create labels (same as input_ids)
        encodings['labels'] = encodings['input_ids']
        return encodings
    
    # Process all splits
    tokenized_dataset = dataset.map(
        preprocess_function,
        batched=True,
        remove_columns=dataset['train'].column_names
    )
    
    # Set format to PyTorch tensors
    tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
    
    return tokenized_dataset
#=====================================================

tokenized_dataset = prep_e2e_for_gpt2(gpt2_tokenizer)


Map: 100%|██████████| 42061/42061 [00:08<00:00, 4683.70 examples/s]
Map: 100%|██████████| 4672/4672 [00:01<00:00, 4540.14 examples/s]
Map: 100%|██████████| 4693/4693 [00:01<00:00, 4185.35 examples/s]


# Baseline LoRA Config and Model

In [5]:
# From the original implementation in https://arxiv.org/abs/2106.09685
# Note that we are using vanilla GPT-2 (i.e. GPT-2 Small), rather than the original GPT-2 Medium
base_lora_config = LoraConfig(
    r=4,
    lora_alpha=32,
    target_modules=["c_attn"],  # only W_q and W_v are used in the benchmark
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",  # TaskType.LM,
    init_lora_weights = True,
)
base_model = get_peft_model(gpt2_model, base_lora_config)



# Better LoRA Config and Model

In [6]:
# My implementation to improve upon baseline
better_lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["c_attn", "c_proj", "c_fc", "wte", "wpe"],  # only W_q and W_v are used in the benchmark
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",  # TaskType.LM,
    init_lora_weights = "gaussian",
)
better_model = get_peft_model(gpt2_model, better_lora_config)

# Define function to compute metrics

In [7]:
import evaluate
from pycocoevalcap.cider.cider import Cider
from pycocoevalcap.meteor.meteor import Meteor
from nltk.translate.nist_score import sentence_nist

# Load metrics
bleu = evaluate.load("bleu")
rouge = evaluate.load("rouge")
cider = Cider()
meteor = Meteor()

def compute_metrics(pred):
    predictions = pred.predictions
    labels = pred.label_ids
    
    # Decode predictions and labels
    decoded_preds = gpt2_tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = gpt2_tokenizer.batch_decode(labels, skip_special_tokens=True)
    decoded_labels = [[label] for label in decoded_labels]  # Format for some metrics

    # Compute BLEU
    bleu_score = bleu.compute(predictions=decoded_preds, references=decoded_labels)["bleu"]
    
    # Compute ROUGE-L
    rouge_result = rouge.compute(predictions=decoded_preds, references=decoded_labels, rouge_types=["rougeL"])
    rougeL_score = rouge_result["rougeL"].mid.fmeasure
    
    # Compute CIDEr
    cider_score, _ = cider.compute_score(decoded_labels, decoded_preds)
    
    # Compute METEOR
    meteor_score, _ = meteor.compute_score(decoded_labels, decoded_preds)
    
    # Compute NIST
    nist_scores = []
    for candidate in decoded_preds:
        nist_score = sentence_nist(decoded_labels, candidate.split(), n=5)  # You can adjust n as needed
        nist_scores.append(nist_score)

    avg_nist_score = sum(nist_scores) / len(nist_scores) if nist_scores else 0  # Average NIST score

    return {
        "bleu": bleu_score,
        "rougeL": rougeL_score,
        "cider": cider_score,
        "meteor": meteor_score,
        "nist": avg_nist_score
    }



# Define custom eval/test loop

In [8]:
def evaluate_with_custom_generation(trainer, beam_size, length_penalty, no_repeat_ngram_size):
    model = trainer.model
    tokenizer = trainer.tokenizer
    model.eval()  # Set the model to evaluation mode
    eval_predictions = []
    
    with torch.no_grad():  # Disable gradient calculations for evaluation
        for example in trainer.eval_dataset:
            input_ids = example['input_ids']
            # Generate predictions with the specified beam size
            outputs = model.generate(input_ids=input_ids, num_beams=beam_size, length_penalty=length_penalty, no_repeat_ngram_size=no_repeat_ngram_size)
            decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
            eval_predictions.append(decoded_output)
    # Compute metrics on the evaluation predictions
    pred = {
        'predictions': tokenizer.batch_encode_plus(eval_predictions, return_tensors='pt', padding=True)['input_ids'],
        'label_ids': tokenizer.batch_encode_plus(trainer.eval_dataset['validation']['target'], return_tensors='pt', padding=True)['input_ids']
    }
    custom_eval_metrics = compute_metrics(pred)
    print("Evaluation Metrics:", custom_eval_metrics)
    return custom_eval_metrics

# Trainer Configs (same args for both models)

In [9]:
base_training_args = TrainingArguments(
    output_dir="./base_results",
    evaluation_strategy="epoch",
    learning_rate=2e-2,
    lr_scheduler_type="linear",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    warmup_steps=500,
    label_smoothing_factor=0.1,
    logging_dir='./base_logs',
    logging_steps=10,
    save_total_limit=2,
    fp16=True,
)

better_training_args = TrainingArguments(
    output_dir="./better_results",
    evaluation_strategy="epoch",
    learning_rate=2e-2,
    lr_scheduler_type="linear",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    warmup_steps=500,
    label_smoothing_factor=0.1,
    logging_dir='./better_logs',
    logging_steps=10,
    save_total_limit=2,
    fp16=True,
)

base_trainer = Trainer(
    model=base_model,
    args=base_training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
    compute_metrics=compute_metrics,
    data_collator=DataCollatorForLanguageModeling(tokenizer=gpt2_tokenizer, mlm=False),
)

better_trainer = Trainer(
    model=better_model,
    args=better_training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
    compute_metrics=compute_metrics,
    data_collator=DataCollatorForLanguageModeling(tokenizer=gpt2_tokenizer, mlm=False),
)



# Fine-Tune and Save Both Models

In [10]:
base_trainer.train()
better_trainer.train()

base_trainer.save_model("./base_model")
base_trainer.save_model("./better_model")

  0%|          | 10/26290 [00:19<12:18:22,  1.69s/it]

{'loss': 39.2352, 'grad_norm': 103.02232360839844, 'learning_rate': 0.0004, 'epoch': 0.0}


  0%|          | 20/26290 [00:36<12:20:33,  1.69s/it]

{'loss': 7.6753, 'grad_norm': 6.699336528778076, 'learning_rate': 0.0008, 'epoch': 0.0}


  0%|          | 30/26290 [00:53<12:27:50,  1.71s/it]

{'loss': 4.4593, 'grad_norm': 4.942017078399658, 'learning_rate': 0.0012, 'epoch': 0.01}


  0%|          | 40/26290 [01:10<12:26:25,  1.71s/it]

{'loss': 3.3318, 'grad_norm': 3.2313144207000732, 'learning_rate': 0.0016, 'epoch': 0.01}


  0%|          | 50/26290 [01:27<12:27:16,  1.71s/it]

{'loss': 2.9052, 'grad_norm': 3.7672278881073, 'learning_rate': 0.002, 'epoch': 0.01}


  0%|          | 60/26290 [01:44<12:38:42,  1.74s/it]

{'loss': 2.8394, 'grad_norm': 4.038674831390381, 'learning_rate': 0.0024, 'epoch': 0.01}


  0%|          | 70/26290 [02:02<12:34:11,  1.73s/it]

{'loss': 2.6866, 'grad_norm': 4.397104740142822, 'learning_rate': 0.0028000000000000004, 'epoch': 0.01}


  0%|          | 80/26290 [02:19<12:36:24,  1.73s/it]

{'loss': 2.7132, 'grad_norm': 4.284475803375244, 'learning_rate': 0.0032, 'epoch': 0.02}


  0%|          | 90/26290 [02:36<12:33:42,  1.73s/it]

{'loss': 2.6996, 'grad_norm': 4.1000590324401855, 'learning_rate': 0.0036, 'epoch': 0.02}


  0%|          | 100/26290 [02:54<12:27:45,  1.71s/it]

{'loss': 2.7198, 'grad_norm': 7.721547603607178, 'learning_rate': 0.004, 'epoch': 0.02}


  0%|          | 110/26290 [03:11<12:32:24,  1.72s/it]

{'loss': 2.6921, 'grad_norm': 7.529531002044678, 'learning_rate': 0.0044, 'epoch': 0.02}


  0%|          | 118/26290 [03:25<12:41:14,  1.75s/it]

KeyboardInterrupt: 

# Evaluate Both Models

In [None]:
beam_size = 10
length_penalty = 0.9
no_repeat_ngram_size = 4

print('='*20)
print("BASE METRICS:")
base_metrics = evaluate_with_custom_generation(base_trainer, beam_size=beam_size, length_penalty=length_penalty, no_repeat_ngram_size=no_repeat_ngram_size)

print('='*20)
print("BETTER METRICS:")
better_metrics = evaluate_with_custom_generation(better_trainer, beam_size=beam_size, length_penalty=length_penalty, no_repeat_ngram_size=no_repeat_ngram_size)
