In [3]:
import pandas as pd
import torch
from transformers import (AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, 
                          DataCollatorForLanguageModeling, EarlyStoppingCallback, get_linear_schedule_with_warmup, AdamW, 
                          TrainerCallback)

from datasets import load_dataset, Dataset, DatasetDict
from peft import get_peft_model, LoraConfig, TaskType
from accelerate import Accelerator
import evaluate
from peft import PeftModel

from sklearn.model_selection import train_test_split

In [4]:
PROC_NAME = 'ft_model_5epochs'

In [2]:
INPUT_SIZE = 64
TARGET_SIZE = 128
HYPER_TRAIN_SIZE = 0.2
DATASET = 'e2e_nlg_cleaned'
BASE_MODEL_NAME = 'gpt2-medium'
EARLY_STOPPING_PATIENCE = 3 
EARLY_STOPPING_THRESHOLD = 0.001
NUM_TRAIN_EPOCHS = 5
WARMUP_FRACTION = 0.1

LEARNING_RATE = 5e-5
BATCH_SIZE = 8
WEIGHT_DECAY = 0.01


In [6]:
# Load the E2E NLG dataset from Hugging Face datasets library
dataset = load_dataset(DATASET)

In [7]:
len(dataset['train']), len(dataset['train'])

(33525, 33525)

In [8]:
# Load the GPT-2 model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'left'
model = AutoModelForCausalLM.from_pretrained(BASE_MODEL_NAME)

In [9]:
# # Preprocess the dataset to include the meaning representation (MR) as input and human reference as target
# def preprocess_function(examples):
#     # Concatenate MR and human reference with a separator
#     # inputs = [f"<bos> {mr} <eos>" for mr in examples["meaning_representation"]]
#     inputs = [f"{mr}" for mr in examples["meaning_representation"]]
#     # targets = [f"<bos> {ref} <eos>" for ref in examples["human_reference"]]
#     targets = [f"{ref}" for ref in examples["human_reference"]]
#     # print(inputs)
#     # print(targets)
#     model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding="max_length")
#     labels = tokenizer(targets, max_length=128, truncation=True, padding="max_length")
    
#     # Replace padding token id's of the labels by -100 so that it's ignored by the loss
#     labels["input_ids"] = [
#         [(label if label != tokenizer.pad_token_id else -100) for label in labels_seq] 
#         for labels_seq in labels["input_ids"]
#     ]
#     model_inputs["labels"] = labels["input_ids"]
#     return model_inputs

In [10]:
def preprocess_function(examples):
    # Extract the meaning representations (MR) and human references (target text) from the examples
    inputs = examples["meaning_representation"]
    targets = examples["human_reference"]
    
    # Tokenize the inputs (meaning representations)
    tokenized_inputs = tokenizer(
        inputs, 
        max_length=INPUT_SIZE, 
        truncation=True, 
        padding="max_length", 
        # return_tensors="pt"  # Use numpy for batch processing
    )
    
    # Tokenize the targets (human references)
    tokenized_targets = tokenizer(
        targets, 
        max_length=TARGET_SIZE, 
        truncation=True, 
        padding="max_length", 
        # return_tensors="pt"  # Use numpy for batch processing
    )
    
    # Concatenate input_ids (MR) and input_ids from the targets (human reference) into one sequence
    # This creates the full sequence: [MR, target] (all tokenized)
    concatenated_input_ids = [
        list(input_seq) + list(target_seq) for input_seq, target_seq in zip(tokenized_inputs["input_ids"], tokenized_targets["input_ids"])
    ]
    
    # Concatenate attention masks for both MR and target
    concatenated_attention_mask = [
        list(input_mask) + list(target_mask) for input_mask, target_mask in zip(tokenized_inputs["attention_mask"], tokenized_targets["attention_mask"])
    ]
    
    # Prepare the labels for loss computation:
    # We need to ignore the loss for the part corresponding to MR and only compute it for the target (human reference).
    
    labels = []
    for input_len, target_seq in zip([INPUT_SIZE] * len(inputs), tokenized_targets["input_ids"]):
        # Ignore loss for MR part by setting it to -100
        labels_seq = [-100] * input_len
        
        # For the target sequence, we keep the tokens, but set padding tokens to -100
        labels_seq += [token if token != tokenizer.pad_token_id else -100 for token in target_seq]
        
        labels.append(labels_seq)
    
    # Return the final dictionary containing input_ids, attention_mask, and labels
    return {
        "input_ids": torch.tensor(concatenated_input_ids),
        "attention_mask": torch.tensor(concatenated_attention_mask),
        "labels": torch.tensor(labels)
    }


In [11]:
# # Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples["meaning_representation"], text_target=examples["human_reference"], padding="max_length", 
                     truncation=True, max_length=TARGET_SIZE, add_special_tokens=True)

In [12]:
# Apply tokenization to the dataset
# tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset = dataset.map(preprocess_function, batched=True, 
                                # remove_columns=["meaning_representation", "human_reference"]
                               )

Map: 100%|██████████| 33525/33525 [00:07<00:00, 4260.26 examples/s]
Map: 100%|██████████| 4299/4299 [00:01<00:00, 3641.89 examples/s]
Map: 100%|██████████| 4693/4693 [00:01<00:00, 4152.32 examples/s]


In [13]:
# tokenized_dataset['train'], tokenized_dataset['hypervalidation'] = tokenized_dataset['train'].train_test_split(
#     test_size=0.2, seed=42).values()

In [14]:
# Convert the tokenized dataset to a pandas DataFrame for easier manipulation
df = pd.DataFrame(tokenized_dataset['train'])

# Step 1: Identify unique MRs and group by MR
grouped_by_mr = df.groupby('meaning_representation')

# Step 2: Extract all unique MRs
unique_mrs = df['meaning_representation'].unique()

# Step 3: Perform train-test split on the unique MRs
train_mrs, hypervalidation_mrs = train_test_split(unique_mrs, test_size=HYPER_TRAIN_SIZE, random_state=42)

# Step 4: Create new DataFrames for train and hypervalidation based on the split MRs
train_df = df[df['meaning_representation'].isin(train_mrs)]
hypervalidation_df = df[df['meaning_representation'].isin(hypervalidation_mrs)]

# Step 5: Convert back to the Dataset format for Hugging Face
train_dataset = DatasetDict({"train": Dataset.from_pandas(train_df)})
hypervalidation_dataset = DatasetDict({"hypervalidation": Dataset.from_pandas(hypervalidation_df)})

tokenized_dataset = {}

# Update tokenized_dataset with the new split
tokenized_dataset['train'] = train_dataset['train'].remove_columns(["meaning_representation", "human_reference", "__index_level_0__"])

tokenized_dataset['hypervalidation'] = hypervalidation_dataset['hypervalidation'].remove_columns(["meaning_representation", "human_reference", "__index_level_0__"])

In [15]:
# Prepare data collator for language modeling
# data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [16]:
# # PEFT LoRA Configuration
# lora_config = LoraConfig(
#     # task_type=TaskType.CAUSAL_LM,  # Type of task
#     r=8,                           # Low-rank dimension
#     lora_alpha=32,                 # Scaling factor
#     lora_dropout=0.1,              # Dropout
#     target_modules=["c_attn", "q_attn", "v_attn"],  # GPT-2 target modules for LoRA
#     # target_modules=["c_attn"],  # GPT-2 target modules for LoRA
# )

In [17]:
# Apply PEFT LoRA to the GPT-2 model
# model = get_peft_model(model, lora_config)

In [18]:
# for name, param in model.named_parameters():
#     if(param.requires_grad):
#         print(name, param.requires_grad)

In [19]:
class PrintPredictionsCallback(TrainerCallback):
    def on_evaluate(self, args, state, control, model=None, tokenizer=None, eval_dataloader=None, **kwargs):
        # Generate a few predictions
        model.eval()
        for batch in eval_dataloader:
            inputs = batch["input_ids"].to(model.device)
            attention_mask = batch["attention_mask"].to(model.device)

            # Generate predictions
            with torch.no_grad():
                generated_ids = model.generate(inputs, attention_mask=attention_mask, max_length=256)

            # Filter out invalid token IDs and padding (-100) from inputs and predictions
            def safe_decode(token_ids):
                # Filter out invalid token IDs (e.g., -100) before decoding
                # print(token_ids)
                valid_token_ids = [token_id for token_id in token_ids if 0 <= token_id < tokenizer.vocab_size]
                return tokenizer.decode(valid_token_ids, skip_special_tokens=True)

            # Decode the input, predictions, and true references
            inputs_decoded = [safe_decode(input_ids) for input_ids in inputs]
            preds_decoded = [safe_decode(generated_id) for generated_id in generated_ids]
            refs_decoded = [safe_decode(ref) for ref in batch["labels"]]

            # Print out the input, prediction, and true reference
            for i in range(min(3, len(inputs_decoded))):  # Print up to 3 samples per evaluation
                print(f"\nInput (MR): {inputs_decoded[i]}")
                print(f"Prediction: {preds_decoded[i]}")
                print(f"Reference: {refs_decoded[i]}")
            
            break  # Remove this to print for every batch during evaluation

In [21]:
# Define training arguments
training_args = TrainingArguments(
    output_dir=f'./results/{PROC_NAME}',
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE,  # This batch size is per GPU
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=NUM_TRAIN_EPOCHS,
    weight_decay=WEIGHT_DECAY,
    logging_dir=f'./logs/{PROC_NAME}',
    logging_steps=10,
    push_to_hub=False,
    # fp16=True,  # Enable mixed-precision training for faster training
    report_to=["tensorboard"],
    load_best_model_at_end=True,  # Required for early stopping
    metric_for_best_model="eval_loss",  # Metric to determine the best model (optional)
    greater_is_better=False,  # Set to False if lower metric is better (e.g., loss)
    save_total_limit=1,
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [22]:
# Initialize Accelerator
# accelerator = Accelerator()
# device = accelerator.device

In [23]:
# Accelerator for multi-GPU support
train_dataset = tokenized_dataset['train']
eval_dataset = tokenized_dataset['hypervalidation']

# peft_model, train_dataset, eval_dataset = accelerator.prepare(peft_model, train_dataset, eval_dataset)

In [24]:
optimizer = AdamW(model.parameters(), lr=training_args.learning_rate)

total_steps = len(train_dataset) * training_args.num_train_epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(WARMUP_FRACTION * total_steps),  # Warm-up for the first 10% of steps
    num_training_steps=total_steps
)



In [26]:
# Initialize the Trainer
trainer = Trainer(
    model=model,                       # The model with PEFT applied
    args=training_args,                     # Training arguments
    train_dataset=train_dataset, # Training data
    eval_dataset=eval_dataset, # Validation data
    # data_collator=data_collator,
    tokenizer = tokenizer,
    optimizers=(optimizer, scheduler),  # Pass optimizer and scheduler
    callbacks=[EarlyStoppingCallback(early_stopping_patience=EARLY_STOPPING_PATIENCE, 
                                     early_stopping_threshold=EARLY_STOPPING_THRESHOLD), 
               PrintPredictionsCallback()]  # Add early stopping
)

In [27]:
# Fine-tune the model
trainer.train()



Epoch,Training Loss,Validation Loss
1,3.2319,2.522599
2,1.5387,1.466495
3,1.351,1.313167
4,1.2486,1.235538
5,1.2358,1.189717


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.



Input (MR): name[The Eagle], eatType[coffee shop], food[Japanese], priceRange[less than £20], customer rating[low], area[riverside], familyFriendly[yes], near[Burger King]The Eagle is a low rated coffee shop near Burger King and the riverside that is family friendly and is less than £20 for Japanese food.
Prediction: name[The Eagle], eatType[coffee shop], food[Japanese], priceRange[less than £20], customer rating[low], area[riverside], familyFriendly[yes], near[Burger King]The Eagle is a low rated coffee shop near Burger King and the riverside that is family friendly and is less than £20 for Japanese food. It is a coffee shop with a family friendly atmosphere and is near the riverside. It is a family friendly coffee shop with a family friendly atmosphere and is near the riverside. It is a family friendly coffee shop with a family friendly atmosphere and is near the riverside. It is a family friendly coffee shop with a
Reference: The Eagle is a low rated coffee shop near Burger King an

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.



Input (MR): name[The Eagle], eatType[coffee shop], food[Japanese], priceRange[less than £20], customer rating[low], area[riverside], familyFriendly[yes], near[Burger King]The Eagle is a low rated coffee shop near Burger King and the riverside that is family friendly and is less than £20 for Japanese food.
Prediction: name[The Eagle], eatType[coffee shop], food[Japanese], priceRange[less than £20], customer rating[low], area[riverside], familyFriendly[yes], near[Burger King]The Eagle is a low rated coffee shop near Burger King and the riverside that is family friendly and is less than £20 for Japanese food. It is located in the riverside area. It is not family friendly. It is not family friendly. It is not family friendly. It is not family friendly. It is not family friendly. It is not family friendly. It is not family friendly. It is not family friendly. It is not family friendly. It
Reference: The Eagle is a low rated coffee shop near Burger King and the riverside that is family frie

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.



Input (MR): name[The Eagle], eatType[coffee shop], food[Japanese], priceRange[less than £20], customer rating[low], area[riverside], familyFriendly[yes], near[Burger King]The Eagle is a low rated coffee shop near Burger King and the riverside that is family friendly and is less than £20 for Japanese food.
Prediction: name[The Eagle], eatType[coffee shop], food[Japanese], priceRange[less than £20], customer rating[low], area[riverside], familyFriendly[yes], near[Burger King]The Eagle is a low rated coffee shop near Burger King and the riverside that is family friendly and is less than £20 for Japanese food. It is located in the riverside area. It is located in the Riverside area. It is a coffee shop with a low customer rating. It is located in the riverside area. It is not family friendly. It is not family friendly. It is located in the Riverside area. It is not family friendly. It
Reference: The Eagle is a low rated coffee shop near Burger King and the riverside that is family friendl

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.



Input (MR): name[The Eagle], eatType[coffee shop], food[Japanese], priceRange[less than £20], customer rating[low], area[riverside], familyFriendly[yes], near[Burger King]The Eagle is a low rated coffee shop near Burger King and the riverside that is family friendly and is less than £20 for Japanese food.
Prediction: name[The Eagle], eatType[coffee shop], food[Japanese], priceRange[less than £20], customer rating[low], area[riverside], familyFriendly[yes], near[Burger King]The Eagle is a low rated coffee shop near Burger King and the riverside that is family friendly and is less than £20 for Japanese food. It is located in the Riverside area. It is located near Burger King. It is located in the riverside area. It is a coffee shop that serves Japanese food. It is not family friendly. It has a low customer rating. It is located near Burger King. It is located near Burger King. It is near
Reference: The Eagle is a low rated coffee shop near Burger King and the riverside that is family fr

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.



Input (MR): name[The Eagle], eatType[coffee shop], food[Japanese], priceRange[less than £20], customer rating[low], area[riverside], familyFriendly[yes], near[Burger King]The Eagle is a low rated coffee shop near Burger King and the riverside that is family friendly and is less than £20 for Japanese food.
Prediction: name[The Eagle], eatType[coffee shop], food[Japanese], priceRange[less than £20], customer rating[low], area[riverside], familyFriendly[yes], near[Burger King]The Eagle is a low rated coffee shop near Burger King and the riverside that is family friendly and is less than £20 for Japanese food. It is located in the Riverside area. It is a coffee shop that serves Japanese food. It is located near Burger King. It has a customer rating of 1 out of 5. It is located near Burger King. It is near Burger King. It is near Burger King. It is near Burger King. It has a
Reference: The Eagle is a low rated coffee shop near Burger King and the riverside that is family friendly and is le

There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


TrainOutput(global_step=2095, training_loss=2.5584196131667545, metrics={'train_runtime': 1153.6363, 'train_samples_per_second': 115.964, 'train_steps_per_second': 1.816, 'total_flos': 4.659059209273344e+16, 'train_loss': 2.5584196131667545, 'epoch': 5.0})

In [28]:
# accelerator.wait_for_everyone()  # Synchronize GPUs

In [29]:
# Save the model
# peft_model.save_pretrained(f'./models/{PROC_NAME}')
model.save_pretrained(f'./models/{PROC_NAME}')
tokenizer.save_pretrained(f'./models/{PROC_NAME}')

('./models/ft_model_5epochs/tokenizer_config.json',
 './models/ft_model_5epochs/special_tokens_map.json',
 './models/ft_model_5epochs/vocab.json',
 './models/ft_model_5epochs/merges.txt',
 './models/ft_model_5epochs/added_tokens.json',
 './models/ft_model_5epochs/tokenizer.json')

In [30]:
a = 5

In [31]:
a

5

In [36]:
a

5

In [29]:
a

5