## Step 0: Mounting Google Drive and Importing Libraries

In [None]:
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/multimodal-xray-agent

!ls

In [None]:
!pip install -U bitsandbytes -q

In [None]:
!pip install flash-attn

In [None]:
import os
import json
import torch
import random
import shutil
import logging
import pandas as pd

from tqdm import tqdm
from math import ceil
from pathlib import Path
from huggingface_hub import login
from torch.utils.data import DataLoader
from transformers.utils import logging as hf_logging
from transformers import Trainer, DataCollatorForLanguageModeling
from datasets import load_dataset, DatasetDict, concatenate_datasets, Dataset, load_from_disk
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, get_peft_model_state_dict
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments, EvalPrediction, default_data_collator

%load_ext tensorboard

In [None]:
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Step 1: Verifying GPU and Environment

In [None]:
if torch.cuda.is_available():
    device_name = torch.cuda.get_device_name(0)
    device = torch.device("cuda")
    print(f"GPU detected: {device_name}")
else:
    device = torch.device("cpu")
    print("GPU not detected. Falling back to CPU.")

print(f"Running on device: {device}")

GPU detected: NVIDIA A100-SXM4-40GB
Running on device: cuda


## Step 2: Load & Preprocess Full Q/A Dataset

In [None]:
# Setting paths

PROJECT_ROOT = Path("/content/drive/MyDrive/multimodal-xray-agent")
QA_DIR = PROJECT_ROOT / "data" / "qapairs"
ADAPTER_SAVE_PATH = PROJECT_ROOT / "models" / "llama_lora_adapter"
OUTPUT_PATH = PROJECT_ROOT / "data" / "qapairs" / "llama_validation_predictions.jsonl"
METRICS_PATH = PROJECT_ROOT / "logs" / "llama_epoch_metrics.csv"
VAL_PATH = PROJECT_ROOT / "data" / "qapairs" / "val.jsonl"

SOURCE_LOG_DIR = Path("./logs")
DEST_LOG_DIR = PROJECT_ROOT / "logs"

DEST_LOG_DIR.mkdir(parents=True, exist_ok=True)
ADAPTER_SAVE_PATH.mkdir(parents=True, exist_ok=True)
OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
METRICS_PATH.parent.mkdir(parents=True, exist_ok=True)

In [None]:
dataset = load_from_disk("file://./data/tokenized_dataset")
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 630
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 70
    })
})


In [None]:
print(dataset["train"][0])

{'input_ids': [128000, 128006, 9125, 128007, 271, 38766, 1303, 33025, 2696, 25, 6790, 220, 2366, 18, 198, 15724, 2696, 25, 220, 975, 12044, 220, 2366, 20, 271, 128009, 128006, 882, 128007, 271, 861, 279, 3682, 73833, 94257, 14955, 304, 420, 2217, 13, 128009, 128006, 78191, 128007, 271, 2822, 30883, 73151, 454, 360, 55892, 1920, 26, 23900, 11, 4325, 18251, 65324, 296, 1123, 269, 94257, 67861, 42743, 64785, 79212, 488, 13, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009,

## Step 3: Model + Tokenizer Setup (QLoRA + FlashAttention)

In [None]:
model_name = "meta-llama/Llama-3.2-3B-Instruct"

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# It is crucial that the tokenizer here has the same pad_token setting
# as the one used in the data preparation notebook.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
print("Pad token:", tokenizer.pad_token)
print("Pad token ID:", tokenizer.pad_token_id)
print("EOS token:", tokenizer.eos_token)
print("EOS token ID:", tokenizer.eos_token_id)

Pad token: <|eot_id|>
Pad token ID: 128009
EOS token: <|eot_id|>
EOS token ID: 128009


In [None]:
# Set quantization config for QLoRA
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",  # NormalFloat4: best for LLMs
    bnb_4bit_compute_dtype=torch.bfloat16,
)

In [None]:
# Load Llama model with FlashAttention if supported
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config,
    device_map="auto",
    attn_implementation="flash_attention_2",
)

In [None]:
base_model = prepare_model_for_kbit_training(base_model)

In [None]:
base_model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 3072)
    (layers): ModuleList(
      (0-27): 28 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=3072, out_features=3072, bias=False)
          (k_proj): Linear4bit(in_features=3072, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=3072, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=3072, out_features=3072, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=3072, out_features=8192, bias=False)
          (up_proj): Linear4bit(in_features=3072, out_features=8192, bias=False)
          (down_proj): Linear4bit(in_features=8192, out_features=3072, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((3072,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((3072,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((3072,)

## Step 4: LoRA Configuration + PEFT Wrapping

In [None]:
# Target modules for GPT2-style transformers (BioGPT)
target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj"]

In [None]:
# LoRA configuration (QLoRA-optimized)
peft_config = LoraConfig(
    r=64,                         # Rank of the LoRA decomposition
    lora_alpha=128,                # Scaling factor
    target_modules=target_modules,
    lora_dropout=0.05,            # Regularization
    bias="none",                  # Do not fine-tune bias terms
    task_type="CAUSAL_LM",        # Language modeling
)

In [None]:
# Inject LoRA adapters into the base model
model = get_peft_model(base_model, peft_config)

In [None]:
model.gradient_checkpointing_enable()

In [None]:
model.config.use_cache = False

In [None]:
model.print_trainable_parameters()

trainable params: 56,885,248 || all params: 3,269,635,072 || trainable%: 1.7398


In [None]:
model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 3072)
        (layers): ModuleList(
          (0-27): 28 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=3072, out_features=3072, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=3072, out_features=64, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=64, out_features=3072, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lor

## Step 5: TrainingArguments configuration

In [None]:
# Define constants for clarity
BATCH_SIZE = 8
GRAD_ACC_STEPS = 4
EPOCHS = 2
LEARNING_RATE = 2e-4

In [None]:
# Calculate total training steps
total_training_samples = len(dataset["train"])
steps_per_epoch = ceil(total_training_samples / (BATCH_SIZE * GRAD_ACC_STEPS))
total_steps = steps_per_epoch * EPOCHS

In [None]:
# Calculate warmup steps
WARMUP_STEPS = int(0.05 * total_steps)

print(f"Total training steps: {total_steps}")
print(f"Warmup steps: {WARMUP_STEPS}")

Total training steps: 40
Warmup steps: 2


In [None]:
training_args = TrainingArguments(
    output_dir="./models/lora_adapter",                   # Save path
    per_device_train_batch_size=BATCH_SIZE,               # Empirically stable for A100 with QLoRA
    per_device_eval_batch_size=4,                         # Same for validation
    gradient_accumulation_steps=GRAD_ACC_STEPS,           # Effective batch size = 12 × 2 = 24
    eval_strategy="epoch",                                # Evaluate once per epoch
    save_strategy="epoch",                                # Save checkpoint once per epoch
    logging_strategy="steps",                             # Log losses periodically
    logging_dir="./logs",                                 # Save logs
    logging_steps=5,                                      # Log every 20 steps
    num_train_epochs=EPOCHS,                              # Number of fine-tuning epochs
    learning_rate=LEARNING_RATE,                          # Higher LR often better for small LoRA adapters
    warmup_steps=WARMUP_STEPS,                            # Small warmup to stabilize first few updates
    lr_scheduler_type="cosine",                           # Smooth decay
    save_total_limit=2,                                   # Retain 2 best checkpoints only
    load_best_model_at_end=True,                          # Restore best checkpoint (lowest val loss)
    report_to="tensorboard",                              # Log to TensorBoard
    run_name="llama-qlora-run",                           # Appears in TensorBoard dashboard
    bf16=True,
    group_by_length=True,                                 # Efficient packing of similar-length samples
    gradient_checkpointing=True,                          # Redundant with model setup, but safe to keep
    eval_accumulation_steps=2,                            # Solves the CUDA OOM error during training
    seed=42
)

## Step 8: Adding Perplexity as an Evaluation Metric

In [None]:
def compute_metrics(eval_pred: EvalPrediction) -> dict:
    with torch.no_grad():
        # Move the large tensors from GPU VRAM to system RAM (CPU)
        # to prevent out-of-memory errors during metric calculation.
        logits = torch.tensor(eval_pred.predictions).cpu()
        labels = torch.tensor(eval_pred.label_ids).cpu()

        # Shift logits and labels for causal language modeling.
        shift_logits = logits[..., :-1, :].contiguous()
        shift_labels = labels[..., 1:].contiguous().long()

        # Calculate loss, ignoring masked tokens.
        loss_fct = torch.nn.CrossEntropyLoss(ignore_index=-100)
        loss = loss_fct(
            shift_logits.view(-1, shift_logits.size(-1)),
            shift_labels.view(-1)
        )

        # Calculate perplexity from the loss.
        perplexity = torch.exp(loss)
        return {
            "eval_loss": loss.item(),
            "eval_perplexity": perplexity.item()
        }

##  Step 6: Fine-Tuning the BioGPT Model with LoRA using SFTTrainer

In [None]:
# This helper assembles individual samples into a single batch tensor.
# It is a required component for the standard Trainer.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [None]:
trainer.train()

The input hidden states seems to be silently casted in float32, this might be related to the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in torch.bfloat16.


Epoch,Training Loss,Validation Loss,Perplexity
1,1.1077,1.102924,3.04945
2,0.9267,1.015971,2.794589


TrainOutput(global_step=40, training_loss=1.4953804016113281, metrics={'train_runtime': 285.9927, 'train_samples_per_second': 4.406, 'train_steps_per_second': 0.14, 'total_flos': 1.11307687723008e+16, 'train_loss': 1.4953804016113281, 'epoch': 2.0})

In [None]:
%reload_ext tensorboard
%tensorboard --logdir ./logs

In [None]:
for file in SOURCE_LOG_DIR.glob("*"):
    shutil.copy(file, DEST_LOG_DIR)

## Step 7: Save LoRA Adapter Weights

In [None]:
trainer.save_model(ADAPTER_SAVE_PATH.as_posix())
print(f"LoRA adapter saved to: {ADAPTER_SAVE_PATH}")

## Step 8: Generate Validation Predictions

In [None]:
# This is crucial. We need a tokenizer instance configured for left-padding.
# The tokenizer used for training can be different from the one for generation.
generation_tokenizer = AutoTokenizer.from_pretrained(model_name)
generation_tokenizer.padding_side = 'left'
generation_tokenizer.pad_token = generation_tokenizer.eos_token

In [None]:
# Load validation samples
with open(VAL_PATH, "r") as f:
    samples = [json.loads(line) for line in f]

In [None]:
# Switch model to eval mode and disable gradients
model.eval()
torch.set_grad_enabled(False)

In [None]:
# Prepare DataLoader
eval_samples = samples[0:10]
results = []
batch_size = 5
eval_loader = DataLoader(eval_samples, batch_size=batch_size)


# Run Batched Inference with CORRECTED LOOP LOGIC
for batch in tqdm(eval_loader, desc="Generating sample outputs"):

    # The batch is a dictionary of lists. Get the number of items.
    num_items_in_batch = len(batch['question'])

    # Format prompts by iterating through the batch using an index
    prompts = []
    for i in range(num_items_in_batch):
        messages = [
            {"role": "system", "content": ""},
            {"role": "user", "content": batch["question"][i]}
        ]
        prompt_text = generation_tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        prompts.append(prompt_text)

    # Tokenize with the left-padded tokenizer
    inputs = generation_tokenizer(
        prompts, return_tensors="pt", padding=True
    ).to(model.device)

    # Generate with a sampling strategy to prevent loops
    output_ids = model.generate(
        **inputs,
        max_new_tokens=256,
        do_sample=True,
        temperature=0.6,
        top_p=0.9,
        top_k=50,
        pad_token_id=generation_tokenizer.pad_token_id,
    )

    # Decode cleanly
    input_ids_len = inputs["input_ids"].shape[1]
    generated_ids = output_ids[:, input_ids_len:]
    generated_answers = generation_tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

    # Store results using the correct index-based iteration
    for i in range(num_items_in_batch):
        results.append({
            "uuid": batch['uuid'][i],
            "question": batch['question'][i],
            "reference_answer": batch['answer'][i],
            "generated_answer": generated_answers[i],
        })

In [None]:
results

[{'uuid': 'iu_1888',
  'question': 'What is the radiologic impression?',
  'reference_answer': 'No acute cardiopulmonary abnormalities',
  'generated_answer': 'Heart size and mediastinal silhouette are within normal limits. There is no acute cardiopulmonary abnormality. The aorta is tortuous, but this is within the normal range for age. The left shoulder and chest wall are normal. There is no evidence of pneumothorax or pleural effusion. No rib fractures. No acute bony abnormality. No focal airspace consolidation or effusion. No pleural thickening or pleural effusion. No pneumothorax. No rib fractures. No acute bony abnormality. No focal airspace consolidation or effusion. No pleural thickening or pleural effusion. No pneumothorax. No rib fractures. No acute bony abnormality. No focal airspace consolidation or effusion. No pleural thickening or pleural effusion. No pneumothorax. No rib fractures. No acute bony abnormality. No focal airspace consolidation or effusion. No pleural thicken

In [None]:
prompt = "State the impression clearly in two sentences."

# Tokenize raw prompt only
inputs = generation_tokenizer(prompt, return_tensors="pt").to(model.device)

# Generate answer
output_ids = model.generate(
    **inputs,
    max_new_tokens=64,
    do_sample=False,
    num_beams=1,
    pad_token_id=tokenizer.eos_token_id,
)

# Decode
decoded = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print(decoded)

The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


State the impression clearly in two sentences. The heart size is normal. There is no evidence of active disease or acute cardiopulmonary abnormality. The chest X-ray is otherwise unremarkable. No acute cardiopulmonary abnormality is identified. The heart size is normal. No active disease or acute cardiopulmonary abnormality is identified. The


In [None]:
# Save predictions
with open(OUTPUT_PATH, "w") as f:
    for example in results:
        f.write(json.dumps(example) + "\n")

print(f"Saved validation predictions to {OUTPUT_PATH}")

Saved validation predictions to /content/drive/MyDrive/multimodal-xray-agent/data/qapairs/llama_validation_predictions.jsonl


## Step 9: Final Metrics + Summary Reporting

In [None]:
# Extract training + eval logs (every log step)
records = trainer.state.log_history

# Convert to DataFrame
df = pd.DataFrame(records)

# Filter only epoch-level logs (those with eval_loss or epoch key)
epoch_logs = df[df["epoch"].notnull()][["epoch", "loss", "eval_loss", "eval_perplexity"]]

# Drop duplicates and keep last record per epoch (in case of multiple entries)
epoch_logs = epoch_logs.groupby("epoch").last().reset_index()

# Save
epoch_logs.to_csv(METRICS_PATH, index=False)

print(f"Epoch-level metrics saved to: {METRICS_PATH.resolve()}")
display(epoch_logs)

Epoch-level metrics saved to: /content/drive/MyDrive/multimodal-xray-agent/logs/llama_epoch_metrics.csv


Unnamed: 0,epoch,loss,eval_loss,eval_perplexity
0,0.253165,3.9356,,
1,0.506329,1.7588,,
2,0.759494,1.2917,,
3,1.0,1.1077,1.102924,3.04945
4,1.253165,1.0111,,
5,1.506329,0.9264,,
6,1.759494,1.005,,
7,2.0,0.9267,1.015971,2.794589


## Step 10: Fix Metadata

In [1]:
!pip install nbformat --q

In [2]:
import nbformat
import os
from google.colab import drive, files

In [None]:
drive.mount('/content/drive', force_remount=True)

In [4]:
# List the notebook directory to confirm the file exists
os.listdir("/content/drive/MyDrive/multimodal-xray-agent/notebooks")

['.gitkeep',
 '00_colab_setup.ipynb',
 '01_bootstrap.ipynb',
 '02_preprocessing.ipynb',
 '04_text_embedding_faiss_indexing.ipynb',
 '03_image_embedding_faiss_indexing.ipynb',
 '05_iu_xray_processing.ipynb',
 '06_generate_qa_pairs.ipynb',
 '08_finetune_biogpt_lora_run2.ipynb',
 '10_tokenization.ipynb',
 '09_llama3_zero_shot_eval.ipynb',
 '07_finetune_biogpt_lora.ipynb',
 'Copy of 10_tokenization.ipynb',
 '10_tokenization_fixed.ipynb',
 '12_llama3_finetuned_eval.ipynb',
 '11_finetune_llama3.2_lora.ipynb']

In [None]:
notebook_path = "/content/drive/MyDrive/multimodal-xray-agent/notebooks/11_finetune_llama3.2_lora.ipynb"

with open(notebook_path, "r") as f:
    nb = nbformat.read(f, as_version=4)

if "widgets" in nb.metadata:
    del nb.metadata["widgets"]

with open(notebook_path, "w") as f:
    nbformat.write(nb, f)

print("Notebook fixed and saved successfully!")