## Step 0: Mounting Google Drive and Importing Libraries

In [None]:
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/multimodal-xray-agent
!ls

In [None]:
!pip install -U trl bitsandbytes -q

In [None]:
!pip install sacremoses -q

In [39]:
import os
import json
import torch
import random
import shutil
import pandas as pd

from tqdm import tqdm
from pathlib import Path
from trl import SFTTrainer
from torch.utils.data import DataLoader
from datasets import load_dataset, DatasetDict, concatenate_datasets, Dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, get_peft_model_state_dict
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments, EvalPrediction, default_data_collator

%load_ext tensorboard

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


## Step 1: Verifying GPU and Environment

In [5]:
if torch.cuda.is_available():
    device_name = torch.cuda.get_device_name(0)
    device = torch.device("cuda")
    print(f"GPU detected: {device_name}")
else:
    device = torch.device("cpu")
    print("GPU not detected. Falling back to CPU.")

print(f"Running on device: {device}")

GPU detected: NVIDIA A100-SXM4-40GB
Running on device: cuda


## Step 2: Load & Preprocess Full Q/A Dataset

In [41]:
# Setting paths

PROJECT_ROOT = Path("/content/drive/MyDrive/multimodal-xray-agent")
QA_DIR = PROJECT_ROOT / "data" / "qapairs"

TRAIN_PATH = QA_DIR / "train.jsonl"
VAL_PATH = QA_DIR / "val.jsonl"

ADAPTER_SAVE_PATH = PROJECT_ROOT / "models" / "biogpt_lora_adapter" / "run 2"
OUTPUT_PATH = PROJECT_ROOT / "data" / "qapairs" / "validation_predictions_run2.jsonl"
METRICS_PATH = PROJECT_ROOT / "logs" / "epoch_metrics_run2.csv"

SOURCE_LOG_DIR = Path("./logs")
DEST_LOG_DIR = PROJECT_ROOT / "logs" / "biogpt_qlora_run2"

DEST_LOG_DIR.mkdir(parents=True, exist_ok=True)
ADAPTER_SAVE_PATH.mkdir(parents=True, exist_ok=True)
OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
METRICS_PATH.parent.mkdir(parents=True, exist_ok=True)

In [None]:
# Load train and val datasets

train_raw = load_dataset("json", data_files=TRAIN_PATH.as_posix(), split="train")
val_raw = load_dataset("json", data_files=VAL_PATH.as_posix(), split="train")

In [8]:
len(train_raw), len(val_raw)

(1248, 312)

In [None]:
tokenizer = AutoTokenizer.from_pretrained("microsoft/BioGPT-Large")

In [10]:
tokenizer.pad_token = tokenizer.eos_token

In [11]:
tokenizer.padding_side = "left"

In [16]:
random.seed(42)

In [17]:
# Define 15 question stems for random injection
QUESTION_STEMS = [
    "What are the radiographic findings?",
    "Summarize the radiology impression.",
    "What does the chest X-ray show?",
    "Describe the key findings in this study.",
    "Are there any abnormalities present?",
    "What is the diagnostic impression?",
    "State the relevant imaging findings.",
    "Provide a concise summary of the image.",
    "What is seen on this radiograph?",
    "Report the pertinent imaging findings.",
    "Summarize the chest radiograph interpretation.",
    "Are there any acute cardiopulmonary abnormalities?",
    "Give a brief interpretation of the image.",
    "List any significant or incidental findings.",
    "Write the clinical impression based on this image.",
]

In [18]:
# Preprocessing function with label masking and dynamic question stem
def preprocess(example):
    question_stem = random.choice(QUESTION_STEMS)
    prompt = f"### Question:\n{question_stem}\n\n### Answer:\n"
    full_input = prompt + example["answer"]

    tokenized = tokenizer(full_input, truncation=True, padding="max_length", max_length=256)
    labels = [-100] * len(tokenizer(prompt)["input_ids"]) + tokenized["input_ids"][len(tokenizer(prompt)["input_ids"]):]
    labels = labels[:256] + [-100] * max(0, 256 - len(labels))  # pad or truncate to match input
    tokenized["labels"] = labels

    return tokenized

In [None]:
# Tokenize both datasets
train_dataset = train_raw.map(
    preprocess,
    batched=False,
    remove_columns=train_raw.column_names,
    load_from_cache_file=False,
    keep_in_memory=True,
)

eval_dataset = val_raw.map(
    preprocess,
    batched=False,
    remove_columns=val_raw.column_names,
    load_from_cache_file=False,
    keep_in_memory=True,
)

In [20]:
print(f"Train dataset size: {len(train_dataset)}")
print(f"Validation dataset size: {len(eval_dataset)}")

Train dataset size: 1248
Validation dataset size: 312


In [21]:
print(train_dataset[0])

{'input_ids': [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2045, 2045, 2045, 4950, 32691, 52, 15474, 5078, 6002, 8, 3352, 19626, 3210, 4, 2045, 2045, 2045, 2454, 5895, 953, 52, 13156, 885, 11, 7459, 452, 463, 16754, 3869, 126, 449, 7719, 113], 'attention_mask': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

## Step 3: Model + Tokenizer Setup (QLoRA + FlashAttention)

In [22]:
# Set quantization config for QLoRA
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",  # NormalFloat4: best for LLMs
    bnb_4bit_compute_dtype=torch.float16,
)

In [None]:
# Load BioGPT model with FlashAttention if supported
base_model = AutoModelForCausalLM.from_pretrained(
    "microsoft/BioGPT-Large",
    quantization_config=bnb_config,
    device_map="auto"
)

In [24]:
base_model = prepare_model_for_kbit_training(base_model)

## Step 4: LoRA Configuration + PEFT Wrapping

In [25]:
# Target modules for GPT2-style transformers (BioGPT)
target_modules = ["q_proj", "v_proj"]

In [26]:
# LoRA configuration (QLoRA-optimized)
peft_config = LoraConfig(
    r=64,                          # Rank of the LoRA decomposition
    lora_alpha=16,                # Scaling factor
    target_modules=target_modules,
    lora_dropout=0.05,            # Regularization
    bias="none",                  # Do not fine-tune bias terms
    task_type="CAUSAL_LM",        # Language modeling
)

In [27]:
# Inject LoRA adapters into the base model
model = get_peft_model(base_model, peft_config)

In [28]:
model.gradient_checkpointing_enable()

In [29]:
model.config.use_cache = False

In [30]:
model.resize_token_embeddings(len(tokenizer))

BioGptScaledWordEmbedding(57717, 1600, padding_idx=1)

In [31]:
model.print_trainable_parameters()

trainable params: 19,660,800 || all params: 1,590,849,600 || trainable%: 1.2359


In [32]:
model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): BioGptForCausalLM(
      (biogpt): BioGptModel(
        (embed_tokens): BioGptScaledWordEmbedding(57717, 1600, padding_idx=1)
        (embed_positions): BioGptLearnedPositionalEmbedding(2050, 1600)
        (layers): ModuleList(
          (0-47): 48 x BioGptDecoderLayer(
            (self_attn): BioGptSdpaAttention(
              (k_proj): Linear4bit(in_features=1600, out_features=1600, bias=True)
              (v_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=1600, out_features=1600, bias=True)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=1600, out_features=64, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=64, out_features=1600, bias=False)
                )
        

## Step 5: TrainingArguments configuration

In [33]:
training_args = TrainingArguments(
    output_dir="./models/lora_adapter",       # Save path
    per_device_train_batch_size=8,            # Empirically stable for A100 with QLoRA
    per_device_eval_batch_size=4,             # Same for validation
    gradient_accumulation_steps=4,            # Effective batch size = 12 × 2 = 24
    eval_strategy="epoch",                    # Evaluate once per epoch
    save_strategy="epoch",                    # Save checkpoint once per epoch
    logging_strategy="steps",                 # Log losses periodically
    logging_dir="./logs",                     # Save logs
    logging_steps=20,                         # Log every 20 steps
    num_train_epochs=5,                       # Number of fine-tuning epochs
    learning_rate=3e-4,                       # Higher LR often better for small LoRA adapters
    warmup_steps=100,                         # Small warmup to stabilize first few updates
    lr_scheduler_type="cosine",               # Smooth decay
    save_total_limit=2,                       # Retain 2 best checkpoints only
    load_best_model_at_end=True,              # Restore best checkpoint (lowest val loss)
    report_to="tensorboard",                  # Log to TensorBoard
    run_name="biogpt-qlora-run",              # Appears in TensorBoard dashboard
    fp16=True,                                # Use mixed precision (saves memory, faster)
    group_by_length=False,                    # Efficient packing of similar-length samples
    gradient_checkpointing=True,              # Redundant with model setup, but safe to keep
    eval_accumulation_steps=1,
    remove_unused_columns=False               # Required for TRL's SFTTrainer
)

## Step 8: Adding Perplexity as an Evaluation Metric

In [34]:
def compute_metrics(eval_pred: EvalPrediction) -> dict:
    with torch.no_grad():
        logits = torch.tensor(eval_pred.predictions).cpu()
        labels = torch.tensor(eval_pred.label_ids).cpu()

        shift_logits = logits[..., :-1, :].contiguous()
        shift_labels = labels[..., 1:].contiguous().long()

        loss_fct = torch.nn.CrossEntropyLoss(ignore_index=-100)
        loss = loss_fct(
            shift_logits.view(-1, shift_logits.size(-1)),
            shift_labels.view(-1)
        )

        perplexity = torch.exp(loss)
        return {
            "eval_loss": loss.item(),
            "eval_perplexity": perplexity.item()
        }

##  Step 6: Fine-Tuning the BioGPT Model with LoRA using SFTTrainer

In [None]:
# Define the trainer
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    peft_config=peft_config,
    compute_metrics=compute_metrics
)

In [36]:
trainer.train()

Epoch,Training Loss,Validation Loss,Perplexity
1,6.5294,1.147179,3.14929
2,0.9415,0.46703,1.595241
3,0.4841,0.333089,1.395269
4,0.3716,0.286769,1.332114
5,0.3344,0.277321,1.319588


TrainOutput(global_step=195, training_loss=1.3107014851692396, metrics={'train_runtime': 1816.4662, 'train_samples_per_second': 3.435, 'train_steps_per_second': 0.107, 'total_flos': 1.4331168423936e+16, 'train_loss': 1.3107014851692396})

In [None]:
%reload_ext tensorboard
%tensorboard --logdir ./logs

In [43]:
# Ensure SOURCE_LOG_DIR and DEST_LOG_DIR are defined if they weren't in the original cell
# If they were defined in previous cells, you don't need these lines.
# SOURCE_LOG_DIR = Path("./logs")
# DEST_LOG_DIR = Path("/content/drive/MyDrive/multimodal-xray-agent/logs/biogpt_qlora_run2") # Example path, use your actual path

# Find the actual run directory created by the trainer within SOURCE_LOG_DIR
# The trainer creates a subdirectory, usually named after the run_name or with a timestamp.
# We can look for the most recently modified directory, or look for a directory matching the run_name.
# Let's assume the trainer created a directory like './logs/biogpt-qlora-run-timestamp'
# A safer way is to iterate and find a directory.

# Find the latest directory created in SOURCE_LOG_DIR.
# This assumes the latest directory is the one containing the current run's logs.
log_dirs_in_source = [d for d in SOURCE_LOG_DIR.iterdir() if d.is_dir()]

if not log_dirs_in_source:
    print(f"No log directories found in {SOURCE_LOG_DIR}. TensorBoard logs may not have been generated.")
else:
    # Sort by modification time to get the latest one
    latest_log_dir = max(log_dirs_in_source, key=os.path.getmtime)

    print(f"Identified log directory to copy: {latest_log_dir}")
    print(f"Destination directory: {DEST_LOG_DIR}")

    # Use copytree to copy the entire directory
    # If the destination directory already exists, copytree will raise an error.
    # We can handle this by removing the destination directory first if it exists,
    # or by copying into a subdirectory within DEST_LOG_DIR.
    # Let's copy into a subdirectory named after the source log directory's name.

    destination_run_dir = DEST_LOG_DIR / latest_log_dir.name

    # Remove destination if it already exists to avoid error
    if destination_run_dir.exists():
        print(f"Destination log directory already exists: {destination_run_dir}. Removing it.")
        shutil.rmtree(destination_run_dir)

    print(f"Copying log directory from {latest_log_dir} to {destination_run_dir}...")
    shutil.copytree(latest_log_dir, destination_run_dir)
    print("Log directory copied successfully.")

# Now, when you want to view TensorBoard in another notebook or session,
# you will point TensorBoard to the directory within your Google Drive.
# For example, if your DEST_LOG_DIR was
# /content/drive/MyDrive/multimodal-xray-agent/logs/biogpt_qlora_run2
# and the trainer created a directory like logs/runs/train/events.out...
# then the copied path might look like
# /content/drive/MyDrive/multimodal-xray-agent/logs/biogpt_qlora_run2/runs/train
# You would then run in the other notebook:
# %load_ext tensorboard
# %tensorboard --logdir /content/drive/MyDrive/multimodal-xray-agent/logs/biogpt_qlora_run2/runs/train
# You might need to inspect the copied directory structure to find the exact path containing the event files.

Identified log directory to copy: logs/biogpt_qlora_run2
Destination directory: /content/drive/MyDrive/multimodal-xray-agent/logs/biogpt_qlora_run2
Copying log directory from logs/biogpt_qlora_run2 to /content/drive/MyDrive/multimodal-xray-agent/logs/biogpt_qlora_run2/biogpt_qlora_run2...
Log directory copied successfully.


## Step 7: Save LoRA Adapter Weights

In [44]:
trainer.save_model(ADAPTER_SAVE_PATH.as_posix())
print(f"LoRA adapter saved to: {ADAPTER_SAVE_PATH}")

LoRA adapter saved to: /content/drive/MyDrive/multimodal-xray-agent/models/biogpt_lora_adapter/run 2


## Step 8: Generate Validation Predictions

In [50]:
# Load validation samples
with open(VAL_PATH, "r") as f:
    samples = [json.loads(line) for line in f]

In [None]:
model.to(device)

In [53]:
# Switch model to eval mode and disable gradients
model.eval()
torch.set_grad_enabled(False)

<torch.autograd.grad_mode.set_grad_enabled at 0x7e1ee05daa90>

In [54]:
# Prep batched dataloader
eval_loader = DataLoader(
    samples, batch_size=10, shuffle=False, collate_fn=lambda batch: batch
)

results = []

for batch in tqdm(eval_loader, desc="Batched Generation"):
    prompts = [
        f"""### Context:
{item['answer']}

### User Question:
{item['question']}

### Instruction:
Rewrite the above impression to directly and professionally answer the user’s question as a radiology report summary.

### Answer:"""
        for item in batch
    ]

    references = [item["answer"] for item in batch]
    uuids = [item["uuid"] for item in batch]

    # Tokenize batched prompts
    inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True).to(model.device)

    # Batched generation
    output_ids = model.generate(
        **inputs,
        max_new_tokens=256,                # Increased to allow for complete answers
        do_sample=False,
        num_beams=1,
        pad_token_id=tokenizer.eos_token_id,
    )

    # Decode each and clean
    outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    cleaned_outputs = [
        out[len(prompt):].strip() if out.startswith(prompt) else out
        for prompt, out in zip(prompts, outputs)
    ]

    for uuid, q, r, g in zip(uuids, prompts, references, cleaned_outputs):
        results.append({
            "uuid": uuid,
            "question": q,
            "reference_answer": r,
            "generated_answer": g,
        })

Batched Generation: 100%|██████████| 32/32 [20:25<00:00, 38.31s/it]


In [55]:
results[:20]

[{'uuid': 'iu_1888',
  'question': '### Context:\nNo acute cardiopulmonary abnormalities\n\n### User Question:\nWhat is the radiologic impression?\n\n### Instruction:\nRewrite the above impression to directly and professionally answer the user’s question as a radiology report summary.\n\n### Answer:',
  'reference_answer': 'No acute cardiopulmonary abnormalities',
  'generated_answer': '# # # Context: No acute cardiopulmonary abnormalities # # # User Question: What is the radiologic impression? # # # Instruction: Rewrite the above impression to directly and professionally answer the user ’ s question as a radiology report summary. # # # Answer: "The chest radiograph shows a large amount of pleural fluid in the right hemithorax, with a small amount of pleural fluid in the left hemithorax." # # Examine the chest radiograph for evidence of pneumothorax. # # # Provide the patient with a chest radiograph and a chest radiograph with a lateral view. # # Examine the chest radiograph for eviden

In [88]:
prompt = (
    "Context: Hyperexpanded lungs, suggesting chronic obstructive pulmonary disease. "
    "No acute pulmonary process.\n\n"
    "Task: Rewrite the context into a single-sentence radiology impression. "
    "Only output the impression. Do not include explanations, conclusions, or clinical advice.\n\n"
    "Impression:"
)

# Tokenize raw prompt only
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

# Generate answer
output_ids = model.generate(
    **tokenizer(prompt, return_tensors="pt").to(model.device),
    max_new_tokens=64,
    do_sample=False,
    early_stopping=True,
    repetition_penalty=1.2,
    pad_token_id=tokenizer.eos_token_id,
)

# Decode
decoded = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print(decoded.split("Impression:")[-1].strip())  # strip extra prompt echo if present

The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Chronic obstructive pulmonary disease with emphysema and bronchiectasis. Conclusion: The radiologist's report is an important communication tool that can be improved by using structured reporting to provide more information about the patient's condition in a concise manner. < / FREETEXT > < / ABSTRACT > ▃


In [86]:
# Save predictions
with open(OUTPUT_PATH, "w") as f:
    for example in results:
        f.write(json.dumps(example) + "\n")

print(f"Saved validation predictions to {OUTPUT_PATH}")

Saved validation predictions to /content/drive/MyDrive/multimodal-xray-agent/data/qapairs/validation_predictions_run2.jsonl


## Step 9: Final Metrics + Summary Reporting

In [87]:
# Extract training + eval logs (every log step)
records = trainer.state.log_history

# Convert to DataFrame
df = pd.DataFrame(records)

# Filter only epoch-level logs (those with eval_loss or epoch key)
epoch_logs = df[df["epoch"].notnull()][["epoch", "loss", "eval_loss", "eval_perplexity"]]

# Drop duplicates and keep last record per epoch (in case of multiple entries)
epoch_logs = epoch_logs.groupby("epoch").last().reset_index()

# Save
epoch_logs.to_csv(METRICS_PATH, index=False)

print(f"Epoch-level metrics saved to: {METRICS_PATH.resolve()}")
display(epoch_logs)

Epoch-level metrics saved to: /content/drive/MyDrive/multimodal-xray-agent/logs/epoch_metrics_run2.csv


Unnamed: 0,epoch,loss,eval_loss,eval_perplexity
0,0.512821,6.5294,,
1,1.0,,1.147179,3.14929
2,1.025641,2.4901,,
3,1.538462,0.9415,,
4,2.0,,0.46703,1.595241
5,2.051282,0.6387,,
6,2.564103,0.4841,,
7,3.0,,0.333089,1.395269
8,3.076923,0.3909,,
9,3.589744,0.3716,,
