## Step 1: Mounting Google Drive and Importing Libraries

In [None]:
# Mount Google Drive into the Colab runtime, then make the project folder the
# working directory so the relative paths used later (e.g. ./data/...) resolve
# inside the project.
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/multimodal-xray-agent
# List the project folder as a quick check that the mount and cd worked.
!ls

In [None]:
import torch
import json
from huggingface_hub import login
from datasets import load_dataset, DatasetDict, load_from_disk, Dataset
from transformers import AutoTokenizer

In [None]:
# Authenticate with the Hugging Face Hub; this renders an interactive login widget.
# NOTE(review): presumably needed because the Llama checkpoint is access-gated — confirm.
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Step 2: Loading the Tokenizer

In [None]:
# Load the tokenizer published with the Llama 3.2 3B Instruct checkpoint.
tok = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B-Instruct")

In [None]:
# Reuse the EOS token as the padding token. From here on pad and EOS share one
# id, so padding cannot be told apart from a real EOS by token id alone.
tok.pad_token = tok.eos_token

In [None]:
# Truncate from the LEFT to preserve the end of the answer and the EOS token.
# Consequence: for over-length examples it is the start of the sequence
# (i.e. the prompt) that gets dropped.
tok.truncation_side = "left"

## Step 3: Loading and Formatting Dataset for Supervised Fine-Tuning

In [None]:
# Copy file from GDrive to Colab local runtime
# (the loading cell reads the copy at /content/top_700_qa_pairs.jsonl).
!cp /content/drive/MyDrive/multimodal-xray-agent/data/qapairs/top_700_qa_pairs.jsonl /content/top_700_qa_pairs.jsonl

In [None]:
# Load the QA pairs from the JSON Lines file: one JSON object per line.
# The encoding is pinned to UTF-8 so decoding does not depend on the runtime's
# locale, and blank lines (e.g. a trailing newline at the end of the file) are
# skipped instead of raising json.JSONDecodeError.
with open("/content/top_700_qa_pairs.jsonl", "r", encoding="utf-8") as f:
    data = [json.loads(line) for line in f if line.strip()]

In [None]:
# Convert to Hugging Face Dataset
# (each parsed JSON object in `data` becomes one row).
dataset = Dataset.from_list(data)

## Step 4: Tokenizing the Dataset for Causal Language Modeling

In [None]:
MAX_LEN = 512 # fits captions+pubmed later; adjust if needed

def format_and_tokenize(ex):
    """
    Format one QA example with the tokenizer's chat template and tokenize it
    for causal-LM supervised fine-tuning.

    Args:
        ex: Mapping with string fields "question" and "answer".

    Returns:
        The tokenizer output for the full conversation, padded/truncated to
        MAX_LEN, with an added "labels" list of the same length. Labels copy
        input_ids on the assistant answer (including its end-of-turn token)
        and are -100 on prompt and padding positions.
    """
    # An empty system message is passed explicitly.
    # NOTE(review): the decoded sanity-check output still shows the template's
    # "Cutting Knowledge Date / Today Date" header, and "Today Date" appears to
    # be the run date, so re-tokenizing on another day may change the tokens —
    # confirm this is acceptable.
    messages = [
        {"role": "system", "content": ""},
        {"role": "user", "content": ex["question"]},
        {"role": "assistant", "content": ex["answer"]},
    ]

    # Length of the prompt (system + user + assistant header) in tokens.
    # add_generation_prompt=True appends the assistant header so that the
    # prompt ends exactly where the answer begins.
    prompt_text = tok.apply_chat_template(
        messages[:-1],
        tokenize=False,
        add_generation_prompt=True,
    )
    p_len = len(tok.encode(prompt_text, add_special_tokens=False))

    # Full conversation text, and its length before any truncation/padding.
    full_text = tok.apply_chat_template(messages, tokenize=False)
    full_len = len(tok.encode(full_text, add_special_tokens=False))

    encoded = tok(
        full_text,
        padding="max_length",
        truncation=True,
        max_length=MAX_LEN,
        add_special_tokens=False,
        return_attention_mask=True,
    )
    input_ids = list(encoded["input_ids"])
    attention_mask = list(encoded["attention_mask"])

    # Work out how many prompt tokens survived truncation. With left
    # truncation the dropped tokens come off the front of the prompt, so the
    # masked prefix must shrink by the same amount; otherwise part (or all) of
    # the answer would be masked for over-length examples.
    n_real = sum(attention_mask)
    n_dropped = max(0, full_len - n_real)
    if getattr(tok, "truncation_side", "right") == "left":
        prompt_kept = max(0, p_len - n_dropped)
    else:
        prompt_kept = min(p_len, n_real)

    # Build labels from the attention mask rather than by searching for
    # pad_token_id: the pad token is the same id as the end-of-turn token, so
    # an id search would also mask the answer's real terminator and the model
    # would never be trained to stop. Counting real tokens (instead of using
    # raw positions) keeps this correct for either padding side.
    labels = []
    real_seen = 0
    for token_id, is_real in zip(input_ids, attention_mask):
        if not is_real:
            labels.append(-100)  # padding
            continue
        labels.append(-100 if real_seen < prompt_kept else token_id)
        real_seen += 1

    encoded["labels"] = labels
    return encoded

In [None]:
# Apply format_and_tokenize to every row. The raw uuid/question/answer columns
# are dropped, leaving the tokenizer outputs plus "labels"
# (assuming those three are the only raw columns — TODO confirm).
tokenised = dataset.map(format_and_tokenize, remove_columns=["uuid", "question", "answer"])

In [None]:
# Row count after tokenization; map() produces one output row per input row here.
print(len(tokenised))

700


In [None]:
# Inspect the first tokenized example (the full token lists are printed, so the output is long).
print(tokenised[0])

{'input_ids': [128000, 128006, 9125, 128007, 271, 38766, 1303, 33025, 2696, 25, 6790, 220, 2366, 18, 198, 15724, 2696, 25, 220, 975, 12044, 220, 2366, 20, 271, 128009, 128006, 882, 128007, 271, 3957, 1070, 904, 6029, 315, 8624, 304, 279, 1630, 30630, 30, 128009, 128006, 78191, 128007, 271, 16, 13, 1369, 19846, 991, 12989, 325, 1764, 13, 220, 17, 13, 12227, 23108, 11, 7245, 4269, 2320, 9329, 331, 1631, 278, 19172, 304, 2163, 8582, 326, 15784, 13, 1115, 1253, 42408, 7245, 4269, 2320, 9329, 331, 1631, 278, 1156, 20450, 11, 4869, 11, 7079, 12593, 449, 810, 8870, 4994, 32758, 11, 422, 2561, 311, 8417, 1317, 9860, 20334, 13, 1442, 7000, 527, 2561, 11, 7079, 2875, 9860, 510, 793, 6486, 1182, 1507, 60, 304, 220, 18, 311, 220, 19, 4038, 13, 40388, 315, 22760, 278, 323, 30811, 83223, 15180, 12400, 5448, 505, 279, 4994, 4007, 1053, 1101, 387, 11190, 13, 4314, 1051, 539, 510, 793, 6486, 1182, 1507, 60, 2561, 520, 279, 4994, 15244, 13, 386, 6750, 6709, 4250, 387, 78076, 28544, 389, 279, 2561, 5448,

In [None]:
# Sanity check: decode the first example's input_ids back to text to confirm
# the chat template, the answer and the trailing padding look as expected.
print(tok.decode(tokenised[0]["input_ids"]))

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 14 Jun 2025

<|eot_id|><|start_header_id|>user<|end_header_id|>

Is there any evidence of disease in the X-ray?<|eot_id|><|start_header_id|>assistant<|end_header_id|>

1. Severe emphysema. 2. Irregular, pleural-parenchymal opacity in left upper lobe. This may irregular pleural-parenchymal scarring, however, recommend comparison with more remote outside imaging, if available to determine long-term stability. If none are available, recommend short-term [REDACTED] in 3 to 4 months. Evaluation of coronal and sagittal reformatted images from the outside study would also be helpful. These were not [REDACTED] available at the outside institution. Malignancy cannot be confidently excluded on the available images<|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_

## Step 5: Splitting the Tokenized Dataset into Train and Validation Sets

In [None]:
# Hold out 10% of the examples; the fixed seed keeps the split reproducible.
split_dataset = tokenised.train_test_split(test_size=0.1, seed=42)

In [None]:
# Bundle the two splits under the names used from here on: the held-out
# "test" split from train_test_split is exposed as "validation".
dataset_dict = DatasetDict(
    train=split_dataset["train"],
    validation=split_dataset["test"],
)

In [None]:
# Report how many examples ended up in each split.
print(f"Training examples: {len(dataset_dict['train'])}")
print(f"Validation examples: {len(dataset_dict['validation'])}")

Training examples: 630
Validation examples: 70


## Step 6: Saving the Tokenized Dataset to Disk

In [None]:
# Relative path: resolves against the current working directory, which the
# first cell set to the project folder on Drive via %cd.
save_path = "./data/tokenized_dataset"

# Write both splits (train and validation) to disk under save_path.
dataset_dict.save_to_disk(save_path)

## Step 7: Verifying the Saved Dataset

In [None]:
# Path to the saved tokenized dataset. This is the same plain relative path
# that was passed to save_to_disk, rather than a "file://./..." URL: file URLs
# are not meant to carry relative paths, and using the identical string makes
# it obvious that the verification reads back exactly what was written.
load_path = "./data/tokenized_dataset"

In [None]:
# Load the dataset from disk
# (read back from load_path to verify the saved copy is usable).
loaded_dataset = load_from_disk(load_path)

In [None]:
# Sanity check: view one example from the reloaded train split
# (the full token lists are printed, so the output is long).
print(loaded_dataset["train"][0])

{'input_ids': [128000, 128000, 128006, 9125, 128007, 271, 38766, 1303, 33025, 2696, 25, 6790, 220, 2366, 18, 198, 15724, 2696, 25, 220, 975, 12044, 220, 2366, 20, 271, 128009, 128006, 882, 128007, 271, 861, 279, 3682, 73833, 94257, 14955, 304, 420, 2217, 13, 128009, 128006, 78191, 128007, 271, 2822, 30883, 73151, 454, 360, 55892, 1920, 26, 23900, 11, 4325, 18251, 65324, 296, 1123, 269, 94257, 67861, 42743, 64785, 79212, 488, 13, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009,

In [None]:
# Split sizes after reloading; these should match the sizes printed before saving.
print(len(loaded_dataset["train"]))
print(len(loaded_dataset["validation"]))

630
70


## Step 8: Fix Metadata

In [1]:
!pip install nbformat --q

In [2]:
import nbformat
import os
from google.colab import drive, files

In [3]:
# Mount Drive again for this session; force_remount=True requests a fresh
# mount even if /content/drive is already mounted.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [4]:
# List the notebook directory to confirm the file exists
# (the cell below rewrites 10_tokenization_fixed.ipynb in this folder).
os.listdir("/content/drive/MyDrive/multimodal-xray-agent/notebooks")

['.gitkeep',
 '00_colab_setup.ipynb',
 '01_bootstrap.ipynb',
 '02_preprocessing.ipynb',
 '04_text_embedding_faiss_indexing.ipynb',
 '03_image_embedding_faiss_indexing.ipynb',
 '05_iu_xray_processing.ipynb',
 '06_generate_qa_pairs.ipynb',
 '08_finetune_biogpt_lora_run2.ipynb',
 '10_tokenization.ipynb',
 '09_llama3_zero_shot_eval.ipynb',
 '07_finetune_biogpt_lora.ipynb',
 'Copy of 10_tokenization.ipynb',
 '12_llama3_finetuned_eval.ipynb',
 '11_finetune_llama3.2_lora.ipynb',
 '10_tokenization_fixed.ipynb']

In [None]:
# Strip the "widgets" entry from the notebook's top-level metadata and save the
# file in place.
# NOTE(review): presumably done so the notebook renders in viewers that reject
# saved widget state — confirm.
notebook_path = "/content/drive/MyDrive/multimodal-xray-agent/notebooks/10_tokenization_fixed.ipynb"

# Parse the notebook as nbformat version 4.
with open(notebook_path, "r") as f:
    nb = nbformat.read(f, as_version=4)

# Remove the key when present; a notebook without it is left unchanged.
nb.metadata.pop("widgets", None)

# Overwrite the original file with the cleaned notebook.
with open(notebook_path, "w") as f:
    nbformat.write(nb, f)

print("Notebook fixed and saved successfully!")