## STEP 1: Install Packages

In [None]:
# These libraries are standard for modern LLM fine-tuning

# torch           → core deep learning library
# transformers    → Hugging Face models (Mistral, LLaMA, etc.)
# datasets        → easy loading & processing of datasets
# peft            → Parameter-Efficient Fine-Tuning (LoRA)
# accelerate      → helps manage devices (GPU/CPU)
# bitsandbytes    → enables 4-bit quantization (QLoRA)
# sentencepiece   → tokenizer dependency for many LLMs

!pip install -q \
  torch \
  transformers \
  datasets \
  peft \
  accelerate \
  bitsandbytes \
  sentencepiece


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.1/59.1 MB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[?25h

## STEP 2: Load the Training Dataset

In [None]:
from datasets import load_dataset

# The medical fine-tuning data is stored as JSON Lines (one JSON object per line).
# As the printed structure shows, every record exposes a single `messages`
# column; the formatting step reads the "user" and "assistant" entries from it.
dataset = load_dataset("json", data_files="sample.jsonl")

# Show the available splits, column names and row counts
print(dataset)


Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['messages'],
        num_rows: 13
    })
})


## STEP 3: Format Data for Instruction Tuning

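Instruction-tuned models like Mistral Instruct expect a specific prompt format. In this notebook every training example is rendered as `<s>[INST] question [/INST] answer</s>`.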

In [None]:
# Convert one chat-style record into the single-string format used here for
# Mistral Instruct models: <s>[INST] question [/INST] answer</s>

def format_example(example):
    """Build the instruction-tuning text for one dataset record.

    Args:
        example: A record whose "messages" entry is a list of dicts, each
            with a "role" and a "content" key.

    Returns:
        A dict ``{"text": "<s>[INST] {user} [/INST] {assistant}</s>"}`` built
        from the first "user" message and the first "assistant" message.

    Raises:
        ValueError: If the record contains no "user" or no "assistant" message.
    """
    def first_content(role):
        # Return the content of the first message with the given role.
        for message in example["messages"]:
            if message["role"] == role:
                return message["content"]
        # A descriptive error instead of the bare StopIteration that a
        # default-less next() would raise for an incomplete record.
        raise ValueError(
            f"record has no {role!r} message: {example['messages']!r}"
        )

    user_msg = first_content("user")
    assistant_msg = first_content("assistant")

    prompt = f"<s>[INST] {user_msg} [/INST]"
    completion = f"{assistant_msg}</s>"

    return {"text": prompt + " " + completion}



# Run the formatter over every record; each record gains a "text" column
dataset = dataset.map(function=format_example)
# This is supervised learning: we show the model the correct answer.

Map:   0%|          | 0/13 [00:00<?, ? examples/s]

## STEP 4: Load the Base Model with QLoRA
Why QLoRA?

“QLoRA allows us to fine-tune a 7 billion parameter model on a single Colab GPU.”

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Identifier of the open-source base model that is loaded and fine-tuned below
model_name = "mistralai/Mistral-7B-Instruct-v0.2"


In [None]:
# Quantization settings handed to Transformers when the model is loaded:
#   - weights are stored in 4-bit form (small memory footprint)
#   - computation runs in float16
bnb_config = BitsAndBytesConfig(
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",   # NormalFloat4 quantization data type
    load_in_4bit=True,           # the "Q" in QLoRA: reduces memory usage
)
# We are compressing the model weights, not the training data.

In [None]:
# Load the tokenizer that belongs to the base model (converts text → token ids)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Some models don't have a padding token, so the end-of-sequence token is
# reused for padding (needed by padding="max_length" in the tokenize step).
# Consequence: padding and EOS share the same id, so padded positions can only
# be told apart from a real EOS through the attention mask.
tokenizer.pad_token = tokenizer.eos_token


In [None]:
# Load the base model with the 4-bit quantization settings defined above.
# device_map="auto" lets the library decide on which device each module is placed.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=bnb_config,
)


config.json:   0%|          | 0.00/596 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

In [None]:
#STEP 5: Apply LoRA
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# The base model was loaded in 4-bit. PEFT recommends preparing a quantized
# model for training before adapters are attached.
model = prepare_model_for_kbit_training(model)

# LoRA configuration:
# We are NOT training the full model
# We only train small 'adapter' matrices

lora_config = LoraConfig(
    r=16,                     # Rank of adapter (higher = more capacity)
    lora_alpha=32,            # Scaling factor
    lora_dropout=0.05,        # Regularization (prevents overfitting)
    bias="none",
    task_type="CAUSAL_LM",

    # These are internal transformer layers we adapt
    target_modules=["q_proj", "v_proj"]
)
# Attach LoRA adapters to the model.
# NOTE: re-running this cell wraps the already wrapped model again; reload the
# base model first if the cell has to be executed a second time.
model = get_peft_model(model, lora_config)

# Print how many parameters will actually be trained
model.print_trainable_parameters()


trainable params: 6,815,744 || all params: 7,248,547,840 || trainable%: 0.0940


In [None]:
def tokenize(batch, max_length=256):
    """Tokenize a batch of formatted texts and build causal-LM labels.

    Args:
        batch: Mapping with a "text" entry holding a list of strings, as
            passed by ``dataset.map(..., batched=True)``.
        max_length: Fixed sequence length; shorter texts are padded and longer
            ones truncated to this many tokens. Defaults to 256.

    Returns:
        The tokenizer output with an added "labels" entry. Labels equal
        ``input_ids`` on real tokens and are -100 on padded positions.
    """
    tokenized = tokenizer(
        batch["text"],
        truncation=True,
        padding="max_length",
        max_length=max_length,
        # The formatted text already contains the literal <s> ... </s> markers,
        # so the tokenizer must not add its own special tokens on top
        # (otherwise the sequence would start with two BOS tokens).
        add_special_tokens=False,
    )

    # IMPORTANT:
    # For causal language modeling the labels are the input ids, except on
    # padding. Padding reuses the EOS id, so it cannot be recognised by token
    # id; the attention mask is used instead. -100 is the index that the loss
    # ignores, so no loss is computed on padded positions while the real
    # end-of-sequence token is still learned.
    tokenized["labels"] = [
        [token_id if keep else -100 for token_id, keep in zip(ids, mask)]
        for ids, mask in zip(tokenized["input_ids"], tokenized["attention_mask"])
    ]

    return tokenized


In [None]:
# Tokenize in batches and drop the original columns so that only the
# tokenizer outputs and labels remain
tokenized_dataset = dataset.map(
    function=tokenize,
    remove_columns=dataset["train"].column_names,
    batched=True,
)
print(tokenized_dataset["train"].features)


Map:   0%|          | 0/13 [00:00<?, ? examples/s]

{'input_ids': List(Value('int32')), 'attention_mask': List(Value('int8')), 'labels': List(Value('int64'))}


In [None]:
# Peek at the underlying table data to check how padding and labels look
tokenized_dataset.data

{'train': MemoryMappedTable
 input_ids: list<item: int32>
   child 0, item: int32
 attention_mask: list<item: int8>
   child 0, item: int8
 labels: list<item: int64>
   child 0, item: int64
 ----
 input_ids: [[[2,2,2,2,2,...,304,4908,15885,28723,2],[2,2,2,2,2,...,26823,302,1656,28723,2],...,[2,2,2,2,2,...,605,6256,288,28723,2],[2,2,2,2,2,...,7643,11254,4623,28723,2]]]
 attention_mask: [[[0,0,0,0,0,...,1,1,1,1,1],[0,0,0,0,0,...,1,1,1,1,1],...,[0,0,0,0,0,...,1,1,1,1,1],[0,0,0,0,0,...,1,1,1,1,1]]]
 labels: [[[2,2,2,2,2,...,304,4908,15885,28723,2],[2,2,2,2,2,...,26823,302,1656,28723,2],...,[2,2,2,2,2,...,605,6256,288,28723,2],[2,2,2,2,2,...,7643,11254,4623,28723,2]]]}

In [None]:
from transformers import TrainingArguments


In [None]:
training_args = TrainingArguments(
    # Where adapters are saved
    output_dir="./medical_lora_adapter",

    # Number of times the model sees the entire dataset
    num_train_epochs=3,

    # How fast the model learns:
    # too high → unstable, too low → slow
    learning_rate=2e-4,

    # Batch size = how many examples the model sees at once.
    # Kept small because GPU memory is limited.
    per_device_train_batch_size=2,

    # Simulates a larger batch size:
    # 2 × 4 = effective batch size of 8
    gradient_accumulation_steps=4,

    # Faster training on GPU
    fp16=True,

    # Print logs every 5 steps
    logging_steps=5,

    # Save model after each epoch
    save_strategy="epoch",

    # No external logging
    report_to="none",
)


In [None]:
from transformers import Trainer


In [None]:
# Bundle the LoRA-wrapped model, the training settings and the tokenized
# training split into a Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
)


In [None]:
# THIS is where learning actually happens: the training loop runs with the
# settings from `training_args` and returns a summary (steps, loss, runtime)
trainer.train()

Step,Training Loss
5,6.2873


TrainOutput(global_step=6, training_loss=6.242390791575114, metrics={'train_runtime': 23.9234, 'train_samples_per_second': 1.63, 'train_steps_per_second': 0.251, 'total_flos': 426365272719360.0, 'train_loss': 6.242390791575114, 'epoch': 3.0})

In [None]:
# Test the model
def generate(prompt, max_new_tokens=150):
    """Greedily generate text for `prompt` with the notebook's current `model`.

    Uses the module-level `model` and `tokenizer`.

    Args:
        prompt: Input text, e.g. "[INST] question [/INST]".
        max_new_tokens: Upper bound on the number of generated tokens.
            Defaults to 150.

    Returns:
        The decoded sequence with special tokens removed. The decoded text
        includes the prompt followed by the generated continuation.
    """
    # After training the model may still be in train mode, where LoRA dropout
    # is active; switch to eval mode so generation is deterministic.
    model.eval()

    # Convert text → tokens
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Generate response
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=False,
        # Explicitly use EOS for padding; this is what the library falls back
        # to anyway, but passing it avoids the warning on every call.
        pad_token_id=tokenizer.eos_token_id,
    )

    # Convert tokens → text
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


In [None]:
# Ask a domain question, wrapped in the same [INST] ... [/INST] markers as the
# training text, and print what the freshly trained model answers
prompt = "[INST] What should be done after a patient fall? [/INST]"
print(generate(prompt))


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[INST] What should be done after a patient fall? [/INST] 1. Assess the patient: Check for signs of injury, such as bruises, cuts, or deformities. Assess for any pain, loss of consciousness, or difficulty breathing.

2. Provide comfort: Help the patient into a comfortable position.

3. Check for vital signs: Monitor the patient's heart rate, respiratory rate, blood pressure, and temperature.

4. Provide first aid: Apply direct pressure to any bleeding wounds. Immobilize any fractured or dislocated limbs.

5. Call for medical help: If the patient's condition is serious, call for emergency medical services.

6. Document the incident: Record the details of


In [None]:
# Directory where adapter will be saved.
# This is the same path as `output_dir` in the training arguments, so the
# final adapter sits next to the checkpoints written during training.
adapter_save_path = "./medical_lora_adapter"

# Save ONLY the LoRA adapter weights
model.save_pretrained(adapter_save_path)

# Save tokenizer (important for inference).
# As the last expression of the cell, the list of written files is displayed.
tokenizer.save_pretrained(adapter_save_path)


('./medical_lora_adapter/tokenizer_config.json',
 './medical_lora_adapter/special_tokens_map.json',
 './medical_lora_adapter/chat_template.jinja',
 './medical_lora_adapter/tokenizer.model',
 './medical_lora_adapter/added_tokens.json',
 './medical_lora_adapter/tokenizer.json')

In [None]:
## Loading the model
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

model_name = "mistralai/Mistral-7B-Instruct-v0.2"

# Same 4-bit quantization settings that were used for training
bnb_config = BitsAndBytesConfig(
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

# Reload the quantized base model from scratch
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=bnb_config,
)

# Put the saved LoRA adapter on top of the base model
model = PeftModel.from_pretrained(base_model, "./medical_lora_adapter")

# The tokenizer was saved into the same directory as the adapter
tokenizer = AutoTokenizer.from_pretrained("./medical_lora_adapter")


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# Same question as before, now answered by the reloaded base model + adapter
prompt = "[INST] What should be done after a patient fall? [/INST]"
print(generate(prompt))


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[INST] What should be done after a patient fall? [/INST] 1. Assess the patient: Check for signs of injury, such as bruises, cuts, or deformities. Assess for any pain, loss of consciousness, or difficulty breathing.

2. Provide comfort: Help the patient into a comfortable position.

3. Check for vital signs: Monitor the patient's heart rate, respiratory rate, blood pressure, and temperature.

4. Provide first aid: Apply direct pressure to any bleeding wounds. Immobilize any fractured or dislocated limbs.

5. Call for medical help: If the patient's condition is serious, call for emergency medical services.

6. Document the incident: Record the details of


In [None]:
## Base model

In [None]:
# NOTE(review): the recorded run of this cell failed with
# "ValueError: Some modules are dispatched on the CPU or the disk".
# Presumably the model copies loaded in earlier cells still occupy GPU memory
# when this additional copy is loaded; confirm, and free them (or restart the
# kernel) before running this cell.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_name = "mistralai/Mistral-7B-Instruct-v0.2"

# QLoRA config (same as training)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True
)

# Load tokenizer (this rebinds the global `tokenizer` used by `generate`)
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# Load BASE model (no adapters)
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto"
)

# Switch to inference mode
base_model.eval()


ValueError: Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules in 32-bit, you need to set `llm_int8_enable_fp32_cpu_offload=True` and pass a custom `device_map` to `from_pretrained`. Check https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu for more details. 

In [None]:
prompt = "[INST] How should abnormal lab results be handled? [/INST]"

# `generate` takes only the prompt and always uses the global `model`, which
# is the adapter-wrapped model at this point. Instead of passing a separate
# model object, the adapter is switched off temporarily to obtain the base
# model's answer; this also avoids keeping a second copy of the model in
# GPU memory.
print("====== BASE MODEL ======\n")
with model.disable_adapter():
    print(generate(prompt))

print("\n====== FINE-TUNED ======\n")
print(generate(prompt))