### Installation

In [None]:
%%capture
# Install Unsloth and its dependencies. The %%capture cell magic above hides
# the pip output of this cell.
import os
# Colab is detected by searching for "COLAB_" in the concatenated names of all
# environment variables.
if "COLAB_" not in "".join(os.environ.keys()):
    # Outside Colab: plain install, letting pip resolve dependencies.
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    # --no-deps installs only the listed packages; xformers and trl are pinned
    # to exact versions, the remaining packages are unpinned.
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl==0.15.2 triton cut_cross_entropy unsloth_zoo
    # These packages are installed normally, together with their dependencies.
    !pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
    # Unsloth itself is installed last, again without dependency resolution.
    !pip install --no-deps unsloth

### Unsloth

In [None]:
from unsloth import FastLanguageModel
import torch
# ---- Model loading configuration ----
# Maximum sequence length passed to the loader.
max_seq_length = 2048
# None lets the loader pick the dtype; float16 / bfloat16 can be forced instead.
dtype = None
# Load the weights 4-bit quantized to reduce memory usage. Can be False.
load_in_4bit = True

# Reference list of model repositories published under the "unsloth" namespace
# (more at https://huggingface.co/unsloth). It is informational only: the
# model that is actually loaded is named explicitly below.
fourbit_models = [
    # Llama 3.1
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",
    # Mistral Nemo
    "unsloth/Mistral-Nemo-Base-2407-bnb-4bit",
    "unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit",
    # Mistral 7B v0.3
    "unsloth/mistral-7b-v0.3-bnb-4bit",
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    # Phi 3 / 3.5
    "unsloth/Phi-3.5-mini-instruct",
    "unsloth/Phi-3-medium-4k-instruct",
    # Gemma 2
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",
]

# Keyword arguments for the loader, collected in one place.
# For gated models (e.g. meta-llama/Llama-2-7b-hf) add: token = "hf_..."
load_kwargs = dict(
    model_name = "unsloth/Meta-Llama-3.1-8B-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

# Returns the model together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(**load_kwargs)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
Unsloth: Failed to patch Gemma3ForConditionalGeneration.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.3.19: Fast Llama patching. Transformers: 4.51.3.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


We now add LoRA adapters so we only need to update 1 to 10% of all parameters!

In [None]:
# Modules that receive LoRA adapters: the attention projections and the MLP
# projections of each layer.
attention_projections = ["q_proj", "k_proj", "v_proj", "o_proj"]
mlp_projections = ["gate_proj", "up_proj", "down_proj"]

lora_settings = dict(
    r = 16,                                  # LoRA rank; any number > 0 (suggested 8, 16, 32, 64, 128)
    target_modules = attention_projections + mlp_projections,
    lora_alpha = 16,
    lora_dropout = 0,                        # any value is supported, 0 is the optimized setting
    bias = "none",                           # any value is supported, "none" is the optimized setting
    use_gradient_checkpointing = "unsloth",  # True or "unsloth" (the latter for very long context)
    random_state = 3407,
    use_rslora = False,                      # rank-stabilized LoRA is available but not used here
    loftq_config = None,                     # LoftQ is available but not used here
)

# Wrap the loaded model with the adapters; `model` is rebound to the result.
model = FastLanguageModel.get_peft_model(model, **lora_settings)

Unsloth 2025.3.19 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


### Data Prep

In [None]:
# ---------- Data-prep setup (run AFTER `tokenizer` exists) ----------
from datasets import load_dataset
from google.colab import drive

# Mount Google Drive; the CSV files are read from a folder inside it.
drive.mount("/content/drive")

# Locations of the four CSV files, keyed by the role each one plays below.
DATA_DIR = "/content/drive/MyDrive/data"
paths = dict(
    train=f"{DATA_DIR}/train.csv",
    validation=f"{DATA_DIR}/validation.csv",
    test=f"{DATA_DIR}/test.csv",
    hf_set=f"{DATA_DIR}/humanFeedback.csv",
)

# Alpaca-style template with three positional slots, in this order:
# instruction, input, response.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

# End-of-sequence marker appended to every training text; falls back to
# "</s>" when the tokenizer does not define one.
EOS = tokenizer.eos_token
if not EOS:
    EOS = "</s>"

def formatting_prompts_func(examples):
    """Render a batch of rows into Alpaca-style training texts.

    Note the column-to-slot mapping, which is intentionally crossed: the
    value of the ``"input"`` column fills the template's *Instruction* slot,
    and the value of the ``"instruction"`` column fills the *Input* slot.
    The ``"output"`` column fills the *Response* slot, and ``EOS`` is
    appended to every text.

    Args:
        examples: Batch dict with ``"instruction"``, ``"input"`` and
            ``"output"`` entries, each a sequence of strings.

    Returns:
        ``{"text": [...]}`` with one formatted string per row.
    """
    rows = zip(
        examples["instruction"],
        examples["input"],
        examples["output"],
    )
    formatted = []
    for instruction_col, input_col, output_col in rows:
        # Template slots are (Instruction, Input, Response).
        prompt = alpaca_prompt.format(input_col, instruction_col, output_col)
        formatted.append(prompt + EOS)
    return {"text": formatted}

# 1) Load train/validation/test with the CSV loader.
# NOTE(review): passing column_names assumes the CSV files have NO header
# row; if they do, the header would be read as the first data row — confirm.
raw = load_dataset(
    "csv",
    data_files={
        "train":      paths["train"],
        "validation": paths["validation"],
        "test":       paths["test"],
    },
    column_names=["instruction", "output"],
)

# Number of leading rows kept from each split.
SPLIT_SIZES = {"train": 3200, "validation": 400, "test": 400}

# Fixed prompt stored in the "input" column of every row.
TASK_PROMPT = "Construct a research methodology for the given problem."


def _prepare_split(ds, size):
    """Keep the first `size` rows of `ds`, add the fixed prompt, and format.

    The slice is taken BEFORE formatting so that only the rows that are kept
    get processed. If the split has fewer than `size` rows, all of them are
    used and a warning is printed instead of raising an IndexError.

    Args:
        ds: Dataset with "instruction" and "output" columns.
        size: Maximum number of leading rows to keep.

    Returns:
        Dataset with a single "text" column.
    """
    n_rows = min(size, len(ds))
    if n_rows < size:
        print(f"Warning: requested {size} rows but only {len(ds)} available; using {n_rows}.")
    subset = ds.select(range(n_rows))
    subset = subset.add_column("input", [TASK_PROMPT] * len(subset))
    return subset.map(
        formatting_prompts_func,
        batched=True,
        remove_columns=subset.column_names,
    )


# 2) Build the three datasets under explicit names (no globals() indirection).
train_dataset = _prepare_split(raw["train"], SPLIT_SIZES["train"])
validation_dataset = _prepare_split(raw["validation"], SPLIT_SIZES["validation"])
test_dataset = _prepare_split(raw["test"], SPLIT_SIZES["test"])

# 3) Human-feedback set: same preparation as the other splits, keeping at
#    most the first 10 rows.
# NOTE(review): column_names assumes the CSV has no header row — confirm.
raw_hf = load_dataset(
    "csv",
    data_files=paths["hf_set"],
    column_names=["instruction", "output"],
    split="train",
)

# Slice first so only the kept rows are formatted; min() avoids an IndexError
# when the file holds fewer than 10 rows.
raw_hf = raw_hf.select(range(min(10, len(raw_hf))))
raw_hf = raw_hf.add_column(
    "input",
    ["Construct a research methodology for the given problem."] * len(raw_hf)
)
hf_set = raw_hf.map(
    formatting_prompts_func,
    batched=True,
    remove_columns=raw_hf.column_names,
)

# 4) Sanity checks
# Expected lengths follow the slice sizes used above (3200 / 400 / 400 / 10).
print("train_dataset length:", len(train_dataset))           # → 3200
print("validation_dataset length:", len(validation_dataset)) # → 400
print("test_dataset length:", len(test_dataset))             # → 400
print("hf_set length:", len(hf_set))                         # → 10

# Show one formatted training example: the fixed prompt appears under
# "### Instruction:" and the CSV's first column under "### Input:".
print("\nSample train text:\n", train_dataset[0]["text"])


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
train_dataset length: 3200
validation_dataset length: 400
test_dataset length: 400
hf_set length: 10

Sample train text:
 Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Construct a research methodology for the given problem.

### Input:
Antibiotic-based regimens are frequently used for the treatment of Helicobacter pylori infection .These regimens fail to eradicate H pylori in 15 % to 40 % of patients , primarily due to antimicrobial resistance and insufficient patient compliance .Effective prevention and eradication of H pylori by passive immunization with orally administered bovine antibodies has been demonstrated in animal studies , and may serve as an alternative therapy in humans .To study the efficacy and safety of orally admi

In [None]:
# Preview one formatted example (index 4) from the human-feedback set.
# The label names the set actually printed (hf_set, not the test split).
print("\nSample human-feedback text:\n", hf_set[4]["text"])



Sample test text:
 Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Construct a research methodology for the given problem.

### Input:
Duration of bladder catheterisation after female genital fistula repair varies widely .We aimed to establish whether 7 day bladder catheterisation was non-inferior to 14 days in terms of incidence of fistula repair breakdown in women with simple fistula .US Agency for International Development .

### Response:
In this randomised , controlled , open-label , non-inferiority trial , we enrolled patients at eight hospitals in the Democratic Republic of the Congo , Ethiopia , Guinea , Kenya , Niger , Nigeria , Sierra Leone , and Uganda .Consenting patients were eligible if they had a simple fistula that was closed after surgery and remained closed 7 days after surgery , understood study procedures and requirements , and agreed to r

<a name="Train"></a>
### Train the model
Now let's use Hugging Face TRL's `SFTTrainer`! More docs here: [TRL SFT docs](https://huggingface.co/docs/trl/sft_trainer). Here we train for one full epoch (`num_train_epochs=1`, `max_steps=-1`); for a quick test run you can instead cap training with e.g. `max_steps=60`. We also support TRL's `DPOTrainer`!

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import FastLanguageModel, is_bfloat16_supported
import random, math, torch
from torch.cuda.amp import autocast

# 1) Training configuration, built once and handed to the trainer below.
# Mixed precision: bf16 when the GPU supports it, fp16 otherwise.
use_bf16 = is_bfloat16_supported()

training_args = TrainingArguments(
    # --- output / logging ---
    output_dir="outputs",
    report_to="none",
    logging_steps=1,
    # --- schedule: one full epoch (max_steps=-1 leaves the step count uncapped) ---
    num_train_epochs=1,
    max_steps=-1,
    warmup_steps=5,
    lr_scheduler_type="linear",
    # --- batching: 2 per device x 4 accumulation steps ---
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=2,
    # --- optimizer ---
    optim="adamw_8bit",
    learning_rate=2e-4,
    weight_decay=0.01,
    # --- precision ---
    bf16=use_bf16,
    fp16=not use_bf16,
    # --- evaluation every 40 steps ---
    eval_strategy="steps",
    eval_steps=40,
    # --- misc ---
    remove_unused_columns=False,
    seed=3407,
)

# Tokenize every split once, ahead of training.
def tokenize_fn(examples):
    """Tokenize a batch of formatted prompt strings.

    Sequences are truncated to ``max_seq_length`` (the length the model was
    loaded with) and deliberately left UNPADDED. Padding with
    ``padding="longest"`` inside a batched ``map`` would pad every row to the
    longest row of its whole map batch and store that padding in the
    dataset; padding is instead left to the trainer's data collator, which
    pads per training batch.

    Args:
        examples: Batch dict whose ``"text"`` entry is a list of strings.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(
        examples["text"],
        truncation=True,
        max_length=max_seq_length,
    )


def _tokenize_split(ds):
    """Return `ds` tokenized, replacing its "text" column.

    A dataset without a "text" column is assumed to be tokenized already and
    is returned unchanged, which makes this cell safe to re-run.
    """
    if "text" not in ds.column_names:
        return ds
    return ds.map(
        tokenize_fn,
        batched=True,
        remove_columns=["text"],
    )


train_dataset = _tokenize_split(train_dataset)
validation_dataset = _tokenize_split(validation_dataset)
test_dataset = _tokenize_split(test_dataset)

# 2) Instantiate *once*
# Builds the trainer from the LoRA-wrapped model, the tokenizer, the
# tokenized train/validation datasets, and the training arguments above.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    # NOTE(review): the "text" column has already been removed from both
    # datasets by the tokenization step in this cell, so this field name no
    # longer matches a column — confirm it is ignored for pre-tokenized data.
    dataset_text_field="text",
    # NOTE(review): this cell's output reports that Unsloth disables packing,
    # so this flag appears to have no effect — confirm.
    packing=True,
    args=training_args,
)

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Unsloth: Hugging Face's packing is currently buggy - we're disabling it for now!
Unsloth: Hugging Face's packing is currently buggy - we're disabling it for now!


In [None]:
# Initial perplexity: evaluate on the test split before fine-tuning.
import math
full_metrics = trainer.evaluate(eval_dataset=test_dataset)
# Perplexity = exp(eval loss). The label reports the real number of evaluated
# samples instead of a hard-coded (and outdated) count.
print(f"{len(test_dataset)}-sample Perplexity on test:", math.exp(full_metrics["eval_loss"]))

Unsloth: Not an error, but LlamaForCausalLM does not accept `num_items_in_batch`.
Using gradient accumulation will be very slightly less accurate.
Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient


200-sample Perplexity on test: 9.981109207850464


In [None]:

# @title Show current memory stats
# Report the name and total memory of GPU 0, plus the peak memory reserved so
# far. Sizes are converted from bytes to GB and rounded to 3 decimals.
gpu_stats = torch.cuda.get_device_properties(0)

reserved_bytes = torch.cuda.max_memory_reserved()
start_gpu_memory = round(reserved_bytes / 1024 / 1024 / 1024, 3)

total_bytes = gpu_stats.total_memory
max_memory = round(total_bytes / 1024 / 1024 / 1024, 3)

print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")



GPU = NVIDIA A100-SXM4-40GB. Max memory = 39.557 GB.
7.625 GB of memory reserved.


In [None]:
import re

# Generate a response for every human-feedback sample with the model as it is
# BEFORE fine-tuning, and print it next to the reference answer.

# Switch to Unsloth's inference mode once; the call does not depend on the
# sample, so it does not belong inside the loop.
FastLanguageModel.for_inference(model)

# One alternative per Alpaca section header, each including the blank line
# that precedes it.
SECTION_SPLIT_RE = re.compile(
    r"\r?\n\r?\n### Instruction:\r?\n|\r?\n\r?\n### Input:\r?\n|\r?\n\r?\n### Response:\r?\n"
)

for i, sample in enumerate(hf_set):
    sample_text = sample["text"]

    # Splitting yields [preamble, instruction, input, response]. maxsplit=3
    # keeps any header-like text inside the response in the last part.
    parts = SECTION_SPLIT_RE.split(sample_text, maxsplit=3)
    if len(parts) != 4:
        raise ValueError(
            f"Human Feedback sample {i}: expected 4 sections after splitting, got {len(parts)}"
        )
    _preamble, instruction, input_text, reference = parts

    # Strip stray whitespace. `input_text` (not `input`) avoids shadowing the
    # builtin of the same name in the notebook namespace.
    instruction = instruction.strip()
    input_text = input_text.strip()
    reference = reference.strip()

    # Rebuild the prompt with an empty response slot for the model to fill.
    model_inputs = tokenizer(
        [alpaca_prompt.format(instruction, input_text, "")],
        return_tensors = "pt",
    ).to("cuda")
    outputs = model.generate(**model_inputs, max_new_tokens = 500, use_cache = True)

    # The decoded sequence contains the prompt followed by the generation.
    generated_text = tokenizer.batch_decode(outputs)[0]

    full = (
        generated_text + "\n\n"
        "### Reference:\n" + reference
    )
    print(f"\nHuman Feedback sample {i}:\n{full}")



Human Feedback sample 0:
<|begin_of_text|>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Construct a research methodology for the given problem.

### Input:
High oxidative stress and chronic inflammation can contribute to the pathogenesis of coronary artery disease ( CAD ).Coenzyme Q10 is an endogenous lipid-soluble antioxidant.Statins therapy can reduce the biosynthesis of coenzyme Q10.The purpose of this study was to investigate the effects of a coenzyme Q10 supplement ( 300mg/d ; 150mg/b.i.d ) on antioxidation and anti-inflammation in patients who have CAD during statins therapy.Clinical Trials.gov Identifier : NCT01424761.

### Response:
The response should be appropriate and should be in the form of a research methodology for the given problem.<|end_of_text|>

### Reference:
Patients who were identified by cardiac catheterization as having at least 50 %

In [None]:
# Run fine-tuning with the trainer configured above. As the last expression
# of the cell, the returned TrainOutput is displayed as the cell output.
# NOTE(review): an earlier cell called FastLanguageModel.for_inference(model);
# confirm the model is back in training mode when this runs.
trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 3,200 | Num Epochs = 1 | Total steps = 400
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 41,943,040/8,000,000,000 (0.52% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss,Validation Loss,Model Preparation Time
40,1.3861,1.384224,0.0078
80,1.4539,1.37416,0.0078
120,1.4931,1.369413,0.0078
160,1.3672,1.367253,0.0078
200,1.3708,1.3643,0.0078
240,1.2246,1.364179,0.0078
280,1.3213,1.362745,0.0078
320,1.3123,1.361318,0.0078
360,1.3772,1.360534,0.0078
400,1.1817,1.36013,0.0078


TrainOutput(global_step=400, training_loss=1.3789241325855255, metrics={'train_runtime': 1166.0012, 'train_samples_per_second': 2.744, 'train_steps_per_second': 0.343, 'total_flos': 9.597668637794304e+16, 'train_loss': 1.3789241325855255})

In [None]:
# Perplexity on the test split after fine-tuning.
import math
full_metrics = trainer.evaluate(eval_dataset=test_dataset)
# Perplexity = exp(eval loss). The label reports the real number of evaluated
# samples instead of a hard-coded (and outdated) count.
print(f"{len(test_dataset)}-sample Perplexity:", math.exp(full_metrics["eval_loss"]))

200-sample Perplexity: 3.878508798187442


In [None]:
import re

# Generate a response for every human-feedback sample with the fine-tuned
# model, and print it next to the reference answer.

# Switch to Unsloth's inference mode once; the call does not depend on the
# sample, so it does not belong inside the loop.
FastLanguageModel.for_inference(model)

# One alternative per Alpaca section header, each including the blank line
# that precedes it. Sample texts have the shape
# "<preamble>\n\n### Instruction:\n<instr>\n\n### Input:\n<inp>\n\n### Response:\n<resp>".
SECTION_SPLIT_RE = re.compile(
    r"\r?\n\r?\n### Instruction:\r?\n|\r?\n\r?\n### Input:\r?\n|\r?\n\r?\n### Response:\r?\n"
)

for i, sample in enumerate(hf_set):
    sample_text = sample["text"]

    # Splitting yields [preamble, instruction, input, response]. maxsplit=3
    # keeps any header-like text inside the response in the last part.
    parts = SECTION_SPLIT_RE.split(sample_text, maxsplit=3)
    if len(parts) != 4:
        raise ValueError(
            f"Human Feedback sample {i}: expected 4 sections after splitting, got {len(parts)}"
        )
    _preamble, instruction, input_text, reference = parts

    # Strip stray whitespace. `input_text` (not `input`) avoids shadowing the
    # builtin of the same name in the notebook namespace.
    instruction = instruction.strip()
    input_text = input_text.strip()
    reference = reference.strip()

    # Rebuild the prompt with an empty response slot for the model to fill.
    model_inputs = tokenizer(
        [alpaca_prompt.format(instruction, input_text, "")],
        return_tensors = "pt",
    ).to("cuda")
    outputs = model.generate(**model_inputs, max_new_tokens = 500, use_cache = True)

    # The decoded sequence contains the prompt followed by the generation.
    generated_text = tokenizer.batch_decode(outputs)[0]

    full = (
        generated_text + "\n\n"
        "### Reference:\n" + reference
    )
    print(f"\nHuman Feedback sample {i}:\n{full}")



Human Feedback sample 0:
<|begin_of_text|>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Construct a research methodology for the given problem.

### Input:
High oxidative stress and chronic inflammation can contribute to the pathogenesis of coronary artery disease ( CAD ).Coenzyme Q10 is an endogenous lipid-soluble antioxidant.Statins therapy can reduce the biosynthesis of coenzyme Q10.The purpose of this study was to investigate the effects of a coenzyme Q10 supplement ( 300mg/d ; 150mg/b.i.d ) on antioxidation and anti-inflammation in patients who have CAD during statins therapy.Clinical Trials.gov Identifier : NCT01424761.

### Response:
This study was a randomized, double-blind, placebo-controlled, parallel-group trial.A total of 120 patients with CAD were randomly assigned to 2 groups : coenzyme Q10 ( n = 60 ) and placebo ( n = 60 ).The patients in the

In [None]:
# @title Show current memory stats
# Same report as the earlier memory cell, taken after training: device name,
# total memory, and peak reserved memory (GB, 3 decimals). This rebinds
# gpu_stats, start_gpu_memory and max_memory.
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(
    torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024,
    3,
)
max_memory = round(
    gpu_stats.total_memory / 1024 / 1024 / 1024,
    3,
)
print(
    f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.",
    f"{start_gpu_memory} GB of memory reserved.",
    sep="\n",
)

GPU = NVIDIA A100-SXM4-40GB. Max memory = 39.557 GB.
7.625 GB of memory reserved.
