In [None]:
!pip install -q -U pip
!pip install -U "transformers>=4.46.0" "accelerate>=0.34.0" "peft>=0.13.0" "trl==0.26.0" "datasets>=3.0.0" "bitsandbytes>=0.43.0"



In [None]:
!pip install -q triton

Necessary Imports

In [None]:
import gc
import os

import torch
from datasets import load_dataset
from peft import LoraConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from trl import SFTConfig, SFTTrainer

Check for GPU

In [None]:
# Query CUDA availability once and reuse the answer for the report and the guard.
has_cuda = torch.cuda.is_available()
print(f"CUDA Available: {has_cuda}")
print(f"CUDA Version: {torch.version.cuda}")

# Device details are only queried when a GPU is present; device 0 is inspected.
if has_cuda:
    gpu_name = torch.cuda.get_device_name(0)
    print(f"GPU: {gpu_name}")
    total_bytes = torch.cuda.get_device_properties(0).total_memory
    print(f"GPU Memory: {total_bytes / 1e9:.2f} GB")

CUDA Available: True
CUDA Version: 12.6
GPU: Tesla T4
GPU Memory: 15.83 GB


Configuration

In [None]:
# Hub identifier of the base checkpoint that is quantized and fine-tuned below.
MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.2"
# Hub identifier of the instruction dataset used for supervised fine-tuning.
DATASET_NAME = "databricks/databricks-dolly-15k"
# Passed to SFTConfig as output_dir.
OUTPUT_DIR = "./mistral-7b-dolly-qlora"
# Directory the LoRA adapter and tokenizer are saved to after training and
# reloaded from in the inference section.
ADAPTER_DIR = "./mistral-7b-dolly-adapter"

QLoRA configuration for 4-bit quantization

In [None]:
# Quantization settings handed to from_pretrained when the base model is loaded:
# 4-bit weights using the "nf4" quantization type with double quantization,
# and float16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

LoRA hyperparameters (targeting Mistral architecture)

In [None]:
# LoRA adapter settings for a causal language model. Adapters are attached to
# every attention projection and every MLP projection named below.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    bias="none",
    r=16,  # low-rank dimension of the adapter matrices
    lora_alpha=32,  # scaling factor
    lora_dropout=0.05,
    target_modules=[
        # Attention: query, key, value, output projections
        "q_proj", "k_proj", "v_proj", "o_proj",
        # MLP: gate, up, down projections
        "gate_proj", "up_proj", "down_proj",
    ],
)

Training Hyperparameters

In [None]:
# Choose the mixed-precision mode from the GPU generation instead of hard-coding
# bf16. Native bfloat16 requires compute capability >= 8.0 (Ampere or newer);
# older GPUs such as the Tesla T4 fall back to fp16, which also matches the
# float16 compute dtype used by the 4-bit quantization config.
_has_cuda = torch.cuda.is_available()
_use_bf16 = _has_cuda and torch.cuda.get_device_capability(0)[0] >= 8

sft_config = SFTConfig(
    output_dir=OUTPUT_DIR,
    # Sequences are truncated to this many tokens to keep the demo run short.
    max_length=128,
    # Column produced by the prompt-formatting step.
    dataset_text_field="text",
    packing=False,
    # Exactly one of fp16 / bf16 is enabled on a GPU; neither on CPU.
    fp16=_has_cuda and not _use_bf16,
    bf16=_use_bf16,
    num_train_epochs=1,
    # Effective batch size = 1 * 4 = 4 sequences per optimizer step.
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    optim="paged_adamw_32bit",
    report_to="none",
)

Load Tokenizer

In [None]:
# Mistral is supported natively by transformers, so trust_remote_code is not
# needed; leaving it off means no Python code from the Hub repo is executed.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Reuse EOS as the padding token and pad on the right for training.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

Load Model

In [None]:
# Load the base checkpoint with the 4-bit quantization config applied at load
# time. Mistral is implemented natively in transformers, so trust_remote_code
# is not needed and no code from the Hub repository is executed.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
)

# Prepare model for k-bit training
model = prepare_model_for_kbit_training(model)

# Apply LoRA: wrap the quantized model with the adapters described by
# lora_config and report how many parameters remain trainable.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

config.json:   0%|          | 0.00/596 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

trainable params: 41,943,040 || all params: 7,283,675,136 || trainable%: 0.5758


Load Dataset

In [None]:
# Load the train split of the dataset named by DATASET_NAME.
dataset = load_dataset(DATASET_NAME, split="train")

README.md: 0.00B [00:00, ?B/s]

databricks-dolly-15k.jsonl:   0%|          | 0.00/13.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/15011 [00:00<?, ? examples/s]

Select only 1,000 rows for faster training

In [None]:
# A fixed seed keeps the sampled subset reproducible across runs.
SUBSET_SEED = 42
SUBSET_SIZE = 1000

# Cap the requested size at the dataset length so select() cannot index past
# the end when the dataset holds fewer than SUBSET_SIZE rows.
dataset = dataset.shuffle(seed=SUBSET_SEED).select(range(min(SUBSET_SIZE, len(dataset))))

In [None]:
# Sanity check: number of rows kept after subsampling.
len(dataset)

1000

In [None]:
# Peek at one raw record to see which fields are available before formatting.
dataset[0]

{'instruction': 'Who were the children of the legendary Garth Greenhand, the High King of the First Men in the series A Song of Ice and Fire?',
 'context': '',
 'response': 'Garth the Gardener, John the Oak, Gilbert of the Vines, Brandon of the Bloody Blade, Foss the Archer, Owen Oakenshield, Harlon the Hunter, Herndon of the Horn, Bors the Breaker, Florys the Fox, Maris the Maid, Rose of the Red Lake, Ellyn Ever Sweet, Rowan Gold-Tree',
 'category': 'open_qa'}

Prompt-formatting function for instruction tuning


In [None]:
def format_instruction(sample):
    """
    Format one Dolly record as a single Mistral-Instruct training string.

    Layout:
        with context:    "[INST] <instruction>\\n\\nContext: <context> [/INST] <response>"
        without context: "[INST] <instruction> [/INST] <response>"

    Args:
        sample: Mapping with "instruction" and "response" strings and an
            optional "context" string (missing, None, empty, or whitespace-only
            contexts are all treated as "no context").

    Returns:
        dict with a single key "text" holding the formatted prompt.
    """
    # Strip surrounding whitespace so the training text matches the inference
    # format, where the prompt is stripped before being wrapped in [INST] tags.
    instruction = sample["instruction"].strip()
    response = sample["response"].strip()
    # `or ""` covers a context that is present but None.
    context = (sample.get("context") or "").strip()

    # Only add the "Context:" section when there is real context text;
    # a whitespace-only context would otherwise yield an empty section.
    if context:
        user_turn = f"{instruction}\n\nContext: {context}"
    else:
        user_turn = instruction

    return {"text": f"[INST] {user_turn} [/INST] {response}"}

Apply formatting

In [None]:
# Replace every original column with the single "text" column produced by
# format_instruction.
# NOTE: not idempotent - after this cell the dataset only has "text", so
# re-running it would fail on the missing "instruction" key; reload the
# dataset first if this cell needs to be executed again.
dataset = dataset.map(format_instruction, remove_columns=dataset.column_names)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [None]:
# Show one formatted training example.
print(dataset[0]['text'])

[INST] Who were the children of the legendary Garth Greenhand, the High King of the First Men in the series A Song of Ice and Fire? [/INST] Garth the Gardener, John the Oak, Gilbert of the Vines, Brandon of the Bloody Blade, Foss the Archer, Owen Oakenshield, Harlon the Hunter, Herndon of the Horn, Bors the Breaker, Florys the Fox, Maris the Maid, Rose of the Red Lake, Ellyn Ever Sweet, Rowan Gold-Tree


In [None]:
# Collect the distinct dtypes across all model parameters as a quick check of
# what the quantized, LoRA-wrapped model actually holds.
dtypes = {param.dtype for param in model.parameters()}
print(f"Model parameters dtypes: {dtypes}")

Model parameters dtypes: {torch.float32, torch.uint8}


Fine-tuning with SFTTrainer

In [None]:
# Wire the LoRA-wrapped model, the formatted dataset, and the tokenizer into
# the supervised fine-tuning trainer using the hyperparameters in sft_config.
trainer = SFTTrainer(
    args=sft_config,
    model=model,
    processing_class=tokenizer,
    train_dataset=dataset,
)

Adding EOS to train dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [None]:
# Run the fine-tuning loop; the returned training summary is displayed as the
# cell output.
trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 2}.


Step,Training Loss
10,2.1959
20,1.6519
30,1.5392
40,1.6166
50,1.6031
60,1.5233
70,1.6667
80,1.5361
90,1.4383
100,1.449


TrainOutput(global_step=250, training_loss=1.5824576835632325, metrics={'train_runtime': 3365.5825, 'train_samples_per_second': 0.297, 'train_steps_per_second': 0.074, 'total_flos': 4185088231317504.0, 'train_loss': 1.5824576835632325, 'epoch': 1.0})

Save LoRA Adapters

In [None]:
# Save the trained model's adapter and the tokenizer side by side in
# ADAPTER_DIR, the directory the inference section reloads both from.
trainer.model.save_pretrained(ADAPTER_DIR)
tokenizer.save_pretrained(ADAPTER_DIR)

('./mistral-7b-dolly-adapter/tokenizer_config.json',
 './mistral-7b-dolly-adapter/special_tokens_map.json',
 './mistral-7b-dolly-adapter/chat_template.jinja',
 './mistral-7b-dolly-adapter/tokenizer.model',
 './mistral-7b-dolly-adapter/added_tokens.json',
 './mistral-7b-dolly-adapter/tokenizer.json')

# Inference

Clear memory

In [None]:
# Drop the Python references to the training objects.
del trainer
del model
# Collect garbage first so objects kept alive only by reference cycles are
# really destroyed; only then can their CUDA memory be returned by
# empty_cache().
gc.collect()
torch.cuda.empty_cache()

Reload quantization configuration

In [None]:
# Same 4-bit settings as the training-time quantization config, rebuilt here so
# the base model is reloaded for inference exactly as it was quantized for training.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

Reload tokenizer

In [None]:
# Load the tokenizer files that were saved next to the adapter in ADAPTER_DIR.
tokenizer = AutoTokenizer.from_pretrained(ADAPTER_DIR)
# Reuse EOS as the padding token, mirroring the training-time tokenizer setup.
tokenizer.pad_token = tokenizer.eos_token

Reload Base Model

In [None]:
# Reload the quantized base checkpoint, placing the whole model on GPU 0.
# Mistral is implemented natively in transformers, so trust_remote_code is not
# needed and no code from the Hub repository is executed.
base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map={"": 0},
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Load LoRA adapters

In [None]:
# Attach the LoRA adapter saved in ADAPTER_DIR to the freshly loaded base model.
model = PeftModel.from_pretrained(base_model, ADAPTER_DIR)
# Switch to evaluation mode for inference. As the last expression of the cell,
# the returned module is also displayed as the cell output.
model.eval()

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): MistralForCausalLM(
      (model): MistralModel(
        (embed_tokens): Embedding(32000, 4096)
        (layers): ModuleList(
          (0-31): 32 x MistralDecoderLayer(
            (self_attn): MistralAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_pro

Test Inference

In [None]:
def generate_response(prompt, max_new_tokens=256, temperature=0.7, top_p=0.9):
    """
    Generate a response for a given instruction prompt.

    Uses the module-level `tokenizer` and `model`.

    Args:
        prompt: Instruction text. A bare instruction is wrapped in the
            Mistral-Instruct "[INST] ... [/INST]" template; a prompt that
            already starts with "[INST]" is used as-is (no double wrapping).
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature passed to generate().
        top_p: Nucleus-sampling threshold passed to generate().

    Returns:
        The generated continuation only, as a stripped string.
    """
    text = prompt.strip()

    # Ensure proper Mistral-Instruct formatting (avoid double wrap)
    if text.startswith("[INST]"):
        formatted_prompt = text
    else:
        formatted_prompt = f"[INST] {text} [/INST]"

    # Tokenize on the same device as the model
    inputs = tokenizer(formatted_prompt, return_tensors="pt").to(model.device)
    prompt_length = inputs["input_ids"].shape[1]

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            top_p=top_p,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

    # The returned sequence is the prompt tokens followed by the continuation.
    # Slice the prompt off by token count instead of splitting the decoded text
    # on "[/INST]": a text split picks the wrong boundary for multi-turn
    # prompts or prompts that contain the marker themselves.
    generated_tokens = outputs[0][prompt_length:]
    return tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()

In [None]:
# Smoke-test prompts for the fine-tuned model.
test_prompts = [
    "Write a short poem about machine learning.",
    "Explain quantum computing in simple terms.",
]

print("Testing fine-tuned model:\n")
# Examples are numbered from 1. The prompt is printed before generation starts
# so progress is visible while the model is still sampling.
for i, prompt in enumerate(test_prompts, start=1):
    print(f"Example {i}:")
    print(f"Prompt: {prompt}")

    response = generate_response(prompt)
    print(f"Response: {response}")

    # Separator line followed by a blank line between examples.
    print("-" * 80 + "\n")

Testing fine-tuned model:

Example 1:
Prompt: Write a short poem about machine learning.
Response: Machine learning, oh how grand,
A world of endless possibilities,
A world of endless data,
A world of endless knowledge.
--------------------------------------------------------------------------------

Example 2:
Prompt: Explain quantum computing in simple terms.
Response: Quantum computing is a form of computing that uses the principles of quantum mechanics to process information. Quantum computers are different from classical computers because they are able to perform operations on quantum states, which are more complex than the binary states used by classical computers. Quantum computers are able to perform certain tasks much faster than classical computers, which has the potential to revolutionize fields such as chemistry, materials science, and machine learning. However, quantum computers are still in their infancy and are not yet able to perform tasks that are impossible for classi