In [1]:
!pip install -q -U torch transformers peft datasets bitsandbytes trl wandb

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m899.7/899.7 MB[0m [31m760.3 kB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m594.3/594.3 MB[0m [31m790.6 kB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.2/10.2 MB[0m [31m139.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.0/88.0 MB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m954.8/954.8 kB[0m [31m68.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.1/193.1 MB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m78.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.6/63.6 MB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import torch
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from trl import SFTTrainer

# Base checkpoint to fine-tune; Qwen was chosen for its math/reasoning ability.
model_name = "Qwen/Qwen2.5-0.5B-Instruct"

# Match the quantized-matmul compute dtype to the training precision: the trainer
# runs with bf16, so use bf16 here when the GPU supports it and fall back to fp16
# otherwise, instead of always computing in fp16.
compute_dtype = (
    torch.bfloat16
    if torch.cuda.is_available() and torch.cuda.is_bf16_supported()
    else torch.float16
)

# 4-bit NF4 quantization config (shrinks the weights so the model fits in Colab memory).
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
)

In [3]:
print("Loading model...")
# Load the base model with the 4-bit quantization config; device_map="auto"
# delegates device placement to the library.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Only fall back to EOS when the tokenizer ships without a pad token. Overwriting
# an existing pad token with EOS makes padding indistinguishable from the
# end-of-turn marker, which padding-aware collators may then mask out of the loss.
# NOTE(review): this checkpoint is expected to define its own pad token — confirm
# with `tokenizer.pad_token`.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

Loading model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/659 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/988M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

In [4]:
# Fetch the GSM8K "main" training split and keep only its first 500 rows for speed.
dataset = (
    load_dataset("gsm8k", "main", split="train")
    .select(range(500))
)

# Format it so the model learns: Question -> Reasoning -> Answer
def format_prompts(examples):
    """Render a batch of question/answer rows as single-turn chat training strings.

    Args:
        examples: Batched mapping with parallel "question" and "answer" sequences.

    Returns:
        A dict with a single "text" key holding one formatted string per
        question/answer pair (pairs are formed with zip, so extra items in the
        longer sequence are dropped).
    """
    # Hand-written approximation of Qwen's chat markup: one user turn, one assistant turn.
    pairs = zip(examples["question"], examples["answer"])
    return {
        "text": [
            f"<|im_start|>user\n{q}<|im_end|>\n<|im_start|>assistant\n{a}<|im_end|>"
            for q, a in pairs
        ]
    }

# Add the formatted "text" column in batched mode, then print the first rendered row as a sanity check.
dataset = dataset.map(function=format_prompts, batched=True)
sample_text = dataset[0]["text"]
print(f"Sample data: {sample_text}")

README.md: 0.00B [00:00, ?B/s]

main/train-00000-of-00001.parquet:   0%|          | 0.00/2.31M [00:00<?, ?B/s]

main/test-00000-of-00001.parquet:   0%|          | 0.00/419k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7473 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1319 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Sample data: <|im_start|>user
Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?<|im_end|>
<|im_start|>assistant
Natalia sold 48/2 = <<48/2=24>>24 clips in May.
Natalia sold 48+24 = <<48+24=72>>72 clips altogether in April and May.
#### 72<|im_end|>


In [5]:
# LoRA adapter settings: adapters are attached to the attention projection layers only.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],  # query/key/value/output projections
    r=16,  # adapter rank: higher gives more capacity but costs more compute
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

In [1]:
import torch
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from trl import SFTTrainer

# Base checkpoint to fine-tune; Qwen was chosen for its math/reasoning ability.
model_name = "Qwen/Qwen2.5-0.5B-Instruct"

# Match the quantized-matmul compute dtype to the training precision: the trainer
# runs with bf16, so use bf16 here when the GPU supports it and fall back to fp16
# otherwise, instead of always computing in fp16.
compute_dtype = (
    torch.bfloat16
    if torch.cuda.is_available() and torch.cuda.is_bf16_supported()
    else torch.float16
)

# 4-bit NF4 quantization config (shrinks the weights so the model fits in Colab memory).
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
)

In [2]:
print("Loading model...")
# Load the base model with the 4-bit quantization config; device_map="auto"
# delegates device placement to the library.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Only fall back to EOS when the tokenizer ships without a pad token. Overwriting
# an existing pad token with EOS makes padding indistinguishable from the
# end-of-turn marker, which padding-aware collators may then mask out of the loss.
# NOTE(review): this checkpoint is expected to define its own pad token — confirm
# with `tokenizer.pad_token`.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

Loading model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [3]:
# Fetch the GSM8K "main" training split and keep only its first 500 rows for speed.
dataset = (
    load_dataset("gsm8k", "main", split="train")
    .select(range(500))
)

# Format it so the model learns: Question -> Reasoning -> Answer
def format_prompts(examples):
    """Turn a batch of question/answer rows into single-turn chat training strings.

    Args:
        examples: Batched mapping with parallel "question" and "answer" sequences.

    Returns:
        A dict whose "text" entry lists one formatted string per question/answer
        pair; pairing stops at the shorter of the two sequences.
    """
    def render(question, answer):
        # Roughly follows Qwen's chat markup: a user turn followed by the assistant turn.
        return f"<|im_start|>user\n{question}<|im_end|>\n<|im_start|>assistant\n{answer}<|im_end|>"

    return {"text": list(map(render, examples["question"], examples["answer"]))}

# Add the formatted "text" column in batched mode, then print the first rendered row as a sanity check.
dataset = dataset.map(function=format_prompts, batched=True)
sample_text = dataset[0]["text"]
print(f"Sample data: {sample_text}")

Sample data: <|im_start|>user
Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?<|im_end|>
<|im_start|>assistant
Natalia sold 48/2 = <<48/2=24>>24 clips in May.
Natalia sold 48+24 = <<48+24=72>>72 clips altogether in April and May.
#### 72<|im_end|>


In [4]:
# LoRA adapter settings: adapters are attached to the attention projection layers only.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],  # query/key/value/output projections
    r=16,  # adapter rank: higher gives more capacity but costs more compute
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

In [1]:
import torch
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from trl import SFTTrainer

# Base checkpoint to fine-tune; Qwen was chosen for its math/reasoning ability.
model_name = "Qwen/Qwen2.5-0.5B-Instruct"

# Match the quantized-matmul compute dtype to the training precision: the trainer
# runs with bf16, so use bf16 here when the GPU supports it and fall back to fp16
# otherwise, instead of always computing in fp16.
compute_dtype = (
    torch.bfloat16
    if torch.cuda.is_available() and torch.cuda.is_bf16_supported()
    else torch.float16
)

# 4-bit NF4 quantization config (shrinks the weights so the model fits in Colab memory).
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
)

In [2]:
print("Loading model...")
# Load the base model with the 4-bit quantization config; device_map="auto"
# delegates device placement to the library.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Only fall back to EOS when the tokenizer ships without a pad token. Overwriting
# an existing pad token with EOS makes padding indistinguishable from the
# end-of-turn marker, which padding-aware collators may then mask out of the loss.
# NOTE(review): this checkpoint is expected to define its own pad token — confirm
# with `tokenizer.pad_token`.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

Loading model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [3]:
# Fetch the GSM8K "main" training split and keep only its first 500 rows for speed.
dataset = (
    load_dataset("gsm8k", "main", split="train")
    .select(range(500))
)

# Format it so the model learns: Question -> Reasoning -> Answer
def format_prompts(examples):
    """Build single-turn chat training strings from a batch of question/answer rows.

    Args:
        examples: Batched mapping with parallel "question" and "answer" sequences.

    Returns:
        A dict with key "text" mapping to the list of formatted strings, one per
        zipped question/answer pair.
    """
    # Approximates Qwen's chat markup by hand: user turn, then the assistant's answer.
    rendered = [
        f"<|im_start|>user\n{question}<|im_end|>\n<|im_start|>assistant\n{answer}<|im_end|>"
        for question, answer in zip(examples["question"], examples["answer"])
    ]
    return {"text": rendered}

# Add the formatted "text" column in batched mode, then print the first rendered row as a sanity check.
dataset = dataset.map(function=format_prompts, batched=True)
sample_text = dataset[0]["text"]
print(f"Sample data: {sample_text}")

Sample data: <|im_start|>user
Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?<|im_end|>
<|im_start|>assistant
Natalia sold 48/2 = <<48/2=24>>24 clips in May.
Natalia sold 48+24 = <<48+24=72>>72 clips altogether in April and May.
#### 72<|im_end|>


In [4]:
# LoRA adapter settings: adapters are attached to the attention projection layers only.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],  # query/key/value/output projections
    r=16,  # adapter rank: higher gives more capacity but costs more compute
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

In [7]:
from trl import SFTConfig, SFTTrainer

# Training configuration. The SFT-specific options (text column, sequence length,
# packing) are passed as SFTConfig fields here rather than as SFTTrainer arguments.
training_args = SFTConfig(
    output_dir="./results",
    # data handling
    dataset_text_field="text",
    max_length=512,
    packing=False,
    # optimisation schedule
    max_steps=60,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    logging_steps=10,
    # precision: bf16 on, fp16 explicitly off
    bf16=True,
    fp16=False,
)

# The trainer receives the quantized base model plus the LoRA config defined earlier.
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    peft_config=peft_config,
)

print("Starting training...")
trainer.train()
print("Training complete!")



Starting training...


Step,Training Loss
10,1.0512
20,0.8498
30,0.7673
40,0.7098
50,0.7159
60,0.6818


Training complete!


In [10]:
# 1. Switch the model to inference: re-enable the KV cache and enter eval mode.
model.config.use_cache = True
model.eval()

# 2. Ask a question intended to be outside the training subset.
question = "If a train travels 60 miles in 2 hours, and then 100 miles in 3 hours, what is its average speed for the whole trip?"

# Format it like the training data, leaving the assistant turn open for the model to complete.
input_text = f"<|im_start|>user\n{question}<|im_end|>\n<|im_start|>assistant\n"
# Move the inputs to the device the model was actually placed on (device_map="auto")
# instead of assuming "cuda".
inputs = tokenizer(input_text, return_tensors="pt").to(model.device)

# 3. Generate the answer.
print("Thinking...")
outputs = model.generate(
    **inputs,
    max_new_tokens=200,    # Let it write a long explanation
    do_sample=True,        # temperature only takes effect when sampling is enabled
    temperature=0.7,       # Add a little creativity
)

# 4. Decode the whole returned sequence (prompt + completion), dropping special tokens.
answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("\n=== MODEL ANSWER ===")
print(answer)

Thinking...

=== MODEL ANSWER ===
user
If a train travels 60 miles in 2 hours, and then 100 miles in 3 hours, what is its average speed for the whole trip?
assistant
The total distance traveled by the train is 60 + 100 = <<60+100=160>>160 miles.
The total time taken to travel this distance is 2 + 3 = <<2+3=5>>5 hours.
Therefore, the average speed of the train for the whole trip is 160 / 5 = <<160/5=32>>32 miles per hour.
#### 32
