In [1]:
%%capture
import os
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1,<4.0.0" "huggingface_hub>=0.34.0" hf_transfer
    !pip install --no-deps unsloth

In [2]:
%%capture
# Install latest transformers for Gemma 3N
!pip install --no-deps --upgrade transformers # Only for Gemma 3N
!pip install --no-deps --upgrade timm # Only for Gemma 3N

In [3]:
from unsloth import FastModel
import torch

fourbit_models = [
    # 4bit dynamic quants for superior accuracy and low memory use
    "unsloth/gemma-3n-E4B-it-unsloth-bnb-4bit",
    "unsloth/gemma-3n-E2B-it-unsloth-bnb-4bit",
    # Pretrained models
    "unsloth/gemma-3n-E4B-unsloth-bnb-4bit",
    "unsloth/gemma-3n-E2B-unsloth-bnb-4bit",

    # Other Gemma 3 quants
    "unsloth/gemma-3-1b-it-unsloth-bnb-4bit",
    "unsloth/gemma-3-4b-it-unsloth-bnb-4bit",
    "unsloth/gemma-3-12b-it-unsloth-bnb-4bit",
    "unsloth/gemma-3-27b-it-unsloth-bnb-4bit",
] # More models at https://huggingface.co/unsloth

model, tokenizer = FastModel.from_pretrained(
    model_name = "unsloth/gemma-3n-E4B-it", # Or "unsloth/gemma-3n-E2B-it"
    dtype = None, # None for auto detection
    max_seq_length = 1024, # Choose any for long context!
    load_in_4bit = True,  # 4 bit quantization to reduce memory
    full_finetuning = False, # [NEW!] We have full finetuning now!
    # token = "hf_...", # use one if using gated models
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


2025-08-01 01:17:56.538441: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1754011076.755290      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1754011076.813921      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.7.11: Fast Gemma3N patching. Transformers: 4.54.1.
   \\   /|    Tesla T4. Num GPUs = 2. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: Gemma3N does not support SDPA - switching to eager!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/3.72G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/1.15G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/210 [00:00<?, ?B/s]

processor_config.json:   0%|          | 0.00/98.0 [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

preprocessor_config.json: 0.00B [00:00, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/4.70M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/777 [00:00<?, ?B/s]

In [4]:
from transformers import TextStreamer
import gc
# Helper function for inference
def do_gemma_3n_inference(model, messages, max_new_tokens = 128):
    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt = True, # Must add for generation
        tokenize = True,
        return_dict = True,
        return_tensors = "pt",
    ).to("cuda")
    _ = model.generate(
        **inputs,
        max_new_tokens = max_new_tokens,
        temperature = 1.0, top_p = 0.95, top_k = 64,
        streamer = TextStreamer(tokenizer, skip_prompt = True),
    )
    # Cleanup to reduce VRAM usage
    del inputs
    torch.cuda.empty_cache()
    gc.collect()

In [6]:
model = FastModel.get_peft_model(
    model,
    finetune_vision_layers     = False, # Turn off for just text!
    finetune_language_layers   = True,  # Should leave on!
    finetune_attention_modules = True,  # Attention good for GRPO
    finetune_mlp_modules       = True,  # Should leave on always!

    r = 8,           # Larger = higher accuracy, but might overfit
    lora_alpha = 8,  # Recommended alpha == r at least
    lora_dropout = 0,
    bias = "none",
    random_state = 3407,
)

Unsloth: Making `model.base_model.model.model.language_model` require gradients


In [7]:
from unsloth.chat_templates import get_chat_template
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "gemma-3",
)

In [9]:
%%capture 
pip install python-docx

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [10]:
from docx import Document
import re
from collections import defaultdict

# === Load document ===
doc = Document("/kaggle/input/age-8-10/Ages 8-10.docx")  # Update this path

# === Setup ===
topics = defaultdict(list)
current_topic = None
current_question = None
current_answer = []

# Topics start with capital letters and possibly "&"
def is_topic_header(text):
    return bool(re.match(r"^[A-Z][a-zA-Z& ]+$", text)) and not text.endswith("?")

def clean_answer(text):
    text = text.replace("Sparky's Answer:", "").strip()
    text = re.split(r"Wow! Fact:.*?Wow!$", text, flags=re.DOTALL)[0].strip()
    return text

# === Main parse loop ===
for para in doc.paragraphs:
    text = para.text.strip()
    if not text:
        continue

    # Section/topic header
    if is_topic_header(text):
        current_topic = text

    # Question
    elif text.endswith("?") and "Sparky's Answer" not in text:
        if current_question and current_answer:
            messages = [
                {"role": "user", "content": current_question},
                {"role": "assistant", "content": clean_answer(" ".join(current_answer))}
            ]
            topics[current_topic].append({"messages": messages})
            current_answer = []
        current_question = text

    # Answer continuation
    elif "Sparky's Answer:" in text or current_answer:
        current_answer.append(text)

# Save last Q&A
if current_question and current_answer:
    messages = [
        {"role": "user", "content": current_question},
        {"role": "assistant", "content": clean_answer(" ".join(current_answer))}
    ]
    topics[current_topic].append({"messages": messages})

# === Outputs ===
# 1. Flat list for fine-tuning
finetune_data = [ex for topic in topics.values() for ex in topic]

# 2. Show available topics and counts
print("Available topics:")
for topic, examples in topics.items():
    print(f"- {topic}: {len(examples)} examples")

Available topics:
- Human Body & Health: 60 examples
- Animals & Nature: 14 examples
- Earth & Space: 44 examples
- Physics & Chemistry: 22 examples
- Technology: 13 examples
- Food & Nutrition: 9 examples
- Language & Culture: 6 examples
- Math & Logic: 2 examples
- Sports & Recreation: 8 examples
- Time & Holidays: 14 examples
- Places & Geography: 11 examples
- Myths & Legends: 3 examples


In [12]:
from datasets import Dataset
# Create a Hugging Face Dataset
dataset = Dataset.from_list(finetune_data)

# Optional: create a train/test split
dataset_split = dataset.train_test_split(test_size=0.2)

In [13]:
from unsloth.chat_templates import standardize_data_formats
dataset = standardize_data_formats(dataset)

In [14]:
dataset[100]

{'messages': [{'content': 'What is the thermosphere?', 'role': 'user'},
  {'content': 'The thermosphere is like the "super-hot, top layer" of Earth\'s atmosphere, way, way up high! It\'s where the air is very, very thin, but it gets super hot because it\'s the first part of our atmosphere to absorb the sun\'s energy. Even though it\'s hot, you wouldn\'t feel warm there because the air particles are so far apart!',
   'role': 'assistant'}]}

In [16]:
def formatting_prompts_func(examples):
   convos = examples["messages"]
   texts = [tokenizer.apply_chat_template(convo, tokenize = False, add_generation_prompt = False).removeprefix('<bos>') for convo in convos]
   return { "text" : texts, }

dataset = dataset.map(formatting_prompts_func, batched = True)

Map:   0%|          | 0/206 [00:00<?, ? examples/s]

In [17]:
dataset[100]["text"]

'<start_of_turn>user\nWhat is the thermosphere?<end_of_turn>\n<start_of_turn>model\nThe thermosphere is like the "super-hot, top layer" of Earth\'s atmosphere, way, way up high! It\'s where the air is very, very thin, but it gets super hot because it\'s the first part of our atmosphere to absorb the sun\'s energy. Even though it\'s hot, you wouldn\'t feel warm there because the air particles are so far apart!<end_of_turn>\n'

Train the Model

In [18]:
from trl import SFTTrainer, SFTConfig
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    eval_dataset = None, # Can set up evaluation!
    args = SFTConfig(
        dataset_text_field = "text",
        per_device_train_batch_size = 1,
        gradient_accumulation_steps = 4, # Use GA to mimic batch size!
        warmup_steps = 5,
        # num_train_epochs = 1, # Set this for 1 full training run.
        max_steps = 60,
        learning_rate = 2e-4, # Reduce to 2e-5 for long training runs
        logging_steps = 1,
        optim = "paged_adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        report_to = "none", # Use this for WandB etc
    ),
)

Unsloth: Tokenizing ["text"] (num_proc=2):   0%|          | 0/206 [00:00<?, ? examples/s]

To resume a training run, set trainer.train(resume_from_checkpoint = True)

In [19]:
from unsloth.chat_templates import train_on_responses_only
trainer = train_on_responses_only(
    trainer,
    instruction_part = "<start_of_turn>user\n",
    response_part = "<start_of_turn>model\n",
)

Map (num_proc=4):   0%|          | 0/206 [00:00<?, ? examples/s]

In [20]:
tokenizer.decode(trainer.train_dataset[100]["input_ids"])

'<bos><start_of_turn>user\nWhat is the thermosphere?<end_of_turn>\n<start_of_turn>model\nThe thermosphere is like the "super-hot, top layer" of Earth\'s atmosphere, way, way up high! It\'s where the air is very, very thin, but it gets super hot because it\'s the first part of our atmosphere to absorb the sun\'s energy. Even though it\'s hot, you wouldn\'t feel warm there because the air particles are so far apart!<end_of_turn>\n'

In [21]:
tokenizer.decode([tokenizer.pad_token_id if x == -100 else x for x in trainer.train_dataset[100]["labels"]]).replace(tokenizer.pad_token, " ")

'               The thermosphere is like the "super-hot, top layer" of Earth\'s atmosphere, way, way up high! It\'s where the air is very, very thin, but it gets super hot because it\'s the first part of our atmosphere to absorb the sun\'s energy. Even though it\'s hot, you wouldn\'t feel warm there because the air particles are so far apart!<end_of_turn>\n'

In [22]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 206 | Num Epochs = 3 | Total steps = 60
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 19,210,240 of 7,869,188,432 (0.24% trained)
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,12.5307
2,12.1779
3,12.4916
4,12.6356
5,13.9792
6,11.7182
7,12.2681
8,13.6253
9,13.2534
10,13.5413


In [24]:
from unsloth.chat_templates import get_chat_template
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "gemma-3",
)
messages = [{
    "role": "user",
    "content": [{
        "type" : "text",
        "text" : "How do birds know where to fly when they migrate?",
    }]
}]
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt = True, # Must add for generation
    return_tensors = "pt",
    tokenize = True,
    return_dict = True,
).to("cuda")
outputs = model.generate(
    **inputs,
    max_new_tokens = 64, # Increase for longer outputs!
    # Recommended Gemma-3 settings!
    temperature = 1.0, top_p = 0.95, top_k = 64,
)
tokenizer.batch_decode(outputs)

["<bos><start_of_turn>user\nHow do birds know where to fly when they migrate?<end_of_turn>\n<start_of_turn>model\nBirds have amazing navigation skills! They use a combination of things to know where to fly during migration:\n\n* **The Sun:** They can tell which direction is east and west using the sun's position.\n* **The Stars:** At night, they use the stars to guide them.\n* **The Earth"]

In [25]:
model.save_pretrained("gemma-3n")  # Local saving
tokenizer.save_pretrained("gemma-3n")
# model.push_to_hub("HF_ACCOUNT/gemma-3", token = "...") # Online saving
# tokenizer.push_to_hub("HF_ACCOUNT/gemma-3", token = "...") # Online saving

['gemma-3n/processor_config.json']