In [1]:
import json

# Path to your JSON file
json_path = "/content/interview_qa_dataset_350pairs_20250618_155016.json"

# Load the JSON data
with open(json_path, 'r') as f:
    data = json.load(f)

# Extract the list of Q&A pairs
qa_pairs = data["qa_pairs"]

# Convert to a list of tuples (question, answer)
dataset = [(item["question"], item["answer"]) for item in qa_pairs]


In [3]:
%%capture
import os
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1" huggingface_hub hf_transfer
    !pip install --no-deps unsloth

In [None]:
from unsloth import FastModel
import torch

model, tokenizer = FastModel.from_pretrained(
    model_name = "unsloth/gemma-3-1b-it",
    max_seq_length = 2048, # Choose any for long context!
    load_in_4bit = True,  # 4 bit quantization to reduce memory
    load_in_8bit = False, # [NEW!] A bit more accurate, uses 2x memory
    full_finetuning = False, # [NEW!] We have full finetuning now!
    # token = "hf_...", # use one if using gated models
)

==((====))==  Unsloth 2025.6.2: Fast Gemma3 patching. Transformers: 4.52.4.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: Using float16 precision for gemma3 won't work! Using float32.


model.safetensors:   0%|          | 0.00/1.00G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/233 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.16M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/35.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/670 [00:00<?, ?B/s]

We now add LoRA adapters so we only need to update a small amount of parameters!

In [6]:
model = FastModel.get_peft_model(
    model,
    finetune_vision_layers     = False, # Turn off for just text!
    finetune_language_layers   = True,  # Should leave on!
    finetune_attention_modules = True,  # Attention good for GRPO
    finetune_mlp_modules       = True,  # SHould leave on always!

    r = 8,           # Larger = higher accuracy, but might overfit
    lora_alpha = 8,  # Recommended alpha == r at least
    lora_dropout = 0,
    bias = "none",
    random_state = 3407,
)

Unsloth: Making `model.base_model.model.model` require gradients


In [7]:
from unsloth.chat_templates import get_chat_template
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "gemma-3",
)

In [9]:
from datasets import load_dataset, Dataset
import json

# Replace with the actual link to your dataset
dataset_link = "/content/interview_qa_dataset_350pairs_20250618_155016.json" # <-- **Please replace this with your dataset link**

# Load the JSON data
with open(dataset_link, 'r') as f:
    data = json.load(f)

# Restructure the data into a dictionary of lists
# This assumes each dictionary in data["qa_pairs"] has the same keys
if data and "qa_pairs" in data and isinstance(data["qa_pairs"], list):
    if data["qa_pairs"]: # Check if the list is not empty
        # Get all keys from the first dictionary to ensure consistency
        keys = data["qa_pairs"][0].keys()
        # Create the dictionary of lists
        dataset_dict = {key: [item.get(key) for item in data["qa_pairs"]] for key in keys}
        # Create the dataset from the dictionary
        dataset = Dataset.from_dict(dataset_dict)
    else:
        print("The 'qa_pairs' list is empty.")
        dataset = Dataset.from_dict({}) # Create an empty dataset
else:
    print("Data is not in the expected format (missing 'qa_pairs' key or not a list).")
    dataset = Dataset.from_dict({}) # Create an empty dataset


# The current formatting function expects a "conversations" key.
# Your dataset has "question" and "answer" keys.
# We need to adjust the formatting function or the dataset structure.
# The standardize_data_formats function in the next cell should handle this conversion.

# display(dataset)

In [10]:
from unsloth.chat_templates import standardize_data_formats
dataset = standardize_data_formats(dataset)

Let's see how row 100 looks like!

In [11]:
dataset[100]

{'id': 101,
 'topic': 'Tar/Gzip',
 'question': 'How do you create a compressed tar archive?',
 'answer': 'Use tar with the -czf flags to create a compressed gzip archive. -c creates archive, -z compresses with gzip, -f specifies filename.',
 'commands': 'tar -czf archive.tar.gz /path/to/directory/\ntar -czf backup.tar.gz file1.txt file2.txt',
 'source': 'https://stackoverflow.com/questions/18681595/tar-a-directory-but-dont-store-full-absolute-paths-in-the-archive',
 'timestamp': '2025-06-18T15:50:12.595748'}

In [16]:
def formatting_prompts_func(examples):
    # Get lists of questions and answers from the batch
    questions = examples["question"]
    answers = examples["answer"]

    # Construct conversation strings for each example in the batch
    # This assumes a simple turn structure: user question followed by model answer
    # Adjust this if your chat template or data structure is different
    convos = [{"role": "user", "content": q} for q in questions]
    # Interleave user and model turns - this might need adjustment based on template
    # A more robust approach for multiple turns would be needed for complex data
    # For this dataset structure (Q&A), we can create a list of message dictionaries
    # for each example, containing the user query and the model response.
    messages = []
    for q, a in zip(questions, answers):
        messages.append([
            {"role": "user", "content": q},
            {"role": "model", "content": a}
        ])

    # Apply the chat template to each list of messages in the batch
    # The tokenizer's apply_chat_template expects a list of message dictionaries
    texts = [tokenizer.apply_chat_template(convo, tokenize=False, add_generation_prompt=False).removeprefix('<bos>') for convo in messages]

    # Return the processed texts under the 'text' key
    return { "text" : texts }

# Apply the formatting function to the dataset in batches
dataset = dataset.map(formatting_prompts_func, batched = True)

Map:   0%|          | 0/350 [00:00<?, ? examples/s]

Let's see how the chat template did! Notice there is no `<bos>` token as the processor tokenizer will be adding one.

In [17]:
dataset[100]["text"]

'<start_of_turn>user\nHow do you create a compressed tar archive?<end_of_turn>\n<start_of_turn>model\nUse tar with the -czf flags to create a compressed gzip archive. -c creates archive, -z compresses with gzip, -f specifies filename.<end_of_turn>\n'

In [18]:
from trl import SFTTrainer, SFTConfig
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    eval_dataset = None, # Can set up evaluation!
    args = SFTConfig(
        dataset_text_field = "text",
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4, # Use GA to mimic batch size!
        warmup_steps = 5,
        # num_train_epochs = 1, # Set this for 1 full training run.
        max_steps = 30,
        learning_rate = 2e-4, # Reduce to 2e-5 for long training runs
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        report_to = "none", # Use this for WandB etc
        dataset_num_proc=2,
    ),
)

Unsloth: Switching to float32 training since model cannot work with float16


Unsloth: Tokenizing ["text"] (num_proc=2):   0%|          | 0/350 [00:00<?, ? examples/s]

In [19]:
from unsloth.chat_templates import train_on_responses_only
trainer = train_on_responses_only(
    trainer,
    instruction_part = "<start_of_turn>user\n",
    response_part = "<start_of_turn>model\n",
)

Map (num_proc=2):   0%|          | 0/350 [00:00<?, ? examples/s]

Let's verify masking the instruction part is done! Let's print the 100th row again.  Notice how the sample only has a single `<bos>` as expected!

In [20]:
tokenizer.decode(trainer.train_dataset[100]["input_ids"])

'<bos><start_of_turn>user\nHow do you create a compressed tar archive?<end_of_turn>\n<start_of_turn>model\nUse tar with the -czf flags to create a compressed gzip archive. -c creates archive, -z compresses with gzip, -f specifies filename.<end_of_turn>\n'

Now let's print the masked out example - you should see only the answer is present:

In [21]:
tokenizer.decode([tokenizer.pad_token_id if x == -100 else x for x in trainer.train_dataset[100]["labels"]]).replace(tokenizer.pad_token, " ")

'                  Use tar with the -czf flags to create a compressed gzip archive. -c creates archive, -z compresses with gzip, -f specifies filename.<end_of_turn>\n'

In [22]:
# @title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = Tesla T4. Max memory = 14.741 GB.
1.512 GB of memory reserved.


Let's train the model! To resume a training run, set `trainer.train(resume_from_checkpoint = True)`

In [23]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 350 | Num Epochs = 1 | Total steps = 30
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 6,522,880/1,000,000,000 (0.65% trained)
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss
1,7.3048
2,4.9116
3,5.4406
4,5.4872
5,4.5323
6,3.7263
7,3.0842
8,2.6855
9,2.1703
10,2.3507


In [24]:
# @title Show final memory and time stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(
    f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training."
)
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

147.6455 seconds used for training.
2.46 minutes used for training.
Peak reserved memory = 1.512 GB.
Peak reserved memory for training = 0.0 GB.
Peak reserved memory % of max memory = 10.257 %.
Peak reserved memory for training % of max memory = 0.0 %.


<a name="Inference"></a>
### Inference
Let's run the model via Unsloth native inference! According to the `Gemma-3` team, the recommended settings for inference are `temperature = 1.0, top_p = 0.95, top_k = 64`

In [28]:
from unsloth.chat_templates import get_chat_template
from transformers import TextStreamer
import torch # Import torch to convert token IDs to tensor

tokenizer = get_chat_template(
    tokenizer,
    chat_template = "gemma-3",
)

# Get a question from the dataset (e.g., the 100th example)
dataset_question = dataset[100]["question"]

messages = [{
    "role": "user",
    "content": [{
        "type" : "text",
        "text" : dataset_question, # Use the question from the dataset
    }]
}]
text = tokenizer.apply_chat_template(
    messages,
    tokenize = False, # Keep as string for now
    add_generation_prompt = True, # Must add for generation
)

# Tokenize the formatted string to get a list of token IDs
input_ids_list = tokenizer.encode(text)

# Convert the list of token IDs to a PyTorch tensor and add a batch dimension
input_ids = torch.tensor([input_ids_list]).to("cuda")

outputs = model.generate(
    input_ids, # Pass the tokenized input tensor
    max_new_tokens = 64, # Increase for longer outputs!
    # Recommended Gemma-3 settings!
    temperature = 1.0, top_p = 0.95, top_k = 64,
)
tokenizer.batch_decode(outputs)

['<bos><bos><start_of_turn>user\nHow do you create a compressed tar archive?<end_of_turn>\n<start_of_turn>model\nUsing gzip for compression with tar, can be very useful for reducing the archive size, but you can use bzip2 compression as well.\n\nUsing gzip is generally recommended because of its better compression ratio.\n\nUse -z for gzip and -k for keep a certain number of files. use -v for verbose output']

 You can also use a `TextStreamer` for continuous inference - so you can see the generation token by token, instead of waiting the whole time!

In [30]:
model.save_pretrained("gemma-3")  # Local saving
tokenizer.save_pretrained("gemma-3")
# model.push_to_hub("HF_ACCOUNT/gemma-3", token = "...") # Online saving
# tokenizer.push_to_hub("HF_ACCOUNT/gemma-3", token = "...") # Online saving

('gemma-3/tokenizer_config.json',
 'gemma-3/special_tokens_map.json',
 'gemma-3/chat_template.jinja',
 'gemma-3/tokenizer.model',
 'gemma-3/added_tokens.json',
 'gemma-3/tokenizer.json')

In [31]:
# Zip the gemma-3 folder
!zip -r gemma-3.zip gemma-3/

# Instructions for downloading
print("The folder 'gemma-3' has been zipped into 'gemma-3.zip'.")
print("You can download 'gemma-3.zip' from the files section in the left sidebar.")

  adding: gemma-3/ (stored 0%)
  adding: gemma-3/tokenizer_config.json (deflated 97%)
  adding: gemma-3/adapter_model.safetensors (deflated 8%)
  adding: gemma-3/special_tokens_map.json (deflated 77%)
  adding: gemma-3/README.md (deflated 66%)
  adding: gemma-3/tokenizer.model (deflated 52%)
  adding: gemma-3/chat_template.jinja (deflated 70%)
  adding: gemma-3/added_tokens.json (stored 0%)
  adding: gemma-3/tokenizer.json (deflated 83%)
  adding: gemma-3/adapter_config.json (deflated 57%)
The folder 'gemma-3' has been zipped into 'gemma-3.zip'.
You can download 'gemma-3.zip' from the files section in the left sidebar.
