<a href="https://colab.research.google.com/github/rajan-bhateja/Tolkienizer/blob/master/llama_3_2_1b_instruct_fine_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Install Libraries & Frameworks

In [1]:
!pip install -q --no-deps unsloth bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo
!pip install -q sentencepiece protobuf "datasets>=3.4.1,<4.0.0" "huggingface_hub>=0.34.0" hf_transfer

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m52.3/52.3 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.4/43.4 MB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m311.7/311.7 kB[0m [31m21.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.3/61.3 MB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m511.9/511.9 kB[0m [31m41.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m184.8/184.8 kB[0m [31m17.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m28.4 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the foll

### Initialise the Model

In [2]:
from unsloth import FastLanguageModel
import torch
from google.colab import userdata

max_seq_length = 1024     # Choose any! We auto support RoPE Scaling internally!
dtype = None              # None for auto detection.
load_in_4bit = True       # Use QLoRA; False for LoRA

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Llama-3.2-1B-Instruct",    # using the Llama 3.2 1B Instruct Model
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    token = userdata.get('HF_ACCESS_TOKEN')
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


    PyTorch 2.6.0+cu124 with CUDA 1204 (you have 2.8.0+cu126)
    Python  3.12.9 (you have 3.12.11)
  Please reinstall xformers (see https://github.com/facebookresearch/xformers#installing-xformers)
  Memory-efficient attention, SwiGLU, sparse and more won't be available.
  Set XFORMERS_MORE_DETAILS=1 for more details


🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.8.9: Fast Llama patching. Transformers: 4.55.2.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/1.10G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

### LoRA Config

In [3]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 8,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2025.8.9 patched 16 layers with 16 QKV layers, 16 O layers and 16 MLP layers.


### Converting the dataset into Llama-supported format

In [4]:
import json

input_path = "/content/qa_format_quotes.jsonl"       # old dataset
output_path = "/content/quotes_chat.jsonl"           # new instruct-friendly dataset

with open(input_path, "r", encoding="utf-8") as infile, \
     open(output_path, "w", encoding="utf-8") as outfile:
    for line in infile:
        ex = json.loads(line)
        chat_example = {
            "messages": [
                {"role": "system", "content": "You are Tolkienizer, a model that rewrites text into J.R.R. Tolkien’s style."},
                {"role": "user", "content": ex["prompt"]},
                {"role": "assistant", "content": ex["completion"]}
            ]
        }
        outfile.write(json.dumps(chat_example, ensure_ascii=False) + "\n")

### Load Dataset

In [5]:
from datasets import load_dataset

dataset = load_dataset("json", data_files="quotes_chat.jsonl", split="train")

def format_example(example):
    return {
        "text": tokenizer.apply_chat_template(
            example["messages"],
            tokenize=False,
            add_generation_prompt=False
        )
    }

dataset = dataset.map(format_example, batched=True)
dataset

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/2556 [00:00<?, ? examples/s]

Dataset({
    features: ['messages', 'text'],
    num_rows: 2556
})

### Initialise the trainer

In [6]:
from trl import SFTConfig, SFTTrainer
from transformers import DataCollatorForSeq2Seq

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    data_collator = DataCollatorForSeq2Seq(tokenizer = tokenizer),
    packing = True, # Can make training 5x faster for short sequences.
    args = SFTConfig(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        num_train_epochs = 2,
        # max_steps = 60,
        learning_rate = 5e-4,
        logging_steps = 10,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,  # random state, could be any integer
        output_dir = "outputs",
        report_to = "none", # Use this for WandB etc
    ),
)

Unsloth: Tokenizing ["text"] (num_proc=2):   0%|          | 0/2556 [00:00<?, ? examples/s]

### Start Training

In [7]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 2,556 | Num Epochs = 2 | Total steps = 640
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 5,636,096 of 1,241,450,496 (0.45% trained)


Step,Training Loss
10,2.9687
20,1.9071
30,1.7113
40,1.6436
50,1.6971
60,1.6027
70,1.6459
80,1.6767
90,1.7033
100,1.5855


Unsloth: Will smartly offload gradients to save VRAM!


In [8]:
trainer_stats.metrics

{'train_runtime': 999.8816,
 'train_samples_per_second': 5.113,
 'train_steps_per_second': 0.64,
 'total_flos': 7949101399695360.0,
 'train_loss': 1.4760558873414993}

### Save the Model & Tokenizer

In [9]:
model.save_pretrained_merged("Tolkienizer-3.2-1B-Instruct", tokenizer, save_method="q4_0")
# tokenizer.save_pretrained("Tolkienizer-3.2-1B-Instruct", save_method="q4_0")

Found HuggingFace hub cache directory: /root/.cache/huggingface/hub
Checking cache directory for required files...
Cache check failed: model.safetensors not found in local cache.
Not all required files found in cache. Will proceed with downloading.


Unsloth: Merging weights into 16bit:   0%|          | 0/1 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

Unsloth: Merging weights into 16bit: 100%|██████████| 1/1 [02:51<00:00, 171.06s/it]


In [10]:
import os
from google.colab import files

directory_path = "/content/Tolkienizer-3.2-1B-Instruct"
zip_file_name = "tolkienizer_model.zip"
zip_file_path = os.path.join("/content", zip_file_name)

if os.path.isdir(directory_path):
    !zip -r "$zip_file_path" "$directory_path"
    files.download(zip_file_path)

else:
    print(f"Directory not found: {directory_path}")

  adding: content/Tolkienizer-3.2-1B-Instruct/ (stored 0%)
  adding: content/Tolkienizer-3.2-1B-Instruct/tokenizer_config.json (deflated 96%)
  adding: content/Tolkienizer-3.2-1B-Instruct/special_tokens_map.json (deflated 71%)
  adding: content/Tolkienizer-3.2-1B-Instruct/.cache/ (stored 0%)
  adding: content/Tolkienizer-3.2-1B-Instruct/.cache/huggingface/ (stored 0%)
  adding: content/Tolkienizer-3.2-1B-Instruct/.cache/huggingface/download/ (stored 0%)
  adding: content/Tolkienizer-3.2-1B-Instruct/.cache/huggingface/download/model.safetensors.lock (stored 0%)
  adding: content/Tolkienizer-3.2-1B-Instruct/.cache/huggingface/download/model.safetensors.metadata (deflated 31%)
  adding: content/Tolkienizer-3.2-1B-Instruct/.cache/huggingface/.gitignore (stored 0%)
  adding: content/Tolkienizer-3.2-1B-Instruct/tokenizer.json (deflated 85%)
  adding: content/Tolkienizer-3.2-1B-Instruct/chat_template.jinja (deflated 71%)
  adding: content/Tolkienizer-3.2-1B-Instruct/model.safetensors (deflate

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Inference will be done in another notebook, after downloading the model

### Run Inference

In [None]:
from unsloth import FastLanguageModel
from transformers import AutoTokenizer
import torch

model_path = "Tolkienizer-3.2-1B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = FastLanguageModel.from_pretrained(
    model_name = model_path,
    max_seq_length = 1024,
    dtype = None,
    load_in_4bit = True,
)

messages = [
    {"role": "system", "content": "You are Tolkienizer, a model that rewrites text into J.R.R. Tolkien’s style."},
    {"role": "user", "content": "Rewrite this in Tolkien’s style: The sea is vast and mysterious."},
]

inputs = tokenizer.apply_chat_template(messages, return_tensors="pt")

outputs = model.generate(inputs, max_new_tokens=200, do_sample=True, temperature=0.7, top_p=0.9)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

==((====))==  Unsloth 2025.8.7: Fast Llama patching. Transformers: 4.55.2.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


AttributeError: 'tuple' object has no attribute 'generate'