# Fine-tuning TinyLlama for Conversational Chat

This notebook demonstrates how to fine-tune the TinyLlama model (1.1B parameters) for conversational chat tasks. TinyLlama is extremely lightweight and can run on even modest GPU hardware.

In [1]:
# Install required packages
!pip install -q unsloth
!pip install -q datasets
!pip install -q accelerate>=0.24.1
!pip install -q bitsandbytes>=0.41.1
!pip install -q peft>=0.6.0
!pip install -q trl>=0.7.6

# Verify GPU availability
import torch
print("CUDA available:", torch.cuda.is_available())
print("CUDA device count:", torch.cuda.device_count())
if torch.cuda.is_available():
    print("GPU Memory:", torch.cuda.get_device_properties(0).total_memory / 1e9, "GB")

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.2/46.2 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m192.7/192.7 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m23.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m162.1/162.1 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m85.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m71.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m54.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

## Loading TinyLlama with Unsloth

TinyLlama is a distilled model with only 1.1B parameters, making it much less resource-intensive than larger models while still maintaining decent performance for simpler tasks.

In [3]:
from unsloth import FastLanguageModel
import torch
from datasets import load_dataset

# Set a small sequence length to reduce memory requirements
max_seq_length = 512  # Smaller context window helps with memory usage

# Load the TinyLlama model with Unsloth optimizations
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    max_seq_length=max_seq_length,
    dtype=torch.float16,  # Use float16 for memory efficiency
    load_in_4bit=True,    # 4-bit quantization for minimal memory usage
)

print(f"Model loaded with fp16 precision and max sequence length of {max_seq_length}")


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
Unsloth: Failed to patch Gemma3ForConditionalGeneration.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.3.19: Fast Llama patching. Transformers: 4.51.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/762M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.37k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/438 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

Model loaded with fp16 precision and max sequence length of 512


## Loading a Conversational Dataset

For fine-tuning a chat model, we'll use a small subset of the OpenAssistant dataset, which contains high-quality conversational data in the instruction format.

In [4]:
# Load a small conversational dataset
dataset = load_dataset("OpenAssistant/oasst1", split="train")
print(f"Dataset loaded with {len(dataset)} examples")

# Let's filter for English language and select a small subset
english_dataset = dataset.filter(lambda example: example["lang"] == "en")
small_dataset = english_dataset.select(range(1000))  # Just use 1000 examples for quick training
print(f"Using {len(small_dataset)} examples for fine-tuning")

# Preview a sample
print("Sample data point:")
print(small_dataset[0])

README.md:   0%|          | 0.00/10.2k [00:00<?, ?B/s]

(…)-00000-of-00001-b42a775f407cee45.parquet:   0%|          | 0.00/39.5M [00:00<?, ?B/s]

(…)-00000-of-00001-134b8fd0c89408b6.parquet:   0%|          | 0.00/2.08M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/84437 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/4401 [00:00<?, ? examples/s]

Dataset loaded with 84437 examples


Filter:   0%|          | 0/84437 [00:00<?, ? examples/s]

Using 1000 examples for fine-tuning
Sample data point:
{'message_id': '6ab24d72-0181-4594-a9cd-deaf170242fb', 'parent_id': None, 'user_id': 'c3fe8c76-fc30-4fa7-b7f8-c492f5967d18', 'created_date': '2023-02-05T14:23:50.983374+00:00', 'text': 'Can you write a short introduction about the relevance of the term "monopsony" in economics? Please use examples related to potential monopsonies in the labour market and cite relevant research.', 'role': 'prompter', 'lang': 'en', 'review_count': 3, 'review_result': True, 'deleted': False, 'rank': None, 'synthetic': False, 'model_name': None, 'detoxify': {'toxicity': 0.00044308538781479, 'severe_toxicity': 3.252684837207198e-05, 'obscene': 0.00023475120542570949, 'identity_attack': 0.0001416115992469713, 'insult': 0.00039489680784754455, 'threat': 4.075629112776369e-05, 'sexual_explicit': 2.712695459194947e-05}, 'message_tree_id': '6ab24d72-0181-4594-a9cd-deaf170242fb', 'tree_state': 'ready_for_export', 'emojis': {'name': ['+1', '_skip_reply', '_ski

## Preparing the Dataset with TinyLlama's Chat Template

We need to format our conversational data according to TinyLlama's expected chat template. This model follows the Llama 2 chat format with slight modifications.

In [5]:
# Function to format examples according to TinyLlama's chat template
def format_tinyllama_prompt(example):
    # TinyLlama chat format is similar to Llama 2
    system_prompt = "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature."

    # Extract message content
    message = example["text"]

    # Format with Llama-2 style chat template
    prompt = f"<s>[INST] <<SYS>>\n{system_prompt}\n<</SYS>>\n\n{message} [/INST]"

    # For oasst data, we're using the "text" field which contains the human query
    # But we need to add in a response, so we'll use a generic helpful response
    response = "I'll do my best to help you with that."

    return {
        "text": prompt + " " + response + "</s>"
    }

# Apply formatting to our dataset
formatted_dataset = small_dataset.map(format_tinyllama_prompt)

# Show an example of formatted data
print("Formatted example:")
print(formatted_dataset[0]["text"])

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Formatted example:
<s>[INST] <<SYS>>
You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.
<</SYS>>

Can you write a short introduction about the relevance of the term "monopsony" in economics? Please use examples related to potential monopsonies in the labour market and cite relevant research. [/INST] I'll do my best to help you with that.</s>


## Setting Up LoRA Parameters for TinyLlama Fine-tuning

Now we'll configure the LoRA (Low-Rank Adaptation) parameters for efficient fine-tuning and prepare the training arguments. Even though TinyLlama is already small, using LoRA makes fine-tuning even more efficient.

In [6]:
# Add LoRA adapters to the model
model = FastLanguageModel.get_peft_model(
    model,
    r=8,               # Lower rank for TinyLlama is sufficient
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_alpha=16,     # Alpha parameter for LoRA scaling
    lora_dropout=0.05  # Lower dropout for smaller model
)

# Set up the training arguments (optimized for smaller GPUs)
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./tinyllama_chat_assistant",
    num_train_epochs=1,               # Just 1 epoch for quick training
    per_device_train_batch_size=8,    # Can use larger batch size with smaller model
    gradient_accumulation_steps=2,    # Fewer steps needed due to larger batch size
    learning_rate=3e-4,               # Slightly higher learning rate for smaller model
    weight_decay=0.01,                # Weight decay for regularization
    warmup_steps=10,                  # Warmup steps
    logging_steps=10,                 # How often to log during training
    save_steps=200,                   # Save checkpoint every 200 steps
    gradient_checkpointing=True,      # Enable gradient checkpointing to save memory
    fp16=True,                        # Use mixed precision for faster training
    max_grad_norm=0.3,                # Gradient clipping
    optim="adamw_torch"               # Optimizer
)

Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.05.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.
Unsloth 2025.3.19 patched 22 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


In [11]:
# Load the alpaca dataset which has a cleaner structure for instruction tuning
dataset = load_dataset("tatsu-lab/alpaca", split="train")
print(f"Alpaca dataset loaded with {len(dataset)} examples")

# Let's use a small subset for quick training
small_dataset = dataset.select(range(500))
print(f"Using {len(small_dataset)} examples for fine-tuning")

# Show a sample to understand the structure
print("Sample data point:")
print(small_dataset[0])

# Now let's create a proper formatting function for TinyLlama chat format
def format_alpaca_for_tinyllama(example):
    instruction = example["instruction"]
    input_text = example["input"] if example["input"] else ""
    output = example["output"]

    # Format with TinyLlama's chat template (based on Llama-2)
    if input_text:
        prompt = f"<s>[INST] {instruction}\n\n{input_text} [/INST]"
    else:
        prompt = f"<s>[INST] {instruction} [/INST]"

    # Return both prompt and completion for proper training
    return {
        "prompt": prompt,
        "completion": f" {output}</s>"
    }

# Apply the formatting
formatted_dataset = small_dataset.map(format_alpaca_for_tinyllama)

# Show a formatted example
print("Formatted example:")
print(formatted_dataset[0])

# Create text-only examples with format expected by unsloth
def create_training_example(example):
    return {"text": example["prompt"] + example["completion"]}

train_dataset = formatted_dataset.map(create_training_example)
print("Training example:")
print(train_dataset[0]["text"])

Alpaca dataset loaded with 52002 examples
Using 500 examples for fine-tuning
Sample data point:
{'instruction': 'Give three tips for staying healthy.', 'input': '', 'output': '1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule.', 'text': 'Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nGive three tips for staying healthy.\n\n### Response:\n1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule.'}
Formatted example:
{'instruction': 'Give three tips for staying healthy.', 'input': '', 'output': '1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your bod

## Fine-tuning TinyLlama with the Properly Formatted Dataset

Now that we have a correctly formatted dataset, let's train the model using Unsloth's optimized training process.

In [10]:
# Initialize the trainer with our properly formatted dataset
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    packing=False  # Keep this False for simplicity
)

# Train the model
trainer.train()

# Save the trained model
output_dir = "./tinyllama_chat_assistant_final"
trainer.save_model(output_dir)
print(f"Model saved to {output_dir}")

Unsloth: Tokenizing ["text"] (num_proc=2):   0%|          | 0/500 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 500 | Num Epochs = 1 | Total steps = 31
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 2
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 2 x 1) = 16
 "-____-"     Trainable parameters = 6,307,840/4,000,000,000 (0.16% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
10,2.0913
20,1.6618
30,1.4988


Model saved to ./tinyllama_chat_assistant_final


## Testing the Fine-tuned TinyLlama Model

Let's test our fine-tuned TinyLlama model with some conversational prompts to see how it performs after training.

In [12]:
# Load the fine-tuned model
fine_tuned_model, fine_tuned_tokenizer = FastLanguageModel.from_pretrained(
    model_name="./tinyllama_chat_assistant_final",
    max_seq_length=max_seq_length,
    dtype=torch.float16,
    load_in_4bit=True
)

# Function to generate responses with TinyLlama's chat format
def generate_tinyllama_response(instruction, input_text=""):
    # Format the prompt according to TinyLlama's chat template
    if input_text:
        prompt = f"<s>[INST] {instruction}\n\n{input_text} [/INST]"
    else:
        prompt = f"<s>[INST] {instruction} [/INST]"

    inputs = fine_tuned_tokenizer(prompt, return_tensors="pt").to(fine_tuned_model.device)

    outputs = fine_tuned_model.generate(
        **inputs,
        max_new_tokens=256,
        temperature=0.7,
        top_p=0.9,
        repetition_penalty=1.2,
        do_sample=True
    )

    response = fine_tuned_tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Extract only the assistant's response by removing the prompt
    response = response.replace(prompt, "").strip()

    return response

# Test with a few example questions
test_questions = [
    "What's the best way to learn programming?",
    "Tell me a short story about friendship.",
    "Explain the concept of machine learning in simple terms."
]

for question in test_questions:
    print(f"\nQuestion: {question}")
    response = generate_tinyllama_response(question)
    print(f"Response: {response}")

==((====))==  Unsloth 2025.3.19: Fast Llama patching. Transformers: 4.51.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!

Question: What's the best way to learn programming?
Response: [INST] What's the best way to learn programming? [/INST] The best way to learn programming is by doing it. Practice, practice, and practice some more! There are many resources available online for learning how to code, but don’t just take my word for it; read reviews from people who have learned through practical experience instead of books or videos.

Question: Tell me a short story about friendship.
Response: [INST] Tell me a short story about friendship. [/INST] Samantha a