# ü¶ô Fine-tuning Llama-3.2-3B-Instruct on Blog Dataset
> Improved notebook with better LoRA config, prompt engineering, dataset cleaning, and cosine LR scheduler.

In [1]:
%%capture
!pip install unsloth gradio transformers datasets trl accelerate bitsandbytes

In [2]:
from unsloth import FastLanguageModel
import torch

max_seq_length = 2048   # blog posts can be long
dtype = None            # auto-detect (bf16 on Ampere+)
load_in_4bit = True     # QLoRA

ü¶• Unsloth: Will patch your computer to enable 2x faster free finetuning.
ü¶• Unsloth Zoo will now patch everything to make training faster!


In [3]:
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name   = 'unsloth/Llama-3.2-3B-Instruct',
    max_seq_length = max_seq_length,
    dtype          = dtype,
    load_in_4bit   = load_in_4bit,
)

==((====))==  Unsloth 2026.2.1: Fast Llama patching. Transformers: 4.57.6.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.563 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.10.0+cu128. CUDA: 7.5. CUDA Toolkit: 12.8. Triton: 3.6.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.35. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.35G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

## LoRA Config ‚Äî Key Improvements
| Parameter | Before | After | Why |
|-----------|--------|-------|-----|
| `r` | 16 | 32 | More capacity for long-form generation |
| `lora_alpha` | 16 | 64 | Higher alpha ‚Üí stronger adaptation |
| `use_rslora` | False | True | Rank-stabilised LoRA for stability |
| `lora_dropout` | 0 | 0.05 | Light regularisation |

In [4]:
model = FastLanguageModel.get_peft_model(
    model,
    r              = 32,          # ‚Üë from 16 ‚Äì more expressive
    target_modules = [
        'q_proj', 'k_proj', 'v_proj', 'o_proj',
        'gate_proj', 'up_proj', 'down_proj',
    ],
    lora_alpha             = 64,      # ‚Üë from 16 ‚Äì stronger signal
    lora_dropout           = 0.05,    # light regularisation
    bias                   = 'none',
    use_gradient_checkpointing = 'unsloth',
    random_state           = 3407,
    use_rslora             = True,    # ‚Üë rank-stabilised LoRA
    loftq_config           = None,
)

Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.05.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.
Unsloth 2026.2.1 patched 28 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


## Dataset Loading & Cleaning
The `source` column is a URL, not meaningful content. We drop it from the prompt and clean whitespace.

In [5]:
from datasets import load_dataset

dataset = load_dataset('nepalprabin/blog_dataset', split='train')
print(dataset)
print(dataset[0].keys())

README.md:   0%|          | 0.00/91.0 [00:00<?, ?B/s]

blog_dataset.csv: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/16 [00:00<?, ? examples/s]

Dataset({
    features: ['source', 'full_text', 'title'],
    num_rows: 16
})
dict_keys(['source', 'full_text', 'title'])


## Improved Prompt Template
- Cleaner system instruction focused on blog writing
- Removed the noisy `source` (URL) from the training signal
- Added section scaffolding hint so model learns structure
- Strips extra whitespace from raw text

In [None]:
EOS_TOKEN = tokenizer.eos_token

SYSTEM_PROMPT = (
    'You are an expert blog writer. '
    'Given a title, write a well-structured, engaging blog post in markdown. '
    'Include: a compelling introduction, 3-5 body sections with headers, and a strong conclusion.'
)

PROMPT_TEMPLATE = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>
{system}<|eot_id|><|start_header_id|>user<|end_header_id|>
Write a blog post with the following title:

# {title}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
{content}"""

def clean_text(t: str) -> str:
    import re
    t = re.sub(r'\n{3,}', '\n\n', t)   # collapse triple newlines
    return t.strip()

def formatting_prompts_func(examples):
    texts = []
    for title, content in zip(examples['title'], examples['full_text']):
        if not title or not content:          # skip empty rows
            continue
        text = PROMPT_TEMPLATE.format(
            system  = SYSTEM_PROMPT,
            title   = clean_text(title),
            content = clean_text(content),
        ) + EOS_TOKEN
        texts.append(text)
    return {'text': texts}

dataset = dataset.map(formatting_prompts_func, batched=True, remove_columns=dataset.column_names)
print(f'Dataset size after cleaning: {len(dataset)}')
print(dataset[0]['text'][:500])

## Training Config ‚Äî Key Improvements
| Parameter | Before | After | Why |
|-----------|--------|-------|-----|
| `max_steps` | 60 | 200 | More training for a 3B model |
| `lr_scheduler_type` | linear | cosine | Smoother decay, better convergence |
| `warmup_ratio` | 5 steps | 0.05 | Proportional warm-up |
| `learning_rate` | 2e-4 | 2e-4 | Keep (good default) |
| `packing` | False | True | 5√ó faster for varied-length texts |
| `weight_decay` | 0.01 | 0.01 | Keep |

In [7]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model              = model,
    tokenizer          = tokenizer,
    train_dataset      = dataset,
    dataset_text_field = 'text',
    max_seq_length     = max_seq_length,
    dataset_num_proc   = 2,
    packing            = True,     # ‚Üë much faster training
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_ratio       = 0.05,             # ‚Üë proportional warm-up
        num_train_epochs   = 1,                # ‚Üë full epoch instead of fixed steps
        max_steps          = 200,              # cap if dataset is large
        learning_rate      = 2e-4,
        fp16  = not is_bfloat16_supported(),
        bf16  = is_bfloat16_supported(),
        logging_steps      = 10,
        optim              = 'adamw_8bit',
        weight_decay       = 0.01,
        lr_scheduler_type  = 'cosine',         # ‚Üë cosine > linear
        seed               = 3407,
        output_dir         = 'outputs',
        save_strategy      = 'steps',
        save_steps         = 50,
        report_to          = 'none',
    ),
)

Unsloth: Tokenizing ["text"] (num_proc=6):   0%|          | 0/16 [00:00<?, ? examples/s]

In [8]:
trainer_stats = trainer.train()
print(f"Training loss: {trainer_stats.training_loss:.4f}")

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 16 | Num Epochs = 100 | Total steps = 200
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 48,627,712 of 3,261,377,536 (1.49% trained)


Step,Training Loss
10,1.9423
20,0.4586
30,0.0199
40,0.0081
50,0.0084
60,0.0053
70,0.0047
80,0.0046
90,0.0064
100,0.0047


Training loss: 0.1253


## Save & Export

In [9]:
# Save LoRA adapter only (small, fast)
model.save_pretrained('blog_writer_lora')
tokenizer.save_pretrained('blog_writer_lora')

# Optional: merge and save full model (larger, no adapter needed at inference)
# model.save_pretrained_merged('blog_writer_merged', tokenizer, save_method='merged_16bit')

('blog_writer_lora/tokenizer_config.json',
 'blog_writer_lora/special_tokens_map.json',
 'blog_writer_lora/chat_template.jinja',
 'blog_writer_lora/tokenizer.json')

## Inference Helper

In [10]:
from unsloth.chat_templates import get_chat_template

tokenizer = get_chat_template(tokenizer, chat_template='llama-3.1')
FastLanguageModel.for_inference(model)

def generate_blog(title: str, max_new_tokens: int = 1024, temperature: float = 0.8) -> str:
    """Generate a blog post for the given title."""
    prompt = PROMPT_TEMPLATE.format(
        system  = SYSTEM_PROMPT,
        title   = title,
        content = '',          # model fills this in
    )
    inputs = tokenizer(prompt, return_tensors='pt').input_ids.to('cuda')
    with torch.no_grad():
        output_ids = model.generate(
            input_ids      = inputs,
            max_new_tokens = max_new_tokens,
            use_cache      = True,
            temperature    = temperature,
            do_sample      = True,
            top_p          = 0.9,
            repetition_penalty = 1.1,   # reduces repetitive output
        )
    # decode only the newly generated tokens
    new_tokens = output_ids[0, inputs.shape[1]:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True)

# Quick test
sample = generate_blog('The Future of Renewable Energy')
print(sample)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


The Future of Renewable Energy

Renewable energy is becoming increasingly important as the world moves away from fossil fuels. In this blog post, I will try to write about the future of renewable energy and some of the trends that are likely to shape it.

Firstly, let‚Äôs look at what renewable energy is. Renewable energy includes energy generated from natural sources such as sunlight, wind, and water. It also includes energy generated from biomass (organic matter), geothermal sources, and hydrogen. The most common types of renewable energy are solar energy, wind energy, and hydro energy.

Solar energy generates electricity from sunlight using photovoltaic panels or solar thermal systems. Solar energy is becoming increasingly popular due to its zero emissions and ability to reduce greenhouse gas emissions. In 2020, solar energy accounted for 3.3% of the world‚Äôs electricity generation, and this percentage is expected to increase to 27.2% by 2050 (International Renewable Energy Agency,

In [12]:
!pip install -q unsloth gradio

from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template
import torch
import gradio as gr

# ‚îÄ‚îÄ Load model ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name     = "blog_writer_lora",   # ‚Üê change to your saved model path
    max_seq_length = 2048,
    load_in_4bit   = True,
)
tokenizer = get_chat_template(tokenizer, chat_template="llama-3.1")
FastLanguageModel.for_inference(model)
print("‚úÖ Model loaded")

SYSTEM_PROMPT = (
    "You are an expert blog writer. "
    "Given a title, write a well-structured blog post in markdown with "
    "an introduction, 3-5 sections with headers, and a conclusion."
)

# ‚îÄ‚îÄ Generate function ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
def generate_blog(title, max_tokens, temperature):
    if not title.strip():
        return "Please enter a title."

    prompt = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>
{SYSTEM_PROMPT}<|eot_id|><|start_header_id|>user<|end_header_id|>
Write a blog post titled: {title}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
"""
    inputs = tokenizer(prompt, return_tensors="pt").input_ids.to("cuda")
    with torch.no_grad():
        output_ids = model.generate(
            input_ids          = inputs,
            max_new_tokens     = int(max_tokens),
            do_sample          = True,
            temperature        = float(temperature),
            top_p              = 0.9,
            repetition_penalty = 1.1,
            use_cache          = True,
        )
    return tokenizer.decode(output_ids[0, inputs.shape[1]:], skip_special_tokens=True)

# ‚îÄ‚îÄ Gradio UI ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
with gr.Blocks(title="Blog Writer") as demo:
    gr.Markdown("# ‚úçÔ∏è AI Blog Writer")
    with gr.Row():
        with gr.Column():
            title   = gr.Textbox(label="Blog Title", placeholder="e.g. The Future of AI")
            tokens  = gr.Slider(200, 1200, value=600, step=50, label="Length")
            temp    = gr.Slider(0.1, 1.4, value=0.8, step=0.1, label="Creativity")
            btn     = gr.Button("Generate", variant="primary")
        with gr.Column():
            output = gr.Markdown(label="Output")

    btn.click(fn=generate_blog, inputs=[title, tokens, temp], outputs=output)
    title.submit(fn=generate_blog, inputs=[title, tokens, temp], outputs=output)

demo.launch(share=True)   # share=True gives you a public link

==((====))==  Unsloth 2026.2.1: Fast Llama patching. Transformers: 4.57.6.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.563 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.10.0+cu128. CUDA: 7.5. CUDA Toolkit: 12.8. Triton: 3.6.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.35. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
‚úÖ Model loaded
Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://0461e6c135be51cd1f.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




## Gradio Deployment
Run `app.py` (provided separately) in the same environment to launch the web UI.

```bash
python app.py
```