In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created: January 2026
Author: LT3
Description: Notebook for fine-tuning an LLM (Mistral) for machine translation using QLoRA.
"""


'\nCreated: January 2026\nAuthor: Thomas Moerman\nDescription: Notebook for fine-tuning an LLM (Mistral) for machine translation using QLoRA.\n'

# Fine-tuning an LLM for Machine Translation

This notebook walks you through **fine-tuning a pretrained LLM** (Mistral-7B-Instruct) for machine translation using the ðŸ¤— Hugging Face ecosystem and parameter-efficient fine-tuning (PEFT).

## What you'll learn
- How to download and prepare parallel translation data
- How to format data for instruction-tuned models using chat templates
- How to use **QLoRA** (Quantized Low-Rank Adaptation) for efficient fine-tuning
- How to train with the `SFTTrainer` from TRL
- How to run inference with your fine-tuned translation model

## Why QLoRA?
Fine-tuning a 7B parameter model would normally require significant GPU memory. QLoRA combines:
- **4-bit quantization**: Reduces memory footprint dramatically
- **LoRA adapters**: Only trains a small number of additional parameters

This allows fine-tuning large models on consumer GPUs!


## 1) Imports + Environment Check

We use:
- **transformers**: models, tokenizers, and training utilities
- **datasets**: for loading and processing data
- **peft**: Parameter-Efficient Fine-Tuning (LoRA)
- **trl**: Transformer Reinforcement Learning (SFTTrainer)
- **bitsandbytes**: 4-bit quantization

We'll also set a random seed for reproducibility.


In [2]:
import os
import json
import torch
import pandas as pd
from datasets import load_dataset, Dataset

from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    TrainingArguments,
    set_seed,
)

from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training,
    PeftModel,
    PeftConfig,
)

from trl import SFTTrainer, SFTConfig

print('PyTorch:', torch.__version__)
print('CUDA available:', torch.cuda.is_available())
if torch.cuda.is_available():
    print('GPU:', torch.cuda.get_device_name(0))
    print('GPU Memory:', round(torch.cuda.get_device_properties(0).total_memory / 1024**3, 1), 'GB')

set_seed(42)


PyTorch: 2.9.1+cu128
CUDA available: True
GPU: Tesla V100-SXM2-16GB
GPU Memory: 15.8 GB


## 2) Download the Translation Data

We'll use the English-French translation dataset from Hugging Face.

The `download_data.py` script downloads the data and saves it as text files:
- `en_train.txt` / `fr_train.txt`: Training parallel sentences
- `en_validation.txt` / `fr_validation.txt`: Validation parallel sentences
- `en_test.txt` / `fr_test.txt`: Test parallel sentences

You can run this from the command line:
```bash
python download_data.py --repo_name LT3/nfr_bt_nmt_english-french --base_path data/en-fr
```

Or run it directly in the notebook:


In [3]:
# Configuration
REPO_NAME = "LT3/nfr_bt_nmt_english-french"
DATA_PATH = "data/en-fr"

# Download the data (run once)
from download_data import download_and_save_dataset

if not os.path.exists(DATA_PATH):
    dataset_paths = download_and_save_dataset(REPO_NAME, DATA_PATH)
else:
    print(f"Data already exists at {DATA_PATH}")


Data already exists at data/en-fr


## 3) Load and Prepare the Data

We'll load the parallel text files and create training examples.

For **instruction-tuned models**, we need to format the data as a conversation:
- **User**: Provides the source sentence with translation instruction
- **Assistant**: Provides the translation

For this tutorial, we'll use a small subset:
- **Train**: 2,000 examples
- **Validation**: 500 examples
- **Test**: 10 examples (for quick inference testing)


In [4]:
def load_parallel_data(en_path, fr_path, max_samples=None):
    """Load parallel English-French sentence pairs from text files."""
    with open(en_path, 'r', encoding='utf-8') as f:
        en_sentences = [line.strip() for line in f if line.strip()]
    with open(fr_path, 'r', encoding='utf-8') as f:
        fr_sentences = [line.strip() for line in f if line.strip()]
    
    # Ensure same length
    min_len = min(len(en_sentences), len(fr_sentences))
    en_sentences = en_sentences[:min_len]
    fr_sentences = fr_sentences[:min_len]
    
    if max_samples:
        en_sentences = en_sentences[:max_samples]
        fr_sentences = fr_sentences[:max_samples]
    
    return en_sentences, fr_sentences

# Load the data
print("Loading training data...")
en_train, fr_train = load_parallel_data(
    f"{DATA_PATH}/en_train.txt",
    f"{DATA_PATH}/fr_train.txt",
    max_samples=2000  # Use 2k for tutorial
)

print("Loading validation data...")
en_val, fr_val = load_parallel_data(
    f"{DATA_PATH}/en_validation.txt",
    f"{DATA_PATH}/fr_validation.txt",
    max_samples=500  # Use 500 for tutorial
)

print("Loading test data...")
en_test, fr_test = load_parallel_data(
    f"{DATA_PATH}/en_test.txt",
    f"{DATA_PATH}/fr_test.txt",
    max_samples=10  # Use 10 for quick testing
)

print(f"\nDataset sizes:")
print(f"  Train: {len(en_train)} pairs")
print(f"  Validation: {len(en_val)} pairs")
print(f"  Test: {len(en_test)} pairs")

# Preview some examples
print("\n--- Sample training pairs ---")
for i in range(3):
    print(f"EN: {en_train[i]}")
    print(f"FR: {fr_train[i]}")
    print()


Loading training data...
Loading validation data...
Loading test data...

Dataset sizes:
  Train: 2000 pairs
  Validation: 500 pairs
  Test: 10 pairs

--- Sample training pairs ---
EN: Article 199b is replaced by the following:
FR: l'articleÂ 199Â ter est remplacÃ© par le texte suivant:

EN: at consular offices:
FR: dans les bureaux consulaires:

EN: The Portuguese authorities have explained that this public interest mission was entrusted to the private sector in accordance with Decree-Law No 197/99 of 8 June 1999 [7], which transposed into national law European Parliament and Council Directive 97/52/EC of 13 October 1997 amending Directives 92/50/EEC, 93/36/EEC and 93/37/EEC concerning the coordination of procedures for the award of public service contracts, public supply contracts and public works contracts respectively [8].
FR: Les autoritÃ©s portugaises ont prÃ©cisÃ© que cette mission dâ€™intÃ©rÃªt public avait Ã©tÃ© attribuÃ©e au secteur privÃ©, dans le respect des prescriptions Ã

## 4) Load the Model and Tokenizer

We'll use **Mistral-7B-Instruct-v0.3**, a powerful instruction-tuned model.

### Key configurations:
- **4-bit quantization** (NF4): Dramatically reduces memory usage
- **Double quantization**: Further memory savings
- **bfloat16 compute**: For stable training

> **Note**: You may need to accept the model's terms on Hugging Face and set your token.


In [5]:
# Model configuration
MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.3"
OUTPUT_DIR = "../models/mistral-translation-en-fr"

# Optional: Set your Hugging Face token if the model requires authentication
# import huggingface_hub
# huggingface_hub.login(token="your_token_here")


In [6]:
# Configure 4-bit quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

print(f"Loading model: {MODEL_NAME}")
print("This may take a few minutes...")

# Load the model with quantization
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    quantization_config=bnb_config,
    use_cache=False,  # Disable cache for training
    trust_remote_code=True,
)

print(f"Model loaded successfully!")
print(f"Model dtype: {model.dtype}")


Loading model: mistralai/Mistral-7B-Instruct-v0.3
This may take a few minutes...


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Model loaded successfully!
Model dtype: torch.float16


In [7]:
# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    add_bos_token=True,
    add_eos_token=False,  # SFTTrainer adds EOS when packing=True
    trust_remote_code=True,
)

# Set padding token (required for batched training)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"  # Right padding for causal LM

print(f"Tokenizer loaded!")
print(f"Vocab size: {tokenizer.vocab_size}")
print(f"BOS token: {tokenizer.bos_token}")
print(f"EOS token: {tokenizer.eos_token}")
print(f"PAD token: {tokenizer.pad_token}")


Tokenizer loaded!
Vocab size: 32768
BOS token: <s>
EOS token: </s>
PAD token: </s>


## 5) Format Data with Chat Template

Instruction-tuned models expect data in a specific chat format. We use the tokenizer's `apply_chat_template()` method to format our translation examples correctly.

### Training format:
```
[INST] Translate this sentence from English to French:
English: <source sentence>
French: [/INST] <target sentence>
```

### Inference format (no target):
```
[INST] Translate this sentence from English to French:
English: <source sentence>
French: [/INST]
```


In [8]:
def format_translation_example(en_sentence, fr_sentence=None, tokenizer=None, for_training=True):
    """
    Format a translation example using the model's chat template.
    
    Args:
        en_sentence: English source sentence
        fr_sentence: French target sentence (None for inference)
        tokenizer: The tokenizer with chat template
        for_training: If True, include the target translation
    
    Returns:
        Formatted string ready for the model
    """
    # Create the user message with translation instruction
    user_message = f"""Translate this sentence from English to French:
English: {en_sentence}
French:"""
    
    # Build the chat
    chat = [{"role": "user", "content": user_message}]
    
    # Add assistant response for training
    if for_training and fr_sentence:
        chat.append({"role": "assistant", "content": fr_sentence})
    
    # Apply the chat template
    formatted = tokenizer.apply_chat_template(
        chat,
        tokenize=False,
        add_generation_prompt=not for_training  # Add prompt for inference
    )
    
    return formatted


def create_dataset(en_sentences, fr_sentences, tokenizer, for_training=True):
    """
    Create a HuggingFace Dataset from parallel sentences.
    """
    formatted_examples = []
    
    for en, fr in zip(en_sentences, fr_sentences):
        formatted = format_translation_example(
            en, fr, tokenizer, for_training=for_training
        )
        formatted_examples.append(formatted)
    
    return Dataset.from_dict({"text": formatted_examples})


# Create datasets
print("Creating training dataset...")
train_dataset = create_dataset(en_train, fr_train, tokenizer, for_training=True)

print("Creating validation dataset...")
val_dataset = create_dataset(en_val, fr_val, tokenizer, for_training=True)

print(f"\nDataset sizes:")
print(f"  Train: {len(train_dataset)}")
print(f"  Validation: {len(val_dataset)}")


Creating training dataset...
Creating validation dataset...

Dataset sizes:
  Train: 2000
  Validation: 500


In [9]:
# Preview a formatted training example
print("=" * 60)
print("FORMATTED TRAINING EXAMPLE:")
print("=" * 60)
print(train_dataset[0]["text"])
print("=" * 60)


FORMATTED TRAINING EXAMPLE:
<s>[INST] Translate this sentence from English to French:
English: Article 199b is replaced by the following:
French:[/INST] l'articleÂ 199Â ter est remplacÃ© par le texte suivant:</s>


In [10]:
# Preview how inference prompts look (no target)
inference_example = format_translation_example(
    en_test[0], None, tokenizer, for_training=False
)
print("=" * 60)
print("FORMATTED INFERENCE EXAMPLE:")
print("=" * 60)
print(inference_example)
print("=" * 60)


FORMATTED INFERENCE EXAMPLE:
<s>[INST] Translate this sentence from English to French:
English: Carrying out the information procedures laid down under Article 5(4) of the Schengen Borders Code and the consultation procedures laid down under Article 25 of the Schengen Convention, falls within the competences of the authorities responsible for border controls and issuing residence permits or visas.
French:[/INST]


## 6) Configure LoRA (Low-Rank Adaptation)

LoRA adds small trainable matrices to the model's attention layers. This allows us to:
- Train only ~0.1% of the parameters
- Keep the original model weights frozen
- Save only the small adapter weights

### Key LoRA hyperparameters:
- **r (rank)**: Size of the low-rank matrices (higher = more capacity, more memory)
- **lora_alpha**: Scaling factor for LoRA weights
- **lora_dropout**: Dropout for regularization


In [11]:
# Configure LoRA
# Note: We only define the config here. SFTTrainer will apply it to the model automatically.
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,  # Rank of the update matrices
    bias="none",
    task_type="CAUSAL_LM",
    # Target the attention layers
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
)

# Prepare model for k-bit training (enables gradient checkpointing, etc.)
model = prepare_model_for_kbit_training(model)

# IMPORTANT: Don't call get_peft_model() here!
# SFTTrainer will apply the peft_config when we pass it to the trainer.
# If you manually apply PEFT here AND pass peft_config to SFTTrainer, you'll get an error.

print("LoRA configuration defined.")
print(f"  Rank (r): {peft_config.r}")
print(f"  Alpha: {peft_config.lora_alpha}")
print(f"  Target modules: {peft_config.target_modules}")
print("\nSFTTrainer will apply LoRA during initialization.")


LoRA configuration defined.
  Rank (r): 64
  Alpha: 16
  Target modules: {'o_proj', 'v_proj', 'k_proj', 'q_proj'}

SFTTrainer will apply LoRA during initialization.


## 7) Training Configuration

We use the `SFTTrainer` (Supervised Fine-Tuning Trainer) from TRL, which is designed for instruction tuning.

### Key training hyperparameters:
- **learning_rate**: 2e-4 is a good starting point for LoRA
- **batch_size**: Adjust based on your GPU memory
- **num_train_epochs**: 1-3 epochs usually sufficient
- **max_seq_length**: Maximum sequence length (including prompt + translation)
- **packing**: Combines short examples to maximize GPU utilization


In [None]:
# Maximum sequence length for training
MAX_LENGTH = 512  # Increase if your translations are longer

# Training configuration using SFTConfig (combines TrainingArguments + SFT-specific settings)
sft_config = SFTConfig(
    output_dir=OUTPUT_DIR,
    
    # Training hyperparameters
    num_train_epochs=1,
    per_device_train_batch_size=4,  # Reduce if OOM
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,  # Effective batch size = 4 * 4 = 16
    
    # Optimizer settings
    learning_rate=2e-4,
    weight_decay=0.01,
    warmup_steps=50,
    lr_scheduler_type="constant",
    
    # Precision
    bf16=True,  # Use bfloat16 for stable training
    
    # Logging and saving
    logging_dir=os.path.join(OUTPUT_DIR, "logs"),
    logging_steps=5,
    save_strategy="epoch",
    eval_strategy="epoch",
    
    # Other settings
    report_to="none",  # Change to "wandb" for experiment tracking
    save_total_limit=2,  # Keep only last 2 checkpoints
    
    # SFT-specific settings
    max_length=MAX_LENGTH,  # Maximum length of tokenized sequences
    packing=True,  # Pack multiple examples into one sequence for efficiency
    dataset_text_field="text",  # Column name containing the text data
)

print("Training configuration:")
print(f"  Epochs: {sft_config.num_train_epochs}")
print(f"  Batch size: {sft_config.per_device_train_batch_size}")
print(f"  Gradient accumulation: {sft_config.gradient_accumulation_steps}")
print(f"  Effective batch size: {sft_config.per_device_train_batch_size * sft_config.gradient_accumulation_steps}")
print(f"  Learning rate: {sft_config.learning_rate}")
print(f"  Max length: {sft_config.max_length}")
print(f"  Packing: {sft_config.packing}")


Training configuration:
  Epochs: 1
  Batch size: 4
  Gradient accumulation: 4
  Effective batch size: 16
  Learning rate: 0.0002
  Max length: 512
  Packing: True


In [13]:
# Initialize the SFTTrainer
# Note: In newer TRL versions, max_seq_length, packing, and dataset_text_field
# are passed via SFTConfig instead of directly to SFTTrainer
trainer = SFTTrainer(
    model=model,
    args=sft_config,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    peft_config=peft_config,
    processing_class=tokenizer,  # 'tokenizer' is deprecated, use 'processing_class'
)

print("Trainer initialized successfully!")


Padding-free training is enabled, but the attention implementation is not set to a supported flash attention variant. Padding-free training flattens batches into a single sequence, and only the following implementations are known to reliably support this: flash_attention_2, flash_attention_3, kernels-community/flash-attn2, kernels-community/flash-attn3, kernels-community/vllm-flash-attn3. Using other implementations may lead to unexpected behavior. To ensure compatibility, set `attn_implementation` in the model configuration to one of these supported options or verify that your attention mechanism can handle flattened sequences.
You are using packing, but the attention implementation is not set to a supported flash attention variant. Packing gathers multiple samples into a single sequence, and only the following implementations are known to reliably support this: flash_attention_2, flash_attention_3, kernels-community/flash-attn2, kernels-community/flash-attn3, kernels-community/vllm-f

Adding EOS to train dataset:   0%|          | 0/2000 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/2000 [00:00<?, ? examples/s]

Packing train dataset:   0%|          | 0/2000 [00:00<?, ? examples/s]

Adding EOS to eval dataset:   0%|          | 0/500 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/500 [00:00<?, ? examples/s]

Packing eval dataset:   0%|          | 0/500 [00:00<?, ? examples/s]

Trainer initialized successfully!


In [14]:
# Preview what the tokenized data looks like (with packing)
print("Preview of tokenized and packed data:")
print("-" * 60)
sample_decoded = tokenizer.decode(trainer.train_dataset["input_ids"][0][:200])
print(sample_decoded + "...")


Preview of tokenized and packed data:
------------------------------------------------------------
<s><s>[INST] Translate this sentence from English to French:
English: Having regard to Commission Regulation (EC) No 1410/1999 of 29 June 1999 amending Regulation (EC) No 2808/98 laying down detailed rules for the application of the agrimonetary system for the euro in agriculture and amending the definition of certain operative events provided for in regulations (EEC) No 3889/87, (EEC) No 3886/92, (EEC) No 1793/93, (EEC) No 2700/93 and (EC) No 293/98 [4], and in particular Article 2 thereof,
French:[/INST] vu le rÃ¨glement (CE) no 1410/1999 de...


## 8) Train the Model

Now let's fine-tune! Training time depends on:
- Dataset size
- GPU speed
- Batch size

With 2,000 examples and 1 epoch, this should take ~10-20 minutes on a semi-decent GPU.


In [None]:
# Start training!
print("Starting training...")
print("=" * 60)

trainer.train()

print("=" * 60)
print("Training complete!")


The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 2}.


Starting training...


Epoch,Training Loss,Validation Loss


In [None]:
# Save the training logs
logs = trainer.state.log_history
logs_path = os.path.join(OUTPUT_DIR, "training_logs.json")

os.makedirs(OUTPUT_DIR, exist_ok=True)
with open(logs_path, "w") as f:
    json.dump(logs, f, indent=2)

print(f"Training logs saved to: {logs_path}")

# Print final metrics
print("\nFinal training metrics:")
for log in logs:
    if "loss" in log:
        print(f"  Step {log.get('step', 'N/A')}: loss = {log['loss']:.4f}")


## 9) Save the Model

We save both:
- The LoRA adapter weights (small, ~100MB)
- The tokenizer

The adapter can be loaded on top of the base model for inference.


In [None]:
# Save the fine-tuned model
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

print(f"Model saved to: {OUTPUT_DIR}")
print(f"\nSaved files:")
for f in os.listdir(OUTPUT_DIR):
    size = os.path.getsize(os.path.join(OUTPUT_DIR, f))
    print(f"  {f}: {size / 1024 / 1024:.2f} MB")


## 10) Inference: Generate Translations

Now let's test our fine-tuned model! We'll:
1. Load the base model
2. Load our LoRA adapter
3. Generate translations for test sentences


In [None]:
# For inference, we can reload the model fresh
# (or continue using the trained model from memory)

# Option 1: Use the model from memory (already loaded)
inference_model = model
inference_tokenizer = tokenizer

# Option 2: Load from disk (uncomment if needed)
# from peft import PeftModel, PeftConfig
# 
# # Load the PEFT config to get base model name
# peft_config = PeftConfig.from_pretrained(OUTPUT_DIR)
# 
# # Load base model
# base_model = AutoModelForCausalLM.from_pretrained(
#     peft_config.base_model_name_or_path,
#     device_map="auto",
#     torch_dtype=torch.bfloat16,
# )
# 
# # Load the LoRA adapter
# inference_model = PeftModel.from_pretrained(base_model, OUTPUT_DIR)
# inference_tokenizer = AutoTokenizer.from_pretrained(OUTPUT_DIR)

print("Model ready for inference!")


In [None]:
def translate(english_text, model, tokenizer, max_new_tokens=100):
    """
    Translate English text to French using the fine-tuned model.
    
    Args:
        english_text: The English sentence to translate
        model: The fine-tuned model
        tokenizer: The tokenizer
        max_new_tokens: Maximum tokens to generate
    
    Returns:
        The French translation
    """
    # Format the prompt
    prompt = format_translation_example(
        english_text, None, tokenizer, for_training=False
    )
    
    # Tokenize
    inputs = tokenizer(prompt, return_tensors="pt", add_special_tokens=True)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    
    # Generate
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,  # Greedy decoding for deterministic output
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )
    
    # Decode only the generated part
    generated_tokens = outputs[0][inputs["input_ids"].shape[1]:]
    translation = tokenizer.decode(generated_tokens, skip_special_tokens=True)
    
    return translation.strip()


In [None]:
# Test on our held-out test examples
print("=" * 70)
print("TRANSLATION RESULTS ON TEST SET")
print("=" * 70)

for i, (en, fr_ref) in enumerate(zip(en_test, fr_test)):
    fr_pred = translate(en, inference_model, inference_tokenizer)
    
    print(f"\n--- Example {i+1} ---")
    print(f"English:    {en}")
    print(f"Reference:  {fr_ref}")
    print(f"Predicted:  {fr_pred}")


In [None]:
# Try your own sentences!
custom_sentences = [
    "The weather is beautiful today.",
    "I love learning new languages.",
    "Machine translation has improved significantly in recent years.",
    "Can you help me find the train station?",
]

print("=" * 70)
print("CUSTOM TRANSLATIONS")
print("=" * 70)

for en in custom_sentences:
    fr = translate(en, inference_model, inference_tokenizer)
    print(f"\nEN: {en}")
    print(f"FR: {fr}")


## (Optional) Quick BLEU Evaluation

Here's how to compute BLEU score on your test set:


In [None]:
# Optional: Compute BLEU score
# Uncomment and run if you have sacrebleu installed
# pip install sacrebleu

# import sacrebleu
# 
# # Generate translations for test set
# predictions = [translate(en, inference_model, inference_tokenizer) for en in en_test]
# 
# # Compute BLEU
# bleu = sacrebleu.corpus_bleu(predictions, [fr_test])
# print(f"BLEU score: {bleu.score:.2f}")
