# Import/install

In [1]:
!pip install vllm
!pip install datasets
!pip install flash-attn --no-build-isolation
!pip install peft
!pip install trl
!pip install bitsandbytes
!pip install accelerate


Collecting vllm
  Downloading vllm-0.10.0-cp38-abi3-manylinux1_x86_64.whl.metadata (14 kB)
Collecting regex (from vllm)
  Downloading regex-2025.7.34-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (40 kB)
Collecting cachetools (from vllm)
  Downloading cachetools-6.1.0-py3-none-any.whl.metadata (5.4 kB)
Collecting sentencepiece (from vllm)
  Downloading sentencepiece-0.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.7 kB)
Collecting tqdm (from vllm)
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting blake3 (from vllm)
  Downloading blake3-1.0.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.2 kB)
Collecting py-cpuinfo (from vllm)
  Downloading py_cpuinfo-9.0.0-py3-none-any.whl.metadata (794 bytes)
Collecting transformers>=4.53.2 (from vllm)
  Downloading transformers-4.54.1-py3-none-any.whl.metadata (41 kB)
Collecting huggingface-hub>=0.33.0 (from huggingface-hub[hf_xet]>=0

In [2]:
!nvidia-smi

Sun Aug  3 06:00:37 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.127.05             Driver Version: 550.127.05     CUDA Version: 12.8     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100-SXM4-80GB          On  |   00000000:4A:00.0 Off |                    0 |
| N/A   29C    P0             61W /  400W |       1MiB /  81920MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                

In [3]:
from huggingface_hub import login
login("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")

# Main code

In [4]:
# Full script for fine-tuning Mistral-Nemo-Instruct-2407 for Role-Playing
# using the official Mistral-Instruct chat template.

# =====================================================================================
# Step 0: Install Dependencies
# =====================================================================================
# !pip install -qU "transformers==4.43.3" "datasets==2.20.0" "accelerate==0.32.0" "bitsandbytes==0.43.1" "peft==0.11.1" "trl==0.9.4"
# !pip install -qU "flash-attn==2.6.2" --no-build-isolation

import torch
import os
import re
from datasets import load_dataset, concatenate_datasets, Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer
import warnings

# Suppress warnings for a cleaner output
warnings.filterwarnings("ignore")

# =====================================================================================
# 1. Configuration
# =====================================================================================
# Model and tokenizer names
base_model_name = "mistralai/Mistral-Nemo-Instruct-2407"
# Name for the LoRA adapter directory
new_model_name = "Mistral-Nemo-2407-LORA-combineddata-8ac-lr7e6-2ep"
# Name for the final merged model directory
final_model_name = "Mistral-Nemo-2407-LORA-combineddata-8ac-lr7e6-2ep"


In [5]:
import json
# workspace/combined_data_old_format.jsonl
with open("combined_data_old_format.jsonl", "r") as f:
    all_examples = [json.loads(line) for line in f]

unified_dataset = Dataset.from_list(all_examples).shuffle(seed=42)
print(f"\n✅ Total combined and processed examples: {len(unified_dataset)}")


✅ Total combined and processed examples: 44139


In [6]:
unified_dataset[0]

{'messages': [{'content': 'Hello there, how can I help \n\n Hi, I just wanted to talk with someone',
   'role': 'user'},
  {'content': 'Of course, what about?', 'role': 'assistant'},
  {'content': 'I just wanted to talk in general', 'role': 'user'},
  {'content': 'That sounds great, I love talking about anything and everything',
   'role': 'assistant'},
  {'content': "That's good", 'role': 'user'},
  {'content': 'So tell me about yourself', 'role': 'assistant'},
  {'content': "Well, my name is Seán. I'm 19 and I'm bisexual",
   'role': 'user'},
  {'content': 'Oh really? What kind of things do you enjoy doing?',
   'role': 'assistant'},
  {'content': 'I like to read and play video games', 'role': 'user'},
  {'content': 'video games!', 'role': 'assistant'},
  {'content': 'Yeah', 'role': 'user'},
  {'content': 'Do you have any favorite ones?', 'role': 'assistant'},
  {'content': 'Not really', 'role': 'user'},
  {'content': 'Hmm, well what kind of things do you like to read?',
   'role': '

In [7]:
from transformers import AutoTokenizer
import json

def check_mistral_chat_template():
    """
    Check and test the chat template for Mistral-Nemo-Instruct-2407
    """
    
    # Load the tokenizer which contains the chat template
    try:
        tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-Nemo-Instruct-2407")
        print("✅ Successfully loaded Mistral-Nemo-Instruct-2407 tokenizer")
    except Exception as e:
        print(f"❌ Error loading tokenizer: {e}")
        return
    
    # Test cases including edge cases with brackets
    
    test_cases = [
        unified_dataset[0]['messages']
    ]
    
    for i, messages in enumerate(test_cases, 1):
        print(f"\n--- Test Case {i} ---")
        print("Input messages:")
        for msg in messages:
            print(f"  {msg['role']}: {repr(msg['content'])}")
        
        try:
            # Apply chat template
            formatted = tokenizer.apply_chat_template(
                messages, 
                tokenize=False, 
                add_generation_prompt=True
            )
            print("\nFormatted output:")
            print(repr(formatted))
            # print("\nRendered output:")
            # print(formatted)
            
            # Also test tokenization
            tokens = tokenizer.apply_chat_template(
                messages, 
                tokenize=True, 
                add_generation_prompt=True
            )
            print(f"\nToken count: {len(tokens)}")
            
        except Exception as e:
            print(f"❌ Error processing: {e}")
        
        print("-" * 30)
check_mistral_chat_template()

tokenizer_config.json:   0%|          | 0.00/181k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.26M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

✅ Successfully loaded Mistral-Nemo-Instruct-2407 tokenizer

--- Test Case 1 ---
Input messages:
  user: 'Hello there, how can I help \n\n Hi, I just wanted to talk with someone'
  assistant: 'Of course, what about?'
  user: 'I just wanted to talk in general'
  assistant: 'That sounds great, I love talking about anything and everything'
  user: "That's good"
  assistant: 'So tell me about yourself'
  user: "Well, my name is Seán. I'm 19 and I'm bisexual"
  assistant: 'Oh really? What kind of things do you enjoy doing?'
  user: 'I like to read and play video games'
  assistant: 'video games!'
  user: 'Yeah'
  assistant: 'Do you have any favorite ones?'
  user: 'Not really'
  assistant: 'Hmm, well what kind of things do you like to read?'

Formatted output:
"<s>[INST]Hello there, how can I help \n\n Hi, I just wanted to talk with someone[/INST]Of course, what about?</s>[INST]I just wanted to talk in general[/INST]That sounds great, I love talking about anything and everything</s>[INST]Tha

In [8]:
# =====================================================================================
# 4. Model and Tokenizer Setup
# =====================================================================================
from transformers import BitsAndBytesConfig, AutoTokenizer, AutoModelForCausalLM

print("\n🔧 Setting up model, tokenizer, and configurations...")

# Load the base model
model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    # quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    attn_implementation="flash_attention_2",
    trust_remote_code=True,
)
model.config.use_cache = False
model.config.pretraining_tp = 1

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"


🔧 Setting up model, tokenizer, and configurations...


config.json:   0%|          | 0.00/622 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/29.9k [00:00<?, ?B/s]

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

model-00002-of-00005.safetensors:   0%|          | 0.00/4.91G [00:00<?, ?B/s]

model-00003-of-00005.safetensors:   0%|          | 0.00/4.91G [00:00<?, ?B/s]

model-00005-of-00005.safetensors:   0%|          | 0.00/4.91G [00:00<?, ?B/s]

model-00004-of-00005.safetensors:   0%|          | 0.00/4.91G [00:00<?, ?B/s]

model-00001-of-00005.safetensors:   0%|          | 0.00/4.87G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

In [9]:
# --- THIS IS THE KEY CHANGE ---
# Define the official Mistral Instruct template, modified to handle a system prompt
MISTRAL_INSTRUCT_TEMPLATE = (
    "{{ bos_token }}"
    "{% set system_message_found = false %}"
    "{% for message in messages %}"
        "{% if message['role'] == 'system' %}"
            "{% set system_message = message['content'] %}"
            "{% set system_message_found = true %}"
        "{% elif message['role'] == 'user' %}"
            "{% if loop.first and system_message_found %}"
                "{{ '[INST] ' + system_message + '\\n' + message['content'] + ' [/INST]' }}"
            "{% else %}"
                "{{ '[INST] ' + message['content'] + ' [/INST]' }}"
            "{% endif %}"
        "{% elif message['role'] == 'assistant' %}"
            "{{ ' ' + message['content'] + eos_token }}"
        "{% endif %}"
    "{% endfor %}"
)
tokenizer.chat_template = MISTRAL_INSTRUCT_TEMPLATE
print("✅ Mistral Instruct chat template configured.")

# PEFT/LoRA Configuration
lora_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=32,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
)



✅ Mistral Instruct chat template configured.


In [10]:
unified_dataset

Dataset({
    features: ['messages'],
    num_rows: 44139
})

In [11]:
from datasets import Dataset
from transformers import AutoTokenizer

def apply_chat_template(dataset, tokenizer, text_column="messages"):
    """
    Apply chat template to a dataset with messages field.
    
    Args:
        dataset: HuggingFace Dataset with messages field
        tokenizer: HuggingFace tokenizer with chat template
        text_column: Name of the column containing messages (default: "messages")
    
    Returns:
        Dataset with templated and tokenized text
    """
    
    def format_example(example):
        # Apply the chat template to the messages
        # Assumes messages is a list of dicts with 'role' and 'content' keys
        formatted_text = tokenizer.apply_chat_template(
            example[text_column], 
            tokenize=False,  # Get string first, then tokenize separately
            add_generation_prompt=False  # Set to True if you want to add generation prompt
        )
        
        # Tokenize the formatted text
        tokenized = tokenizer(
            formatted_text,
            truncation=True,
            padding=False,  # Usually done in batches later
            return_tensors=None  # Return lists, not tensors
        )
        
        return {
            "input_ids": tokenized["input_ids"],
            "attention_mask": tokenized["attention_mask"],
            "formatted_text": formatted_text  # Keep for debugging
        }
    
    # Apply the formatting function to the dataset
    formatted_dataset = dataset.map(
        format_example,
        remove_columns=[text_column],  # Remove original messages column
        desc="Applying chat template"
    )
    
    return formatted_dataset

# Apply chat template
formatted_dataset = apply_chat_template(unified_dataset, tokenizer)
formatted_dataset


Applying chat template:   0%|          | 0/44139 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Dataset({
    features: ['input_ids', 'attention_mask', 'formatted_text'],
    num_rows: 44139
})

In [12]:
# formatted_dataset['formatted_text'][10000]

In [13]:
# =====================================================================================
# 5. Training
# =====================================================================================
# =====================================================================================
from transformers import TrainerCallback, TrainerControl, TrainerState, Trainer
from transformers.trainer_utils import get_last_checkpoint

class LrLoggerCallback(TrainerCallback):
    def on_log(self, args, state: TrainerState, control: TrainerControl, logs=None, **kwargs):
        if logs is not None and "loss" in logs:
            lr = logs.get("learning_rate", "N/A")
            step = state.global_step
            loss = logs["loss"]
            print(f"Step: {step:>5} | Loss: {loss:.6f} | LR: {lr:.8f}")


# Training Arguments
training_args = TrainingArguments(
    output_dir=new_model_name,
    num_train_epochs=2,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    optim="paged_adamw_32bit",
    save_steps=500,
    logging_steps=25,
    learning_rate=7e-6,
    weight_decay=0.001,
    bf16=True,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="cosine",  # or "cosine_with_restarts"
    # report_to="tensorboard"
)

# def tokenize(example):
#     return tokenizer(example["text"], truncation=True, padding="max_length", max_length=2048)

# tokenized_dataset = unified_dataset[0].map(tokenize)

# SFT Trainer
trainer = SFTTrainer(
    model=model,
    train_dataset=formatted_dataset,
    peft_config=lora_config,
    # max_seq_length=2048,
    # tokenizer=tokenizer,
    args=training_args,
    # packing=True, # Improves efficiency by packing short sequences together
    callbacks=[LrLoggerCallback()],
)

print("\n🚂 Starting training...")
trainer.train()
print("✅ Training complete.")

# Save the LoRA adapter
trainer.save_model(new_model_name)
print(f"✅ LoRA adapter saved to {new_model_name}")


Truncating train dataset:   0%|          | 0/44139 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.



🚂 Starting training...


Step,Training Loss
25,2.9039
50,4.1452
75,2.7556
100,4.0491
125,2.5076
150,3.4274
175,2.253
200,2.9912
225,2.1366
250,2.7854


Step:    25 | Loss: 2.903900 | LR: 0.00000101
Step:    50 | Loss: 4.145200 | LR: 0.00000207
Step:    75 | Loss: 2.755600 | LR: 0.00000312
Step:   100 | Loss: 4.049100 | LR: 0.00000417
Step:   125 | Loss: 2.507600 | LR: 0.00000523
Step:   150 | Loss: 3.427400 | LR: 0.00000628
Step:   175 | Loss: 2.253000 | LR: 0.00000700
Step:   200 | Loss: 2.991200 | LR: 0.00000700
Step:   225 | Loss: 2.136600 | LR: 0.00000700
Step:   250 | Loss: 2.785400 | LR: 0.00000700
Step:   275 | Loss: 2.075900 | LR: 0.00000699
Step:   300 | Loss: 2.679300 | LR: 0.00000699
Step:   325 | Loss: 2.049700 | LR: 0.00000698
Step:   350 | Loss: 2.678900 | LR: 0.00000698
Step:   375 | Loss: 1.991600 | LR: 0.00000697
Step:   400 | Loss: 2.612200 | LR: 0.00000697
Step:   425 | Loss: 2.031700 | LR: 0.00000696
Step:   450 | Loss: 2.617500 | LR: 0.00000695
Step:   475 | Loss: 2.008200 | LR: 0.00000694
Step:   500 | Loss: 2.615400 | LR: 0.00000693
Step:   525 | Loss: 1.974100 | LR: 0.00000692
Step:   550 | Loss: 2.574200 | LR:

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [14]:

# =====================================================================================
# 6. Merge Model and Save Final Version
# =====================================================================================
print("\n🧬 Merging model and preparing for upload...")

# Reload the base model in FP16 for merging
base_model_for_merge = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True,
)

# Load the PEFT model with the saved adapter
merged_model = PeftModel.from_pretrained(base_model_for_merge, new_model_name)
# Merge the adapter into the base model
merged_model = merged_model.merge_and_unload()
print("✅ Model merged.")

# Save the final, merged model and its tokenizer
merged_model.save_pretrained(final_model_name)
tokenizer.save_pretrained(final_model_name)
print(f"✅ Final merged model saved to {final_model_name}")

# To push to hub, run this after logging in via `huggingface-cli login`:
# merged_model.push_to_hub(final_model_name)
# tokenizer.push_to_hub(final_model_name)



🧬 Merging model and preparing for upload...


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

✅ Model merged.
✅ Final merged model saved to Mistral-Nemo-2407-LORA-combineddata-8ac-lr7e6-2ep


In [15]:
merged_model.push_to_hub(final_model_name)
tokenizer.push_to_hub(final_model_name)

Upload 5 LFS files:   0%|          | 0/5 [00:00<?, ?it/s]

model-00001-of-00005.safetensors:   0%|          | 0.00/4.87G [00:00<?, ?B/s]

model-00005-of-00005.safetensors:   0%|          | 0.00/4.91G [00:00<?, ?B/s]

model-00003-of-00005.safetensors:   0%|          | 0.00/4.91G [00:00<?, ?B/s]

model-00004-of-00005.safetensors:   0%|          | 0.00/4.91G [00:00<?, ?B/s]

model-00002-of-00005.safetensors:   0%|          | 0.00/4.91G [00:00<?, ?B/s]

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [16]:

# =====================================================================================
# 7. Inference Test
# =====================================================================================
print("\n🧪 Running inference test...")

# Define a persona and a starting message
test_messages = [
    {
        "role": "system",
        "content": "You are a weary, old wizard named Elara. You are suspicious of strangers but possess deep knowledge of ancient magic. You speak in a cryptic and slightly paranoid manner."
    },
    {
        "role": "user",
        "content": "Greetings, old one. I seek knowledge of the Sunken City."
    }
]

# Use the Hugging Face pipeline for easy inference
# This will automatically use the chat template we configured!
pipe = pipeline("text-generation", model=final_model_name, tokenizer=tokenizer, torch_dtype=torch.bfloat16)
outputs = pipe(test_messages, max_new_tokens=150, do_sample=True, temperature=0.7, top_p=0.9, top_k=50)

print("\n--- INFERENCE RESULT ---")
# The pipeline output includes the full conversation string
full_output = outputs[0]['generated_text']

# Extract just the assistant's latest response for cleaner display
assistant_response = full_output.split('[/INST]')[-1].strip()
print(assistant_response)
print("--- END OF SCRIPT ---")


🧪 Running inference test...


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

Device set to use cuda:0



--- INFERENCE RESULT ---


AttributeError: 'list' object has no attribute 'split'