In [1]:
import os

import torch
from unsloth import FastLanguageModel

max_seq_length = 2048

model, tokenizer = FastLanguageModel.from_pretrained(
    #model_name="mistralai/Mistral-Nemo-Instruct-2407",
    device_map="cuda:0",
    max_seq_length=max_seq_length,  # Choose any for long context!
    load_in_4bit=True,  # 4 bit quantization to reduce memory
    load_in_8bit=False,  # [NEW!] A bit more accurate, uses 2x memory
    full_finetuning=False,  # [NEW!] We have full finetuning now!
    attn_implementation="flash_attention_2",
    trust_remote_code=True, # needed for non-officially supported model
    token=os.environ["HUGGINGFACE_HUB_TOKEN"],
)

  import pynvml  # type: ignore[import]


ü¶• Unsloth: Will patch your computer to enable 2x faster free finetuning.


  from .autonotebook import tqdm as notebook_tqdm


INFO 01-01 04:17:13 [__init__.py:241] Automatically detected platform cuda.
ü¶• Unsloth Zoo will now patch everything to make training faster!
Unsloth: Could not import trl.trainer.alignprop_trainer: Failed to import trl.trainer.alignprop_trainer because of the following error (look up to see its traceback):
cannot import name 'DDPOStableDiffusionPipeline' from 'trl.models' (/usr/local/lib/python3.12/dist-packages/trl/models/__init__.py)
Unsloth: Could not import trl.trainer.nash_md_trainer: Failed to import trl.trainer.nash_md_trainer because of the following error (look up to see its traceback):
cannot import name 'amp' from 'apex' (/usr/local/lib/python3.12/dist-packages/apex/__init__.py)
Unsloth: Could not import trl.trainer.online_dpo_trainer: Failed to import trl.trainer.online_dpo_trainer because of the following error (look up to see its traceback):
cannot import name 'amp' from 'apex' (/usr/local/lib/python3.12/dist-packages/apex/__init__.py)
Unsloth: Could not import trl.tra

In [3]:
model = FastLanguageModel.get_peft_model(
    model,
    r=32,  # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    lora_alpha=32,
    lora_dropout=0,  # Supports any, but = 0 is optimized
    bias="none",  # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing="unsloth",  # True or "unsloth" for very long context
    random_state=3407,
    use_rslora=False,  # We support rank stabilized LoRA
    loftq_config=None,  # And LoftQ
)

Unsloth: Already have LoRA adapters! We shall skip this step.


In [16]:
from datasets import load_dataset

dataset = load_dataset("yahma/alpaca-cleaned", split="train")

dataset

Dataset({
    features: ['output', 'input', 'instruction'],
    num_rows: 51760
})

In [19]:
def format_chatml(example):
    user_content = example["instruction"]
    if example.get("input"):
        user_content += f"\n\n{example['input']}"

    return {
        "messages": [
            {"role": "user", "content": user_content},
            {"role": "assistant", "content": example["output"]},
        ]
    }


# 3. Apply the transformation and remove old columns
dataset = dataset.map(
    format_chatml, remove_columns=dataset.column_names
)

In [8]:
dataset

Dataset({
    features: ['messages'],
    num_rows: 1225
})

In [32]:
# dataset = dataset.select(range(100))
# dataset

Dataset({
    features: ['messages'],
    num_rows: 100
})

In [20]:
def formatting_prompts_func(examples):
    convos = examples["messages"]

    # Filter out system messages from each conversation
    filtered_convos = [
        [msg for msg in convo if msg["role"] != "system"] for convo in convos
    ]

    texts = [
        tokenizer.apply_chat_template(
            convo, tokenize=False, add_generation_prompt=False
        )
        for convo in filtered_convos
    ]

    return {
        "text": texts,
    }


# Apply the map as usual
dataset = dataset.map(
    formatting_prompts_func,
    batched=True,
)

Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 51760/51760 [00:02<00:00, 23087.28 examples/s]


In [10]:
dataset["text"][0]

'<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 30 Dec 2025\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhat should management ensure before personnel are granted access to organizational assets?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nPersonnel should be properly briefed on their information security roles and responsibilities.<|eot_id|>'

In [9]:
from unsloth import UnslothTrainer, UnslothTrainingArguments

import wandb

!unset WANDB_DISABLED

wandb.init(project="huggingface")

trainer = UnslothTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    args=UnslothTrainingArguments(
        dataset_text_field="text",
        max_seq_length=max_seq_length,
        per_device_train_batch_size=4,
        gradient_accumulation_steps=4,
        num_train_epochs=3,
        learning_rate=1.5e-5,
        warmup_ratio=0.03,
        lr_scheduler_type="cosine",
        optim="adamw_8bit",
        weight_decay=0.001,
        logging_steps=10,
        eval_steps=2200,
        seed=3407,
        dataset_num_proc=4,
        report_to="wandb",
    ),
)

wandb: Currently logged in as: raziman-terra (raziman-terra-raziman-inc) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin


wandb: Detected [huggingface_hub.inference, openai] in use.
wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/
Unsloth: Tokenizing ["text"] (num_proc=4): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1225/1225 [00:01<00:00, 1187.90 examples/s]


ü¶• Unsloth: Padding-free auto-enabled, enabling faster training.


In [10]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 1,225 | Num Epochs = 3 | Total steps = 231
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 4 x 1) = 16
 "-____-"     Trainable parameters = 22,544,384 of 1,258,358,784 (1.79% trained)


Step,Training Loss
10,5.683
20,4.4917
30,3.0656
40,2.4596
50,2.1672
60,2.0283
70,1.8821
80,1.7414
90,1.7195
100,1.7258


Unsloth: Will smartly offload gradients to save VRAM!


0,1
train/epoch,‚ñÅ‚ñÅ‚ñÇ‚ñÇ‚ñÇ‚ñÉ‚ñÉ‚ñÉ‚ñÑ‚ñÑ‚ñÑ‚ñÑ‚ñÖ‚ñÖ‚ñÖ‚ñÜ‚ñÜ‚ñÜ‚ñá‚ñá‚ñá‚ñà‚ñà‚ñà
train/global_step,‚ñÅ‚ñÅ‚ñÇ‚ñÇ‚ñÇ‚ñÉ‚ñÉ‚ñÉ‚ñÑ‚ñÑ‚ñÑ‚ñÑ‚ñÖ‚ñÖ‚ñÖ‚ñÜ‚ñÜ‚ñÜ‚ñá‚ñá‚ñá‚ñà‚ñà‚ñà
train/grad_norm,‚ñà‚ñÖ‚ñÉ‚ñÇ‚ñÇ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ
train/learning_rate,‚ñà‚ñà‚ñà‚ñà‚ñá‚ñá‚ñá‚ñÜ‚ñÜ‚ñÖ‚ñÖ‚ñÑ‚ñÑ‚ñÑ‚ñÉ‚ñÉ‚ñÇ‚ñÇ‚ñÇ‚ñÅ‚ñÅ‚ñÅ‚ñÅ
train/loss,‚ñà‚ñÜ‚ñÑ‚ñÉ‚ñÇ‚ñÇ‚ñÇ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ

0,1
total_flos,1655584372224000.0
train/epoch,3.0
train/global_step,231.0
train/grad_norm,1.06863
train/learning_rate,0.0
train/loss,1.583
train_loss,2.06793
train_runtime,115.5304
train_samples_per_second,31.81
train_steps_per_second,1.999


In [11]:
messages = [
    {"role": "user", "content": "What are the key components of the policy?"},
]
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,  # Must add for generation
    return_tensors="pt",
).to("cuda")

from transformers import TextStreamer

text_streamer = TextStreamer(tokenizer, skip_prompt=True)
# 1. Generate (Streamer will print to console automatically)
outputs = model.generate(
    input_ids=inputs,
    max_new_tokens=256,
    temperature=0.1,
)

# 2. Extract only the new tokens (the model's answer)
# inputs.shape[1] is the length of your prompt
new_tokens = outputs[0][inputs.shape[1] :]
final_text = tokenizer.decode(new_tokens, skip_special_tokens=True)

print(final_text)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


The key components of the policy include: 
- Clear objectives and targets.
- Effective communication channels.
- Regular review and update of the policy.
- Compliance with relevant laws and regulations.
- Training and awareness for personnel.


In [12]:
# using code name prevents models overridding previous fine-tuning runs
# and also help us determine which model was better
code_name = "unsloth"

In [13]:
model.save_pretrained(code_name)
tokenizer.save_pretrained(code_name)

('unsloth/tokenizer_config.json',
 'unsloth/special_tokens_map.json',
 'unsloth/chat_template.jinja',
 'unsloth/tokenizer.json')

In [14]:
# this is only to be run once before exporting to GGUF for the first time
!git clone https://github.com/ggml-org/llama.cpp.git && \
    cd llama.cpp && \
    cmake -B build && \
    cmake --build build --config Release -j$(nproc) && \
    cp build/bin/* .

Cloning into 'llama.cpp'...
remote: Enumerating objects: 74550, done.[K
remote: Counting objects: 100% (240/240), done.[K
remote: Compressing objects: 100% (187/187), done.[K
remote: Total 74550 (delta 150), reused 53 (delta 53), pack-reused 74310 (from 4)[K
Receiving objects: 100% (74550/74550), 271.27 MiB | 12.86 MiB/s, done.
Resolving deltas: 100% (53995/53995), done.
-- The C compiler identification is GNU 13.3.0
-- The CXX compiler identification is GNU 13.3.0
-- Detecting C compiler ABI info
-- Detecting C compiler ABI info - done
-- Check for working C compiler: /usr/bin/cc - skipped
-- Detecting C compile features
-- Detecting C compile features - done
-- Detecting CXX compiler ABI info
-- Detecting CXX compiler ABI info - done
-- Check for working CXX compiler: /usr/bin/c++ - skipped
-- Detecting CXX compile features
-- Detecting CXX compile features - done
[0mCMAKE_BUILD_TYPE=Release[0m
-- Found Git: /usr/bin/git (found version "2.43.0")
-- The ASM compiler identificati

In [15]:
!python -m venv .venv && source .venv/bin/activate
# Save to q4_k_m GGUF
model.save_pretrained_gguf(code_name, tokenizer, quantization_method="q4_k_m")

Unsloth: Merging model weights to 16-bit format...
Found HuggingFace hub cache directory: /home/unsloth/.cache/huggingface/hub
Checking cache directory for required files...
Cache check failed: model.safetensors not found in local cache.
Not all required files found in cache. Will proceed with downloading.
Checking cache directory for required files...
Cache check failed: tokenizer.model not found in local cache.
Not all required files found in cache. Will proceed with downloading.


Unsloth: Preparing safetensor model files: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [02:47<00:00, 167.19s/it]


Note: tokenizer.model not found (this is OK for non-SentencePiece models)


Unsloth: Merging weights into 16bit: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:06<00:00,  6.19s/it]


Unsloth: Merge process complete. Saved to `/app/unsloth`
Unsloth: Converting to GGUF format...
==((====))==  Unsloth: Conversion from HF to GGUF information
   \\   /|    [0] Installing llama.cpp might take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF bf16 might take 3 minutes.
\        /    [2] Converting GGUF bf16 to ['q4_k_m'] might take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: llama.cpp found in the system. Skipping installation.
Unsloth: Preparing converter script...
Unsloth: [1] Converting model into bf16 GGUF format.
This might take 3 minutes...
Unsloth: Initial conversion completed! Files: ['llama-3.2-1b-instruct.BF16.gguf']
Unsloth: [2] Converting GGUF bf16 into q4_k_m. This might take 10 minutes...
Unsloth: Model files cleanup...
Unsloth: All GGUF conversions completed successfully!
Generated files: ['llama-3.2-1b-instruct.Q4_K_M.gguf']
Unsloth: example usage for text only LLMs: llama-cli --model llama-3.2-1b-instruct

{'save_directory': 'unsloth',
 'gguf_files': ['llama-3.2-1b-instruct.Q4_K_M.gguf'],
 'modelfile_location': '/app/Modelfile',
 'want_full_precision': False,
 'is_vlm': False,
 'fix_bos_token': False}