In [1]:
# Authenticate to Hugging Face
from huggingface_hub import login

# login()

# for convenience you can create an environment variable containing your hub token as HF_TOKEN

Step 1: Import Required Libraries

In [2]:
# Import necessary libraries
from unsloth import FastLanguageModel
import torch
from datasets import Dataset, load_dataset
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

import os

os.environ["TORCHDYNAMO_DISABLE"] = "1"

ðŸ¦¥ Unsloth: Will patch your computer to enable 2x faster free finetuning.
ðŸ¦¥ Unsloth Zoo will now patch everything to make training faster!


Step 2: Load the Pre-trained Model

In [3]:
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available() else "cpu"
)

# Check CUDA compatibility
major_version, minor_version = torch.cuda.get_device_capability()

# Load the model and tokenizer
max_seq_length = 2048
load_in_4bit = True

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Phi-4-mini-instruct-unsloth-bnb-4bit",
    max_seq_length=max_seq_length,
    dtype=None,
    load_in_4bit=load_in_4bit,
)
# tokenizer = AutoTokenizer.from_pretrained("unsloth/Phi-4-mini-instruct-unsloth-bnb-4bit")

==((====))==  Unsloth 2025.6.8: Fast Phi3 patching. Transformers: 4.52.4.
   \\   /|    NVIDIA GeForce RTX 3070 Ti Laptop GPU. Num GPUs = 1. Max memory: 7.667 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 8.6. CUDA Toolkit: 12.6. Triton: 3.3.0
\        /    Bfloat16 = TRUE. FA [Xformers = None. FA2 = True]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Step 3: Prepare the Model for Fine-Tuning

In [4]:
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,
    loftq_config=None,
)

Unsloth: Making `model.base_model.model.model` require gradients


Step 4: Load and Prepare the Dataset

In [5]:
# Data Preparation and Formatting
dataset = load_dataset("aifeifei798/Chinese-DeepSeek-R1-Distill-data-110k-alpaca")

text_data = {"text": []}

for example in dataset:
    input_text = example[1]
    output_text = example[2]
    text_format = f"<|system|>Your name is feifei, an AI math expert developed by DrakIdol.<|end|><|user|>{input_text}<|end|><|assistant|>{output_text}<|end|>"
    text_data["text"].append(text_format)

# Convert the dictionary to a Dataset object
train_dataset = Dataset.from_dict(text_data)
del text_data
# Print the first 3 entries of the dataset
for i, row in enumerate(train_dataset.select(range(1))):
    print(f"Row {i + 1}:")
    for key in row.keys():
        print(f"{key}: {row[key]}")
    print("\n")

# prompt = """Based on given instruction and context, generate an appropriate response
# ### Instruction:
# {}
# ### Context:
# {}
# ### Response:
# {}
# """

# Load a sample dataset
# EOS_TOKEN = tokenizer.eos_token  # Must add EOS_TOKEN

# def formatting_prompts_func(examples):
#     instructions = examples["instruction"]
#     contexts = examples["context"]
#     responses = examples["response"]
#     texts = []
#     for i, j, k in zip(instructions, contexts, responses):
#         text = prompt.format(i, j, k) + EOS_TOKEN
#         texts.append(text)
#     return {
#         "text": texts,
#     }

# pass
# from datasets import load_dataset

# dataset = load_dataset("databricks/databricks-dolly-15k", split="train")
# dataset = dataset.map(formatting_prompts_func, batched=True)

Row 1:
text: <|system|>Your name is feifei, an AI math expert developed by DrakIdol.<|end|><|user|>r<|end|><|assistant|>a<|end|>




Step 5: Set Up the Trainer

In [6]:
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,  # Can make training 5x faster for short sequences.
    lora_rank=8,  # Adjust rank based on VRAM considerations
    lora_alpha=32,  # Controls the strength of the adapter contribution
    lora_dropout=0.1,  # Regularization to prevent overfitting
    args=TrainingArguments(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        warmup_steps=5,
        max_steps=50,  # Set max steps to 15,000
        learning_rate=2e-4,
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_steps=1,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="temp/outputs",
        save_steps=5,  # Save the model every 5 steps
        save_total_limit=10,  # Keep only the 10 most recent checkpoints
        report_to="none",  # Use this for WandB etc
        # resume_from_checkpoint=True,  # Resume from the latest checkpoint
        # resume_from_checkpoint=checkpoint_path,  # Resume from the specified checkpoint
    ),
)

Unsloth: Tokenizing ["text"]:   0%|          | 0/1 [00:00<?, ? examples/s]

Step 6: Train the Model

In [7]:
# Training
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 1 | Num Epochs = 50 | Total steps = 50
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 8,912,896/2,347,568,128 (0.38% trained)
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss
1,7.1131
2,7.1131
3,7.0267
4,6.9364
5,6.5904
6,6.0248
7,5.1864
8,4.4226
9,3.7864
10,3.1781


Step 7: Save the Model

In [8]:
# Local saving
trainer.model.save_pretrained("temp/drakidol-Phi-4-lora_model")

Step 8: Test the Model

In [9]:
from unsloth.chat_templates import get_chat_template

tokenizer = get_chat_template(
    tokenizer,
    chat_template = "phi-4",
)
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

messages = [
    {"role": "user", "content": "Continue the fibonnaci sequence: 1, 1, 2, 3, 5, 8,"},
]
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True, # Must add for generation
    return_tensors = "pt",
).to("cuda")

outputs = model.generate(
    input_ids=inputs,
    max_new_tokens=64,
    use_cache=True,
    temperature=1.5,
    min_p=0.1,
    do_sample=False,
)
tokenizer.batch_decode(outputs)

FastLanguageModel.for_inference(model) # Enable native 2x faster inference

messages = [
    {"role": "user", "content": "Continue the fibonnaci sequence: 1, 1, 2, 3, 5, 8,"},
]
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True, # Must add for generation
    return_tensors = "pt",
).to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer, skip_prompt = True)
_ = model.generate(
    input_ids=inputs,
    streamer=text_streamer,
    max_new_tokens=128,
    use_cache=True,
    temperature=1.5,
    min_p=0.1,
    do_sample=False,
)

# inputs = tokenizer(
#     [
#         prompt.format(
#             "Provide a detailed explanation of the events and circumstances that led to the outbreak of World War II.",  # instruction
#             " The goal is to offer a clear and informative account of the factors, political decisions, and international tensions that played a crucial role in triggering World War II. Ensure that the explanation covers the period leading up to the war, key events, and the involvement of major nations",  # context
#             " ",  # response
#         )
#     ]
#     * 1,
#     return_tensors="pt",
# ).to("cuda")

The following generation flags are not valid and may be ignored: ['temperature', 'min_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
The following generation flags are not valid and may be ignored: ['temperature', 'min_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
You have set `compile_config`, but we are unable to meet the criteria for compilation. Compilation will be skipped.
The following generation flags are not valid and may be ignored: ['temperature', 'min_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


13, 21, 34, 55,  club halving  excessive  excessive 01 fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe


Step 9: Save trained model and tokenizer

In [10]:
# Save the trained model and tokenizer
model.save_pretrained("temp/drakidol-Phi-4_model")
tokenizer.save_pretrained("temp/drakidol-Phi-4_model")

('temp/drakidol-Phi-4_model/tokenizer_config.json',
 'temp/drakidol-Phi-4_model/special_tokens_map.json',
 'temp/drakidol-Phi-4_model/chat_template.jinja',
 'temp/drakidol-Phi-4_model/vocab.json',
 'temp/drakidol-Phi-4_model/merges.txt',
 'temp/drakidol-Phi-4_model/added_tokens.json',
 'temp/drakidol-Phi-4_model/tokenizer.json')