In [2]:
%%capture
import os
!pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo
!pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
!pip install --no-deps unsloth
!pip install --no-deps git+https://github.com/huggingface/transformers@v4.49.0-Gemma-3

In [3]:

from unsloth import FastModel
import torch
model,tokenizer = FastModel.from_pretrained(
    model_name = "unsloth/gemma-3-4b-it",
    max_seq_length = 2048,
    load_in_4bit = True,
    load_in_8bit = False,
    full_finetuning = False,
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.3.14: Fast Gemma3 patching. Transformers: 4.50.0.dev0.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: Using float16 precision for gemma3 won't work! Using float32.


model.safetensors:   0%|          | 0.00/4.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/192 [00:00<?, ?B/s]

processor_config.json:   0%|          | 0.00/70.0 [00:00<?, ?B/s]

chat_template.json:   0%|          | 0.00/1.61k [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


tokenizer_config.json:   0%|          | 0.00/1.16M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/35.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/670 [00:00<?, ?B/s]

In [4]:
model = FastModel.get_peft_model(
    model,
    finetune_vision_layers = False,
    finetune_language_layers = True,
    finetune_attention_modules = True,
    finetune_mlp_modules = True,
    r = 8,
    lora_alpha = 8,
    lora_dropout = 0,
    bias = "none",
    random_state = 23,
)

In [5]:
from datasets import load_dataset
import json

dataset = load_dataset("Congliu/Chinese-DeepSeek-R1-Distill-data-110k-SFT", streaming=True)
filtered_dataset = dataset.filter(lambda example: example['repo_name'] == 'zhihu/zhihu_score9.0-10_clean_v10')

def save_to_json(dataset, filename):
    with open(filename, 'w', encoding='utf-8') as f:
        for example in dataset:
            json.dump(example, f, ensure_ascii=False)
            f.write('\n')

save_to_json(filtered_dataset['train'], 'zhihu_data.json')


README.md:   0%|          | 0.00/5.03k [00:00<?, ?B/s]

数据已保存到 xhs_data.json


In [6]:
from unsloth.chat_templates import get_chat_template
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "gemma-3",
)

import re
from datasets import load_dataset

dataset = load_dataset("json", data_files="zhihu_data.json", split="train")

def apply_chat_template_zhihu(examples):
    conversations = []
    for instruction, input_text, output_text in zip(examples["instruction"], examples["input"], examples["output"]):

        if input_text.strip():
            user_message = instruction.strip() + "\n" + input_text.strip()
        else:
            user_message = instruction.strip()

        cleaned_output = re.sub(r'<think>.*?</think>', '', output_text, flags=re.DOTALL).strip()

        conversation = (
            "<bos>\n"
            "<start_of_turn>user\n" + user_message + "\n<end_of_turn>\n"
            "<start_of_turn>model\n" + cleaned_output + "\n<end_of_turn>"
        )
        conversations.append(conversation)
    return {"text": conversations}

dataset = dataset.map(apply_chat_template_zhihu, batched=True)

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/2534 [00:00<?, ? examples/s]

In [7]:
from trl import SFTTrainer, SFTConfig
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    eval_dataset = None,
    args = SFTConfig(
        dataset_text_field = "text",
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        max_steps = 30,
        learning_rate = 2e-4,
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 23,
        report_to = "none",
    ),
)

Unsloth: Switching to float32 training since model cannot work with float16
Unsloth: We found double BOS tokens - we shall remove one automatically.


Unsloth: Tokenizing ["text"] (num_proc=2):   0%|          | 0/2534 [00:00<?, ? examples/s]

In [8]:
from unsloth.chat_templates import train_on_responses_only
trainer = train_on_responses_only(
    trainer,
    instruction_part = "<start_of_turn>user\n",
    response_part = "<start_of_turn>model\n",
)

Map (num_proc=2):   0%|          | 0/2534 [00:00<?, ? examples/s]

In [9]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 2,534 | Num Epochs = 1 | Total steps = 30
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 14,901,248/4,000,000,000 (0.37% trained)
It is strongly recommended to train Gemma3 models with the `eager` attention implementation instead of `sdpa`. Use `eager` with `AutoModelForCausalLM.from_pretrained('<path-to-checkpoint>', attn_implementation='eager')`.


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,3.3803
2,2.9591
3,3.1038
4,2.7605
5,2.8773
6,2.7539
7,3.3423
8,2.6528
9,2.9389
10,2.7885


In [10]:
from unsloth.chat_templates import get_chat_template
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "gemma-3",
)
messages = [{
    "role": "user",
    "content": [{
        "type" : "text",
        "text" : "神经网络的参数一般怎么初始化？",
    }]
}]
text = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt = True,
)
outputs = model.generate(
    **tokenizer([text], return_tensors = "pt").to("cuda"),
    max_new_tokens = 1024,

    temperature = 1.0, top_p = 0.95, top_k = 64,
)
print(tokenizer.batch_decode(outputs)[0])

<bos><start_of_turn>user
神经网络的参数一般怎么初始化？<end_of_turn>
<start_of_turn>model
神经网络的参数初始化是一个至关重要的环节，它会直接影响到训练过程的稳定性和收敛速度。不同的初始化方法会带来不同的效果，选择合适的初始化方法可以帮助避免梯度消失或梯度爆炸等问题。

以下是一些常见的神经网络参数初始化方法：

**1. 随机初始化 (Random Initialization)**

* **均匀分布 (Uniform Distribution):**  参数从一个均匀分布中随机采样。
    * **优点:** 简单易实现。
    * **缺点:**  容易导致梯度消失或梯度爆炸，尤其是在深度网络中。
    * **公式:**  `参数 = np.random.uniform(low, high, size)`，其中 `low` 和 `high` 是均匀分布的范围，`size` 是参数的形状。
* **正态分布 (Normal Distribution):**  参数从一个正态分布中随机采样。
    * **优点:**  比均匀分布更常见，通常能更好地稳定训练。
    * **缺点:**  仍然可能出现梯度问题，尤其是在某些情况下。
    * **公式:** `参数 = np.random.normal(loc=0.0, scale=std, size)`，其中 `loc` 是均值，`std` 是标准差，`size` 是参数的形状。  通常 `std` 设置为 `1.0` 或 `0.01`。
* **Xavier 初始化 (Xavier Initialization):**  基于神经元连接的权重和偏置的方差来初始化参数。
    * **优点:**  对深度网络更友好，能更好地平衡梯度大小，有助于稳定训练。
    * **两种类型:**
        * **Glorot 初始化 (也称为 Xavier 初始化):**  假设权重矩阵的每个元素都从标准正态分布中采样，均值为 0，标准差为 `sqrt(6 / (n_in + n_out))`，其中 `n_in` 是输入层神经元数量，`n_out` 是输出层神经元数量。
        * **He 初始化:**  类似于 Glorot 初