In [5]:
# 安装依赖（Colab里加感叹号）
!pip install -q transformers accelerate peft datasets bitsandbytes
!pip install --upgrade datasets fsspec aiohttp

from datasets import Dataset
import json

# -------------------------------
# Step 1: 读取本地 jsonl 文件，加载每行json为一个dict，放入data列表
data = []
with open("medical_sample.jsonl", encoding="utf-8") as f:
    for line in f:
        data.append(json.loads(line))

print(f"读取数据条数：{len(data)}")
if len(data) == 0:
    raise ValueError("数据为空！请确认medical_sample.jsonl文件是否存在且非空")

# Step 2: 用 datasets.Dataset 构建数据集对象
dataset = Dataset.from_list(data)
print("第一条数据预览：", dataset[0])

# Step 3: 手动划分训练集和测试集（90%训练，10%测试）
split_index = int(len(dataset) * 0.9)
train_dataset = dataset.select(range(split_index))
test_dataset = dataset.select(range(split_index, len(dataset)))

# Step 4: 加载Qwen模型和分词器
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

model_id = "Qwen/Qwen1.5-0.5B-Chat"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    load_in_4bit=True,
    trust_remote_code=True,
    device_map="auto"
)

# Step 5: 配置LoRA微调
from peft import LoraConfig, get_peft_model, TaskType

peft_config = LoraConfig(
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM,
)
model = get_peft_model(model, peft_config)

# Step 6: 格式化数据和tokenize
def format_example(example):
    user_instruction = ""
    assistant_output = ""
    for message in example['conversations']:
        if message['role'] == 'user':
            user_instruction += message['content'] + "\n"
        elif message['role'] == 'assistant':
            assistant_output += message['content'] + "\n"
    return f"### 用户:\n{user_instruction.strip()}\n\n### 医生:\n{assistant_output.strip()}"

def tokenize(example):
    prompt = format_example(example)
    tokenized = tokenizer(prompt, truncation=True, padding="max_length", max_length=512)
    tokenized["labels"] = tokenized["input_ids"].copy()
    return tokenized

train_dataset = train_dataset.map(tokenize)
test_dataset = test_dataset.map(tokenize)

# Step 7: 训练参数配置（加长训练轮数，降低学习率，开启评估）
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./qwen-medical",
    per_device_train_batch_size=4,
    num_train_epochs=3,
    logging_steps=10,
    save_strategy="epoch",
    learning_rate=2e-4,
    fp16=True,
    report_to="none"
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Step 8: 开始训练
trainer.train()

# Step 9: 测试微调效果
model.eval()
def chat(question):
    prompt = f"### 用户:\n{question}\n\n### 医生:\n"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=128)
    print(tokenizer.decode(outputs[0], skip_special_tokens=True))

chat("胃疼吃什么药？")

# Step 10: 保存微调后模型和分词器
model.save_pretrained("./qwen-medical-peft")
tokenizer.save_pretrained("./qwen-medical-peft")


Collecting fsspec
  Using cached fsspec-2025.5.1-py3-none-any.whl.metadata (11 kB)
读取数据条数：134
第一条数据预览： {'conversations': [{'content': '感冒有什么症状？', 'role': 'user'}, {'content': '感冒的主要症状包括流鼻涕、咳嗽、喉咙痛和发热。', 'role': 'assistant'}]}


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Map:   0%|          | 0/120 [00:00<?, ? examples/s]

Map:   0%|          | 0/14 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
10,3.9512
20,0.2741
30,0.1726
40,0.127
50,0.1071
60,0.104
70,0.0889
80,0.0914
90,0.0873


### 用户:
胃疼吃什么药？

### 医生:
清淡饮食、止痛药等。


('./qwen-medical-peft/tokenizer_config.json',
 './qwen-medical-peft/special_tokens_map.json',
 './qwen-medical-peft/chat_template.jinja',
 './qwen-medical-peft/vocab.json',
 './qwen-medical-peft/merges.txt',
 './qwen-medical-peft/added_tokens.json',
 './qwen-medical-peft/tokenizer.json')

!pip install transformers accelerate datasets peft trl bitsandbytes


In [10]:
!pip install transformers accelerate datasets peft trl bitsandbytes


Collecting trl
  Downloading trl-0.19.0-py3-none-any.whl.metadata (10 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.46.0-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2

下载qwen1.5

In [11]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "Qwen/Qwen1.5-7B"

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    device_map="auto",
    torch_dtype="auto"
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/295 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/663 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/3.54G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/3.96G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/3.96G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/3.99G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/138 [00:00<?, ?B/s]



In [30]:
trainer.model.save_pretrained("/content/qwen1.5-lora-finetuned")


NameError: name 'trainer' is not defined

准备 LoRA 配置并加载数据集

In [22]:
from peft import LoraConfig, get_peft_model
if hasattr(model, "peft_config"):
    model = model.unload()

lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj"],  # 根据打印结果调整
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)




准备数据集

In [24]:
from datasets import load_dataset

data_files = {"train": "/content/medical_sample.jsonl"}  # 你的微调数据路径
dataset = load_dataset("json", data_files=data_files)["train"]

Generating train split: 0 examples [00:00, ? examples/s]

数据预处理
```



In [25]:
def preprocess(examples):
    inputs = []
    for conv in examples["conversations"]:
        text = ""
        for turn in conv:
            role = turn["role"]
            content = turn["content"]
            if role == "user":
                text += f"<|User|>: {content}\n"
            else:
                text += f"<|Assistant|>: {content}\n"
        inputs.append(text)
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True, padding="max_length")
    model_inputs["labels"] = model_inputs["input_ids"].copy()  # 自回归模型，label就是input_ids
    return model_inputs

dataset = dataset.map(preprocess, batched=True)
dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])



Map:   0%|          | 0/5 [00:00<?, ? examples/s]

配置训练参数并启动训练

In [27]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./qwen1.5-lora-finetuned",
    per_device_train_batch_size=4,
    save_strategy="steps",
    num_train_epochs=3,
    save_steps=100,
    logging_steps=50,
    learning_rate=3e-4,
    fp16=True,
    push_to_hub=False,
)


demo

In [28]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
import torch

# 模型路径：微调后的模型
base_model_path = "Qwen/Qwen1.5-7B"
lora_model_path = "./qwen1.5-lora-finetuned"

# 加载 tokenizer 和基础模型
tokenizer = AutoTokenizer.from_pretrained(base_model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(base_model_path, trust_remote_code=True, device_map="auto", torch_dtype=torch.float16)

# 加载 LoRA adapter（微调权重）
model = PeftModel.from_pretrained(model, lora_model_path)

# 切换为 evaluation 模式
model.eval()

# 示例问题
question = "百日咳有什么症状？"

# 构造 ChatML 输入格式
prompt = f"<|im_start|>user\n{question}<|im_end|>\n<|im_start|>assistant\n"

# 编码输入
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

# 推理生成
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=512,
        do_sample=True,
        temperature=0.7,
        top_p=0.9
    )

# 解码输出
response = tokenizer.decode(outputs[0], skip_special_tokens=True)

# 只提取 assistant 回答部分
if "<|im_start|>assistant\n" in response:
    response = response.split("<|im_start|>assistant\n")[-1]

print("模型回答：")
print(response.strip())


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]



ValueError: Can't find 'adapter_config.json' at './qwen1.5-lora-finetuned'

保存模型

In [None]:
model.save_pretrained("./qwen1.5-lora-finetuned")
tokenizer.save_pretrained("./qwen1.5-lora-finetuned")


In [None]:
# ✅ 第 1 步：安装依赖包
!pip install -q transformers accelerate peft datasets bitsandbytes
!pip install --upgrade datasets fsspec aiohttp

# ✅ 第 2 步：准备少量 ChatML 格式的医药问答示例数据
import json

example_data = [
    {
        "messages": [
            {"role": "user", "content": "我最近老是头疼，可能是什么原因？"},
            {"role": "assistant", "content": "头痛可能由多种原因引起，包括紧张型头痛、偏头痛、高血压、用眼过度等，建议就医确诊。"}
        ]
    },
    {
        "messages": [
            {"role": "user", "content": "咳嗽有痰吃什么药比较好？"},
            {"role": "assistant", "content": "可以考虑使用止咳化痰的药物，比如氨溴索、鲜竹沥、复方甘草片等。"}
        ]
    },
    {
        "messages": [
            {"role": "user", "content": "感冒发烧了可以喝藿香正气水吗？"},
            {"role": "assistant", "content": "藿香正气水主要用于湿热型感冒，如果是风寒或病毒性感冒，不建议使用。"}
        ]
    },
]

# 写入 jsonl 文件
with open("medical_sample.jsonl", "w", encoding="utf-8") as f:
    for item in example_data:
        f.write(json.dumps(item, ensure_ascii=False) + "\n")

# ✅ 第 3 步：加载 jsonl 文件到 HuggingFace 的 Dataset
from datasets import Dataset

data = []
with open("medical_sample.jsonl", encoding="utf-8") as f:
    for line in f:
        data.append(json.loads(line))

dataset = Dataset.from_list(data)

# 手动划分训练集和测试集（90% / 10%）
split_index = int(len(dataset) * 0.9)
train_dataset = dataset.select(range(split_index))
test_dataset = dataset.select(range(split_index, len(dataset)))

# ✅ 第 4 步：加载 Qwen1.5 模型和分词器（Qwen1.5-0.5B-Chat）
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

model_id = "Qwen/Qwen1.5-0.5B-Chat"

bnb_config = BitsAndBytesConfig(load_in_4bit=True)

tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    device_map="auto",
    quantization_config=bnb_config
)

# ✅ 第 5 步：应用 LoRA 微调配置
from peft import LoraConfig, get_peft_model, TaskType

peft_config = LoraConfig(
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM,
)

model = get_peft_model(model, peft_config)

# ✅ 第 6 步：格式化数据为 ChatML 格式，并进行分词
def format_chatml(example):
    prompt = ""
    for msg in example["messages"]:
        role = msg["role"]
        content = msg["content"]
        prompt += f"<|im_start|>{role}\n{content}<|im_end|>\n"
    prompt += "<|im_start|>assistant\n"  # 预测的部分
    return prompt

def tokenize(example):
    prompt = format_chatml(example)
    tokenized = tokenizer(prompt, truncation=True, padding="max_length", max_length=512)
    tokenized["labels"] = tokenized["input_ids"].copy()
    return tokenized

train_dataset = train_dataset.map(tokenize)
test_dataset = test_dataset.map(tokenize)

# ✅ 第 7 步：配置训练参数并训练模型
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./qwen-medical",
    per_device_train_batch_size=2,
    num_train_epochs=3,
    logging_steps=10,
    save_strategy="epoch",
    learning_rate=2e-4,
    fp16=True,
    report_to="none"  # 禁用默认日志报告（wandb等）
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

trainer.train()

# ✅ 第 8 步：测试模型效果，模拟问答
model.eval()

def chat(question):
    prompt = f"<|im_start|>user\n{question}<|im_end|>\n<|im_start|>assistant\n"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=128)
    print(tokenizer.decode(outputs[0], skip_special_tokens=True))

# 示例测试
chat("我嗓子疼，吃什么药好？")

# ✅ 第 9 步：保存微调后的模型和分词器
model.save_pretrained("./qwen-medical-peft")
tokenizer.save_pretrained("./qwen-medical-peft")

# ✅ （可选）合并LoRA权重到原模型中并保存为完整模型（适合推理部署）
merged_model = model.merge_and_unload()
merged_model.save_pretrained("./qwen-medical-full")
tokenizer.save_pretrained("./qwen-medical-full")


Collecting fsspec
  Downloading fsspec-2025.5.1-py3-none-any.whl.metadata (11 kB)
Collecting aiohttp
  Downloading aiohttp-3.12.13-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.6 kB)
Downloading aiohttp-3.12.13-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m39.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: aiohttp
  Attempting uninstall: aiohttp
    Found existing installation: aiohttp 3.11.15
    Uninstalling aiohttp-3.11.15:
      Successfully uninstalled aiohttp-3.11.15
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gcsfs 2025.3.2 requires fsspec==2025.3.2, but you have fsspec 2025.3.0 which is incompatible.[0m[31m
[0mSuccessfully installed aiohttp-3.12.13


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/661 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.24G [00:00<?, ?B/s]