In [None]:
# Install required packages
!pip install  datasets

# Mount Google Drive (optional, if you want to save your model)
from google.colab import drive
drive.mount('/content/drive')

# Set working directory (optional)
%cd /content

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.2-py3-none-any.whl (485 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading

In [None]:
import torch
import gc
import os

def deep_clean_gpu():
    """Thorough GPU memory cleanup"""
    # 1. Clear PyTorch cache
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.synchronize()

    # 2. Delete all models and trainers
    for var in list(globals()):
        if any(x in var.lower() for x in ['model', 'trainer', 'optimizer', 'scheduler', 'tokenizer']):
            del globals()[var]

    # 3. Force garbage collection multiple times
    for _ in range(3):
        gc.collect()

    # 4. Print current memory status
    if torch.cuda.is_available():
        print(f"Current GPU memory usage:")
        print(f"Allocated: {torch.cuda.memory_allocated(0) / 1024**3:.2f} GB")
        print(f"Cached: {torch.cuda.memory_reserved(0) / 1024**3:.2f} GB")

    # 5. Optional: Reset CUDA device
    if torch.cuda.is_available():
        torch.cuda.reset_peak_memory_stats()
        torch.cuda.reset_accumulated_memory_stats()

# Use the function
deep_clean_gpu()

# If still needed, restart the runtime
os.kill(os.getpid(), 9)

# get data

In [None]:
pip install -U bitsandbytes



In [None]:
import torch

from transformers import BitsAndBytesConfig
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,#在计算的时候需要还原到16bit
    bnb_4bit_use_double_quant=True
)

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig, BitsAndBytesConfig

model_name = "deepseek-ai/deepseek-llm-7b-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16, quantization_config=bnb_config,device_map="auto")
model.generation_config = GenerationConfig.from_pretrained(model_name)
model.generation_config.pad_token_id = model.generation_config.eos_token_id# 由于批处理需要padding，然后可以用eos， end of speech作为pading

text = "曲匹地尔片的用法用量"
inputs = tokenizer(text, return_tensors="pt")
outputs = model.generate(**inputs.to(model.device), max_new_tokens=100)

result = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(result)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

曲匹地尔片的用法用量
儿童用药：尚不明确
曲匹地尔片药理作用
曲匹地尔片药理作用
曲匹地尔片药理作用：
1.本品为非特异性三环类抗组胺药，具有抗组胺、抗胆碱、抗5-羟色胺、抗肾上腺素、抗多巴胺、抗乙酰胆碱、抗毒蕈碱样作用。
2.本品


In [None]:
text = "帕金森叠加综合征的辅助治疗有些什么"
inputs = tokenizer(text, return_tensors="pt")
outputs = model.generate(**inputs.to(model.device), max_new_tokens=100)

result = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(result)

帕金森叠加综合征的辅助治疗有些什么
帕金森叠加综合征的辅助治疗有些什么？帕金森叠加综合征是帕金森病的一种，帕金森叠加综合征的临床表现与帕金森病相似，但帕金森叠加综合征的临床表现比帕金森病更复杂，帕金森叠加综合征的临床表现比帕金森病更复杂，帕金森叠加综合征的临床表现比帕金森病更复杂，帕金森叠加综合征的临床表现比帕金森


In [None]:
def prepare_dataset(data_path):
    """逐行加载JSON数据，跳过错误行"""
    df_data = []
    valid_rows = 0

    with open(data_path, 'r', encoding='utf-8') as f:
        # 逐行读取
        for line in f:
            if valid_rows >= 10000:  # 限制加载100000行
                break

            try:
                # 尝试解析每一行JSON
                item = json.loads(line.strip())

                if 'instruction' in item and 'output' in item:
                    if item.get('input'):
                        prompt = f"指令: {item['instruction']}\n输入: {item['input']}\n输出: "
                    else:
                        prompt = f"指令: {item['instruction']}\n输出: "

                    df_data.append({
                        "prompt": prompt,
                        "response": item['output'],
                        "text": prompt + item['output']
                    })
                    valid_rows += 1

                    if valid_rows % 10000 == 0:  # 每处理10000行打印进度
                        print(f"已处理 {valid_rows} 行有效数据")

            except (json.JSONDecodeError, UnicodeError, KeyError):
                # 跳过解析错误的行
                continue

    print(f"总共成功加载 {valid_rows} 行有效数据")
    return pd.DataFrame(df_data)

In [None]:
# 2. 数据预处理和标记化
def tokenize_function(examples):
    """将文本标记化为模型输入"""
    # 对于自回归训练，我们使用text字段
    model_inputs = tokenizer(
        examples["text"],
        max_length=512,
        padding="max_length",
        truncation=True,
        return_tensors="pt"
    )

    # 创建标签 (与输入相同，用于自回归训练)
    model_inputs["labels"] = model_inputs["input_ids"].clone()

    return model_inputs

In [None]:
def tokenize_function(examples):
    """将文本标记化为模型输入"""
    # 确保最大长度合理且启用截断和填充
    tokenized = tokenizer(
        examples["text"],
        padding='max_length',
        truncation=True,
        max_length=512,
        return_tensors=None  # 重要：不要在这里转换为张量
    )

    # 创建标签
    tokenized["labels"] = tokenized["input_ids"].copy()

    return tokenized

# 使用方式
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=dataset.column_names  # 移除原始列
)

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [None]:
import os
import torch
import json
import pandas as pd
from datasets import Dataset
from transformers import (
    Trainer,
    TrainingArguments,
    DataCollatorForSeq2Seq
)
from peft import (
    get_peft_model,
    LoraConfig,
    TaskType,
    prepare_model_for_kbit_training
)
data_path = "/content/train_zh_0.json"
output_dir = "/content/peft_model"

    # 准备数据集
df = prepare_dataset(data_path)

已处理 10000 行有效数据
总共成功加载 10000 行有效数据


In [None]:
output_dir = "/content/peft_model"

In [None]:
df=df[:10000]
dataset = Dataset.from_pandas(df)

In [None]:
tokenized_dataset = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [None]:
tokenized_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 10000
})

In [None]:
for i, item in enumerate(tokenized_dataset[:5]):
    print(f"\nSample {i+1}:")
    print(item)


Sample 1:
input_ids

Sample 2:
attention_mask

Sample 3:
labels


In [None]:
def print_model_structure(model):
    """打印模型结构，找出可用的模块名称"""
    print("模型中的所有模块名称：")
    for name, _ in model.named_modules():
        if any(key in name for key in ['query', 'key', 'value', 'attention', 'dense']):
            print(f"- {name}")

def get_target_modules(model):
    """自动检测并返回适合的目标模块列表"""
    target_modules = []

    # 检查模型类型和结构
    model_modules = dict(model.named_modules())

    # LLaMA 系列模型
    if any('q_proj' in name for name in model_modules):
        target_modules = ["q_proj", "v_proj", "k_proj", "o_proj"]

    # Bloom 系列模型
    elif any('query_key_value' in name for name in model_modules):
        target_modules = ["query_key_value"]

    # GPT 系列模型
    elif any('c_attn' in name for name in model_modules):
        target_modules = ["c_attn", "c_proj"]

    # Falcon 系列模型
    elif any('query_key_value' in name for name in model_modules):
        target_modules = ["query_key_value", "dense", "dense_h_to_4h", "dense_4h_to_h"]

    # 如果没有找到匹配的模块，打印所有可能的注意力层
    if not target_modules:
        print("未找到预定义模块，请检查以下可能的模块：")
        for name in model_modules:
            if any(key in name for key in ['query', 'key', 'value', 'attention', 'dense']):
                print(f"- {name}")

    return target_modules

target_modules = get_target_modules(model)
print(f"检测到的目标模块: {target_modules}")


检测到的目标模块: ['q_proj', 'v_proj', 'k_proj', 'o_proj']


In [None]:
# 3. 配置LoRA
def setup_peft_model(model):
    """配置并返回PEFT模型"""
    # 确保模型不可以进行梯度计算，减少显存
    for param in model.parameters():
        param.requires_grad = False

    # 配置LoRA


    # peft_config = LoraConfig(
    #     task_type=TaskType.CAUSAL_LM,
    #     inference_mode=False,
    #     r=8,  # LoRA的秩
    #     lora_alpha=32,
    #     lora_dropout=0.1,
    #     # 根据您的模型架构调整目标模块
    #     target_modules=['q_proj', 'v_proj', 'k_proj', 'o_proj']
    # )
    peft_config = LoraConfig(
    r=8,                               # Reduce rank
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],#这是需要进行lora更改的部分参数
    bias="none",
    task_type="CAUSAL_LM",
    inference_mode=False,
    lora_dropout=0.05
)

    # 准备模型进行训练
    model = prepare_model_for_kbit_training(model)
    model = get_peft_model(model, peft_config)

    # 打印可训练参数比例
    model.print_trainable_parameters()

    return model


In [None]:
peft_model = setup_peft_model(model)

trainable params: 3,932,160 || all params: 6,914,297,856 || trainable%: 0.0569


In [None]:
peft_model.print_trainable_parameters()

trainable params: 3,932,160 || all params: 6,914,297,856 || trainable%: 0.0569


In [None]:
# 4. 训练设置
def train_model(model, train_dataset, output_dir="./peft_model"):
    """设置训练参数并训练模型"""
    training_args = TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=16,
        gradient_accumulation_steps=4,
        num_train_epochs=1,
        learning_rate=2e-4,
        fp16=True,
        logging_steps=100,
        save_steps=500,
        save_total_limit=3,
        remove_unused_columns=False,
    )

    # 数据整理器
    data_collator = DataCollatorForSeq2Seq(
        tokenizer,
        model=model,
        label_pad_token_id=-100,
        pad_to_multiple_of=8
    )

    # 初始化训练器
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        data_collator=data_collator,
    )

    # 开始训练
    trainer.train()

    # 保存模型
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

    return model

In [None]:
# 设置PEFT模型

peft_model = setup_peft_model(model)

# 训练模型
trained_model = train_model(peft_model, tokenized_dataset, output_dir)

trainable params: 3,932,160 || all params: 6,914,297,856 || trainable%: 0.0569


  return fn(*args, **kwargs)


OutOfMemoryError: CUDA out of memory. Tried to allocate 3.12 GiB. GPU 0 has a total capacity of 39.56 GiB of which 2.81 GiB is free. Process 222294 has 36.74 GiB memory in use. Of the allocated memory 27.47 GiB is allocated by PyTorch, and 8.76 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
# 方法1：查看内存中的参数大小
def print_model_size(model, model_name):
    param_count = sum(p.numel() for p in model.parameters())
    buffer_size = sum(b.numel() * b.element_size() for b in model.buffers())
    total_size = (param_count * 4) + buffer_size  # 假设float32（4字节）
    print(f"{model_name} 大小: {total_size/1024**2:.2f} MB")

print_model_size(trained_model, "训练后的Adapter")

训练后的Adapter 大小: 14780.95 MB


In [None]:
# 方法1：查看内存中的参数大小
def print_model_size(model, model_name):
    param_count = sum(p.numel() for p in model.parameters())
    buffer_size = sum(b.numel() * b.element_size() for b in model.buffers())
    total_size = (param_count * 4) + buffer_size  # 假设float32（4字节）
    print(f"{model_name} 大小: {total_size/1024**2:.2f} MB")

print_model_size(model, "训练后的Adapter")

训练后的Adapter 大小: 14780.95 MB


In [None]:
print("=== 可训练参数检查 ===")
trainable_params = [n for n, p in trained_model.named_parameters() if p.requires_grad]
print("可训练参数数量:", len(trainable_params))

=== 可训练参数检查 ===
可训练参数数量: 0


In [None]:
trained_model.save_pretrained(
    "/content/saved_adapter",
    safe_serialization=True,
    save_embedding_layers=False  # 确保不保存基础模型参数
)

In [None]:
from peft import PeftModel
trained_model.save_pretrained("/content/saved_adapter")

# Now load and use the model
merged_model = PeftModel.from_pretrained(model, "/content/saved_adapter")
merged_model = merged_model.merge_and_unload()




In [None]:
text = "得了脂肪肝会医治不"
inputs = tokenizer(text, return_tensors="pt")
outputs = merged_model.generate(**inputs.to(merged_model.device), max_new_tokens=100)

result = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(result)

得了脂肪肝会医治不愈吗？
脂肪肝是肝脏疾病中的一种，脂肪肝的医治是许多患者都十分关怀的论题，那么得了脂肪肝会医治不愈吗？
脂肪肝的医治是许多患者都十分关怀的论题，那么得了脂肪肝会医治不愈吗？
脂肪肝的医治是许多患者都十分关怀的论题，那么得了脂肪肝会医治不愈吗？脂肪肝的医治是许多患者


In [None]:
text = "得了脂肪肝会医治不"
inputs = tokenizer(text, return_tensors="pt")
outputs = model.generate(**inputs.to(model.device), max_new_tokens=100)

result = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(result)

得了脂肪肝会医治不愈吗？
脂肪肝是肝脏疾病中的一种，脂肪肝的医治是许多患者都十分关怀的论题，那么得了脂肪肝会医治不愈吗？
脂肪肝的医治是许多患者都十分关怀的论题，那么得了脂肪肝会医治不愈吗？
脂肪肝的医治是许多患者都十分关怀的论题，那么得了脂肪肝会医治不愈吗？脂肪肝的医治是许多患者


In [None]:

# 2. 检查模型参数是否有区别
def compare_models(base_model, merged_model):
    print("\n=== 参数对比 ===")
    for (name1, p1), (name2, p2) in zip(base_model.named_parameters(),
                                       merged_model.named_parameters()):
        if not torch.equal(p1, p2):
            print(f"参数 {name1} 存在差异")
            print(f"最大差异值: {(p1 - p2).abs().max().item()}")

compare_models(model, merged_model)


=== 参数对比 ===


In [None]:
print("\n=== 检查模型的base_model ===")
if hasattr(model, 'base_model'):
    for name, _ in model.base_model.named_parameters():
        print(name)


=== 检查模型的base_model ===
embed_tokens.weight
layers.0.self_attn.q_proj.weight
layers.0.self_attn.k_proj.weight
layers.0.self_attn.v_proj.weight
layers.0.self_attn.o_proj.weight
layers.0.mlp.gate_proj.weight
layers.0.mlp.up_proj.weight
layers.0.mlp.down_proj.weight
layers.0.input_layernorm.weight
layers.0.post_attention_layernorm.weight
layers.1.self_attn.q_proj.weight
layers.1.self_attn.k_proj.weight
layers.1.self_attn.v_proj.weight
layers.1.self_attn.o_proj.weight
layers.1.mlp.gate_proj.weight
layers.1.mlp.up_proj.weight
layers.1.mlp.down_proj.weight
layers.1.input_layernorm.weight
layers.1.post_attention_layernorm.weight
layers.2.self_attn.q_proj.weight
layers.2.self_attn.k_proj.weight
layers.2.self_attn.v_proj.weight
layers.2.self_attn.o_proj.weight
layers.2.mlp.gate_proj.weight
layers.2.mlp.up_proj.weight
layers.2.mlp.down_proj.weight
layers.2.input_layernorm.weight
layers.2.post_attention_layernorm.weight
layers.3.self_attn.q_proj.weight
layers.3.self_attn.k_proj.weight
layers.3.s

In [None]:
print("\n=== 检查peft_config ===")
if hasattr(model, 'peft_config'):
    print(model.peft_config)
    print("\n目标模块:", model.peft_config.get('default').target_modules)


=== 检查peft_config ===
{'default': LoraConfig(task_type='CAUSAL_LM', peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path='deepseek-ai/deepseek-llm-7b-base', revision=None, inference_mode=True, r=8, target_modules={'v_proj', 'q_proj'}, exclude_modules=None, lora_alpha=16, lora_dropout=0.05, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', loftq_config={}, eva_config=None, use_dora=False, layer_replication=None, runtime_config=LoraRuntimeConfig(ephemeral_gpu_offload=False), lora_bias=False)}

目标模块: {'v_proj', 'q_proj'}


In [None]:

# 2. 数据预处理和标记化
def tokenize_function(examples):
    """将文本标记化为模型输入"""
    # 对于自回归训练，我们使用text字段
    model_inputs = tokenizer(
        examples["text"],
        max_length=512,
        padding="max_length",
        truncation=True,
        return_tensors="pt"
    )

    # 创建标签 (与输入相同，用于自回归训练)
    model_inputs["labels"] = model_inputs["input_ids"].clone()

    return model_inputs

In [None]:
import os
import torch
import json
import pandas as pd
from datasets import Dataset
from transformers import (
    Trainer,
    TrainingArguments,
    DataCollatorForSeq2Seq
)
from peft import (
    get_peft_model,
    LoraConfig,
    TaskType,
    prepare_model_for_kbit_training
)

# 假设您已经加载了tokenizer和model
# tokenizer = AutoTokenizer.from_pretrained("your_model_name")
# model = AutoModelForCausalLM.from_pretrained("your_model_name")

def prepare_dataset(data_path):
    """逐行加载JSON数据，跳过错误行"""
    df_data = []
    valid_rows = 0

    with open(data_path, 'r', encoding='utf-8') as f:
        # 逐行读取
        for line in f:
            if valid_rows >= 100000:  # 限制加载100000行
                break

            try:
                # 尝试解析每一行JSON
                item = json.loads(line.strip())

                if 'instruction' in item and 'output' in item:
                    if item.get('input'):
                        prompt = f"指令: {item['instruction']}\n输入: {item['input']}\n输出: "
                    else:
                        prompt = f"指令: {item['instruction']}\n输出: "

                    df_data.append({
                        "prompt": prompt,
                        "response": item['output'],
                        "text": prompt + item['output']
                    })
                    valid_rows += 1

                    if valid_rows % 10000 == 0:  # 每处理10000行打印进度
                        print(f"已处理 {valid_rows} 行有效数据")

            except (json.JSONDecodeError, UnicodeError, KeyError):
                # 跳过解析错误的行
                continue

    print(f"总共成功加载 {valid_rows} 行有效数据")
    return pd.DataFrame(df_data)

# 2. 数据预处理和标记化
def tokenize_function(examples):
    """将文本标记化为模型输入"""
    # 对于自回归训练，我们使用text字段
    model_inputs = tokenizer(
        examples["text"],
        max_length=512,
        padding="max_length",
        truncation=True,
        return_tensors="pt"
    )

    # 创建标签 (与输入相同，用于自回归训练)
    model_inputs["labels"] = model_inputs["input_ids"].clone()

    return model_inputs

# 3. 配置LoRA
def setup_peft_model(model):
    """配置并返回PEFT模型"""
    # 确保模型可以进行梯度计算
    for param in model.parameters():
        param.requires_grad = False

    # 配置LoRA
    peft_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        inference_mode=False,
        r=8,  # LoRA的秩
        lora_alpha=32,
        lora_dropout=0.1,
        # 根据您的模型架构调整目标模块
        target_modules=["query_key_value", "dense", "dense_h_to_4h", "dense_4h_to_h"]
    )

    # 准备模型进行训练
    model = prepare_model_for_kbit_training(model)
    model = get_peft_model(model, peft_config)

    # 打印可训练参数比例
    model.print_trainable_parameters()

    return model

# 4. 训练设置
def train_model(model, train_dataset, output_dir="./peft_model"):
    """设置训练参数并训练模型"""
    training_args = TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=4,
        gradient_accumulation_steps=4,
        num_train_epochs=3,
        learning_rate=2e-4,
        fp16=True,
        logging_steps=100,
        save_steps=500,
        save_total_limit=3,
        remove_unused_columns=False,
    )

    # 数据整理器
    data_collator = DataCollatorForSeq2Seq(
        tokenizer,
        model=model,
        label_pad_token_id=-100,
        pad_to_multiple_of=8
    )

    # 初始化训练器
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        data_collator=data_collator,
    )

    # 开始训练
    trainer.train()

    # 保存模型
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

    return model

# 5. 主函数
def main():
    # 数据路径
    data_path = "/content/train_zh_0.json"
    output_dir = "d:\\projects\\agent\\peft_model"

    # 准备数据集
    df = prepare_dataset(data_path)
    dataset = Dataset.from_pandas(df)

    # 标记化数据集
    tokenized_dataset = dataset.map(tokenize_function, batched=True)

    # 设置PEFT模型
    peft_model = setup_peft_model(model)

    # 训练模型
    trained_model = train_model(peft_model, tokenized_dataset, output_dir)

    # 测试生成
    test_instruction = "卵巢癌肉瘤的影像学检查有些什么？"
    test_prompt = f"指令: {test_instruction}\n输出: "

    inputs = tokenizer(test_prompt, return_tensors="pt").to(model.device)
    outputs = trained_model.generate(
        **inputs,
        max_new_tokens=100,
        temperature=0.7,
        top_p=0.9,
        do_sample=True
    )

    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(f"指令: {test_instruction}")
    print(f"生成回答: {response}")

if __name__ == "__main__":
    main()

In [None]:
# 在训练前执行内存优化
!export PYTORCH_CUDA_ALLOC_CONF="garbage_collection_threshold:0.8"
!nvidia-smi --gpu-reset-delay=1

ERROR: Option --gpu-reset-delay=1 is not recognized. Please run 'nvidia-smi -h'.



In [None]:
!pip install  datasets
!pip install -U bitsandbytes

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.2-py3-none-any.whl (485 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading

In [None]:
# Install required packages

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig, BitsAndBytesConfig
import os
import torch
import json
import pandas as pd
from datasets import Dataset
from transformers import (
    Trainer,
    TrainingArguments,
    DataCollatorForSeq2Seq
)
from peft import (
    get_peft_model,
    LoraConfig,
    TaskType,
    prepare_model_for_kbit_training
)
from peft import PeftModel

In [None]:

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,#在计算的时候需要还原到16bit
    bnb_4bit_use_double_quant=True
)


model_name = "deepseek-ai/deepseek-llm-7b-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16, quantization_config=bnb_config,device_map="auto")
model.generation_config = GenerationConfig.from_pretrained(model_name)
model.generation_config.pad_token_id = model.generation_config.eos_token_id# 由于批处理需要padding，然后可以用eos， end of speech作为pading

text = "曲匹地尔片的用法用量"
#inputs = tokenizer(text, return_tensors="pt")
#outputs = model.generate(**inputs.to(model.device), max_new_tokens=100)

#result = tokenizer.decode(outputs[0], skip_special_tokens=True)
#print(result)


def prepare_dataset(data_path):
    """逐行加载JSON数据，跳过错误行"""
    df_data = []
    valid_rows = 0

    with open(data_path, 'r', encoding='utf-8') as f:
        # 逐行读取
        for line in f:
            if valid_rows >= 1000:  # 限制加载100000行
                break

            try:
                # 尝试解析每一行JSON
                item = json.loads(line.strip())

                if 'instruction' in item and 'output' in item:
                    if item.get('input'):
                        prompt = f"指令: {item['instruction']}\n输入: {item['input']}\n输出: "
                    else:
                        prompt = f"指令: {item['instruction']}\n输出: "

                    df_data.append({
                        "prompt": prompt,
                        "response": item['output'],
                        "text": prompt + item['output']
                    })
                    valid_rows += 1

                    if valid_rows % 10000 == 0:  # 每处理10000行打印进度
                        print(f"已处理 {valid_rows} 行有效数据")

            except (json.JSONDecodeError, UnicodeError, KeyError):
                # 跳过解析错误的行
                continue

    print(f"总共成功加载 {valid_rows} 行有效数据")
    return pd.DataFrame(df_data)




def tokenize_function(examples):
    """将文本标记化为模型输入"""
    # 确保最大长度合理且启用截断和填充
    tokenized = tokenizer(
        examples["text"],
        padding='max_length',
        truncation=True,
        max_length=512,
        return_tensors=None  # 重要：不要在这里转换为张量
    )

    # 创建标签
    tokenized["labels"] = tokenized["input_ids"].copy()

    return tokenized

# 3. 配置LoRA
def setup_peft_model(model):
    """配置并返回PEFT模型"""
    # 确保模型不可以进行梯度计算，减少显存
    for param in model.parameters():
        param.requires_grad = False

    # 配置LoRA


    # peft_config = LoraConfig(
    #     task_type=TaskType.CAUSAL_LM,
    #     inference_mode=False,
    #     r=8,  # LoRA的秩
    #     lora_alpha=32,
    #     lora_dropout=0.1,
    #     # 根据您的模型架构调整目标模块
    #     target_modules=['q_proj', 'v_proj', 'k_proj', 'o_proj']
    # )
    peft_config = LoraConfig(
    r=8,                               # Reduce rank
    lora_alpha=16,
    target_modules=["q_proj", "v_proj","k_proj", "o_proj"],#这是需要进行lora更改的部分参数
    bias="none",
    task_type="CAUSAL_LM",
    inference_mode=False,
    lora_dropout=0.05
)

    # 准备模型进行训练
    model = prepare_model_for_kbit_training(model)
    model = get_peft_model(model, peft_config)

    # 打印可训练参数比例
    model.print_trainable_parameters()

    return model

# 4. 训练设置
def train_model(model, train_dataset, output_dir="./peft_model"):
    """设置训练参数并训练模型"""
    training_args = TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=4,
        gradient_accumulation_steps=4,
        num_train_epochs=1,
        learning_rate=2e-4,
        fp16=True,
        logging_steps=100,
        save_steps=500,
        save_total_limit=3,
        remove_unused_columns=False,
    )

    # 数据整理器
    data_collator = DataCollatorForSeq2Seq(
        tokenizer,
        model=model,
        label_pad_token_id=-100,
        pad_to_multiple_of=8
    )

    # 初始化训练器
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        data_collator=data_collator,
    )

    # 开始训练
    trainer.train()

    # 保存模型
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

    return model


import os
import torch
import json
import pandas as pd
from datasets import Dataset
from transformers import (
    Trainer,
    TrainingArguments,
    DataCollatorForSeq2Seq
)
from peft import (
    get_peft_model,
    LoraConfig,
    TaskType,
    prepare_model_for_kbit_training
)
# 修改后（推荐使用绝对路径）
data_path = "/content/train_zh_0.json"  # 确保文件实际存在
output_dir = "/content/peft_model"

    # 准备数据集
df = prepare_dataset(data_path)

dataset = Dataset.from_pandas(df)
# 使用方式
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=dataset.column_names  # 移除原始列
)




peft_model = setup_peft_model(model)

# 添加在setup_peft_model之后
print("\n=== 可训练参数检查 ===")
for name, param in peft_model.named_parameters():
    if param.requires_grad:
        print(f"{name} | Shape: {param.shape}")


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

总共成功加载 1000 行有效数据


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

trainable params: 7,864,320 || all params: 6,918,230,016 || trainable%: 0.1137

=== 可训练参数检查 ===
base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight | Shape: torch.Size([8, 4096])
base_model.model.model.layers.0.self_attn.q_proj.lora_B.default.weight | Shape: torch.Size([4096, 8])
base_model.model.model.layers.0.self_attn.k_proj.lora_A.default.weight | Shape: torch.Size([8, 4096])
base_model.model.model.layers.0.self_attn.k_proj.lora_B.default.weight | Shape: torch.Size([4096, 8])
base_model.model.model.layers.0.self_attn.v_proj.lora_A.default.weight | Shape: torch.Size([8, 4096])
base_model.model.model.layers.0.self_attn.v_proj.lora_B.default.weight | Shape: torch.Size([4096, 8])
base_model.model.model.layers.0.self_attn.o_proj.lora_A.default.weight | Shape: torch.Size([8, 4096])
base_model.model.model.layers.0.self_attn.o_proj.lora_B.default.weight | Shape: torch.Size([4096, 8])
base_model.model.model.layers.1.self_attn.q_proj.lora_A.default.weight | Shape: torch.Si

In [None]:

trained_model = train_model(peft_model, tokenized_dataset, output_dir)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mchrispeng912[0m ([33mchrispeng912-IceKredit[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)


Step,Training Loss


In [None]:
trained_model.save_pretrained("/content/saved_adapter")

# Now load and use the model
merged_model = PeftModel.from_pretrained(model, "/content/saved_adapter")
merged_model = merged_model.merge_and_unload()




In [None]:
text = "得了脂肪肝会医治不"
inputs = tokenizer(text, return_tensors="pt")
outputs = merged_model.generate(**inputs.to(merged_model.device), max_new_tokens=100)

result = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(result)

得了脂肪肝会医治不愈吗？
脂肪肝是肝脏疾病中的一种，脂肪肝的医治是许多患者都十分关怀的问题，由于脂肪肝的医治不及时，会致使脂肪肝的病况加剧，乃至会致使脂肪肝的医治不愈，那么，得了脂肪肝会医治不愈吗？
脂肪肝的医治不及时，会致使脂肪肝的病况加剧，乃至会致使脂肪肝的医治不愈，因而，脂肪肝的


In [None]:
text = "得了脂肪肝会医治不"
inputs = tokenizer(text, return_tensors="pt")
outputs = model.generate(**inputs.to(model.device), max_new_tokens=100)

result = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(result)

得了脂肪肝会医治不愈吗？
脂肪肝是肝脏疾病中的一种，脂肪肝的医治是许多患者都十分关怀的问题，由于脂肪肝的医治不及时，会致使脂肪肝的病况加剧，乃至会致使脂肪肝的医治不愈，那么，得了脂肪肝会医治不愈吗？
脂肪肝的医治不及时，会致使脂肪肝的病况加剧，乃至会致使脂肪肝的医治不愈，因而，脂肪肝的


# test different code


In [None]:
%%capture
%pip install accelerate peft bitsandbytes transformers trl

In [None]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig
from trl import SFTTrainer

In [None]:
# Model from Hugging Face hub
base_model = "NousResearch/Llama-2-7b-chat-hf"

# New instruction dataset
guanaco_dataset = "mlabonne/guanaco-llama2-1k"

# Fine-tuned model
new_model = "llama-2-7b-chat-guanaco"

In [None]:
dataset = load_dataset(guanaco_dataset, split="train")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

(…)-00000-of-00001-9ad84bb9cf65a42f.parquet:   0%|          | 0.00/967k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [None]:
compute_dtype = getattr(torch, "float16")

quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=False,
)

In [None]:
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=quant_config,
    device_map={"": 0}
)
model.config.use_cache = False
model.config.pretraining_tp = 1

config.json:   0%|          | 0.00/583 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

In [None]:
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

tokenizer_config.json:   0%|          | 0.00/746 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/435 [00:00<?, ?B/s]

In [None]:
peft_params = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
)

In [None]:
training_params = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    save_steps=25,
    logging_steps=25,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=False,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant",
    report_to="tensorboard"
)

In [None]:

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)
#df = Dataset.from_pandas(dataset)
tokenized_dataset = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no padding.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [None]:
trainer = SFTTrainer(
    model=model,
    train_dataset=tokenized_dataset,
    peft_config=peft_params,
  #  dataset_text_field="text",
   # max_seq_length=None,
    tokenizer=tokenizer,
    args=training_params,
   # packing=False,
)

  trainer = SFTTrainer(


Converting train dataset to ChatML:   0%|          | 0/1000 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [None]:
trainer.train()

OutOfMemoryError: CUDA out of memory. Tried to allocate 64.00 MiB. GPU 0 has a total capacity of 39.56 GiB of which 54.88 MiB is free. Process 19220 has 39.49 GiB memory in use. Of the allocated memory 37.53 GiB is allocated by PyTorch, and 1.47 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:

prompt = "曲匹地尔片的用法用量"
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]['generated_text'])

Device set to use cuda:0


<s>[INST] 曲匹地尔片的用法用量 [/INST]  Citalopram is an antidepressant medication that belongs to a class of drugs called selective serotonin reuptake inhibitors (SSRIs). It is used to treat depression, anxiety disorders, and some other mental health conditions. Unterscheidung between citalopram and other SSRIs is that it has a longer duration of action compared to other SSRIs, meaning it stays in the body for a longer period of time.

The typical dosage of citalopram for adults is 20-40 mg per day, taken orally, usually in the morning or evening. The dosage may be increased gradually over time as needed, under the supervision of a healthcare provider. It is important to follow the dosage instructions carefully and not to stop taking the medication without consult


# test llama3b

In [None]:
!huggingface-cli login
#resource
#https://www.datacamp.com/tutorial/fine-tuning-llama-3-2


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) Y
Token is valid (permission: write).
The token `testllama3` has been saved to /root/.cache/huggingface/stored_tokens
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authent

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, TextStreamer
import torch


base_model = "meta-llama/Llama-3.2-3B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)

model = AutoModelForCausalLM.from_pretrained(
    base_model,
    return_dict=True,
    low_cpu_mem_usage=True,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True,
)

KeyboardInterrupt: 

In [None]:
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id
if model.config.pad_token_id is None:
    model.config.pad_token_id = model.config.eos_token_id
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.float16,
    device_map="auto",
)

In [None]:
instruction = """You are a top-rated customer service agent named John.
    Be polite to customers and answer all their questions.
    """

messages = [{"role": "system", "content": instruction},
    {"role": "user", "content": "I cannot afford this order, cancel purchase {{Order Number}}"}]


prompt = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)

outputs = pipe(prompt, max_new_tokens=120, do_sample=True)

print(outputs[0]["generated_text"])

In [None]:

%%capture
%pip install datasets accelerate peft bitsandbytes transformers trl wandb


from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import (
    LoraConfig,
    PeftModel,
    prepare_model_for_kbit_training,
    get_peft_model,
)
import os, torch, wandb
from datasets import load_dataset
from trl import SFTTrainer, setup_chat_format

In [None]:
wandb.login(key='
run = wandb.init(
    project='Fine-tune Llama 3.2 on Customer Support Dataset',
    job_type="training",
    anonymous="allow"
)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mchrispeng912[0m ([33mchrispeng912-IceKredit[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
new_model = "llama-3.2-3b-it-Ecommerce-ChatBot"
dataset_name = "bitext/Bitext-customer-support-llm-chatbot-training-dataset"

if torch.cuda.get_device_capability()[0] >= 8:
    !pip install -qqq flash-attn
    torch_dtype = torch.bfloat16
    attn_implementation = "flash_attention_2"
else:
    torch_dtype = torch.float16
    attn_implementation = "eager"



In [None]:
# QLoRA config
base_model="meta-llama/Llama-3.2-3B-Instruct"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch_dtype,
    bnb_4bit_use_double_quant=True,
)
# Load model
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map="auto",
    attn_implementation=attn_implementation
)



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/878 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

In [None]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
# Set pad_token to be the same as eos_token
tokenizer.pad_token = tokenizer.eos_token

tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

In [None]:
#Importing the dataset
dataset = load_dataset(dataset_name, split="train")
dataset = dataset.shuffle(seed=65).select(range(2000)) # Only use 1000 samples for quick demo
instruction = """You are a top-rated customer service agent named John.
    Be polite to customers and answer all their questions.
    """
def format_chat_template(row):

    row_json = [{"role": "system", "content": instruction },
               {"role": "user", "content": row["instruction"]},
               {"role": "assistant", "content": row["response"]}]

    row["text"] = tokenizer.apply_chat_template(row_json, tokenize=False) #this will turn json to pure text with special token to separate the roles.
    return row

dataset_train = dataset.map(
    format_chat_template,
    num_proc= 4,
)

dataset_test = dataset.shuffle(seed=77).select(range(200))
dataset_test = dataset_test.map(
    format_chat_template,
    num_proc= 4,
)

README.md:   0%|          | 0.00/11.9k [00:00<?, ?B/s]

(…)t_Training_Dataset_27K_responses-v11.csv:   0%|          | 0.00/19.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/26872 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/2000 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/200 [00:00<?, ? examples/s]

In [None]:
def conversation_template(row):

    row_json = """{"role": "system", "content": instruction },
               {"role": "user", "content": row["instruction"]},
               {"role": "assistant", "content": row["response"]}"""

    row["text"] = row_json #this will turn json to pure text with special token to separate the roles.
    return row
dataset = dataset.map(
    conversation_template,
    num_proc= 4,
)

Map (num_proc=4):   0%|          | 0/2000 [00:00<?, ? examples/s]

In [None]:
import bitsandbytes as bnb
# this part???? if just use k ,q, v
def find_all_linear_names(model):
    cls = bnb.nn.Linear4bit
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])
    if 'lm_head' in lora_module_names:  # needed for 16 bit
        lora_module_names.remove('lm_head')
    return list(lora_module_names)

modules = find_all_linear_names(model)

In [None]:
# LoRA config
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=modules
)
#model, tokenizer = setup_chat_format(model, tokenizer)
model = get_peft_model(model, peft_config)
#Hyperparamter
training_arguments = TrainingArguments(
    output_dir=new_model,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    optim="paged_adamw_32bit",
    num_train_epochs=1,
    eval_strategy="steps",
    eval_steps=0.2,
    logging_steps=1,
    warmup_steps=10,
    logging_strategy="steps",
    learning_rate=2e-4,
    fp16=False,
    bf16=False,
    group_by_length=True,
    report_to="wandb"
)

# Setting sft parameters
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset_train,
    eval_dataset=dataset_test,
    peft_config=peft_config,
  #  max_seq_length= 512,  # it is not listed in TrainingArguments, could be set in model or tokenizer
   # dataset_text_field="text",
    tokenizer=tokenizer,
    args=training_arguments,
   # packing= False,  ## if some input is short pack them to one to more efficient
)

  trainer = SFTTrainer(


Converting train dataset to ChatML:   0%|          | 0/2000 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/2000 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/2000 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/2000 [00:00<?, ? examples/s]

Converting eval dataset to ChatML:   0%|          | 0/200 [00:00<?, ? examples/s]

Applying chat template to eval dataset:   0%|          | 0/200 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/200 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/200 [00:00<?, ? examples/s]

In [None]:
type(dataset)

In [None]:
trainer.train()



Step,Training Loss,Validation Loss
200,0.6552,0.662792
400,0.6284,0.569746
600,0.5458,0.518583
800,0.4055,0.485124
1000,0.4003,0.469618


TrainOutput(global_step=1000, training_loss=0.631606174916029, metrics={'train_runtime': 1114.0356, 'train_samples_per_second': 1.795, 'train_steps_per_second': 0.898, 'total_flos': 6691315230842880.0, 'train_loss': 0.631606174916029})

In [None]:
messages = [{"role": "system", "content": instruction},
    {"role": "user", "content": "I can no longer afford order {{Order Number}}, cancel it"}]

prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

inputs = tokenizer(prompt, return_tensors='pt', padding=True, truncation=True).to("cuda")

outputs = model.generate(**inputs, max_new_tokens=300, num_return_sequences=1)

text = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(text.split("assistant")[1])



I'm sorry to hear that you're facing financial difficulties and need to cancel your order with the order number {{Order Number}}. We understand that unforeseen circumstances can arise, and we're here to assist you. To proceed with the cancellation, please provide us with your order details, including your full name, email address, and any other relevant information. Our team will promptly process your request and ensure that your order is cancelled successfully. If you have any further questions or concerns, please don't hesitate to let us know. We appreciate your trust in us, and we're committed to finding a solution that works for you. Thank you for reaching out to us. How can I assist you further?


# fine tune a paper


In [None]:
#get data
!pip install langchain arxiv pymupdf

from langchain.document_loaders import ArxivLoader

docs = ArxivLoader(query="2311.06242").load()

from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1000,
    chunk_overlap  = 100
)


splits = text_splitter.split_documents(docs)