In [1]:
from datasets import load_dataset
from random import randrange
 
# Load the dataset from the Hub; only the "train" split is requested
DATASET_ID = "databricks/databricks-dolly-15k"
dataset = load_dataset(DATASET_ID, split="train")

Request 7f4aa450-1e1a-4b67-b994-8216059b8731: GET https://hf-mirror.com/api/datasets/databricks/databricks-dolly-15k (authenticated: False)
Request 146d425b-59b2-49ec-a4ca-0ad59710ca4a: GET https://hf-mirror.com/api/datasets/databricks/databricks-dolly-15k (authenticated: False)
Request 59804eea-0348-4f4c-97d1-5295c8ff8ae4: GET https://hf-mirror.com/api/datasets/databricks/databricks-dolly-15k/revision/bdd27f4d94b9c1f951818a7da7fd7aeea5dbff1a (authenticated: False)
Request 2f6da5ec-6b8f-4c04-b6eb-df5e706c5303: POST https://hf-mirror.com/api/datasets/databricks/databricks-dolly-15k/paths-info/bdd27f4d94b9c1f951818a7da7fd7aeea5dbff1a (authenticated: False)
Request 63ff88d8-9ad3-47fd-ba96-6cc2a0302d4d: GET https://hf-mirror.com/api/datasets/databricks/databricks-dolly-15k/tree/bdd27f4d94b9c1f951818a7da7fd7aeea5dbff1a/data?recursive=False&expand=False (authenticated: False)
Request 7bc34692-0bed-4021-9d37-7c2a51df7612: GET https://hf-mirror.com/api/datasets/databricks/databricks-dolly-15k/

In [2]:
# Total number of examples in the dataset: 15011
dataset

Dataset({
    features: ['instruction', 'context', 'response', 'category'],
    num_rows: 15011
})

In [3]:
# Pick one record at random and print it as-is
sample_index = randrange(len(dataset))
print(dataset[sample_index])

{'instruction': "What's the most impressive thing happened in the world since you were born?", 'context': '', 'response': 'The day that I was born is just like yesterday and the most impressive thing I saw is that people never learned from history.', 'category': 'brainstorming'}


In [4]:
def format_instruction(sample_data):
    """
    Build the training prompt for a single dataset record.

    The record's 'response' text is placed under ``### Input:`` and its
    'instruction' text under ``### Response:``, so the prompt asks for the
    instruction that could have produced the given text. Any other keys in
    the record are ignored.

    Parameters:
    sample_data (dict): A dictionary containing 'response' and 'instruction' keys.

    Returns:
    str: A formatted string containing the instruction, input, and response.

    Raises:
    ValueError: If 'response' or 'instruction' is missing. Failing loudly keeps
        an error message from being returned as if it were a valid prompt and
        ending up in whatever consumes the formatted text.
    """
    missing_keys = [key for key in ('response', 'instruction') if key not in sample_data]
    if missing_keys:
        raise ValueError(
            f"Missing required key(s) in the input data: {', '.join(missing_keys)}"
        )

    # The template is spelled out line by line so that the trailing space after
    # "LLM." and the single-space separator lines are visible in the code.
    return (
        "### Instruction:\n"
        "Use the Input below to create an instruction, which could have been used to generate the input using an LLM. \n"
        " \n"
        "### Input:\n"
        f"{sample_data['response']}\n"
        " \n"
        "### Response:\n"
        f"{sample_data['instruction']}\n"
    )

In [5]:
# Draw a random example and print it after Alpaca-style formatting
random_sample = dataset[randrange(len(dataset))]
formatted_sample = format_instruction(random_sample)
print(formatted_sample)

### Instruction:
Use the Input below to create an instruction, which could have been used to generate the input using an LLM. 
 
### Input:
agriculture
 
### Response:
What is the chief economic activity of Weselberg?



In [6]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Set use_flash_attention to True only when the hardware supports it and
# flash-attn has been installed successfully.
# NOTE(review): the flag is hard-coded to True while the patch below is
# commented out — confirm nothing downstream relies on it.
use_flash_attention = True

# Uncomment to use flash attention
# if torch.cuda.get_device_capability()[0] >= 8:
#     from utils.llama_patch import replace_attn_with_flash_attn
#     print("Using flash attention")
#     replace_attn_with_flash_attn()
#     use_flash_attention = True

# LLaMA 2-7B model weights.
# This model ID does not require Meta AI approval:
model_id = "NousResearch/Llama-2-7b-hf"
# After approval by Meta AI, this model ID can be used for the download instead:
# model_id = "meta-llama/Llama-2-7b-hf"

# bitsandbytes settings for loading the model quantized:
# 4-bit "nf4" with double quantization, bfloat16 compute dtype
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load the model (the tokenizer follows further down)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    use_cache=False,
    device_map="auto",
)
model.config.pretraining_tp = 1

# # Verify that the model is using flash attention by comparing docstrings
# if use_flash_attention:
#     from utils.llama_patch import forward
#     assert model.model.layers[0].self_attn.forward.__doc__ == forward.__doc__, "Model is not using flash attention"

# The EOS token doubles as the padding token; padding is applied on the right
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

Request 7fe41872-0abe-4c04-a41c-eac8b31b0b21: HEAD https://hf-mirror.com/NousResearch/Llama-2-7b-hf/resolve/main/config.json (authenticated: False)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Request 43719ac9-e485-4dff-8ce4-4fe1b0276cdf: HEAD https://hf-mirror.com/NousResearch/Llama-2-7b-hf/resolve/main/generation_config.json (authenticated: False)
Request 054880b0-5085-4fb3-a33f-a55c51cef593: HEAD https://hf-mirror.com/NousResearch/Llama-2-7b-hf/resolve/main/tokenizer_config.json (authenticated: False)


In [7]:
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
 
# QLoRA adapter configuration
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
)

# Prepare the quantized model for k-bit training, then build the PEFT model
# from it using the QLoRA configuration
model = prepare_model_for_kbit_training(model)
qlora_model = get_peft_model(model, peft_config)

In [8]:
# Print the trainable vs. total parameter counts of the PEFT model
qlora_model.print_trainable_parameters()

trainable params: 8,388,608 || all params: 6,746,804,224 || trainable%: 0.12433454005023165


In [9]:
import datetime
import os
# Root directory for saved models, read from the MODEL_SAVE_PATH environment
# variable (a KeyError is raised here if it is not set)
model_save_path = os.environ["MODEL_SAVE_PATH"]
# Timestamp in YYYYmmdd_HHMMSS form, used to give each run its own directory
timestamp = f"{datetime.datetime.now():%Y%m%d_%H%M%S}"

# Demo switch for the training arguments (set to False for the real training run)
demo_train = False
run_name = f"llama-7-int4-dolly-{timestamp}"
output_dir = f"{model_save_path}/{run_name}"

In [10]:
from transformers import TrainingArguments
 
# Settings that differ between the short demo run and the full training run
if demo_train:
    run_mode_kwargs = dict(
        num_train_epochs=1,
        gradient_accumulation_steps=1,
        save_strategy="steps",
    )
else:
    run_mode_kwargs = dict(
        num_train_epochs=3,
        gradient_accumulation_steps=4,
        save_strategy="epoch",
    )

# NOTE(review): warmup_ratio is set together with lr_scheduler_type="constant";
# the warmup may have no effect with that scheduler — confirm intent.
# NOTE(review): the batch-size remark mentions an Nvidia T4 while bf16=True is
# requested — confirm the target GPU supports bfloat16.
args = TrainingArguments(
    output_dir=output_dir,
    #max_steps=100,
    per_device_train_batch_size=6,  # largest batch size an Nvidia T4 (16GB) can hold, per the original note
    gradient_checkpointing=True,
    optim="paged_adamw_32bit",
    learning_rate=2e-4,
    lr_scheduler_type="constant",
    warmup_ratio=0.03,
    max_grad_norm=0.3,
    bf16=True,
    logging_steps=10,
    save_steps=10,
    **run_mode_kwargs,
)

In [11]:
from trl import SFTTrainer
 
# Maximum sequence length for the dataset
# (the original note reports 1158 training examples after filtering)
max_seq_length = 2048

# NOTE(review): qlora_model was already produced by get_peft_model, and
# peft_config is passed here as well — confirm the installed TRL version
# accepts both together.
trainer = SFTTrainer(
    model=qlora_model,
    args=args,
    train_dataset=dataset,
    formatting_func=format_instruction,
    tokenizer=tokenizer,
    peft_config=peft_config,
    max_seq_length=max_seq_length,
    packing=True,
)

In [12]:
# Start fine-tuning with the SFTTrainer configured above
trainer.train()



Step,Training Loss
10,1.5977
20,1.3702
30,1.3012
40,1.2655
50,1.2225
60,1.2337
70,1.2219
80,1.226
90,1.1927
100,1.1951




TrainOutput(global_step=144, training_loss=1.256234135892656, metrics={'train_runtime': 3436.6207, 'train_samples_per_second': 1.011, 'train_steps_per_second': 0.042, 'total_flos': 2.8095247031677747e+17, 'train_loss': 1.256234135892656, 'epoch': 2.98})

In [13]:
# Save the trained model; no path is passed, so presumably the trainer's
# configured output_dir is used — confirm
trainer.save_model()