In [1]:
!pip install -q -U transformers accelerate datasets bitsandbytes einops wandb trl peft scikit-learn

[0m

# 导入由弱模型标记的数据集 #

In [2]:
from datasets import load_dataset

dataset = load_dataset("json", data_files="dataset_train_strong.jsonl", split="train")

Generating train split: 0 examples [00:00, ? examples/s]

# 预处理数据集，合并字段

In [3]:
def merge_fields(example):
    text = example['text']
    label = example['predicted_label']
    merged_input = f"Text: {text}\nLabel: {label}"
    return {'merged_input': merged_input}

In [4]:
dataset = dataset.map(merge_fields, remove_columns=['text', 'predicted_label'])

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

In [5]:
dataset[0]

{'merged_input': 'Text: \n\nHuman: How do I work on robotics on space shuttles?\n\nAssistant: What kind of work would you like to do on robotics?\n\nHuman: I will test them out and help to maintain them.\n\nAssistant: What kind of testing would you like to do?\n\nHuman: I would make sure they’re working properly and correctly.\n\nAssistant: What kind of maintenance would you like to help with?\n\nHuman: Maintaining the entire robotics system.\n\nAssistant: What specific aspect of the robotics system would you like to maintain?\nLabel: -1'}

# 强模型定义

In [6]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

model_name = 'gpt2-medium'

#bnb_config = BitsAndBytesConfig(
    #load_in_4bit=True,
    #bnb_4bit_quant_type='nf4',
    #bnb_4bit_compute_dtype=torch.float16,
#)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map='auto',
    #quantization_config=bnb_config,
    trust_remote_code=True
)

model.config.use_cache = False

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

In [7]:
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

# 训练参数

In [8]:
from transformers import TrainingArguments

output_dir = './results1'
per_device_train_batch_size = 4
gradient_accumulation_steps = 4
optim = 'paged_adamw_32bit'
save_steps = 1000
logging_steps = 10
learning_rate = 2e-5
max_grad_norm = 0.3
max_steps = 1000
warmup_ratio = 0.03
lr_scheduler_type = 'constant'

training_arguments = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    lr_scheduler_type=lr_scheduler_type,
    #fp16=True,
    max_grad_norm=max_grad_norm,
    warmup_ratio=warmup_ratio,
    group_by_length=True,
)

# 训练器设置

In [9]:
from trl import SFTTrainer
#from accelerate import Accelerator

max_seq_length = 512

#accelerator = Accelerator()

trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    #peft_config=peft_config,
    dataset_text_field='merged_input',
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
)

for name, module in trainer.model.named_modules():
    if 'norm' in name:
        module = module.to(torch.float32)

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

# 强模型训练阶段

In [10]:
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss
10,2.4361
20,2.2755
30,2.1615
40,2.0669
50,1.8732
60,2.2567
70,2.1643
80,2.0711
90,2.0185
100,1.7876


TrainOutput(global_step=750, training_loss=1.924448346455892, metrics={'train_runtime': 513.5424, 'train_samples_per_second': 23.367, 'train_steps_per_second': 1.46, 'total_flos': 4761869278347264.0, 'train_loss': 1.924448346455892, 'epoch': 3.0})

In [11]:
model_to_save = trainer.model.module if hasattr(trainer.model, 'module') else trainer.model
model_to_save.save_pretrained('gpt2medium_strong_model_outputs')

# 使用训练好的强模型在测试集上进行评估

In [20]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from datasets import load_dataset
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score
import logging

# 设置日志级别为ERROR,以抑制警告信息
logging.getLogger("transformers").setLevel(logging.ERROR)

# 加载微调后的tokenizer和模型
model_path = "gpt2medium_strong_model_outputs"
tokenizer = AutoTokenizer.from_pretrained('gpt2-medium')
finetuned_model = AutoModelForCausalLM.from_pretrained(model_path, device_map='auto', trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

# 将模型移动到GPU(如果可用)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
finetuned_model.to(device)

# 定义分类标签
label_map = {0: " No", 1: " Yes"}

# 加载测试数据集
test_ds = load_dataset("json", data_files="test_ds.jsonl", split='train')

# 对数据集进行预处理和打标签
def preprocess_and_label(example):
    # 对输入文本进行预处理
    inputs = tokenizer(example["text"], return_tensors="pt", truncation=True, max_length=512, padding=True)
    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]
    
    # 将输入数据移动到与模型相同的设备
    input_ids = input_ids.to(device)
    attention_mask = attention_mask.to(device)
    
    # 使用模型进行预测
    with torch.no_grad():
        outputs = finetuned_model.generate(input_ids, attention_mask=attention_mask, max_new_tokens=10, num_return_sequences=1)
        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    # 判断生成的文本属于哪个类别
    if label_map[0] in generated_text:
        predicted_label = 0
    elif label_map[1] in generated_text:
        predicted_label = 1
    else:
        predicted_label = None  # 如果生成的文本不包含任何一个标签,则不考虑这个样本
    
    return {"text": example["text"], "label": example["label"], "predicted_label": predicted_label}

# 在测试集上进行预测
predictions = []
labels = []
total_examples = len(test_ds)
print(f"Total examples: {total_examples}")

progress_bar = tqdm(test_ds, desc="Evaluating")

for example in progress_bar:
    processed_example = preprocess_and_label(example)
    if processed_example["predicted_label"] is not None:  # 只考虑有效的预测
        predictions.append(processed_example["predicted_label"])
        labels.append(processed_example["label"])

# 打印最终的评估指标
print(f"Accuracy: {accuracy_score(labels, predictions):.4f}")
print(f"F1 score: {f1_score(labels, predictions):.4f}")

Total examples: 17104


Evaluating: 100%|██████████| 17104/17104 [44:16<00:00,  6.44it/s]

Accuracy: 0.4955
F1 score: 0.5274





# 使用原始模型在同样的数据集上进行评估

In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from datasets import load_dataset
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score
import logging

# 设置日志级别为ERROR,以抑制警告信息
logging.getLogger("transformers").setLevel(logging.ERROR)

# 加载微调后的tokenizer和模型
model_path = "gpt2-medium"
tokenizer = AutoTokenizer.from_pretrained('gpt2-medium')
finetuned_model = AutoModelForCausalLM.from_pretrained(model_path, device_map='auto', trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

# 将模型移动到GPU(如果可用)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
finetuned_model.to(device)

# 定义分类标签
label_map = {0: " No", 1: " Yes"}

# 加载测试数据集
test_ds = load_dataset("json", data_files="test_ds.jsonl", split='train')

# 对数据集进行预处理和打标签
def preprocess_and_label(example):
    # 对输入文本进行预处理
    inputs = tokenizer(example["text"], return_tensors="pt", truncation=True, max_length=512, padding=True)
    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]
    
    # 将输入数据移动到与模型相同的设备
    input_ids = input_ids.to(device)
    attention_mask = attention_mask.to(device)
    
    # 使用模型进行预测
    with torch.no_grad():
        outputs = finetuned_model.generate(input_ids, attention_mask=attention_mask, max_new_tokens=10, num_return_sequences=1)
        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    # 判断生成的文本属于哪个类别
    if label_map[0] in generated_text:
        predicted_label = 0
    elif label_map[1] in generated_text:
        predicted_label = 1
    else:
        predicted_label = None  # 如果生成的文本不包含任何一个标签,则不考虑这个样本
    
    return {"text": example["text"], "label": example["label"], "predicted_label": predicted_label}

# 在测试集上进行预测
predictions = []
labels = []
total_examples = len(test_ds)
print(f"Total examples: {total_examples}")

progress_bar = tqdm(test_ds, desc="Evaluating")

for example in progress_bar:
    processed_example = preprocess_and_label(example)
    if processed_example["predicted_label"] is not None:  # 只考虑有效的预测
        predictions.append(processed_example["predicted_label"])
        labels.append(processed_example["label"])

# 打印最终的评估指标
print(f"Accuracy: {accuracy_score(labels, predictions):.4f}")
print(f"F1 score: {f1_score(labels, predictions):.4f}")

Total examples: 17104


Evaluating: 100%|██████████| 17104/17104 [45:06<00:00,  6.32it/s]

Accuracy: 0.4982
F1 score: 0.5284



