In [1]:
!pip install -q -U transformers accelerate datasets bitsandbytes einops wandb trl peft scikit-learn

[0m

# 分别导入sciq的训练集和测试集，并且将训练集和测试集都转换成二分类任务

In [4]:
from datasets import load_dataset

# 加载sciq数据集的训练集和测试集
train_dataset = load_dataset("allenai/sciq", split="train")
validation_dataset = load_dataset("allenai/sciq", split="validation")
test_dataset = load_dataset("allenai/sciq", split="test")

# 合并训练集与验证集，组成新的训练集

In [5]:
from datasets import concatenate_datasets

# 合并训练集和验证集
train_data = concatenate_datasets([train_dataset, validation_dataset])
train_data = train_data.shuffle()

In [6]:
train_data[1]

{'question': 'Which group of metals in the periodic table include elements such as sodium and potassium?',
 'distractor3': 'lanthanides',
 'distractor1': 'igneous metals',
 'distractor2': 'actinides',
 'correct_answer': 'alkali metals',
 'support': 'The first step in the process of inductive reasoning is making specific observations. In the periodic table of elements, which we will discuss later, there is a group of metals with similar properties called the alkali metals. The alkali metals include elements such as sodium and potassium. If I put sodium or potassium in water, I will observe a very violent reaction every time. I draw a general conclusion from these observations: all alkali metals will react violently with water.'}

# 将数据集转化为一个二分类问题

In [7]:
from datasets import load_dataset, Dataset,Features, Value

# 定义转换为二元分类数据集的函数
def process_correct_answer(example):
    return {
        'question': example['question'],
        'answer_option': example['correct_answer'],
        'label': 1
    }

def process_distractor(example, distractor_key):
    return {
        'question': example['question'],
        'answer_option': example[distractor_key],
        'label': 0
    }

# 预处理训练集

In [8]:
# 对训练集进行转换
train_correct_answers = train_data.map(process_correct_answer)
train_distractor1 = train_data.map(process_distractor, fn_kwargs={'distractor_key': 'distractor1'})
train_distractor2 = train_data.map(process_distractor, fn_kwargs={'distractor_key': 'distractor2'})
train_distractor3 = train_data.map(process_distractor, fn_kwargs={'distractor_key': 'distractor3'})
train_dataset = concatenate_datasets([train_correct_answers, train_distractor1, train_distractor2, train_distractor3])

Map:   0%|          | 0/12679 [00:00<?, ? examples/s]

Map:   0%|          | 0/12679 [00:00<?, ? examples/s]

Map:   0%|          | 0/12679 [00:00<?, ? examples/s]

Map:   0%|          | 0/12679 [00:00<?, ? examples/s]

In [9]:
train_dataset = train_dataset.shuffle(seed=42)

# 将原始数据集一分为二

In [10]:
train_dataset

Dataset({
    features: ['question', 'distractor3', 'distractor1', 'distractor2', 'correct_answer', 'support', 'answer_option', 'label'],
    num_rows: 50716
})

In [2]:
import random

def create_subset(dataset, n_docs, seed=None):
    if seed is not None:
        random.seed(seed)
    
    if n_docs is None or n_docs > len(dataset):
        return dataset
    else:
        indices = random.sample(range(len(dataset)), n_docs)
        return dataset.select(indices)

In [9]:
train_weak_model = create_subset(train_dataset, 20000, seed=53)
train_weak_model

Dataset({
    features: ['question', 'distractor3', 'distractor1', 'distractor2', 'correct_answer', 'support', 'answer_option', 'label'],
    num_rows: 20000
})

In [10]:
#from sklearn.model_selection import train_test_split

#split_data = train_dataset.train_test_split(test_size=0.5, seed=42)
#train_ds1, train_ds2 = split_data['train'], split_data['test']

#print('len(train_weak_model):', len(train_ds1), 'len(label_weak_model):', len(train_ds2))

# 先处理用来训练的训练集

In [21]:
train_weak_model

Dataset({
    features: ['question', 'distractor3', 'distractor1', 'distractor2', 'correct_answer', 'support', 'answer_option', 'label'],
    num_rows: 20000
})

In [22]:
# 将txt和label字段合并为一个整体
def merge_fields(example):
    text = example['question']
    distractor1 = example['distractor1']
    distractor2 = example['distractor2']
    distractor3 = example['distractor3']
    correct_answer = example['correct_answer']
    support = example['support']
    answer_option = example['answer_option']
    label = example['label']
    merged_input = f"Text: {text}\nDistractor1: {distractor1}\nDistractor2: {distractor2}\nDistractor1: {distractor3}\ncorrect_answer: {correct_answer}\nsupport: {support}\nanswer_option: {answer_option}\nLabel: {label}"
    return {'merged_input': merged_input}

In [23]:
train_weak_model = train_weak_model.map(merge_fields, remove_columns=['question', 'distractor3', 'distractor1', 'distractor2', 'correct_answer', 'support', 'answer_option', 'label'])

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

In [24]:
train_weak_model[1]

{'merged_input': 'Text: The theory of evolution by what (and other processes) explains both the diversity of organisms and how populations of organisms change over time?\nDistractor1: natural evolution\nDistractor2: characteristic selection\nDistractor1: genocide\ncorrect_answer: natural selection\nsupport: Biology has only a few over arching theories. One of these, the Cell Theory of Life, explains the historic continuity of organisms, while the Theory of Evolution by Natural Selection (and other processes), explains both the diversity of organisms and how populations of organisms change over time. Finally, the Physicochemical Theory of Life explains how it is that organisms can display their remarkable properties without violating the laws that govern all physical and chemical systems.40 What is life, exactly? Clearly, if we are going to talk about biology, and organisms and cells and such, we have to define exactly what we mean by life. This raises a problem peculiar to biology as a

# 导入弱模型

In [25]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

model_name = 'gpt2'
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map='auto',
    trust_remote_code=True,
    #num_labels=2
)
model.config.use_cache = False

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# 设置训练参数

In [26]:
from transformers import TrainingArguments

output_dir = './results'
per_device_train_batch_size = 4
gradient_accumulation_steps = 4
optim = 'paged_adamw_32bit'
save_steps = 1000
logging_steps = 10
learning_rate = 5e-5
max_grad_norm = 0.3
max_steps = 1000
warmup_ratio = 0.03
lr_scheduler_type = 'constant'
training_arguments = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    lr_scheduler_type=lr_scheduler_type,
    max_grad_norm=max_grad_norm,
    warmup_ratio=warmup_ratio,
    group_by_length=True,
)

# 设置训练器

In [27]:
from trl import SFTTrainer

max_seq_length = 1024
trainer = SFTTrainer(
    model=model,
    train_dataset=train_weak_model,
    dataset_text_field='merged_input',
    #label_field='label',
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
)

for name, module in trainer.model.named_modules():
    if 'norm' in name:
        module = module.to(torch.float32)

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

# 弱模型训练阶段

In [28]:
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33ms1820587[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
10,3.1188
20,2.5675
30,2.2193
40,1.9036
50,1.7194
60,2.7298
70,2.2832
80,2.0968
90,1.8201


KeyboardInterrupt: 

In [21]:
model_to_save = trainer.model.module if hasattr(trainer.model, 'module') else trainer.model
model_to_save.save_pretrained('weak_model_outputs')

# 现在来处理用于标记的数据集

In [11]:
label_weak_model = create_subset(train_dataset, 30716, seed=26)
label_weak_model

Dataset({
    features: ['question', 'distractor3', 'distractor1', 'distractor2', 'correct_answer', 'support', 'answer_option', 'label'],
    num_rows: 30716
})

In [12]:
label_weak_model.to_json("label_weak_model.jsonl", lines=True)

Creating json from Arrow format:   0%|          | 0/31 [00:00<?, ?ba/s]

20826443

In [13]:
from datasets import Dataset

# 假设你的原始数据集名为 "dataset"
label_dataset = label_weak_model.remove_columns(['label'])

# 查看更新后的数据集结构
print(label_dataset)

Dataset({
    features: ['question', 'distractor3', 'distractor1', 'distractor2', 'correct_answer', 'support', 'answer_option'],
    num_rows: 30716
})


# 给数据集打标签

In [24]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from tqdm import tqdm
import logging

# 设置日志级别为ERROR,以抑制警告信息
logging.getLogger("transformers").setLevel(logging.ERROR)

# 加载微调后的tokenizer和模型
model_path = "weak_model_outputs"
tokenizer = AutoTokenizer.from_pretrained('gpt2')
finetuned_model = AutoModelForCausalLM.from_pretrained(model_path, device_map='auto')

# 将模型移动到GPU(如果可用)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
finetuned_model.to(device)

# 定义分类标签
label_map = {0: " No", 1: " Yes"}

# 对数据集进行预处理和打标签
def preprocess_and_label(example):
    # 将问题、答案选项和其他相关信息拼接成一个完整的输入文本
    input_text = f"Question: {example['question']}\n"
    input_text += f"Answer Option: {example['answer_option']}\n"
    input_text += f"Correct Answer: {example['correct_answer']}\n"
    input_text += f"Distractor 1: {example['distractor1']}\n"
    input_text += f"Distractor 2: {example['distractor2']}\n"
    input_text += f"Distractor 3: {example['distractor3']}\n"
    input_text += f"Support: {example['support']}\n"
    input_text += "Label:"
    
    # 对输入文本进行预处理
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=512)
    input_ids = inputs["input_ids"]
    
    # 将输入数据移动到与模型相同的设备
    input_ids = input_ids.to(device)
    
    # 使用模型进行预测
    with torch.no_grad():
        outputs = finetuned_model.generate(input_ids, max_new_tokens=10, num_return_sequences=1)
        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    # 判断生成的文本属于哪个类别
    if label_map[0] in generated_text:
        predicted_label = 0
    elif label_map[1] in generated_text:
        predicted_label = 1
    else:
        predicted_label = -1 # 如果生成的文本不包含任何一个标签,则认为分类失败
    
    return {
        "question": example["question"],
        "answer_option": example["answer_option"],
        "correct_answer": example["correct_answer"],
        "distractor1": example["distractor1"],
        "distractor2": example["distractor2"],
        "distractor3": example["distractor3"],
        "support": example["support"],
        "predicted_label": predicted_label
    }

# 对数据集进行处理和打标签
labeled_dataset = label_dataset.map(preprocess_and_label)

# 使用tqdm显示进度条
with tqdm(total=len(label_dataset), desc="Labeling dataset") as pbar:
    for example in labeled_dataset:
        pbar.update(1)

# 查看打标签后的数据集
print(labeled_dataset)

labeled_dataset.to_json("labeled_dataset.jsonl", lines=True)

Map:   0%|          | 0/30716 [00:00<?, ? examples/s]

Labeling dataset: 100%|██████████| 30716/30716 [00:01<00:00, 18031.24it/s]


Dataset({
    features: ['question', 'distractor3', 'distractor1', 'distractor2', 'correct_answer', 'support', 'answer_option', 'predicted_label'],
    num_rows: 30716
})


Creating json from Arrow format:   0%|          | 0/31 [00:00<?, ?ba/s]

21138061

# 将带弱标签的数据与真实标签的数据合并

In [15]:
label_weak_model

Dataset({
    features: ['question', 'distractor3', 'distractor1', 'distractor2', 'correct_answer', 'support', 'answer_option', 'label'],
    num_rows: 30716
})

In [27]:
labeled_dataset

Dataset({
    features: ['question', 'distractor3', 'distractor1', 'distractor2', 'correct_answer', 'support', 'answer_option', 'label'],
    num_rows: 30716
})

In [None]:
import pandas as pd

# 读取JSON Lines数据到pandas DataFrame
df = pd.read_json("labeled_dataset.jsonl", lines=True)

# 重命名 predicted_label 列为 label
df = df.rename(columns={"predicted_label": "label"})

# 将修改后的DataFrame写回JSON Lines文件
df.to_json("labeled_dataset.jsonl", orient="records", lines=True)

In [28]:
from datasets import concatenate_datasets

merged_dataset = concatenate_datasets([labeled_dataset, label_weak_model])

In [29]:
merged_dataset

Dataset({
    features: ['question', 'distractor3', 'distractor1', 'distractor2', 'correct_answer', 'support', 'answer_option', 'label'],
    num_rows: 61432
})

In [30]:
mergerd_dataset = merged_dataset.shuffle(seed=42)

In [31]:
merged_dataset[1]

{'question': 'What incredibly successful species has quickly colonized almost all of earth’s terrestrial habitats, but also impacted earth, its climate, and its environment?',
 'distractor3': 'fish',
 'distractor1': 'chimpanzees',
 'distractor2': 'birds',
 'correct_answer': 'humans',
 'support': 'The human species has been incredibly successful. In a relatively short period of time, it has colonized almost all of Earth’s terrestrial habitats. Unfortunately, human beings have also impacted Earth, its climate, and its environment. Human actions threaten Earth’s valuable biodiversity.',
 'answer_option': 'chimpanzees',
 'label': -1}

In [32]:
merged_dataset.to_json("merged_dataset.json", lines=True)

Creating json from Arrow format:   0%|          | 0/62 [00:00<?, ?ba/s]

41582885