In [1]:
!pip install -q -U transformers accelerate datasets bitsandbytes einops wandb trl peft scikit-learn

[0m

In [1]:
from datasets import load_dataset

dataset = load_dataset("json", data_files="labeled_dataset.jsonl", split="train")

In [2]:
dataset

Dataset({
    features: ['question', 'distractor3', 'distractor1', 'distractor2', 'correct_answer', 'support', 'answer_option', 'label'],
    num_rows: 30716
})

In [3]:
dataset[0]

{'question': 'What is the main form of energy storage in plants?',
 'distractor3': 'liquid',
 'distractor1': 'nitrogen',
 'distractor2': 'dioxide',
 'correct_answer': 'starch',
 'support': 'Many simple sugars can combine by repeated condensation reactions until a very large molecule is formed. A polysaccharide is a complex carbohydrate polymer formed from the linkage of many monosaccharide monomers. One of the best known polysaccharides is starch, the main form of energy storage in plants. Starch is a staple in most human diets. Foods such as corn, potatoes, rice, and wheat have high starch contents. Starch is made of glucose monomers and occurs in both straight-chain and branched forms. Amylose is the straight-chain form and consists of hundreds of linked glucose molecules. The branched form of starch is called amylopectin. In the small intestine, starch is hydrolyzed to form glucose. The glucose can then be converted to biochemical energy or stored for later use.',
 'answer_option': 

In [4]:
# 将txt和label字段合并为一个整体
def merge_fields(example):
    text = example['question']
    distractor1 = example['distractor1']
    distractor2 = example['distractor2']
    distractor3 = example['distractor3']
    correct_answer = example['correct_answer']
    support = example['support']
    answer_option = example['answer_option']
    label = example['label']
    merged_input = f"Text: {text}\nDistractor1: {distractor1}\nDistractor2: {distractor2}\nDistractor1: {distractor3}\ncorrect_answer: {correct_answer}\nsupport: {support}\nanswer_option: {answer_option}\nLabel: {label}"
    return {'merged_input': merged_input}

In [5]:
dataset = dataset.map(merge_fields, remove_columns=['question', 'distractor3', 'distractor1', 'distractor2', 'correct_answer', 'support', 'answer_option', 'label'])

Map:   0%|          | 0/30716 [00:00<?, ? examples/s]

In [6]:
dataset[0]

{'merged_input': 'Text: What is the main form of energy storage in plants?\nDistractor1: nitrogen\nDistractor2: dioxide\nDistractor1: liquid\ncorrect_answer: starch\nsupport: Many simple sugars can combine by repeated condensation reactions until a very large molecule is formed. A polysaccharide is a complex carbohydrate polymer formed from the linkage of many monosaccharide monomers. One of the best known polysaccharides is starch, the main form of energy storage in plants. Starch is a staple in most human diets. Foods such as corn, potatoes, rice, and wheat have high starch contents. Starch is made of glucose monomers and occurs in both straight-chain and branched forms. Amylose is the straight-chain form and consists of hundreds of linked glucose molecules. The branched form of starch is called amylopectin. In the small intestine, starch is hydrolyzed to form glucose. The glucose can then be converted to biochemical energy or stored for later use.\nanswer_option: nitrogen\nLabel: -1

In [7]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

model_name = 'gpt2-medium'

#bnb_config = BitsAndBytesConfig(
    #load_in_4bit=True,
    #bnb_4bit_quant_type='nf4',
    #bnb_4bit_compute_dtype=torch.float16,
#)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map='auto',
    #quantization_config=bnb_config,
    trust_remote_code=True
)

model.config.use_cache = False

In [8]:
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

In [9]:
from transformers import TrainingArguments

output_dir = './results5'
per_device_train_batch_size = 4
gradient_accumulation_steps = 4
optim = 'paged_adamw_32bit'
save_steps = 1000
logging_steps = 10
learning_rate = 2e-5
max_grad_norm = 0.3
max_steps = 1000
warmup_ratio = 0.03
lr_scheduler_type = 'constant'

training_arguments = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    lr_scheduler_type=lr_scheduler_type,
    #fp16=True,
    max_grad_norm=max_grad_norm,
    warmup_ratio=warmup_ratio,
    group_by_length=True,
)

In [10]:
from trl import SFTTrainer
#from accelerate import Accelerator

max_seq_length = 1024

#accelerator = Accelerator()

trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    #peft_config=peft_config,
    dataset_text_field='merged_input',
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
)

for name, module in trainer.model.named_modules():
    if 'norm' in name:
        module = module.to(torch.float32)

Map:   0%|          | 0/30716 [00:00<?, ? examples/s]

In [None]:
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33ms1820587[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
10,2.7908
20,2.3428
30,1.9847
40,1.7676
50,1.5525
60,2.3868
70,2.0081
80,1.8617
90,1.5512
100,1.4706


In [None]:
model_to_save = trainer.model.module if hasattr(trainer.model, 'module') else trainer.model
model_to_save.save_pretrained('gpt2medium_strong_model_outputs')

# 使用测试集对微调后的模型进行评估

In [13]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from datasets import load_dataset
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score
import logging

# 设置日志级别为ERROR, 以抑制警告信息
logging.getLogger("transformers").setLevel(logging.ERROR)

# 加载微调后的tokenizer和模型
model_path = "gpt2medium_strong_model_outputs1"
tokenizer = AutoTokenizer.from_pretrained('gpt2-medium')
finetuned_model = AutoModelForCausalLM.from_pretrained(model_path, device_map='auto', trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

# 将模型移动到GPU（如果可用）
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
finetuned_model.to(device)

# 定义分类标签
label_map = {0: " No", 1: " Yes"}

# 加载测试数据集
test_ds = load_dataset("json", data_files="test_ds.jsonl", split='train')

# 对数据集进行预处理和打标签
def preprocess_and_label(example):
    # 将所有特征拼接为一个输入文本
    input_text = f"Question: {example['question']} Distractor3: {example['distractor3']} Distractor1: {example['distractor1']} Distractor2: {example['distractor2']} Correct Answer: {example['correct_answer']} Support: {example['support']} Answer Option: {example['answer_option']}"
    
    # 对输入文本进行预处理
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=512, padding=True)
    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]

    # 将输入数据移动到与模型相同的设备
    input_ids = input_ids.to(device)
    attention_mask = attention_mask.to(device)

    # 使用模型进行预测
    with torch.no_grad():
        outputs = finetuned_model.generate(input_ids, attention_mask=attention_mask, max_new_tokens=10, num_return_sequences=1)
        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # 判断生成的文本属于哪个类别
    if label_map[0] in generated_text:
        predicted_label = 0
    elif label_map[1] in generated_text:
        predicted_label = 1
    else:
        predicted_label = None  # 如果生成的文本不包含任何一个标签，则不考虑这个样本

    return {"text": input_text, "label": example["label"], "predicted_label": predicted_label}

# 在测试集上进行预测
predictions = []
labels = []
total_examples = len(test_ds)
print(f"Total examples: {total_examples}")

progress_bar = tqdm(test_ds, desc="Evaluating")
for example in progress_bar:
    processed_example = preprocess_and_label(example)
    if processed_example["predicted_label"] is not None:
        # 只考虑有效的预测
        predictions.append(processed_example["predicted_label"])
        labels.append(processed_example["label"])

# 打印最终的评估指标
print(f"Accuracy: {accuracy_score(labels, predictions):.4f}")
print(f"F1 score: {f1_score(labels, predictions):.4f}")

Generating train split: 0 examples [00:00, ? examples/s]

Total examples: 4000


Evaluating: 100%|██████████| 4000/4000 [08:08<00:00,  8.19it/s]

Accuracy: 0.7391
F1 score: 0.0455





# 使用测试集对原始模型进行评估

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from datasets import load_dataset
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score
import logging

# 设置日志级别为ERROR, 以抑制警告信息
logging.getLogger("transformers").setLevel(logging.ERROR)

# 加载微调后的tokenizer和模型
model_path = "gpt2-medium"
tokenizer = AutoTokenizer.from_pretrained('gpt2-medium')
finetuned_model = AutoModelForCausalLM.from_pretrained(model_path, device_map='auto', trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

# 将模型移动到GPU（如果可用）
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
finetuned_model.to(device)

# 定义分类标签
label_map = {0: " No", 1: " Yes"}

# 加载测试数据集
#test_ds = load_dataset("json", data_files="test_ds.jsonl", split='train')

# 对数据集进行预处理和打标签
def preprocess_and_label(example):
    # 将所有特征拼接为一个输入文本
    input_text = f"Question: {example['question']} Distractor3: {example['distractor3']} Distractor1: {example['distractor1']} Distractor2: {example['distractor2']} Correct Answer: {example['correct_answer']} Support: {example['support']} Answer Option: {example['answer_option']}"
    
    # 对输入文本进行预处理
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=512, padding=True)
    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]

    # 将输入数据移动到与模型相同的设备
    input_ids = input_ids.to(device)
    attention_mask = attention_mask.to(device)

    # 使用模型进行预测
    with torch.no_grad():
        outputs = finetuned_model.generate(input_ids, attention_mask=attention_mask, max_new_tokens=10, num_return_sequences=1)
        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # 判断生成的文本属于哪个类别
    if label_map[0] in generated_text:
        predicted_label = 0
    elif label_map[1] in generated_text:
        predicted_label = 1
    else:
        predicted_label = None  # 如果生成的文本不包含任何一个标签，则不考虑这个样本

    return {"text": input_text, "label": example["label"], "predicted_label": predicted_label}

# 在测试集上进行预测
predictions = []
labels = []
total_examples = len(test_ds)
print(f"Total examples: {total_examples}")

progress_bar = tqdm(test_ds, desc="Evaluating")
for example in progress_bar:
    processed_example = preprocess_and_label(example)
    if processed_example["predicted_label"] is not None:
        # 只考虑有效的预测
        predictions.append(processed_example["predicted_label"])
        labels.append(processed_example["label"])

# 打印最终的评估指标
print(f"Accuracy: {accuracy_score(labels, predictions):.4f}")
print(f"F1 score: {f1_score(labels, predictions):.4f}")