In [1]:
!pip install -q -U transformers accelerate datasets bitsandbytes einops wandb trl peft scikit-learn

[0m

# 分别导入imdb的训练集和测试集

In [1]:
from datasets import load_dataset

# Load the train and test splits of the IMDB dataset.
train_dataset, test_dataset = (
    load_dataset("imdb", split="train"),
    load_dataset("imdb", split="test"),
)

# 打乱训练集

In [2]:
# Display the first record of the training split to inspect its fields.
train_dataset[0]

{'text': 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far be

In [3]:
from datasets import concatenate_datasets
# Shuffle the training split with a fixed seed so that the ordering, and anything
# later derived from it, is the same on every fresh run of the notebook.
train_data = train_dataset.shuffle(seed=42)

In [4]:
# Display the second record of the shuffled training data.
train_data[1]

{'text': 'VIVAH in my opinion is the best movie of 2006, coming from a director that has proved successful throughout his career. I am not too keen in romantic movies these days, because i see them as "old wine in a new bottle" and so predictable. However, i have watched this movie three times now...and believe me it\'s an awesome movie.<br /><br />VIVAH goes back to the traditional route, displaying simple characters into a sensible and realistic story of the journey between engagement and marriage. The movie entertains in all manners as it can be reflected to what we do (or would do) when it comes to marriage. In that sense Sooraj R. Barjatya has done his homework well and has depicted a very realistic story into a well-made highly entertaining movie.<br /><br />Several sequences in this movie catch your interest immediately: <br /><br />* When Shahid Kapoor comes to see the bride (Amrita Rao) - the way he tries to look at her without making it too obvious in front of his and her fam

In [5]:
# Randomly draw 10,000 examples from the original training split.
# Sampling starts from `train_dataset` with a fixed seed, so the selected rows do
# not depend on any earlier shuffle of `train_data`, and re-running this cell
# yields the same subset.
train_data = train_dataset.shuffle(seed=42).select(range(10000))

# 处理用来训练的训练集

In [6]:
def merge_fields(example):
    """Merge the 'text' and 'label' fields of one example into a single string.

    Args:
        example: Mapping that provides the keys 'text' and 'label'.

    Returns:
        A dict with one key, 'merged_input', whose value is
        "Text: <text>" followed by a newline and "Label: <label>".
    """
    return {'merged_input': f"Text: {example['text']}\nLabel: {example['label']}"}

In [7]:
# Apply merge_fields to every example and drop the original 'text' and 'label' columns.
train_data = train_data.map(
    merge_fields,
    remove_columns=['text', 'label'],
)

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [8]:
# Display the second record after merging to check the 'merged_input' format.
train_data[1]

{'merged_input': 'Text: Gregory Peck\'s brilliant portrayal of Douglas MacArthur from the Battle of Corregidor in the Philippines at the start of the Pacific War largely through to his removal as UN Commander during the Korean War offers reason to believe all three of the above possibilities. Certainly the most controversial American General of the Second World War (and possibly ever) MacArthur is presented here as a man of massive contradictions. He claims that soldiers above all yearn for peace, yet he obviously glories in war; he consistently denies any political ambitions, yet almost everything he does is deliberately used to boost himself as a presidential candidate; he obviously believes that soldiers under his command have to follow his orders to the letter, yet he himself deliberately defies orders from the President of the United States; he shows great respect for other cultures (particularly in the Philippines and Japan) and yet is completely out of touch with his own country

# 导入模型

In [9]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

# Checkpoint name or path shared by the tokenizer and the model.
model_name = 'llama3'

# Tokenizer: reuse the end-of-sequence token as the padding token.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# Causal language model; device placement is delegated to device_map='auto'.
# NOTE(review): trust_remote_code=True permits the checkpoint to run its own
# Python code on load; confirm that this is actually required here.
model = AutoModelForCausalLM.from_pretrained(
    model_name, device_map='auto', trust_remote_code=True
)
# Disable `use_cache` on the model config before training.
model.config.use_cache = False

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# 设置训练参数

In [10]:
from transformers import TrainingArguments

# Hyperparameters for the fine-tuning run.
output_dir = './results'            # where checkpoints and logs are written
per_device_train_batch_size = 4
gradient_accumulation_steps = 4     # 4 x 4 = 16 examples per optimizer step per device
optim = 'paged_adamw_32bit'
save_steps = 1000
logging_steps = 10
learning_rate = 2e-5
max_grad_norm = 0.3
max_steps = 1000                    # hard cap on the number of optimizer steps
warmup_ratio = 0.03
# NOTE(review): with the plain 'constant' schedule the learning rate is never
# warmed up, so warmup_ratio has no effect; 'constant_with_warmup' would use it.
lr_scheduler_type = 'constant'

training_arguments = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    lr_scheduler_type=lr_scheduler_type,
    max_grad_norm=max_grad_norm,
    # Previously defined but never passed, so training silently ran for the
    # default number of epochs instead of stopping after `max_steps` steps.
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=True,
)

# 设置训练器

In [11]:
from trl import SFTTrainer

# Maximum sequence length handed to the trainer.
max_seq_length = 512

# Supervised fine-tuning trainer that reads the training text from the
# 'merged_input' column of train_data.
trainer = SFTTrainer(
    model=model,
    args=training_arguments,
    train_dataset=train_data,
    tokenizer=tokenizer,
    dataset_text_field='merged_input',
    max_seq_length=max_seq_length,
)

# Cast every sub-module whose qualified name contains 'norm' to float32.
# Module.to() converts the module in place, so its return value is not needed.
for name, module in trainer.model.named_modules():
    if 'norm' in name:
        module.to(torch.float32)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

# 模型训练阶段

In [None]:
# Run the fine-tuning loop configured on `trainer`.
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33ms1820587[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss


In [12]:
# Save the fine-tuned weights; if the trainer's model is a wrapper exposing
# `.module`, save the wrapped model instead of the wrapper.
model_to_save = getattr(trainer.model, 'module', trainer.model)
model_to_save.save_pretrained('finetuned_model')

In [13]:
from transformers import Trainer

# Save the trained model together with its tokenizer into one directory.
output_dir = "multitask_model"

# Unwrap the model first when the trainer holds a wrapper exposing `.module`.
if hasattr(trainer.model, 'module'):
    model_to_save = trainer.model.module
else:
    model_to_save = trainer.model

model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
print(f"Trained model and tokenizer saved to {output_dir}")

Trained model and tokenizer saved to multitask_model
