In [1]:
# Dependencies: uncomment and run once in a fresh environment (e.g. Colab).
#!pip install -U transformers datasets
#!pip install -U bitsandbytes accelerate peft trl

In [2]:
from tqdm.auto import tqdm
import torch
import torch.nn as nn
from datasets import load_dataset
from transformers import (
    set_seed,
    BitsAndBytesConfig,
    AutoTokenizer,
    TrainingArguments,
)
from transformers.models.llama.modeling_llama import (
    LlamaForSequenceClassification,
)
from transformers.modeling_outputs import ModelOutput
from peft import LoraConfig
from trl import SFTTrainer

# Fix the random seed through the transformers helper so runs are repeatable.
set_seed(123)

In [3]:
# Load the livedoor news corpus from the Hub. The split/shuffle keyword
# arguments are presumably consumed by the dataset's own loading script
# (hence trust_remote_code=True) -- confirm against the dataset card.
dataset = load_dataset(
    "shunk031/livedoor-news-corpus",
    trust_remote_code=True,
    shuffle=True,
    random_state=42,
    train_ratio=0.8,
    val_ratio=0.1,
    test_ratio=0.1,
)

# Number of distinct category ids seen in the training split; used as the
# size of the classification head.
num_categories = len({label for label in dataset["train"]["category"]})

# Sequence-length setting shared by the tokenizer and the SFT trainer below.
max_seq_length = 512

In [4]:
class LivedoorNet(LlamaForSequenceClassification):
    """Llama sequence classifier whose targets arrive as ``category``.

    The parent model produces the classification logits; this subclass
    computes a cross-entropy loss against the ``category`` argument instead
    of the parent's ``labels`` argument.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def forward(
        self,
        input_ids=None,
        category=None,
        attention_mask=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        inputs_embeds=None,
        labels=None,
    ):
        """Run the classifier and, when targets are given, compute the loss.

        Args:
            input_ids: Token ids forwarded to the parent model.
            category: Optional tensor of target class ids. When ``None`` no
                loss is computed and ``loss`` in the result is ``None``.
            attention_mask: Forwarded to the parent model.
            output_attentions: Forwarded to the parent model.
            output_hidden_states: Forwarded to the parent model.
            return_dict: Accepted for signature compatibility. The parent is
                always called with ``return_dict=True`` because the result is
                read by attribute below.
            inputs_embeds: Forwarded to the parent model.
            labels: Accepted but intentionally ignored.
                NOTE(review): presumably the trainer's collator supplies
                token-level labels here that must not reach the
                classification loss -- confirm.

        Returns:
            A ``ModelOutput`` with ``loss``, ``logits``, ``past_key_values``,
            ``hidden_states`` and ``attentions``.
        """
        outputs = super().forward(
            input_ids,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            # Attribute access below requires a dict-like output, so a
            # caller-supplied return_dict=False must not reach the parent.
            return_dict=True,
        )

        # Only compute a loss when targets were supplied, so the model can
        # also be called for pure inference.
        loss = None
        if category is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(outputs.logits, category.to(outputs.logits.device))

        return ModelOutput(
            loss=loss,
            logits=outputs.logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

In [5]:
model_name = "elyza/Llama-3-ELYZA-JP-8B"

# 4-bit NF4 quantization with double quantization; bfloat16 is used for both
# compute and quantized-weight storage.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_storage=torch.bfloat16,
)

# Load the quantized backbone with a classification head sized to the number
# of categories found in the training split.
model = LivedoorNet.from_pretrained(
    model_name,
    num_labels=num_categories,
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
)
# NOTE(review): `max_seq_length` does not look like a documented tokenizer
# argument (`model_max_length` is) -- confirm it has the intended effect.
tokenizer = AutoTokenizer.from_pretrained(model_name, max_seq_length=max_seq_length)

# Unless pad_token is set to eos_token, the classification logit is taken at
# the end of the longest token sequence in the mini-batch.
tokenizer.pad_token = tokenizer.eos_token
# Take the id from the tokenizer itself so the model looks for exactly the
# token the tokenizer pads with, even if the model config's eos_token_id
# differs from (or is a list unlike) the tokenizer's eos token id.
model.config.pad_token_id = tokenizer.pad_token_id

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some weights of LivedoorNet were not initialized from the model checkpoint at elyza/Llama-3-ELYZA-JP-8B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# LoRA adapter settings for the sequence-classification task. Rank and alpha
# are both 32; adapters are attached to the modules listed below (presumably
# the attention and MLP projection layers of each decoder block).
peft_config = LoraConfig(
    task_type="SEQ_CLS",
    r=32,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
)

In [7]:
# Training schedule: 1000 optimizer steps with evaluation, logging and
# checkpointing every 100 steps; the best checkpoint is reloaded at the end.
training_args = TrainingArguments(
    output_dir="outputs_cls",
    # Optimisation
    learning_rate=5e-5,
    max_steps=1000,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    # Evaluation
    eval_strategy="steps",
    eval_steps=100,
    # Logging
    logging_strategy="steps",
    logging_steps=100,
    # Checkpointing
    save_strategy="steps",
    save_steps=100,
    load_best_model_at_end=True,
)

In [8]:
# Build the trainer: it tokenizes the "title" field of each split and applies
# the LoRA configuration to the model.
# NOTE(review): the run emits a deprecation warning asking for these
# arguments to be set through SFTConfig instead.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    peft_config=peft_config,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    dataset_text_field="title",
    max_seq_length=max_seq_length,
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/5894 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


In [9]:
# Attach the target class ids to the trainer's prepared datasets so that each
# batch carries the `category` argument expected by LivedoorNet.forward.
# The column-name guards make this cell safe to re-run: adding a column that
# already exists would otherwise raise.
if "category" not in trainer.train_dataset.column_names:
    # The prepared split must still line up row-for-row with the raw split.
    assert len(trainer.train_dataset) == len(dataset["train"]), (
        "prepared train dataset and raw train split differ in length"
    )
    trainer.train_dataset = trainer.train_dataset.add_column(
        "category", dataset["train"]["category"]
    )

if "category" not in trainer.eval_dataset.column_names:
    assert len(trainer.eval_dataset) == len(dataset["validation"]), (
        "prepared eval dataset and raw validation split differ in length"
    )
    trainer.eval_dataset = trainer.eval_dataset.add_column(
        "category", dataset["validation"]["category"]
    )

In [10]:
def evaluate_by_accuracy(model, tokenizer, dataset, batch_size=4):
    """Compute classification accuracy of ``model`` over ``dataset``.

    Each batch of ``"title"`` strings is tokenized with padding, run through
    the model without gradient tracking, and the arg-max of the logits is
    compared with the ``"category"`` ids.

    Args:
        model: Model accepting the tokenizer's outputs plus a ``category``
            keyword and returning an object with a ``logits`` attribute.
        tokenizer: Callable turning a list of strings into model inputs.
        dataset: Sliceable dataset; a slice must yield a mapping with
            ``"title"`` and ``"category"`` lists.
        batch_size: Number of examples evaluated per forward pass.

    Returns:
        A 0-dim tensor holding the fraction of correctly classified examples.

    Raises:
        ZeroDivisionError: If ``dataset`` is empty.
    """
    # Remember the caller's mode so evaluation does not silently flip a model
    # that was in eval mode into train mode.
    was_training = model.training
    model.eval()
    num_correct_answers = 0
    num_answers = 0
    try:
        for start in tqdm(range(0, len(dataset), batch_size)):
            examples = dataset[start:start + batch_size]
            encoding = tokenizer(
                examples["title"],
                padding=True,
                return_tensors="pt",
            )
            category = torch.tensor(examples["category"])
            with torch.no_grad():
                outputs = model(**encoding, category=category)
            # Compare on the targets' device in case the logits come back on
            # a different one.
            predictions = outputs.logits.argmax(-1).to(category.device)
            num_correct_answers += (predictions == category).sum()
            num_answers += len(examples["category"])
    finally:
        # Restore the original mode even if evaluation fails part-way.
        model.train(was_training)
    return num_correct_answers / num_answers

In [11]:
# Baseline: validation accuracy before any fine-tuning has been run.
evaluate_by_accuracy(model, tokenizer, dataset["validation"])

  0%|          | 0/185 [00:00<?, ?it/s]

We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


tensor(0.0855)

* GPUのメモリは12GB程度使う。

In [12]:
# Run fine-tuning with the trainer configured above.
trainer.train()

Step,Training Loss,Validation Loss
100,1.195,0.499742
200,0.4136,0.420022
300,0.3943,0.298456
400,0.294,0.285161
500,0.1423,0.310085
600,0.1452,0.333352
700,0.122,0.287881
800,0.0725,0.278061
900,0.0267,0.280388
1000,0.0339,0.279766


TrainOutput(global_step=1000, training_loss=0.2839558079242706, metrics={'train_runtime': 1396.5825, 'train_samples_per_second': 11.457, 'train_steps_per_second': 0.716, 'total_flos': 2.579698107953971e+16, 'train_loss': 0.2839558079242706, 'epoch': 2.7137042062415198})

In [13]:
# Validation accuracy after fine-tuning, for comparison with the baseline.
evaluate_by_accuracy(model, tokenizer, dataset["validation"])

  0%|          | 0/185 [00:00<?, ?it/s]

tensor(0.9213)