# 目的
causalでのFine-Tuningを行う

ref: 
- https://colab.research.google.com/drive/1vIrqH5uYDQwsJ4-OO3DErvuv4pBgVwk4?usp=sharing#scrollTo=6bZsfBuZDeCL
- https://huggingface.co/blog/mlabonne/sft-llama3
- https://huggingface.co/docs/trl/sft_trainer#accelerate-fine-tuning-2x-using-unsloth

In [48]:
# path setting
EXP_NAME = "e001-baseline"
MODEL_NAME = "unsloth/gemma-2-9b-it-bnb-4bit"
COMPETITION_NAME = "eedi-mining-misconceptions-in-mathematics"

DATA_PATH = "data"
DATASET_NAME = f"{EXP_NAME}-{MODEL_NAME.split('/')[-1]}"
OUTPUT_PATH = f"output/causal/{EXP_NAME}"
MODEL_OUTPUT_PATH = f"{OUTPUT_PATH}/trained_model"
ENV_PATH = "env_file"

# experiment parameter
DEBUG = True
TRAINING = True
UPLOAD_DATA_TO_S3 = True
UPLOAD_DATA_TO_KAGGLE = True
REMOVE_LOCAL_FILE = False
WANDB = True

# USE_FOLD = 0
# USE_DATA_RATE = 1.0
# VALID_DATA_SIZE = 3000

# model parameter
TRAINING_MAX_LENGTH = 512
SEED = 42
EPOCH = 1
LR = 2e-04
TRAIN_BS = 4  # 16
GRAD_ACC_STEP = 128 // TRAIN_BS  # 仮想的なバッチサイズはTRAIN_BS * GRAD_ACC_STEPとなる
EVAL_BS = 4  # 16
# NUM_LABELS = 3

FREEZE_LAYERS = (
    0  # there're 42 layers in total, we don't add adapters to the first 16 layers
)

# rola parameter
LORA_R = 16  # TODO: 64
LORA_ALPHA = LORA_R * 2
LORA_DROPOUT = 0.05
LORA_BIAS = "none"

RESUME_FROM_CHECKPOINT = False  # 途中から再開する場合はTrueにする

In [49]:
!nvidia-smi

2777.07s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


Mon Sep 23 07:26:25 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.161.07             Driver Version: 535.161.07   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla V100-PCIE-16GB           On  | 00000000:E1:00.0 Off |                    0 |
| N/A   22C    P0              34W / 250W |   7102MiB / 16384MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [50]:
!python --version

2782.92s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


Python 3.10.14


In [51]:
def resolve_path(base_path: str) -> str:
    import os

    cwd = os.getcwd()
    print(cwd)
    if cwd == f"/notebooks":
        print("Jupyter Kernel By VSCode!")
        return f"/notebooks/{COMPETITION_NAME}/{base_path}"
    elif cwd == f"/notebooks/{COMPETITION_NAME}":
        print("nohup!")
        return base_path
    elif cwd == f"/notebooks/{COMPETITION_NAME}/{COMPETITION_NAME}/exp":
        print("Jupyter Lab!")
        return f"../../{base_path}"
    elif cwd == f"/root/{COMPETITION_NAME}/exp/reranker":
        print("VastAi! Reranker")
        return f"../../{base_path}"
    elif cwd == f"/root/{COMPETITION_NAME}/exp/retriever":
        print("VastAi! Retriever")
        return f"../../{base_path}"
    elif cwd == f"/root/{COMPETITION_NAME}/exp/causal":
        print("VastAi! Causal")
        return f"../../{base_path}"
    elif cwd == f"/root/{COMPETITION_NAME}":
        print("VastAi!")
        return base_path
    else:
        raise Exception("Unknown environment")


DATA_PATH = resolve_path(DATA_PATH)
print(DATA_PATH)
OUTPUT_PATH = resolve_path(OUTPUT_PATH)
print(OUTPUT_PATH)
MODEL_OUTPUT_PATH = resolve_path(MODEL_OUTPUT_PATH)
print(MODEL_OUTPUT_PATH)
ENV_PATH = resolve_path(ENV_PATH)
print(ENV_PATH)

/root/eedi-mining-misconceptions-in-mathematics/exp/causal
VastAi! Causal
../../data
/root/eedi-mining-misconceptions-in-mathematics/exp/causal
VastAi! Causal
../../output/causal/e001-baseline
/root/eedi-mining-misconceptions-in-mathematics/exp/causal
VastAi! Causal
../../output/causal/e001-baseline/trained_model
/root/eedi-mining-misconceptions-in-mathematics/exp/causal
VastAi! Causal
../../env_file


In [52]:
def validate_dataset_name(dataset_name: str) -> None:
    if len(dataset_name) < 6 or len(dataset_name) > 50:
        raise Exception(
            f"データセットの文字列は6~50文字にしてください。現在{len(DATASET_NAME)}文字"
        )
    if "_" in dataset_name:
        raise Exception("datasetの名称に_の使用は禁止です")


validate_dataset_name(DATASET_NAME)

# import

In [53]:
import os
import random

import polars as pl
import numpy as np

import wandb
from datasets import Dataset

from trl import SFTConfig, SFTTrainer
import torch
from trl import SFTTrainer
from datasets import load_dataset
from transformers import TrainingArguments, TextStreamer
from unsloth.chat_templates import get_chat_template
from unsloth import FastLanguageModel, is_bfloat16_supported

In [54]:
# import os
# import random
# import ast
# import json

# import polars as pl
# import numpy as np
# import torch
# import torch.nn as nn
# from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, LayerNorm, MSELoss
# import wandb
# from datasets import (
#     Dataset,
#     DatasetDict,
#     Value,
#     concatenate_datasets,
#     load_dataset,
#     ClassLabel,
# )
# from tokenizers import AddedToken
# from tqdm.auto import tqdm
# from scipy.special import softmax
# from sklearn.metrics import log_loss
# from transformers import (
#     AutoConfig,
#     AutoModel,
#     AutoModelForSequenceClassification,
#     AutoTokenizer,
#     DataCollatorWithPadding,
#     Trainer,
#     TrainingArguments,
#     BitsAndBytesConfig,
#     Gemma2ForSequenceClassification,
#     GemmaTokenizerFast,
#     Gemma2Config,
#     PreTrainedTokenizerBase,
#     EvalPrediction,
#     Trainer,
#     DataCollatorWithPadding,
#     TrainingArguments,
# )

# from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType

# from sklearn.metrics import log_loss, accuracy_score

# from datasets import load_dataset
# from unsloth import FastLanguageModel



In [55]:
os.environ["TOKENIZERS_PARALLELISM"] = "true"
NUM_PROC = 16  # os.cpu_count()

In [56]:
import transformers
import datasets
import evaluate
import accelerate
import bitsandbytes
import peft

assert transformers.__version__ == "4.44.2"
assert datasets.__version__ == "3.0.0"
assert evaluate.__version__ == "0.4.3"
assert bitsandbytes.__version__ == "0.43.3"
assert accelerate.__version__ == "0.34.2"
assert peft.__version__ == "0.12.0"

In [57]:
# Seed the same seed to all
def seed_everything(seed: int):
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True


seed_everything(SEED)

In [58]:
from dotenv import load_dotenv

load_dotenv(f"{ENV_PATH}/.env")

True

# Wandb

In [59]:
if WANDB:
    wandb.login(key=os.environ["WANDB_API_KEY"])
    wandb.init(project=COMPETITION_NAME, name=EXP_NAME)
    REPORT_TO = "wandb"
else:
    REPORT_TO = "none"

REPORT_TO



VBox(children=(Label(value='0.018 MB of 0.018 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

'wandb'

# Model Tokenizer Load

In [60]:
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=MODEL_NAME,
    max_seq_length=TRAINING_MAX_LENGTH,
    dtype=None,
    load_in_4bit=True,
)

==((====))==  Unsloth 2024.9.post1: Fast Gemma2 patching. Transformers = 4.44.2.
   \\   /|    GPU: Tesla V100-PCIE-16GB. Max memory: 15.773 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.1. CUDA = 7.0. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.27. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


In [61]:
model = FastLanguageModel.get_peft_model(
    model,
    r=LORA_R,  # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    lora_alpha=LORA_ALPHA,
    lora_dropout=0,  # Supports any, but = 0 is optimized
    bias="none",  # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing="unsloth",  # True or "unsloth" for very long context
    random_state=3407,
    use_rslora=False,  # We support rank stabilized LoRA
    loftq_config=None,  # And LoftQ
)

# Data Import & Preprocess

In [62]:
train_long = pl.read_csv(f"{DATA_PATH}/train_long.csv").filter(
    (pl.col("CorrectAnswer") != pl.col("AnswerAlphabet"))
    & (pl.col("MisconceptionId").is_not_null())
)

In [63]:
if DEBUG:
    train_long = train_long.head(100)

In [64]:
train_long

QuestionId,ConstructName,SubjectName,QuestionText,CorrectAnswer,CorrectAnswerText,AnswerType,AnswerText,AllText,AnswerAlphabet,QuestionId_Answer,MisconceptionId,MisconceptionName
i64,str,str,str,str,str,str,str,str,str,str,i64,str
0,"""Use the order of operations to…","""BIDMAS""","""\[ 3 \times 2+4-5 \] Where do …","""A""","""\( 3 \times(2+4)-5 \)""","""AnswerDText""","""Does not need brackets""","""Use the order of operations to…","""D""","""0_D""",1672,"""Confuses the order of operatio…"
1000,"""Simplify an algebraic fraction…","""Simplifying Algebraic Fraction…","""Simplify the following, if pos…","""B""","""\( -1 \)""","""AnswerAText""","""\( t \)""","""Simplify an algebraic fraction…","""A""","""1000_A""",891,"""Incorrectly cancels what they …"
1000,"""Simplify an algebraic fraction…","""Simplifying Algebraic Fraction…","""Simplify the following, if pos…","""B""","""\( -1 \)""","""AnswerCText""","""\( 1 \)""","""Simplify an algebraic fraction…","""C""","""1000_C""",891,"""Incorrectly cancels what they …"
1000,"""Simplify an algebraic fraction…","""Simplifying Algebraic Fraction…","""Simplify the following, if pos…","""B""","""\( -1 \)""","""AnswerDText""","""Does not simplify""","""Simplify an algebraic fraction…","""D""","""1000_D""",353,"""Does not recognise when one pa…"
1001,"""Round numbers to two decimal p…","""Rounding to Decimal Places""","""What is \( \mathbf{3 . 5 1 6 3…","""B""","""\( 3.52 \)""","""AnswerAText""","""\( 3.51 \)""","""Round numbers to two decimal p…","""A""","""1001_A""",1379,"""Rounds down instead of up"""
…,…,…,…,…,…,…,…,…,…,…,…,…
1037,"""Calculate percentage of an amo…","""Percentages of an Amount""","""What is \( 4 \% \) of \( 200 ?…","""D""","""\( 8 \)""","""AnswerCText""","""\( 80 \)""","""Calculate percentage of an amo…","""C""","""1037_C""",1070,"""Believes that dividing by 10 g…"
1038,"""Convert from weeks to hours""","""Time""","""Which one of the following cal…","""B""","""\( 10 \times 7 \times 24 \)""","""AnswerAText""","""\( 10 \times 5 \times 24 \)""","""Convert from weeks to hours Ti…","""A""","""1038_A""",950,"""Thinks there are 5 days in a w…"
1038,"""Convert from weeks to hours""","""Time""","""Which one of the following cal…","""B""","""\( 10 \times 7 \times 24 \)""","""AnswerCText""","""\( 10 \times 24 \)""","""Convert from weeks to hours Ti…","""C""","""1038_C""",1179,"""Forgets to multiply up for the…"
1038,"""Convert from weeks to hours""","""Time""","""Which one of the following cal…","""B""","""\( 10 \times 7 \times 24 \)""","""AnswerDText""","""\( 10 \times 7 \times 12 \)""","""Convert from weeks to hours Ti…","""D""","""1038_D""",443,"""Thinks there are 12 hours in 1…"


In [65]:
# TODO: CoT(think step by step)をどうFine-Tuningに反映させるべきかを調べる
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Identify the misconception behind Incorrect Answer.

### Input:
Question: {question_text}
Incorrect Answer: {incorrect_answer_text}
Correct Answer: {correct_answer_text}
Construct Name: {construct_name}
Subject Name: {subject_name}

### Response:
{misconception_name}"""

EOS_TOKEN = tokenizer.eos_token  # Must add EOS_TOKEN


def formatting_prompts_func(examples):
    texts = []
    for (
        question_text,
        incorrect_answer_text,
        correct_answer_text,
        construct_name,
        subject_name,
        misconception_name,
    ) in zip(
        examples["QuestionText"],
        examples["AnswerText"],  # incorrect_answer_text
        examples["CorrectAnswerText"],
        examples["ConstructName"],
        examples["SubjectName"],
        examples["MisconceptionName"],
    ):
        # Must add EOS_TOKEN, otherwise your generation will go on forever!
        text = (
            alpaca_prompt.format(
                question_text=question_text,
                incorrect_answer_text=incorrect_answer_text,
                correct_answer_text=correct_answer_text,
                construct_name=construct_name,
                subject_name=subject_name,
                misconception_name=misconception_name,
            )
            + EOS_TOKEN
        )
        texts.append(text)
    return {
        "text": texts,
    }


train = Dataset.from_polars(train_long).map(
    formatting_prompts_func,
    batched=True,
)

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [66]:
print(train[0]["text"])

Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Identify the misconception behind Incorrect Answer.

### Input:
Question: \[
3 \times 2+4-5
\]
Where do the brackets need to go to make the answer equal \( 13 \) ?
Incorrect Answer: Does not need brackets
Correct Answer: \( 3 \times(2+4)-5 \)
Construct Name: Use the order of operations to carry out calculations involving powers
Subject Name: BIDMAS

### Response:
Confuses the order of operations, believes addition comes before multiplication <eos>


In [67]:
print(train[10]["text"])

Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Identify the misconception behind Incorrect Answer.

### Input:
Question: This pictogram shows the different types of music Bob has in his music collection.

How many Blues CDs does Bob have? ![A pictogram showing the number of CDs Bob has in his musical collection. Pop has 3 and a half symbols, rock has 2 symbols, blues has 2 and a quarter symbols, jazz has 3 and a quarter symbols and classical has 1 and three-quarter symbols. Each symbol represents 4 CDs.]()
Incorrect Answer: \( 2.25 \)
Correct Answer: \( 9 \)
Construct Name: Interpret a pictogram involving fractions of symbols
Subject Name: Pictogram

### Response:
When interpreting a pictogram, thinks each symbol stands for 1<eos>


# Model

In [69]:
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train,
    dataset_text_field="text",
    max_seq_length=TRAINING_MAX_LENGTH,
    dataset_num_proc=NUM_PROC,
    packing=False,  # Can make training 5x faster for short sequences.
    args=TrainingArguments(
        per_device_train_batch_size=TRAIN_BS,
        gradient_accumulation_steps=TRAIN_BS,
        warmup_steps=5,
        # max_steps=None,
        num_train_epochs=1,
        learning_rate=LR,
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_steps=1,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs",
    ),
)

Setting TOKENIZERS_PARALLELISM=false for forked processes.


Map (num_proc=16):   0%|          | 0/100 [00:00<?, ? examples/s]

In [70]:
# @title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = Tesla V100-PCIE-16GB. Max memory = 15.773 GB.
13.152 GB of memory reserved.


In [71]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 100 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 4 | Gradient Accumulation steps = 4
\        /    Total batch size = 16 | Total steps = 6
 "-____-"     Number of trainable parameters = 54,018,048
AUTOTUNE bmm(64x180x256, 64x256x180)
  triton_bmm_8 0.0328 ms 100.0%
  triton_bmm_0 0.0358 ms 91.4%
  triton_bmm_1 0.0379 ms 86.5%
  triton_bmm_2 0.0379 ms 86.5%
  triton_bmm_7 0.0379 ms 86.5%
  triton_bmm_3 0.0389 ms 84.2%
  triton_bmm_4 0.0399 ms 82.1%
  triton_bmm_10 0.0440 ms 74.4%
  triton_bmm_5 0.0451 ms 72.7%
  triton_bmm_9 0.0451 ms 72.7%
SingleProcess AUTOTUNE takes 5.0242 seconds


BackendCompilerFailed: backend='inductor' raised:
LoweringException: IndexError: map::at
  target: aten.bmm.default
  args[0]: TensorBox(
    View(
      StorageBox(
        ComputedBuffer(name='buf5', layout=FixedLayout('cuda', torch.float16, size=[s10, s2*s3, s11, s11], stride=[s11**2*s2*s3, s11**2, s11, 1]), data=Pointwise(
          'cuda',
          torch.float16,
          def inner_fn(index):
              i0, i1, i2, i3 = index
              tmp0 = ops.load(buf2, i3 + s11 * i2 + i1 * s11**2 + s2 * s3 * i0 * s11**2)
              tmp1 = ops.constant(50.0, torch.float16)
              tmp2 = tmp0 / tmp1
              tmp3 = ops.tanh(tmp2)
              tmp4 = ops.constant(50.0, torch.float16)
              tmp5 = tmp3 * tmp4
              tmp6 = ops.load(arg14_1, i3 + i2 * s14)
              tmp7 = tmp5 + tmp6
              tmp8 = ops.to_dtype(tmp7, torch.float32, src_dtype=torch.float16)
              tmp9 = ops.load(buf3, i2 + s11 * i1 + s11 * s2 * s3 * i0)
              tmp10 = tmp8 - tmp9
              tmp11 = ops.exp(tmp10)
              tmp12 = ops.load(buf4, i2 + s11 * i1 + s11 * s2 * s3 * i0)
              tmp13 = tmp11 / tmp12
              tmp14 = ops.to_dtype(tmp13, torch.float16, src_dtype=torch.float32)
              return tmp14
          ,
          ranges=[s10, s2*s3, s11, s11],
          origin_node=expand_4,
          origins={exp, tanh, convert_element_type_3, convert_element_t...
        ))
      ),
      size=[s10*s2*s3, s11, s11],
      reindex=lambda i0, i1, i2: [ModularIndexing(i0, s2*s3, s10), ModularIndexing(i0, 1, s2*s3), i1, i2],
      origins={exp, tanh, convert_element_type_3, convert_element_t...
    )
  )
  args[1]: TensorBox(
    View(
      View(
        StorageBox(
          ComputedBuffer(name='buf6', layout=FixedLayout('cuda', torch.float16, size=[s10, s2, s3, s11, s1], stride=[s1*s11*s2*s3, s1*s11*s3, s1*s11, s1, 1]), data=Pointwise(
            'cuda',
            torch.float16,
            def inner_fn(index):
                i0, i1, i2, i3, i4 = index
                tmp0 = ops.load(arg11_1, i4 + s1 * i1 + s1 * s2 * i3 + s1 * s11 * s2 * i0)
                return tmp0
            ,
            ranges=[s10, s2, s3, s11, s1],
            origin_node=clone_1,
            origins={clone_1}
          ))
        ),
        size=[s10, s2*s3, s11, s1],
        reindex=lambda i0, i1, i2, i3: [i0, ModularIndexing(i1, s3, s2), ModularIndexing(i1, 1, s3), i2, i3],
        origins={clone_1, view_1}
      ),
      size=[s10*s2*s3, s11, s1],
      reindex=lambda i0, i1, i2: [ModularIndexing(i0, s2*s3, s10), ModularIndexing(i0, 1, s2*s3), i1, i2],
      origins={view_6}
    )
  )

Set TORCH_LOGS="+dynamo" and TORCHDYNAMO_VERBOSE=1 for more information


In [19]:
lora_config = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,  # low-rankマトリクスのスケーリングファクター
    # only target self-attention
    # target_modules=["q_proj", "k_proj", "v_proj"],
    target_modules=[  # Linear層を全て含める
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    layers_to_transform=[i for i in range(42) if i >= FREEZE_LAYERS],
    lora_dropout=LORA_DROPOUT,  # LoRAレイヤーのドロップアウト確率
    bias=LORA_BIAS,
    task_type=TaskType.SEQ_CLS,
)

In [20]:
# tokenizer = GemmaTokenizerFast.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.add_eos_token = True  # We'll add <eos> at the end
tokenizer.padding_side = "right"
# tokenizer.add_special_tokens({"additional_special_tokens": ["[SEP]"]})

In [21]:
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=NUM_LABELS,
    torch_dtype=torch.float16,
    device_map="auto",
)
model.config.use_cache = False
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)
# model.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=16)
print(model)
print(model.print_trainable_parameters())

In [22]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, pad_to_multiple_of=16)

# Tokenize

In [23]:
class CustomTokenizer:
    def __init__(self, tokenizer: PreTrainedTokenizerBase, max_length: int) -> None:
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __call__(self, batch: dict) -> dict:
        prompt = ["<prompt>: " + self.process_text(t) for t in batch["prompt"]]
        response_a = [
            "\n\n<response_a>: " + self.process_text(t) for t in batch["response_a"]
        ]
        response_b = [
            "\n\n<response_b>: " + self.process_text(t) for t in batch["response_b"]
        ]
        texts = [p + r_a + r_b for p, r_a, r_b in zip(prompt, response_a, response_b)]
        tokenized = self.tokenizer(texts, max_length=self.max_length, truncation=True)
        labels = []
        for a_win, b_win in zip(batch["winner_model_a"], batch["winner_model_b"]):
            if a_win:
                label = 0
            elif b_win:
                label = 1
            else:
                label = 2
            labels.append(label)
        return {**tokenized, "labels": labels}

    @staticmethod
    def process_text(text: str) -> str:
        return " ".join(eval(text, {"null": ""}))

In [24]:
train_dataset = train_dataset.map(
    CustomTokenizer(tokenizer, max_length=TRAINING_MAX_LENGTH), batched=True
)

Map:   0%|          | 0/57477 [00:00<?, ? examples/s]

In [25]:
print(train_dataset)

In [26]:
# def tokenize(examples, max_token_length: int):
#     separator = " [SEP] "

#     joined_text = (
#         examples["last_prompt"]
#         + separator
#         + examples["last_response_a"]
#         + separator
#         + examples["last_response_b"]
#     )

#     return tokenizer(
#         joined_text,
#         max_length=max_token_length,
#         truncation=True,
#         padding="max_length",
#     )


# train_dataset = train_dataset.map(
#     tokenize,
#     batched=False,
#     fn_kwargs={"max_token_length": TRAINING_MAX_LENGTH},
#     num_proc=NUM_PROC,
# )

# Train Test Split

In [27]:
filtered_train = train_dataset.filter(
    lambda x: x["fold"] != USE_FOLD, num_proc=NUM_PROC
)
filtered_valid = train_dataset.filter(
    lambda x: x["fold"] == USE_FOLD, num_proc=NUM_PROC
)
filtered_valid = filtered_valid.select(range(min(VALID_DATA_SIZE, len(filtered_valid))))

train_valid_dataset = DatasetDict(
    {
        "train": filtered_train,
        "valid": filtered_valid,
    }
)

del filtered_train, filtered_valid

Filter (num_proc=12):   0%|          | 0/57477 [00:00<?, ? examples/s]

Filter (num_proc=12):   0%|          | 0/57477 [00:00<?, ? examples/s]

In [28]:
print(train_valid_dataset)

In [29]:
# def compute_metrics(eval_pred):
#     predictions, labels = eval_pred
#     preds_prob = softmax(predictions, axis=-1)
#     return {"log_loss": log_loss(labels, preds_prob)}


def compute_metrics(eval_pred) -> dict:
    preds, labels = eval_pred
    preds_prob = softmax(preds, axis=-1)
    return {
        "log_loss": log_loss(y_true=labels, y_pred=preds_prob),
        "acc": accuracy_score(y_true=labels, y_pred=preds.argmax(-1)),
    }

In [30]:
# スケジューラの設定
training_args = TrainingArguments(
    output_dir=MODEL_OUTPUT_PATH,
    overwrite_output_dir=True,
    learning_rate=LR,
    per_device_train_batch_size=TRAIN_BS,
    gradient_accumulation_steps=GRAD_ACC_STEP,
    eval_accumulation_steps=GRAD_ACC_STEP,
    per_device_eval_batch_size=EVAL_BS,
    num_train_epochs=EPOCH,
    weight_decay=0.01,
    eval_strategy="steps",
    eval_steps=0.1,
    save_strategy="steps",
    save_steps=0.1,
    save_total_limit=10,
    logging_steps=2,
    seed=SEED,
    metric_for_best_model="eval_loss",
    warmup_ratio=0.1,
    lr_scheduler_type="cosine_with_restarts",  # "linear", # "constant_with_warmup",
    report_to=REPORT_TO,
    run_name=EXP_NAME,
    load_best_model_at_end=True,
    fp16=True,
    fp16_full_eval=True,
    gradient_checkpointing=True,
    optim="adamw_8bit",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_valid_dataset["train"],
    # train_dataset=ConcatDataset(train_valid_dataset["train"]),
    eval_dataset=train_valid_dataset["valid"],
    # eval_dataset=ConcatDataset(train_valid_dataset["valid"]),
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [31]:
if TRAINING:
    # モデルの学習
    trainer.train(
        resume_from_checkpoint=RESUME_FROM_CHECKPOINT
        if RESUME_FROM_CHECKPOINT
        else None
    )
    # ログの保存に利用したストレージを削除
    os.system(f"rm -rf {MODEL_OUTPUT_PATH}/checkpoint-*")
    # モデルの保存
    trainer.save_model(MODEL_OUTPUT_PATH)
else:
    # TRAINED_MODEL_PATHを用いて、学習済のモデルを読み込む
    model = AutoModelForSequenceClassification.from_pretrained(
        TRAINED_MODEL_PATH,
        num_labels=NUM_LABELS,
    )
    # model = CustomDebertaSequenceClassification.from_pretrained(MODEL_NAME)

    args = TrainingArguments(
        ".",
        per_device_eval_batch_size=4,
        report_to="none",
        fp16=True,
    )

    trainer = Trainer(
        model=model,
        args=args,
        data_collator=data_collator,
        tokenizer=tokenizer,
    )

Step,Training Loss,Validation Loss,Log Loss,Acc,Runtime,Samples Per Second,Steps Per Second
328,0.8893,0.921124,0.921125,0.564,749.663,4.002,1.0
369,0.949,0.914646,0.914645,0.564667,749.4775,4.003,1.001


```
/usr/local/lib/python3.11/dist-packages/torch/utils/checkpoint.py:61: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
TODO: この　Warningが問題ないのかを調べる

```

# valid_datasetの作成・保存

In [32]:
# TRAININGをINFERRENCEでMAX_TOKENを変えるために、validを作り直す
valid_dataset = train_dataset.filter(
    lambda example: example["id"] in train_valid_dataset["valid"]["id"],
    num_proc=NUM_PROC,
)

valid_dataset = valid_dataset.map(
    CustomTokenizer(tokenizer, max_length=INFERENCE_MAX_LENGTH),
    batched=True,
    num_proc=NUM_PROC,
)

# valid_dataset = valid_dataset.map(
#     tokenize,
#     batched=False,
#     fn_kwargs={"max_token_length": INFERENCE_MAX_LENGTH},
#     num_proc=NUM_PROC,
# )

# valid_dataset = valid_dataset.map(
#     tokenize,
#     batched=False,
#     fn_kwargs={
#         "suffix": "a",
#         "max_token_length": INFERENCE_MAX_LENGTH
#     },
#     num_proc=NUM_PROC,
# ).map(
#     tokenize,
#     batched=False,
#     fn_kwargs={
#         "suffix": "b",
#         "max_token_length": INFERENCE_MAX_LENGTH
#     },
#     num_proc=NUM_PROC,
# )


def add_valid_pred(example, idx, valid_pred):
    example["valid_pred"] = valid_pred[idx]
    return example


valid_dataset = train_valid_dataset["valid"]

valid_pred = softmax(trainer.predict(valid_dataset).predictions, axis=-1)
# valid_pred = softmax(trainer.predict(ConcatDataset(valid_dataset)).predictions, axis=-1)

np.save(f"{MODEL_OUTPUT_PATH}/valid_prediction.npy", valid_pred)

valid_dataset = valid_dataset.map(
    add_valid_pred, with_indices=True, fn_kwargs={"valid_pred": valid_pred}
)

valid_dataset.save_to_disk(f"{MODEL_OUTPUT_PATH}/valid_dataset")

Filter (num_proc=12):   0%|          | 0/57477 [00:00<?, ? examples/s]

Map (num_proc=12):   0%|          | 0/3000 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3000 [00:00<?, ? examples/s]

# CVの計算

In [33]:
cv_score = log_loss(valid_dataset["labels"], valid_pred)
print(f"CV Score: {cv_score}")

In [34]:
# output_textを保存
with open(f"{MODEL_OUTPUT_PATH}/cv_score.txt", "w") as f:
    f.write(str(cv_score))

# AWSへのアップロード

In [35]:
# S3へのアップロード
# TODO: colabでは動かないため直す
if not DEBUG and UPLOAD_DATA_TO_S3:
    # uninstall
    !sudo rm /usr/bin/aws
    !sudo rm /usr/bin/aws_completer
    !sudo rm -rf /usr/local/aws-cli

    # install
    !curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip"
    !unzip -o -qq awscliv2.zip
    !sudo ./aws/install --update

    # upload
    output_name = MODEL_OUTPUT_PATH.split("/")[-1]
    os.system(
        f"aws s3 cp --recursive {MODEL_OUTPUT_PATH} s3://{COMPETITION_NAME}/trained_model/{output_name}"
    )

In [36]:
# ダウンロード（参考）
# !sudo rm /usr/bin/aws
# !sudo rm /usr/bin/aws_completer
# !sudo rm -rf /usr/local/aws-cli

# !curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip"
# !unzip -o -qq awscliv2.zip
# !sudo ./aws/install --update

# !aws s3 cp --recursive s3://automated-essay-scoring/trained_model/e005-regression /notebooks/automated_essay_scoring/trained_models/e005-regression

# Kaggle Datasetへのupload

In [12]:
import os

os.system("mkdir -p ~/.kaggle/")
os.system(f"cp /{DATA_PATH}/kaggle.json ~/.kaggle/")
os.system("chmod 600 ~/.kaggle/kaggle.json")

0

In [15]:
if not DEBUG and UPLOAD_DATA_TO_KAGGLE:
    import os
    import json

    from kaggle.api.kaggle_api_extended import KaggleApi

    def dataset_create_new(dataset_name: str, upload_dir: str):
        # if "_" in dataset_name:
        #     raise ValueError("datasetの名称に_の使用は禁止です")
        dataset_metadata = {}
        dataset_metadata["id"] = f"sinchir0/{dataset_name}"
        dataset_metadata["licenses"] = [{"name": "CC0-1.0"}]
        dataset_metadata["title"] = dataset_name
        with open(os.path.join(upload_dir, "dataset-metadata.json"), "w") as f:
            json.dump(dataset_metadata, f, indent=4)
        api = KaggleApi()
        api.authenticate()
        api.dataset_create_new(folder=upload_dir, convert_to_csv=False, dir_mode="tar")

    print(f"Create Dataset name:{DATASET_NAME}, output_dir:{MODEL_OUTPUT_PATH}")
    dataset_create_new(dataset_name=DATASET_NAME, upload_dir=MODEL_OUTPUT_PATH)

Create Dataset name:e060-10fold-gemma-2-9b-it-bnb-4bit, output_dir:/home/shinichiro.saito/lmsys/trained_models/e060-10fold
Starting upload for file tokenizer.json


  0%|          | 0.00/16.7M [00:00<?, ?B/s]

100%|██████████| 16.7M/16.7M [00:00<00:00, 25.6MB/s]


Upload successful: tokenizer.json (17MB)
Starting upload for file training_args.bin


100%|██████████| 5.12k/5.12k [00:00<00:00, 14.6kB/s]


Upload successful: training_args.bin (5KB)
Starting upload for file adapter_config.json


100%|██████████| 1.06k/1.06k [00:00<00:00, 3.01kB/s]


Upload successful: adapter_config.json (1KB)
Starting upload for file README.md


100%|██████████| 4.98k/4.98k [00:00<00:00, 13.3kB/s]


Upload successful: README.md (5KB)
Starting upload for file valid_dataset.tar


100%|██████████| 18.4M/18.4M [00:00<00:00, 26.3MB/s]


Upload successful: valid_dataset.tar (18MB)
Starting upload for file special_tokens_map.json


100%|██████████| 636/636 [00:00<00:00, 1.74kB/s]


Upload successful: special_tokens_map.json (636B)
Starting upload for file adapter_model.safetensors


100%|██████████| 206M/206M [00:01<00:00, 144MB/s]  


Upload successful: adapter_model.safetensors (206MB)
Starting upload for file tokenizer_config.json


100%|██████████| 39.7k/39.7k [00:00<00:00, 111kB/s]


Upload successful: tokenizer_config.json (40KB)
Starting upload for file valid_prediction.npy


100%|██████████| 35.3k/35.3k [00:00<00:00, 89.9kB/s]


Upload successful: valid_prediction.npy (35KB)
Starting upload for file cv_score.txt


100%|██████████| 18.0/18.0 [00:00<00:00, 48.9B/s]


Upload successful: cv_score.txt (18B)


# ローカルからのデータの削除

In [39]:
# if not DEBUG and REMOVE_LOCAL_FILE:
#     # ローカルからは削除
#     os.system(f"rm -rf {MODEL_OUTPUT_PATH}")

In [40]:
if WANDB:
    wandb.finish()

In [41]:
print("finish Notebook!")