# 目的
gemmaでconcatする

https://www.kaggle.com/code/emiz6413/training-gemma-2-9b-4-bit-qlora-fine-tuning

In [1]:
# path setting
EXP_NAME = "e032-gemma-concat-full"
MODEL_NAME = "unsloth/gemma-2-9b-it-bnb-4bit"
COMPETITION_NAME = "lmsys"

DATA_PATH = "data"
DATASET_NAME = f"{EXP_NAME}-{MODEL_NAME.split('/')[-1]}"
MODEL_OUTPUT_PATH = f"trained_models/{EXP_NAME}"

# experiment parameter
DEBUG = False
TRAINING = True
UPLOAD_DATA_TO_S3 = True
UPLOAD_DATA_TO_KAGGLE = True
REMOVE_LOCAL_FILE = False
WANDB = True
USE_FOLD = 2
USE_DATA_RATE = 1.0
VALID_DATA_SIZE = 3000

# model parameter
TRAINING_MAX_LENGTH = 1024 # 512
INFERENCE_MAX_LENGTH = 1536
SEED = 42
EPOCH = 1
LR = 2e-04
TRAIN_BS = 4 # 16
GRAD_ACC_STEP= 128 // TRAIN_BS # 仮想的なバッチサイズはTRAIN_BS * GRAD_ACC_STEPとなる
EVAL_BS = 4 # 16
NUM_LABELS = 3

FREEZE_LAYERS = (
    0  # there're 42 layers in total, we don't add adapters to the first 16 layers
)

# rola parameter
LORA_R = 16
LORA_ALPHA = LORA_R * 2
LORA_DROPOUT = 0.05
LORA_BIAS = "none"

RESUME_FROM_CHECKPOINT= False # 途中から再開する場合はTrueにする
# TRAINED_MODEL_PATH = "lmsys/trained_models/e006-use-concat"

In [2]:
!nvidia-smi

In [3]:
!python --version

In [4]:
def resolve_path(base_path: str) -> str:
    import os

    cwd = os.getcwd()
    print(cwd)
    if cwd == f"/notebooks":
        print("Jupyter Kernel By VSCode!")
        return "kernel", f"/notebooks/{COMPETITION_NAME}/{base_path}"
    elif cwd == f"/notebooks/{COMPETITION_NAME}":
        print("nohup!")
        return base_path
    elif cwd == f"/notebooks/{COMPETITION_NAME}/{COMPETITION_NAME}/exp":
        print("Jupyter Lab!")
        return "nohup", f"../../{base_path}"
    elif cwd == f"/content":
        print("Google Colab!")
        return "colab", f"/content/drive/MyDrive/Kaggle/{COMPETITION_NAME}/{base_path}"
    elif cwd.startswith("/home/shinichiro.saito"):
        print("GCP!")
        return "GCP", f"/home/shinichiro.saito/{COMPETITION_NAME}/{base_path}"
    else:
        raise Exception("Unknown environment")


ENV_NAME, DATA_PATH = resolve_path(DATA_PATH)
print(DATA_PATH)
_, MODEL_OUTPUT_PATH = resolve_path(MODEL_OUTPUT_PATH)
print(MODEL_OUTPUT_PATH)

In [5]:
def validate_dataset_name(dataset_name: str) -> None:
    if len(dataset_name) < 6 or len(dataset_name) > 50:
        raise Exception(
            f"データセットの文字列は6~50文字にしてください。現在{len(DATASET_NAME)}文字"
        )
    if "_" in dataset_name:
        raise Exception("datasetの名称に_の使用は禁止です")


validate_dataset_name(DATASET_NAME)

# install

In [6]:
if ENV_NAME != "GCP":
    %pip install -qq polars==1.0.0
    %pip install -qq transformers==4.42.3
    %pip install -qq datasets==2.20.0
    %pip install -qq evaluate==0.4.2
    %pip install -qq seqeval==1.2.2
    %pip install -qq accelerate==0.32.0
    %pip install -qq python-dotenv==1.0.1
    %pip install -qq wandb==0.17.4
    %pip install -qq bitsandbytes==0.43.1
    %pip install -qq accelerate==0.32.0
    %pip install -qq peft==0.11.1

    # formatter
    %pip install -qq black isort

    %pip install -qq kaggle

# import

In [7]:
import os
import random
import ast
import json

import polars as pl
import numpy as np
import torch
import torch.nn as nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, LayerNorm, MSELoss
from torch.utils.data import Dataset as TorchDataset
import wandb
from datasets import (
    Dataset,
    DatasetDict,
    Value,
    concatenate_datasets,
    load_dataset,
    ClassLabel,
)
from tokenizers import AddedToken
from tqdm.auto import tqdm
from scipy.special import softmax
from sklearn.metrics import log_loss
from transformers import (
    AutoConfig,
    AutoModel,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
    BitsAndBytesConfig,
    Gemma2ForSequenceClassification,
    GemmaTokenizerFast,
    Gemma2Config,
    PreTrainedTokenizerBase,
    EvalPrediction,
    Trainer,
    DataCollatorWithPadding,
    TrainingArguments,
)

from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType

from sklearn.metrics import log_loss, accuracy_score

In [8]:
os.environ["TOKENIZERS_PARALLELISM"] = "true"
NUM_PROC = os.cpu_count()

In [9]:
import transformers
import datasets
import evaluate
import bitsandbytes
import accelerate
import peft

assert transformers.__version__ == "4.42.3"
assert datasets.__version__ == "2.20.0"
assert evaluate.__version__ == "0.4.2"
assert bitsandbytes.__version__ == "0.43.1"
assert accelerate.__version__ == "0.32.0"
assert peft.__version__ == "0.11.1"

In [10]:
# Seed the same seed to all
def seed_everything(seed: int):
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True


seed_everything(SEED)

In [11]:
from dotenv import load_dotenv

load_dotenv(f"{DATA_PATH}/.env")

True

# Wandb

In [12]:
if WANDB:
    wandb.login(key=os.environ["WANDB_API_KEY"])
    wandb.init(project=COMPETITION_NAME, name=EXP_NAME)
    REPORT_TO = "wandb"
else:
    REPORT_TO = "none"

REPORT_TO

'wandb'

# Data Import & Preprocess

In [13]:
with open(f"{DATA_PATH}/label_stratified_fold.json") as f:
    label_stratified_fold = json.load(f)

In [14]:
train = (
    pl.read_csv(f"{DATA_PATH}/train.csv")
    .with_columns( # labelを付与する
        pl.when(pl.col("winner_model_a") == 1)
        .then(0)
        .when(pl.col("winner_model_b") == 1)
        .then(1)
        .when(pl.col("winner_tie") == 1)
        .then(2)
        .alias("labels"),
    )
    .with_columns(  # foldを追加する
        pl.col("id").replace(label_stratified_fold).alias("fold")
    )
)

In [15]:
if DEBUG:
    train = train.head(100)
    VALID_DATA_SIZE = 10

In [16]:
train_dataset = Dataset.from_polars(train)

In [17]:
train_dataset

Dataset({
    features: ['id', 'model_a', 'model_b', 'prompt', 'response_a', 'response_b', 'winner_model_a', 'winner_model_b', 'winner_tie', 'labels', 'fold'],
    num_rows: 57477
})

In [18]:
# 計算を早くするために、データを減らす
if not DEBUG:
    train_dataset = train_dataset.shuffle().select(
        range(
            int(len(train_dataset) * USE_DATA_RATE)
        )
    )

# Tokenizer

In [19]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.add_eos_token = True  # We'll add <eos> at the end
tokenizer.padding_side = "right"
tokenizer.add_special_tokens({"additional_special_tokens": ["[SEP]"]})

1

In [20]:
class CustomDataCollator:
    def __init__(self, tokenizer):
        self.data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=16)

    def __call__(self, features):
        # batch_a = [{'input_ids': f['input_ids_a'], 'token_type_ids': f['token_type_ids_a'], 'attention_mask': f['attention_mask_a']} for f in features]
        # batch_b = [{'input_ids': f['input_ids_b'], 'token_type_ids': f['token_type_ids_b'], 'attention_mask': f['attention_mask_b']} for f in features]
        
        batch_a = [{'input_ids': f['input_ids_a'], 'attention_mask': f['attention_mask_a']} for f in features]
        batch_b = [{'input_ids': f['input_ids_b'], 'attention_mask': f['attention_mask_b']} for f in features]
        
        batch_a = self.data_collator(batch_a)
        batch_b = self.data_collator(batch_b)
        
        labels = torch.tensor([f['labels'] for f in features])

        return {
            'input_ids_a': batch_a['input_ids'],
            # 'token_type_ids_a': batch_a['token_type_ids'],
            'attention_mask_a': batch_a['attention_mask'],
            'input_ids_b': batch_b['input_ids'],
            # 'token_type_ids_b': batch_b['token_type_ids'],
            'attention_mask_b': batch_b['attention_mask'],
            'labels': labels
        }

# data_collator = DataCollatorWithPadding(tokenizer=tokenizer, pad_to_multiple_of=16)
data_collator = CustomDataCollator(tokenizer)

# Tokenize

In [21]:
def process_text(text: str) -> str:
    return " ".join(eval(text, {"null": ""}))

def tokenize(examples, suffix, max_token_length: int):
    separator = " [SEP] "
    
    joined_text = (
        process_text(examples["prompt"])
        + separator
        + process_text(examples[f"response_{suffix}"])
    )

    tokenized = tokenizer(
        joined_text,
        max_length=max_token_length,
        truncation=True,
        padding="max_length",
    )

    return {key + f"_{suffix}": value for key, value in tokenized.items()}

train_dataset = train_dataset.map(
    tokenize,
    batched=False,
    fn_kwargs={
        "suffix": "a",
        "max_token_length": TRAINING_MAX_LENGTH
    },
    num_proc=NUM_PROC, # NOTE: debugする際には1にする
).map(
    tokenize,
    batched=False,
    fn_kwargs={
        "suffix": "b",
        "max_token_length": TRAINING_MAX_LENGTH
    },
    num_proc=NUM_PROC,
)

Map (num_proc=12):   0%|          | 0/57477 [00:00<?, ? examples/s]

Map (num_proc=12):   0%|          | 0/57477 [00:00<?, ? examples/s]

In [22]:
# class CustomTokenizer:
#     def __init__(
#         self,
#         tokenizer: PreTrainedTokenizerBase,
#         max_length: int
#     ) -> None:
#         self.tokenizer = tokenizer
#         self.max_length = max_length

#     def __call__(self, batch: dict) -> dict:
#         prompt = ["<prompt>: " + self.process_text(t) for t in batch["prompt"]]
#         response_a = ["\n\n<response_a>: " + self.process_text(t) for t in batch["response_a"]]
#         response_b = ["\n\n<response_b>: " + self.process_text(t) for t in batch["response_b"]]
#         texts = [p + r_a + r_b for p, r_a, r_b in zip(prompt, response_a, response_b)]
#         tokenized = self.tokenizer(texts, max_length=self.max_length, truncation=True)
#         labels=[]
#         for a_win, b_win in zip(batch["winner_model_a"], batch["winner_model_b"]):
#             if a_win:
#                 label = 0
#             elif b_win:
#                 label = 1
#             else:
#                 label = 2
#             labels.append(label)
#         return {**tokenized, "labels": labels}

#     @staticmethod
#     def process_text(text: str) -> str:
#         return " ".join(eval(text, {"null": ""}))

# train_dataset = train_dataset.map(
#     CustomTokenizer(tokenizer, max_length=TRAINING_MAX_LENGTH),
#     batched=True
# )

In [23]:
print(train_dataset)

# Model

In [24]:
lora_config = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA, # low-rankマトリクスのスケーリングファクター
    target_modules=[ # Linear層を全て含める
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj"
    ],
    layers_to_transform=[i for i in range(42) if i >= FREEZE_LAYERS],
    lora_dropout=LORA_DROPOUT, # LoRAレイヤーのドロップアウト確率
    bias=LORA_BIAS,
    task_type=TaskType.SEQ_CLS,
)

In [25]:
from typing import Optional, Tuple, Union, List
from transformers.cache_utils import Cache
from transformers import Gemma2Model, Gemma2PreTrainedModel
from transformers.modeling_outputs import SequenceClassifierOutputWithPast


class CustomGemma2ForSequenceClassification(Gemma2PreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.model = Gemma2Model(config)
        # self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
        self.score = nn.Linear(config.hidden_size * 2, self.num_labels, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        self.model.embed_tokens = value

    # @add_start_docstrings_to_model_forward(GEMMA2_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids_a: torch.LongTensor = None,
        input_ids_b: torch.LongTensor = None,
        attention_mask_a: Optional[torch.Tensor] = None,
        attention_mask_b: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )
        
        transformer_outputs_a = self.model(
            input_ids=input_ids_a,
            attention_mask=attention_mask_a,
            return_dict=return_dict,
        )

        transformer_outputs_b = self.model(
            input_ids=input_ids_b,
            attention_mask=attention_mask_b,
            return_dict=return_dict,
        )

        hidden_states_a = transformer_outputs_a[0]
        hidden_states_b = transformer_outputs_b[0]

        concated_hidden_states = torch.cat((hidden_states_a, hidden_states_b), dim=-1)

        logits = self.score(concated_hidden_states)

        if input_ids_a is not None:
            batch_size = input_ids_a.shape[0]
        else:
            batch_size = inputs_embeds.shape[0]

        if self.config.pad_token_id is None and batch_size != 1:
            raise ValueError(
                "Cannot handle batch sizes > 1 if no padding token is defined."
            )
        if self.config.pad_token_id is None:
            sequence_lengths = -1
        else:
            if input_ids_a is not None:
                # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility
                sequence_lengths = (
                    torch.eq(input_ids_a, self.config.pad_token_id).int().argmax(-1) - 1
                )
                sequence_lengths = sequence_lengths % input_ids_a.shape[-1]
                sequence_lengths = sequence_lengths.to(logits.device)
            else:
                sequence_lengths = -1

        pooled_logits = logits[
            torch.arange(batch_size, device=logits.device), sequence_lengths
        ]

        loss = None
        if labels is not None:
            labels = labels.to(logits.device)
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (
                    labels.dtype == torch.long or labels.dtype == torch.int
                ):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            # if self.config.problem_type == "regression":
            #     loss_fct = MSELoss()
            #     if self.num_labels == 1:
            #         loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
            #     else:
            #         loss = loss_fct(pooled_logits, labels)
            # elif self.config.problem_type == "single_label_classification":
            if self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(
                    pooled_logits.view(-1, self.num_labels), labels.view(-1)
                )
            # elif self.config.problem_type == "multi_label_classification":
            #     loss_fct = BCEWithLogitsLoss()
            #     loss = loss_fct(pooled_logits, labels)
        # if not return_dict:
        #     # output = (pooled_logits,) + transformer_outputs[1:]
        #     output = (pooled_logits,) + transformer_outputs_a[1:]
        #     return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutputWithPast(
            loss=loss,
            logits=pooled_logits,
            # past_key_values=transformer_outputs.past_key_values,
            # hidden_states=transformer_outputs.hidden_states,
            # attentions=transformer_outputs.attentions,
            past_key_values=transformer_outputs_a.past_key_values,
            hidden_states=transformer_outputs_a.hidden_states,
            attentions=transformer_outputs_a.attentions,
        )

In [26]:
from peft.peft_model import PeftModelForSequenceClassification
from peft.utils import PeftType

# https://github.com/huggingface/peft/blob/0649947396a946a9333b731bea2e76551e85ad92/src/peft/peft_model.py#L1129
class CustomPeftModelForSequenceClassification(PeftModelForSequenceClassification):
    def forward(
        self,
        # input_ids=None,
        input_ids_a=None,
        input_ids_b=None,
        # attention_mask=None,
        attention_mask_a=None,
        attention_mask_b=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        task_ids=None,
        **kwargs,
    ):
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        peft_config = self.active_peft_config
        if not peft_config.is_prompt_learning:
            with self._enable_peft_forward_hooks(**kwargs):
                kwargs = {k: v for k, v in kwargs.items() if k not in self.special_peft_forward_args}
                if peft_config.peft_type == PeftType.POLY:
                    kwargs["task_ids"] = task_ids
                return self.base_model(
                    # input_ids=input_ids,
                    input_ids_a=input_ids_a,
                    input_ids_b=input_ids_b,
                    # attention_mask=attention_mask,
                    attention_mask_a=attention_mask_a,
                    attention_mask_b=attention_mask_b,
                    inputs_embeds=inputs_embeds,
                    labels=labels,
                    output_attentions=output_attentions,
                    output_hidden_states=output_hidden_states,
                    return_dict=return_dict,
                    **kwargs,
                )

In [27]:
# model = AutoModelForSequenceClassification.from_pretrained(

model = CustomGemma2ForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=NUM_LABELS,
    torch_dtype=torch.float16,
    device_map="auto",
    cache_dir="./model_cache"
)
model.config.use_cache = False
model = prepare_model_for_kbit_training(model)
# model = get_peft_model(model, lora_config)
model = CustomPeftModelForSequenceClassification(model, lora_config)
model.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=16)
print(model)
print(model.print_trainable_parameters())

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/6.13G [00:00<?, ?B/s]

# Train Test Split

In [28]:
train_valid_dataset = DatasetDict(
    {
        "train": train_dataset.filter(
            lambda x: x["fold"] != USE_FOLD, num_proc=NUM_PROC
        ),
        "valid": train_dataset.filter(
            lambda x: x["fold"] == USE_FOLD, num_proc=NUM_PROC
        ).select(range(VALID_DATA_SIZE)),
    }
)

Filter (num_proc=12):   0%|          | 0/57477 [00:00<?, ? examples/s]

Filter (num_proc=12):   0%|          | 0/57477 [00:00<?, ? examples/s]

In [29]:
print(train_valid_dataset)

In [30]:
def compute_metrics(eval_pred) -> dict:
    preds, labels = eval_pred
    preds_prob = softmax(preds, axis=-1)
    return {
        "log_loss": log_loss(y_true=labels, y_pred=preds_prob),
        "acc": accuracy_score(y_true=labels, y_pred=preds.argmax(-1)),
    }

In [31]:
class ConcatDataset(TorchDataset):
    def __init__(self, dataset):
        self.dataset = dataset

    def __getitem__(self, idx):
        item = self.dataset[idx]

        input_ids_a = torch.tensor(item['input_ids_a'])
        # token_type_ids_a = torch.tensor(item['token_type_ids_a'])
        attention_mask_a = torch.tensor(item['attention_mask_a'])
        
        input_ids_b = torch.tensor(item['input_ids_b'])
        # token_type_ids_b = torch.tensor(item['token_type_ids_b'])
        attention_mask_b = torch.tensor(item['attention_mask_b'])
        
        labels = torch.tensor(item['labels'])

        return {
            'input_ids_a': input_ids_a,
            # 'token_type_ids_a': token_type_ids_a,
            'attention_mask_a': attention_mask_a,
            'input_ids_b': input_ids_b,
            # 'token_type_ids_b': token_type_ids_b,
            'attention_mask_b': attention_mask_b,
            'labels': labels
        }

    def __len__(self):
        return len(self.dataset)

In [32]:
# スケジューラの設定
training_args = TrainingArguments(
    output_dir=MODEL_OUTPUT_PATH,
    overwrite_output_dir=True,
    learning_rate=LR,
    per_device_train_batch_size=TRAIN_BS,
    gradient_accumulation_steps=GRAD_ACC_STEP,
    eval_accumulation_steps=GRAD_ACC_STEP,
    per_device_eval_batch_size=EVAL_BS,
    num_train_epochs=EPOCH,
    weight_decay=0.01,
    eval_strategy="steps",
    eval_steps=0.1,
    save_strategy="steps",
    save_steps=0.1,
    save_total_limit=10,
    logging_steps=2,
    seed=SEED,
    metric_for_best_model="eval_loss",
    warmup_ratio=0.1,
    lr_scheduler_type="cosine_with_restarts", # "linear", # "constant_with_warmup",
    report_to=REPORT_TO,
    run_name=EXP_NAME,
    load_best_model_at_end=True,
    fp16=True,
    fp16_full_eval=True,
    gradient_checkpointing=True,
    optim="adamw_8bit",
)

trainer = Trainer(
    model=model,
    args=training_args,
    # train_dataset=train_valid_dataset["train"],
    train_dataset=ConcatDataset(train_valid_dataset["train"]),
    # eval_dataset=train_valid_dataset["valid"],
    eval_dataset=ConcatDataset(train_valid_dataset["valid"]),
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [33]:
# debugしたい場合は、一度STOPして、もう一度debugするとなぜかできる
if TRAINING:
    # モデルの学習
    trainer.train(
        resume_from_checkpoint = RESUME_FROM_CHECKPOINT if RESUME_FROM_CHECKPOINT else None
    )
    # ログの保存に利用したストレージを削除
    # os.system(f"rm -rf {MODEL_OUTPUT_PATH}/checkpoint-*")
    # モデルの保存
    trainer.save_model(MODEL_OUTPUT_PATH)
else:
    # TRAINED_MODEL_PATHを用いて、学習済のモデルを読み込む
    model = AutoModelForSequenceClassification.from_pretrained(
        TRAINED_MODEL_PATH,
        num_labels=NUM_LABELS,
    )
    # model = CustomDebertaSequenceClassification.from_pretrained(MODEL_NAME)

    args = TrainingArguments(
        ".",
        per_device_eval_batch_size=4,
        report_to="none",
        fp16=True,
    )

    trainer = Trainer(
        model=model,
        args=args,
        data_collator=data_collator,
        tokenizer=tokenizer,
    )

Step,Training Loss,Validation Loss,Log Loss,Acc,Runtime,Samples Per Second,Steps Per Second
30,1.8313,1.938657,1.938169,0.339667,1309.8499,2.29,0.573
60,1.5839,1.625154,1.625155,0.381333,1309.5154,2.291,0.573
90,1.525,1.52927,1.529259,0.405,1309.4169,2.291,0.573
120,1.4742,1.47415,1.474153,0.394333,1309.3041,2.291,0.573
150,1.4449,1.437529,1.437531,0.415333,1309.1744,2.292,0.573
180,1.5792,1.423985,1.423984,0.419667,1309.3494,2.291,0.573
210,1.4457,1.411852,1.411853,0.416,1309.2144,2.291,0.573
240,1.3313,1.404825,1.404828,0.411667,1309.2507,2.291,0.573
270,1.4003,1.396047,1.396042,0.416667,1309.2566,2.291,0.573


```
/usr/local/lib/python3.11/dist-packages/torch/utils/checkpoint.py:61: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
TODO: この　Warningが問題ないのかを調べる

```

# valid_datasetの作成・保存

In [34]:
# TRAININGをINFERRENCEでMAX_TOKENを変えるために、validを作り直す
valid_dataset = train_dataset.filter(
    lambda example: example["id"] in train_valid_dataset["valid"]["id"],
    num_proc=NUM_PROC,
)

valid_dataset = valid_dataset.map(
    tokenize,
    batched=False,
    fn_kwargs={
        "suffix": "a",
        "max_token_length": INFERENCE_MAX_LENGTH
    },
    num_proc=NUM_PROC, # NOTE: debugする際には1にする
).map(
    tokenize,
    batched=False,
    fn_kwargs={
        "suffix": "b",
        "max_token_length": INFERENCE_MAX_LENGTH
    },
    num_proc=NUM_PROC,
)

def add_valid_pred(example, idx, valid_pred):
    example["valid_pred"] = valid_pred[idx]
    return example

# valid_pred = softmax(trainer.predict(valid_dataset).predictions, axis=-1)
valid_pred = softmax(
    trainer.predict(ConcatDataset(train_valid_dataset["valid"])).predictions,
    axis=-1
)

np.save(f"{MODEL_OUTPUT_PATH}/valid_prediction.npy", valid_pred)

valid_dataset = valid_dataset.map(
    add_valid_pred, with_indices=True, fn_kwargs={"valid_pred": valid_pred}
)

valid_dataset.save_to_disk(f"{MODEL_OUTPUT_PATH}/valid_dataset")

Filter (num_proc=12):   0%|          | 0/57477 [00:00<?, ? examples/s]

Map (num_proc=12):   0%|          | 0/3000 [00:00<?, ? examples/s]

Map (num_proc=12):   0%|          | 0/3000 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3000 [00:00<?, ? examples/s]

# CVの計算

In [35]:
cv_score = log_loss(valid_dataset["labels"], valid_pred)
print(f"CV Score: {cv_score}")

In [36]:
# output_textを保存
with open(f"{MODEL_OUTPUT_PATH}/cv_score.txt", "w") as f:
    f.write(str(cv_score))

# AWSへのアップロード

In [37]:
# S3へのアップロード
# TODO: colabでは動かないため直す
if not DEBUG and UPLOAD_DATA_TO_S3:
    # uninstall
    !sudo rm /usr/bin/aws
    !sudo rm /usr/bin/aws_completer
    !sudo rm -rf /usr/local/aws-cli

    # install
    !curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip"
    !unzip -o -qq awscliv2.zip
    !sudo ./aws/install --update

    # upload
    output_name = MODEL_OUTPUT_PATH.split("/")[-1]
    os.system(
        f"aws s3 cp --recursive {MODEL_OUTPUT_PATH} s3://{COMPETITION_NAME}/trained_model/{output_name}"
    )

In [38]:
# ダウンロード（参考）
# !sudo rm /usr/bin/aws
# !sudo rm /usr/bin/aws_completer
# !sudo rm -rf /usr/local/aws-cli

# !curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip"
# !unzip -o -qq awscliv2.zip
# !sudo ./aws/install --update

# !aws s3 cp --recursive s3://automated-essay-scoring/trained_model/e005-regression /notebooks/automated_essay_scoring/trained_models/e005-regression

# Kaggle Datasetへのupload

In [39]:
import os

os.system("mkdir -p ~/.kaggle/")
os.system(f"cp /{DATA_PATH}/kaggle.json ~/.kaggle/")
os.system("chmod 600 ~/.kaggle/kaggle.json")

0

In [40]:
if not DEBUG and UPLOAD_DATA_TO_KAGGLE:
    import os
    import json

    from kaggle.api.kaggle_api_extended import KaggleApi

    def dataset_create_new(dataset_name: str, upload_dir: str):
        # if "_" in dataset_name:
        #     raise ValueError("datasetの名称に_の使用は禁止です")
        dataset_metadata = {}
        dataset_metadata["id"] = f"sinchir0/{dataset_name}"
        dataset_metadata["licenses"] = [{"name": "CC0-1.0"}]
        dataset_metadata["title"] = dataset_name
        with open(os.path.join(upload_dir, "dataset-metadata.json"), "w") as f:
            json.dump(dataset_metadata, f, indent=4)
        api = KaggleApi()
        api.authenticate()
        api.dataset_create_new(folder=upload_dir, convert_to_csv=False, dir_mode="tar")

    print(f"Create Dataset name:{DATASET_NAME}, output_dir:{MODEL_OUTPUT_PATH}")
    dataset_create_new(dataset_name=DATASET_NAME, upload_dir=MODEL_OUTPUT_PATH)

# ローカルからのデータの削除

In [41]:
# if not DEBUG and REMOVE_LOCAL_FILE:
#     # ローカルからは削除
#     os.system(f"rm -rf {MODEL_OUTPUT_PATH}")

In [42]:
if WANDB:
    wandb.finish()

In [43]:
print("finish Notebook!")