# 目的
Concatを行う

In [1]:
# path setting
EXP_NAME = "e006-use-concat"
MODEL_NAME = "microsoft/deberta-v3-xsmall"
COMPETITION_NAME = "lmsys"

DATA_PATH = "data"
DATASET_NAME = f"{EXP_NAME}-{MODEL_NAME.split('/')[-1]}"
MODEL_OUTPUT_PATH = f"trained_models/{EXP_NAME}"

# experiment parameter
DEBUG = True
TRAINING = True
UPLOAD_DATA_TO_S3 = True
UPLOAD_DATA_TO_KAGGLE = True
WANDB = True

# model parameter
TRAINING_MAX_LENGTH = 1024
INFERENCE_MAX_LENGTH = 1536
SEED = 42
VALID_DATA_SIZE = 0.3
EPOCH = 3
LR = 2e-05
TRAIN_BS = 4
GRAD_ACC_NUM = 16
EVAL_BS = 4
NUM_LABELS = 3

In [2]:
!nvidia-smi

Mon Jul  8 23:40:10 2024       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.116.04   Driver Version: 525.116.04   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA RTX A6000    Off  | 00000000:00:05.0 Off |                  Off |
| 61%   84C    P2   285W / 300W |   3293MiB / 49140MiB |     92%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
!python --version

Python 3.11.7


In [4]:
def resolve_path(base_path: str) -> str:
    import os

    cwd = os.getcwd()
    print(cwd)
    if cwd == f"/notebooks":
        print("Jupyter Kernel By VSCode!")
        return f"/notebooks/{COMPETITION_NAME}/{base_path}"
    elif cwd == f"/notebooks/{COMPETITION_NAME}":
        print("nohup!")
        return base_path
    elif cwd == f"/notebooks/{COMPETITION_NAME}/{COMPETITION_NAME}/exp":
        print("Jupyter Lab!")
        return f"../../{base_path}"
    else:
        raise Exception("Unknown environment")


DATA_PATH = resolve_path(DATA_PATH)
print(DATA_PATH)
MODEL_OUTPUT_PATH = resolve_path(MODEL_OUTPUT_PATH)
print(MODEL_OUTPUT_PATH)

/notebooks/lmsys/lmsys/exp
Jupyter Lab!
../../data
/notebooks/lmsys/lmsys/exp
Jupyter Lab!
../../trained_models/e006-use-concat


In [5]:
def validate_dataset_name(dataset_name: str) -> None:
    if len(dataset_name) < 6 or len(dataset_name) > 50:
        raise Exception(
            f"データセットの文字列は6~50文字にしてください。現在{len(DATASET_NAME)}文字"
        )
    if "_" in dataset_name:
        raise Exception("datasetの名称に_の使用は禁止です")


validate_dataset_name(DATASET_NAME)

# install

In [6]:
%pip install -qq polars==1.0.0
%pip install -qq transformers==4.42.3
%pip install -qq datasets==2.20.0
%pip install -qq evaluate==0.4.2
%pip install -qq seqeval==1.2.2
%pip install -qq accelerate==0.32.0
%pip install -qq python-dotenv==1.0.1
%pip install -qq wandb==0.17.4

# formatter
%pip install -qq black isort

%pip install -qq kaggle

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


# import

In [7]:
import os
import random
import ast
import json

import polars as pl
import numpy as np
import torch
import torch.nn as nn
import wandb
from datasets import Dataset, DatasetDict, Value, concatenate_datasets, load_dataset, ClassLabel
from tokenizers import AddedToken
from tqdm.auto import tqdm
from scipy.special import softmax
from sklearn.metrics import log_loss
from transformers import (
    AutoConfig,
    AutoModel,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)

2024-07-08 23:40:48.800132: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-08 23:40:48.800194: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-08 23:40:48.801145: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-08 23:40:48.807128: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [8]:
os.environ["TOKENIZERS_PARALLELISM"] = "true"
NUM_PROC = os.cpu_count()

In [9]:
import transformers
import datasets
import evaluate

assert transformers.__version__ == "4.42.3"
assert datasets.__version__ == "2.20.0"
assert evaluate.__version__ == "0.4.2"

In [10]:
# Seed the same seed to all
def seed_everything(seed: int):
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True


seed_everything(SEED)

In [11]:
from dotenv import load_dotenv

load_dotenv(f"{DATA_PATH}/.env")

True

# Wandb

In [12]:
if WANDB:
    wandb.login(key=os.environ["WANDB_API_KEY"])
    wandb.init(project=COMPETITION_NAME, name=EXP_NAME)
    REPORT_TO = "wandb"
else:
    REPORT_TO = "none"

REPORT_TO

[34m[1mwandb[0m: Currently logged in as: [33msinchir0[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


'wandb'

# Data Import & Preprocess

In [13]:
train = (
    pl.read_csv(f"{DATA_PATH}/train.csv")
    .with_columns(
        pl.col("prompt").str.json_decode(),
        pl.col("response_a").str.json_decode(),
        pl.col("response_b").str.json_decode(),
    )
    .with_columns( # 長さの情報を追加する
        pl.col("prompt").map_elements(lambda x: len(x), return_dtype=pl.Int64).alias("len_prompt"),
        pl.col("response_a").map_elements(lambda x: len(x), return_dtype=pl.Int64).alias("len_response_a"),
        pl.col("response_b").map_elements(lambda x: len(x), return_dtype=pl.Int64).alias("len_response_b"),
    )
    .with_columns( # 最初のレスポンスのみを取得する
        pl.col("prompt").map_elements(lambda x: x[0], return_dtype=pl.String).alias("first_prompt"),
        pl.col("response_a").map_elements(lambda x: x[0], return_dtype=pl.String).alias("first_response_a"),
        pl.col("response_b").map_elements(lambda x: x[0], return_dtype=pl.String).alias("first_response_b"),
    )
    .with_columns( # 最後のレスポンスのみを取得する
        pl.col("prompt").map_elements(lambda x: x[-1], return_dtype=pl.String).alias("last_prompt"),
        pl.col("response_a").map_elements(lambda x: x[-1], return_dtype=pl.String).alias("last_response_a"),
        pl.col("response_b").map_elements(lambda x: x[-1], return_dtype=pl.String).alias("last_response_b"),
    )
    .with_columns( # 最後のレスポンスがNoneの場合を空文字にする、約60件程度
        pl.col("last_response_a").fill_null(""),
        pl.col("last_response_b").fill_null(""),
    )
    .with_columns( # labelを付与する
        pl.when(pl.col("winner_model_a") == 1)
        .then(0)
        .when(pl.col("winner_model_b") == 1)
        .then(1)
        .when(pl.col("winner_tie") == 1)
        .then(2)
        .alias("label"),
    )
    .select( # 元のprompt, responseを削除する
        pl.exclude(["prompt", "response_a", "response_b"])
    )
)

In [14]:
if DEBUG:
    train = train.head(100)

In [15]:
train_dataset = Dataset.from_polars(train)

In [16]:
train_dataset

Dataset({
    features: ['id', 'model_a', 'model_b', 'winner_model_a', 'winner_model_b', 'winner_tie', 'len_prompt', 'len_response_a', 'len_response_b', 'first_prompt', 'first_response_a', 'first_response_b', 'last_prompt', 'last_response_a', 'last_response_b', 'label'],
    num_rows: 100
})

# Model

In [None]:
# https://chatgpt.com/share/f7a30189-2ca6-4870-96e6-c06120bf4ca7

# class CustomDebertaModel(nn.Module):
#     def __init__(self, model_name):
#         super(CustomDebertaModel, self).__init__()
#         self.deberta = DebertaModel.from_pretrained(model_name)
#         self.classification_head = nn.Sequential(
#             nn.Linear(self.deberta.config.hidden_size * 2, 256),
#             nn.ReLU(),
#             nn.Linear(256, 1)  # Assuming binary classification
#         )

#     def forward(self, input_ids_a, attention_mask_a, input_ids_b, attention_mask_b):
#         outputs_a = self.deberta(input_ids=input_ids_a, attention_mask=attention_mask_a)
#         outputs_b = self.deberta(input_ids=input_ids_b, attention_mask=attention_mask_b)
        
#         cls_a = outputs_a.last_hidden_state[:, 0, :]  # CLS token
#         cls_b = outputs_b.last_hidden_state[:, 0, :]  # CLS token
        
#         combined = torch.cat((cls_a, cls_b), dim=1)
#         logits = self.classification_head(combined)
#         return logits

In [37]:
# # https://github.com/sinchir0/automated_essay_scoring/blob/main/automated_essay_scoring/exp/exp030.ipynb

# # https://dev.classmethod.jp/articles/huggingface-usage-custom-model/
# # https://github.com/huggingface/transformers/blob/94b3f544a1f5e04b78d87a2ae32a7ac252e22e31/src/transformers/models/deberta_v2/modeling_deberta_v2.py#L1313
# class CustomDebertaSequenceClassification(DebertaV2PreTrainedModel):
#     def __init__(self, config):
#         super().__init__(config)

#         num_labels = getattr(config, "num_labels", 2)
#         self.num_labels = num_labels

#         self.deberta = DebertaV2Model(config)
#         self.pooler = ContextPooler(config)
#         output_dim = self.pooler.output_dim

#         self.classifier = nn.Linear(output_dim, num_labels)
#         drop_out = getattr(config, "cls_dropout", None)
#         drop_out = self.config.hidden_dropout_prob if drop_out is None else drop_out
#         self.dropout = StableDropout(drop_out)

#         # Initialize weights and apply final processing
#         self.post_init()

#     def get_input_embeddings(self):
#         return self.deberta.get_input_embeddings()

#     def set_input_embeddings(self, new_embeddings):
#         self.deberta.set_input_embeddings(new_embeddings)

#     def forward(
#         self,
#         input_ids: Optional[torch.Tensor] = None,
#         attention_mask: Optional[torch.Tensor] = None,
#         token_type_ids: Optional[torch.Tensor] = None,
#         # position_ids: Optional[torch.Tensor] = None,
#         inputs_embeds: Optional[torch.Tensor] = None,
#         labels: Optional[torch.Tensor] = None,
#         # output_attentions: Optional[bool] = None,
#         # output_hidden_states: Optional[bool] = None,
#         return_dict: Optional[bool] = None,
#     ) -> Union[Tuple, SequenceClassifierOutput]:
#         return_dict = (
#             return_dict if return_dict is not None else self.config.use_return_dict
#         )

#         outputs = self.deberta(
#             input_ids,
#             token_type_ids=token_type_ids,
#             attention_mask=attention_mask,
#             # position_ids=position_ids,
#             inputs_embeds=inputs_embeds,
#             # output_attentions=output_attentions,
#             # output_hidden_states=output_hidden_states,
#             return_dict=return_dict,
#         )

#         encoder_layer = outputs[0]
#         pooled_output = self.pooler(encoder_layer)
#         pooled_output = self.dropout(pooled_output)
#         logits = self.classifier(pooled_output)

#         loss = None
#         if labels is not None:
#             loss_fn = nn.MSELoss()
#             logits = logits.view(-1).to(labels.dtype)
#             loss = loss_fn(logits, labels.view(-1))
#         # if labels is not None:
#         #     if self.config.problem_type is None:
#         #         if self.num_labels == 1:
#         #             # regression task
#         #             loss_fn = nn.MSELoss()
#         #             logits = logits.view(-1).to(labels.dtype)
#         #             loss = loss_fn(logits, labels.view(-1))
#         #         elif labels.dim() == 1 or labels.size(-1) == 1:
#         #             label_index = (labels >= 0).nonzero()
#         #             labels = labels.long()
#         #             if label_index.size(0) > 0:
#         #                 labeled_logits = torch.gather(
#         #                     logits,
#         #                     0,
#         #                     label_index.expand(label_index.size(0), logits.size(1)),
#         #                 )
#         #                 labels = torch.gather(labels, 0, label_index.view(-1))
#         #                 loss_fct = CrossEntropyLoss()
#         #                 loss = loss_fct(
#         #                     labeled_logits.view(-1, self.num_labels).float(),
#         #                     labels.view(-1),
#         #                 )
#         #             else:
#         #                 loss = torch.tensor(0).to(logits)
#         #         else:
#         #             log_softmax = nn.LogSoftmax(-1)
#         #             loss = -((log_softmax(logits) * labels).sum(-1)).mean()
#         #     elif self.config.problem_type == "regression":
#         #         loss_fct = MSELoss()
#         #         if self.num_labels == 1:
#         #             loss = loss_fct(logits.squeeze(), labels.squeeze())
#         #         else:
#         #             loss = loss_fct(logits, labels)
#         #     elif self.config.problem_type == "single_label_classification":
#         #         loss_fct = CrossEntropyLoss()
#         #         loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
#         #     elif self.config.problem_type == "multi_label_classification":
#         #         loss_fct = BCEWithLogitsLoss()
#         #         loss = loss_fct(logits, labels)
#         # if not return_dict:
#         #     output = (logits,) + outputs[1:]
#         #     return ((loss,) + output) if loss is not None else output

#         return SequenceClassifierOutput(
#             loss=loss,
#             logits=logits,
#             # hidden_states=outputs.hidden_states,
#             # attentions=outputs.attentions,
#         )

In [None]:
# https://github.com/sinchir0/automated_essay_scoring/blob/main/automated_essay_scoring/exp/exp030.ipynb

# https://dev.classmethod.jp/articles/huggingface-usage-custom-model/
# https://github.com/huggingface/transformers/blob/94b3f544a1f5e04b78d87a2ae32a7ac252e22e31/src/transformers/models/deberta_v2/modeling_deberta_v2.py#L1313
class CustomDebertaSequenceClassification(DebertaV2PreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        num_labels = getattr(config, "num_labels", 2)
        self.num_labels = num_labels

        self.deberta = DebertaV2Model(config)
        self.pooler = ContextPooler(config)
        output_dim = self.pooler.output_dim

        self.classifier = nn.Linear(output_dim, num_labels)
        drop_out = getattr(config, "cls_dropout", None)
        drop_out = self.config.hidden_dropout_prob if drop_out is None else drop_out
        self.dropout = StableDropout(drop_out)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.deberta.get_input_embeddings()

    def set_input_embeddings(self, new_embeddings):
        self.deberta.set_input_embeddings(new_embeddings)

    def forward(
        self,
        input_ids_a: Optional[torch.Tensor] = None,
        input_ids_b: Optional[torch.Tensor] = None,
        attention_mask_a: Optional[torch.Tensor] = None,
        attention_mask_b: Optional[torch.Tensor] = None,
        token_type_ids_a: Optional[torch.Tensor] = None,
        token_type_ids_b: Optional[torch.Tensor] = None,
        # position_ids: Optional[torch.Tensor] = None,
        inputs_embeds_a: Optional[torch.Tensor] = None,
        inputs_embeds_b: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        # output_attentions: Optional[bool] = None,
        # output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, SequenceClassifierOutput]:
        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )

        outputs_a = self.deberta(
            input_ids_a,
            token_type_ids=token_type_ids_a,
            attention_mask=attention_mask_a,
            # position_ids=position_ids,
            inputs_embeds=inputs_embeds_a,
            # output_attentions=output_attentions,
            # output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        
        outputs_b = self.deberta(
            input_ids_b,
            token_type_ids=token_type_ids_b,
            attention_mask=attention_mask_b,
            # position_ids=position_ids,
            inputs_embeds=inputs_embeds_b,
            # output_attentions=output_attentions,
            # output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        encoder_layer_a = outputs_a[0]
        encoder_layer_b = outputs_b[0]
        # TODO: vscodeのdebugを利用して、期待している動作をするかを確認する
        # TODO: ここから先を修正する
        pooled_output = self.pooler(encoder_layer)
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        
        loss = None
        if labels is not None:
            loss_fn = nn.MSELoss()
            logits = logits.view(-1).to(labels.dtype)
            loss = loss_fn(logits, labels.view(-1))
        # if labels is not None:
        #     if self.config.problem_type is None:
        #         if self.num_labels == 1:
        #             # regression task
        #             loss_fn = nn.MSELoss()
        #             logits = logits.view(-1).to(labels.dtype)
        #             loss = loss_fn(logits, labels.view(-1))
        #         elif labels.dim() == 1 or labels.size(-1) == 1:
        #             label_index = (labels >= 0).nonzero()
        #             labels = labels.long()
        #             if label_index.size(0) > 0:
        #                 labeled_logits = torch.gather(
        #                     logits,
        #                     0,
        #                     label_index.expand(label_index.size(0), logits.size(1)),
        #                 )
        #                 labels = torch.gather(labels, 0, label_index.view(-1))
        #                 loss_fct = CrossEntropyLoss()
        #                 loss = loss_fct(
        #                     labeled_logits.view(-1, self.num_labels).float(),
        #                     labels.view(-1),
        #                 )
        #             else:
        #                 loss = torch.tensor(0).to(logits)
        #         else:
        #             log_softmax = nn.LogSoftmax(-1)
        #             loss = -((log_softmax(logits) * labels).sum(-1)).mean()
        #     elif self.config.problem_type == "regression":
        #         loss_fct = MSELoss()
        #         if self.num_labels == 1:
        #             loss = loss_fct(logits.squeeze(), labels.squeeze())
        #         else:
        #             loss = loss_fct(logits, labels)
        #     elif self.config.problem_type == "single_label_classification":
        #         loss_fct = CrossEntropyLoss()
        #         loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
        #     elif self.config.problem_type == "multi_label_classification":
        #         loss_fct = BCEWithLogitsLoss()
        #         loss = loss_fct(logits, labels)
        # if not return_dict:
        #     output = (logits,) + outputs[1:]
        #     return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            # hidden_states=outputs.hidden_states,
            # attentions=outputs.attentions,
        )

In [38]:
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME
)
tokenizer.add_tokens(
    [
        AddedToken("\n", normalized=False),
        AddedToken(" " * 2, normalized=False),
    ]
)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, pad_to_multiple_of=16)

# model = AutoModelForSequenceClassification.from_pretrained(
#     MODEL_NAME,
#     num_labels=NUM_LABELS
# )

model = ConcatDebertaModel(model_name=MODEL_NAME)

model.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=16)

You are using a model of type deberta-v2 to instantiate a model of type deberta. This is not supported for all configurations of models and can yield errors.
Some weights of DebertaModel were not initialized from the model checkpoint at microsoft/deberta-v3-xsmall and are newly initialized: ['deberta.encoder.layer.0.attention.self.in_proj.weight', 'deberta.encoder.layer.0.attention.self.pos_proj.weight', 'deberta.encoder.layer.0.attention.self.pos_q_proj.bias', 'deberta.encoder.layer.0.attention.self.pos_q_proj.weight', 'deberta.encoder.layer.0.attention.self.q_bias', 'deberta.encoder.layer.0.attention.self.v_bias', 'deberta.encoder.layer.1.attention.self.in_proj.weight', 'deberta.encoder.layer.1.attention.self.pos_proj.weight', 'deberta.encoder.layer.1.attention.self.pos_q_proj.bias', 'deberta.encoder.layer.1.attention.self.pos_q_proj.weight', 'deberta.encoder.layer.1.attention.self.q_bias', 'deberta.encoder.layer.1.attention.self.v_bias', 'deberta.encoder.layer.10.attention.self.in_p

AttributeError: 'ConcatDebertaModel' object has no attribute 'resize_token_embeddings'

# Tokenize

In [28]:
tmp = tokenizer(
    "aaa",
    max_length=10,
    truncation=True,
    padding="max_length",
)

{key + '_a': value for key, value in tmp.items()}

{'input_ids_a': [1, 266, 15175, 2, 0, 0, 0, 0, 0, 0],
 'token_type_ids_a': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'attention_mask_a': [1, 1, 1, 1, 0, 0, 0, 0, 0, 0]}

In [30]:
# def tokenize(examples, max_token_length: int):
#     separator = " [SEP] "
    
#     joined_text = (
#         examples["last_prompt"]
#         + separator
#         + examples["last_response_a"]
#         + separator
#         + examples["last_response_b"]
#     )

#     tokenized = tokenizer(
#         joined_text,
#         max_length=max_token_length,
#         truncation=True,
#         padding="max_length",
#     )

    # return tokenizer(
    #     joined_text,
    #     max_length=max_token_length,
    #     truncation=True,
    #     padding="max_length",
    # )

def tokenize(examples, suffix, max_token_length: int):
    separator = " [SEP] "
    
    # TODO: ２つ以上の応答も追加する
    joined_text = (
        examples["last_prompt"]
        + separator
        + examples[f"last_response_{suffix}"]
    )

    tokenized = tokenizer(
        joined_text,
        max_length=max_token_length,
        truncation=True,
        padding="max_length",
    )

    return {key + f"_{suffix}": value for key, value in tokenized.items()}

train_dataset = train_dataset.map(
        tokenize,
        batched=False,
        fn_kwargs={
            "suffix": "a",
            "max_token_length": TRAINING_MAX_LENGTH
        },
        num_proc=NUM_PROC,
    ).map(
        tokenize,
        batched=False,
        fn_kwargs={
            "suffix": "b",
            "max_token_length": TRAINING_MAX_LENGTH
        },
        num_proc=NUM_PROC,
    )

Setting TOKENIZERS_PARALLELISM=false for forked processes.


Map (num_proc=8):   0%|          | 0/100 [00:00<?, ? examples/s]

Setting TOKENIZERS_PARALLELISM=false for forked processes.


Map (num_proc=8):   0%|          | 0/100 [00:00<?, ? examples/s]

In [31]:
train_dataset

Dataset({
    features: ['id', 'model_a', 'model_b', 'winner_model_a', 'winner_model_b', 'winner_tie', 'len_prompt', 'len_response_a', 'len_response_b', 'first_prompt', 'first_response_a', 'first_response_b', 'last_prompt', 'last_response_a', 'last_response_b', 'label', 'input_ids', 'token_type_ids', 'attention_mask', 'input_ids_a', 'token_type_ids_a', 'attention_mask_a', 'input_ids_b', 'token_type_ids_b', 'attention_mask_b'],
    num_rows: 100
})

# Train Test Split

In [32]:
def to_train_valid(ds):
    return DatasetDict({"train": ds["train"], "valid": ds["test"]})

train_dataset = train_dataset.cast_column('label', ClassLabel(num_classes=NUM_LABELS))

train_valid_dataset = to_train_valid(
    (
        train_dataset
        .train_test_split(
            test_size=VALID_DATA_SIZE,
            seed=SEED,
            stratify_by_column="label"
        )
    )
)

Casting the dataset:   0%|          | 0/100 [00:00<?, ? examples/s]

In [20]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    preds_prob = softmax(predictions, axis=-1)
    return {"log_loss": log_loss(labels, preds_prob)}

In [21]:
# スケジューラの設定
training_args = TrainingArguments(
    output_dir=MODEL_OUTPUT_PATH,
    learning_rate=LR,
    per_device_train_batch_size=TRAIN_BS,
    gradient_accumulation_steps=GRAD_ACC_NUM,
    eval_accumulation_steps=GRAD_ACC_NUM,
    per_device_eval_batch_size=EVAL_BS,
    num_train_epochs=EPOCH,
    weight_decay=0.01,
    eval_strategy="steps",
    eval_steps=0.1,
    save_strategy="steps",
    save_steps=0.1,
    save_total_limit=1,
    logging_steps=2,
    seed=SEED,
    metric_for_best_model="eval_loss",
    warmup_ratio=0.1,
    lr_scheduler_type="constant_with_warmup",
    report_to=REPORT_TO,
    run_name=EXP_NAME,
    load_best_model_at_end=True,
    fp16=True,
    fp16_full_eval=True,
    gradient_checkpointing=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_valid_dataset["train"],
    eval_dataset=train_valid_dataset["valid"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    #compute_metrics=compute_metrics,
)

In [22]:
if TRAINING:
    # モデルの学習
    trainer.train()
    # ログの保存に利用したストレージを削除
    os.system(f"rm -rf {MODEL_OUTPUT_PATH}/checkpoint-*")
    # モデルの保存
    trainer.save_model(MODEL_OUTPUT_PATH)
else:
    # TRAINED_MODEL_PATHを用いて、学習済のモデルを読み込む
    model = AutoModelForSequenceClassification.from_pretrained(
        TRAINED_MODEL_PATH,
        num_labels=NUM_LABELS,
    )

    args = TrainingArguments(
        ".",
        per_device_eval_batch_size=4,
        report_to="none",
        fp16=True,
    )

    trainer = Trainer(
        model=model,
        args=args,
        data_collator=data_collator,
        tokenizer=tokenizer,
    )

[2024-07-08 22:39:31,066] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)




Step,Training Loss,Validation Loss
1,No log,1.103464
2,1.099800,1.098382
3,1.099800,1.09773




# valid_datasetの作成・保存

In [23]:
# TRAININGをINFERRENCEでMAX_TOKENを変えるために、validを作り直す
valid_dataset = train_dataset.filter(
    lambda example: example["id"]
    in train_valid_dataset["valid"]["id"],
    num_proc=NUM_PROC,
)

valid_dataset = valid_dataset.map(
    tokenize,
    batched=False,
    fn_kwargs={"max_token_length": INFERENCE_MAX_LENGTH},
    num_proc=NUM_PROC,
)


def add_valid_pred(example, idx, valid_pred):
    example["valid_pred"] = valid_pred[idx]
    return example


valid_pred = softmax(
    trainer.predict(valid_dataset).predictions, axis=-1
)

np.save(f"{MODEL_OUTPUT_PATH}/valid_prediction.npy", valid_pred)

valid_dataset = valid_dataset.map(
    add_valid_pred,
    with_indices=True,
    fn_kwargs={"valid_pred": valid_pred}
)

valid_dataset.save_to_disk(f"{MODEL_OUTPUT_PATH}/valid_dataset")

Setting TOKENIZERS_PARALLELISM=false for forked processes.


Filter (num_proc=8):   0%|          | 0/100 [00:00<?, ? examples/s]

Setting TOKENIZERS_PARALLELISM=false for forked processes.


Map (num_proc=8):   0%|          | 0/30 [00:00<?, ? examples/s]

Map:   0%|          | 0/30 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/30 [00:00<?, ? examples/s]

# CVの計算

In [24]:
cv_score = log_loss(valid_dataset["label"], valid_pred)
print(f"CV Score: {cv_score}")

CV Score: 1.0959604981750728


In [25]:
# output_textを保存
with open(f"{MODEL_OUTPUT_PATH}/cv_score.txt", "w") as f:
    f.write(str(cv_score))

# AWSへのアップロード

In [26]:
# S3へのアップロード
if not DEBUG and UPLOAD_DATA_TO_S3:
    # uninstall
    !sudo rm /usr/bin/aws
    !sudo rm /usr/bin/aws_completer
    !sudo rm -rf /usr/local/aws-cli

    # install
    !curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip"
    !unzip -o -qq awscliv2.zip
    !sudo ./aws/install --update

    # upload
    output_name = MODEL_OUTPUT_PATH.split("/")[-1]
    os.system(
        f"aws s3 cp --recursive {MODEL_OUTPUT_PATH} s3://{COMPETITION_NAME}/trained_model/{output_name}"
    )

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 57.9M  100 57.9M    0     0  6054k      0  0:00:09  0:00:09 --:--:-- 6073k00:09  0:00:05  0:00:04 6186k
You can now run: /usr/local/bin/aws --version
upload: ../../trained_models/e005-use-first-end/special_tokens_map.json to s3://lmsys/trained_model/e005-use-first-end/special_tokens_map.json
upload: ../../trained_models/e005-use-first-end/cv_score.txt to s3://lmsys/trained_model/e005-use-first-end/cv_score.txt
upload: ../../trained_models/e005-use-first-end/tokenizer_config.json to s3://lmsys/trained_model/e005-use-first-end/tokenizer_config.json
upload: ../../trained_models/e005-use-first-end/added_tokens.json to s3://lmsys/trained_model/e005-use-first-end/added_tokens.json
upload: ../../trained_models/e005-use-first-end/config.json to s3://lmsys/trained_model/e005-use-first-end/config.json
upload: ../../trained_models/e005-u

In [27]:
# ダウンロード（参考）
# !sudo rm /usr/bin/aws
# !sudo rm /usr/bin/aws_completer
# !sudo rm -rf /usr/local/aws-cli

# !curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip"
# !unzip -o -qq awscliv2.zip
# !sudo ./aws/install --update

# !aws s3 cp --recursive s3://automated-essay-scoring/trained_model/e005-regression /notebooks/automated_essay_scoring/trained_models/e005-regression

# Kaggle Datasetへのupload

In [28]:
import os

os.system("mkdir -p ~/.kaggle/")
os.system(f"cp /notebooks/{COMPETITION_NAME}/kaggle.json ~/.kaggle/")
os.system("chmod 600 ~/.kaggle/kaggle.json")

0

In [29]:
if not DEBUG and UPLOAD_DATA_TO_KAGGLE:
    import os
    import json

    from kaggle.api.kaggle_api_extended import KaggleApi

    def dataset_create_new(dataset_name: str, upload_dir: str):
        # if "_" in dataset_name:
        #     raise ValueError("datasetの名称に_の使用は禁止です")
        dataset_metadata = {}
        dataset_metadata["id"] = f"sinchir0/{dataset_name}"
        dataset_metadata["licenses"] = [{"name": "CC0-1.0"}]
        dataset_metadata["title"] = dataset_name
        with open(os.path.join(upload_dir, "dataset-metadata.json"), "w") as f:
            json.dump(dataset_metadata, f, indent=4)
        api = KaggleApi()
        api.authenticate()
        api.dataset_create_new(folder=upload_dir, convert_to_csv=False, dir_mode="tar")

    print(f"Create Dataset name:{DATASET_NAME}, output_dir:{MODEL_OUTPUT_PATH}")
    dataset_create_new(dataset_name=DATASET_NAME, upload_dir=MODEL_OUTPUT_PATH)

# ローカルからのデータの削除

In [30]:
if (UPLOAD_DATA_TO_S3 or UPLOAD_DATA_TO_KAGGLE):
    # ローカルからは削除
    os.system(f"rm -rf {MODEL_OUTPUT_PATH}")

In [31]:
if WANDB:
    wandb.finish()

In [32]:
print("finish Notebook!")

finish Notebook!
