# 目的
bgeで取得した候補データ（RETRIEVED_DATA_NAME）を使って、reranker（MODEL_NAME）の学習を行う

In [1]:
# path setting
EXP_NAME = "e004-trn-bge-ret"
MODEL_NAME = "microsoft/deberta-v3-xsmall"
COMPETITION_NAME = "eedi-mining-misconceptions-in-mathematics"

DATA_PATH = "retrieved_data"
ENV_PATH = "env_file"
# Model name without the organisation prefix, appended to the experiment name
DATASET_NAME = f"{EXP_NAME}-{MODEL_NAME.split('/')[-1]}"
MODEL_OUTPUT_PATH = f"{COMPETITION_NAME}/trained_models/reranker/{EXP_NAME}"
RETRIEVED_DATA_NAME = "e003-ret-bge-ret25_map0.1841_recall0.5506.csv"

# experiment parameter
DEBUG = False
TRAINING = True
UPLOAD_DATA_TO_S3 = True
UPLOAD_DATA_TO_KAGGLE = True
WANDB = True

# model parameter
TRAINING_MAX_LENGTH = 256  # TODO: need to investigate whether this length is sufficient
INFERENCE_MAX_LENGTH = 256
SEED = 42
# NOTE(review): VALID_DATA_SIZE is not referenced by the visible cells; the split
# further down is made on QuestionId % 3 instead — confirm whether it is still needed.
VALID_DATA_SIZE = 0.3
EPOCH = 2
LR = 2e-05
TRAIN_BS = 8
GRAD_ACC_NUM = 128 // TRAIN_BS  # the effective batch size is TRAIN_BS * GRAD_ACC_NUM
EVAL_BS = 8
NUM_LABELS = 2  # set to 1 for regression, 2 for binary classification

In [2]:
!nvidia-smi

In [3]:
!python --version

In [4]:
def resolve_path(base_path: str) -> str:
    """Rewrite a project-relative path for the current working directory.

    The working directory is printed, compared against the known launch
    locations in a fixed order, and the label of the first match is printed.

    Args:
        base_path: Path relative to the competition root directory.

    Returns:
        ``base_path`` adjusted so that it is valid from the current working
        directory.

    Raises:
        Exception: If the working directory matches none of the known
            locations.
    """
    import os

    cwd = os.getcwd()
    print(cwd)

    two_levels_up = f"../../{base_path}"
    # (working directory, label to print, resolved path); first match wins.
    known_locations = [
        (
            "/notebooks",
            "Jupyter Kernel By VSCode!",
            f"/notebooks/{COMPETITION_NAME}/{base_path}",
        ),
        (f"/notebooks/{COMPETITION_NAME}", "nohup!", base_path),
        (
            f"/notebooks/{COMPETITION_NAME}/{COMPETITION_NAME}/exp",
            "Jupyter Lab!",
            two_levels_up,
        ),
        (
            f"/root/{COMPETITION_NAME}/exp/reranker",
            "VastAi! Reranker",
            two_levels_up,
        ),
        (
            f"/root/{COMPETITION_NAME}/exp/retriever",
            "VastAi! Retriever",
            two_levels_up,
        ),
        (f"/root/{COMPETITION_NAME}", "VastAi!", base_path),
    ]

    for location, label, resolved in known_locations:
        if cwd == location:
            print(label)
            return resolved

    raise Exception("Unknown environment")


# Rewrite the three configured paths for the current environment and echo them.
# NOTE(review): each name is overwritten with its own resolved value, so
# re-running this cell passes an already-resolved path back into resolve_path;
# in environments where a prefix is added, the prefix would be applied twice.
DATA_PATH = resolve_path(DATA_PATH)
print(DATA_PATH)
MODEL_OUTPUT_PATH = resolve_path(MODEL_OUTPUT_PATH)
print(MODEL_OUTPUT_PATH)
ENV_PATH = resolve_path(ENV_PATH)
print(ENV_PATH)

In [5]:
def validate_dataset_name(dataset_name: str) -> None:
    """Check a dataset name against the naming rules this notebook enforces.

    The name must be 6 to 50 characters long and must not contain an
    underscore.

    Args:
        dataset_name: Candidate dataset name.

    Raises:
        ValueError: If the length is outside 6..50 or the name contains "_".
            ``ValueError`` is a subclass of ``Exception``, so callers that
            catch ``Exception`` keep working.
    """
    name_length = len(dataset_name)
    if not 6 <= name_length <= 50:
        # Report the length of the argument that was actually validated,
        # not of the module-level DATASET_NAME.
        raise ValueError(
            f"データセットの文字列は6~50文字にしてください。現在{name_length}文字"
        )
    if "_" in dataset_name:
        raise ValueError("datasetの名称に_の使用は禁止です")


validate_dataset_name(DATASET_NAME)

# install

In [6]:
# %pip install -qq polars==1.7.1
# %pip install -qq transformers==4.44.2
# %pip install -qq sentencepiece==0.2.0
# %pip install -qq datasets==3.0.0
# %pip install -qq evaluate==0.4.3
# %pip install -qq seqeval==1.2.2
# %pip install -qq accelerate==0.34.2
# %pip install -qq python-dotenv==1.0.1
# %pip install -qq wandb==0.18.0
# %pip install -qq kaggle

# import

In [7]:
import os
import random

import polars as pl
import numpy as np
import torch
import wandb
from datasets import (
    Dataset,
    DatasetDict,
)
from tokenizers import AddedToken
from tqdm.auto import tqdm
from scipy.special import softmax
from sklearn.metrics import log_loss
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)

In [8]:
os.environ["TOKENIZERS_PARALLELISM"] = "true"
# Worker processes for datasets map/filter: up to 16, but never more than the
# number of CPUs the machine reports (falls back to 1 if that is unknown).
NUM_PROC = min(16, os.cpu_count() or 1)

In [9]:
import transformers
import datasets
import evaluate
import accelerate

# Fail fast when an installed library differs from the version pinned in the
# install cell; each message reports the version that was actually found.
assert transformers.__version__ == "4.44.2", (
    f"transformers==4.44.2 is required, found {transformers.__version__}"
)
assert datasets.__version__ == "3.0.0", (
    f"datasets==3.0.0 is required, found {datasets.__version__}"
)
assert evaluate.__version__ == "0.4.3", (
    f"evaluate==0.4.3 is required, found {evaluate.__version__}"
)
assert accelerate.__version__ == "0.34.2", (
    f"accelerate==0.34.2 is required, found {accelerate.__version__}"
)

In [10]:
# Seed the same seed to all
def seed_everything(seed: int):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy's global generator and PyTorch (CPU and
    all CUDA devices), and configures cuDNN for deterministic behaviour.

    Args:
        seed: Seed value applied to all generators.
    """
    random.seed(seed)
    # Only affects Python processes started after this point; the hash seed of
    # the already-running interpreter is fixed at start-up.
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not just the current one.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode may pick different kernels between runs, which defeats
    # the deterministic setting above, so it has to be off.
    torch.backends.cudnn.benchmark = False


seed_everything(SEED)

In [11]:
from dotenv import load_dotenv

load_dotenv(f"{ENV_PATH}/.env")

True

# Wandb

In [12]:
# Choose the Trainer's reporting backend; Weights & Biases needs a login and
# an initialised run before it can be used.
if not WANDB:
    REPORT_TO = "none"
else:
    wandb.login(key=os.environ["WANDB_API_KEY"])
    wandb.init(project=COMPETITION_NAME, name=EXP_NAME)
    REPORT_TO = "wandb"

REPORT_TO

'wandb'

# Data Import & Preprocess

In [13]:
train = pl.read_csv(f"{DATA_PATH}/{RETRIEVED_DATA_NAME}").rename({"target": "label"})

In [14]:
if DEBUG:
    # Debug mode: shrink the data to at most 90 rows with label 0 and 10 rows
    # with label 1. shuffle=True together with seed=SEED makes the picked rows
    # both random and reproducible between runs.
    train = pl.concat(
        [
            train.filter(pl.col("label") == 0)
            .sample(fraction=1.0, shuffle=True, seed=SEED)
            .head(90),
            train.filter(pl.col("label") == 1)
            .sample(fraction=1.0, shuffle=True, seed=SEED)
            .head(10),
        ]
    )

In [15]:
train_dataset = Dataset.from_polars(train)

In [16]:
train_dataset

Dataset({
    features: ['QuestionId', 'ConstructName', 'SubjectName', 'QuestionText', 'CorrectAnswer', 'AnswerType', 'AnswerText', 'AllText', 'AnswerAlphabet', 'QuestionId_Answer', 'MisconceptionId', 'PredictMisconceptionId', 'label', 'MisconceptionId_right', 'MisconceptionName', 'PredictMisconceptionName'],
    num_rows: 109250
})

In [17]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Register the newline and the double space as additional, non-normalized tokens.
for extra_token in ("\n", " " * 2):
    tokenizer.add_tokens([AddedToken(extra_token, normalized=False)])

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=NUM_LABELS,
)
# The embedding matrix has to match the tokenizer size after adding tokens.
vocab_size = len(tokenizer)
model.resize_token_embeddings(vocab_size)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, pad_to_multiple_of=16)

# Tokenize

In [18]:
train_dataset

Dataset({
    features: ['QuestionId', 'ConstructName', 'SubjectName', 'QuestionText', 'CorrectAnswer', 'AnswerType', 'AnswerText', 'AllText', 'AnswerAlphabet', 'QuestionId_Answer', 'MisconceptionId', 'PredictMisconceptionId', 'label', 'MisconceptionId_right', 'MisconceptionName', 'PredictMisconceptionName'],
    num_rows: 109250
})

In [19]:
def tokenize(examples, max_token_length: int):
    """Tokenize one row as a single " [SEP] "-separated sequence.

    The text is built, in this order, from ConstructName, SubjectName,
    QuestionText, AnswerText and PredictMisconceptionName.

    Args:
        examples: Mapping that provides the five text fields listed above.
        max_token_length: Length the sequence is truncated and padded to.

    Returns:
        The output of the module-level ``tokenizer`` for the joined text.
    """
    # TODO: decide whether [SEP] is also the right separator in front of the
    # candidate misconception name.
    field_order = (
        "ConstructName",
        "SubjectName",
        "QuestionText",
        "AnswerText",
        "PredictMisconceptionName",
    )
    joined_text = " [SEP] ".join([examples[field] for field in field_order])

    # NOTE(review): padding="max_length" pads every row to max_token_length
    # here, while the Trainer is also given a DataCollatorWithPadding —
    # confirm that fixed-length padding is intended.
    encoded = tokenizer(
        joined_text,
        max_length=max_token_length,
        truncation=True,
        padding="max_length",
    )
    return encoded


train_dataset = train_dataset.map(
    tokenize,
    batched=False,
    fn_kwargs={"max_token_length": TRAINING_MAX_LENGTH},
    num_proc=NUM_PROC,
)

Map (num_proc=16):   0%|          | 0/109250 [00:00<?, ? examples/s]

In [20]:
print(tokenizer.decode(train_dataset["input_ids"][0]))

In [21]:
print(tokenizer.decode(train_dataset["input_ids"][50]))

# Train Test Split

In [22]:
train_dataset

Dataset({
    features: ['QuestionId', 'ConstructName', 'SubjectName', 'QuestionText', 'CorrectAnswer', 'AnswerType', 'AnswerText', 'AllText', 'AnswerAlphabet', 'QuestionId_Answer', 'MisconceptionId', 'PredictMisconceptionId', 'label', 'MisconceptionId_right', 'MisconceptionName', 'PredictMisconceptionName', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 109250
})

In [23]:
# Split by question: rows whose QuestionId is a multiple of 3 form the
# validation set, all other rows are used for training.
train_valid_dataset = DatasetDict(
    train=train_dataset.filter(
        lambda row: row["QuestionId"] % 3 != 0,
        num_proc=NUM_PROC,
    ),
    valid=train_dataset.filter(
        lambda row: row["QuestionId"] % 3 == 0,
        num_proc=NUM_PROC,
    ),
)

Filter (num_proc=16):   0%|          | 0/109250 [00:00<?, ? examples/s]

Filter (num_proc=16):   0%|          | 0/109250 [00:00<?, ? examples/s]

In [24]:
print(train_valid_dataset)

In [25]:
def compute_metrics(eval_pred):
    """Compute the log loss of the model outputs on the evaluation set.

    Args:
        eval_pred: Pair ``(predictions, labels)`` where ``predictions`` holds
            one score per class for every row and ``labels`` the class ids.

    Returns:
        Dict with the single key ``"eval_loss"``.
    """
    predictions, labels = eval_pred
    # Do the softmax in float64: the scores may arrive in reduced precision
    # when evaluation runs in fp16 (fp16_full_eval is enabled for the Trainer).
    preds_prob = softmax(np.asarray(predictions, dtype=np.float64), axis=-1)
    # Pass the class set explicitly so that log_loss does not have to infer it
    # from the labels, which fails when the evaluation rows contain one class
    # only (e.g. a small DEBUG subset).
    class_ids = np.arange(preds_prob.shape[-1])
    return {"eval_loss": log_loss(labels, preds_prob, labels=class_ids)}

In [26]:
# Training configuration: batch sizes, schedule, evaluation/checkpoint cadence and precision
training_args = TrainingArguments(
    output_dir=MODEL_OUTPUT_PATH,
    learning_rate=LR,
    per_device_train_batch_size=TRAIN_BS,
    gradient_accumulation_steps=GRAD_ACC_NUM,
    eval_accumulation_steps=GRAD_ACC_NUM,
    per_device_eval_batch_size=EVAL_BS,
    num_train_epochs=EPOCH,
    weight_decay=0.01,
    # Evaluation and checkpointing use the same step schedule; the recorded
    # run below evaluates every 114 steps, i.e. the 0.1 is presumably taken as
    # a fraction of the total training steps — confirm in the transformers docs.
    eval_strategy="steps",
    eval_steps=0.1,
    save_strategy="steps",
    save_steps=0.1,
    save_total_limit=1,
    logging_steps=2,
    seed=SEED,
    # compute_metrics reports its value under this same key.
    metric_for_best_model="eval_loss",
    warmup_ratio=0.1,
    lr_scheduler_type="cosine_with_restarts",
    report_to=REPORT_TO,
    run_name=EXP_NAME,
    load_best_model_at_end=True,
    fp16=True,
    fp16_full_eval=True,
    gradient_checkpointing=True,
)

# Trainer wired to the QuestionId-based train/valid split and the padding collator.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_valid_dataset["train"],
    eval_dataset=train_valid_dataset["valid"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [27]:
if TRAINING:
    import shutil
    from pathlib import Path

    trainer.train()
    # Remove the intermediate checkpoints written during training to free disk
    # space. Done in Python rather than through a shell "rm -rf" so that the
    # output path is never interpreted by a shell.
    for checkpoint_path in list(Path(MODEL_OUTPUT_PATH).glob("checkpoint-*")):
        if checkpoint_path.is_dir() and not checkpoint_path.is_symlink():
            shutil.rmtree(checkpoint_path, ignore_errors=True)
        else:
            checkpoint_path.unlink(missing_ok=True)
    # Save the model
    trainer.save_model(MODEL_OUTPUT_PATH)
else:
    # Inference only: load the previously saved model and wrap it in a Trainer
    # that is used for prediction.
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_OUTPUT_PATH)
    args = TrainingArguments(
        ".",
        per_device_eval_batch_size=4,
        report_to="none",
        fp16=True,
    )

    trainer = Trainer(
        model=model,
        args=args,
        data_collator=data_collator,
        tokenizer=tokenizer,
    )

Step,Training Loss,Validation Loss
114,0.0862,0.102761
228,0.1086,0.103491
342,0.0662,0.104004
456,0.098,0.102372
570,0.1723,0.10544
684,0.1429,0.101789
798,0.1248,0.097726
912,0.1254,0.096416
1026,0.0861,0.095778


# valid_datasetの作成・保存

In [28]:
# Skipped because it takes an unexplained amount of time.
# Rebuild the validation set so that inference can use a different max token
# length than training.
# valid_dataset = train_dataset.filter(
#     lambda example: example["QuestionId"] in train_valid_dataset["valid"]["QuestionId"],
#     num_proc=NUM_PROC,
# )

# valid_dataset = valid_dataset.map(
#     tokenize,
#     batched=False,
#     fn_kwargs={"max_token_length": INFERENCE_MAX_LENGTH},
#     num_proc=NUM_PROC,
# )

# Reuse the validation split as it was tokenized for training.
valid_dataset = train_valid_dataset["valid"]


def add_valid_pred(example, idx, valid_pred):
    """Attach the prediction at position ``idx`` to a row.

    Args:
        example: Mutable mapping for one row; it is modified in place.
        idx: Position of the row inside ``valid_pred``.
        valid_pred: Indexable collection of per-row predictions.

    Returns:
        The same ``example`` object with a ``"valid_pred"`` entry set.
    """
    row_prediction = valid_pred[idx]
    example["valid_pred"] = row_prediction
    return example


# Predict on the validation split and apply a softmax over the last axis of the model outputs.
valid_pred = softmax(trainer.predict(valid_dataset).predictions, axis=-1)

# Store the raw prediction array next to the model files.
np.save(f"{MODEL_OUTPUT_PATH}/valid_prediction.npy", valid_pred)

# Add each row's prediction as a "valid_pred" column (matched by row index),
# then write the resulting dataset to disk.
valid_dataset = valid_dataset.map(
    add_valid_pred, with_indices=True, fn_kwargs={"valid_pred": valid_pred}
)
valid_dataset.save_to_disk(f"{MODEL_OUTPUT_PATH}/valid_dataset")

Map:   0%|          | 0/36500 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/36500 [00:00<?, ? examples/s]

# CVの計算

In [29]:
# The validation split has about 30000 rows because every row is multiplied by 25.
# The underlying data is roughly 1200 questions * 4 (minus the rows dropped for
# NaN), so valid_data_for_cv should end up at about that size.
valid_data_for_cv = (
    (
        valid_dataset.to_polars()
        .with_columns(
            # Keep only the second entry of each prediction vector
            # (presumably the probability of label 1 — confirm against the label encoding).
            pl.col("valid_pred").map_elements(lambda x: x[1], return_dtype=pl.Float64)
        )
        # Within each QuestionId_Answer, order the candidates from highest to lowest score.
        .sort(by=["QuestionId_Answer", "valid_pred"], descending=[False, True])
        # Collect the ordered candidate ids into one list per QuestionId_Answer.
        .group_by(["QuestionId_Answer"], maintain_order=True)
        .agg(pl.col("PredictMisconceptionId").alias("Predict"))
    )
    .join(
        # Attach the MisconceptionId of each QuestionId_Answer (used as the label for the CV score).
        valid_dataset.to_polars()[["QuestionId_Answer", "MisconceptionId"]].unique(),
        on=["QuestionId_Answer"],
    )
    .sort(by=["QuestionId_Answer"])
)

valid_data_for_cv

QuestionId_Answer,Predict,MisconceptionId
str,list[i64],i64
"""0_D""","[2532, 2518, … 373]",1672
"""1002_B""","[1739, 419, … 781]",1715
"""1002_C""","[1739, 419, … 781]",2308
"""1005_A""","[1739, 419, … 671]",760
"""1005_B""","[1739, 419, … 671]",1715
…,…,…
"""99_B""","[398, 935, … 355]",1815
"""99_D""","[398, 935, … 355]",255
"""9_A""","[362, 2380, … 1713]",1889
"""9_B""","[362, 2380, … 1713]",1234


In [30]:
# https://www.kaggle.com/code/cdeotte/how-to-train-open-book-model-part-1#MAP@3-Metric
def map_at_25(predictions, labels, k: int = 25):
    """Mean average precision at ``k`` with exactly one correct id per row.

    Each row scores ``1 / rank`` of the first position (1-based, within the
    first ``k`` entries) at which the correct id appears, or 0 if it does not
    appear there.

    Args:
        predictions: One ranked sequence of predicted ids per row, best first.
        labels: The correct id for each row, aligned with ``predictions``.
        k: Number of leading predictions taken into account (default 25).

    Returns:
        The mean of the per-row scores, or 0.0 when there are no rows.
    """
    n_rows = len(predictions)
    if n_rows == 0:
        # Nothing to average; avoid dividing by zero.
        return 0.0

    score_sum = 0.0
    for ranked_ids, true_id in zip(predictions, labels):
        for rank, predicted_id in zip(range(1, k + 1), ranked_ids):
            if predicted_id == true_id:
                score_sum += 1.0 / rank
                # Only the first hit counts, so a repeated id cannot push a
                # row's score above 1.
                break
    return score_sum / n_rows


map_at_25_score = map_at_25(
    valid_data_for_cv["Predict"], valid_data_for_cv["MisconceptionId"]
)
print(f"MAP@25 Score: {map_at_25_score}")

In [31]:
# Save the CV score as text next to the model files
with open(f"{MODEL_OUTPUT_PATH}/cv_score.txt", "w") as f:
    f.write(str(map_at_25_score))

# AWSへのアップロード

In [32]:
# Upload to S3
if not DEBUG and UPLOAD_DATA_TO_S3:
    # Remove any existing AWS CLI installation
    !sudo rm /usr/bin/aws
    !sudo rm /usr/bin/aws_completer
    !sudo rm -rf /usr/local/aws-cli

    # Download and install AWS CLI v2
    !curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip"
    !unzip -o -qq awscliv2.zip
    !sudo ./aws/install --update

    # Upload the model output directory under the name of its last path component
    # NOTE(review): the S3 prefix is "trained_model" while the local directory
    # is "trained_models" — confirm that this difference is intentional.
    output_name = MODEL_OUTPUT_PATH.split("/")[-1]
    os.system(
        f"aws s3 cp --recursive {MODEL_OUTPUT_PATH} s3://{COMPETITION_NAME}/trained_model/{output_name}"
    )

In [33]:
# ダウンロード（参考）
# !sudo rm /usr/bin/aws
# !sudo rm /usr/bin/aws_completer
# !sudo rm -rf /usr/local/aws-cli

# !curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip"
# !unzip -o -qq awscliv2.zip
# !sudo ./aws/install --update

# !aws s3 cp --recursive s3://automated-essay-scoring/trained_model/e005-regression /notebooks/automated_essay_scoring/trained_models/e005-regression

# Kaggle Datasetへのupload

In [34]:
if not DEBUG and UPLOAD_DATA_TO_KAGGLE:
    import json
    import os

    from kaggle.api.kaggle_api_extended import KaggleApi

    def dataset_create_new(dataset_name: str, upload_dir: str):
        """Create a new Kaggle dataset from the contents of a directory.

        Writes ``dataset-metadata.json`` into ``upload_dir`` and then creates
        the dataset through the Kaggle API.

        Args:
            dataset_name: Used both as the dataset title and as the slug
                under the ``sinchir0`` account.
            upload_dir: Directory whose contents are uploaded.
        """
        metadata_path = os.path.join(upload_dir, "dataset-metadata.json")
        dataset_metadata = {
            "id": f"sinchir0/{dataset_name}",
            "licenses": [{"name": "CC0-1.0"}],
            "title": dataset_name,
        }
        with open(metadata_path, "w") as metadata_file:
            json.dump(dataset_metadata, metadata_file, indent=4)

        kaggle_client = KaggleApi()
        kaggle_client.authenticate()
        kaggle_client.dataset_create_new(
            folder=upload_dir,
            convert_to_csv=False,
            dir_mode="tar",
        )

    print(f"Create Dataset name:{DATASET_NAME}, output_dir:{MODEL_OUTPUT_PATH}")
    dataset_create_new(dataset_name=DATASET_NAME, upload_dir=MODEL_OUTPUT_PATH)

# ローカルからのデータの削除

In [35]:
# if not DEBUG and (UPLOAD_DATA_TO_S3 or UPLOAD_DATA_TO_KAGGLE):
#     # ローカルからは削除
#     os.system(f"rm -rf {MODEL_OUTPUT_PATH}")

In [36]:
if WANDB:
    wandb.finish()

In [37]:
print("finish Notebook!")