# atmacup20 1st place solution
[1位解法](https://www.guruguru.science/competitions/27/discussions/960838f8-36ee-4002-9992-3861c37b7d62/)のシングルモデルを再現するコードを共有します。

- 手元のRTX3090環境で実行時間約80min(1fold約15min)
- CV:0.69525、Public:0.6922、Private:0.7191<br>※なぜか乱数固定ができておらず、実行のたびに若干値がブレます

In [None]:
import os
import random
from pathlib import Path
from typing import Optional, Union
import datetime
import shutil

import numpy as np
import torch
from sklearn.metrics import roc_auc_score
from transformers import (
    AutoTokenizer,
    DataCollatorWithPadding,
    EvalPrediction,
    ModernBertForSequenceClassification,
    PreTrainedTokenizerBase,
    Trainer,
    TrainingArguments,
)
from transformers.modeling_outputs import SequenceClassifierOutput
from datasets import Dataset
from tqdm.auto import tqdm
import polars as pl
from sklearn.model_selection import StratifiedGroupKFold
from box import Box
import yaml

In [None]:
# Experiment configuration, kept as an inline YAML document so the notebook is self-contained.
# The `#` remarks inside the string are YAML comments (dropped by yaml.safe_load); they mark
# input_dir and work_dir as paths that must be changed per environment.
CONFIG = """
input_dir: ../../data/raw/input # ここは環境によって変更
work_dir: G:/マイドライブ/competitions/atma_udemy/sample_code/1th_place_solution # ここは環境によって変更

n_splits: 5
model: sbintuitions/modernbert-ja-30m
max_length: 1024
optim_type: adamw_torch
per_device_train_batch_size: 8
gradient_accumulation_steps: 8
per_device_eval_batch_size: 16
eval_steps: 50
max_steps: 1000
max_grad_norm: 10.0
lr: 2.0e-5
weight_decay: 0.01
warmup_steps: 25
lr_scheduler_type: cosine

classifier_pooling: cls

seed: 42
exp_name: atma20-bert-001

"""

# Parse the YAML and wrap it in a Box so later cells read values as attributes (config.seed, ...).
config = Box(yaml.safe_load(CONFIG))

In [None]:
# Column name -> polars dtype mappings, passed as `schema=` to pl.read_csv when the
# corresponding CSV files are loaded.

# Schema for udemy_activity.csv.
UDEMY_ACTIVITY_SCHEMA = {
    "社員番号": pl.Utf8,
    "コースID": pl.Int64,
    "コースタイトル": pl.Utf8,
    "レクチャーもしくはクイズ": pl.Utf8,
    "レクチャー/クイズID": pl.Int64,
    "レクチャー/クイズの題名": pl.Utf8,
    "開始日": pl.Datetime,
    "終了日": pl.Datetime,
    "推定完了率%": pl.Float64,
    "最終結果（クイズの場合）": pl.Float64,
    "マーク済み修了": pl.Boolean,
    "コースカテゴリー": pl.Utf8,
}
# Schema for dx.csv.
DX_SCHEMA = {
    "社員番号": pl.Utf8,
    "研修実施日": pl.Datetime,
    "研修カテゴリ": pl.Utf8,
    "研修名": pl.Utf8,
}
# Schema for hr.csv. "実施日" is read as text here; preprocess_data extracts the date
# with a regex and parses it into a datetime column.
HR_SCHEMA = {
    "社員番号": pl.Utf8,
    "カテゴリ": pl.Utf8,
    "研修名": pl.Utf8,
    "実施日": pl.Utf8,
}
# Schema for overtime_work_by_month.csv.
OVERTIME_WORK_BY_MONTH_SCHEMA = {
    "社員番号": pl.Utf8,
    "date": pl.Datetime,
    "hours": pl.Float64,
}

In [None]:
def seed_everything(seed: int = 42, deterministic: bool = False):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: Seed value applied to every generator.
        deterministic: When True, ask cuDNN for deterministic kernels and
            switch off its autotuner (``cudnn.benchmark``), which may otherwise
            select different kernels from run to run. When False the benchmark
            flag is left untouched.

    Note:
        Other GPU operations can still be non-deterministic, so this narrows
        run-to-run variation without guaranteeing bit-identical results.
    """
    random.seed(seed)
    np.random.seed(seed)
    # Only inherited by processes started afterwards; hash randomization of
    # the running interpreter is fixed at start-up.
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not just the current device (safe without CUDA).
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = deterministic
    if deterministic:
        torch.backends.cudnn.benchmark = False


def compute_metrics(eval_preds: EvalPrediction) -> dict:
    """Compute ROC-AUC from the evaluation logits.

    The logits in ``eval_preds.predictions`` are softmaxed over the last axis
    and the probability in column 1 is scored against ``eval_preds.label_ids``.

    Returns:
        ``{"auc": <ROC-AUC>}``.
    """
    logit_tensor = torch.from_numpy(eval_preds.predictions).float()
    positive_probs = torch.softmax(logit_tensor, dim=-1).numpy()[:, 1]
    return {"auc": roc_auc_score(eval_preds.label_ids, positive_probs)}


def make_cv(df: pl.DataFrame, n_splits: int, seed: int) -> pl.DataFrame:
    """Append a ``fold`` column produced by a stratified, grouped K-fold split.

    Rows are stratified on ``target`` and grouped on ``社員番号`` so that all
    rows of one employee fall into the same validation fold.

    Args:
        df: Frame containing ``target`` and ``社員番号`` columns.
        n_splits: Number of folds.
        seed: ``random_state`` of the shuffled splitter.

    Returns:
        ``df`` with an extra Int64 ``fold`` column holding each row's
        validation-fold index.
    """
    splitter = StratifiedGroupKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    labels = df.get_column("target")
    groups = df.get_column("社員番号")

    fold_of_row = np.zeros(len(df))
    for fold_id, (_, held_out_rows) in enumerate(splitter.split(df, labels, groups)):
        fold_of_row[held_out_rows] = fold_id

    fold_column = pl.Series(fold_of_row, dtype=pl.Int64).alias("fold")
    return df.with_columns(fold_column)

In [None]:
# Seed every RNG once up front and request deterministic cuDNN kernels.
seed_everything(config.seed, deterministic=True)

# プロンプト作成

In [None]:
def _fiscal_year_expr(column: str, fiscal_years: tuple[int, ...]) -> pl.Expr:
    """Return a ``year`` expression mapping dates in ``column`` to fiscal years.

    A fiscal year ``Y`` covers ``[Y-04-01, (Y+1)-04-01)``. Dates outside every
    listed fiscal year (and null dates) map to null.
    """
    if not fiscal_years:
        raise ValueError("fiscal_years must not be empty")

    date = pl.col(column)
    expr = None
    for fiscal_year in fiscal_years:
        in_fiscal_year = (date >= datetime.datetime(fiscal_year, 4, 1)) & (
            date < datetime.datetime(fiscal_year + 1, 4, 1)
        )
        # First branch starts the when/then chain, later ones extend it.
        branch = pl.when(in_fiscal_year) if expr is None else expr.when(in_fiscal_year)
        expr = branch.then(pl.lit(fiscal_year))
    return expr.alias("year")


def preprocess_data(
    udemy_activity_df: pl.DataFrame,
    dx_df: pl.DataFrame,
    hr_df: pl.DataFrame,
    overtime_work_by_month_df: pl.DataFrame,
    position_history_df: pl.DataFrame,
) -> tuple[pl.DataFrame, pl.DataFrame, pl.DataFrame, pl.DataFrame, pl.DataFrame]:
    """Attach a fiscal ``year`` column to the auxiliary tables.

    Args:
        udemy_activity_df: Udemy activity log; ``year`` is derived from "終了日".
        dx_df: DX training log; ``year`` is derived from "研修実施日".
        hr_df: HR training log; the date is extracted from the text column
            "実施日" into ``date``, duplicate rows are dropped, then ``year``
            is derived from ``date``.
        overtime_work_by_month_df: Monthly overtime; returned unchanged.
        position_history_df: Position history; its ``year`` column is shifted
            by +2000 (presumably stored as a two-digit year; verify against
            the data).

    Returns:
        The five frames in the same order as the parameters.
    """
    udemy_activity_df = udemy_activity_df.with_columns(
        _fiscal_year_expr("終了日", (2022, 2023, 2024, 2025))
    )
    # NOTE(review): the DX table has never been mapped to fiscal year 2025,
    # unlike the other two tables; kept as-is so the output is unchanged.
    dx_df = dx_df.with_columns(_fiscal_year_expr("研修実施日", (2022, 2023, 2024)))
    hr_df = (
        hr_df.with_columns(
            pl.col("実施日")
            .str.extract(r"(\d{4}[-/]\d{1,2}[-/]\d{1,2})")
            .str.to_datetime()
            .alias("date")
        )
        .unique()
        .with_columns(_fiscal_year_expr("date", (2022, 2023, 2024, 2025)))
    )
    position_history_df = position_history_df.with_columns(
        (pl.col("year") + 2000).alias("year")
    )

    return (
        udemy_activity_df,
        dx_df,
        hr_df,
        overtime_work_by_month_df,
        position_history_df,
    )


def process_data(
    df: pl.DataFrame,
    udemy_df: pl.DataFrame,
    dx_df: pl.DataFrame,
    hr_df: pl.DataFrame,
    overtime_df: pl.DataFrame,
    position_df: pl.DataFrame,
) -> pl.DataFrame:
    """Build one text prompt per employee and attach it to every row of ``df``.

    For each of the years 2022-2024 the prompt lists the position, rounded
    overtime statistics, and per-category summaries of DX, HR and video
    training. A year without a position row is rendered as "情報なし" only.

    Args:
        df: Rows to score; needs "社員番号", "category", "target" and "fold".
        udemy_df: Udemy activity with a ``year`` column.
        dx_df: DX training with a ``year`` column.
        hr_df: HR training with a ``year`` column.
        overtime_df: Monthly overtime with ``date`` and ``hours`` columns.
        position_df: Position history with ``year``, "勤務区分" and "役職".

    Returns:
        Frame with columns ``employee_id``, ``category``, ``text``, ``labels``
        and ``fold``; ``text`` is the category followed by the employee prompt.

    Raises:
        ValueError: If an employee has more than one position row in a year.
    """
    prompts = {}
    employee_ids = df.get_column("社員番号").unique(maintain_order=True).to_list()
    for employee_id in tqdm(employee_ids, desc="Processing Prompts"):
        is_employee = pl.col("社員番号") == employee_id
        udemy_activity = udemy_df.filter(is_employee)
        dx = dx_df.filter(is_employee)
        hr = hr_df.filter(is_employee)
        overtime = overtime_df.filter(is_employee)
        position = position_df.filter(is_employee)

        prompt = []
        for year in (2022, 2023, 2024):
            prompt.append(f"{year}年")

            # position
            position_year = position.filter(pl.col("year") == year)
            if position_year.is_empty():
                prompt.append("情報なし")
                prompt.append("")
                continue

            # Explicit raise instead of assert so the check survives `python -O`.
            if len(position_year) != 1:
                raise ValueError("Multiple positions found for the same year")

            prompt.append(
                f"{position_year.get_column('勤務区分')[0]}({position_year.get_column('役職')[0]})"
            )

            # overtime
            # NOTE(review): this filters on the calendar year of `date`, while
            # the other tables use the April-based fiscal `year` column.
            overtime_hours = overtime.filter(
                pl.col("date").dt.year() == year
            ).get_column("hours")
            avg_hours = overtime_hours.mean()
            if avg_hours is None:
                # No (non-null) overtime rows for this year: mean/max/min are
                # None and the rounding below would raise TypeError.
                prompt.append("残業時間: 情報なし")
            else:
                # Round each statistic to the nearest 5 hours.
                avg_hours_5 = round(avg_hours / 5) * 5
                max_hours = round(overtime_hours.max() / 5) * 5
                min_hours = round(overtime_hours.min() / 5) * 5
                prompt.append(f"平均残業時間: 約{avg_hours_5}時間")
                prompt.append(f"最大残業時間: 約{max_hours}時間")
                prompt.append(f"最小残業時間: 約{min_hours}時間")

            # dx: first 5 categories in descending name order, with counts
            prompt.append("---")
            prompt.append("DX研修")
            dx_year = dx.filter(pl.col("year") == year)
            if not dx_year.is_empty():
                dx_year_head = (
                    dx_year.group_by("研修カテゴリ")
                    .len()
                    .sort("研修カテゴリ", descending=True)
                    .head(5)
                    .select(
                        (
                            pl.col("研修カテゴリ").str.replace_all("_", " ")
                            + ": "
                            + pl.col("len").cast(pl.Utf8)
                            + "回"
                        ).alias("dx_prompt")
                    )
                    .to_series()
                    .to_list()
                )
                prompt.extend(dx_year_head)

            # hr: category names only, most frequent first
            prompt.append("---")
            prompt.append("人事研修")
            hr_year = hr.filter(pl.col("year") == year)
            if not hr_year.is_empty():
                hr_year_head = (
                    hr_year.group_by("カテゴリ")
                    .len()
                    # group_by does not guarantee group order, so categories
                    # with equal counts need a secondary key to keep the prompt
                    # text identical from run to run.
                    .sort(["len", "カテゴリ"], descending=[True, False])
                    .get_column("カテゴリ")
                    .to_list()
                )
                prompt.extend(hr_year_head)

            # udemy_activity: completed items with a known course category
            prompt.append("---")
            prompt.append("動画研修")
            udemy_year = udemy_activity.filter(
                pl.col("year") == year,
                pl.col("マーク済み修了"),
                pl.col("コースカテゴリー").is_not_null(),
            )
            if not udemy_year.is_empty():
                udemy_year_head = (
                    udemy_year.group_by("コースカテゴリー")
                    .len()
                    .sort("コースカテゴリー", descending=True)
                    .head(5)
                    .select(
                        (
                            pl.col("コースカテゴリー")
                            + ": "
                            + pl.col("len").cast(pl.Utf8)
                            + "回"
                        ).alias("udemy_prompt")
                    )
                    .to_series()
                    .to_list()
                )
                prompt.extend(udemy_year_head)

            prompt.append("")

        prompts[employee_id] = "\n".join(prompt).strip()

    df = df.with_columns(
        pl.col("社員番号").alias("employee_id"),
        # Model input: the row's category on the first line, then the prompt.
        (pl.col("category") + "\n" + pl.col("社員番号").replace(prompts)).alias("text"),
        pl.col("target").alias("labels"),
    )

    return df.select("employee_id", "category", "text", "labels", "fold")

In [None]:
# Load data.
DATA_DIR = Path(config.input_dir)
WORK_DIR = Path(config.work_dir)
# WARNING: wipes everything under work_dir from previous runs before starting.
shutil.rmtree(WORK_DIR, ignore_errors=True)
# parents=True so a fresh environment whose parent directories do not exist
# yet does not fail with FileNotFoundError.
WORK_DIR.mkdir(parents=True, exist_ok=True)

train_df = pl.read_csv(DATA_DIR / "train.csv")
dx_df = pl.read_csv(DATA_DIR / "dx.csv", schema=DX_SCHEMA)
hr_df = pl.read_csv(DATA_DIR / "hr.csv", schema=HR_SCHEMA)
overtime_work_by_month_df = pl.read_csv(
    DATA_DIR / "overtime_work_by_month.csv", schema=OVERTIME_WORK_BY_MONTH_SCHEMA
)
udemy_activity_df = pl.read_csv(
    DATA_DIR / "udemy_activity.csv", schema=UDEMY_ACTIVITY_SCHEMA
)
position_history_df = pl.read_csv(DATA_DIR / "position_history.csv")

# Assign cross-validation folds (stratified on target, grouped by employee).
train_df = make_cv(train_df, n_splits=config.n_splits, seed=config.seed)

# Preprocess the auxiliary tables (adds the fiscal `year` columns).
(udemy_activity_df, dx_df, hr_df, overtime_work_by_month_df, position_history_df) = (
    preprocess_data(
        udemy_activity_df, dx_df, hr_df, overtime_work_by_month_df, position_history_df
    )
)

# Build the text prompts and keep a copy on disk for inspection.
train_data = process_data(
    train_df,
    udemy_activity_df,
    dx_df,
    hr_df,
    overtime_work_by_month_df,
    position_history_df,
)
train_data.write_csv(WORK_DIR / "train_data.csv")

In [None]:
# Display the first row of the processed training frame.
train_data.head(1)

In [None]:
# Print the full prompt text of the first training row.
print(train_data['text'][0])

# エンコーディング

In [None]:
# Load the tokenizer of the configured model and save a copy under WORK_DIR;
# the test-preprocessing cell reloads it from that directory.
tokenizer = AutoTokenizer.from_pretrained(config.model, trust_remote_code=True)
tokenizer.save_pretrained(WORK_DIR / "tokenizer")

In [None]:
def encode_prompt(
    batch: dict, tokenizer: PreTrainedTokenizerBase, max_length: int
) -> dict:
    """Tokenize ``batch["text"]`` and carry ``labels`` and ``fold`` through.

    The text is truncated to ``max_length`` and left unpadded.

    Returns:
        The tokenizer's output fields plus the batch's ``labels`` and ``fold``.
    """
    features = dict(
        tokenizer(batch["text"], truncation=True, padding=False, max_length=max_length)
    )
    features["labels"] = batch["labels"]
    features["fold"] = batch["fold"]
    return features

In [None]:
# Convert the polars frame into a Hugging Face Dataset.
train_ds = Dataset.from_polars(train_data)

# Tokenize in batches; the raw "text" column is dropped once encoded.
train_ds = train_ds.map(
    lambda batch: encode_prompt(batch, tokenizer, max_length=config.max_length),
    batched=True,
    remove_columns=["text"],
)

In [None]:
# Display the encoded dataset summary.
train_ds

# Modern Bert

In [None]:
class ModernBertClassifier(ModernBertForSequenceClassification):
    """ModernBERT sequence classifier trained with a plain cross-entropy loss.

    The encoder output is pooled according to ``config.classifier_pooling``
    ("cls" takes the first token, "mean" averages tokens weighted by the
    attention mask), then passed through ``head``, ``drop`` and ``classifier``.
    """

    def __init__(self, config):
        super().__init__(config)

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        sliding_window_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        indices: Optional[torch.Tensor] = None,
        cu_seqlens: Optional[torch.Tensor] = None,
        max_seqlen: Optional[int] = None,
        batch_size: Optional[int] = None,
        seq_len: Optional[int] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        **kwargs,
    ) -> Union[tuple[torch.Tensor], SequenceClassifierOutput]:
        """Run the encoder, pool, classify and optionally compute the loss.

        Returns:
            A ``SequenceClassifierOutput`` when ``return_dict`` resolves to
            True; otherwise ``(loss, logits)`` if ``labels`` is given, else
            ``(logits,)``.
        """
        if return_dict is None:
            return_dict = self.config.use_return_dict
        self._maybe_set_compile()

        encoder_outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            sliding_window_mask=sliding_window_mask,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            indices=indices,
            cu_seqlens=cu_seqlens,
            max_seqlen=max_seqlen,
            batch_size=batch_size,
            seq_len=seq_len,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        token_states = encoder_outputs[0]

        pooling = self.config.classifier_pooling
        if pooling == "cls":
            # Representation of the first token.
            features = token_states[:, 0]
        elif pooling == "mean":
            # Mask-weighted average over the sequence axis.
            weights = attention_mask.unsqueeze(-1)
            features = (token_states * weights).sum(dim=1) / attention_mask.sum(
                dim=1, keepdim=True
            )
        else:
            # Any other setting leaves the token states unpooled.
            features = token_states

        logits = self.classifier(self.drop(self.head(features)))

        loss = None
        if labels is not None:
            loss = torch.nn.functional.cross_entropy(
                logits.view(-1, self.num_labels), labels.view(-1)
            )

        if return_dict:
            return SequenceClassifierOutput(
                loss=loss,
                logits=logits,
                hidden_states=encoder_outputs.hidden_states,
                attentions=encoder_outputs.attentions,
            )
        return (logits,) if loss is None else (loss, logits)


In [None]:
# Training: fit one model per CV fold and collect out-of-fold (OOF) predictions.
oof_predictions = []
for fold in range(config.n_splits):
    print(f"### Fold {fold + 1}/{config.n_splits} ###")

    # Rows whose "fold" equals the current index are held out for validation.
    tr_ds = train_ds.filter(lambda example: example["fold"] != fold)
    val_ds = train_ds.filter(lambda example: example["fold"] == fold)

    # Fresh model per fold, loaded in bfloat16 with a 2-class head.
    model = ModernBertClassifier.from_pretrained(
        config.model,
        classifier_pooling=config.classifier_pooling,
        num_labels=2,
        torch_dtype=torch.bfloat16,
        device_map="auto",
    )

    # Logging, evaluation and checkpointing all happen every config.eval_steps
    # steps. With the config above, one optimizer step sees
    # per_device_train_batch_size * gradient_accumulation_steps = 64 samples per device.
    training_args = TrainingArguments(
        output_dir=WORK_DIR / f"fold{fold + 1}",
        report_to="none",
        max_steps=config.max_steps,
        per_device_train_batch_size=config.per_device_train_batch_size,
        gradient_accumulation_steps=config.gradient_accumulation_steps,
        per_device_eval_batch_size=config.per_device_eval_batch_size,
        do_train=True,
        do_eval=True,
        do_predict=True,
        logging_strategy="steps",
        eval_strategy="steps",
        save_strategy="steps",
        logging_steps=config.eval_steps,
        eval_steps=config.eval_steps,
        save_steps=config.eval_steps,
        save_total_limit=1,
        # Best-checkpoint reloading is disabled, so the model used below is
        # whatever the trainer holds when training stops at max_steps.
        # load_best_model_at_end=True,
        # metric_for_best_model="eval_loss",
        # greater_is_better=False,
        optim=config.optim_type,
        bf16=True,
        learning_rate=config.lr,
        weight_decay=config.weight_decay,
        warmup_steps=config.warmup_steps,
        lr_scheduler_type=config.lr_scheduler_type,
        max_grad_norm=config.max_grad_norm,
        seed=config.seed,
        data_seed=config.seed,
    )
    trainer = Trainer(
        args=training_args,
        model=model,
        train_dataset=tr_ds,
        eval_dataset=val_ds,
        compute_metrics=compute_metrics,
        data_collator=DataCollatorWithPadding(tokenizer),
    )
    trainer.train()

    # Save the model; the test-prediction cell reloads it from this path.
    model_path = WORK_DIR / f"model_fold{fold + 1}"
    trainer.save_model(model_path)

    # Remove intermediate checkpoints to save disk space
    fold_dir = WORK_DIR / f"fold{fold + 1}"
    if fold_dir.exists():
        shutil.rmtree(fold_dir)

    # Class-1 probability for every validation row of this fold.
    probs = (
        torch.from_numpy(trainer.predict(val_ds).predictions)
        .float()
        .softmax(-1)
        .numpy()[:, 1]
    )
    oof_df = val_ds.to_polars().with_columns(
        pl.Series("prediction", probs, dtype=pl.Float32)
    )
    oof_predictions.append(oof_df)

    fold_score = roc_auc_score(
        oof_df.get_column("labels").to_numpy(),
        oof_df.get_column("prediction").to_numpy(),
    )
    print(f"Fold {fold + 1} AUC: {fold_score:.5f}")

# Stack the per-fold validation predictions, save them, and score them as a whole.
oof_prediction_df = pl.concat(oof_predictions, how="vertical").sort(
    "employee_id", "category"
)
oof_prediction_df.select(
    "employee_id", "category", "fold", "labels", "prediction"
).write_csv(WORK_DIR / "oof_predictions.csv")
valid_score = roc_auc_score(
    oof_prediction_df.get_column("labels").to_numpy(),
    oof_prediction_df.get_column("prediction").to_numpy(),
)
print(f"valid_score\t:\t {valid_score:.5f}")

In [None]:
# Test preprocessing: rebuild prompts and token ids for test.csv.
# Paths are re-derived from the config here.
DATA_DIR = Path(config.input_dir)
WORK_DIR = Path(config.work_dir)

# Test rows get placeholder "target" and "fold" columns (-1) because
# process_data selects both; they are removed again after encoding.
test_df = pl.read_csv(DATA_DIR / "test.csv").with_columns(
    pl.lit(-1, dtype=pl.Int64).alias("target"),  # dummy
    pl.lit(-1, dtype=pl.Int64).alias("fold"),  # dummy
)
dx_df = pl.read_csv(DATA_DIR / "dx.csv", schema=DX_SCHEMA)
hr_df = pl.read_csv(DATA_DIR / "hr.csv", schema=HR_SCHEMA)
overtime_work_by_month_df = pl.read_csv(
    DATA_DIR / "overtime_work_by_month.csv", schema=OVERTIME_WORK_BY_MONTH_SCHEMA
)
udemy_activity_df = pl.read_csv(
    DATA_DIR / "udemy_activity.csv", schema=UDEMY_ACTIVITY_SCHEMA
)
position_history_df = pl.read_csv(DATA_DIR / "position_history.csv")

# Same auxiliary-table preprocessing as for training.
(udemy_activity_df, dx_df, hr_df, overtime_work_by_month_df, position_history_df) = (
    preprocess_data(
        udemy_activity_df, dx_df, hr_df, overtime_work_by_month_df, position_history_df
    )
)

# Reload the tokenizer copy that the training notebook saved under WORK_DIR.
tokenizer = AutoTokenizer.from_pretrained(WORK_DIR / "tokenizer")

test_data = process_data(
    test_df,
    udemy_activity_df,
    dx_df,
    hr_df,
    overtime_work_by_month_df,
    position_history_df,
)
test_data.write_csv(WORK_DIR / "test_data.csv")
test_ds = Dataset.from_polars(test_data)
# Tokenize, then drop the placeholder columns so no labels are passed at prediction time.
test_ds = test_ds.map(
    lambda batch: encode_prompt(batch, tokenizer, max_length=config.max_length),
    batched=True,
    remove_columns=["text"],
).remove_columns(["fold", "labels"])


In [None]:
# Test prediction: score the test set with each fold's saved model, then average.
test_predictions = []
for fold in range(config.n_splits):
    print(f"### Predicting Fold {fold + 1}/{config.n_splits} ###")

    model = ModernBertClassifier.from_pretrained(
        WORK_DIR / f"model_fold{fold + 1}",
        classifier_pooling=config.classifier_pooling,
        num_labels=2,
        torch_dtype=torch.bfloat16,
        device_map="auto",
    )
    # Inference-only arguments.
    # NOTE(review): no output_dir is given, so the library default applies — confirm
    # this is accepted by the installed transformers version.
    predict_args = TrainingArguments(
        report_to="none",
        per_device_eval_batch_size=config.per_device_eval_batch_size,
        bf16=True,
        optim=config.optim_type,
        seed=config.seed,
        data_seed=config.seed,
        do_train=False,
        do_eval=False,
        do_predict=True,
    )
    predict_trainer = Trainer(
        args=predict_args, model=model, data_collator=DataCollatorWithPadding(tokenizer)
    )

    # Class-1 probability for every test row.
    probs = (
        torch.from_numpy(predict_trainer.predict(test_ds).predictions)
        .float()
        .softmax(-1)
        .numpy()[:, 1]
    )
    pred_df = test_ds.to_polars().with_columns(
        pl.lit(fold, dtype=pl.Int64).alias("fold"),
        pl.Series("prediction", probs, dtype=pl.Float32),
    )
    test_predictions.append(pred_df)

# Long table of per-fold test predictions (one row per test row and fold).
test_prediction_df = pl.concat(test_predictions, how="vertical").select(
    "employee_id", "category", "fold", "prediction"
)

# Average the folds per (employee_id, category) and keep a single "target" column.
# NOTE(review): rows are ordered by the sort on (employee_id, category), not by the
# original test.csv order, and duplicate keys would be merged — verify that this
# matches the row order the submission format expects.
submission_df = (
    test_prediction_df.group_by("employee_id", "category")
    .agg(pl.col("prediction").mean())
    .select("employee_id", "category", "prediction")
    .sort("employee_id", "category")
    .select(pl.col("prediction").alias("target"))
)

test_prediction_df.write_csv(WORK_DIR / "test_predictions.csv")
submission_df.write_csv(WORK_DIR / f"{config.exp_name}.csv")