In [None]:
from dataclasses import dataclass
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    PreTrainedTokenizerBase, 
    EvalPrediction,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
)

from sklearn.model_selection import  GroupKFold
from sklearn.metrics import accuracy_score, roc_auc_score

In [None]:
@dataclass
class Config:
    """Run settings: checkpoint, tokenization length and optimisation knobs.

    Every setting is an annotated dataclass field, so each one can be
    overridden per instance (e.g. ``Config(seed=0)``) and takes part in
    ``repr`` / ``==``. Defaults are still readable on the class itself
    (e.g. ``Config.checkpoint``).
    """

    output_dir: str = "output"
    # NOTE(review): this label says 70m while `checkpoint` below points at the
    # 30m weights — confirm which one is intended.
    model_name: str = 'modernbert-ja-70m'
    checkpoint: str = "sbintuitions/modernbert-ja-30m"
    max_length: int = 1024
    optim_type: str = "adamw_torch"
    per_device_train_batch_size: int = 4
    gradient_accumulation_steps: int = 4
    per_device_eval_batch_size: int = 8
    n_epochs: int = 10
    lr: float = 2e-4
    warmup_steps: int = 20
    # These three were un-annotated, which made them plain class attributes
    # instead of dataclass fields. They are declared after the original fields
    # so positional construction keeps its previous argument order.
    ver: int = 0
    n_splits: int = 5
    seed: int = 2025


In [None]:
# Build the run configuration and prefer the GPU whenever CUDA is available.
config = Config()
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
# Load the tokenizer for the configured checkpoint. The name is read from the
# `config` instance (not the `Config` class) so a per-run override is honoured.
tokenizer = AutoTokenizer.from_pretrained(config.checkpoint, trust_remote_code=False, use_fast=False)

In [None]:
class CustomTokenizer:
    """Batch encoder for labelled data: tokenizes ``prompt`` and keeps ``labels``.

    Instances are callables taking a batch dict, which is the shape expected
    by a batched ``Dataset.map``.
    """

    def __init__(
        self,
        tokenizer: PreTrainedTokenizerBase,
        max_length: int
    ) -> None:
        """Store the tokenizer and the truncation length.

        Args:
            tokenizer: Callable tokenizer applied to the list of prompts.
            max_length: Maximum sequence length; longer inputs are truncated.
        """
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __call__(self, batch: dict) -> dict:
        """Tokenize ``batch["prompt"]`` and attach ``batch["labels"]``.

        Args:
            batch: Mapping that must contain the keys ``"prompt"`` and
                ``"labels"``.

        Returns:
            A dict with every key produced by the tokenizer plus ``"labels"``.

        Raises:
            KeyError: If ``"prompt"`` or ``"labels"`` is missing from ``batch``.
        """
        tokenized = self.tokenizer(batch["prompt"], max_length=self.max_length, truncation=True)
        # Labels are passed through untouched, alongside the tokenizer output.
        return {**tokenized,
                "labels": batch["labels"]}

In [None]:
# Encoder for labelled batches: tokenizes `prompt` and carries `labels` through.
encode = CustomTokenizer(tokenizer=tokenizer, max_length=config.max_length)

In [None]:
class TestCustomTokenizer:
    """Batch encoder for unlabelled data: tokenizes ``prompt`` and nothing else."""

    def __init__(
        self,
        tokenizer: PreTrainedTokenizerBase,
        max_length: int,
    ) -> None:
        """Remember the tokenizer and the maximum (truncated) sequence length."""
        self.max_length = max_length
        self.tokenizer = tokenizer

    def __call__(self, batch: dict) -> dict:
        """Return the tokenizer output for ``batch["prompt"]`` as a plain dict."""
        prompts = batch["prompt"]
        encoded = self.tokenizer(prompts, max_length=self.max_length, truncation=True)
        # Copy into a fresh built-in dict; no label column is added here.
        return dict(encoded)

In [None]:
# Label-free encoder, applied to the test dataset further down.
test_encode = TestCustomTokenizer(tokenizer=tokenizer, max_length=config.max_length)

In [None]:
def compute_metrics(eval_preds: EvalPrediction) -> dict:
    """Compute ROC-AUC and accuracy from raw classification logits.

    Args:
        eval_preds: Object exposing ``predictions`` (logits) and ``label_ids``.
            ``predictions`` may be a tuple/list of arrays when the model
            returns extra outputs; the first element is then assumed to hold
            the logits.

    Returns:
        ``{"auc": ..., "acc": ...}`` where ``auc`` is scored on the softmax
        probability of class index 1 and ``acc`` thresholds that probability
        at 0.5.

    Raises:
        ValueError: Propagated from ``roc_auc_score``, e.g. when ``label_ids``
            contains a single class only.
    """
    logits = eval_preds.predictions
    if isinstance(logits, (tuple, list)):
        # Extra model outputs get packed alongside the logits; keep the logits.
        logits = logits[0]
    labels = eval_preds.label_ids
    # Softmax over the class axis; column 1 is the positive-class probability.
    probs = torch.from_numpy(np.asarray(logits)).float().softmax(-1).numpy()[:, 1]
    auc = roc_auc_score(labels, probs)
    # Hand integer class ids (not a boolean mask) to accuracy_score.
    acc = accuracy_score(y_true=labels, y_pred=(probs > 0.5).astype(int))
    return {"auc": auc, "acc": acc}

In [None]:
def read_data(data_dir="../../data/raw/input"):
    """Load the train, test and sample-submission CSV files.

    Args:
        data_dir: Directory containing ``train.csv``, ``test.csv`` and
            ``sample_submission.csv``. Defaults to the location that used to
            be hard-coded, so ``read_data()`` behaves as before.

    Returns:
        Tuple ``(train_df, test_df, submission_df)`` of pandas DataFrames.

    Raises:
        FileNotFoundError: Propagated from pandas if any of the files is missing.
    """
    # Function-scope import so this cell does not depend on the import cell.
    from pathlib import Path

    root = Path(data_dir)
    # Basic data
    train_df = pd.read_csv(root / "train.csv")
    test_df = pd.read_csv(root / "test.csv")
    submission_df = pd.read_csv(root / "sample_submission.csv")
    return train_df, test_df, submission_df

# Load the three CSV tables.
train_df, test_df, submission_df = read_data()

# Keep only the `prompt` column of the test table and tokenize it in batches.
test_ds = Dataset.from_pandas(test_df[["prompt"]]).map(test_encode, batched=True)
