In [1]:
# Data directory, relative to the notebook's working directory; the cells
# below read train.csv from it and write the pickled label mapping into it.
DATA_PATH = "../../data"

In [3]:
import polars as pl

# Build the training frame:
#   1. decode the JSON-encoded conversation columns into lists of strings,
#   2. record how many entries each conversation list has,
#   3. keep only the last entry of each list,
#   4. derive a single integer class label from the one-hot winner columns,
#   5. drop the raw list columns.
# Native `list` expressions are used instead of `map_elements` lambdas so the
# work stays inside Polars rather than calling back into Python per row.
train = (
    pl.read_csv(f"{DATA_PATH}/train.csv")
    .with_columns(
        # An explicit dtype keeps the schema deterministic instead of relying
        # on inference; the elements are later extracted as strings.
        pl.col("prompt", "response_a", "response_b").str.json_decode(
            dtype=pl.List(pl.String)
        ),
    )
    .with_columns(  # add length information
        # `list.len()` yields an unsigned integer; cast so the column dtype
        # stays Int64 as before.
        pl.col("prompt").list.len().cast(pl.Int64).alias("len_prompt"),
        pl.col("response_a").list.len().cast(pl.Int64).alias("len_response_a"),
        pl.col("response_b").list.len().cast(pl.Int64).alias("len_response_b"),
    )
    .with_columns(  # keep only the last prompt / response of each conversation
        # NOTE: an empty list produces null here rather than raising.
        pl.col("prompt").list.last().alias("last_prompt"),
        pl.col("response_a").list.last().alias("last_response_a"),
        pl.col("response_b").list.last().alias("last_response_b"),
    )
    .with_columns(  # replace a null last response with "" (about 60 rows)
        pl.col("last_response_a").fill_null(""),
        pl.col("last_response_b").fill_null(""),
    )
    .with_columns(  # attach the label: 0 = model A wins, 1 = model B wins, 2 = tie
        # Rows matching none of the branches are left as null.
        pl.when(pl.col("winner_model_a") == 1)
        .then(0)
        .when(pl.col("winner_model_b") == 1)
        .then(1)
        .when(pl.col("winner_tie") == 1)
        .then(2)
        .alias("label"),
    )
    .select(  # drop the original prompt / response list columns
        pl.exclude(["prompt", "response_a", "response_b"])
    )
)

In [6]:
import numpy as np
from sklearn.model_selection import StratifiedKFold

# Assign every row to one of three validation folds, stratified on `label`.
sgkf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
fold_arr = np.zeros(train.height)

# Only the validation indices of each split are needed: every row appears in
# exactly one validation set, so its position there becomes its fold id.
for fold_no, split in enumerate(sgkf.split(train, train["label"])):
    fold_arr[split[1]] = fold_no

# `fold_arr` is a float array, so cast to integers when storing it.
train = train.with_columns(pl.Series("fold", fold_arr).cast(pl.Int64))

In [8]:
# Build an id -> label lookup so it can be saved to disk.
id_label_dict = {
    row_id: row_label for row_id, row_label in zip(train["id"], train["label"])
}

In [9]:
import pickle
# Persist the id -> label mapping as a pickle file under DATA_PATH.
# NOTE(review): the file name mentions "stratified_fold", but only the
# id -> label dict is written here (no fold assignment) — confirm intent.
with open(f"{DATA_PATH}/label_stratified_fold.pkl", "wb") as f:
    # The original line was an unclosed call with an undefined name
    # (`fid_label_dict`); the file handle `f` is the intended second argument.
    pickle.dump(id_label_dict, f)

# Display the mapping (the recorded output below is this dict's repr).
id_label_dict

{30192: 0,
 53567: 1,
 65089: 2,
 96401: 0,
 198779: 1,
 292873: 1,
 313413: 0,
 370945: 1,
 441448: 1,
 481524: 1,
 497862: 1,
 587904: 0,
 604575: 1,
 738614: 0,
 862324: 2,
 863398: 1,
 887722: 0,
 914644: 1,
 933555: 2,
 1120158: 1,
 1256092: 2,
 1404102: 1,
 1440765: 2,
 1458108: 2,
 1491225: 2,
 1594211: 2,
 1639617: 2,
 1744093: 0,
 1813737: 1,
 1827787: 2,
 1842252: 0,
 2051408: 2,
 2154496: 1,
 2298796: 0,
 2388511: 0,
 2802516: 1,
 2857714: 0,
 2912862: 1,
 2944182: 2,
 3254113: 2,
 3258431: 0,
 3259481: 2,
 3373963: 0,
 3445782: 1,
 3475655: 1,
 3499263: 0,
 3503031: 1,
 3504181: 2,
 3519254: 2,
 3567106: 1,
 3578663: 2,
 3590999: 2,
 3622781: 0,
 3643104: 2,
 3710170: 2,
 3760933: 0,
 3773792: 2,
 3777134: 2,
 3994811: 2,
 3995635: 2,
 4186011: 1,
 4349090: 0,
 4356730: 0,
 4486480: 2,
 4510489: 2,
 4587071: 1,
 4615863: 1,
 4683272: 2,
 4790276: 2,
 4961077: 0,
 4970917: 0,
 4990514: 1,
 5061737: 2,
 5069186: 2,
 5166668: 1,
 5187535: 2,
 5188727: 0,
 5378146: 1,
 5498037: