In [8]:
!pip install lightgbm==4.4.0



In [15]:
from sklearn.model_selection import train_test_split, StratifiedKFold
import numpy as np
import lightgbm as lgb
import polars as pl
from datasets import load_from_disk

In [12]:
data = load_from_disk("../../trained_models/e027-make-valid/valid_dataset").to_polars()
pred = np.load("../../trained_models/e027-make-valid/valid_prediction.npy")

In [21]:
all_data = pl.concat([data.select(pl.exclude("valid_pred")), pl.DataFrame(pred, schema=["winner_a", "winner_b", "tie"])], how="horizontal")

In [36]:
all_data

id,model_a,model_b,prompt,response_a,response_b,winner_model_a,winner_model_b,winner_tie,fold,input_ids,attention_mask,labels,winner_a,winner_b,tie,turn_num,prompt_word_num,response_a_word_num,response_b_word_num
i64,str,str,str,str,str,i64,i64,i64,i64,list[i32],list[i8],i64,f32,f32,f32,i32,i32,i32,i32
2653302469,"""vicuna-7b""","""llama-2-13b-chat""","""[""can you spot the clever pun …","""[""The nickname \""typlo\"" conta…","""[""Sure, I'd be happy to help! …",0,0,1,2,"[2, 235322, … 1]","[1, 1, … 1]",2,0.209556,0.206912,0.583532,1,52,334,307
3366614613,"""claude-2.1""","""mistral-medium""","""[""What other features can we a…","""[""Here are some additional fea…","""[""Here are some additional fea…",0,1,0,2,"[2, 235322, … 1]","[1, 1, … 1]",1,0.311937,0.23917,0.448893,9,17995,11321,21616
4229518975,"""vicuna-33b""","""llama-2-70b-chat""","""[""Sally (a girl) has 6 brother…","""[""Sally has 3 sisters.""]""","""[""Each of Sally's brothers has…",0,0,1,2,"[2, 235322, … 1]","[1, 1, … 1]",2,0.138966,0.068726,0.792308,1,92,20,117
95091053,"""claude-2.1""","""yi-34b-chat""","""[""hello how are you""]""","""[""I'm doing well, thanks for a…","""[""Thank you for asking! As an …",1,0,0,2,"[2, 235322, … 1]","[1, 1, … 1]",0,0.569409,0.252705,0.177887,1,17,34,155
437433165,"""wizardlm-13b""","""koala-13b""","""[""Explain the phrase 'kill two…","""[""The phrase \""kill two birds …","""[""\""Kill two birds with one st…",0,0,1,2,"[2, 235322, … 1]","[1, 1, … 1]",2,0.364599,0.281328,0.354073,1,74,481,766
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
3943120934,"""openchat-3.5""","""mistral-medium""","""[""Why has France greatly de-in…","""[""France has not greatly de-in…","""[""It's important to note that …",0,1,0,2,"[2, 235322, … 1]","[1, 1, … 1]",1,0.131563,0.62681,0.241627,1,71,1846,1947
2289729730,"""vicuna-33b""","""guanaco-33b""","""[""How many basic principles ac…","""[""Saltzer and Schroeder introd…","""[""Saltzer and Schroeder's basi…",0,1,0,2,"[2, 235322, … 1]","[1, 1, … 1]",1,0.182021,0.421766,0.396213,1,71,1310,1591
3023733933,"""mixtral-8x7b-instruct-v0.1""","""gpt-3.5-turbo-1106""","""[""Write a paragraph about eati…","""[""In the most resplendent and …","""[""Verily, as the sun ascended …",1,0,0,2,"[2, 235322, … 1]","[1, 1, … 1]",0,0.334755,0.438739,0.226506,1,129,1066,751
2886153616,"""llama-2-70b-chat""","""vicuna-33b""","""[""what is the point of aerial …","""[""Aerial marker balls, also kn…","""[""Aerial marker balls, also kn…",1,0,0,2,"[2, 235322, … 1]","[1, 1, … 1]",0,0.503419,0.140822,0.355759,2,133,4860,2870


In [38]:
import itertools

# feature engineering
def to_list(text: str) -> list:
    return eval(text, {"null": ""})

def flatten(text_list: list) :
    return list(itertools.chain.from_iterable(text_list))

train = all_data.with_columns(
    pl.col("prompt").map_elements(lambda x : len(to_list(x)), return_dtype=pl.Int32).alias("turn_num"),
    pl.col("prompt").map_elements(lambda x : len(flatten(to_list(x))), return_dtype=pl.Int32).alias("prompt_word_num"),
    pl.col("response_a").map_elements(lambda x : len(flatten(to_list(x))), return_dtype=pl.Int32).alias("response_a_word_num"),
    pl.col("response_b").map_elements(lambda x : len(flatten(to_list(x))), return_dtype=pl.Int32).alias("response_b_word_num"),
)

In [44]:
use_col = [
    "turn_num",
    "prompt_word_num",
    "response_a_word_num",
    "response_b_word_num"
]

TARGET_COL = "labels"
N_FOLD = 3
SEED = 42
NUM_LABEL = 3

In [45]:
# Foldの作成
fold_arr = np.zeros(train.height)
fold = StratifiedKFold(n_splits=N_FOLD, shuffle=True, random_state=SEED)

for idx, (_, val_idx) in enumerate(fold.split(train, train.get_column(TARGET_COL))):
    fold_arr[val_idx] = idx

train = train.with_columns(
    pl.Series(fold_arr).cast(pl.Int64).alias("fold")
)

In [46]:
# LightGBMで用いるパラメータを指定
params = {
    "objective": "multiclass",
    'num_class': NUM_LABEL,
    "max_depth": -1,
    "min_data_in_leaf": 10,  # 1つの葉に入る最小のデータ数
    "num_leaves": 24,  # 2**max_depthより少し小さめにすると過学習を防げる
    "learning_rate": 0.01,  # 1回のiterationで学習を進める割合、大きいと学習が早く終わる。小さいと学習は長いが高精度になりやすい。
    "bagging_freq": 5,  # 指定した回数ごとにbaggingを行う
    "feature_fraction": 0.7,  # 1回のiterationで利用する特徴量(列方向)の割合
    "bagging_fraction": 0.6,  # 1回のiterationで利用するデータ(行方向)の割合
    "verbose": -1,  # 出力するログレベルの変更、-1はFatalなログのみを出力
    "seed": SEED,  # ランダムシードの固定
    "lambda_l1": 0.4,  # 正則化のためのパラメータ
    "lambda_l2": 0.4,  # 正則化のためのパラメータ
}

In [47]:
# テストデータに対する推論、特徴量重要度(後述)を計算するために、モデルを保存するobjectを作成
models = []

# ① Cross Validationによる学習の実施
for fold in range(N_FOLD):
    print(f"Start fold {fold}")

    # ② foldごとにtrainとvalidに分ける
    train_fold = train.filter(pl.col("fold") != fold)
    valid_fold = train.filter(pl.col("fold") == fold)

    # ③ X(説明変数)とy(目的変数)に分ける
    X_train = train_fold.select(use_col)
    X_valid = valid_fold.select(use_col)
    y_train = train_fold.select(TARGET_COL)
    y_valid = valid_fold.select(TARGET_COL)

    # ④ LightGBMが認識可能な形にデータセットを変換
    # polars.DataFrame から pandas.DataFrame への変更を行っている
    lgb_train = lgb.Dataset(X_train.to_pandas(), y_train.to_pandas())
    lgb_eval = lgb.Dataset(
        X_valid.to_pandas(), y_valid.to_pandas(), reference=lgb_train
    )

    # ⑤ モデルの学習
    model = lgb.train(
        params,
        lgb_train,
        num_boost_round=10000,  # 学習のiteration回数
        valid_sets=[lgb_train, lgb_eval],
        callbacks=[
            lgb.early_stopping(
                stopping_rounds=100
            ),  # Early stopingの回数、binary_loglossが改善しないiterationが100回続いたら学習を止める
            lgb.log_evaluation(100),  # 指定したiteration回数ごとにlogを出力する
        ],
    )

    # ⑥ モデルを保存
    models.append([fold, model])

Start fold 0
Training until validation scores don't improve for 100 rounds
[100]	training's multi_logloss: 1.05216	valid_1's multi_logloss: 1.06596
[200]	training's multi_logloss: 1.03119	valid_1's multi_logloss: 1.05756
[300]	training's multi_logloss: 1.01683	valid_1's multi_logloss: 1.05462
[400]	training's multi_logloss: 1.00522	valid_1's multi_logloss: 1.05453
Early stopping, best iteration is:
[335]	training's multi_logloss: 1.01255	valid_1's multi_logloss: 1.0542
Start fold 1
Training until validation scores don't improve for 100 rounds
[100]	training's multi_logloss: 1.0484	valid_1's multi_logloss: 1.06901
[200]	training's multi_logloss: 1.02631	valid_1's multi_logloss: 1.06321
[300]	training's multi_logloss: 1.0114	valid_1's multi_logloss: 1.0625
[400]	training's multi_logloss: 0.999095	valid_1's multi_logloss: 1.06345
Early stopping, best iteration is:
[304]	training's multi_logloss: 1.01091	valid_1's multi_logloss: 1.06245
Start fold 2
Training until validation scores don't i