# 目的
DeBERTa各モデルの予測値の重み付き平均アンサンブルによる2ndStage(重みはOOF予測のROC AUCで探索する)

In [1]:
# --- experiment identity ---
COMPETITION_NAME = "atmacup17"
EXP_NAME = "e113-weight-ave"

# --- locations (relative here; resolved per environment further down) ---
DATA_PATH = "data"
ENV_PATH = "env_file"
MODEL_OUTPUT_PATH = f"trained_models/{EXP_NAME}"

# --- label column of the training data ---
TARGET_COL = "Recommended IND"

# --- experiment switches ---
# DEBUG: when True, the submission cell keeps only the first 100 rows.
DEBUG = False
# NOTE(review): TRAINING, UPLOAD_DATA_TO_S3 and WANDB are not referenced by
# any visible cell of this notebook.
TRAINING = True
UPLOAD_DATA_TO_S3 = True
# UPLOAD_DATA_TO_KAGGLE = True
WANDB = True

In [2]:
import polars as pl
import numpy as np

from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import roc_auc_score

import lightgbm as lgb

import seaborn as sns

In [3]:
# Show the installed polars version alongside the results.
pl.__version__

'1.0.0'

In [4]:
# Show the installed LightGBM version alongside the results.
lgb.__version__

'4.5.0'

In [5]:
def resolve_path(base_path: str) -> "tuple[str, str]":
    """Map a project-relative path to the location valid in the current environment.

    The environment is detected from the current working directory, which is
    also printed together with a short environment banner.

    Args:
        base_path: Path relative to the competition root (e.g. ``"data"``).

    Returns:
        A ``(environment_name, resolved_path)`` pair. Every branch returns this
        two-element tuple because all callers unpack exactly two values.

    Raises:
        Exception: If the working directory matches no known environment.
    """
    import os

    cwd = os.getcwd()
    print(cwd)

    if cwd == "/notebooks":
        print("Jupyter Kernel By VSCode!")
        return "kernel", f"/notebooks/{COMPETITION_NAME}/{base_path}"

    if cwd == f"/notebooks/{COMPETITION_NAME}":
        print("nohup!")
        # Previously only `base_path` was returned here, so the two-value
        # unpacking at the call sites failed in this environment.
        return "nohup", base_path

    if cwd == f"/notebooks/{COMPETITION_NAME}/{COMPETITION_NAME}/exp":
        print("Jupyter Lab!")
        # NOTE(review): the label "nohup" does not match the banner printed
        # above; it is kept unchanged in case other code compares against it.
        return "nohup", f"../../{base_path}"

    if cwd == "/content":
        print("Google Colab!")
        return "colab", f"/content/drive/MyDrive/Kaggle/{COMPETITION_NAME}/{base_path}"

    if cwd.startswith("/home/shinichiro.saito"):
        print("GCP!")
        return "GCP", f"/home/shinichiro.saito/{COMPETITION_NAME}/{base_path}"

    raise Exception("Unknown environment")


# Resolve each configured relative location for the detected environment.
# NOTE: the path constants are rebound to their resolved values, so re-running
# this cell feeds an already-resolved path back into resolve_path.
ENV_NAME, DATA_PATH = resolve_path(DATA_PATH)
print(DATA_PATH)
# Only the resolved path is needed from here on; the environment name is discarded.
_, MODEL_OUTPUT_PATH = resolve_path(MODEL_OUTPUT_PATH)
print(MODEL_OUTPUT_PATH)
_, ENV_PATH = resolve_path(ENV_PATH)

/home/shinichiro.saito/atmacup17/exp
GCP!
/home/shinichiro.saito/atmacup17/data
/home/shinichiro.saito/atmacup17/exp
GCP!
/home/shinichiro.saito/atmacup17/trained_models/e113-weight-ave
/home/shinichiro.saito/atmacup17/exp
GCP!


In [6]:
import json

# Read the fold mapping stored with the competition data; the data-loading
# cell uses it to derive the `fold` column from `index`.
with open(f"{DATA_PATH}/rec_stratified_fold.json") as fold_file:
    label_stratified_fold = json.loads(fold_file.read())

In [7]:
clothing_master = pl.read_csv(f"{DATA_PATH}/clothing_master.csv")

# Train rows: attach the clothing master columns, derive `fold` from `index`
# through the fold mapping, and cast the clothing id to a string column.
train = (
    pl.read_csv(f"{DATA_PATH}/train_with_index.csv")
    .join(clothing_master, on="Clothing ID", how="left")
    .with_columns(
        pl.col("index").replace(label_stratified_fold).alias("fold"),
        pl.col("Clothing ID").cast(pl.String),
    )
)

# Test rows get the same join and cast, but no fold column.
test = (
    pl.read_csv(f"{DATA_PATH}/test.csv")
    .join(clothing_master, on="Clothing ID", how="left")
    .with_columns(pl.col("Clothing ID").cast(pl.String))
)

In [42]:
# Label distribution among the rows assigned to fold 2.
train.filter(pl.col("fold") == 2).get_column("Recommended IND").value_counts()

Recommended IND,count
i64,u32
1,2734
0,599


# feature engineering

## DeBERTa xsmall oof pred

In [9]:
# Out-of-fold predictions; the `valid_pred` column is renamed to `xsmall_pred`.
xsmall_oof = pl.read_csv("../trained_models/e004-0-1-2-ens/xsmall-fold-fix-oof.csv")
xsmall_oof = xsmall_oof.rename({"valid_pred": "xsmall_pred"})

# Predictions for the test set, read from the experiment's submission file.
xsmall_pred = pl.read_csv(
    "../trained_models/e004-0-1-2-ens/e004-0-1-2-ens-cv0.9590.csv"
)

In [10]:
# Train: look up each row's out-of-fold prediction by `index`.
train = train.join(xsmall_oof.select("index", "xsmall_pred"), on="index", how="left")

# Test: the prediction frame is attached as a column as-is.
# NOTE(review): presumably the file holds one prediction column in the same
# row order as `test` — confirm.
test = test.with_columns(pl.Series(xsmall_pred).alias("xsmall_pred"))

## DeBERTa (all_info auxiliary loss) large oof pred

In [11]:
# Validation-set predictions, one file per fold.
large_aux_oof_fold0, large_aux_oof_fold1, large_aux_oof_fold2 = (
    pl.read_csv(oof_csv)
    for oof_csv in (
        "../trained_models/e032-lrg-fold0/valid_dataset_e032-lrg-fold0.csv",
        "../trained_models/e033-lrg-fold1/valid_dataset_e033-lrg-fold1.csv",
        "../trained_models/e034-lrg-fold2/valid_dataset_e034-lrg-fold2.csv",
    )
)

# Test-set predictions, one submission file per fold.
large_aux_pred_fold0, large_aux_pred_fold1, large_aux_pred_fold2 = (
    pl.read_csv(submission_csv)
    for submission_csv in (
        "../trained_models/e032-lrg-fold0/submission_e032-lrg-fold0_cv0.9720.csv",
        "../trained_models/e033-lrg-fold1/submission_e033-lrg-fold1_cv0.9715.csv",
        "../trained_models/e034-lrg-fold2/submission_e034-lrg-fold2_cv0.9669.csv",
    )
)

# Stack the per-fold validation frames into a single frame.
large_aux_oof = pl.concat(
    [large_aux_oof_fold0, large_aux_oof_fold1, large_aux_oof_fold2],
    how="vertical",
)

# Element-wise mean of the three per-fold test prediction frames.
large_aux_pred = (
    large_aux_pred_fold0 + large_aux_pred_fold1 + large_aux_pred_fold2
) / 3

In [12]:
# Train: join the validation predictions on `index`, then give the joined
# `valid_pred` column its model-specific name.
train = train.join(
    large_aux_oof.select("index", "valid_pred"), on="index", how="left"
)
train = train.rename({"valid_pred": "large_aux_pred"})

# Test: attach the fold-averaged prediction frame as a column.
# NOTE(review): presumably one prediction column, row-aligned with `test` — confirm.
test = test.with_columns(pl.Series(large_aux_pred).alias("large_aux_pred"))

## DeBERTa (all info) large oof pred

In [13]:
# Out-of-fold predictions and test predictions of the e037 ensemble.
large_all_info_oof = pl.read_csv("../trained_models/e037-ens/large-oof.csv")
large_all_info_pred = pl.read_csv("../trained_models/e037-ens/ens37-cv0.9696.csv")

In [14]:
# Train: join the out-of-fold predictions on `index`, then give the joined
# `valid_pred` column its model-specific name.
train = train.join(
    large_all_info_oof.select("index", "valid_pred"), on="index", how="left"
)
train = train.rename({"valid_pred": "large_all_info_pred"})

# Test: attach the prediction frame as a column.
# NOTE(review): presumably one prediction column, row-aligned with `test` — confirm.
test = test.with_columns(pl.Series(large_all_info_pred).alias("large_all_info_pred"))

# Average Ensemble

index,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name,fold,xsmall_pred,large_aux_pred,large_all_info_pred
i64,str,i64,str,str,i64,i64,i64,str,str,str,i64,f64,f64,f64
0,"""0""",25,"""3-season skirt!""","""Adorable, well-made skirt! lin…",5,1,4,"""General""","""Bottoms""","""Skirts""",2,0.9985335,0.996858,0.9991135
1,"""0""",39,"""Very cute""","""Love the asymmetrical hem. wai…",5,1,0,"""General""","""Bottoms""","""Skirts""",2,0.9983802,0.9859634,0.99898
2,"""0""",42,"""Beautiful! fruns small for typ…","""I love this skirt! i wasn't su…",5,1,5,"""General""","""Bottoms""","""Skirts""",1,0.998649,0.99576,0.9971439
3,"""0""",45,,"""I was really pleased with this…",5,1,9,"""General""","""Bottoms""","""Skirts""",0,0.998242,0.9971384,0.9974885
4,"""0""",57,"""Unique, pretty asymmetric skir…","""I saw this skirt in retailer s…",5,1,1,"""General""","""Bottoms""","""Skirts""",1,0.996746,0.9920002,0.9959849
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
9995,"""232""",57,"""Runs big on top""",,3,1,5,"""General""","""Dresses""","""Dresses""",1,0.980506,0.924959,0.97151
9996,"""232""",58,,"""I loved the dress, but just no…",1,1,5,"""General""","""Dresses""","""Dresses""",0,0.9908035,0.941761,0.9614253
9997,"""232""",60,"""I was really disappointed""","""I was really hoping this dress…",2,0,7,"""General""","""Dresses""","""Dresses""",1,0.032651,0.054349,0.030647
9998,"""232""",62,"""Too heavy""","""The design is beautiful but it…",2,0,0,"""General""","""Dresses""","""Dresses""",1,0.033243,0.059921,0.053305


In [29]:
def get_optimized_weight(x1, x2, x3, y1, weights):
    """Score a weighted blend of three prediction vectors with ROC AUC.

    Despite its name, this function does not search for weights; it only
    evaluates the blend defined by the given ones.

    Args:
        x1, x2, x3: Prediction vectors to blend.
        y1: Labels the blend is scored against.
        weights: Indexable holding the multipliers for ``x1``, ``x2`` and
            ``x3`` at positions 0, 1 and 2.

    Returns:
        The value of ``roc_auc_score(y1, blend)``.
    """
    first_term = weights[0] * x1
    second_term = weights[1] * x2
    third_term = weights[2] * x3
    blended = first_term + second_term + third_term
    return roc_auc_score(y1, blended)


# Reference score for a hand-picked 1/4, 1/4, 1/2 blend of the three models.
get_optimized_weight(
    x1=train["xsmall_pred"],
    x2=train["large_aux_pred"],
    x3=train["large_all_info_pred"],
    y1=train["Recommended IND"],
    weights=[1 / 4, 1 / 4, 1 / 2],
)

0.9714792900291009

In [32]:
from sklearn.metrics import roc_auc_score
import numpy as np
from tqdm.auto import tqdm

# Range and step of the candidate weights: 0.00, 0.01, ..., 1.00.
step = 0.01
# np.linspace fixes the number of grid points and pins the last one to exactly
# 1.0; with a float step, np.arange's length and endpoint depend on rounding.
weight_range = np.linspace(0.0, 1.0, int(round(1 / step)) + 1)


# Grid search for the blend weights with the best score.
def find_optimal_weights(x1, x2, x3, y1, weight_grid=None):
    """Search a weight grid for the three-way blend with the highest score.

    Each pair ``(w1, w2)`` from the grid is tried and the third weight is set
    to ``1 - w1 - w2`` so the weights sum to one. Every blend is scored with
    ``get_optimized_weight``.

    Args:
        x1, x2, x3: Prediction vectors to blend.
        y1: Labels the blends are scored against.
        weight_grid: Optional iterable of candidate values for ``w1`` and
            ``w2``. Defaults to the module-level ``weight_range``.

    Returns:
        ``(best_weights, best_score)`` where ``best_weights`` is
        ``[w1, w2, w3]``, or ``None`` if no combination was evaluated.
    """
    candidates = weight_range if weight_grid is None else weight_grid
    # Slack for floating-point noise: when w1 + w2 equals 1 on the grid,
    # 1 - w1 - w2 can come out as a tiny negative number, and a strict
    # `w3 < 0` check would drop that valid boundary combination.
    tolerance = 1e-9

    best_score = float("-inf")
    best_weights = None

    # Try every combination of weights.
    for w1 in tqdm(candidates):
        for w2 in candidates:
            w3 = 1 - w1 - w2  # make the weights sum to 1
            if w3 < -tolerance or w3 > 1 + tolerance:
                continue
            # Remove rounding noise so the third weight stays inside [0, 1].
            w3 = min(max(w3, 0.0), 1.0)

            # Compute the score of this blend.
            score = get_optimized_weight(x1, x2, x3, y1, weights=[w1, w2, w3])
            if score > best_score:
                best_score = score
                best_weights = [w1, w2, w3]

    return best_weights, best_score


# Example usage: search the blend weights on the training-set predictions.
optimal_weights, optimal_score = find_optimal_weights(
    x1=train["xsmall_pred"],
    x2=train["large_aux_pred"],
    x3=train["large_all_info_pred"],
    y1=train["Recommended IND"],
)

print(f"Optimal weights: {optimal_weights}")
print(f"Optimal ROC AUC Score: {optimal_score}")


  0%|          | 0/101 [00:00<?, ?it/s]

Optimal weights: [0.07, 0.47000000000000003, 0.4599999999999999]
Optimal ROC AUC Score: 0.9721017134375918


# Submission

In [33]:
# Blend the three test-set prediction columns with the weights found above.
pred = (
    optimal_weights[0] * test.get_column("xsmall_pred")
    + optimal_weights[1] * test.get_column("large_aux_pred")
    + optimal_weights[2] * test.get_column("large_all_info_pred")
)

In [35]:
import os

# Make sure the experiment's output directory exists before writing into it.
os.makedirs(MODEL_OUTPUT_PATH, exist_ok=True)

sample_submission = pl.read_csv(f"{DATA_PATH}/sample_submission.csv")
submission_target = pl.Series(pred).alias("target")

if DEBUG:
    # Truncate the predictions together with the frame: `pred` is computed on
    # the full test set, and attaching a column whose length differs from the
    # 100-row frame would fail.
    sample_submission = sample_submission.head(100)
    submission_target = submission_target.head(100)

# The file name records the experiment name and the score found by the search.
sample_submission.with_columns(submission_target).write_csv(
    f"{MODEL_OUTPUT_PATH}/submission_{EXP_NAME}_cv{optimal_score:.4f}.csv"
)