In [62]:
%%time
# Standard library imports
from dataclasses import dataclass
from pathlib import Path
import sys
import json

# Third party imports
import polars as pl
from pandera.typing.polars import LazyFrame

# Magic commands
%matplotlib inline
%load_ext autoreload
%autoreload 2


@dataclass
class Config:
    project_dir: Path = Path("../")
    data_dir: Path = project_dir / "data"
    raw_dir: Path = data_dir / "raw"
    interim_dir: Path = data_dir / "interim"
    processed_dir: Path = data_dir / "processed"


config = Config()
sys.path.append(str(config.project_dir.resolve()))

with open(str(config.project_dir / "configs/default.json"), "r") as f:
    default_config = json.load(f)


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
CPU times: user 1.26 ms, sys: 2.07 ms, total: 3.33 ms
Wall time: 3.82 ms


In [63]:
%%time
df_train = pl.read_csv(config.raw_dir / "train.csv")
df_test_x = pl.read_csv(config.raw_dir / "test.csv")
df_test_y = pl.read_csv(config.raw_dir / "gender_submission.csv")

CPU times: user 1.77 ms, sys: 8.12 ms, total: 9.89 ms
Wall time: 13.8 ms


### EDA

1. データのスキーマ確認
2. データの欠損状況確認
3. データの記述統計量確認

In [64]:
df_train.schema

Schema([('PassengerId', Int64),
        ('Survived', Int64),
        ('Pclass', Int64),
        ('Name', String),
        ('Sex', String),
        ('Age', Float64),
        ('SibSp', Int64),
        ('Parch', Int64),
        ('Ticket', String),
        ('Fare', Float64),
        ('Cabin', String),
        ('Embarked', String)])

In [65]:
df_train.null_count()

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,0,0,0,177,0,0,0,0,687,2


In [66]:
df_train.describe()

statistic,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
str,f64,f64,f64,str,str,f64,f64,f64,str,f64,str,str
"""count""",891.0,891.0,891.0,"""891""","""891""",714.0,891.0,891.0,"""891""",891.0,"""204""","""889"""
"""null_count""",0.0,0.0,0.0,"""0""","""0""",177.0,0.0,0.0,"""0""",0.0,"""687""","""2"""
"""mean""",446.0,0.383838,2.308642,,,29.699118,0.523008,0.381594,,32.204208,,
"""std""",257.353842,0.486592,0.836071,,,14.526497,1.102743,0.806057,,49.693429,,
"""min""",1.0,0.0,1.0,"""Abbing, Mr. Anthony""","""female""",0.42,0.0,0.0,"""110152""",0.0,"""A10""","""C"""
"""25%""",224.0,0.0,2.0,,,20.0,0.0,0.0,,7.925,,
"""50%""",446.0,0.0,3.0,,,28.0,0.0,0.0,,14.4542,,
"""75%""",669.0,1.0,3.0,,,38.0,1.0,0.0,,31.0,,
"""max""",891.0,1.0,3.0,"""van Melkebeke, Mr. Philemon""","""male""",80.0,8.0,6.0,"""WE/P 5735""",512.3292,"""T""","""S"""


In [67]:
df_test_y.null_count()

PassengerId,Survived
u32,u32
0,0


### データ整形

In [68]:
from src.data_processor.process import (
    replace_null_data_with_default,
    select_cols,
    one_hot_encoding,
)

# 必要な特徴量のみを選択
feature_name_list = default_config["features"]
target_name_list = ["Survived"]

# PassengerIdを除外してデータを選択
df_train_x = select_cols(df_train, feature_name_list)
df_train_y = select_cols(df_train, target_name_list)

# 欠損値の処理
df_train_x_processed = replace_null_data_with_default(
    df_train_x,
    replace_dict={
        "Age": df_train_x["Age"].mean(),
        "Embarked": "Unknown",
    },
)

# テストデータの処理（PassengerIdを除外）
df_test_x = select_cols(df_test_x, feature_name_list)
df_test_y = select_cols(df_test_y, target_name_list)
df_test_x_processed = replace_null_data_with_default(
    df_test_x,
    replace_dict={
        "Age": df_train_x["Age"].mean(),
        "Fare": df_train_x["Fare"].mean(),
        "Embarked": "Unknown",
    },
)

# One-hotエンコーディング
df_train_x_processed = one_hot_encoding(df_train_x_processed, ["Sex", "Embarked"])
df_test_x_processed = one_hot_encoding(df_test_x_processed, ["Sex", "Embarked"])

In [15]:
from sklearn.model_selection import StratifiedKFold
from src.models.lightgbm.preprocess import create_input_data_for_lightgbm

from src.optimizer.lightgbm import LightGBMOptimizer
import numpy as np
from sklearn.metrics import accuracy_score

# オプティマイザーの初期化
optimizer = LightGBMOptimizer(
    X_train=df_train_x_processed,
    y_train=df_train_y,
    score_function=accuracy_score,
    n_trials=50,
    n_splits=5,
    random_state=42,
)

# パラメータの最適化を実行
lgbt_best_params, lgbt_best_score = optimizer.optimize(
    direction="maximize"  # accuracy_scoreを最大化
)

print("Best parameters:", lgbt_best_params)
print("Best score:", lgbt_best_score)

[I 2024-12-08 19:34:15,470] A new study created in memory with name: no-name-31e139ac-561c-4250-ab70-fae9c7ad7ad0
[I 2024-12-08 19:34:23,288] Trial 0 finished with value: 0.8215052413533362 and parameters: {'num_leaves': 59, 'learning_rate': 0.033279986646538876, 'feature_fraction': 0.8383578286043007, 'bagging_fraction': 0.9402755493682811, 'bagging_freq': 1, 'min_data_in_leaf': 33, 'min_gain_to_split': 0.032610709768143994}. Best is trial 0 with value: 0.8215052413533362.
[I 2024-12-08 19:34:27,983] Trial 1 finished with value: 0.8372230242922603 and parameters: {'num_leaves': 45, 'learning_rate': 0.021265534807701447, 'feature_fraction': 0.525843770393261, 'bagging_fraction': 0.6811417558966864, 'bagging_freq': 2, 'min_data_in_leaf': 48, 'min_gain_to_split': 0.0038714551978341885}. Best is trial 1 with value: 0.8372230242922603.
[I 2024-12-08 19:34:32,186] Trial 2 finished with value: 0.8338773460548616 and parameters: {'num_leaves': 59, 'learning_rate': 0.013580416039218344, 'featu

Best parameters: {'num_leaves': 21, 'learning_rate': 0.010117582896351784, 'feature_fraction': 0.747869659650269, 'bagging_fraction': 0.9969731624672682, 'bagging_freq': 5, 'min_data_in_leaf': 37, 'min_gain_to_split': 0.0011472213629940076}
Best score: 0.8462055112673404


In [35]:
from src.models.lightgbm.model import LightGBM

kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
from sklearn.metrics import accuracy_score

params = {
    **default_config["default_lgbm_params"],
    **lgbt_best_params,
}
oof_predictions = np.zeros(len(df_train_x_processed))
scores = []
for i, (train_, val_) in enumerate(kf.split(df_train_x_processed, df_train_y)):
    x_train, x_val = df_train_x_processed[train_], df_train_x_processed[val_]
    y_train = df_train_y[train_]
    y_val = df_train_y[val_]

    lgb_train = create_input_data_for_lightgbm(x_train, y_train)
    lgb_eval = create_input_data_for_lightgbm(x_val, y_val, reference=lgb_train)

    model = LightGBM()
    model.train(
        params=params,
        train_set=lgb_train,
        valid_sets=lgb_eval,
    )
    y_val_pred = model.predict(x_val)
    y_val_pred_binary = (y_val_pred > 0.5).astype(int)
    oof_predictions[val_] = y_val_pred_binary
    score = accuracy_score(y_val, y_val_pred_binary)
    scores.append(score)
print(f"LightGBM CV scores: {scores}")
print(f"LightGBM Mean CV score: {np.mean(scores)}")

LightGBM CV scores: [0.8547486033519553, 0.8426966292134831, 0.8370786516853933, 0.7921348314606742, 0.8033707865168539]
LightGBM Mean CV score: 0.8260059004456719


In [37]:
from sklearn.metrics import accuracy_score
from src.optimizer.xgboost import XGBoostOptimizer
from src.models.xgboost.preprocess import create_input_data_for_xgboost
from src.models.xgboost.model import XGBoost

# オプティマイザーの初期化
optimizer = XGBoostOptimizer(
    X_train=df_train_x_processed,
    y_train=df_train_y,
    score_function=accuracy_score,
    n_trials=50,
    n_splits=5,
    random_state=42,
)

# パラメータの最適化を実行
xgb_best_params, xgb_best_score = optimizer.optimize(
    direction="maximize"  # accuracy_scoreを最大化
)

print("Best parameters:", xgb_best_params)
print("Best score:", xgb_best_score)

[I 2024-12-08 21:56:46,130] A new study created in memory with name: no-name-c982f8fb-093d-4441-bc86-861b1623b192
[I 2024-12-08 21:56:49,138] Trial 0 finished with value: 0.8260310087251271 and parameters: {'max_depth': 4, 'eta': 0.019906240098265453, 'min_child_weight': 3.611485479426343, 'gamma': 0.9031414981562726, 'colsample_bytree': 0.6405883323817723, 'subsample': 0.7818264024115498, 'lambda': 8.803524184226437e-08, 'alpha': 7.477362061995759e-06, 'scale_pos_weight': 0.6226081583820311}. Best is trial 0 with value: 0.8260310087251271.
[I 2024-12-08 21:56:53,325] Trial 1 finished with value: 0.7597639821731217 and parameters: {'max_depth': 5, 'eta': 0.12303190862982, 'min_child_weight': 6.188440077948183, 'gamma': 0.33606017901164054, 'colsample_bytree': 0.5952998826189368, 'subsample': 0.8214535674157071, 'lambda': 1.3833583166358534e-08, 'alpha': 1.1584723425355988e-07, 'scale_pos_weight': 6.486898563958554}. Best is trial 0 with value: 0.8260310087251271.
[I 2024-12-08 21:56:57

Best parameters: {'max_depth': 6, 'eta': 0.029698873080431218, 'min_child_weight': 2.9723163336771794, 'gamma': 0.18651314811849784, 'colsample_bytree': 0.4700105540299814, 'subsample': 0.7165860209050052, 'lambda': 2.8351401191601414e-05, 'alpha': 0.030050851073758388, 'scale_pos_weight': 0.5628611561255372}
Best score: 0.8428535559600778


In [38]:
# XGBoostでの実装

# XGBoostのパラメータ設定
params = {**default_config["default_xgb_params"], **xgb_best_params}

oof_predictions_xgb = np.zeros(len(df_train_x_processed))
scores_xgb = []

for i, (train_, val_) in enumerate(kf.split(df_train_x_processed, df_train_y)):
    x_train, x_val = df_train_x_processed[train_], df_train_x_processed[val_]
    y_train = df_train_y[train_]
    y_val = df_train_y[val_]

    xgb_train = create_input_data_for_xgboost(x_train, y_train)
    xgb_eval = create_input_data_for_xgboost(x_val, y_val)

    model = XGBoost()
    model.train(
        params=params,
        train_set=xgb_train,
        valid_sets=xgb_eval,
    )

    y_val_pred = model.predict(x_val)
    y_val_pred_binary = (y_val_pred > 0.5).astype(int)
    oof_predictions_xgb[val_] = y_val_pred_binary
    score = accuracy_score(y_val, y_val_pred_binary)
    scores_xgb.append(score)

print(f"XGBoost CV scores: {scores_xgb}")
print(f"XGBoost Mean CV score: {np.mean(scores_xgb)}")

XGBoost CV scores: [0.8603351955307262, 0.8314606741573034, 0.8539325842696629, 0.8314606741573034, 0.8089887640449438]
XGBoost Mean CV score: 0.8372355784319879


In [47]:
from sklearn.metrics import mean_squared_error
from src.optimizer.catboost import CatBoostOptimizer
from src.models.catboost.preprocess import create_input_data_for_catboost
from src.models.catboost.model import CatBoost

# オプティマイザーの初期化
optimizer = CatBoostOptimizer(
    X_train=df_train_x_processed,
    y_train=df_train_y,
    score_function=mean_squared_error,
    n_trials=50,
    n_splits=5,
    random_state=42,
)

# パラメータの最適化を実行
ctb_best_params, ctb_best_score = optimizer.optimize(direction="maximize")

print("Best parameters:", ctb_best_params)
print("Best score:", ctb_best_score)

[I 2024-12-08 22:18:38,377] A new study created in memory with name: no-name-a61244d7-d559-47af-b352-d4f7dcc0bceb
[I 2024-12-08 22:18:39,247] Trial 0 finished with value: 0.17625384470529157 and parameters: {'iterations': 2285, 'learning_rate': 0.2111875749059806, 'depth': 10, 'l2_leaf_reg': 9.146411177061944e-05, 'border_count': 122, 'bagging_temperature': 0.7763289280292456, 'random_strength': 0.0001390982018390234, 'colsample_bylevel': 0.5762431010061662}. Best is trial 0 with value: 0.17625384470529157.
[I 2024-12-08 22:18:39,848] Trial 1 finished with value: 0.16047956813759337 and parameters: {'iterations': 2000, 'learning_rate': 0.017748736127213952, 'depth': 10, 'l2_leaf_reg': 4.2791784833440555e-07, 'border_count': 73, 'bagging_temperature': 0.4118561772333905, 'random_strength': 3.772078929563512, 'colsample_bylevel': 0.538178405329807}. Best is trial 0 with value: 0.17625384470529157.
[I 2024-12-08 22:18:40,260] Trial 2 finished with value: 0.1515096353022409 and parameters:

Best parameters: {'iterations': 1293, 'learning_rate': 0.29207800664427236, 'depth': 10, 'l2_leaf_reg': 2.5762014587710995e-05, 'border_count': 58, 'bagging_temperature': 0.6234998913828402, 'random_strength': 3.7822991690104456e-06, 'colsample_bylevel': 0.9727236348205446}
Best score: 0.19194651936476054


In [46]:
# # CatBoostでの実装
# from src.models.catboost.preprocess import create_input_data_for_catboost
# from src.models.catboost.model import CatBoost

# # CatBoostのパラメータ設定
params = {
    **default_config["default_catboost_params"],
    **ctb_best_params,
}


oof_predictions_cb = np.zeros(len(df_train_x_processed))
scores_cb = []

for i, (train_, val_) in enumerate(kf.split(df_train_x_processed, df_train_y)):
    x_train, x_val = df_train_x_processed[train_], df_train_x_processed[val_]
    y_train = df_train_y[train_]
    y_val = df_train_y[val_]

    cb_train = create_input_data_for_catboost(x_train, y_train)
    cb_eval = create_input_data_for_catboost(x_val, y_val)

    model = CatBoost()
    model.train(
        params=params,
        train_set=cb_train,
        valid_sets=cb_eval,
    )

    y_val_pred = model.predict(x_val)
    y_val_pred_binary = (y_val_pred > 0.5).astype(int)
    oof_predictions_cb[val_] = y_val_pred_binary
    score = accuracy_score(y_val, y_val_pred_binary)
    scores_cb.append(score)

print(f"CatBoost CV scores: {scores_cb}")
print(f"CatBoost Mean CV score: {np.mean(scores_cb)}")

CatBoost CV scores: [0.8268156424581006, 0.8595505617977528, 0.8370786516853933, 0.8089887640449438, 0.7752808988764045]
CatBoost Mean CV score: 0.821542903772519
