<a href="https://colab.research.google.com/github/satoshibasaki/competition/blob/main/signate_46th_beginner_v4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 【第46回_Beginner限定コンペ】携帯電話の機能データからの価格帯分類


携帯電話の機能から販売価格を分類しよう！

## ライブラリの読み込み

In [3]:
!pip install optuna optuna-integration catboost

Collecting catboost
  Downloading catboost-1.2.5-cp310-cp310-manylinux2014_x86_64.whl (98.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 MB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.5


In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, f1_score
from xgboost import XGBClassifier
from sklearn.svm import SVC
# from lightgbm import LGBMClassifier
import lightgbm as lgb
import catboost as cb
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
import optuna
import optuna.visualization as vis
# import optuna.integration.lightgbm as lgb
from optuna.integration import CatBoostPruningCallback

## データの読み込み

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
dir = "/content/drive/MyDrive/Data/signate_46th_begginer"
df_train_all = pd.read_csv(dir + '/train.csv')
df_test = pd.read_csv(dir + '/test.csv')
df_test_id = df_test["id"]
sample_submission = pd.read_csv(dir + '/sample_submission.csv')

テストデータセットとサンプルファイルのデータ数が合わない。

id=1 がサンプルには無いのが原因。

## データの前処理

In [7]:
# id は不要なので落とす
df_train_all = df_train_all.drop('id', axis=1)
df_test = df_test.drop('id', axis=1)

# ターゲット変数 y は分けておく
y = df_train_all["price_range"]
X = df_train_all.drop("price_range", axis=1)  # ここから特徴量は X, df_test に対して作成・追加する

### 新しい特徴量の作成・追加

In [8]:
# スクリーン面積: sc_area
X["sc_area"] = np.multiply(X["sc_h"], X["sc_w"])
df_test["sc_area"] = np.multiply(df_test["sc_h"], df_test["sc_w"])

# 画面比率: sc_ratio => NG (inf)
# X["sc_ratio"] = np.divide(X["sc_h"], X["sc_w"])
# df_test["sc_ratio"] = np.divide(df_test["sc_h"], df_test["sc_w"])

# 連続通信時間比: talk_time_ratio
# X["talk_time_ratio"] = X["talk_time"] / X["battery_power"]
# df_test["talk_time_ratio"] = df_test["talk_time"] / df_test["battery_power"]

# ピクセル密度: px_density => 値が大きすぎてNG (inf)
# X["px_density"] = np.divide(np.multiply(X["px_height"], X["px_width"]), X["sc_area"])
# df_test["px_density"] = np.divide(np.multiply(df_test["px_height"], df_test["px_width"]), df_test["sc_area"])

# プロセッサ速度指数: ps_idx
X["ps_idx"] = np.multiply(X["clock_speed"], X["n_cores"])
df_test["ps_idx"] = np.multiply(df_test["clock_speed"], df_test["n_cores"])

# コア当たりのRAM: ram_per_core
# X["ram_per_core"] = np.divide(X["ram"], X["n_cores"])
# df_test["ram_per_core"] = np.divide(df_test["ram"], df_test["n_cores"])

# メモリ効率: mem_eff
# X["mem_eff"] = np.divide(X["ram"], X["mobile_wt"])
# df_test["mem_eff"] = np.divide(df_test["ram"], df_test["mobile_wt"])

# ストレージ効率: str_eff
# X["str_eff"] = np.divide(X["int_memory"], X["mobile_wt"])
# df_test["str_eff"] = np.divide(df_test["int_memory"], df_test["mobile_wt"])

# バッテリー密度: bp_wt
# X["bp_wt"] = np.divide(X["battery_power"], X["mobile_wt"])
# df_test["bp_wt"] = np.divide(df_test["battery_power"], df_test["mobile_wt"])

# カメラ品質指数: cq_idx
X["cq_idx"] = X["pc"] + X["fc"]
df_test["cq_idx"] = df_test["pc"] + df_test["fc"]

# 通信機能数: conn_ft_cnt
X["conn_ft_cnt"] = X["blue"] + X["four_g"] + X["three_g"] + X["wifi"]
df_test["conn_ft_cnt"] = df_test["blue"] + df_test["four_g"] + df_test["three_g"] + df_test["wifi"]

# 全体の機能数: all_ft_cnt
X["all_ft_cnt"] = X["blue"] + X["dual_sim"]+ X["touch_screen"] + X["four_g"] + X["three_g"] + X["wifi"]
df_test["all_ft_cnt"] = df_test["blue"]+ df_test["dual_sim"]+ df_test["touch_screen"] + df_test["four_g"] + df_test["three_g"] + df_test["wifi"]

In [9]:
# 特徴量同士の相関関係を表示
# corr = X.corr()
# plt.figure(figsize=(10, 8))  # サイズを大きく設定
# sns.heatmap(corr, annot=True, annot_kws={"size":6}, cmap='coolwarm')
# plt.show()

### 特徴量の尺度を揃える

In [10]:
std_columns = ['battery_power', 'clock_speed', 'fc',
        'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc',
        'px_height', 'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'sc_area',
        'ps_idx', 'cq_idx', 'conn_ft_cnt', 'all_ft_cnt',
        ]

# Standardize training data
sc = StandardScaler().fit(X[std_columns])
# X_train[std_columns] = sc.transform(X_train[std_columns])
X[std_columns] = sc.transform(X[std_columns])

# Standardize validation data
# X_valid[std_columns] = sc.transform(X_valid[std_columns])

# Standardize test data
df_test[std_columns] = sc.transform(df_test[std_columns])

### データセットを訓練・検証データに分割

In [11]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

## Optuna でモデルのハイパーパラメータをチューニングする

### 1. ランダムフォレスト

In [None]:
def rf_objective(trial):

    # 調整したいハイパーパラメータの範囲
    max_depth = trial.suggest_int("max_depth", 2, 32)
    n_estimators = trial.suggest_int("n_estimators", 10, 200)
    min_samples_split = trial.suggest_int("min_samples_split", 2, 20)
    min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 20)

    # モデルの定義
    cls = RandomForestClassifier(
        max_depth=max_depth,
        n_estimators=n_estimators,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=42
    )

    score = cross_val_score(cls, X, y, cv=5, scoring='f1_macro')
    f1_score = score.mean()
    return score.mean()

study = optuna.create_study(direction='maximize')
study.optimize(rf_objective, n_trials=100)

print('Best trial: , ', study.best_trial.params)
print('Best score: , ', study.best_trial.value)

[I 2024-05-23 23:16:22,346] A new study created in memory with name: no-name-55e0b6bd-f92a-4aaa-86f8-a5b97f24296d
[I 2024-05-23 23:16:24,535] Trial 0 finished with value: 0.42198926499421174 and parameters: {'max_depth': 4, 'n_estimators': 111, 'min_samples_split': 7, 'min_samples_leaf': 3}. Best is trial 0 with value: 0.42198926499421174.
[I 2024-05-23 23:16:25,773] Trial 1 finished with value: 0.45181148963561774 and parameters: {'max_depth': 12, 'n_estimators': 67, 'min_samples_split': 10, 'min_samples_leaf': 9}. Best is trial 1 with value: 0.45181148963561774.
[I 2024-05-23 23:16:28,697] Trial 2 finished with value: 0.4539271677665225 and parameters: {'max_depth': 26, 'n_estimators': 186, 'min_samples_split': 13, 'min_samples_leaf': 15}. Best is trial 2 with value: 0.4539271677665225.
[I 2024-05-23 23:16:30,148] Trial 3 finished with value: 0.4592756165246613 and parameters: {'max_depth': 26, 'n_estimators': 89, 'min_samples_split': 18, 'min_samples_leaf': 4}. Best is trial 3 with 

Best trial: ,  {'max_depth': 13, 'n_estimators': 166, 'min_samples_split': 18, 'min_samples_leaf': 13}
Best score: ,  0.47069390011351286


In [None]:
# 探索履歴のプロット
opt_history = vis.plot_optimization_history(study)
opt_history.show()

# ハイパーパラメータの重要度のプロット
param_importances = vis.plot_param_importances(study)
param_importances.show()

[W 2024-05-24 00:52:24,695] You need to set up the pruning feature to utilize `plot_intermediate_values()`


In [None]:
# ベストパラメータを使ってモデルの訓練

# Best trial: ,  {'max_depth': 13, 'n_estimators': 166, 'min_samples_split': 18, 'min_samples_leaf': 13}
# Best score: ,  0.47069390011351286

params = {'max_depth': 13,
          'n_estimators': 166,
          'min_samples_split': 18,
          'min_samples_leaf': 13}

rf = RandomForestClassifier(**params, random_state=42, n_jobs=-1)
# rf_results = evaluate_model(rf, X, y)
rf.fit(X, y)

# テストデータでの推論
rf_pred = rf.predict(df_test)
# print(rf_pred.shape)
# print(rf_pred)

(800,)
[3 1 3 1 1 3 3 3 3 1 1 2 1 1 2 3 1 3 0 1 3 1 2 1 1 1 1 1 3 2 0 3 2 2 0 2 0
 0 1 0 3 3 1 1 2 2 3 1 2 2 2 2 1 1 1 1 2 1 3 2 1 0 1 3 2 2 2 2 1 1 3 3 3 2
 3 1 1 0 2 1 1 1 0 2 2 3 3 1 1 0 3 1 1 3 1 2 1 1 2 2 0 1 0 1 2 0 3 2 3 1 2
 0 2 1 1 3 3 1 2 1 2 3 1 2 0 1 2 2 0 1 1 3 3 2 2 1 0 2 1 3 2 3 1 3 2 3 1 1
 2 0 2 3 2 2 3 1 2 2 1 2 2 3 1 1 2 3 1 3 1 1 2 3 3 3 2 1 3 1 2 2 1 2 0 2 2
 2 1 3 3 1 1 1 2 3 1 0 3 1 1 1 3 2 1 2 2 3 2 1 1 0 1 3 1 1 1 2 2 2 3 3 1 0
 1 1 1 1 1 2 1 1 2 1 2 1 3 1 1 2 2 1 1 0 0 0 3 3 0 1 2 3 0 3 2 1 0 2 0 1 3
 0 3 2 3 2 1 3 2 3 1 2 1 1 1 3 1 3 1 1 2 1 1 2 1 3 2 0 2 3 0 3 1 2 1 3 1 3
 2 1 2 1 1 1 3 3 1 2 2 1 1 3 0 1 1 2 2 1 1 2 0 1 2 1 2 1 3 2 1 1 1 2 1 1 1
 1 3 1 3 2 2 2 3 1 1 3 3 3 1 1 3 3 2 2 3 0 3 1 0 1 2 1 3 1 1 1 1 1 2 0 1 3
 1 0 3 2 0 1 3 3 3 1 1 2 1 1 1 2 1 2 3 2 1 3 2 1 3 1 3 2 3 2 1 0 3 0 1 3 3
 2 1 2 1 0 1 3 2 1 1 2 1 3 3 2 1 1 1 1 1 3 0 3 3 1 1 1 2 0 2 2 1 3 3 3 1 2
 1 3 3 2 1 1 1 1 2 2 3 2 0 3 2 1 2 1 2 2 2 3 2 0 3 0 1 2 1 2 0 3 3 2 1 0 2
 0 2 2 2 3 3 1 3 3

### 2. SVC

In [None]:
def svc_objective(trial):

    # 調整したいハイパーパラメータの範囲
    kernel = trial.suggest_categorical('kernel', ['linear', 'rbf'])
    params = {
        'C': trial.suggest_float('C', 1e-6, 1e2, log=True),
        'kernel': kernel
    }

    # モデルの定義
    cls = SVC(**params, random_state=42)

    score = cross_val_score(cls, X, y, cv=5, scoring='f1_macro')
    f1_score = score.mean()
    return score.mean()

study = optuna.create_study(direction='maximize')
study.optimize(svc_objective, n_trials=100)

print('Best trial: ', study.best_trial.params)
print('Best score: ', study.best_trial.value)

[I 2024-05-24 01:05:48,305] A new study created in memory with name: no-name-f29a233b-7c8e-448d-ae60-1b403f16a758
[I 2024-05-24 01:05:48,750] Trial 0 finished with value: 0.35337904692495353 and parameters: {'kernel': 'rbf', 'C': 0.2306451718788334}. Best is trial 0 with value: 0.35337904692495353.
[I 2024-05-24 01:05:49,123] Trial 1 finished with value: 0.4007004261610317 and parameters: {'kernel': 'linear', 'C': 0.21202713758748176}. Best is trial 1 with value: 0.4007004261610317.
[I 2024-05-24 01:05:49,443] Trial 2 finished with value: 0.12962962962962962 and parameters: {'kernel': 'linear', 'C': 3.0236388132303163e-05}. Best is trial 1 with value: 0.4007004261610317.
[I 2024-05-24 01:05:49,917] Trial 3 finished with value: 0.12962962962962962 and parameters: {'kernel': 'rbf', 'C': 0.006477490868455745}. Best is trial 1 with value: 0.4007004261610317.
[I 2024-05-24 01:05:50,233] Trial 4 finished with value: 0.12962962962962962 and parameters: {'kernel': 'linear', 'C': 0.000133314733

Best trial: ,  {'kernel': 'rbf', 'C': 0.7253529746614299}
Best score: ,  0.4483449607237124


In [None]:
# 探索履歴のプロット
opt_history = vis.plot_optimization_history(study)
opt_history.show()

# ハイパーパラメータの重要度のプロット
param_importances = vis.plot_param_importances(study)
param_importances.show()

In [None]:
# ベストパラメータを使ってモデルの訓練

# Best trial: ,  {'kernel': 'rbf', 'C': 0.7253529746614299}
# Best score: ,  0.4483449607237124

params = {'C': 0.7253529746614299,
          'kernel': 'rbf'}

svc = SVC(**params, probability=True, random_state=42)
# rf_results = evaluate_model(rf, X, y)
svc.fit(X, y)

# テストデータでの推論
svc_pred = svc.predict(df_test)

# print(svc_pred.shape)
# print(svc_pred)

### 3. XGBoost

In [None]:
def xgb_objective(trial):

    # 調整したいハイパーパラメータの範囲
    params = {
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'min_child_weight': trial.suggest_int('min_child_weight', 2, 8),
        'max_depth': trial.suggest_int('max_depth', 1, 4),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.2, 1.0),
        'subsample': trial.suggest_float('subsample', 0.2, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.001, 0.1, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.001, 0.1, log=True),
        'gamma': trial.suggest_float('gamma', 0.0001, 0.1, log=True),
    }

    # モデルの定義
    cls = XGBClassifier(**params, objective='multiclass', random_state=42, n_jobs=-1)

    score = cross_val_score(cls, X, y, cv=5, scoring='f1_macro')
    f1_score = score.mean()
    return score.mean()

study = optuna.create_study(direction='maximize')
study.optimize(xgb_objective, n_trials=100)

print('Best trial: ', study.best_trial.params)
print('Best score: ', study.best_trial.value)

[I 2024-05-24 21:34:57,826] A new study created in memory with name: no-name-65668db2-cfc5-4091-b0eb-cbe564f91b3c
[I 2024-05-24 21:34:58,688] Trial 0 finished with value: 0.43504435127499475 and parameters: {'learning_rate': 0.01396808031100182, 'min_child_weight': 4, 'max_depth': 2, 'colsample_bytree': 0.9939550785703735, 'subsample': 0.5892207348340746, 'reg_alpha': 0.06304092413606736, 'reg_lambda': 0.0012005248082420073, 'gamma': 0.0009411814436477706}. Best is trial 0 with value: 0.43504435127499475.
[I 2024-05-24 21:34:59,727] Trial 1 finished with value: 0.4542157663258587 and parameters: {'learning_rate': 0.11533602460488761, 'min_child_weight': 3, 'max_depth': 3, 'colsample_bytree': 0.8754996475460597, 'subsample': 0.9110647299381514, 'reg_alpha': 0.0055692184650854544, 'reg_lambda': 0.017020711970240132, 'gamma': 0.04800226852175068}. Best is trial 1 with value: 0.4542157663258587.
[I 2024-05-24 21:35:01,031] Trial 2 finished with value: 0.459227983315205 and parameters: {'le

Best trial: ,  {'learning_rate': 0.12383207707591747, 'min_child_weight': 5, 'max_depth': 4, 'colsample_bytree': 0.46905238052158094, 'subsample': 0.8236466968757384, 'reg_alpha': 0.003685214348877713, 'reg_lambda': 0.05253824861610229, 'gamma': 0.0002250569171273546}
Best score: ,  0.4902677266384255


In [None]:
# 探索履歴のプロット
opt_history = vis.plot_optimization_history(study)
opt_history.show()

# ハイパーパラメータの重要度のプロット
param_importances = vis.plot_param_importances(study)
param_importances.show()

In [59]:
# ベストパラメータを使ってモデルの訓練

# Best trial: {'learning_rate': 0.12383207707591747, 'min_child_weight': 5, 'max_depth': 4, 'colsample_bytree': 0.46905238052158094, 'subsample': 0.8236466968757384, 'reg_alpha': 0.003685214348877713, 'reg_lambda': 0.05253824861610229, 'gamma': 0.0002250569171273546}
# Best score: 0.491854649903965

params = {
    'learning_rate': 0.12383207707591747,
    'min_child_weight': 5,
    'max_depth': 4,
    'colsample_bytree': 0.46905238052158094,
    'subsample': 0.8236466968757384,
    'reg_alpha': 0.003685214348877713,
    'reg_lambda': 0.05253824861610229,
    'gamma': 0.0002250569171273546
    }

xgb_model = XGBClassifier(**params, objective='multiclass', random_state=42, n_jobs=-1)
xgb_model.fit(X, y)

# テストデータでの推論
xgb_pred = xgb_model.predict(df_test)

print(xgb_pred.shape)
print(xgb_pred)

(800,)
[3 0 3 1 0 3 0 2 1 1 2 2 3 1 2 3 1 3 0 1 2 1 2 3 1 1 3 1 3 2 0 3 3 2 1 2 0
 0 0 0 3 3 1 0 2 2 3 2 2 2 2 2 3 1 1 1 2 0 3 2 0 1 2 1 2 2 2 2 1 1 0 1 2 2
 3 1 0 0 2 2 1 1 0 2 2 3 2 1 0 0 3 2 1 3 1 2 1 0 2 2 0 1 0 2 2 0 3 2 3 1 2
 0 2 0 1 3 3 1 0 1 0 3 1 1 0 1 2 2 0 2 1 0 1 2 2 3 0 2 1 3 2 3 2 3 2 3 3 1
 2 1 2 2 2 2 3 1 2 2 3 3 2 3 2 1 2 3 0 3 1 1 2 3 1 3 2 1 2 1 2 1 2 2 0 0 2
 2 1 3 2 2 3 1 2 3 1 0 3 1 2 0 3 2 0 2 2 3 2 1 1 1 1 1 1 1 1 2 2 2 3 2 2 0
 1 1 1 1 1 2 1 1 3 1 2 0 0 3 2 2 2 2 2 0 0 0 3 0 0 2 3 0 0 3 2 1 3 2 0 1 3
 0 3 2 3 2 1 3 2 3 1 2 1 2 1 2 0 0 2 1 2 1 1 2 0 3 2 1 2 3 0 3 2 2 1 3 1 3
 2 3 2 2 0 1 2 0 1 2 2 1 3 3 2 1 1 2 2 1 2 2 0 1 2 1 2 3 0 2 1 3 1 2 0 1 1
 1 2 1 3 2 2 2 3 3 3 2 1 3 1 2 3 3 2 2 2 0 3 1 3 3 2 1 0 1 1 1 1 0 2 1 3 2
 1 3 1 2 2 1 3 3 3 1 1 2 3 1 2 2 1 2 3 2 1 3 2 0 3 2 3 2 3 2 0 0 3 0 1 3 3
 2 1 2 1 0 1 3 2 1 3 2 1 3 2 2 0 2 1 0 1 3 0 1 3 2 1 1 2 0 1 2 0 0 3 3 1 2
 3 3 1 1 3 1 2 0 2 2 3 3 0 3 2 1 2 3 2 2 2 3 0 1 2 0 1 2 0 2 2 3 3 2 1 0 2
 0 1 2 2 1 3 3 3 1

### 4. LightGBM

In [None]:
import time
start = time.time()

def objective(trial):
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
    dtrain = lgb.Dataset(X_train, label=y_train)
    dvalid = lgb.Dataset(X_valid, label=y_valid, reference=dtrain)

    param = {
        'objective': 'multiclass',
        'metric': 'multi_logloss',
        'num_class': 4,
        'num_iterations': trial.suggest_int('num_iterations', 100, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 1.0, log=True),
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0),
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'random_state': 42,
        'verbose': -1,
        'n_jobs': -1
    }

    verbose_eval = 0

    gbm = lgb.train(param, dtrain, valid_sets=[dvalid],
                    callbacks=[
            lgb.early_stopping(stopping_rounds=10, verbose=True),
            lgb.log_evaluation(verbose_eval)
        ])
    preds = gbm.predict(X_valid).astype(np.float32)
    pred_labels = np.argmax(preds, axis=1)
    f1_macro = f1_score(y_valid, pred_labels, average='macro')
    return f1_macro

study = optuna.create_study(direction='maximize')
# optuna.logging.set_verbosity(optuna.logging.CRITICAL)
study.optimize(objective, n_trials=100)

print('\nNumber of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)
print('Best score: ', study.best_trial.value)

print(f'所要時間{time.time() - start}秒')


Found `num_iterations` in params. Will use it instead of argument



Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[14]	valid_0's multi_logloss: 1.06621
Training until validation scores don't improve for 10 rounds



Found `num_iterations` in params. Will use it instead of argument



Did not meet early stopping. Best iteration is:
[197]	valid_0's multi_logloss: 1.07456



Found `num_iterations` in params. Will use it instead of argument



Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[181]	valid_0's multi_logloss: 1.04251



Found `num_iterations` in params. Will use it instead of argument



Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[102]	valid_0's multi_logloss: 1.06159



Found `num_iterations` in params. Will use it instead of argument



Training until validation scores don't improve for 10 rounds
Did not meet early stopping. Best iteration is:
[402]	valid_0's multi_logloss: 1.06058



Found `num_iterations` in params. Will use it instead of argument



Training until validation scores don't improve for 10 rounds
Did not meet early stopping. Best iteration is:
[664]	valid_0's multi_logloss: 1.16288



Found `num_iterations` in params. Will use it instead of argument



Training until validation scores don't improve for 10 rounds
Did not meet early stopping. Best iteration is:
[181]	valid_0's multi_logloss: 1.27719
Training until validation scores don't improve for 10 rounds



Found `num_iterations` in params. Will use it instead of argument



Early stopping, best iteration is:
[209]	valid_0's multi_logloss: 1.04818
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[12]	valid_0's multi_logloss: 1.06365



Found `num_iterations` in params. Will use it instead of argument


Found `num_iterations` in params. Will use it instead of argument



Training until validation scores don't improve for 10 rounds
Did not meet early stopping. Best iteration is:
[466]	valid_0's multi_logloss: 1.07106



Found `num_iterations` in params. Will use it instead of argument



Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[28]	valid_0's multi_logloss: 1.04973



Found `num_iterations` in params. Will use it instead of argument



Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[18]	valid_0's multi_logloss: 1.05671



Found `num_iterations` in params. Will use it instead of argument


Found `num_iterations` in params. Will use it instead of argument



Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[13]	valid_0's multi_logloss: 1.07
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[54]	valid_0's multi_logloss: 1.05135
Training until validation scores don't improve for 10 rounds



Found `num_iterations` in params. Will use it instead of argument



Did not meet early stopping. Best iteration is:
[292]	valid_0's multi_logloss: 1.06423
Training until validation scores don't improve for 10 rounds



Found `num_iterations` in params. Will use it instead of argument


Found `num_iterations` in params. Will use it instead of argument



Early stopping, best iteration is:
[26]	valid_0's multi_logloss: 1.05293
Training until validation scores don't improve for 10 rounds
Did not meet early stopping. Best iteration is:
[109]	valid_0's multi_logloss: 1.03555
Training until validation scores don't improve for 10 rounds



Found `num_iterations` in params. Will use it instead of argument



Did not meet early stopping. Best iteration is:
[410]	valid_0's multi_logloss: 1.09971
Training until validation scores don't improve for 10 rounds



Found `num_iterations` in params. Will use it instead of argument



Did not meet early stopping. Best iteration is:
[270]	valid_0's multi_logloss: 1.05121
Training until validation scores don't improve for 10 rounds



Found `num_iterations` in params. Will use it instead of argument


Found `num_iterations` in params. Will use it instead of argument



Early stopping, best iteration is:
[12]	valid_0's multi_logloss: 1.09365
Training until validation scores don't improve for 10 rounds
Did not meet early stopping. Best iteration is:
[428]	valid_0's multi_logloss: 1.14298



Found `num_iterations` in params. Will use it instead of argument



Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[219]	valid_0's multi_logloss: 1.05253



Found `num_iterations` in params. Will use it instead of argument



Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[163]	valid_0's multi_logloss: 1.04517
Training until validation scores don't improve for 10 rounds



Found `num_iterations` in params. Will use it instead of argument



Early stopping, best iteration is:
[141]	valid_0's multi_logloss: 1.03476



Found `num_iterations` in params. Will use it instead of argument


Found `num_iterations` in params. Will use it instead of argument



Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[45]	valid_0's multi_logloss: 1.05313
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[150]	valid_0's multi_logloss: 1.04584
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[12]	valid_0's multi_logloss: 1.07133



Found `num_iterations` in params. Will use it instead of argument


Found `num_iterations` in params. Will use it instead of argument



Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[51]	valid_0's multi_logloss: 1.03325
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[21]	valid_0's multi_logloss: 1.05408



Found `num_iterations` in params. Will use it instead of argument


Found `num_iterations` in params. Will use it instead of argument



Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[68]	valid_0's multi_logloss: 1.02162
Training until validation scores don't improve for 10 rounds



Found `num_iterations` in params. Will use it instead of argument


Found `num_iterations` in params. Will use it instead of argument



Early stopping, best iteration is:
[38]	valid_0's multi_logloss: 1.04293
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[98]	valid_0's multi_logloss: 1.03874
Training until validation scores don't improve for 10 rounds



Found `num_iterations` in params. Will use it instead of argument



Early stopping, best iteration is:
[77]	valid_0's multi_logloss: 1.04054
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[29]	valid_0's multi_logloss: 1.06491



Found `num_iterations` in params. Will use it instead of argument


Found `num_iterations` in params. Will use it instead of argument



Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[42]	valid_0's multi_logloss: 1.05187
Training until validation scores don't improve for 10 rounds



Found `num_iterations` in params. Will use it instead of argument


Found `num_iterations` in params. Will use it instead of argument



Early stopping, best iteration is:
[124]	valid_0's multi_logloss: 1.05534
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[386]	valid_0's multi_logloss: 1.04172
Training until validation scores don't improve for 10 rounds



Found `num_iterations` in params. Will use it instead of argument



Early stopping, best iteration is:
[119]	valid_0's multi_logloss: 1.03416
Training until validation scores don't improve for 10 rounds



Found `num_iterations` in params. Will use it instead of argument



Early stopping, best iteration is:
[86]	valid_0's multi_logloss: 1.03721
Training until validation scores don't improve for 10 rounds



Found `num_iterations` in params. Will use it instead of argument



Early stopping, best iteration is:
[193]	valid_0's multi_logloss: 1.04421
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[13]	valid_0's multi_logloss: 1.07634



Found `num_iterations` in params. Will use it instead of argument


Found `num_iterations` in params. Will use it instead of argument



Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[73]	valid_0's multi_logloss: 1.04469
Training until validation scores don't improve for 10 rounds



Found `num_iterations` in params. Will use it instead of argument



Early stopping, best iteration is:
[410]	valid_0's multi_logloss: 1.04053
Training until validation scores don't improve for 10 rounds



Found `num_iterations` in params. Will use it instead of argument



Early stopping, best iteration is:
[156]	valid_0's multi_logloss: 1.02909
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[38]	valid_0's multi_logloss: 1.04423



Found `num_iterations` in params. Will use it instead of argument


Found `num_iterations` in params. Will use it instead of argument



Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[26]	valid_0's multi_logloss: 1.04916
Training until validation scores don't improve for 10 rounds



Found `num_iterations` in params. Will use it instead of argument


Found `num_iterations` in params. Will use it instead of argument



Early stopping, best iteration is:
[32]	valid_0's multi_logloss: 1.04354
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[23]	valid_0's multi_logloss: 1.05069



Found `num_iterations` in params. Will use it instead of argument


Found `num_iterations` in params. Will use it instead of argument



Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[56]	valid_0's multi_logloss: 1.05513
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[100]	valid_0's multi_logloss: 1.03934
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[79]	valid_0's multi_logloss: 1.05049



Found `num_iterations` in params. Will use it instead of argument


Found `num_iterations` in params. Will use it instead of argument



Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[28]	valid_0's multi_logloss: 1.04566
Training until validation scores don't improve for 10 rounds



Found `num_iterations` in params. Will use it instead of argument


Found `num_iterations` in params. Will use it instead of argument



Early stopping, best iteration is:
[25]	valid_0's multi_logloss: 1.04931
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[29]	valid_0's multi_logloss: 1.05153



Found `num_iterations` in params. Will use it instead of argument


Found `num_iterations` in params. Will use it instead of argument



Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[26]	valid_0's multi_logloss: 1.04594
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[60]	valid_0's multi_logloss: 1.03306
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[21]	valid_0's multi_logloss: 1.0456



Found `num_iterations` in params. Will use it instead of argument


Found `num_iterations` in params. Will use it instead of argument



Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[188]	valid_0's multi_logloss: 1.04699
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[58]	valid_0's multi_logloss: 1.05311



Found `num_iterations` in params. Will use it instead of argument


Found `num_iterations` in params. Will use it instead of argument



Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[44]	valid_0's multi_logloss: 1.05024
Training until validation scores don't improve for 10 rounds



Found `num_iterations` in params. Will use it instead of argument



Early stopping, best iteration is:
[131]	valid_0's multi_logloss: 1.04608
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[30]	valid_0's multi_logloss: 1.05153



Found `num_iterations` in params. Will use it instead of argument


Found `num_iterations` in params. Will use it instead of argument



Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[15]	valid_0's multi_logloss: 1.04896
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[56]	valid_0's multi_logloss: 1.03988



Found `num_iterations` in params. Will use it instead of argument


Found `num_iterations` in params. Will use it instead of argument



Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[33]	valid_0's multi_logloss: 1.0532
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[20]	valid_0's multi_logloss: 1.03988



Found `num_iterations` in params. Will use it instead of argument


Found `num_iterations` in params. Will use it instead of argument



Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[137]	valid_0's multi_logloss: 1.03498



Found `num_iterations` in params. Will use it instead of argument



Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[140]	valid_0's multi_logloss: 1.03322



Found `num_iterations` in params. Will use it instead of argument



Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[140]	valid_0's multi_logloss: 1.04058



Found `num_iterations` in params. Will use it instead of argument


Found `num_iterations` in params. Will use it instead of argument



Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[25]	valid_0's multi_logloss: 1.06676
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[77]	valid_0's multi_logloss: 1.04597
Training until validation scores don't improve for 10 rounds



Found `num_iterations` in params. Will use it instead of argument



Early stopping, best iteration is:
[216]	valid_0's multi_logloss: 1.03926



Found `num_iterations` in params. Will use it instead of argument



Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[471]	valid_0's multi_logloss: 1.04057



Found `num_iterations` in params. Will use it instead of argument



Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[231]	valid_0's multi_logloss: 1.04082



Found `num_iterations` in params. Will use it instead of argument



Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[122]	valid_0's multi_logloss: 1.0277
Training until validation scores don't improve for 10 rounds



Found `num_iterations` in params. Will use it instead of argument



Early stopping, best iteration is:
[238]	valid_0's multi_logloss: 1.05131
Training until validation scores don't improve for 10 rounds



Found `num_iterations` in params. Will use it instead of argument



Did not meet early stopping. Best iteration is:
[437]	valid_0's multi_logloss: 1.04731



Found `num_iterations` in params. Will use it instead of argument



Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[136]	valid_0's multi_logloss: 1.04767
Training until validation scores don't improve for 10 rounds



Found `num_iterations` in params. Will use it instead of argument



Early stopping, best iteration is:
[123]	valid_0's multi_logloss: 1.03403
Training until validation scores don't improve for 10 rounds



Found `num_iterations` in params. Will use it instead of argument


Found `num_iterations` in params. Will use it instead of argument



Early stopping, best iteration is:
[78]	valid_0's multi_logloss: 1.04891
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[229]	valid_0's multi_logloss: 1.04239
Training until validation scores don't improve for 10 rounds



Found `num_iterations` in params. Will use it instead of argument



Early stopping, best iteration is:
[123]	valid_0's multi_logloss: 1.03784
Training until validation scores don't improve for 10 rounds



Found `num_iterations` in params. Will use it instead of argument



Early stopping, best iteration is:
[152]	valid_0's multi_logloss: 1.03907
Training until validation scores don't improve for 10 rounds



Found `num_iterations` in params. Will use it instead of argument



Early stopping, best iteration is:
[138]	valid_0's multi_logloss: 1.03989
Training until validation scores don't improve for 10 rounds



Found `num_iterations` in params. Will use it instead of argument



Early stopping, best iteration is:
[232]	valid_0's multi_logloss: 1.04045
Training until validation scores don't improve for 10 rounds



Found `num_iterations` in params. Will use it instead of argument



Early stopping, best iteration is:
[300]	valid_0's multi_logloss: 1.03859
Training until validation scores don't improve for 10 rounds



Found `num_iterations` in params. Will use it instead of argument



Early stopping, best iteration is:
[235]	valid_0's multi_logloss: 1.03726
Training until validation scores don't improve for 10 rounds



Found `num_iterations` in params. Will use it instead of argument



Early stopping, best iteration is:
[195]	valid_0's multi_logloss: 1.03696
Training until validation scores don't improve for 10 rounds



Found `num_iterations` in params. Will use it instead of argument



Early stopping, best iteration is:
[365]	valid_0's multi_logloss: 1.04115
Training until validation scores don't improve for 10 rounds



Found `num_iterations` in params. Will use it instead of argument



Did not meet early stopping. Best iteration is:
[374]	valid_0's multi_logloss: 1.1618



Found `num_iterations` in params. Will use it instead of argument



Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[129]	valid_0's multi_logloss: 1.02926



Found `num_iterations` in params. Will use it instead of argument



Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[181]	valid_0's multi_logloss: 1.04635



Found `num_iterations` in params. Will use it instead of argument



Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[207]	valid_0's multi_logloss: 1.09305



Found `num_iterations` in params. Will use it instead of argument



Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[503]	valid_0's multi_logloss: 1.0409



Found `num_iterations` in params. Will use it instead of argument



Training until validation scores don't improve for 10 rounds
Did not meet early stopping. Best iteration is:
[317]	valid_0's multi_logloss: 1.03982



Found `num_iterations` in params. Will use it instead of argument



Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[136]	valid_0's multi_logloss: 1.03879



Found `num_iterations` in params. Will use it instead of argument


Found `num_iterations` in params. Will use it instead of argument



Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[21]	valid_0's multi_logloss: 1.05107
Training until validation scores don't improve for 10 rounds
Did not meet early stopping. Best iteration is:
[431]	valid_0's multi_logloss: 1.05526



Found `num_iterations` in params. Will use it instead of argument



Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[138]	valid_0's multi_logloss: 1.04109



Found `num_iterations` in params. Will use it instead of argument



Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[263]	valid_0's multi_logloss: 1.04777

Number of finished trials: 100
Best trial: {'num_iterations': 907, 'learning_rate': 0.18325539987347003, 'lambda_l1': 1.7515990799195345, 'lambda_l2': 1.1256460383989682, 'num_leaves': 43, 'feature_fraction': 0.5390242272131383, 'bagging_fraction': 0.9367255981016527, 'bagging_freq': 2, 'min_child_samples': 74}
Best score:  0.5300418884582024
所要時間72.77289271354675秒


In [None]:
# 探索履歴のプロット
opt_history = vis.plot_optimization_history(study)
opt_history.show()

# ハイパーパラメータの重要度のプロット
param_importances = vis.plot_param_importances(study)
param_importances.show()

In [60]:
# ベストパラメータを使ってモデルの訓練

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
dtrain = lgb.Dataset(X_train, label=y_train)
dvalid = lgb.Dataset(X_valid, label=y_valid, reference=dtrain)

# Best trial: {'num_iterations': 907, 'learning_rate': 0.18325539987347003, 'lambda_l1': 1.7515990799195345, 'lambda_l2': 1.1256460383989682, 'num_leaves': 43, 'feature_fraction': 0.5390242272131383, 'bagging_fraction': 0.9367255981016527, 'bagging_freq': 2, 'min_child_samples': 74}
# Best score:  0.5300418884582024

param = {
    'objective': 'multiclass',
    'metric': 'multi_logloss',
    'num_class': 4,
    'num_iterations': 907,
    'learning_rate': 0.18325539987347003,
    'lambda_l1': 1.7515990799195345,
    'lambda_l2': 1.1256460383989682,
    'num_leaves': 43,
    'feature_fraction': 0.5390242272131383,
    'bagging_fraction': 0.9367255981016527,
    'bagging_freq': 2,
    'min_child_samples': 74,
    'random_state': 42,
    'verbose': -1
    }

verbose_eval = 0
lgb_model = lgb.train(param, dtrain, valid_sets=[dvalid],
                callbacks=[
        lgb.early_stopping(stopping_rounds=10, verbose=True),
        lgb.log_evaluation(verbose_eval)
    ])

# テストデータでの推論
lgb_preds = lgb_model.predict(df_test).astype(np.float32)
lgb_preds = np.argmax(lgb_preds, axis=1)

print(lgb_preds.shape)
print(lgb_preds)


Found `num_iterations` in params. Will use it instead of argument



Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[38]	valid_0's multi_logloss: 1.04423
(800,)
[3 2 3 1 2 2 0 3 2 1 1 2 1 1 3 3 1 2 0 1 2 3 2 3 1 1 3 1 3 2 0 0 1 2 0 2 0
 0 0 2 0 3 3 0 2 2 1 0 2 2 2 2 3 1 1 1 2 3 3 2 1 0 2 3 2 2 2 2 1 0 0 3 2 2
 3 1 0 0 2 1 1 0 0 2 2 3 2 0 3 0 2 2 2 3 1 2 3 0 2 2 0 1 0 2 2 0 3 2 3 1 2
 0 2 0 1 3 3 1 2 1 0 3 3 1 0 1 2 2 2 1 1 3 3 2 2 3 0 2 1 3 2 0 2 3 2 1 3 1
 2 0 2 0 2 2 3 0 2 2 1 2 2 3 1 1 2 3 0 3 1 1 2 3 3 3 2 1 2 1 2 2 1 2 0 0 2
 2 1 3 2 2 0 1 2 3 1 0 3 1 1 0 3 2 1 2 2 3 2 1 1 1 3 2 3 0 1 2 2 2 3 1 2 3
 1 1 2 1 1 2 1 1 3 1 2 0 1 3 2 2 2 3 2 0 0 0 3 0 0 2 3 0 0 3 2 1 3 2 0 1 2
 0 0 2 3 2 1 2 2 3 1 2 1 2 1 3 0 2 0 0 2 0 1 2 1 3 2 1 2 3 2 3 3 2 2 3 0 3
 2 3 2 1 1 2 2 0 1 2 2 1 0 3 0 1 1 2 1 1 1 2 0 1 2 1 2 3 0 2 1 1 1 2 1 3 2
 1 1 1 3 2 2 2 3 3 3 2 3 3 1 0 3 3 2 2 3 2 3 1 3 3 2 1 1 1 1 1 1 0 2 0 3 2
 1 0 3 2 2 1 1 3 3 1 1 2 3 1 3 2 1 2 3 2 1 3 2 0 3 2 3 2 3 2 0 0 3 0 1 3 3
 2 1 2 1 0 1 0 2 1 1 2 1 3 2 2 1 2

### 5. CatBoost

In [50]:
def objective(trial):

    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    param = {
        'loss_function': 'MultiClass',
        # "objective": trial.suggest_categorical("objective", ["Logloss", "CrossEntropy"]),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 1.0, log=True),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.1, log=True),
        "depth": trial.suggest_int("depth", 1, 12),
        "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
        "bootstrap_type": trial.suggest_categorical(
            "bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]
        ),
        "used_ram_limit": "3gb",
        "eval_metric": "Accuracy",
        "random_state": 42,
    }

    if param["bootstrap_type"] == "Bayesian":
        param["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
    elif param["bootstrap_type"] == "Bernoulli":
        param["subsample"] = trial.suggest_float("subsample", 0.1, 1, log=True)

    gbm = cb.CatBoostClassifier(**param)

    pruning_callback = CatBoostPruningCallback(trial, "Accuracy")
    gbm.fit(
        X_train,
        y_train,
        eval_set=[(X_valid, y_valid)],
        verbose=0,
        early_stopping_rounds=20,
        callbacks=[pruning_callback],
    )

    # evoke pruning manually.
    pruning_callback.check_pruned()

    preds = gbm.predict(X_valid)
    # pred_labels = np.rint(preds)
    # accuracy = accuracy_score(y_valid, pred_labels)
    # accuracy = accuracy_score(y_valid, preds)
    f1_macro = f1_score(y_valid, preds, average='macro')

    # return accuracy
    return f1_macro

In [51]:
study = optuna.create_study(
        pruner=optuna.pruners.MedianPruner(n_warmup_steps=5), direction="maximize",
    )
study.optimize(objective, n_trials=100, timeout=600, n_jobs=-1)

print("Number of finished trials: {}".format(len(study.trials)))

print("Best trial:")
trial = study.best_trial

print("  Value: {}".format(trial.value))

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

[I 2024-05-27 00:39:38,585] A new study created in memory with name: no-name-549e0cf4-56eb-4448-b442-01bad7eca2fe

CatBoostPruningCallback is experimental (supported from v3.0.0). The interface can change in the future.


CatBoostPruningCallback is experimental (supported from v3.0.0). The interface can change in the future.

[I 2024-05-27 00:39:38,958] Trial 1 finished with value: 0.12962962962962962 and parameters: {'learning_rate': 0.03215577985250393, 'colsample_bylevel': 0.014852591870153345, 'depth': 12, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bernoulli', 'subsample': 0.4114880133294928}. Best is trial 1 with value: 0.12962962962962962.

CatBoostPruningCallback is experimental (supported from v3.0.0). The interface can change in the future.

[I 2024-05-27 00:39:39,008] Trial 0 finished with value: 0.25952185792349725 and parameters: {'learning_rate': 0.0015661614489300793, 'colsample_bylevel': 0.04249741619347061, 'depth': 12, 'boosting_type': 'Ordered', 'bootstrap_type': 

Number of finished trials: 100
Best trial:
  Value: 0.5144574419815232
  Params: 
    learning_rate: 0.5740110283756216
    colsample_bylevel: 0.07233891150849797
    depth: 6
    boosting_type: Ordered
    bootstrap_type: Bayesian
    bagging_temperature: 1.9072492513603367


In [52]:
# 探索履歴のプロット
opt_history = vis.plot_optimization_history(study)
opt_history.show()

# ハイパーパラメータの重要度のプロット
param_importances = vis.plot_param_importances(study)
param_importances.show()

In [62]:
# ベストパラメータを使ってモデルの訓練

# Best trial: {learning_rate: 0.5740110283756216
    # colsample_bylevel: 0.07233891150849797
    # depth: 6
    # boosting_type: Ordered
    # bootstrap_type: Bayesian
    # bagging_temperature: 1.9072492513603367}
# Best score:  0.5144574419815232

param = {
        'loss_function': 'MultiClass',
        'learning_rate': 0.5740110283756216,
        "colsample_bylevel": 0.07233891150849797,
        "depth": 6,
        "boosting_type": "Ordered",
        "bootstrap_type": "Bayesian",
        "used_ram_limit": "3gb",
        "eval_metric": "Accuracy",
        "random_state": 42
        }

cat_model = cb.CatBoostClassifier(**param)

pruning_callback = CatBoostPruningCallback(trial, "Accuracy")
cat_model.fit(
    X_train,
    y_train,
    # X, y,
    eval_set=[(X_valid, y_valid)],
    verbose=2,
    early_stopping_rounds=20,
    callbacks=[pruning_callback],
)

# テストデータでの推論
cat_preds = np.reshape(cat_model.predict(df_test), -1)

print(cat_preds.shape)
print(cat_preds)


CatBoostPruningCallback is experimental (supported from v3.0.0). The interface can change in the future.



0:	learn: 0.4697917	test: 0.4375000	best: 0.4375000 (0)	total: 13.8ms	remaining: 13.7s
2:	learn: 0.4906250	test: 0.4583333	best: 0.4625000 (1)	total: 56ms	remaining: 18.6s
4:	learn: 0.4927083	test: 0.4708333	best: 0.4708333 (4)	total: 74.3ms	remaining: 14.8s
6:	learn: 0.5114583	test: 0.4458333	best: 0.4708333 (4)	total: 88ms	remaining: 12.5s
8:	learn: 0.5208333	test: 0.4458333	best: 0.4708333 (4)	total: 108ms	remaining: 11.9s
10:	learn: 0.5614583	test: 0.5083333	best: 0.5083333 (10)	total: 161ms	remaining: 14.5s
12:	learn: 0.5895833	test: 0.5125000	best: 0.5125000 (11)	total: 175ms	remaining: 13.3s
14:	learn: 0.6145833	test: 0.5166667	best: 0.5166667 (14)	total: 184ms	remaining: 12.1s
16:	learn: 0.6250000	test: 0.5041667	best: 0.5166667 (14)	total: 193ms	remaining: 11.2s
18:	learn: 0.6354167	test: 0.5041667	best: 0.5166667 (14)	total: 218ms	remaining: 11.2s
20:	learn: 0.6520833	test: 0.5291667	best: 0.5291667 (20)	total: 234ms	remaining: 10.9s
22:	learn: 0.6572917	test: 0.5166667	best:

## VotingClassifier を使ってアンサンブル学習

In [67]:
estimators = [('xgb', xgb_model), ('cat', cat_model)]

vote = VotingClassifier(
    estimators=estimators,
    voting="soft",
    n_jobs=-1
)

vote.fit(X_train, y_train)

y_pred = vote.predict(X_valid)

# 評価メトリックの計算
accuracy = accuracy_score(y_valid, y_pred)
f1_macro = f1_score(y_valid, y_pred, average='macro')

# 結果の表示
print('Accuracy:', accuracy)
print('F1 Macro:', f1_macro)

Accuracy: 0.5541666666666667
F1 Macro: 0.5156629410270187


## モデルの評価： テストデータを用いた推論

In [68]:
# テストデータを用いた推論

pred_test = vote.predict(df_test)

pred_test = pred_test.astype(int)  # float => int
print(pred_test[:20])

[0 0 3 1 2 1 0 1 1 2 2 2 3 1 2 3 1 2 0 3]


In [69]:
# submission ファイル作成

df_submission = pd.DataFrame({'id': df_test_id, 'price_range': pred_test})
df_submission.to_csv('submission.csv', index=False, header=None)