In [1]:
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor, Pool
import optuna
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error

import pickle

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
train_df = pd.read_csv('./output/中間データ/train_preprocessed.csv')
train_df.head()

Unnamed: 0,new_y,popular,temperature,kcal,unpopular,temp,day,week,weather,curry,soldout,grilled,fun,stewed,other_country,fried,chicken
0,-7.961051000000001e-17,0,-0.619215,-0.325465,1,1.027475,-0.358541,1.519216,0.999611,0,1,1.0,0,0.0,0.0,0.0,1.0
1,-0.0533462,0,-1.613037,0.721475,1,-0.819978,0.440438,-0.862925,-1.054742,0,1,1.0,0,0.0,0.0,0.0,1.0
2,-1.01393,0,0.409683,-2.097211,1,1.457316,1.353556,0.643387,0.999611,0,0,1.0,0,0.0,0.0,0.0,1.0
3,-0.8899836,0,0.012154,-0.050308,1,-0.992503,-0.130261,-0.862925,-1.054742,0,1,1.0,0,0.0,0.0,0.0,1.0
4,-0.4561716,0,0.245995,1.191256,1,-1.07512,-0.472681,-0.862925,-0.797948,0,1,1.0,0,0.0,0.0,0.0,1.0


In [3]:
X_train = train_df.drop(columns=['new_y'])
y_train = train_df['new_y']

In [4]:
# 時系列CV
tscv = TimeSeriesSplit(n_splits=5)

# Optunaによるハイパーパラメータ探索
def objective(trial):
    params = {
        'depth': trial.suggest_int('depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-2, 10.0, log=True),
        'random_strength': trial.suggest_float('random_strength', 0.0, 1.0),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.0, 1.0),
        'iterations': 1000,
        'early_stopping_rounds': 50,
        'loss_function': 'RMSE',
        'verbose': 0,
        'random_seed': 42
    }

    val_scores = []

    for train_idx, valid_idx in tscv.split(X_train):
        X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[valid_idx]
        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[valid_idx]

        train_pool = Pool(X_tr, y_tr)
        valid_pool = Pool(X_val, y_val)

        model = CatBoostRegressor(**params)
        model.fit(train_pool, eval_set=valid_pool)
        
        y_pred = model.predict(X_val)
        score = np.sqrt(mean_squared_error(y_val, y_pred))
        val_scores.append(score)

    return np.mean(val_scores)

In [5]:
# Optuna実行
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)

# ベストパラメータ
best_params = study.best_trial.params
best_params.update({
    'iterations': 1000,
    'loss_function': 'RMSE',
    'verbose': 0,
    'random_seed': 1234
})

# 最終モデルで全データ学習
final_model = CatBoostRegressor(**best_params)
final_model.fit(X_train, y_train)

# トレーニングRMSE計算
y_train_pred = final_model.predict(X_train)
final_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
print(f"CatBoostのトレーニングRMSE: {final_rmse:.4f}")

[I 2025-05-18 20:49:46,816] A new study created in memory with name: no-name-08595db1-e2f7-49a9-b2d5-34b5c237f848
[I 2025-05-18 20:49:47,081] Trial 0 finished with value: 0.4805781502725354 and parameters: {'depth': 4, 'learning_rate': 0.25161070656892526, 'l2_leaf_reg': 4.441016027768339, 'random_strength': 0.5612400896019003, 'bagging_temperature': 0.08782988793788027}. Best is trial 0 with value: 0.4805781502725354.
[I 2025-05-18 20:49:47,369] Trial 1 finished with value: 0.48154647253505534 and parameters: {'depth': 5, 'learning_rate': 0.09509163135719695, 'l2_leaf_reg': 0.023000788123409433, 'random_strength': 0.16018838794180423, 'bagging_temperature': 0.64447262247155}. Best is trial 0 with value: 0.4805781502725354.
[I 2025-05-18 20:49:48,380] Trial 2 finished with value: 0.5409223254857698 and parameters: {'depth': 7, 'learning_rate': 0.1221603610395405, 'l2_leaf_reg': 0.13230468482574664, 'random_strength': 0.46412474286185557, 'bagging_temperature': 0.8529270721236218}. Best

CatBoostのトレーニングRMSE: 0.0175


In [6]:
# 保存用のディクショナリにまとめる
model_package = {
    'model': final_model,
    'feature_name': X_train.columns.tolist()
}

# 保存
with open('./output/モデル/cat_model.pkl', 'wb') as f:
    pickle.dump(model_package, f)