In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import optuna
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error

import pickle
import random

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
train_df = pd.read_csv('./output/中間データ/train_preprocessed.csv')
train_df.head()

Unnamed: 0,datetime,y,week,soldout,name,kcal,payday,weather,temperature,y_1lag,...,new_y,day,kcal_missing_flag,new_new_y,curry,popular,unpopular,fun,month,temp
0,2014-01-09,129.0,-0.932992,1,鶏チリソース,1.048743,0.0,-1.143733,-1.095479,1.794384,...,0.321153,-0.808939,0,0.634808,0,1,0,0,-1.458826,0.642341
1,2014-01-10,87.0,-0.908046,0,手作りロースカツ,1.217961,1.0,-0.012464,-1.763701,1.831072,...,-1.848836,-0.695212,0,-1.58719,0,1,1,0,-1.458826,-1.080469
2,2014-01-14,129.0,-0.073296,1,鶏の照り焼きマスタード,-0.948028,0.0,1.223188,-1.683054,0.290186,...,0.410879,-0.240304,0,0.443303,0,0,0,0,-1.458826,-0.872544
3,2014-01-15,134.0,0.220722,0,さんま辛味焼,1.556397,0.0,-1.143733,-2.017165,1.831072,...,0.718736,-0.126577,0,0.660572,0,1,0,0,-1.458826,-1.733949
4,2014-01-16,107.0,-0.932992,0,カレイ唐揚げ野菜あんかけ,0.371872,0.0,-0.012464,-1.544801,2.014511,...,-0.661561,-0.012851,0,-0.367739,0,0,1,0,-1.458826,-0.5161


In [3]:
X_train = train_df.drop(columns=['y', 'name', 'datetime', 'new_y', 'new_new_y'])
y_train = train_df['new_y']

In [4]:
# 再現性確保のための固定シード
SEED = 1234
random.seed(SEED)
np.random.seed(SEED)

# Optunaのシードも固定
sampler = optuna.samplers.TPESampler(seed=SEED)

# LightGBMにも同様に
base_params = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting_type': 'gbdt',
    'learning_rate': 0.1,
    'random_state': SEED,
    'verbosity': -1
}

In [5]:
tscv = TimeSeriesSplit(n_splits=5)

#### ハイパーパラメータチューニング

In [6]:
# STEP 1: 大きめのlearning_rateで n_estimators を最適化
def find_best_n_estimators(params, X, y):
    best_n = 100
    best_score = float('inf')
    for n in range(50, 1001, 50):
        scores = []
        for train_idx, val_idx in tscv.split(X):
            trial_params = params.copy()
            trial_params['n_estimators'] = n  # ← ここで上書き
            model = lgb.LGBMRegressor(**trial_params)
            model.fit(X.iloc[train_idx], y.iloc[train_idx])
            pred = model.predict(X.iloc[val_idx])
            score = np.sqrt(mean_squared_error(y.iloc[val_idx], pred))
            scores.append(score)
        avg_score = np.mean(scores)
        if avg_score < best_score:
            best_score = avg_score
            best_n = n
    return best_n

best_n1 = find_best_n_estimators(base_params, X_train, y_train)
base_params['n_estimators'] = best_n1

In [7]:
# STEP 2: 木のパラメータをチューニング
def objective_tree(trial):
    params = {
        **base_params,
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'num_leaves': trial.suggest_int('num_leaves', 20, 300),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0)
    }
    scores = []
    for train_idx, val_idx in tscv.split(X_train):
        model = lgb.LGBMRegressor(**params)
        model.fit(X_train.iloc[train_idx], y_train.iloc[train_idx])
        pred = model.predict(X_train.iloc[val_idx])
        score = np.sqrt(mean_squared_error(y_train.iloc[val_idx], pred))
        scores.append(score)
    return np.mean(scores)

study_tree = optuna.create_study(direction='minimize', sampler=sampler)
study_tree.optimize(objective_tree, n_trials=50)
base_params.update(study_tree.best_params)

[I 2025-05-06 19:39:46,944] A new study created in memory with name: no-name-a79c6d6d-7115-4605-806c-2a400142e908
[I 2025-05-06 19:39:46,988] Trial 0 finished with value: 1.0178883454785195 and parameters: {'max_depth': 5, 'num_leaves': 194, 'min_child_samples': 47, 'subsample': 0.8926792918568847, 'colsample_bytree': 0.8899879040594018}. Best is trial 0 with value: 1.0178883454785195.
[I 2025-05-06 19:39:47,003] Trial 1 finished with value: 1.0061720127662317 and parameters: {'max_depth': 6, 'num_leaves': 97, 'min_child_samples': 81, 'subsample': 0.9790696768418525, 'colsample_bytree': 0.9379663173710473}. Best is trial 1 with value: 1.0061720127662317.
[I 2025-05-06 19:39:47,026] Trial 2 finished with value: 1.0136356966466307 and parameters: {'max_depth': 7, 'num_leaves': 160, 'min_child_samples': 70, 'subsample': 0.85635101349145, 'colsample_bytree': 0.6851253773951975}. Best is trial 1 with value: 1.0061720127662317.
[I 2025-05-06 19:39:47,335] Trial 3 finished with value: 0.70102

In [8]:
# STEP 3: 正則化パラメータのチューニング
def objective_reg(trial):
    params = {
        **base_params,
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 5.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 5.0)
    }
    scores = []
    for train_idx, val_idx in tscv.split(X_train):
        model = lgb.LGBMRegressor(**params)
        model.fit(X_train.iloc[train_idx], y_train.iloc[train_idx])
        pred = model.predict(X_train.iloc[val_idx])
        score = np.sqrt(mean_squared_error(y_train.iloc[val_idx], pred))
        scores.append(score)
    return np.mean(scores)

study_reg = optuna.create_study(direction='minimize', sampler=sampler)
study_reg.optimize(objective_reg, n_trials=50)
base_params.update(study_reg.best_params)

[I 2025-05-06 19:39:54,149] A new study created in memory with name: no-name-d5b599bb-74cd-418c-815d-8ed7ddf46c1e
[I 2025-05-06 19:39:54,443] Trial 0 finished with value: 0.7007850526172368 and parameters: {'reg_alpha': 0.5594715878720191, 'reg_lambda': 3.035968531092423}. Best is trial 0 with value: 0.7007850526172368.
[I 2025-05-06 19:39:54,652] Trial 1 finished with value: 0.7312543496974617 and parameters: {'reg_alpha': 2.829723215252657, 'reg_lambda': 0.03382030995001395}. Best is trial 0 with value: 0.7007850526172368.
[I 2025-05-06 19:39:54,857] Trial 2 finished with value: 0.7438205906510265 and parameters: {'reg_alpha': 3.087208544021485, 'reg_lambda': 4.560614432165772}. Best is trial 0 with value: 0.7007850526172368.
[I 2025-05-06 19:39:55,038] Trial 3 finished with value: 0.7554311885306638 and parameters: {'reg_alpha': 3.952620665285167, 'reg_lambda': 4.960407330941807}. Best is trial 0 with value: 0.7007850526172368.
[I 2025-05-06 19:39:55,195] Trial 4 finished with value

In [9]:
# STEP 4: learning_rateを下げてn_estimators再最適化
base_params['learning_rate'] = 0.01
best_n2 = find_best_n_estimators(base_params, X_train, y_train)
base_params['n_estimators'] = best_n2

#### 全データで学習

In [10]:
# モデル全体で再学習
final_model = lgb.LGBMRegressor(**base_params)
final_model.fit(X_train, y_train)

# 最終モデルのトレーニングRMSEを計算
y_train_pred = final_model.predict(X_train)
final_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))

print(f"LightGBMのトレーニングRMSE: {final_rmse:.4f} ")

LightGBMのトレーニングRMSE: 0.1874 


In [11]:
# モデルを保存
with open('./output/モデル/lgb_model.pkl', 'wb') as f:
    pickle.dump(final_model, f)