In [1]:
import logging
import sys
import warnings
from pathlib import Path

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, KFold

pd.set_option('display.max_columns', 100)
logging.basicConfig(level=logging.ERROR)
warnings.filterwarnings("ignore", category=UserWarning)

# 変数

In [2]:
# Run identifiers: together they pick the preprocessed input CSVs and name the output CSV.
ML = "ML1"              # model-run label, used in the output file name
preprocess_num = "P2"   # preprocessing-variant label, prefix of the input and output file names

# csvをimportする

In [3]:
# Load the preprocessed training set and discard the "id" column before modelling.
train_csv = f"../preprocess_results/{preprocess_num}_train.csv"
train = pd.read_csv(train_csv, sep=',').drop(columns=["id"])
train.head()

Unnamed: 0,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg),Price
0,1,1,1,7.0,1,0,2,0,11.611723,112.15875
1,1,0,2,10.0,1,1,1,3,27.078537,68.88056
2,4,1,2,2.0,1,0,1,5,16.64376,39.1732
3,2,2,2,8.0,1,0,1,3,12.93722,80.60793
4,0,0,1,1.0,1,1,1,3,17.749338,86.02312


In [4]:
# Load the preprocessed test set, dropping the "id" and "Price" columns it carries.
test_csv = f"../preprocess_results/{preprocess_num}_test.csv"
test = pd.read_csv(test_csv, sep=',').drop(columns=["id", "Price"])
test.head()

Unnamed: 0,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg)
0,3,1,2,2.0,0,0,2,3,20.671147
1,2,0,1,7.0,0,1,0,3,13.564105
2,0,0,0,9.0,0,1,1,1,11.809799
3,0,2,0,1.0,1,0,1,3,18.477036
4,5,2,0,2.0,1,1,2,0,9.907953


In [5]:
# Load the submission template; its "Price" column is overwritten with predictions later.
# (Plain string literal: the path has no placeholders, so no f-string is needed.)
sample_submission = pd.read_csv("../data/sample_submission.csv", sep=',')
sample_submission.head()

Unnamed: 0,id,Price
0,300000,81.411
1,300001,81.411
2,300002,81.411
3,300003,81.411
4,300004,81.411


# 学習データとテストデータに分割

In [6]:
# Separate the target from the features, then hold out 20% of the rows as a local test set.
y = train["Price"]
X = train.drop(columns=['Price'])
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
)

# パラメーターチューニング

In [7]:
def objective(trial):
    """Optuna objective: mean 5-fold CV RMSE of a LightGBM regressor.

    Hyperparameters are sampled from ``trial``; each fold of ``X_train`` /
    ``y_train`` trains a fresh ``LGBMRegressor`` with early stopping on the
    fold's validation part.

    Args:
        trial: The ``optuna.trial.Trial`` used to sample hyperparameters and
            to receive intermediate scores for pruning.

    Returns:
        float: Mean validation RMSE over the five folds (lower is better).

    Raises:
        optuna.TrialPruned: Raised by the pruning callback when the study's
            pruner decides to stop the trial.
    """
    params = {
        "objective": "regression",
        "metric": "rmse",
        "boosting_type": "gbdt",
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform,
        # and plain suggest_float replaces the deprecated suggest_uniform.
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 20, 300),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        # NOTE(review): LightGBM documents that row bagging only takes effect when
        # bagging_freq (subsample_freq) is non-zero; it is not set here, so this
        # parameter may currently be a no-op -- confirm and set it consistently
        # here and in the final model if bagging is intended.
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 10.0, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 10.0, log=True),
        "n_estimators": 1000,
        "early_stopping_rounds": 10,
        "verbose": 0,
    }

    # Cross-validation
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    rmse_list = []

    for fold, (train_idx, valid_idx) in enumerate(kf.split(X_train)):
        X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[valid_idx]
        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[valid_idx]

        # A trial stores a single intermediate value per step and ignores repeated
        # steps (with a warning). Attaching the callback in every fold would mix
        # values from different folds, so only the first fold reports for pruning.
        callbacks = None
        if fold == 0:
            callbacks = [optuna.integration.LightGBMPruningCallback(trial, "rmse")]

        # Train the model for this fold
        model = lgb.LGBMRegressor(**params)
        model.fit(
            X_tr, y_tr,
            eval_set=[(X_val, y_val)],
            eval_metric="rmse",
            callbacks=callbacks,
        )

        # Predict on the validation part
        y_pred = model.predict(X_val)

        # RMSE computed as sqrt(MSE): the `squared=False` argument is deprecated
        # and removed in newer scikit-learn releases.
        rmse = np.sqrt(mean_squared_error(y_val, y_pred))
        rmse_list.append(rmse)

    return float(np.mean(rmse_list))

In [None]:
%%time
# Optunaで最適化
study = optuna.create_study(
    direction="minimize", 
    sampler=optuna.samplers.TPESampler(), 
    pruner=optuna.pruners.MedianPruner(n_warmup_steps=10)
)
study.optimize(objective, n_trials=50)

# 最適なパラメータ
best_params = study.best_params

# 最適化履歴を可視化
optuna.visualization.matplotlib.plot_optimization_history(study)
plt.show()

[I 2025-02-19 02:29:03,065] A new study created in memory with name: no-name-81a434ac-04a1-4546-bfe4-d33bbccd4ece
  "learning_rate": trial.suggest_loguniform("learning_rate", 0.01, 0.2),
  "subsample": trial.suggest_uniform("subsample", 0.5, 1.0),
  "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.5, 1.0),
  "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-8, 10.0),
  "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-8, 10.0),




















[I 2025-02-19 02:32:45,801] Trial 0 finished with value: 38.8819182920637 and parameters: {'learning_rate': 0.1039522958368663, 'num_leaves': 140, 'max_depth': 5, 'min_child_samples': 46, 'subsample': 0.5823760376485578, 'colsample_bytree': 0.8396721622553416, 'reg_alpha': 2.7331712807145025e-05, 'reg_lambda': 0.0035114721408573877}. Best is trial 0 with value: 38.8819182920637.
  "learning_rate": trial.suggest_loguniform("learning_rate", 0.01, 0.2),
  "subsample": trial.suggest_uniform("subsample", 0.5, 1.0),
  "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.5, 1.0),
  "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-8, 10.0),
  "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-8, 10.0),












In [None]:
# Show the hyperparameters selected by the study.
print(f"Best Parameters: {best_params}")

# GBDT

In [None]:
# Train with the tuned hyperparameters.
# study.best_params only holds the values sampled inside `objective`; the fixed
# settings used during tuning (1000 trees with early stopping, objective, metric,
# verbosity) must be restored, otherwise the model falls back to library defaults
# and no longer matches the configuration that was tuned.
final_params = {
    "objective": "regression",
    "metric": "rmse",
    "boosting_type": "gbdt",
    "n_estimators": 1000,
    "early_stopping_rounds": 10,
    "verbose": 0,
    **best_params,
}
best_model = lgb.LGBMRegressor(**final_params)
print(f"best_model: {best_model}")
kf = KFold(n_splits=5, shuffle=True, random_state=42)
oof_preds = np.zeros(len(X_train))

for train_idx, valid_idx in kf.split(X_train):
    # Positional indexing, because KFold yields positions rather than index labels.
    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[valid_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[valid_idx]

    # The validation part doubles as the early-stopping evaluation set.
    # NOTE: the same estimator object is refit each iteration, so after the loop
    # `best_model` holds the fit from the last fold only.
    best_model.fit(
        X_tr, y_tr,
        eval_set=[(X_val, y_val)],
        eval_metric="rmse",
    )

    oof_preds[valid_idx] = best_model.predict(X_val)

# Out-of-fold score; sqrt(MSE) is used because `squared=False` is deprecated
# and removed in newer scikit-learn releases.
rmse_oof = np.sqrt(mean_squared_error(y_train, oof_preds))
print(f"OOF RMSE: {rmse_oof:.4f}")

In [None]:
# Evaluate on the held-out test split.
y_test_pred = best_model.predict(X_test)
# sqrt(MSE) instead of `squared=False`, which is deprecated and removed in newer scikit-learn.
rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))
print(f"Test RMSE: {rmse_test:.4f}")

In [None]:
# Predict on the production (competition test) data.
# Select the columns by name in the training order so a differently ordered or
# wider frame cannot silently feed features to the model in the wrong positions;
# a missing feature raises a KeyError instead.
y_prod_pred = best_model.predict(test[X_train.columns])

# csvをmodel_resultsに作成

In [None]:
# Put the predictions into the submission template and preview the result.
sample_submission["Price"] = y_prod_pred
display(sample_submission.head())

# Write the submission; create the output directory first so a fresh checkout
# without ../model_results does not fail at to_csv.
path = f"../model_results/{preprocess_num}_{ML}.csv"
Path(path).parent.mkdir(parents=True, exist_ok=True)
sample_submission.to_csv(path, index=False)