<a href="https://colab.research.google.com/github/sousci/myColab_GREEN-DATA-Challenge-2025/blob/main/20250602.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

ランタイム: Colab の A100 GPU を使用

## 🔰 Step 0. Colab用セットアップ

In [None]:
!pip install lightgbm



## 📥 Step 1. データの読み込みと前処理

In [None]:
import pandas as pd
import numpy as np

# Load the competition CSV files from the project folder.
# NOTE(review): the path is under /content/drive, so this presumably expects
# Google Drive to be mounted already — no mount call is visible here; confirm.
PATH = '/content/drive/MyDrive/SIGNATE/SMBC_Group_GREENDATA_Challenge_2025/'
train, test, feature_desc = (
    pd.read_csv(PATH + csv_name)
    for csv_name in ('train.csv', 'test.csv', 'feature_description.csv')
)

# 前処理関数定義
def preprocess_data(df, is_train=True):
    """Turn a raw frame into a model-ready feature matrix.

    The input is copied, so the caller's frame is left untouched. The 'time'
    column is parsed as UTC and expanded into 'hour', 'dayofweek', 'month'
    and a boolean 'is_weekend' (dayofweek >= 5). Every remaining
    object-dtype column is one-hot encoded with ``pd.get_dummies``.

    Args:
        df: Frame with a 'time' column; must also contain 'price_actual'
            when ``is_train`` is true.
        is_train: When true, split off the target column.

    Returns:
        ``(X, y)`` when ``is_train`` is true, where ``y`` is 'price_actual'
        and ``X`` has 'time' and 'price_actual' removed; otherwise only ``X``
        with 'time' removed.
    """
    frame = df.copy()

    # Parse timestamps (normalised to UTC) and derive calendar features
    frame['time'] = pd.to_datetime(frame['time'], utc=True)
    stamps = frame['time'].dt
    frame['hour'] = stamps.hour
    frame['dayofweek'] = stamps.dayofweek
    frame['month'] = stamps.month
    frame['is_weekend'] = frame['dayofweek'] >= 5

    # One-hot encode the categorical (object-dtype) columns, never 'time'
    object_cols = [
        name
        for name in frame.select_dtypes(include='object').columns.tolist()
        if name != 'time'
    ]
    frame = pd.get_dummies(frame, columns=object_cols)

    # Test data has no target: return the features only
    if not is_train:
        return frame.drop(columns=['time'])

    target = frame['price_actual']
    features = frame.drop(columns=['time', 'price_actual'])
    return features, target

# Run preprocessing: the test frame yields features only, the training frame
# yields a (features, target) pair. The two calls are independent of each other.
X_test = preprocess_data(test, is_train=False)
X_train, y_train = preprocess_data(train)

## ⚙️ Step 2. 特徴量の整合性（列の追加）

In [None]:
# Align the test feature matrix with the training feature matrix.
# Train and test are one-hot encoded separately, so their dummy columns can differ.
# missing_cols is kept for inspection: training columns absent from the test frame.
missing_cols = set(X_train.columns) - set(X_test.columns)

# reindex() adds every missing training column filled with 0, drops columns that
# exist only in the test frame, and puts the columns in training order — all in
# one step, without fragmenting the frame through repeated single-column inserts.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

## 📊 Step 3. モデルの学習と評価（LightGBM + 時系列CV）

In [None]:
from lightgbm import LGBMRegressor
from lightgbm import early_stopping, log_evaluation
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import TimeSeriesSplit

In [None]:
# Base estimator: up to 1000 boosting rounds at learning rate 0.05, fixed seed.
# The early-stopping callback in the loop below may end training sooner.
model = LGBMRegressor(
    n_estimators=1000,
    learning_rate=0.05,
    random_state=42
)

# 3-fold time-series cross-validation.
# NOTE(review): TimeSeriesSplit splits by row position, so this assumes the rows
# of X_train are in chronological order — confirm against train.csv.
tscv = TimeSeriesSplit(n_splits=3)
val_scores = []  # one MAE value per fold

for fold, (train_idx, val_idx) in enumerate(tscv.split(X_train)):
    # Positional split of features and target for this fold
    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

    # The same `model` object is refit on every fold, so once the loop ends it
    # holds the fit from the last fold only.
    # Stop when the validation score has not improved for 50 rounds; log every 100 rounds.
    model.fit(
        X_tr, y_tr,
        eval_set=[(X_val, y_val)],
        callbacks=[early_stopping(stopping_rounds=50), log_evaluation(100)]
    )

    # Score the fold on its validation rows with mean absolute error
    y_pred = model.predict(X_val)
    mae = mean_absolute_error(y_val, y_pred)
    val_scores.append(mae)
    print(f"Fold {fold+1} MAE: {mae:.4f}")

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002919 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10166
[LightGBM] [Info] Number of data points in the train set: 6570, number of used features: 231
[LightGBM] [Info] Start training from score 61.611324
Training until validation scores don't improve for 50 rounds
[100]	valid_0's l2: 302.737
[200]	valid_0's l2: 299.851
Early stopping, best iteration is:
[183]	valid_0's l2: 299.064
Fold 1 MAE: 14.6870
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002229 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 10512
[LightGBM] [Info] Number of data points in the train set: 13140, number of used features: 267
[LightGBM] [Info] Start training from score 53.824721
Training until validation scores don't improve 

## 📤 Step 4. 予測と提出ファイルの作成

In [None]:
# Refit on the full training set before predicting.
# At this point `model` holds only the fit from the last cross-validation fold,
# so it was never trained on the rows of its own validation window. A fresh
# estimator with the same hyper-parameters is trained on every training row,
# with the number of boosting rounds capped at the round that early stopping
# selected for that last fold.
best_iter = model.best_iteration_
final_params = model.get_params()
if best_iter and best_iter > 0:
    # Only override when early stopping actually recorded a best round
    final_params['n_estimators'] = best_iter
final_model = LGBMRegressor(**final_params)
final_model.fit(X_train, y_train)

# Predict the test set
y_test_pred = final_model.predict(X_test)

# Build the submission frame: original timestamp strings plus predictions
submit = pd.DataFrame({
    "time": test['time'],
    "0": y_test_pred
})

In [None]:
# prompt: # 今日の日付と時刻をYYYYMMDDhhmm形式で、'submission_YYYYMMDDhhmm.csv'という名前で保存

from datetime import datetime

# Capture the current date and time
# NOTE(review): datetime.now() uses the runtime's local clock — confirm the
# timezone if the stamp in the file name matters.
now = datetime.now()

# Build the file name as submission_YYYYMMDDhhmm.csv
filename = f'submission_{now:%Y%m%d%H%M}.csv'

# Write the submission file without an index column or a header row
submit.to_csv(PATH + filename, index=False, header=False)

## ✅ モデル評価コード（RMSE）

In [None]:
from sklearn.metrics import mean_squared_error
import numpy as np

# Validation scores: the per-fold MAE values are already stored in val_scores.
# To evaluate RMSE, run TimeSeriesSplit again and repeat the fit / predict / score cycle.
# NOTE(review): every model.fit call below retrains the shared `model` object,
# so after this cell `model` holds the fit from the last fold.

tscv = TimeSeriesSplit(n_splits=3)
rmse_scores = []  # one RMSE value per fold

for fold, (train_idx, val_idx) in enumerate(tscv.split(X_train)):
    # Positional split of features and target for this fold
    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

    # Stop when the validation score has not improved for 50 rounds; log every 100 rounds
    model.fit(
        X_tr, y_tr,
        eval_set=[(X_val, y_val)],
        callbacks=[early_stopping(stopping_rounds=50), log_evaluation(100)]
    )

    # RMSE is the square root of the mean squared error on the validation rows
    y_pred = model.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    rmse_scores.append(rmse)
    print(f"Fold {fold+1} RMSE: {rmse:.4f}")

# Mean RMSE across all folds
print(f"\n✅ 平均RMSE: {np.mean(rmse_scores):.4f}")


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004166 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10166
[LightGBM] [Info] Number of data points in the train set: 6570, number of used features: 231
[LightGBM] [Info] Start training from score 61.611324
Training until validation scores don't improve for 50 rounds
[100]	valid_0's l2: 302.737
[200]	valid_0's l2: 299.851
Early stopping, best iteration is:
[183]	valid_0's l2: 299.064
Fold 1 RMSE: 17.2935
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008799 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10512
[LightGBM] [Info] Number of data points in the train set: 13140, number of used features: 267
[LightGBM] [Info] Start training from score 53.824721
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	valid_0's 