In [None]:
import os

import requests

# Fetch the Japan Meteorological Agency (JMA) forecast data as JSON.
# Official JMA endpoint, kept for reference:
# jma_url = "https://www.jma.go.jp/bosai/forecast/data/forecast/130000.json"
jma_url = "https://weather.tsukumijima.net/api/forecast/city/130010"
# The timeout keeps the cell from hanging on an unresponsive server, and
# raise_for_status() surfaces HTTP errors here instead of as a confusing
# JSON decoding failure on an error page.
jma_response = requests.get(jma_url, timeout=10)
jma_response.raise_for_status()
jma_json = jma_response.json()


In [None]:
jma_json

In [None]:
import pandas as pd

# File rows 0-2 and 4-5 are skipped, so row 3 is the first row read and
# becomes the header.
weather_df = pd.read_csv(
    "../data/weather_data.csv",
    encoding="shift-jis",
    skiprows=[0, 1, 2, 4, 5],
)

In [None]:
weather_df

In [None]:

# Source column -> short ASCII alias. The mapping order is also the column
# order of the resulting frame.
column_aliases = {
    "年月日": "date",
    "最高気温(℃)": "max_temp",
    "最低気温(℃)": "min_temp",
    "天気概況(昼：06時〜18時)": "weather",
}
weather_df = weather_df[list(column_aliases)].rename(columns=column_aliases)

In [None]:
weather_df["date"] = pd.to_datetime(weather_df["date"], format="%Y/%m/%d")

In [None]:
weather_df.head(n=3)

In [None]:
weather_df.info()

In [None]:
import zipfile

In [None]:
# Directory that holds the zipped power-usage CSV archives.
zip_dir = os.path.expanduser("../data/power_usage")
# Collects one {"date": ..., "max_power": ...} record per CSV file read.
result = []

In [None]:
# Start from an empty list so that re-running this cell rebuilds the records
# instead of appending duplicates to the previous run's results.
result = []

for zip_name in sorted(os.listdir(zip_dir)):
    if not zip_name.endswith(".zip"):
        continue

    zip_path = os.path.join(zip_dir, zip_name)

    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        for csv_filename in zip_ref.namelist():
            if not csv_filename.endswith(".csv"):
                continue

            with zip_ref.open(csv_filename) as csv_file:
                try:
                    # The first 54 lines of each CSV are skipped before parsing.
                    usage_df = pd.read_csv(csv_file, encoding="shift-jis", skiprows=54)
                    max_power = usage_df["当日実績(５分間隔値)(万kW)"].max()
                    result.append({
                        # Archive members may live in a sub-folder, so the
                        # date prefix is taken from the bare file name.
                        "date": os.path.basename(csv_filename).split("_")[0],
                        "max_power": max_power,
                    })
                except Exception as e:
                    # Best effort: report the unreadable file and carry on.
                    print(f"Error reading {csv_filename}: {e}")


In [None]:
# Build the frame from the collected records and parse the YYYYMMDD strings
# in "date" into datetimes.
power_usage_df = pd.DataFrame(result).assign(
    date=lambda frame: pd.to_datetime(frame["date"], format="%Y%m%d"),
)

In [None]:
# Inner join: keep only the dates present in both the weather and the
# power-usage frames.
integrated_df = weather_df.merge(power_usage_df, on="date", how="inner")

In [None]:
integrated_df

In [None]:
integrated_df["weather"].unique()

In [None]:
def weather_check(weather: str) -> str:
    """Classify a free-text weather description into a coarse category.

    Rules are checked in priority order and the first match wins:
    missing value, snow, thunder, clear/sunny, cloudy, rain, other.

    Args:
        weather: Raw weather description. A missing value (anything that
            ``pd.isna`` reports as missing) is accepted as well.

    Returns:
        str: Exactly one of "不明" (missing input), "雪", "雷雨",
            "晴れ(雷あり)", "曇り(雷あり)", "雷", "快晴", "晴れ時々曇り",
            "晴れ時々雨", "晴れ", "曇り時々雨", "曇り", "雨" or "その他".
    """
    if pd.isna(weather):
        return "不明"

    def mentions(*keywords: str) -> bool:
        """Return True when any of the keywords occurs in the description."""
        return any(keyword in weather for keyword in keywords)

    has_rain = mentions("雨", "あめ")
    has_sun = mentions("晴", "日射")
    has_cloud = mentions("曇", "くもり")

    # Snow takes precedence over everything else.
    if mentions("雪", "ゆき"):
        return "雪"

    # Thunder, refined by the accompanying condition.
    if "雷" in weather:
        if has_rain:
            return "雷雨"
        if has_sun:
            return "晴れ(雷あり)"
        if has_cloud:
            return "曇り(雷あり)"
        return "雷"

    # Clear / sunny.
    if "快晴" in weather:
        return "快晴"
    if has_sun:
        if has_cloud:
            return "晴れ時々曇り"
        # Thunder has already returned above, so only rain is left to test.
        if has_rain:
            return "晴れ時々雨"
        return "晴れ"

    # Cloudy.
    if has_cloud:
        return "曇り時々雨" if has_rain else "曇り"

    # Rain only.
    if has_rain:
        return "雨"

    # Anything that matched none of the keywords.
    return "その他"

# Replace the free-text "weather" column with its coarse category.
integrated_df = (
    integrated_df
    .assign(weather_category=integrated_df["weather"].apply(weather_check))
    .drop(columns=["weather"])
)

In [None]:
integrated_df["weather_category"].value_counts()

In [None]:
integrated_df

In [None]:
import holidays
import numpy as np

JP_HOLIDAY = holidays.Japan()

In [None]:
def make_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` extended with temperature and calendar features.

    Args:
        df: Frame containing a datetime ``date`` column and the numeric
            ``max_temp`` and ``min_temp`` columns. All other columns are
            carried over unchanged.

    Returns:
        pd.DataFrame: Copy of ``df`` with the columns ``avg``, ``rng``,
            ``cdd``, ``hdd``, ``hot``, ``cold``, ``year``, ``month``, ``day``,
            ``dow``, ``dow_sin``, ``dow_cos``, ``mon_sin``, ``mon_cos``,
            ``weekend`` and ``holiday`` added. The input frame is not modified.
    """
    features = df.copy()
    t_max = df["max_temp"]
    t_min = df["min_temp"]
    dates = df["date"]

    # Temperature features: daily mean and range, degree days around an
    # 18-degree base, and hot-day / cold-day indicator flags.
    features["avg"] = (t_max + t_min) / 2
    features["rng"] = t_max - t_min
    features["cdd"] = (features["avg"] - 18).clip(lower=0)
    features["hdd"] = (18 - features["avg"]).clip(lower=0)
    features["hot"] = (t_max >= 30).astype(int)
    features["cold"] = (t_min <= 5).astype(int)

    # Calendar features taken from the date column.
    for part in ("year", "month", "day"):
        features[part] = getattr(dates.dt, part)
    features["dow"] = dates.dt.weekday

    # Encode the weekly and the yearly cycle as sine / cosine pairs.
    for prefix, column, period in (("dow", "dow", 7), ("mon", "month", 12)):
        angle = 2 * np.pi * features[column] / period
        features[f"{prefix}_sin"] = np.sin(angle)
        features[f"{prefix}_cos"] = np.cos(angle)

    # Weekend flag: weekday numbers 5 and 6 (Saturday and Sunday).
    features["weekend"] = (features["dow"] >= 5).astype(int)
    # Holiday flag: 1 when the date is contained in JP_HOLIDAY.
    features["holiday"] = features["date"].apply(lambda day: int(day in JP_HOLIDAY))

    return features

In [None]:
from typing import Dict

from omegaconf import DictConfig, OmegaConf

from feature_encoder import FeatureEncoder

In [None]:
df = make_features(integrated_df)

In [None]:
encoders_dict = {}


In [None]:
# Load the configuration from a YAML file with OmegaConf.
config_path = "config.yaml"
config = OmegaConf.load(config_path)
# Sanity-check output: the top-level keys and the "encoders" entry
# ("Not found" is printed when that key is absent).
print("Config keys:", config.keys())
print("Encoders config:", config.get("encoders", "Not found"))

In [None]:
def encode_features(df: pd.DataFrame, config: DictConfig, encoders_dict: Dict[str, FeatureEncoder]):
    """Apply every encoder listed under ``config["encoders"]`` to ``df``.

    An encoder whose ``name`` is not yet in ``encoders_dict`` is created from
    its config entry, fitted with ``fit_transform`` and stored under that
    name. An encoder that is already stored is reused with ``transform``.

    Args:
        df: Frame containing the features to encode.
        config: Loaded configuration. When it has no ``"encoders"`` key the
            inputs are returned unchanged.
        encoders_dict: Encoders keyed by name. Updated in place with any
            newly created encoder.

    Returns:
        A ``(df, encoders_dict)`` tuple: the encoded frame and the same
        encoder dictionary that was passed in.
    """
    if "encoders" not in config:
        return df, encoders_dict

    for encoder_params in config["encoders"]:
        encoder_name = encoder_params["name"]

        if encoder_name in encoders_dict:
            # Already fitted earlier: only transform.
            df = encoders_dict[encoder_name].transform(df)
            continue

        # First time this encoder is seen: fit it and remember it.
        new_encoder = FeatureEncoder(**encoder_params)
        df = new_encoder.fit_transform(df)
        encoders_dict[encoder_name] = new_encoder

    return df, encoders_dict


In [None]:
df, encoders_dict = encode_features(df, config, encoders_dict)

In [None]:
df

In [None]:
import lightgbm as lgb

In [None]:
model = lgb.LGBMRegressor()

In [None]:
df.info()

In [None]:
# Time-based split: rows dated before 2024-10-01 train the model, rows from
# that date onwards are held out for testing.
df_train = df.loc[df["date"] < "2024-10-01"]
df_test = df.loc[df["date"] >= "2024-10-01"]

In [None]:
# Features are every column except the target ("max_power") and the raw
# "date"; the target is "max_power".
X_train, y_train = df_train.drop(columns=["max_power", "date"]), df_train["max_power"]
X_test, y_test = df_test.drop(columns=["max_power", "date"]), df_test["max_power"]

In [None]:
model.fit(X_train, y_train)

In [None]:
model.predict(X_test)

In [None]:
model.feature_name_

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error

In [None]:
mean_squared_error(y_test, model.predict(X_test))

In [None]:
def plot_feature_importance(
    model_importance: lgb.LGBMModel,
    plot_features: int,
    save_dir: str,
    save_name: str,
) -> None:
    """Plot the most important features of a model and save the figure.

    Args:
        model_importance (lgb.LGBMModel): Model whose ``feature_name_`` and
            ``feature_importances_`` attributes are plotted.
        plot_features (int): Number of top features to show.
        save_dir (str): Directory the figure is written to. It is created
            when it does not exist yet.
        save_name (str): File name of the saved figure. When it has no
            extension, matplotlib appends the one of its default format.
    """
    df_importance = pd.DataFrame(
        {
            "feature": model_importance.feature_name_,
            "importance": model_importance.feature_importances_,
        },
    ).sort_values("importance", ascending=False)

    # Half an inch of height per plotted feature keeps the bar labels readable.
    fig, ax = plt.subplots(figsize=(10, plot_features / 2))
    sns.barplot(
        x="importance",
        y="feature",
        data=df_importance.iloc[:plot_features],
        ax=ax,
    )
    fig.tight_layout()

    # Persist the figure; previously save_dir / save_name were accepted but
    # never used, so nothing was written to disk.
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)
    fig.savefig(os.path.join(save_dir, save_name))

In [None]:
plot_feature_importance(
    model,
    plot_features=20,
    save_dir="./feature_importance",
    save_name="feature_importance",
)