In [9]:
# gonna be deleted

In [1]:
from pathlib import Path
from dataclasses import dataclass
import sys

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


@dataclass
class Config:
    REPO_ROOT: Path = Path("../../")
    data_dir: Path = REPO_ROOT / "data"


config = Config()

sys.path.append(str(config.REPO_ROOT))

In [2]:
from python.src.preprocess import (
    preprocess_bike_data,
)

df_bike = pd.read_csv(config.data_dir / "bike+sharing+dataset" / "day.csv")
df_bike_processed = preprocess_bike_data(df=df_bike)
assert df_bike.shape[0] == df_bike_processed.shape[0]

In [4]:
df_bike_processed

Unnamed: 0,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,hum,windspeed,cnt,days_since_2011
0,WINTER,2011,JAN,NO HOLIDAY,SAT,NO WORKING DAY,MISTY,8.0,81.0,11.0,985,0
1,WINTER,2011,JAN,NO HOLIDAY,SUN,NO WORKING DAY,MISTY,9.0,70.0,17.0,801,1
2,WINTER,2011,JAN,NO HOLIDAY,MON,WORKING DAY,GOOD,1.0,44.0,17.0,1349,2
3,WINTER,2011,JAN,NO HOLIDAY,TUE,WORKING DAY,GOOD,1.0,59.0,11.0,1562,3
4,WINTER,2011,JAN,NO HOLIDAY,WED,WORKING DAY,GOOD,3.0,44.0,13.0,1600,4
...,...,...,...,...,...,...,...,...,...,...,...,...
726,WINTER,2012,DEC,NO HOLIDAY,THU,WORKING DAY,MISTY,4.0,65.0,23.0,2114,726
727,WINTER,2012,DEC,NO HOLIDAY,FRI,WORKING DAY,MISTY,4.0,59.0,10.0,3095,727
728,WINTER,2012,DEC,NO HOLIDAY,SAT,NO WORKING DAY,MISTY,4.0,75.0,8.0,1341,728
729,WINTER,2012,DEC,NO HOLIDAY,SUN,NO WORKING DAY,GOOD,4.0,48.0,24.0,1796,729


In [8]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LassoCV
from sklearn.preprocessing import OneHotEncoder, KBinsDiscretizer, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin


def rulefit(X, y, max_depth=2, n_estimators=100, cv=5, random_state=None):
    """RuleFitアルゴリズムの実装

    Args:
        X (pd.DataFrame): 説明変数
        y (pd.Series): 目的変数
        max_depth (int, optional): 決定木の深さの最大値. Defaults to 2.
        n_estimators (int, optional): 決定木の数. Defaults to 100.
        cv (int, optional): LassoCVのクロスバリデーションのfold数. Defaults to 5.
        random_state (int, optional): 乱数シード. Defaults to None.

    Returns:
        tuple: best_feature, best_rules
    """

    # 決定木モデルの学習
    gb = GradientBoostingRegressor(
        max_depth=max_depth, n_estimators=n_estimators, random_state=random_state
    )
    gb.fit(X, y)

    # 決定木のルールを抽出
    rules = []
    for tree in gb.estimators_:
        for rule in extract_rules_from_tree(tree[0], X.columns):
            rules.append(rule)

    # One-hotエンコーディングでルールを特徴量に変換
    rules_df = (
        pd.DataFrame(rules).fillna("null").astype(str)
    )  # null値を文字列"null"に変換
    ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
    encoded_rules = ohe.fit_transform(rules_df)
    encoded_rules_df = pd.DataFrame(encoded_rules, columns=ohe.get_feature_names_out())

    # 説明変数とルール特徴量を結合
    X_rules = pd.concat([X, encoded_rules_df], axis=1).astype(float)

    # LassoCVで特徴量選択と係数推定
    lasso = LassoCV(cv=cv, random_state=random_state)
    lasso.fit(X_rules, y)

    # 重みとルールを格納するデータフレームを作成
    coefs = pd.DataFrame({"coefficient": lasso.coef_})
    coefs["description"] = X_rules.columns
    coefs["importance"] = np.abs(lasso.coef_)
    coefs = coefs[coefs["importance"] > 0]
    coefs = coefs.sort_values("importance", ascending=False)

    return coefs


def extract_rules_from_tree(tree, feature_names):
    """決定木からルールを抽出する関数"""
    left = tree.tree_.children_left
    right = tree.tree_.children_right
    threshold = tree.tree_.threshold
    features = [feature_names[i] for i in tree.tree_.feature]
    value = tree.tree_.value

    def recurse(left, right, child, lineage=None):
        if lineage is None:
            lineage = []
        if child in left:
            parent = np.where(left == child)[0].item()
            split = "<= {}".format(threshold[parent])
        else:
            parent = np.where(right == child)[0].item()
            split = "> {}".format(threshold[parent])

        lineage.append((features[parent], split))

        if child in left or child in right:
            if child in left:
                recurse(left, right, left[child], lineage=lineage)
            if child in right:
                recurse(left, right, right[child], lineage=lineage)
        return lineage

    rules = []

    for i in range(len(left)):
        if left[i] != -1 or right[i] != -1:  # 内部ノードの場合
            rule = []
            rule.extend(recurse(left, right, left[i]))
            rule.extend(recurse(left, right, right[i]))
            rules.append(rule)
    return rules


class RuleFitTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, max_depth=2, n_estimators=100, cv=5, random_state=None):
        self.max_depth = max_depth
        self.n_estimators = n_estimators
        self.cv = cv
        self.random_state = random_state

    def fit(self, X, y):
        self.coefs_ = rulefit(
            X, y, self.max_depth, self.n_estimators, self.cv, self.random_state
        )
        return self

    def transform(self, X):
        return X

    def fit_transform(self, X, y=None, **fit_params):
        if y is None:
            raise ValueError("y cannot be None for fit_transform")
        self.fit(X, y)
        return self.coefs_


# bikeデータの前処理 (Rコードと同様)
# df_bike_processedが定義されていると仮定
df_bike_processed["temp"] = df_bike_processed["temp"].round(0)
df_bike_processed["hum"] = df_bike_processed["hum"].round(0)
df_bike_processed["windspeed"] = df_bike_processed["windspeed"].round(0)

# 説明変数と目的変数を定義
X = df_bike_processed[
    [
        "season",
        "yr",
        "mnth",
        "holiday",
        "weekday",
        "workingday",
        "weathersit",
        "temp",
        "hum",
        "windspeed",
    ]
]
y = df_bike_processed["cnt"]

# 数値変数とカテゴリ変数を区別
numerical_features = ["temp", "hum", "windspeed"]
categorical_features = [
    "season",
    "yr",
    "mnth",
    "holiday",
    "weekday",
    "workingday",
    "weathersit",
]

# ColumnTransformerで数値変数とカテゴリ変数に異なる前処理を適用
preprocessor = ColumnTransformer(
    transformers=[
        (
            "num",
            KBinsDiscretizer(n_bins=5, encode="ordinal", strategy="quantile"),
            numerical_features,
        ),
        (
            "cat",
            OneHotEncoder(handle_unknown="ignore", sparse_output=False),
            categorical_features,
        ),
    ],
    remainder="passthrough",  # その他の特徴量はそのまま通過させる
)

# RuleFitTransformerのインスタンスを作成
rulefit_transformer = RuleFitTransformer(random_state=0)

# パイプラインの作成 (RuleFitTransformerは含まない)
preprocessor_pipeline = make_pipeline(
    preprocessor, FunctionTransformer(lambda x: pd.DataFrame(x))
)

# データを前処理
X_transformed = preprocessor_pipeline.fit_transform(X)

# RuleFitを適用
coefs = rulefit_transformer.fit_transform(X_transformed, y)

# 結果を表示 (上位5件)
print(coefs.head(5))

ValueError: can only convert an array of size 1 to a Python scalar