In [26]:
# TODO: this cell is going to be deleted.

In [1]:
from pathlib import Path
from dataclasses import dataclass
import sys

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


@dataclass
class Config:
    """Filesystem locations used by the notebook.

    ``data_dir`` defaults to ``REPO_ROOT / "data"``. It is resolved per
    instance, so overriding ``REPO_ROOT`` moves the data directory with it;
    an explicitly passed ``data_dir`` is kept as given.
    """

    # Repository root, given as a relative path.
    REPO_ROOT: Path = Path("../../")
    # Data directory; None means "derive from REPO_ROOT" (see __post_init__).
    data_dir: "Path | None" = None

    def __post_init__(self):
        # Derive here rather than in the class body: a class-level default
        # would be frozen to the default REPO_ROOT and ignore overrides.
        if self.data_dir is None:
            self.data_dir = self.REPO_ROOT / "data"


config = Config()

# Make the repository root importable. Skip the append when the entry is
# already present so re-running this cell does not pile up duplicates.
repo_root_entry = str(config.REPO_ROOT)
if repo_root_entry not in sys.path:
    sys.path.append(repo_root_entry)

In [3]:
from python.src.preprocess import (
    preprocess_rfcc,
)

# Read the raw risk-factor table, then run the project's preprocessing on it.
rfcc_csv_path = config.data_dir / "risk_factors_cervical_cancer.csv"
df_rfcc = pd.read_csv(rfcc_csv_path)
df_rfcc_processed = preprocess_rfcc(df=df_rfcc)

# Sanity check: preprocessing must keep the number of rows unchanged.
assert len(df_rfcc) == len(df_rfcc_processed)

In [5]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.dummy import DummyClassifier

In [25]:
# OneR (one-rule) classifier
# Reference: https://www.kaggle.com/code/prabhat12/oner-zeror

In [8]:
# Inspect which columns the preprocessed frame exposes.
df_rfcc_processed.columns

Index(['Age', 'Number of sexual partners', 'First sexual intercourse',
       'Num of pregnancies', 'Smokes', 'Smokes (years)',
       'Hormonal Contraceptives', 'Hormonal Contraceptives (years)', 'IUD',
       'IUD (years)', 'STDs', 'STDs (number)', 'STDs: Number of diagnosis',
       'STDs: Time since first diagnosis', 'STDs: Time since last diagnosis',
       'Biopsy'],
      dtype='object')

In [21]:
# Quantize the continuous features into ordinal quantile bins.
# NOTE(review): the binned codes overwrite the original columns in place, so
# re-running this cell bins the bin codes rather than the original values.
continuous_features = [
    "Age",
    "Number of sexual partners",
    "First sexual intercourse",
    "Num of pregnancies",
    "Smokes (years)",
    "Hormonal Contraceptives (years)",
    "IUD (years)",
    "STDs (number)",
]
discretizer = KBinsDiscretizer(strategy="quantile", encode="ordinal", n_bins=5)
continuous_values = df_rfcc_processed[continuous_features]
df_rfcc_processed[continuous_features] = discretizer.fit(
    continuous_values
).transform(continuous_values)


# OneRの実装
def oneR(X, y):
    """Select the single feature whose one-level rule table best predicts ``y``.

    For every column of ``X`` a rule table is built that maps each observed
    value to the most frequent class of ``y`` among the rows holding that
    value. The table is then scored by applying it to every row and measuring
    the fraction of rows predicted correctly. The column with the highest
    accuracy wins; on ties the earliest column is kept.

    Args:
        X: DataFrame of candidate features (one rule is created per distinct
            value, so features are expected to be discrete).
        y: Series of class labels, one per row of ``X``.

    Returns:
        tuple: ``(best_feature, best_rules)`` where ``best_rules`` maps a
        feature value to the predicted class. ``(None, {})`` is returned when
        ``y`` is empty or no feature scores above zero.
    """
    best_feature = None
    best_accuracy = 0
    best_rules = {}

    # Nothing to learn from (and the accuracy below would be undefined).
    if len(y) == 0:
        return best_feature, best_rules

    targets = y.to_numpy(dtype=object)

    for feature in X.columns:
        column = X[feature]

        # One rule per observed value: predict the most frequent class among
        # the rows that hold this value. Missing values are skipped because
        # they match no row and would leave the mode empty.
        rules = {}
        for value in np.unique(column.dropna()):
            rules[value] = y[column == value].mode()[0]

        # Score the rule table itself: look up every row's value in the rules
        # and compare with the true label. Rows without a rule (missing
        # values) map to NaN and therefore count as misses.
        predicted = column.map(rules).to_numpy(dtype=object)
        accuracy = float(np.mean(predicted == targets))

        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_feature = feature
            best_rules = rules

    return best_feature, best_rules


# Apply OneR: every column except the label is a candidate feature.
y = df_rfcc_processed["Biopsy"]
X = df_rfcc_processed.drop(columns="Biopsy")
best_feature, best_rules = oneR(X, y)

# Report the winning feature and its value -> class rules.
print(f"Best Feature: {best_feature}")
for value in best_rules:
    prediction = best_rules[value]
    print(f"  {best_feature} = {value:.0f}  ->  prediction: {prediction}")


# Tabular view of the same rules (for display only).
rules_df = pd.DataFrame(
    {best_feature: list(best_rules), "prediction": list(best_rules.values())}
)
print(rules_df)

Best Feature: Age
  Age = 0  ->  prediction: Healthy
  Age = 1  ->  prediction: Healthy
  Age = 2  ->  prediction: Healthy
  Age = 3  ->  prediction: Healthy
   Age prediction
0  0.0    Healthy
1  1.0    Healthy
2  2.0    Healthy
3  3.0    Healthy




In [20]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import KBinsDiscretizer


def oner(df, target_col, n_bins=5, missing_label="<missing>"):
    """Fit a OneR (one-rule) classifier on a DataFrame.

    Every column other than ``target_col`` is tried as the single predictor.
    Numeric columns are first discretized into quantile bins; other columns
    are used as-is, with missing entries treated as a category of their own so
    that all features are scored on the same set of rows. For each feature
    value the most frequent target class becomes the prediction, and the
    feature with the fewest misclassified rows wins (the earliest column wins
    ties).

    Args:
        df: DataFrame holding the features and the target column.
        target_col: Name of the target column.
        n_bins: Number of quantile bins requested for numeric features.
        missing_label: Category label substituted for missing values in
            non-numeric features.

    Returns:
        dict with three keys:
            "feature": name of the selected feature (None if none qualified).
            "rules": mapping from feature value to predicted class. When the
                selected feature is numeric, the keys are the upper edges of
                the bins instead of the bin codes.
            "estimator": the fitted KBinsDiscretizer of the selected feature,
                or None when that feature is not numeric.
    """

    y = df[target_col]
    best_feature = None
    best_error = float("inf")
    best_rules = {}
    best_est = None  # discretizer of the best feature, if it is numeric

    for feature in df.columns:
        if feature == target_col:
            continue

        values = df[feature]
        if pd.api.types.is_numeric_dtype(values):
            # Numeric feature: replace values by ordinal quantile-bin codes.
            # Missing values are not handled here; they are passed to
            # KBinsDiscretizer unchanged.
            est = KBinsDiscretizer(
                n_bins=n_bins, encode="ordinal", strategy="quantile"
            )
            binned = est.fit_transform(df[[feature]])
            x_discretized = pd.Series(
                binned.ravel(), name=feature, index=df.index
            ).astype(int)
        else:
            est = None
            x_discretized = values
            # crosstab drops rows whose feature value is missing, which would
            # score sparse features on fewer rows and give them an unfairly
            # low error count. Keep those rows as an explicit category.
            if x_discretized.isna().any():
                x_discretized = x_discretized.astype(object).where(
                    x_discretized.notna(), missing_label
                )

        # Rows: feature values, columns: target classes, cells: row counts.
        cross_table = pd.crosstab(x_discretized, y)
        if cross_table.empty:
            # No usable rows for this feature, so there is nothing to score.
            continue

        # Per feature value, predict the most frequent class; every row of a
        # different class counts as one error.
        rules = cross_table.idxmax(axis=1).to_dict()
        total_error = int(
            (cross_table.sum(axis=1) - cross_table.max(axis=1)).sum()
        )

        if total_error < best_error:
            best_error = total_error
            best_feature = feature
            best_rules = rules
            best_est = est

    # For a numeric feature, key the rules by each bin's upper edge rather
    # than by the bin code. Bins that received no rows have no rule.
    if best_est is not None:
        upper_edges = best_est.bin_edges_[0][1:]
        best_rules = {
            upper_edge: best_rules[bin_index]
            for bin_index, upper_edge in enumerate(upper_edges)
            if bin_index in best_rules
        }

    return {"feature": best_feature, "rules": best_rules, "estimator": best_est}


# Example run of OneR on df_rfcc_processed.
target_column = "Biopsy"
result = oner(df_rfcc_processed, target_column)

# Print the selected feature followed by one IF/THEN line per rule.
print(f"Selected Feature: {result['feature']}")
for value, prediction in result["rules"].items():
    if result["estimator"] is None:
        # No discretizer was used: the rule key is the feature value itself.
        print(
            f"IF {result['feature']} == '{value}' THEN {target_column} = '{prediction}'"
        )
    else:
        # Binned numeric feature: the rule key is the bin's upper edge.
        print(
            f"IF {result['feature']} <= {value:.2f} THEN {target_column} = '{prediction}'"
        )

Selected Feature: STDs: Time since first diagnosis
IF STDs: Time since first diagnosis == '1.0' THEN Biopsy = 'Healthy'
IF STDs: Time since first diagnosis == '10.0' THEN Biopsy = 'Healthy'
IF STDs: Time since first diagnosis == '11.0' THEN Biopsy = 'Healthy'
IF STDs: Time since first diagnosis == '12.0' THEN Biopsy = 'Healthy'
IF STDs: Time since first diagnosis == '15.0' THEN Biopsy = 'Cancer'
IF STDs: Time since first diagnosis == '16.0' THEN Biopsy = 'Healthy'
IF STDs: Time since first diagnosis == '18.0' THEN Biopsy = 'Healthy'
IF STDs: Time since first diagnosis == '19.0' THEN Biopsy = 'Healthy'
IF STDs: Time since first diagnosis == '2.0' THEN Biopsy = 'Healthy'
IF STDs: Time since first diagnosis == '21.0' THEN Biopsy = 'Healthy'
IF STDs: Time since first diagnosis == '22.0' THEN Biopsy = 'Healthy'
IF STDs: Time since first diagnosis == '3.0' THEN Biopsy = 'Healthy'
IF STDs: Time since first diagnosis == '4.0' THEN Biopsy = 'Healthy'
IF STDs: Time since first diagnosis == '5.0'



In [22]:
# Convert the rules into a DataFrame for display (revised version).
rules_df = pd.DataFrame(
    {best_feature: best_rules.keys(), "prediction": best_rules.values()}
)

# Show each bin as the value range it covers instead of its ordinal code.
if best_feature in continuous_features:  # feature was quantized
    bins = discretizer.bin_edges_[continuous_features.index(best_feature)]
    last_bin = len(bins) - 2
    # The rule keys are bin codes, not feature values, so they must be used
    # as positions into the edge array (code i spans bins[i]..bins[i + 1])
    # rather than being cut by those edges. Bins are closed on the left; the
    # last bin also includes its right edge.
    rules_df[best_feature] = [
        pd.Interval(
            bins[int(code)],
            bins[int(code) + 1],
            closed="both" if int(code) == last_bin else "left",
        )
        for code in rules_df[best_feature]
    ]


print(rules_df)

          Age prediction
0  [0.0, 1.0)    Healthy
1  [1.0, 2.0)    Healthy
2  [2.0, 3.0)    Healthy
3  [3.0, 4.0)    Healthy
