In [34]:
import pandas as pd
import glob
import os
import numpy as np

# Collect every tab-separated rule file (*.txt) under output/IB.
# glob does not guarantee any particular order, so the paths are sorted to
# make the concatenation order (and therefore the row index labels and the
# tie-breaking during de-duplication below) reproducible between runs.
path = '../output/IB'
all_files = sorted(glob.glob(os.path.join(path, "*.txt")))

# Read each file into its own DataFrame.
all_dfs = []
for filename in all_files:
    df = pd.read_csv(filename, sep='\t')
    all_dfs.append(df)

if all_dfs:
    # Stack the per-file tables into one frame with a fresh 0..N-1 index.
    combined_df = pd.concat(all_dfs, ignore_index=True)

    # Older result files may carry no Chi-square values. A missing value is
    # treated as 0; if no file provides the column at all, create it so the
    # filtering/scoring below does not fail with a KeyError.
    if 'Chi_squared' not in combined_df.columns:
        combined_df['Chi_squared'] = 0.0
    combined_df['Chi_squared'] = combined_df['Chi_squared'].fillna(0)

    # Quality filter (keep only rules that meet both criteria):
    #   1. support rate of at least 10%
    #   2. Chi-square of at least 3.84 (5% significance level), or exactly 0,
    #      which marks rows whose Chi-square was missing (old data)
    has_enough_support = combined_df['support_rate'] >= 0.1
    chi_is_acceptable = (
        (combined_df['Chi_squared'] >= 3.84) | (combined_df['Chi_squared'] == 0)
    )
    filtered_df = combined_df[has_enough_support & chi_is_acceptable].copy()

    # Overall score, with Chi-square acting as a multiplicative bonus:
    #   score = support_rate * support_count * (1 + chi_squared / 10)
    # A higher Chi-square gives a larger bonus; Chi-square 0 gives a bonus of 1.
    filtered_df['chi_bonus'] = 1 + filtered_df['Chi_squared'] / 10
    filtered_df['score'] = (
        filtered_df['support_rate'] *
        filtered_df['support_count'] *
        filtered_df['chi_bonus']
    )

    # De-duplicate: for each (Attr1, Attr2, Attr3) combination keep only the
    # highest-scoring row. A stable sort is used so that rows with equal
    # scores keep their input order, which makes keep='first' deterministic.
    filtered_df = filtered_df.sort_values(by='score', ascending=False, kind='stable')
    filtered_df_unique = filtered_df.drop_duplicates(subset=['Attr1', 'Attr2', 'Attr3'], keep='first')

    # filtered_df_unique is already in descending score order, so the top 50
    # are simply its first 50 rows; copy so top_rules is an independent frame
    # rather than a slice of filtered_df_unique.
    top_rules = filtered_df_unique.head(50).copy()

    # Summary report
    print("=" * 80)
    print("High-Quality Rule Extraction (Chi-square Considered)")
    print("=" * 80)
    print(f"Total rules: {len(combined_df)}")
    print(f"After quality filter: {len(filtered_df)}")
    print(f"Duplicate rules removed: {len(filtered_df) - len(filtered_df_unique)}")
    print(f"Unique rules: {len(filtered_df_unique)}")
    print(f"Excluded (quality): {len(combined_df) - len(filtered_df)}")
    print("\nQuality Criteria:")
    print("  - Support Rate ≥ 10%")
    print("  - Chi-square ≥ 3.84 (5% significance level)")
    print("\nScore Formula:")
    print("  score = support_rate × support_count × (1 + chi_squared/10)")
    print("=" * 80)
    print("\nTop 50 Rules (Unique):")

    # Columns shown in the result table. `display` is not defined in this
    # cell; it is expected to be supplied by the notebook environment.
    display_cols = ['Attr1', 'Attr2', 'Attr3', 'support_rate', 'support_count',
                    'Chi_squared', 'HighSup', 'LowVar', 'HighChi', 'score']
    display(top_rules[display_cols])

    # Chi-square statistics over the unique rules that have a positive value
    # (rows whose Chi-square was filled with 0 are left out).
    print("\n" + "=" * 80)
    print("Chi-square Statistics:")
    print("=" * 80)
    chi_valid = filtered_df_unique[filtered_df_unique['Chi_squared'] > 0]
    if not chi_valid.empty:
        chi_values = chi_valid['Chi_squared']
        print(f"Rules with Chi-square value: {len(chi_valid)}")
        print(f"Mean: {chi_values.mean():.3f}")
        print(f"Median: {chi_values.median():.3f}")
        print(f"Max: {chi_values.max():.3f}")
        print(f"High significance (≥6.63): {int((chi_values >= 6.63).sum())}")
    else:
        print("No Chi-square values found (possibly old data)")

else:
    print("No files found.")

High-Quality Rule Extraction (Chi-square Considered)
Total rules: 200000
After quality filter: 200000
Duplicate rules removed: 79133
Unique rules: 120867
Excluded (quality): 0

Quality Criteria:
  - Support Rate ≥ 10%
  - Chi-square ≥ 3.84 (5% significance level)

Score Formula:
  score = support_rate × support_count × (1 + chi_squared/10)

Top 50 Rules (Unique):


Unnamed: 0,Attr1,Attr2,Attr3,support_rate,support_count,Chi_squared,HighSup,LowVar,HighChi,score
28879,9434_Stay(t-1),6178_Stay(t-1),0,0.8435,4440,5.887,1,1,0,5949.903918
4910,9434_Stay(t-2),6178_Stay(t-1),0,0.8437,4441,4.993,1,1,0,5617.68474
123972,9434_Stay(t-3),6178_Stay(t-1),0,0.8429,4437,4.688,1,1,0,5493.234594
144742,9434_Stay(t-0),7186_Stay(t-2),0,0.8193,4313,5.287,1,1,0,5401.876844
174900,9434_Stay(t-2),7186_Stay(t-3),0,0.8205,4319,5.241,1,1,0,5401.013372
89021,9434_Stay(t-0),6178_Stay(t-1),0,0.8435,4440,3.881,1,1,0,5198.628834
103595,9434_Stay(t-3),7186_Stay(t-2),0,0.819,4311,4.661,1,1,0,5176.372465
162432,9434_Stay(t-1),1333_Stay(t-2),0,0.7781,4096,6.182,1,1,0,5157.361336
102716,9434_Stay(t-2),1333_Stay(t-2),0,0.7772,4091,6.059,1,1,0,5105.999519
105299,9434_Stay(t-2),7186_Stay(t-2),0,0.819,4311,4.357,1,1,0,5069.038911



Chi-square Statistics:
Rules with Chi-square value: 120867
Mean: 6.194
Median: 5.463
Max: 28.535
High significance (≥6.63): 37600
