In [None]:
import pandas as pd
import glob
import os

# Directory containing the tab-separated rule files (*.txt) to aggregate.
path = '../output/IB'


def load_rule_frames(filenames):
    """Read tab-separated rule files and return only the non-empty tables.

    Args:
        filenames: Iterable of paths (or file-like objects) accepted by
            ``pd.read_csv``.

    Returns:
        List of DataFrames, one per input that contained at least one row,
        in the order the inputs were given.
    """
    frames = []
    for filename in filenames:
        try:
            df = pd.read_csv(filename, sep='\t')
        except pd.errors.EmptyDataError:
            # A zero-byte file carries no rules; treat it like a header-only file.
            continue
        # Row-less frames contribute nothing and make pd.concat emit a
        # FutureWarning about empty entries, so leave them out.
        if not df.empty:
            frames.append(df)
    return frames


def combine_rule_frames(frames):
    """Concatenate rule tables and normalise the ``Chi_squared`` column.

    Missing chi-square values (NaN, or the whole column absent) are replaced
    by 0, which the rest of the cell treats as the "old data" marker.

    Args:
        frames: Non-empty list of DataFrames.

    Returns:
        One DataFrame with a fresh 0..n-1 index and a NaN-free ``Chi_squared``.
    """
    combined = pd.concat(frames, ignore_index=True)
    if 'Chi_squared' in combined.columns:
        combined['Chi_squared'] = combined['Chi_squared'].fillna(0)
    else:
        # None of the files has the column; without this the lookups below
        # would raise KeyError instead of falling back to the old-data path.
        combined['Chi_squared'] = 0.0
    return combined


def score_rules(rules, min_support_rate=0.1, min_chi_squared=3.84):
    """Apply the quality filter and attach ``chi_bonus`` and ``score``.

    A row is kept when ``support_rate >= min_support_rate`` and its
    ``Chi_squared`` is either ``>= min_chi_squared`` or exactly 0.
    ``score = support_rate * support_count * (1 + Chi_squared / 10)``.

    Args:
        rules: DataFrame with ``support_rate``, ``support_count`` and a
            NaN-free ``Chi_squared`` column.
        min_support_rate: Lower bound on ``support_rate``.
        min_chi_squared: Lower bound on non-zero ``Chi_squared`` values.

    Returns:
        A copy of the kept rows, sorted by ``score`` in descending order.
    """
    chi = rules['Chi_squared']
    # NOTE(review): 0 doubles as the "missing / old data" marker, so a rule
    # whose chi-square is genuinely 0.0 also passes this filter - confirm
    # that this is intended.
    keep = (rules['support_rate'] >= min_support_rate) & (
        (chi >= min_chi_squared) | (chi == 0)
    )
    scored = rules[keep].copy()

    # The higher the chi-square, the larger the bonus multiplier.
    scored['chi_bonus'] = 1 + scored['Chi_squared'] / 10
    scored['score'] = (
        scored['support_rate'] * scored['support_count'] * scored['chi_bonus']
    )
    # A stable sort keeps tied scores in input order, so the row that
    # drop_duplicates(keep='first') retains is reproducible.
    return scored.sort_values(by='score', ascending=False, kind='stable')


# Sort the paths: glob returns them in arbitrary, filesystem-dependent order.
all_files = sorted(glob.glob(os.path.join(path, "*.txt")))
all_dfs = load_rule_frames(all_files)

if all_dfs:
    combined_df = combine_rule_frames(all_dfs)

    # Quality filter + score, sorted best-first.
    filtered_df = score_rules(combined_df)

    # Keep only the best-scoring row for each (Attr1, Attr2, Attr3) triple.
    filtered_df_unique = filtered_df.drop_duplicates(
        subset=['Attr1', 'Attr2', 'Attr3'], keep='first'
    )

    # Already sorted by score (drop_duplicates preserves row order).
    top_rules = filtered_df_unique.head(50)

    # Summary report.
    print("=" * 80)
    print("High-Quality Rule Extraction (Chi-square Considered)")
    print("=" * 80)
    print(f"Total rules: {len(combined_df)}")
    print(f"After quality filter: {len(filtered_df)}")
    print(f"Duplicate rules removed: {len(filtered_df) - len(filtered_df_unique)}")
    print(f"Unique rules: {len(filtered_df_unique)}")
    print(f"Excluded (quality): {len(combined_df) - len(filtered_df)}")
    print("\nQuality Criteria:")
    print("  - Support Rate ≥ 10%")
    print("  - Chi-square ≥ 3.84 (5% significance level)")
    print("\nScore Formula:")
    print("  score = support_rate × support_count × (1 + chi_squared/10)")
    print("=" * 80)
    print("\nTop 50 Rules (Unique):")

    # Columns to show; ones absent from the loaded files are left out so the
    # table still renders for files that lack them.
    display_cols = ['Attr1', 'Attr2', 'Attr3', 'support_rate', 'support_count',
                    'Chi_squared', 'HighSup', 'LowVar', 'HighChi', 'score']
    display(top_rules[[col for col in display_cols if col in top_rules.columns]])

    # Chi-square statistics over rules that carry a real (non-zero) value.
    print("\n" + "=" * 80)
    print("Chi-square Statistics:")
    print("=" * 80)
    chi_valid = filtered_df_unique[filtered_df_unique['Chi_squared'] > 0]
    if not chi_valid.empty:
        print(f"Rules with Chi-square value: {len(chi_valid)}")
        print(f"Mean: {chi_valid['Chi_squared'].mean():.3f}")
        print(f"Median: {chi_valid['Chi_squared'].median():.3f}")
        print(f"Max: {chi_valid['Chi_squared'].max():.3f}")
        print(f"High significance (≥6.63): {len(chi_valid[chi_valid['Chi_squared'] >= 6.63])}")
    else:
        print("No Chi-square values found (possibly old data)")

elif all_files:
    # Files exist but none of them contains a single rule row.
    print(f"No rules found in {len(all_files)} file(s).")
else:
    print("No files found.")

High-Quality Rule Extraction (Chi-square Considered)
Total rules: 94000
After quality filter: 94000
Duplicate rules removed: 17623
Unique rules: 76377
Excluded (quality): 0

Quality Criteria:
  - Support Rate ≥ 10%
  - Chi-square ≥ 3.84 (5% significance level)

Score Formula:
  score = support_rate × support_count × (1 + chi_squared/10)

Top 50 Rules (Unique):


  combined_df = pd.concat(all_dfs, ignore_index=True)


Unnamed: 0,Attr1,Attr2,Attr3,support_rate,support_count,Chi_squared,HighSup,LowVar,HighChi,score
15336,9434_Stay(t-0),1333_Stay(t-4),7186_Stay(t-2),0.7382,3885,7.611,1,1,1,5050.671018
76635,9434_Stay(t-4),6178_Stay(t-1),1333_Stay(t-2),0.746,3926,7.076,1,1,1,5001.21205
92695,9434_Stay(t-2),6178_Stay(t-0),1333_Stay(t-2),0.7437,3914,6.846,1,1,1,4903.604096
2868,9434_Stay(t-4),6178_Stay(t-1),1333_Stay(t-0),0.7456,3924,6.76,1,1,1,4903.530854
82405,9434_Stay(t-2),6178_Stay(t-1),1333_Stay(t-0),0.7452,3922,6.451,1,1,0,4808.091655
63249,3863_Stay(t-2),9434_Stay(t-0),6178_Stay(t-1),0.6945,3655,8.367,1,1,1,4662.274688
44541,9434_Stay(t-0),6178_Stay(t-1),1333_Stay(t-2),0.7444,3918,5.855,1,1,0,4624.204612
81344,9434_Stay(t-4),1333_Stay(t-4),7186_Stay(t-2),0.7368,3878,5.88,1,1,0,4537.408915
17475,9434_Stay(t-2),6178_Stay(t-4),1333_Stay(t-0),0.7458,3925,5.059,1,1,0,4408.168363
28691,9434_Stay(t-0),1333_Stay(t-4),7186_Stay(t-1),0.7357,3872,5.335,1,1,0,4368.374718



Chi-square Statistics:
Rules with Chi-square value: 76377
Mean: 6.670
Median: 6.002
Max: 27.794
High significance (≥6.63): 30928
