In [10]:
# 1. 讀入套件
import numpy as np
import pandas as pd
from collections import Counter
import warnings
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
warnings.filterwarnings('ignore')
# 用於輸出 Excel 檔案
from openpyxl import Workbook
from openpyxl.styles import PatternFill

# 2. 資料預處理
def load_and_preprocess_data(train_path, test_path):
    """Load the Adult train/test files and build one-hot encoded matrices.

    Both files are read as header-less CSV with '?' treated as missing.
    Exact duplicate rows are dropped, missing values are imputed with
    statistics computed on the training split only (mode for categorical
    columns, median for numerical ones), and the categorical columns are
    one-hot encoded with the first level of each column dropped.

    Args:
        train_path: Path to the training file (no header row).
        test_path: Path to the test file; its first line is skipped.

    Returns:
        A tuple ``(X_train, y_train, X_test, y_test)``. ``X_test`` has
        exactly the columns of ``X_train``; the targets are 0 for
        '<=50K' and 1 for '>50K'.
    """
    column_names = [
        'age', 'workclass', 'fnlwgt', 'education', 'education-num',
        'marital-status', 'occupation', 'relationship', 'race', 'sex',
        'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income'
    ]
    categorical_features = [
        'workclass', 'education', 'marital-status', 'occupation',
        'relationship', 'race', 'sex', 'native-country'
    ]
    numerical_features = [
        'age', 'fnlwgt', 'education-num', 'capital-gain',
        'capital-loss', 'hours-per-week'
    ]
    income_labels = {'<=50K': 0, '>50K': 1}

    # Read both splits with identical parsing options.
    read_options = dict(header=None, names=column_names,
                        skipinitialspace=True, na_values='?')
    train_df = pd.read_csv(train_path, **read_options)
    # The first line of the test file is a comment, not data.
    test_df = pd.read_csv(test_path, skiprows=1, **read_options)

    # Remove duplicate rows and normalise the target labels.
    train_df.drop_duplicates(inplace=True)
    test_df.drop_duplicates(inplace=True)
    train_df['income'] = train_df['income'].str.strip()
    # Test labels carry a trailing '.', e.g. '>50K.'.
    test_df['income'] = test_df['income'].str.strip().str.rstrip('.')

    # Imputation values come from the training split only, so no test
    # statistics leak into preprocessing.
    imputation_values = {}
    for col in categorical_features:
        imputation_values[col] = train_df[col].mode()[0]
    for col in numerical_features:
        imputation_values[col] = train_df[col].median()

    # Assign the result back instead of calling fillna(inplace=True) on a
    # column selection: the chained in-place form is deprecated and has no
    # effect when pandas Copy-on-Write is active.
    for col, fill_value in imputation_values.items():
        train_df[col] = train_df[col].fillna(fill_value)
        test_df[col] = test_df[col].fillna(fill_value)

    # Map the target variable.
    y_train = train_df['income'].map(income_labels)
    y_test = test_df['income'].map(income_labels)

    # Pin every categorical column to the category set seen in training.
    # Encoding the two splits independently with drop_first=True would drop
    # a different baseline level whenever the test split lacks the training
    # baseline, silently producing wrong indicator columns. Test values
    # never seen in training become missing and encode as all zeros.
    category_dtypes = {
        col: train_df[col].astype('category').dtype
        for col in categorical_features
    }
    X_train_raw = train_df.drop('income', axis=1).astype(category_dtypes)
    X_test_raw = test_df.drop('income', axis=1).astype(category_dtypes)

    # One-hot encoding; both frames now share the same levels per column.
    X_train = pd.get_dummies(X_train_raw, columns=categorical_features, drop_first=True)
    X_test = pd.get_dummies(X_test_raw, columns=categorical_features, drop_first=True)

    # Keep the explicit alignment so the column order always matches training.
    X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

    return X_train, y_train, X_test, y_test

# 3. 評估與列印函式
def calculate_metrics(y_true, y_pred):
    """Compute accuracy, precision, recall, F1 and confusion-matrix counts.

    Class 1 is treated as the positive class and class 0 as the negative
    class.

    Args:
        y_true: Ground-truth binary labels (0 or 1).
        y_pred: Predicted binary labels (0 or 1).

    Returns:
        A dict with the keys 'accuracy', 'precision', 'recall', 'f1_score',
        'tp', 'tn', 'fp' and 'fn'. Any ratio whose denominator is zero is
        reported as 0.
    """
    # Fix the label set so the matrix is always 2x2; without it, input that
    # contains a single class yields a 1x1 matrix and the unpacking fails.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    total = tp + tn + fp + fn
    accuracy = (tp + tn) / total if total > 0 else 0
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
    return {
        'accuracy': accuracy, 'precision': precision, 'recall': recall, 'f1_score': f1,
        'tp': tp, 'tn': tn, 'fp': fp, 'fn': fn
    }

def print_performance_report(metrics, title):
    """Print a fixed-layout performance report to standard output.

    Args:
        metrics: Mapping with the ratio entries 'accuracy', 'precision',
            'recall', 'f1_score' (printed as percentages) and the count
            entries 'tp', 'tn', 'fp', 'fn'.
        title: Heading shown between the two separator rules.
    """
    rule = "=" * 50
    print(rule)
    print(f"{title:^40}")
    print(rule)

    # Ratio metrics, rendered as percentages with two decimals.
    ratio_rows = (
        ('準確率 (Accuracy)', 'accuracy'),
        ('精確度 (Precision)', 'precision'),
        ('召回率 (Recall)', 'recall'),
        ('F1-Score', 'f1_score'),
    )
    for label, key in ratio_rows:
        print(f"  {label:<25} : {metrics[key] * 100:.2f}%")

    print("                  ")
    print(f"{'--- Confusion Matrix ---':^50}")

    # Confusion-matrix counts; each row keeps its own separator so the
    # printed text stays exactly as before.
    count_rows = (
        ('真陽性 (True Positive, TP)', '  : ', 'tp'),
        ('真陰性 (True Negative, TN)', '  : ', 'tn'),
        ('偽陽性 (False Positive, FP)', ' : ', 'fp'),
        ('偽陰性 (False Negative, FN)', ' : ', 'fn'),
    )
    for label, separator, key in count_rows:
        print(f"  {label:<25}{separator}{metrics[key]:<8}")
    print()

def export_to_excel(y_true, y_pred, original_data, filename):
    """Write actual and predicted income labels next to selected raw columns.

    Rows are matched by position: the i-th entry of ``y_true`` and
    ``y_pred`` describes the i-th row of ``original_data``. Index labels
    are ignored on purpose, because a frame that went through
    ``drop_duplicates`` keeps gaps in its index and label-based alignment
    would attach labels to the wrong rows or leave them empty.

    Args:
        y_true: Ground-truth labels (0/1), one per row of ``original_data``.
        y_pred: Predicted labels (0/1), one per row of ``original_data``.
        original_data: DataFrame holding at least the columns 'age',
            'workclass', 'education', 'occupation' and 'hours-per-week'.
        filename: Destination path of the Excel file.

    Raises:
        ValueError: If the three inputs do not have the same number of rows.
    """
    n_rows = len(original_data)
    if len(y_true) != n_rows or len(y_pred) != n_rows:
        raise ValueError(
            "y_true, y_pred and original_data must have the same number of rows "
            f"(got {len(y_true)}, {len(y_pred)} and {n_rows})"
        )

    label_names = {0: '<=50K', 1: '>50K'}

    # A fresh 0..n-1 index on every operand makes the alignment positional.
    output_df = original_data.reset_index(drop=True)
    output_df['actual_income'] = pd.Series(np.asarray(y_true)).map(label_names)
    output_df['predicted_income'] = pd.Series(np.asarray(y_pred)).map(label_names)

    # Export only a few key columns together with the results.
    output_df = output_df[['age', 'workclass', 'education', 'occupation', 'hours-per-week', 'actual_income', 'predicted_income']]

    output_df.to_excel(filename, index=False)
    print(f"預測結果已匯出至: {filename}")


# 4. 主程式
if __name__ == "__main__":
    # Locations of the Adult dataset files.
    TRAIN_PATH = 'adult/adult.data'
    TEST_PATH = 'adult/adult.test'

    # Reload the raw test file so the export can show the original,
    # un-encoded column values.
    original_test_df = pd.read_csv(TEST_PATH, header=None, names=[
        'age', 'workclass', 'fnlwgt', 'education', 'education-num',
        'marital-status', 'occupation', 'relationship', 'race', 'sex',
        'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income'
    ], skipinitialspace=True, na_values='?', skiprows=1)
    # Dropping duplicates leaves gaps in the index. Reset it to 0..n-1 so the
    # rows line up with the reset y_test passed to export_to_excel below;
    # otherwise labels would be attached to the wrong rows.
    original_test_df = original_test_df.drop_duplicates().reset_index(drop=True)

    # Load and preprocess the data.
    X_train, y_train, X_test, y_test = load_and_preprocess_data(TRAIN_PATH, TEST_PATH)

    label_names = {0: '<=50K', 1: '>50K'}
    # Number of raw input features: every column except the 'income' target.
    raw_feature_count = original_test_df.shape[1] - 1

    print("=" * 50)
    print("資料集資訊")
    print("=" * 50)
    print(f"訓練集樣本數: {len(X_train)}")
    print(f"測試集樣本數: {len(X_test)}")
    print(f"特徵數量: {raw_feature_count}")
    # Printing a plain dict gives the "{'<=50K': n, '>50K': m}" layout
    # directly and avoids a backslash inside an f-string expression, which
    # is a syntax error before Python 3.12.
    train_dist = dict(Counter(y_train.map(label_names)))
    test_dist = dict(Counter(y_test.map(label_names)))
    print(f"類別分布 (訓練集): {train_dist}")
    print(f"類別分布 (測試集): {test_dist}")
    print()

    # Train the model with the same hyperparameters as the C4.5 example so
    # the results can be compared.
    # NOTE(review): DecisionTreeClassifier is scikit-learn's CART-style tree;
    # criterion='entropy' only imitates C4.5's split measure, so the
    # "C4.5" wording in the messages below is an approximation.
    model = DecisionTreeClassifier(
        criterion='entropy',
        max_depth=10,
        min_samples_split=500,
        min_samples_leaf=100,
        random_state=42
    )

    print("開始訓練 C4.5 決策樹...")
    model.fit(X_train, y_train)
    print(f"訓練完成：總節點數: {model.tree_.node_count}")
    print()

    # Predict on both splits.
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)

    # Compute and report the evaluation metrics.
    train_metrics = calculate_metrics(y_train, y_pred_train)
    test_metrics = calculate_metrics(y_test, y_pred_test)

    print_performance_report(train_metrics, "訓練資料性能指標")
    print_performance_report(test_metrics, "測試資料性能指標")

    # Export the test predictions; all three inputs share a 0..n-1 index.
    export_to_excel(y_test.reset_index(drop=True), y_pred_test, original_test_df, 'adult_predictions_CART.xlsx')

資料集資訊
訓練集樣本數: 32537
測試集樣本數: 16276
特徵數量: 14
類別分布 (訓練集): {'<=50K': 24698, '>50K': 7839}
類別分布 (測試集): {'<=50K': 12430, '>50K': 3846}

開始訓練 C4.5 決策樹...
訓練完成：總節點數: 155

                訓練資料性能指標                
  準確率 (Accuracy)            : 85.55%
  精確度 (Precision)           : 80.66%
  召回率 (Recall)              : 52.66%
  F1-Score                  : 63.72%
                  
             --- Confusion Matrix ---             
  真陽性 (True Positive, TP)    : 4128    
  真陰性 (True Negative, TN)    : 23708   
  偽陽性 (False Positive, FP)  : 990     
  偽陰性 (False Negative, FN)  : 3711    

                測試資料性能指標                
  準確率 (Accuracy)            : 85.75%
  精確度 (Precision)           : 80.95%
  召回率 (Recall)              : 51.92%
  F1-Score                  : 63.27%
                  
             --- Confusion Matrix ---             
  真陽性 (True Positive, TP)    : 1997    
  真陰性 (True Negative, TN)    : 11960   
  偽陽性 (False Positive, FP)  : 470     
  偽陰性 (False Negative, FN)  : 1849    

預