In [12]:
# 1. 讀入套件
import numpy as np
import pandas as pd
from collections import Counter
from typing import Optional, Dict, List, Tuple
import warnings
warnings.filterwarnings('ignore')

# 用於輸出 Excel 檔案
from openpyxl import Workbook
from openpyxl.styles import PatternFill

# 2. 資料預處理
class ID3DataPreprocessor:
    def __init__(self, train_path: str, test_path: str, bins: int = 5):
        self.train_path = train_path
        self.test_path = test_path
        self.bins = bins
        self.column_names = [
            'age', 'workclass', 'fnlwgt', 'education', 'education-num',
            'marital-status', 'occupation', 'relationship', 'race', 'sex',
            'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income'
        ]
        self.original_categorical_features = [
            'workclass', 'education', 'marital-status', 'occupation',
            'relationship', 'race', 'sex', 'native-country'
        ]
        self.original_numerical_features = [
            'age', 'fnlwgt', 'education-num', 'capital-gain',
            'capital-loss', 'hours-per-week'
        ]
        self.bin_edges = {}

    def discretize_numerical_features(self, df: pd.DataFrame, is_train: bool) -> pd.DataFrame:
        for col in self.original_numerical_features:
            if is_train:
                df[col], bins = pd.qcut(df[col], q=self.bins, retbins=True, duplicates='drop')
                self.bin_edges[col] = bins
            else:
                bins = self.bin_edges[col]
                robust_bins = np.copy(bins)
                robust_bins[0], robust_bins[-1] = -np.inf, np.inf
                df[col] = pd.cut(df[col], bins=robust_bins, include_lowest=True)
            df[col] = df[col].astype(str)
        return df

    def load_and_preprocess(self, filepath: str, is_train: bool = False) -> pd.DataFrame:
        skip = 1 if not is_train else 0
        df = pd.read_csv(filepath, names=self.column_names, 
                         skipinitialspace=True, na_values='?', skiprows=skip)
        if not is_train:
            df['income'] = df['income'].str.rstrip('.')
        
        initial_rows = len(df)
        df.drop_duplicates(inplace=True)
        if initial_rows > len(df):
            print(f"移除 {initial_rows - len(df)} 筆重複資料")
        
        for col in self.original_categorical_features + self.original_numerical_features:
            if df[col].isnull().any():
                if col in self.original_categorical_features:
                    fill_value = df[col].mode()[0]
                    print(f"欄位 {col} 有缺失值，使用眾數 '{fill_value}' 填補")
                else:
                    fill_value = df[col].median()
                    print(f"欄位 {col} 有缺失值，使用中位數 {fill_value} 填補")
                df[col].fillna(fill_value, inplace=True)

        print("開始離散化數值特徵...")
        df = self.discretize_numerical_features(df, is_train)
        
        df['income'] = df['income'].map({'<=50K': 0, '>50K': 1})
        df.dropna(subset=['income'], inplace=True)
        df['income'] = df['income'].astype(int)
        
        return df
    
    def run(self) -> Tuple[pd.DataFrame, pd.DataFrame]:
        print("=" * 50 + "\n處理訓練資料...\n" + "=" * 50)
        train_df = self.load_and_preprocess(self.train_path, is_train=True)
        print(f"訓練資料筆數: {len(train_df)}\n")
        
        print("=" * 50 + "\n處理測試資料...\n" + "=" * 50)
        test_df = self.load_and_preprocess(self.test_path, is_train=False)
        print(f"測試資料筆數: {len(test_df)}")
        return train_df, test_df

# 3. ID3 決策樹實作 
class ID3Node:
    def __init__(self):
        self.feature: Optional[str] = None
        self.is_leaf: bool = False
        self.prediction: Optional[int] = None
        self.children: Dict = {}
        self.class_distribution: Dict = {}

class ID3DecisionTree:
    def __init__(self, max_depth: Optional[int] = None, min_samples_split: int = 2):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.root: Optional[ID3Node] = None

    def _entropy(self, y: np.ndarray) -> float:
        if len(y) == 0: return 0.0
        _, counts = np.unique(y, return_counts=True)
        probabilities = counts / len(y)
        return -np.sum(probabilities * np.log2(probabilities + 1e-10))

    def _information_gain(self, X_col: pd.Series, y: np.ndarray) -> float:
        parent_entropy = self._entropy(y)
        child_entropy = sum((len(subset_y) / len(y)) * self._entropy(subset_y) 
                            for _, subset_y in y.groupby(X_col))
        return parent_entropy - child_entropy

    def _best_split(self, X: pd.DataFrame, y: pd.Series) -> Optional[str]:
        gains = {feature: self._information_gain(X[feature], y) for feature in X.columns}
        best_feature = max(gains, key=gains.get)
        return best_feature if gains[best_feature] > 0 else None

    def _build_tree(self, X: pd.DataFrame, y: pd.Series, depth: int) -> ID3Node:
        node = ID3Node()
        node.class_distribution = y.value_counts().to_dict()
        most_common_class = y.mode()[0]

        stop_splitting = (
            len(y.unique()) == 1 or
            len(y) < self.min_samples_split or
            (self.max_depth is not None and depth >= self.max_depth)
        )
        if stop_splitting:
            node.is_leaf = True
            node.prediction = most_common_class
            return node

        best_feature = self._best_split(X, y)
        if not best_feature:
            node.is_leaf = True
            node.prediction = most_common_class
            return node
        
        node.feature = best_feature
        for value, group in X.groupby(best_feature):
            node.children[value] = self._build_tree(group.drop(columns=[best_feature]), y.loc[group.index], depth + 1)
        return node
    
    def fit(self, X: pd.DataFrame, y: pd.Series):
        print("\n開始訓練 ID3 決策樹模型...")
        y_series = pd.Series(y, index=X.index) # 確保 y 也是 Series
        self.root = self._build_tree(X, y_series, 0)
        print("訓練完成。")
    
    def _predict_sample(self, x: pd.Series, node: ID3Node) -> int:
        if node.is_leaf:
            return node.prediction
        
        feature_value = x.get(node.feature)
        if feature_value not in node.children:
            return max(node.class_distribution, key=node.class_distribution.get)
        
        return self._predict_sample(x, node.children[feature_value])
    
    def predict(self, X: pd.DataFrame) -> np.ndarray:
        if self.root is None: raise ValueError("模型尚未訓練")
        return X.apply(self._predict_sample, axis=1, node=self.root).to_numpy()

# 4. 主程式與評估模型
def print_metrics(y_true: np.ndarray, y_pred: np.ndarray, title: str):
    tp = np.sum((y_true == 1) & (y_pred == 1))
    tn = np.sum((y_true == 0) & (y_pred == 0))
    fp = np.sum((y_true == 0) & (y_pred == 1))
    fn = np.sum((y_true == 1) & (y_pred == 0))
    
    accuracy = (tp + tn) / (tp + tn + fp + fn)
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    
    print("\n" + "=" * 50 + f"\n{title.center(40)}\n" + "=" * 50)
    print(f"  準確率 (Accuracy) : {accuracy:.2%}")
    print(f"  精確度 (Precision): {precision:.2%}")
    print(f"  召回率 (Recall)   : {recall:.2%}")
    print(f"  F1-Score        : {f1:.2%}")
    print("\n--- Confusion Matrix ---".center(50))
    print(f"  {'真陽性 (True Positive, TP)':<25}  : {tp:<10}")
    print(f"  {'真陰性 (True Negative, TN)':<25}  : {tn:<10}")
    print(f"  {'偽陽性 (False Positive, FP)':<25}  : {fp:<10}")
    print(f"  {'偽陰性 (False Negative, FN)':<25}  : {fn:<10}")
    
def export_to_excel(y_true: np.ndarray, y_pred: np.ndarray, filename: str):
    wb = Workbook()
    ws = wb.active
    ws.title = "Prediction Results"
    ws.append(['實際類別', '預測類別'])
    
    label_map = {0: '<=50K', 1: '>50K'}
    for true_val, pred_val in zip(y_true, y_pred):
        ws.append([label_map[true_val], label_map[pred_val]])
        
    wb.save(filename)
    print(f"\n預測結果已匯出至: {filename}")

# 主程式 
def main():
    # --- 在此處手動調整參數 ---
    BINS = 10  #數值特徵離散化的區間數
    MAX_DEPTH = 8
    MIN_SAMPLES_SPLIT = 200 #節點可被分割的最小樣本數
    # --------------------------
    
    # 執行資料預處理
    preprocessor = ID3DataPreprocessor('adult/adult.data', 'adult/adult.test', bins=BINS)
    train_df, test_df = preprocessor.run()
    
    # 分離特徵 (X) 與目標 (y)
    X_train = train_df.drop('income', axis=1)
    y_train = train_df['income'].values
    X_test = test_df.drop('income', axis=1)
    y_test = test_df['income'].values
    
    # 建立與訓練模型
    model = ID3DecisionTree(max_depth=MAX_DEPTH, min_samples_split=MIN_SAMPLES_SPLIT)
    model.fit(X_train, y_train)
    
    # 預測與評估
    y_pred_train = model.predict(X_train)
    print_metrics(y_train, y_pred_train, "訓練資料性能指標")
    
    y_pred_test = model.predict(X_test)
    print_metrics(y_test, y_pred_test, "測試資料性能指標")
    
    # 匯出結果
    export_to_excel(y_test, y_pred_test, 'adult_predictions_ID3.xlsx')

if __name__ == "__main__":
    main()

處理訓練資料...
移除 24 筆重複資料
欄位 workclass 有缺失值，使用眾數 'Private' 填補
欄位 occupation 有缺失值，使用眾數 'Prof-specialty' 填補
欄位 native-country 有缺失值，使用眾數 'United-States' 填補
開始離散化數值特徵...
訓練資料筆數: 32537

處理測試資料...
移除 5 筆重複資料
欄位 workclass 有缺失值，使用眾數 'Private' 填補
欄位 occupation 有缺失值，使用眾數 'Prof-specialty' 填補
欄位 native-country 有缺失值，使用眾數 'United-States' 填補
開始離散化數值特徵...
測試資料筆數: 16276

開始訓練 ID3 決策樹模型...
訓練完成。

                訓練資料性能指標                
  準確率 (Accuracy) : 83.59%
  精確度 (Precision): 69.49%
  召回率 (Recall)   : 56.83%
  F1-Score        : 62.53%
            
--- Confusion Matrix ---             
  真陽性 (True Positive, TP)    : 4455      
  真陰性 (True Negative, TN)    : 22742     
  偽陽性 (False Positive, FP)   : 1956      
  偽陰性 (False Negative, FN)   : 3384      

                測試資料性能指標                
  準確率 (Accuracy) : 83.03%
  精確度 (Precision): 67.24%
  召回率 (Recall)   : 54.97%
  F1-Score        : 60.49%
            
--- Confusion Matrix ---             
  真陽性 (True Positive, TP)    : 2114      
  真陰性 (True Negat