# 樸素貝葉斯分類器 (Naive Bayes Classifier)

樸素貝葉斯是基於貝葉斯定理的概率分類算法，假設在給定類別的條件下，各特徵之間相互獨立（條件獨立）。

## 貝葉斯定理
P(A|B) = P(B|A) * P(A) / P(B)

在分類問題中：
P(類別|特徵) = P(特徵|類別) * P(類別) / P(特徵)

## sklearn 中的樸素貝葉斯變體
- **GaussianNB**: 適用於連續特徵，假設特徵服從高斯分布
- **MultinomialNB**: 適用於離散特徵，常用於文本分類
- **BernoulliNB**: 適用於二元特徵
- **ComplementNB**: 適用於不平衡資料集

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
import seaborn as sns

## 範例 1: 使用 GaussianNB 處理 Iris 資料集

In [None]:
# Load the Iris dataset and separate the feature matrix from the labels.
iris = datasets.load_iris()
X_iris = iris.data
y_iris = iris.target

# Report feature names, class names, the data shape, and samples per class.
print("特徵名稱:", iris.feature_names)
print("類別名稱:", iris.target_names)
print("資料形狀:", X_iris.shape)
print("類別分布:", np.bincount(y_iris))

In [None]:
# Hold out 30% of the Iris samples for testing. The split is stratified on
# the labels and seeded so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_iris,
    y_iris,
    test_size=0.3,
    stratify=y_iris,
    random_state=42,
)

print("訓練集大小:", X_train.shape)
print("測試集大小:", X_test.shape)

In [None]:
# Build and fit a Gaussian Naive Bayes model on the Iris training split
# (fit() returns the estimator itself, so it can be chained).
gnb = GaussianNB().fit(X_train, y_train)

# Per-class probabilities and hard class predictions for the test split.
y_pred_proba = gnb.predict_proba(X_test)
y_pred = gnb.predict(X_test)

# Evaluate: overall accuracy plus the per-class report.
accuracy = accuracy_score(y_test, y_pred)
iris_report = classification_report(
    y_test, y_pred, target_names=iris.target_names
)
print(f"準確率: {accuracy:.4f}")
print("\n分類報告:")
print(iris_report)

In [None]:
# Visualize the confusion matrix of the Iris predictions as a heatmap.
cm = confusion_matrix(y_test, y_pred)

plt.figure(figsize=(8, 6))
# Rows are the actual classes and columns the predicted classes, so both
# axes are labelled with the Iris class names.
heatmap_ax = sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=iris.target_names,
    yticklabels=iris.target_names,
)
heatmap_ax.set_title('Iris 資料集 - GaussianNB 混淆矩陣')
heatmap_ax.set_xlabel('預測類別')
heatmap_ax.set_ylabel('實際類別')
plt.show()

## 範例 2: 使用 MultinomialNB 處理文本分類

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.datasets import fetch_20newsgroups

# Load a four-category subset of the 20 Newsgroups corpus.
categories = ['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']

# Both subsets are fetched with the same categories, shuffling and seed.
newsgroups_options = dict(categories=categories, shuffle=True, random_state=42)
newsgroups_train = fetch_20newsgroups(subset='train', **newsgroups_options)
newsgroups_test = fetch_20newsgroups(subset='test', **newsgroups_options)

print("訓練文檔數量:", len(newsgroups_train.data))
print("測試文檔數量:", len(newsgroups_test.data))
print("類別:", newsgroups_train.target_names)

In [None]:
# Labels come straight from the fetched bunches.
y_train_text = newsgroups_train.target
y_test_text = newsgroups_test.target

# Turn the raw documents into word-count vectors. The vocabulary is learned
# from the training documents only and then reused for the test documents.
vectorizer = CountVectorizer(stop_words='english', max_features=5000)
X_train_text = vectorizer.fit_transform(newsgroups_train.data)
X_test_text = vectorizer.transform(newsgroups_test.data)

print("特徵向量形狀:", X_train_text.shape)
print("詞彙量:", len(vectorizer.vocabulary_))

In [None]:
# Text classification with Multinomial Naive Bayes; alpha is the smoothing
# parameter (fit() returns the estimator itself, so it can be chained).
mnb = MultinomialNB(alpha=1.0).fit(X_train_text, y_train_text)

# Predict the newsgroup of every test document.
y_pred_text = mnb.predict(X_test_text)

# Evaluate: overall accuracy plus the per-class report.
accuracy_text = accuracy_score(y_test_text, y_pred_text)
text_report = classification_report(
    y_test_text,
    y_pred_text,
    target_names=newsgroups_train.target_names,
)
print(f"文本分類準確率: {accuracy_text:.4f}")
print("\n分類報告:")
print(text_report)

## 範例 3: 使用 BernoulliNB 處理二元特徵

In [None]:
# Load the breast cancer dataset.
cancer = datasets.load_breast_cancer()
X_cancer, y_cancer = cancer.data, cancer.target

# Binarizer is still imported here because a later cell uses it with a
# scalar threshold.
from sklearn.preprocessing import Binarizer

# Binarize every continuous feature against its own median
# (1 if strictly greater than the median, otherwise 0).
# Binarizer documents `threshold` as a single float, and recent scikit-learn
# parameter validation rejects an array, so the per-feature thresholds are
# applied with a NumPy broadcast comparison instead. The result keeps the
# dtype of the input matrix.
# NOTE(review): the medians are computed on the full dataset, before the
# train/test split performed in the next cell.
feature_medians = np.median(X_cancer, axis=0)
X_cancer_binary = (X_cancer > feature_medians).astype(X_cancer.dtype)

print(f"原始特徵形狀: {X_cancer.shape}")
print(f"二值化後特徵形狀: {X_cancer_binary.shape}")
print(f"類別分布: {np.bincount(y_cancer)}")

In [None]:
# Hold out 30% of the binarized breast cancer samples for testing. The split
# is stratified on the labels and seeded so it is reproducible.
X_train_binary, X_test_binary, y_train_binary, y_test_binary = train_test_split(
    X_cancer_binary,
    y_cancer,
    test_size=0.3,
    stratify=y_cancer,
    random_state=42,
)

# Fit Bernoulli Naive Bayes on the binary features
# (fit() returns the estimator itself, so it can be chained).
bnb = BernoulliNB(alpha=1.0).fit(X_train_binary, y_train_binary)

# Predict the test split.
y_pred_binary = bnb.predict(X_test_binary)

# Evaluate: overall accuracy plus the per-class report.
accuracy_binary = accuracy_score(y_test_binary, y_pred_binary)
binary_report = classification_report(
    y_test_binary,
    y_pred_binary,
    target_names=cancer.target_names,
)
print(f"BernoulliNB 準確率: {accuracy_binary:.4f}")
print("\n分類報告:")
print(binary_report)

## 範例 4: 比較不同樸素貝葉斯算法

In [None]:
# Load the Wine dataset, which the comparison of the algorithms is run on.
wine = datasets.load_wine()
X_wine = wine.data
y_wine = wine.target

# Hold out 30% of the samples for testing (stratified, seeded).
X_train_wine, X_test_wine, y_train_wine, y_test_wine = train_test_split(
    X_wine,
    y_wine,
    test_size=0.3,
    stratify=y_wine,
    random_state=42,
)

# Standardize the features with statistics learned from the training split
# only, then apply the same transformation to the test split.
scaler = StandardScaler().fit(X_train_wine)
X_train_scaled = scaler.transform(X_train_wine)
X_test_scaled = scaler.transform(X_test_wine)

# Binary version of the features for BernoulliNB. The data was standardized
# above, so the zero threshold is each feature's training mean.
binarizer_wine = Binarizer(threshold=0)
X_train_wine_binary = binarizer_wine.fit(X_train_scaled).transform(X_train_scaled)
X_test_wine_binary = binarizer_wine.transform(X_test_scaled)

print("Wine 資料集特徵數:", X_wine.shape[1])
print("類別數:", len(wine.target_names))

In [None]:
# Compare the Naive Bayes variants on the Wine data.
# Each entry maps a display name to (estimator, training features, test
# features), so every variant gets the feature representation it expects.
models = {
    'GaussianNB': (GaussianNB(), X_train_scaled, X_test_scaled),
    # MultinomialNB requires non-negative values, so it gets the raw features.
    'MultinomialNB': (MultinomialNB(), X_train_wine, X_test_wine),
    'BernoulliNB': (BernoulliNB(), X_train_wine_binary, X_test_wine_binary),
}

# name -> {'test_accuracy', 'cv_mean', 'cv_std'} for every model that ran.
results = {}

for name, (model, X_tr, X_te) in models.items():
    try:
        # Train on the training split and predict the held-out test split.
        model.fit(X_tr, y_train_wine)

        # Use a Wine-specific name here: rebinding `y_pred` would overwrite
        # the Iris predictions that the confusion-matrix cell reads, breaking
        # that cell if it is re-run after this one.
        y_pred_wine = model.predict(X_te)

        # 5-fold cross-validation on the training split.
        cv_scores = cross_val_score(model, X_tr, y_train_wine, cv=5)

        metrics = {
            'test_accuracy': accuracy_score(y_test_wine, y_pred_wine),
            'cv_mean': cv_scores.mean(),
            'cv_std': cv_scores.std(),
        }
    except Exception as e:
        # Best effort: report the failing model and continue with the others.
        print(f"{name} 執行錯誤: {e}")
        print()
    else:
        # Store and report the results; the spread shown is two standard
        # deviations of the fold scores.
        results[name] = metrics

        print(f"{name}:")
        print(f"  測試準確率: {metrics['test_accuracy']:.4f}")
        print(f"  交叉驗證: {metrics['cv_mean']:.4f} (+/- {metrics['cv_std']*2:.4f})")
        print()

## 範例 5: 超參數調整

In [None]:
from sklearn.model_selection import GridSearchCV

# Tune GaussianNB's smoothing parameter (var_smoothing) over this grid.
param_grid = {
    'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3],
}

# 5-fold grid search scored by accuracy, using all available cores.
gnb_grid = GridSearchCV(
    estimator=GaussianNB(),
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
gnb_grid.fit(X_train_scaled, y_train_wine)

print("最佳參數:", gnb_grid.best_params_)
print(f"最佳交叉驗證分數: {gnb_grid.best_score_:.4f}")

# Evaluate the best estimator found by the search on the held-out test split.
best_gnb = gnb_grid.best_estimator_
y_pred_best = best_gnb.predict(X_test_scaled)
best_test_accuracy = accuracy_score(y_test_wine, y_pred_best)
print(f"測試集準確率: {best_test_accuracy:.4f}")

## 範例 6: 特徵重要性分析

In [None]:
# 使用文本分類的特徵重要性
# 取得每個類別的特徵對數概率
# Per-class log-probabilities of every vocabulary term from the fitted
# MultinomialNB text model, plus the terms themselves in column order.
feature_log_prob = mnb.feature_log_prob_
feature_names = vectorizer.get_feature_names_out()

# Number of top-ranked words to show per category.
n_top_words = 10

for i, category in enumerate(newsgroups_train.target_names):
    # Sort the term indices by ascending log-probability, flip the order so
    # the highest comes first, and keep the leading n_top_words entries.
    ranked_indices = np.argsort(feature_log_prob[i])[::-1]
    top_indices = ranked_indices[:n_top_words]
    top_words = []
    for idx in top_indices:
        top_words.append(feature_names[idx])
    print(f"\n{category} 類別最重要的詞彙:")
    print(", ".join(top_words))

## 總結

### 樸素貝葉斯的優點：
- 訓練速度快，預測效率高
- 對小樣本表現良好
- 對不相關特徵不敏感
- 可以處理多分類問題
- 提供概率預測

### 樸素貝葉斯的缺點：
- 假設特徵在給定類別下條件獨立（現實中很少成立）
- 對特徵分布假設敏感
- 需要平滑處理零概率問題

### 選擇指南：
- **GaussianNB**: 連續特徵，假設高斯分布
- **MultinomialNB**: 計數特徵，如詞頻
- **BernoulliNB**: 二元特徵
- **ComplementNB**: 不平衡資料集