# 贝叶斯分类器的实现

In [1]:
import numpy as np
import pandas as pd

准备数据

In [2]:
data = [
    ['Sunny', 'Hot', 'High', 'Weak', 'No'],
    ['Sunny', 'Hot', 'High', 'Strong', 'No'],
    ['Overcast', 'Hot', 'High', 'Weak', 'Yes'],
    ['Rain', 'Mild', 'High', 'Weak', 'Yes'],
    ['Rain', 'Cool', 'Normal', 'Weak', 'Yes'],
    ['Rain', 'Cool', 'Normal', 'Strong', 'No'],
    ['Overcast', 'Cool', 'Normal', 'Strong', 'Yes'],
    ['Sunny', 'Mild', 'High', 'Weak', 'No'],
    ['Sunny', 'Cool', 'Normal', 'Weak', 'Yes'],
    ['Rain', 'Mild', 'Normal', 'Weak', 'Yes'],
    ['Sunny', 'Mild', 'Normal', 'Strong', 'Yes'],
    ['Overcast', 'Mild', 'High', 'Strong', 'Yes'],
    ['Overcast', 'Hot', 'Normal', 'Weak', 'Yes'],
    ['Rain', 'Mild', 'High', 'Strong', 'No']
]

数据可视化

In [3]:
Data = pd.DataFrame(data,columns=['Outlook', 'Temperature', 'Humanity', 'Wind','PlayTennis'])
Data.head() # 查看前几条数据

Unnamed: 0,Outlook,Temperature,Humanity,Wind,PlayTennis
0,Sunny,Hot,High,Weak,No
1,Sunny,Hot,High,Strong,No
2,Overcast,Hot,High,Weak,Yes
3,Rain,Mild,High,Weak,Yes
4,Rain,Cool,Normal,Weak,Yes


In [4]:
cols = Data.shape[1] # 3列：2个特征 + 1个标签
X_data = Data.iloc[:,0:cols-1]
Y_data = Data.iloc[:,(cols-1):cols]
featureNames = X_data.columns

训练贝叶斯模型

In [5]:
def Naive_Bayes(X_data, Y_data):
    # 第一步：计算先验概率
    y = Y_data.values 
    X = X_data.values 
    y_unique = np.unique(y)

    prior_prob = np.zeros(len(y_unique)) 

    for i in range(len(y_unique)):
        # 计算标签值为 y_unique[i] 的样本在总样本中的比例，即先验概率
        prior_prob[i] = np.sum(y == y_unique[i]) / len(y)

    # 第二步：计算条件概率（似然性）
    condition_prob = {}

    for feat in featureNames:
        x_unique = list(set(X_data[feat]))  # 获取特征的唯一值列表
        x_condition_prob = np.zeros((len(y_unique), len(x_unique))) 

        for j in range(len(y_unique)):
            for k in range(len(x_unique)):
                # 计算特征值为 x_unique[k] 在给定标签值的条件下的概率
                x_condition_prob[j, k] = \
                    np.sum((X_data[feat] == x_unique[k]) & (Y_data['PlayTennis'] == y_unique[j])) / np.sum(y == y_unique[j])

        # 将条件概率转换为DataFrame，存储到字典中
        x_condition_prob = pd.DataFrame(x_condition_prob, columns=x_unique, index=y_unique)
        condition_prob[feat] = x_condition_prob

    return prior_prob, condition_prob

In [6]:
def Prediction(testData, prior, condition_prob):
    numclass = prior.shape[0] 
    featureNames = testData.columns 

    numclass = prior.shape[0]  #类别数
    numsample = testData.shape[0]  #样本数
    featureNames = testData.columns

    post_prob = np.zeros((numsample, numclass))

    # 遍历测试样本
    for k in range(numsample):
        prob_k = np.zeros((numclass,))
        # 遍历类别
        for i in range(numclass):
            pri = prior[i]

            for feat in featureNames:
                feat_val = testData[feat][k] 
                cp = condition_prob[feat]
                cp_val = cp[feat_val].iloc[i]
                pri *= cp_val  #计算当前类别的联合概率
                prob_k[i] = pri  #存储当前类别的联合概率
        prob = prob_k / np.sum(prob_k, axis=0)  #计算后验概率
        post_prob[k, :] = prob 

    return post_prob

需要预测的数据

In [7]:
test_Data = [['Sunny', 'Cool', 'High', 'Strong']]
testData = pd.DataFrame(test_Data, columns=['Outlook', 'Temperature', 'Humanity', 'Wind'])
testData.head()

Unnamed: 0,Outlook,Temperature,Humanity,Wind
0,Sunny,Cool,High,Strong


In [8]:
prior_prob, condition_prob = Naive_Bayes(X_data, Y_data)
condition_prob['Outlook']

Unnamed: 0,Rain,Overcast,Sunny
No,0.4,0.0,0.6
Yes,0.333333,0.444444,0.222222


In [9]:
condition_prob['Temperature']

Unnamed: 0,Cool,Mild,Hot
No,0.2,0.4,0.4
Yes,0.333333,0.444444,0.222222


In [10]:
condition_prob['Humanity']

Unnamed: 0,High,Normal
No,0.8,0.2
Yes,0.333333,0.666667


In [11]:
postPrior = Prediction(testData, prior_prob, condition_prob)
postPrior

array([[0.79541735, 0.20458265]])

### 预测类别为'No'

# 使用sklearn实现朴素贝叶斯分类

In [12]:
from sklearn.naive_bayes import BernoulliNB

In [13]:
data_train = Data
X_train0 = data_train[['Outlook', 'Temperature', 'Humanity', 'Wind']]  # 训练样本的特征空间
X_test_ = pd.DataFrame([['Sunny', 'Cool', 'High', 'Strong']], columns=['Outlook', 'Temperature', 'Humanity', 'Wind'])
X_total = X_train0._append(X_test_)  # 将测试数据添加到训练数据中
X_total = pd.get_dummies(X_total, drop_first=True)  # 类别变量处理
X_total = X_total.to_numpy()  # 所有样本的特征
X_train = X_total[0:14]  # 训练样本的特征
X_test_ = X_total[14:15]  # 测试样本的特征
Y = Data['PlayTennis']

In [14]:
model = BernoulliNB(alpha=1.0e-10)
cft = model.fit(X_train, Y)

在训练集上评估 NB 模型

In [15]:
Y_train_predict = cft.predict(X_train)

In [16]:
Y_train_predict

array(['No', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'Yes',
       'Yes', 'Yes', 'Yes', 'No'], dtype='<U3')

In [17]:
Y.values.T

array(['No', 'No', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes', 'Yes',
       'Yes', 'Yes', 'Yes', 'No'], dtype=object)

In [18]:
prior_prob = np.exp(cft.class_log_prior_)  # 先验概率

In [19]:
prior_prob

array([0.35714286, 0.64285714])

In [20]:
log_condition_prob = cft.feature_log_prob_  # 条件概率
condition_prob = np.exp(log_condition_prob)

In [21]:
condition_prob

array([[0.4       , 0.6       , 0.4       , 0.4       , 0.2       ,
        0.4       ],
       [0.33333333, 0.22222222, 0.22222222, 0.44444444, 0.66666667,
        0.66666667]])

In [22]:
# 测试
y_test = cft.predict(X_test_)
y_test_prob = cft.predict_proba(X_test_)
y_test
y_test_prob

array([[0.82931918, 0.17068082]])

### 预测类别也为'No'