In [117]:
import numpy as np
import pandas as pd

# Toy "play tennis" dataset: 14 observations, four categorical features
# followed by the class label in the last position of every row.
column_names = ['Outlook', 'Temperature', 'Humidity', 'Wind', 'PlayTennis']

data = [
    ['Sunny', 'Hot', 'High', 'Weak', 'No'],
    ['Sunny', 'Hot', 'High', 'Strong', 'No'],
    ['Overcast', 'Hot', 'High', 'Weak', 'Yes'],
    ['Rain', 'Mild', 'High', 'Weak', 'Yes'],
    ['Rain', 'Cool', 'Normal', 'Weak', 'Yes'],
    ['Rain', 'Cool', 'Normal', 'Strong', 'No'],
    ['Overcast', 'Cool', 'Normal', 'Strong', 'Yes'],
    ['Sunny', 'Mild', 'High', 'Weak', 'No'],
    ['Sunny', 'Cool', 'Normal', 'Weak', 'Yes'],
    ['Rain', 'Mild', 'Normal', 'Weak', 'Yes'],
    ['Sunny', 'Mild', 'Normal', 'Strong', 'Yes'],
    ['Overcast', 'Mild', 'High', 'Strong', 'Yes'],
    ['Overcast', 'Hot', 'Normal', 'Weak', 'Yes'],
    ['Rain', 'Mild', 'High', 'Strong', 'No'],
]

# One DataFrame row per observation; preview the first five rows.
Data = pd.DataFrame(data=data, columns=column_names)
Data.head()

Unnamed: 0,Outlook,Temperature,Humidity,Wind,PlayTennis
0,Sunny,Hot,High,Weak,No
1,Sunny,Hot,High,Strong,No
2,Overcast,Hot,High,Weak,Yes
3,Rain,Mild,High,Weak,Yes
4,Rain,Cool,Normal,Weak,Yes


In [118]:
# Split the table column-wise: every column except the last is a feature,
# the last column is the class label.
cols = Data.shape[1]
X_data = Data.iloc[:, :-1]
Y_data = Data.iloc[:, -1:]  # slice (not a scalar index) so this stays a one-column DataFrame
featureNames = X_data.columns

In [119]:
## ---------- Naive Bayes classifier: model training ---------- ##
def Naive_Bayes(x_data, y_data, alpha=0.0):
    """Fit a categorical Naive Bayes model by frequency counting.

    Parameters
    ----------
    x_data : pandas.DataFrame
        Training features, one categorical column per feature.
    y_data : pandas.DataFrame or pandas.Series
        Class labels, one per row of ``x_data``. A single-column DataFrame
        (with any column name) or a Series is accepted.
    alpha : float, default 0.0
        Additive (Laplace) smoothing strength. ``0`` yields the raw relative
        frequencies; a positive value prevents (feature value, class) pairs
        that never occur in the training data from getting probability 0.

    Returns
    -------
    prior_prob : numpy.ndarray
        ``P(class)`` for every class, ordered like ``np.unique`` of the labels.
    condition_prob : dict
        Maps each feature name to a DataFrame whose rows are the class labels
        (same order as ``prior_prob``) and whose columns are the feature's
        values in order of first appearance; each cell holds
        ``P(feature == value | class)``.

    Raises
    ------
    ValueError
        If ``x_data`` and ``y_data`` do not contain the same number of samples.
    """
    # Flatten to a 1-D array so the label column's name is irrelevant and the
    # comparisons below are positional rather than index-aligned.
    labels = np.asarray(y_data).ravel()
    if len(labels) != len(x_data):
        raise ValueError(
            "x_data and y_data must have the same number of samples "
            "(got {} and {})".format(len(x_data), len(labels))
        )

    # step 1: prior P(class)
    y_unique, class_counts = np.unique(labels, return_counts=True)
    prior_prob = class_counts / len(labels)

    # step 2: likelihood P(feature value | class), one table per feature
    condition_prob = {}
    for feat in x_data.columns:  # use the frame passed in, not a notebook global
        feat_values = x_data[feat].to_numpy()
        # Series.unique keeps order of first appearance -> deterministic columns
        x_unique = list(x_data[feat].unique())

        table = np.zeros((len(y_unique), len(x_unique)))
        for j, label in enumerate(y_unique):
            in_class = labels == label
            denom = class_counts[j] + alpha * len(x_unique)
            for k, value in enumerate(x_unique):
                joint_count = np.sum(in_class & (feat_values == value))
                table[j, k] = (joint_count + alpha) / denom

        condition_prob[feat] = pd.DataFrame(table, columns=x_unique, index=y_unique)

    return prior_prob, condition_prob

In [120]:
def Prediction(testdata, prior, condition_prob):
    """Compute Naive Bayes posterior class probabilities for each test sample.

    Parameters
    ----------
    testdata : pandas.DataFrame
        Samples to classify, one row per sample. Every column name must be a
        key of ``condition_prob``. The frame's index may be arbitrary; rows
        are accessed by position.
    prior : array-like of float
        Prior probability of each class.
    condition_prob : dict
        Maps feature name to a DataFrame whose columns are feature values and
        whose rows are the classes, in the same order as ``prior``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_samples, n_classes)``; each row holds the
        posteriors normalised to sum to 1. A row is all-NaN when every class
        has joint probability 0, because the normalisation is then undefined.

    Raises
    ------
    KeyError
        If a feature is missing from ``condition_prob`` or a sample contains a
        feature value that has no column in that feature's table.
    """
    prior = np.asarray(prior, dtype=float)
    labelnum = len(prior)            # number of classes
    samplenum = testdata.shape[0]    # number of test samples

    # posterior of every test sample under every class
    post_prob = np.zeros((samplenum, labelnum))

    for k in range(samplenum):
        # Start from the prior and multiply in one likelihood per feature,
        # for all classes at once.
        joint = prior.copy()
        for feat in testdata.columns:
            # positional access: testdata[feat][k] is label-based and breaks
            # when the index is not 0..n-1
            feat_val = testdata[feat].iloc[k]
            cp = condition_prob[feat]  # look up this feature's table
            if feat_val not in cp.columns:
                raise KeyError(
                    "value {!r} of feature {!r} has no conditional probability "
                    "(not seen during training)".format(feat_val, feat)
                )
            joint *= cp[feat_val].to_numpy(dtype=float)

        total = joint.sum()
        # Guard the normalisation: 0/0 would emit a RuntimeWarning; make the
        # undefined result explicit instead.
        post_prob[k, :] = joint / total if total > 0 else np.nan

    return post_prob
        
    

In [121]:
# Fit the model on the full toy dataset, then display the class priors.
prior_prob, condition_prob = Naive_Bayes(x_data=X_data, y_data=Y_data)
prior_prob

array([0.35714286, 0.64285714])

In [122]:
# Conditional probability table for 'Outlook': rows are class labels, columns are feature values.
condition_prob['Outlook']

Unnamed: 0,Overcast,Sunny,Rain
No,0.0,0.6,0.4
Yes,0.444444,0.222222,0.333333


In [123]:
# Conditional probability table for 'Temperature': rows are class labels, columns are feature values.
condition_prob['Temperature']

Unnamed: 0,Mild,Hot,Cool
No,0.4,0.4,0.2
Yes,0.444444,0.222222,0.333333


In [124]:
# Conditional probability table for 'Humidity': rows are class labels, columns are feature values.
condition_prob['Humidity']

Unnamed: 0,High,Normal
No,0.8,0.2
Yes,0.333333,0.666667


In [125]:
# One query sample to classify, laid out with the same feature columns as the training data.
testdata = pd.DataFrame(
    [['Sunny', 'Cool', 'High', 'Strong']],
    columns=['Outlook', 'Temperature', 'Humidity', 'Wind'],
)
testdata.head()

Unnamed: 0,Outlook,Temperature,Humidity,Wind
0,Sunny,Cool,High,Strong


In [126]:
# Posterior class probabilities for the query sample (one row per sample, one column per class).
postPrior = Prediction(testdata=testdata, prior=prior_prob, condition_prob=condition_prob)
postPrior

array([[0.79541735, 0.20458265]])