In [1]:
import numpy as np
import pandas as pd

data = [['Sunny', 'Hot', 'High', 'Weak','No'], ['Sunny', 'Hot', 'High', 'Strong', 'No'], ['Overcast', 'Hot', 'High', 'Weak', 'Yes'], ['Rain', 'Mild', 'High', 'Weak', 'Yes'], ['Rain', 'Cool', 'Normal', 'Weak', 'Yes'],
        ['Rain', 'Cool', 'Normal', 'Strong', 'No'], ['Overcast', 'Cool', 'Normal', 'Strong', 'Yes'], ['Sunny', 'Mild', 'High', 'Weak', 'No'], ['Sunny', 'Cool', 'Normal', 'Weak', 'Yes'], ['Rain', 'Mild', 'Normal', 'Weak', 'Yes'],
        ['Sunny', 'Mild', 'Normal', 'Strong', 'Yes'], ['Overcast', 'Mild', 'High', 'Strong', 'Yes'], ['Overcast', 'Hot', 'Normal', 'Weak', 'Yes'], ['Rain', 'Mild', 'High', 'Strong', 'No']]


In [2]:
Data = pd.DataFrame(data, columns=['Outlook', 'Temperature', 'Humidity', 'Wind', 'Play Tennis'])

cols = Data.shape[1]  # 5列:4个特征+ 1个标签
X_data = Data.iloc[:, :cols - 1]
Y_data = Data.iloc[:, cols - 1:]
featureNames = X_data.columns

In [3]:
def Naive_Bayes(X_data, Y_data):
    y = Y_data.values
    X = X_data.values
    y_unique = np.unique(y)
    prior_prob = np.zeros(len(y_unique))
    for i in range(len(y_unique)):
        prior_prob[i] = sum(y == y_unique[i]) / len(y)

    condition_prob = {}
    for feat in featureNames:
        x_unique = list(set(X_data[feat]))
        x_condition_prob = np.zeros((len(y_unique), len(x_unique)))
        for j in range(len(y_unique)):
            for k in range(len(x_unique)):
                x_condition_prob[j, k] = sum((X_data[feat] == x_unique[k]) & (Y_data['Play Tennis'] == y_unique[j])) / sum(
                    y == y_unique[j])
        x_condition_prob = pd.DataFrame(x_condition_prob, columns=x_unique, index=y_unique)
        condition_prob[feat] = x_condition_prob
    return prior_prob, condition_prob, y_unique

In [4]:
prior_prob, condition_prob,_ =Naive_Bayes(X_data,Y_data)
print(prior_prob)
print(condition_prob['Outlook'])
print(condition_prob['Temperature'])
print(condition_prob['Humidity'])
print(condition_prob['Wind'])

[0.35714286 0.64285714]
        Sunny      Rain  Overcast
No   0.600000  0.400000  0.000000
Yes  0.222222  0.333333  0.444444
         Mild       Hot      Cool
No   0.400000  0.400000  0.200000
Yes  0.444444  0.222222  0.333333
         High    Normal
No   0.800000  0.200000
Yes  0.333333  0.666667
       Strong      Weak
No   0.600000  0.400000
Yes  0.333333  0.666667


  prior_prob[i] = sum(y == y_unique[i]) / len(y)
  x_condition_prob[j, k] = sum((X_data[feat] == x_unique[k]) & (Y_data['Play Tennis'] == y_unique[j])) / sum(
  x_condition_prob[j, k] = sum((X_data[feat] == x_unique[k]) & (Y_data['Play Tennis'] == y_unique[j])) / sum(
  x_condition_prob[j, k] = sum((X_data[feat] == x_unique[k]) & (Y_data['Play Tennis'] == y_unique[j])) / sum(
  x_condition_prob[j, k] = sum((X_data[feat] == x_unique[k]) & (Y_data['Play Tennis'] == y_unique[j])) / sum(


In [5]:
def Prediction(testData, prior, condition_prob, y_unique):
    numclass = prior.shape[0]
    featureNames = testData.columns
    numsample = testData.shape[0]
    post_prob = np.zeros((numsample, numclass))
    for k in range(numsample):
        prob_k = np.zeros((numclass,))
        for i in range(numclass):
            pri = prior[i]
            for feat in featureNames:
                feat_val = testData[feat][k]
                cp = condition_prob[feat]
                cp_val = cp.loc[y_unique[i], feat_val]  # 使用标签值进行索引
                pri *= cp_val
            prob_k[i] = pri
        prob = prob_k / np.sum(prob_k, axis=0)
        post_prob[k, :] = prob
    return post_prob


In [6]:
prior_prob, condition_prob, y_unique = Naive_Bayes(X_data, Y_data)

test_data = [['Sunny', 'Cool', 'High', 'Strong']]
testData = pd.DataFrame(test_data, columns=['Outlook', 'Temperature', 'Humidity', 'Wind'])

postPrior = Prediction(testData, prior_prob, condition_prob, y_unique)
print(postPrior)

[[0.79541735 0.20458265]]


  prior_prob[i] = sum(y == y_unique[i]) / len(y)
  x_condition_prob[j, k] = sum((X_data[feat] == x_unique[k]) & (Y_data['Play Tennis'] == y_unique[j])) / sum(
  x_condition_prob[j, k] = sum((X_data[feat] == x_unique[k]) & (Y_data['Play Tennis'] == y_unique[j])) / sum(
  x_condition_prob[j, k] = sum((X_data[feat] == x_unique[k]) & (Y_data['Play Tennis'] == y_unique[j])) / sum(
  x_condition_prob[j, k] = sum((X_data[feat] == x_unique[k]) & (Y_data['Play Tennis'] == y_unique[j])) / sum(


In [7]:
def get_prediction(post_prob, y_unique):
    idx = np.argmax(post_prob, axis=1)  # 获取具有最高概率的类别的索引
    prediction = [y_unique[i] for i in idx]  # 根据索引获取类别
    return prediction

post_prior = Prediction(testData, prior_prob, condition_prob, y_unique)
prediction = get_prediction(post_prior, y_unique)
print("Prediction:", prediction)

Prediction: ['No']
