In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

from collections import Counter
import math 


In [2]:
# data
def create_data():
    """Return features and labels for the first 100 rows of the iris dataset.

    The dataset is loaded with ``load_iris``, placed in a DataFrame with four
    measurement columns plus a ``label`` column, and truncated to its first
    100 rows (presumably the first two iris classes, since the dataset is
    ordered by class -- confirm if that matters).

    Returns:
        tuple: ``(features, labels)`` where ``features`` holds every column
        except the last and ``labels`` holds the last column.
    """
    dataset = load_iris()
    frame = pd.DataFrame(dataset.data, columns=dataset.feature_names)
    frame['label'] = dataset.target
    # Swap the loader's column names for short ones.
    frame.columns = ['sepal length', 'sepal width',
                     'petal length', 'petal width', 'label']
    first_hundred = np.array(frame.iloc[:100, :])
    features = first_hundred[:, :-1]
    labels = first_hundred[:, -1]
    return features, labels

In [3]:
# Load features/labels and hold out 30% of the samples for testing.
# A fixed random_state makes the split (and every result derived from it)
# reproducible across kernel restarts.
x, y = create_data()
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=42
)

In [4]:
# Show the training feature matrix (the cell's last expression is rendered as output).
x_train

array([[5. , 3. , 1.6, 0.2],
       [5.1, 3.4, 1.5, 0.2],
       [4.5, 2.3, 1.3, 0.3],
       [4.7, 3.2, 1.6, 0.2],
       [5.2, 4.1, 1.5, 0.1],
       [5.6, 2.5, 3.9, 1.1],
       [6.1, 2.8, 4.7, 1.2],
       [5. , 3.5, 1.6, 0.6],
       [5.4, 3.4, 1.5, 0.4],
       [4.8, 3.1, 1.6, 0.2],
       [6.3, 3.3, 4.7, 1.6],
       [6.7, 3. , 5. , 1.7],
       [6.4, 2.9, 4.3, 1.3],
       [5.5, 2.4, 3.7, 1. ],
       [6.7, 3.1, 4.7, 1.5],
       [5.7, 2.9, 4.2, 1.3],
       [5. , 2. , 3.5, 1. ],
       [5.5, 2.3, 4. , 1.3],
       [5.7, 2.6, 3.5, 1. ],
       [6.3, 2.5, 4.9, 1.5],
       [5.7, 2.8, 4.5, 1.3],
       [5.4, 3.9, 1.7, 0.4],
       [5.4, 3. , 4.5, 1.5],
       [5.2, 3.4, 1.4, 0.2],
       [5. , 3.4, 1.5, 0.2],
       [5.6, 3. , 4.1, 1.3],
       [4.6, 3.4, 1.4, 0.3],
       [5.4, 3.4, 1.7, 0.2],
       [4.3, 3. , 1.1, 0.1],
       [6.9, 3.1, 4.9, 1.5],
       [6.1, 2.8, 4. , 1.3],
       [5.5, 2.5, 4. , 1.3],
       [4.4, 3. , 1.3, 0.2],
       [5. , 3.3, 1.4, 0.2],
       [5.5, 2

# 高斯朴素贝叶斯 GaussianNB  
特征的可能性假设为服从**正态分布**  
概率密度函数：  
$P(x_i|y_k)=\frac{1}{\sqrt{2\pi\sigma^2_{yk}}}\exp\left(-\frac{(x_i-\mu_{yk})^2}{2\sigma^2_{yk}}\right)$  
$\mu_{yk}$为在类别为$y_k$的样本中，特征$x_i$的均值  
$\sigma_{yk}$为在类别为$y_k$的样本中，特征$x_i$的标准差  
$\sigma^2=\frac{\sum(X-\mu)^2}{N}$

In [5]:
class my_GaussianNB:
    """Gaussian naive Bayes classifier built on NumPy.

    Every feature is modelled, per class, as an independent normal
    distribution whose mean and variance are estimated from the training data.

    Class labels must be the values ``0, 1, ..., n_class - 1`` (integers or
    their float equivalents), because a label is used directly as the row
    index into the prior / mean / variance arrays.

    NOTE: no variance smoothing is applied, so a feature that is constant
    within a class has zero variance and makes the density a division by zero.
    """

    def __init__(self):
        self.avg = None      # per-class feature means, one row per class
        self.var = None      # per-class feature variances, one row per class
        self.prior = None    # prior probability of each class
        self.n_class = None  # number of classes found by fit()

    def get_prior(self, y_train):
        """Return the empirical prior probability of each class.

        Args:
            y_train: 1-D sequence of class labels.

        Returns:
            np.ndarray of length ``n_class`` whose k-th entry is the fraction
            of training samples labelled ``k``; the entries sum to 1.
        """
        count = Counter(y_train)
        n_samples = len(y_train)
        # Normalise by the number of samples (not the number of classes) so
        # the result is a proper probability distribution.
        return np.array([count[i] / n_samples for i in range(len(count))])

    def get_avg(self, x_train, y_train):
        """Return the mean of every feature within each class.

        Row ``k`` of the result is the column-wise mean of the samples whose
        label equals ``k``. Requires ``self.n_class`` to be set.
        """
        return np.array([x_train[y_train == i].mean(axis=0)
                         for i in range(self.n_class)])

    def get_var(self, x_train, y_train):
        """Return the variance of every feature within each class.

        Row ``k`` of the result is the column-wise variance (NumPy's default
        ``var``, i.e. divided by N) of the samples whose label equals ``k``.
        Requires ``self.n_class`` to be set.
        """
        return np.array([x_train[y_train == i].var(axis=0)
                         for i in range(self.n_class)])

    def likelihood(self, row):
        """Return P(row | class) for every class.

        The per-feature normal densities are evaluated against each class's
        mean and variance (``row`` broadcasts across the class rows) and then
        multiplied together along the feature axis.
        """
        normaliser = 1 / np.sqrt(2 * np.pi * self.var)
        exponent = -(row - self.avg) ** 2 / (2 * self.var)
        # prod(axis=1) multiplies the feature densities within each class row.
        return (normaliser * np.exp(exponent)).prod(axis=1)

    def fit(self, x_train, y_train):
        """Estimate class priors, feature means and feature variances.

        Args:
            x_train: 2-D array of samples (rows) by features (columns).
            y_train: 1-D array of labels aligned with the rows of ``x_train``.
        """
        self.prior = self.get_prior(y_train)
        # n_class must be known before the per-class statistics are computed.
        self.n_class = len(self.prior)
        self.avg = self.get_avg(x_train, y_train)
        self.var = self.get_var(x_train, y_train)

    def predict_prob(self, x_train):
        """Return the posterior probability of each class for every sample.

        Args:
            x_train: 2-D array of samples to score.

        Returns:
            Array with one row per sample and one column per class; each row
            is ``prior * likelihood`` rescaled to sum to 1.
        """
        likelihood = np.apply_along_axis(self.likelihood, axis=1, arr=x_train)
        probs = self.prior * likelihood
        probs_sum = probs.sum(axis=1)
        return probs / probs_sum[:, None]  # normalise each row

    def predict(self, x_train):
        """Return, for every sample, the index of the most probable class."""
        return self.predict_prob(x_train).argmax(axis=1)
    

In [6]:
# Create an instance of the hand-written my_GaussianNB classifier.
model=my_GaussianNB()

In [7]:
# Estimate the class priors and per-class feature means/variances from the training split.
model.fit(x_train,y_train)

In [8]:
# Predict a class index for every sample in the test split.
y_pre=model.predict(x_test)

In [9]:
# Accuracy = fraction of test samples whose predicted label equals the true one.
# The matches are counted in one vectorised comparison and the ratio is
# computed once, after counting, rather than on every loop iteration.
cnt = int(np.sum(y_pre == y_test))
acc = cnt / len(y_pre)

acc

1.0

# 使用sklearn库的GaussianNB训练  
**predict**方法：直接给出测试集的预测类别  
**predict_proba**方法：给出测试集样本在各个类别上预测的概率  
**predict_log_proba**方法：预测概率的对数转化

In [10]:
from sklearn.naive_bayes import GaussianNB,BernoulliNB,MultinomialNB  # 高斯模型，伯努利模型和多项式模型

In [11]:
# Train scikit-learn's GaussianNB on the same training split for comparison.
model2=GaussianNB()
model2.fit(x_train,y_train)

In [12]:
# Evaluate model2 on the held-out test split with its score() method.
model2.score(x_test,y_test)

1.0