#### Traditional naive Bayes cannot process continuous values, which may cause a KeyError
#### Do not build the conditional probability $P(X^j=x^j|y=c_k)$ when $x^j$ is a continuous value

In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [2]:
# Load the iris dataset and truncate every measurement to an integer so that
# each feature takes only a handful of discrete values.
iris = load_iris()
# Cast after construction: pandas 2.x raises ValueError when a DataFrame is
# built from non-integral floats together with an integer dtype.
df = pd.DataFrame(iris.data).astype(int)
df['label'] = iris.target

In [3]:
# Preview the first rows to confirm the features were cast to integers.
df.head()

Unnamed: 0,0,1,2,3,label
0,5,3,1,0,0
1,4,3,1,0,0
2,4,3,1,0,0
3,4,3,1,0,0
4,5,3,1,0,0


In [4]:
class NaiveBayes:
    """Naive Bayes classifier for discrete features with additive smoothing.

    The model is fitted in the constructor. For every feature column a table
    of smoothed conditional probabilities P(X_j = v | y = c) is built from
    the training data, together with the smoothed class prior P(y = c).

    Attributes:
        labels: sorted array of the distinct class labels found in ``Y``.
        y_prob: Series indexed by label holding the smoothed prior.
        condition_prob: one DataFrame per feature (rows: observed feature
            values, columns: labels) holding the smoothed conditionals.
    """

    def __init__(self, X, Y, lamda=1):
        """Fit the classifier.

        Args:
            X: 2-D array-like of discrete feature values, one row per sample.
            Y: 1-D array-like of class labels, one per row of ``X``.
            lamda: additive (Laplace) smoothing constant added to each count.
        """
        # Go through plain arrays so that column names and index alignment of
        # pandas inputs cannot interfere: features are always addressed by
        # position and labels are matched to rows by order.
        features = pd.DataFrame(np.asarray(X))
        label_col = pd.Series(np.asarray(Y), name='label')
        self.labels = np.unique(label_col.to_numpy())

        # Smoothed class prior P(y = c_k).
        class_counts = label_col.groupby(label_col).count() + lamda
        self.y_prob = class_counts / class_counts.sum()

        # Smoothed conditional probability tables, one per feature.
        self.condition_prob = []
        # Per-feature probability (by label) assigned to a value that never
        # occurred in training: the share a zero-count value would receive.
        self._unseen_prob = []
        for j in range(features.shape[1]):
            smoothed = pd.crosstab(features[j], label_col) + lamda
            totals = smoothed.sum()
            self.condition_prob.append(smoothed / totals)
            self._unseen_prob.append(lamda / totals)

    def predict(self, X):
        """Return the most probable label for every row of ``X``.

        Scores are accumulated as sums of log-probabilities so that many
        features do not underflow to zero. A feature value that was not seen
        during training falls back to the smoothed zero-count probability
        instead of raising ``KeyError``. Ties go to the first label in
        ``self.labels``.

        Args:
            X: 2-D array-like with the same number of columns used to fit.

        Returns:
            A list with one predicted label per row.
        """
        labels = list(self.labels)
        # log(0) is a legitimate -inf here (e.g. lamda=0), not an error.
        with np.errstate(divide='ignore'):
            log_prior = np.log(
                self.y_prob.reindex(labels).to_numpy(dtype=float))
            log_tables = [np.log(table.reindex(columns=labels))
                          for table in self.condition_prob]
            log_unseen = [np.log(prob.reindex(labels).to_numpy(dtype=float))
                          for prob in self._unseen_prob]

        res = []
        for x in np.asarray(X):
            score = log_prior.copy()
            for j, value in enumerate(x):
                table = log_tables[j]
                if value in table.index:
                    score += table.loc[value].to_numpy(dtype=float)
                else:
                    score += log_unseen[j]
            # argmax returns the first maximum, matching a stable sort.
            res.append(labels[int(np.argmax(score))])
        return res

In [5]:
import time

# Prepare data: every column but the last is a feature, the last is the label.
X, Y = df.values[:, :-1], df.values[:, -1]
# test_size=0.7 holds out 70% of the rows for evaluation.
train_X, test_X, train_Y, test_Y = train_test_split(X, Y, test_size=0.7, random_state=42)

# perf_counter is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() follows the wall clock and may be adjusted.
stime = time.perf_counter()
model = NaiveBayes(X=train_X, Y=train_Y, lamda=1)
preds = model.predict(test_X)
etime = time.perf_counter()

print("ACC score:{},cost time [{}]s".format(accuracy_score(test_Y, preds), etime - stime))

ACC score:0.9523809523809523,cost time [0.10011839866638184]s


In [6]:
# Ground-truth labels of the held-out split, shown for comparison with the predictions.
test_Y

array([1, 0, 2, 1, 1, 0, 1, 2, 1, 1, 2, 0, 0, 0, 0, 1, 2, 1, 1, 2, 0, 2, 0,
       2, 2, 2, 2, 2, 0, 0, 0, 0, 1, 0, 0, 2, 1, 0, 0, 0, 2, 1, 1, 0, 0, 1,
       2, 2, 1, 2, 1, 2, 1, 0, 2, 1, 0, 0, 0, 1, 2, 0, 0, 0, 1, 0, 1, 2, 0,
       1, 2, 0, 2, 2, 1, 1, 2, 1, 0, 1, 2, 0, 0, 1, 1, 0, 2, 0, 0, 1, 1, 2,
       1, 2, 2, 1, 0, 0, 2, 2, 0, 0, 0, 1, 2])

In [7]:
# Predicted labels as an array, for side-by-side comparison with test_Y.
np.array(preds)

array([1, 0, 2, 1, 1, 0, 1, 2, 1, 1, 2, 0, 0, 0, 0, 1, 2, 1, 1, 2, 0, 1, 0,
       2, 2, 2, 2, 2, 0, 0, 0, 0, 1, 0, 0, 2, 1, 0, 0, 0, 2, 1, 1, 0, 0, 1,
       2, 2, 1, 2, 1, 2, 1, 0, 2, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 2, 0,
       1, 2, 0, 1, 2, 1, 1, 2, 1, 0, 1, 2, 0, 0, 1, 2, 0, 2, 0, 0, 1, 1, 2,
       2, 2, 2, 1, 0, 0, 2, 2, 0, 0, 0, 1, 2])