### Construindo um Classificador Naive Bayes em Python

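Vamos construir do zero dois classificadores Naive Bayes em Python — um Bernoulli e um Multinomial — equivalentes ao BernoulliNB e ao MultinomialNB, mas sem usar os classificadores prontos do Scikit-learn (nos imports abaixo, a biblioteca aparece apenas para carregar dados e vetorizar texto).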

In [1]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.datasets import fetch_20newsgroups
import logging
import sys
from time import time
from math import *
from matplotlib import pyplot as pl
from matplotlib.backends.backend_pdf import PdfPages

In [1]:
class MyBernClassifier():
    """Bernoulli Naive Bayes classifier written from scratch on top of NumPy.

    Every feature is treated as a binary indicator (zero = absent, non-zero =
    present).  For each class ``c`` and feature ``j`` the model stores::

        P(c)           = (N_c  + smooth) / (N + n_classes * smooth)
        P(x_j = 1 | c) = (N_cj + smooth) / (N_c + 2 * smooth)

    where ``N`` is the number of training samples, ``N_c`` the number of
    samples of class ``c`` and ``N_cj`` the number of those samples in which
    feature ``j`` is present.

    Parameters
    ----------
    smooth : number, default 1
        Additive smoothing constant.  A positive value keeps every stored
        probability strictly between 0 and 1, so all logarithms are finite.
    """

    def __init__(self, smooth = 1):
        self._smooth = smooth
        # One array of P(x_j = 1 | c) per class, in the order of ``_classes``.
        self._feat_prob = []
        # One prior P(c) per class, in the order of ``_classes``.
        self._class_prob = []
        # Single-element lists (kept for compatibility with the old layout).
        self._Ncls = []
        self._Nfeat = []
        # Sorted distinct labels; maps a class index back to its label.
        self._classes = []

    def train(self, X, y):
        """Estimate class priors and per-class feature probabilities.

        Any state left by a previous call is discarded, so the model always
        reflects the most recent training set.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Dense feature matrix.  Only whether an entry is zero or not is
            used, so count data is binarised implicitly.
        y : array-like, shape (n_samples,)
            Class label of each row of ``X``.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, ``y`` is not 1-D, the training set is empty,
            or the number of rows in ``X`` differs from the length of ``y``.
        """
        print ("Treinando Bernoulli NB...")

        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim != 2 or y.ndim != 1 or y.size == 0 or X.shape[0] != y.size:
            raise ValueError(
                "X must be a dense 2-D array with one row per label in y, "
                "and the training set must not be empty")

        alpha = self._smooth
        # Bernoulli NB counts *documents containing* a feature, not how many
        # times it occurs; summing raw counts could push the estimate above 1
        # and make log(1 - p) undefined at prediction time.
        present = X != 0
        classes = np.unique(y)
        prior_den = float(y.size + classes.size * alpha)

        class_prob = []
        feat_prob = []
        for cls in classes:
            rows = present[y == cls]
            n_docs = rows.shape[0]
            class_prob.append((n_docs + alpha) / prior_den)
            feat_prob.append((rows.sum(axis=0) + alpha) / float(n_docs + 2 * alpha))

        # Replace (rather than extend) the state so that retraining works.
        self._classes = classes
        self._Ncls = [classes.size]      # total number of classes
        self._Nfeat = [X.shape[1]]       # total number of features
        self._class_prob = class_prob
        self._feat_prob = feat_prob

    def predict(self, X):
        """Return the most probable class label for every row of ``X``.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Dense feature matrix with the same number of features as the
            training data.  Entries are interpreted as absent (zero) or
            present (non-zero).

        Returns
        -------
        numpy.ndarray
            One predicted label per sample.  The labels are appended to an
            empty float array, so numeric labels come back as floats.  When
            several classes tie, the first one in sorted label order wins.

        Raises
        ------
        IndexError
            If the classifier has not been trained.
        ValueError
            If ``X`` is not 2-D or its feature count differs from training.
        """
        print ("Fazendo Previsões com Bernoulli NB...")

        n_classes = self._Ncls[0]
        n_feat = self._Nfeat[0]

        X = np.asarray(X)
        if X.ndim == 1 and X.size == 0:
            return np.array([])          # no samples at all
        if X.ndim != 2 or X.shape[1] != n_feat:
            raise ValueError(
                "X must be a dense 2-D array with %d features per sample" % n_feat)

        log_prior = np.log(np.asarray(self._class_prob, dtype=float))
        prob = np.asarray(self._feat_prob, dtype=float).reshape(n_classes, n_feat)
        log_present = np.log(prob)
        log_absent = np.log(1.0 - prob)

        predicted = []
        for sample in X:
            # For every class, pick log P(x_j=1|c) where the feature is present
            # and log P(x_j=0|c) where it is absent, then add them up.
            log_lik = np.where(sample != 0, log_present, log_absent).sum(axis=1)
            neg_log_prob = -(log_prior + log_lik)
            # argmin returns the first minimum, i.e. ties go to the first class.
            predicted.append(self._classes[int(np.argmin(neg_log_prob))])

        return np.append(np.array([]), predicted)

In [2]:
class MyMultinomialBayesClassifier():
    """Multinomial Naive Bayes classifier written from scratch on top of NumPy.

    Training only stores raw counts; the smoothed probabilities are derived
    at prediction time::

        P(c)     = (N_c  + smooth) / (N   + n_classes  * smooth)
        P(j | c) = (T_cj + smooth) / (T_c + n_features * smooth)

    where ``N_c`` is the number of training samples of class ``c``, ``T_cj``
    the summed value of feature ``j`` over those samples and ``T_c`` the sum
    of ``T_cj`` over all features.

    Parameters
    ----------
    smooth : number, default 1
        Additive smoothing constant.  A positive value keeps every
        probability above zero, so all logarithms are finite.
    """

    def __init__(self, smooth = 1):
        self._smooth = smooth
        # Single-element lists (kept for compatibility with the old layout):
        # ``_feat_prob[0]`` maps label -> per-feature totals and
        # ``_class_prob[0]`` maps label -> number of training samples.
        self._feat_prob = []
        self._class_prob = []
        self._class_neg_prob = []
        self._Ncls = []
        self._Nfeat = []

    def train(self, X, y):
        """Accumulate per-class sample counts and per-class feature totals.

        Any state left by a previous call is discarded, so the model always
        reflects the most recent training set.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Dense matrix of feature counts.
        y : array-like, shape (n_samples,)
            Class label of each row of ``X``.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, ``y`` is not 1-D, the training set is empty,
            or the number of rows in ``X`` differs from the length of ``y``.
        """
        print ("Treinando Multinomial NB...")

        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim != 2 or y.ndim != 1 or y.size == 0 or X.shape[0] != y.size:
            raise ValueError(
                "X must be a dense 2-D array with one row per label in y, "
                "and the training set must not be empty")

        # Register classes in order of first appearance in ``y``; that order
        # decides which class wins when two classes score exactly the same.
        labels, first_seen = np.unique(y, return_index=True)

        count_each_class = {}
        feature_count = {}
        for idx in np.sort(first_seen):
            cls = y[idx]
            rows = X[y == cls]
            count_each_class[cls] = rows.shape[0]
            feature_count[cls] = rows.sum(axis=0)

        # Replace (rather than extend) the state so that retraining works.
        self._Ncls = [labels.size]
        self._Nfeat = [X.shape[1]]
        self._class_prob = [count_each_class]
        self._feat_prob = [feature_count]

    def predict(self, X):
        """Return the most probable class label for every row of ``X``.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Dense matrix of feature counts with the same number of features
            as the training data.  Integer and float counts are both
            accepted; entries that are zero or negative contribute nothing.

        Returns
        -------
        numpy.ndarray
            One predicted label per sample.  The labels are appended to an
            empty float array, so numeric labels come back as floats.  When
            several classes tie, the one seen first during training wins.

        Raises
        ------
        IndexError
            If the classifier has not been trained.
        ValueError
            If ``X`` is not 2-D or its feature count differs from training.
        """
        print ("Fazendo Previsões com Multinomial NB...")

        class_count = self._class_prob[0]
        feature_count = self._feat_prob[0]
        n_classes = self._Ncls[0]
        n_feat = self._Nfeat[0]
        alpha = self._smooth

        X = np.asarray(X)
        if X.ndim == 1 and X.size == 0:
            return np.array([])          # no samples at all
        if X.ndim != 2 or X.shape[1] != n_feat:
            raise ValueError(
                "X must be a dense 2-D array with %d features per sample" % n_feat)

        labels = list(feature_count)

        # Smoothed log priors; numerator and denominator use the same
        # smoothing constant.
        total_train_count = sum(class_count.values())
        prior_den = float(total_train_count + n_classes * alpha)
        log_prior = np.log(np.array(
            [(class_count[cls] + alpha) / prior_den for cls in labels]))

        # Smoothed log likelihoods, computed once instead of once per sample.
        totals = np.array(
            [feature_count[cls] for cls in labels], dtype=float
        ).reshape(len(labels), n_feat)
        class_totals = totals.sum(axis=1, keepdims=True)
        log_lik = np.log((totals + alpha) / (class_totals + n_feat * alpha))

        predicted = []
        for sample in X:
            # Each occurrence of a feature contributes log P(j | c) once, so
            # weight the log likelihood by the count.  Only positive counts
            # take part.
            present = sample > 0
            neg_log_prob = -(log_prior + log_lik[:, present].dot(sample[present]))
            # argmin returns the first minimum, i.e. ties go to the first class.
            predicted.append(labels[int(np.argmin(neg_log_prob))])

        return np.append(np.array([]), predicted)