In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import math
import numpy as np 
from sklearn.datasets import load_wine
import pandas as pd 
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split 
from sklearn.metrics import classification_report, confusion_matrix

#### Naive Bayes classifier with Gaussian distribution

In [3]:
class NaiveBayesGauss:
    """ Naive Bayes classification using Gaussian distribution """
    
    def __init__(self):
        self.class_prior = {} 
        self.train_params = {} 
    
    def _compute_params(self, feat_df):
        """ compute training parameters: mean and std per feature """
        training_params = {}
        
        # compute mean and std per column (i.e., feature)
        mean_vec = feat_df.mean()
        mean_vec = mean_vec.drop("labels").to_dict()
        
        std_vec = feat_df.std() 
        std_vec = std_vec.drop("labels").to_dict() 
        
        for key, val in mean_vec.items():
            training_params[key] = (val, std_vec[key])
        
        return training_params
    
    def fit(self, X_train, y_train, show_samples=True):
        """ train a Naive Bayes classifier with Gaussian distribution 
        :param X_train: training data in the format (num_samples x num_features)
        :param y_train: a list of labels for the corresponding samples 
        """
        # create a pandas dataframe 
        num_samples, num_feats = X_train.shape 
        data_mat = np.column_stack((X_train, y_train))
        self.cols = ["feat" + str(i+1) for i in range(num_feats)] + ["labels"] 
        df = pd.DataFrame(data=data_mat, columns=self.cols)
        if show_samples:
            print("training data \n", df.head())
        
        # compute class priors and training params (mean and std) per feature per class 
        for label, feats_per_class in df.groupby('labels'):
            print(f"number of samples per class ({label}): {len(feats_per_class)}")
            self.class_prior[int(label)] = round(len(feats_per_class) / num_samples, 3) 
            self.train_params[int(label)] = self._compute_params(feats_per_class)
        
    def _compute_prob(self, x, label, feat_id):
        """ compute a Gaussian probability for a given feature value and for a given class label. 
            Gaussian parameters (mean and std) are computed from the training data 
        """
        feat_mean = self.train_params[label][self.cols[feat_id]][0]
        feat_std = self.train_params[label][self.cols[feat_id]][1] 
        
        exp_part = math.exp(-((x-feat_mean)**2 / (2 * feat_std**2 )))
        gauss_prob = (1 / (math.sqrt(2 * math.pi) * feat_std)) * exp_part
        return gauss_prob
    
    def _compute_likelihood_prob(self, feat_vec, label):
        """ compute a likelihood prob which is a product of probabilities of each feature variable for a given class label """
        likelihood_probs = [self._compute_prob(feat, label, feat_id) for feat_id, feat in enumerate(feat_vec)]
        return np.prod(likelihood_probs)
    
    def predict(self, X_test):
        """ prediction on a given test data 
        :param X_test: numpy array containing samples of feature vectors 
        :return predict_vals: predicted label and probabilities 
        """
        # convert numpy array to a list of lists
        X_test = X_test.tolist() 
        predict_vals = {"probs": [], "labels": []}
        
        # compute max posterior prob for each sample 
        for ind, sample in enumerate(X_test):
            
            # compute posterior probabilities for each class and pick the max one as final class label 
            predict_prob = -1
            predict_label = -1
            norm_constant = 0.0 
            for label, prior_prob in self.class_prior.items():
                # compute likelihood prob which is a product of probs of each feature 
                likelihood_prob = self._compute_likelihood_prob(sample, label)
                
                # compute posterior distrition as multiplication of prior and likelihood. Note: normalization constant 
                # since we will compute the max prob afterwards 
                post_prob = prior_prob * likelihood_prob 
                
                # compute normalization constant 
                norm_constant += post_prob 
                if post_prob > predict_prob:
                    predict_prob = post_prob 
                    predict_label = label 
                    
            print(f"predicted label for the {ind}th sample is {predict_label} with probability {predict_prob}")
            predict_vals["probs"].append(predict_prob / norm_constant)
            predict_vals["labels"].append(predict_label) 
            
        # return the label with the max probability 
        return predict_vals

#### Load the wine data and split it into train and test datasets 

In [4]:
wine_data = load_wine()
data = wine_data.data 
labels = wine_data.target

In [5]:
X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.20, random_state=0)

#### Train the naive Bayes model 

In [6]:
nb_classifier = NaiveBayesGauss()
nb_classifier.fit(X_train, y_train)

training data 
    feat1  feat2  feat3  feat4  feat5  feat6  feat7  feat8  feat9  feat10  \
0  13.69   3.26   2.54   20.0  107.0   1.83   0.56   0.50   0.80    5.88   
1  12.69   1.53   2.26   20.7   80.0   1.38   1.46   0.58   1.62    3.05   
2  11.62   1.99   2.28   18.0   98.0   3.02   2.26   0.17   1.35    3.25   
3  13.40   3.91   2.48   23.0  102.0   1.80   0.75   0.43   1.41    7.30   
4  13.50   1.81   2.61   20.0   96.0   2.53   2.61   0.28   1.66    3.52   

   feat11  feat12  feat13  labels  
0    0.96    1.82   680.0     2.0  
1    0.96    2.06   495.0     1.0  
2    1.16    2.96   345.0     1.0  
3    0.70    1.56   750.0     2.0  
4    1.12    3.82   845.0     0.0  
number of samples per class (0.0): 45
number of samples per class (1.0): 55
number of samples per class (2.0): 42


#### Print the trained parameters

In [7]:
nb_classifier.class_prior

{0: 0.317, 1: 0.387, 2: 0.296}

In [8]:
nb_classifier.train_params

{0: {'feat1': (13.724666666666664, 0.44862719894848513),
  'feat2': (2.030888888888889, 0.7027634485703312),
  'feat3': (2.4506666666666677, 0.23376756132379173),
  'feat4': (16.911111111111115, 2.7057365172924914),
  'feat5': (105.13333333333334, 10.010449086284337),
  'feat6': (2.8331111111111116, 0.3586833385783864),
  'feat7': (3.000666666666668, 0.40596909868967757),
  'feat8': (0.28577777777777774, 0.06790665012583916),
  'feat9': (1.9393333333333336, 0.4165082122948969),
  'feat10': (5.496444444444444, 1.2767486325456041),
  'feat11': (1.0684444444444445, 0.112167803587376),
  'feat12': (3.1768888888888904, 0.3767011247311817),
  'feat13': (1104.6, 218.80651477753327)},
 1: {'feat1': (12.265636363636364, 0.5725827245306799),
  'feat2': (1.9080000000000001, 0.924336398600519),
  'feat3': (2.2412727272727273, 0.30330924535282183),
  'feat4': (20.314545454545453, 3.4304420502221955),
  'feat5': (95.9090909090909, 17.976602825846967),
  'feat6': (2.213090909090909, 0.514773854437189

#### Perform prediction on test dataset

In [9]:
predict_vals = nb_classifier.predict(X_test)

predicted label for the 0th sample is 0 with probability 6.79147236643029e-07
predicted label for the 1th sample is 2 with probability 2.4632187160557036e-09
predicted label for the 2th sample is 1 with probability 8.452845725369398e-11
predicted label for the 3th sample is 0 with probability 4.070009987966507e-07
predicted label for the 4th sample is 1 with probability 1.4955915525288458e-12
predicted label for the 5th sample is 1 with probability 2.6800560577303106e-23
predicted label for the 6th sample is 0 with probability 1.1631681502473758e-07
predicted label for the 7th sample is 2 with probability 1.3155434166767746e-07
predicted label for the 8th sample is 1 with probability 1.7287921194231835e-07
predicted label for the 9th sample is 1 with probability 2.982284376636477e-08
predicted label for the 10th sample is 2 with probability 2.072781781472864e-09
predicted label for the 11th sample is 2 with probability 4.660554513091282e-11
predicted label for the 12th sample is 0 with

In [10]:
conf_mat = confusion_matrix(y_test, predict_vals["labels"])
print(conf_mat)

[[14  0  0]
 [ 2 13  1]
 [ 0  0  6]]


In [11]:
print(classification_report(y_test, predict_vals["labels"]))

              precision    recall  f1-score   support

           0       0.88      1.00      0.93        14
           1       1.00      0.81      0.90        16
           2       0.86      1.00      0.92         6

   micro avg       0.92      0.92      0.92        36
   macro avg       0.91      0.94      0.92        36
weighted avg       0.93      0.92      0.92        36



In [12]:
acc = 1 - (np.sum(conf_mat) - np.trace(conf_mat))/np.sum(conf_mat)
print(f"accuracy rate: {round(acc*100.0, 3)}")

accuracy rate: 91.667
