In [15]:
import numpy as np
from scipy.stats import norm
from multiprocessing import Pool
from tabulate import tabulate

class NaiveBayesKDE:
    def __init__(self, kernel='gaussian', bandwidth=1.0):
        self.kernel = kernel
        self.bandwidth = bandwidth
        self.training_data = None
        self.labels = None

    def _kernel_function(self, x):
        if self.kernel == 'hypercube':
            return np.all(np.abs(x) <= 0.5, axis=1)
        elif self.kernel == 'gaussian':
            return norm.pdf(x, scale=self.bandwidth)
        else:
            raise ValueError("Unknown kernel")

    def fit(self, X, y):
        self.training_data = X
        self.labels = y

    def _estimate_density(self, x):
        densities = []
        for label in np.unique(self.labels):
            class_data = self.training_data[self.labels == label]
            distances = (x - class_data) / self.bandwidth
            kernel_values = self._kernel_function(distances)
            density = np.mean(kernel_values, axis=0)
            densities.append(density)
        return np.array(densities)

    def predict(self, X):
        predictions = []
        for x in X:
            densities = self._estimate_density(x)
            predicted_label = np.argmax(densities)
            predictions.append(predicted_label)
        return np.array(predictions)

    # Additional methods for multiprocessing and hyperparameter tuning can be added here

# Example usage
# classifier = NaiveBayesKDE(kernel='gaussian', bandwidth=1.0)
# classifier.fit(training_data, training_labels)
# predictions = classifier.predict(test_data)


In [33]:
import numpy as np


class NaiveBayesClassifier:
    """
    A Naive Bayes Classifier using Kernel Density Estimation with Parzen windows.

    The classifier implements two kernels for parzen window  - Radial and Hypercube

    It also implements Single bandwidth model and class-specific Multi bandwidth model

    The kernel and model type are passed as arguments to class object initialization.

    Along with the number of bandwidths necessary, in case of Multi bandwidth model.
    """

    def __init__(self, bandwidth=1, kernel='radial', multi_bw=False):
        """
        Initialize the classifier with proper parameters.

        :param bandwidth: An integer giving the number of bandwidths necessary
        :param kernel: A string specifying the kernel to be used for the model
        :param multi_bw: A boolean variable specifying if the Multi bandwidth
                        model is to be used.
                        By default Single bandwidth model is selected.
        """
        self.priors = dict()
        self.dim = 1
        self.multi_bw = multi_bw
        self.bandwidth = bandwidth
        if kernel == "radial":
            self.kernel = self.radial
        elif kernel == "hypercube":
            self.kernel = self.hypercube

    def hypercube(self, k):
        """
        Hypercube kernel for Density Estimation.
        """
        return np.all(k < 0.5, axis=1)

    def radial(self, k):
        """
        Radial Kernel for Density estimation.
        """
        const_part = (2 * np.pi) ** (-self.dim / 2)
        return const_part * np.exp(-0.5 * np.add.reduce(k ** 2, axis=1))

    def parzen_estimation(self, h, x, x_train):
        """
        Density estimation for a single sample against the training set with
        parzen window using the specified bandwidth, kernel.

        :param h: An integer value giving the bandwidth to be used for the class.
        :param x: A single input sample, whose density needs to be estimated.
        :param x_train: Array of input data to calculate KDE value against.
        :return: A single float value giving the density of the function at the given point.
        """
        N = x_train.shape[0]
        dim = self.dim
        k = np.abs(x - x_train) * 1.0 / h
        summation = np.add.reduce(self.kernel(k))
        return summation / (N * (h ** dim))

    def KDE(self, h, x_test, x_train):
        """
        Kernel Density Estimation based on the parameters set.

        :param h: An integer value giving the bandwidth to be used for the class.
        :param x_test: Array of input data to make predictions.
        :param x_train: Array of input data to calculate KDE value against.
        :return: A list of floats giving the density estimation values for each
                 row in x_test, x_test[i] calculated against the training set, previously set
        """
        P_x = np.zeros(len(x_test))
        N = x_train.shape[0]
        dim = self.dim
        for i in range(len(x_test)):
            P_x[i] = self.parzen_estimation(h, x_test[i], x_train)

        return P_x

    def fit(self, X, Y):
        """
        Fits the model to the training set.
        Since KDE is a lazy learner we just need to save the necessary information.

        :param X: Array of input data
        :param Y: Array of output labels
        :return: None
        """
        self.x_train = X
        self.y_train = Y
        self.dim = X.shape[1]
        labels = set(Y)
        for c in labels:
            self.priors[c] = float(len(Y[Y == c])) / len(Y)

    def predict(self, x_test):
        """
        Predict the labels of testing set, using KDE.

        :param x_test: Array of input data to make predictions.
        :return: Predicted labels of the data.
        """
        N, D = x_test.shape
        priors = self.priors
        K = len(priors)
        P = np.zeros((N, K))
        x_train = self.x_train
        y_train = self.y_train
        if self.multi_bw:
            bw = self.bandwidth
        else:
            bw = np.repeat(self.bandwidth, K)
        for c, p in priors.items():
            P[:, int(c)] = self.KDE(bw[int(c)], x_test, x_train[y_train == c]) * p

        pred_y = np.argmax(P, axis=1)
        self.pred_y = pred_y

        return pred_y

    def accuracy(self, y_test):
        """
        Calculates the accuracy between the predicted label and actual labels.

        :param y_test: Array of actual output labels of Testing set.
        :return: A float value giving the accuracy.
        """
        pred_y = self.pred_y
        return np.array([pred_y == y_test]).mean()

    def score(self, x_test, y_test):
        """
        Function that runs both Predict and Accuracy and returns the accuracy
        score of the model.

        :param x_test: Array of input data to make predictions.
        :param y_test: Array of actual output labels of Testing set.
        :return: A float value giving the accuracy of the model.
        """
        self.predict(x_test)
        return self.accuracy(y_test)

In [34]:
import argparse
import pandas as pd
import numpy as np
from util import *
from sklearn.metrics import accuracy_score
from tabulate import tabulate
from sklearn.model_selection import KFold

In [36]:

from sklearn.datasets import *
from sklearn.naive_bayes import GaussianNB

all_res = []
for f in [load_iris,load_digits,load_breast_cancer,load_wine]:
    dataset = f()
    icv_res=[]
    for cv in range(2):
        # np.random.shuffle(dataset)
        X_train, X_test, y_train, y_test  = train_test_split(dataset.data,dataset.target ,train_size=0.75, random_state=cv)
        print(f.__name__)
        # print(X_train.shape , X_test.shape, len(np.unique(y_train)))
        result = []

        for j in np.linspace(0.1,1,10):
            model = NaiveBayesClassifier(bandwidth=j)
            model.fit(X_train,y_train)
            predict_y = model.predict(X_test)
            acc = accuracy_score(predict_y,y_test)
            result.append(acc)
            print( "%3.3f"%acc,end=" ")
        print("")
        icv_res.append(result)
    icv_res = np.array(icv_res)
    icv_res = icv_res.mean(axis=0)
    all_res.append(icv_res)


load_iris
0.974 0.974 0.974 0.974 0.974 0.974 0.974 0.974 0.947 0.921 
load_iris
1.000 1.000 0.974 0.974 0.974 0.974 0.974 0.974 0.974 0.947 
load_digits
0.082 0.082 0.113 0.318 0.687 0.893 0.969 0.991 0.991 0.991 
load_digits
0.118 0.118 0.164 0.358 0.711 0.927 0.984 0.989 0.989 0.989 
load_breast_cancer
0.371 0.427 0.538 0.713 0.818 0.860 0.888 0.902 0.923 0.923 
load_breast_cancer
0.385 0.462 0.643 0.825 0.909 0.923 0.930 0.944 0.944 0.944 
load_digits
0.082 0.082 0.113 0.318 0.687 0.893 0.969 0.991 0.991 0.991 
load_digits
0.118 0.118 0.164 0.358 0.711 0.927 0.984 0.989 0.989 0.989 
load_wine
0.422 0.578 0.733 0.733 0.689 0.733 0.733 0.733 0.733 0.756 
load_wine
0.400 0.489 0.622 0.711 0.689 0.689 0.689 0.689 0.689 0.711 


In [37]:
icv_res

array([0.41111111, 0.53333333, 0.67777778, 0.72222222, 0.68888889,
       0.71111111, 0.71111111, 0.71111111, 0.71111111, 0.73333333])

In [38]:
all_res

[array([0.98684211, 0.98684211, 0.97368421, 0.97368421, 0.97368421,
        0.97368421, 0.97368421, 0.97368421, 0.96052632, 0.93421053]),
 array([0.1       , 0.1       , 0.13888889, 0.33777778, 0.69888889,
        0.91      , 0.97666667, 0.99      , 0.99      , 0.99      ]),
 array([0.37762238, 0.44405594, 0.59090909, 0.76923077, 0.86363636,
        0.89160839, 0.90909091, 0.92307692, 0.93356643, 0.93356643]),
 array([0.1       , 0.1       , 0.13888889, 0.33777778, 0.69888889,
        0.91      , 0.97666667, 0.99      , 0.99      , 0.99      ]),
 array([0.41111111, 0.53333333, 0.67777778, 0.72222222, 0.68888889,
        0.71111111, 0.71111111, 0.71111111, 0.71111111, 0.73333333])]

In [41]:
all_res_1d = [x[0] for x in all_res]

In [43]:
t = np.array(all_res).tolist()

t

[[0.986842105263158,
  0.986842105263158,
  0.9736842105263158,
  0.9736842105263158,
  0.9736842105263158,
  0.9736842105263158,
  0.9736842105263158,
  0.9736842105263158,
  0.9605263157894737,
  0.9342105263157894],
 [0.1,
  0.1,
  0.1388888888888889,
  0.33777777777777773,
  0.6988888888888889,
  0.9099999999999999,
  0.9766666666666667,
  0.99,
  0.99,
  0.99],
 [0.3776223776223776,
  0.44405594405594406,
  0.5909090909090908,
  0.7692307692307693,
  0.8636363636363636,
  0.8916083916083917,
  0.9090909090909092,
  0.9230769230769231,
  0.9335664335664335,
  0.9335664335664335],
 [0.1,
  0.1,
  0.1388888888888889,
  0.33777777777777773,
  0.6988888888888889,
  0.9099999999999999,
  0.9766666666666667,
  0.99,
  0.99,
  0.99],
 [0.4111111111111111,
  0.5333333333333333,
  0.6777777777777778,
  0.7222222222222222,
  0.6888888888888889,
  0.711111111111111,
  0.711111111111111,
  0.711111111111111,
  0.711111111111111,
  0.7333333333333334]]

In [49]:
tt = [['dataset']+x for x in t]

latex_table = tabulate(tt, headers=['h']+["%3.3f"%i for i in np.linspace(0.1,1,10).tolist()] , tablefmt='latex_booktabs' ,floatfmt=".3g")
print(latex_table)

\begin{tabular}{lrrrrrrrrrr}
\toprule
 h       &   0.100 &   0.200 &   0.300 &   0.400 &   0.500 &   0.600 &   0.700 &   0.800 &   0.900 &   1.000 \\
\midrule
 dataset &   0.987 &   0.987 &   0.974 &   0.974 &   0.974 &   0.974 &   0.974 &   0.974 &   0.961 &   0.934 \\
 dataset &   0.1   &   0.1   &   0.139 &   0.338 &   0.699 &   0.91  &   0.977 &   0.99  &   0.99  &   0.99  \\
 dataset &   0.378 &   0.444 &   0.591 &   0.769 &   0.864 &   0.892 &   0.909 &   0.923 &   0.934 &   0.934 \\
 dataset &   0.1   &   0.1   &   0.139 &   0.338 &   0.699 &   0.91  &   0.977 &   0.99  &   0.99  &   0.99  \\
 dataset &   0.411 &   0.533 &   0.678 &   0.722 &   0.689 &   0.711 &   0.711 &   0.711 &   0.711 &   0.733 \\
\bottomrule
\end{tabular}


In [None]:
all_dataress = [[0.986842105263158,
  0.986842105263158,
  0.9736842105263158,
  0.9736842105263158,
  0.9736842105263158,
  0.9736842105263158,
  0.9736842105263158,
  0.9736842105263158,
  0.9605263157894737,
  0.9342105263157894],
 [0.1,
  0.1,
  0.1388888888888889,
  0.33777777777777773,
  0.6988888888888889,
  0.9099999999999999,
  0.9766666666666667,
  0.99,
  0.99,
  0.99],
 [0.3776223776223776,
  0.44405594405594406,
  0.5909090909090908,
  0.7692307692307693,
  0.8636363636363636,
  0.8916083916083917,
  0.9090909090909092,
  0.9230769230769231,
  0.9335664335664335,
  0.9335664335664335],
 [0.4111111111111111,
  0.5333333333333333,
  0.6777777777777778,
  0.7222222222222222,
  0.6888888888888889,
  0.711111111111111,
  0.711111111111111,
  0.711111111111111,
  0.711111111111111,
  0.7333333333333334]]
