In [1]:
import numpy as np
import pandas as pd

In [2]:
## Fit function: takes x_train and y_train as input and returns the counts
## of everything required to calculate the probabilities.
def fit(x_train,y_train):
    """Count everything needed to evaluate the naive Bayes probabilities.

    Parameters
    ----------
    x_train : array-like, shape (n_samples, n_features)
        Discrete feature values. Converted with ``np.asarray`` so nested
        lists and DataFrames are accepted as well as NumPy arrays.
    y_train : array-like, shape (n_samples,)
        Class label of every row of ``x_train``.

    Returns
    -------
    dict
        ``result['total_data']`` is the number of training rows.
        ``result[c]['total_count']`` is the number of rows of class ``c``.
        ``result[c][j][v]`` is the number of rows of class ``c`` whose
        feature ``j`` (1-based) equals ``v``; every value ``v`` seen for that
        feature anywhere in the training data has an entry, possibly 0.
    """
    ## coerce once so boolean masking and column slicing work for any array-like
    x_train = np.asarray(x_train)
    y_train = np.asarray(y_train)

    result = {'total_data': len(y_train)}
    if len(y_train) == 0:   ## no rows means no classes and nothing to count
        return result

    num_features = x_train.shape[1]
    ## the distinct values of a feature do not depend on the class,
    ## so they are collected once instead of once per class
    distinct_values = [set(x_train[:, col]) for col in range(num_features)]

    for current_class in set(y_train):
        class_rows = (y_train == current_class)   ## boolean mask of this class
        current_xtrain = x_train[class_rows]
        class_counts = {'total_count': int(class_rows.sum())}
        for j in range(1, num_features + 1):   ## feature keys are 1-based
            feature_column = current_xtrain[:, j-1]
            ## summing a boolean array counts its True entries
            class_counts[j] = {value: (feature_column == value).sum()
                               for value in distinct_values[j-1]}
        result[current_class] = class_counts
    return result
        

In [3]:
## Probability function: returns the probability of a class for the given data point.
## Laplace correction is applied to every feature-value count.
## Logarithms of the counts are used instead of the raw ratios, because multiplying
## many small probabilities together makes the product vanishingly small;
## adding their logarithms avoids that.
def probability(dictionary,x,current_class):
    """Return the log-probability score of ``current_class`` for the row ``x``.

    The score is ``log P(class)`` plus, for every feature, the log of the
    Laplace-corrected conditional probability of the observed value.

    Parameters
    ----------
    dictionary : dict
        Count dictionary with the layout produced by ``fit``.
    x : sequence
        One data row; ``x[j-1]`` is the value of feature ``j``.
    current_class : hashable
        Class key present in ``dictionary``.

    Returns
    -------
    float
        Sum of the log prior and the per-feature log likelihoods.
    """
    class_counts = dictionary[current_class]
    class_total = class_counts['total_count']
    ## log prior: log(rows of this class / all rows)
    output = np.log(class_total) - np.log(dictionary['total_data'])
    num_features = len(class_counts) - 1   ## every key except 'total_count' is a feature
    for j in range(1, num_features + 1):
        value_counts = class_counts[j]
        ## a value never seen during training has a count of zero; the +1 of the
        ## Laplace correction keeps its probability non-zero instead of raising KeyError
        smoothed_count = value_counts.get(x[j-1], 0) + 1
        smoothed_total = class_total + len(value_counts)
        output = output + (np.log(smoothed_count) - np.log(smoothed_total))
    return output
    
    

In [4]:
## Takes one single row, computes its probability under every class and returns
## the class with the highest probability.
def predictSinglePoint(dictionary,x):
    """Return the class whose ``probability`` score is highest for the row ``x``.

    Every key of ``dictionary`` except ``'total_data'`` is treated as a class.
    When several classes share the highest score the first one in dictionary
    order wins; when there is no class at all, -1 is returned.
    """
    ## 'total_data' holds the overall row count, not a class, so it is left out
    candidates = [label for label in dictionary.keys() if not (label == 'total_data')]
    ## max() keeps the first maximal element, i.e. later classes only win
    ## with a strictly greater score
    return max(candidates,
               key=lambda label: probability(dictionary, x, label),
               default=-1)

In [5]:
## Predict function: takes the fitted dictionary and x_test and
## returns the predicted classes y_pred.
def pred(dictionary,x_test):
    """Predict a class for every row of ``x_test``.

    Each row is passed to ``predictSinglePoint`` together with ``dictionary``;
    the predictions are returned as a list in the order of the rows.
    """
    return [predictSinglePoint(dictionary, row) for row in x_test]

In [6]:
## This function converts a column of continuous data
## into discrete values.
def makeLabelled(column):
    """Discretise a continuous column into the labels 0, 1, 2 and 3, in place.

    The three cut points are 0.5, 1.0 and 1.5 times the column mean. Values
    below the lowest cut point become 0, values below the middle one 1, values
    below the highest one 2 and everything else (including NaN) 3.

    Parameters
    ----------
    column : numpy.ndarray or pandas.Series
        One-dimensional numeric data. It is overwritten with the labels.

    Returns
    -------
    The same ``column`` object, now holding the labels.
    """
    mean = column.mean()
    ## for a negative mean 0.5*mean > mean > 1.5*mean, so the cut points are
    ## sorted to keep the four bins ordered and all of them reachable
    first_limit, second_limit, third_limit = sorted((0.5 * mean, mean, 1.5 * mean))

    values = np.asarray(column)
    ## start with the top label and lower it step by step; the labels go into
    ## a separate array so every comparison still sees the original values
    labels = np.full(values.shape, 3)
    labels[values < third_limit] = 2
    labels[values < second_limit] = 1
    labels[values < first_limit] = 0

    column[:] = labels   ## write back in place, as callers passing a view expect
    return column

In [7]:
from sklearn import datasets
## Load the iris dataset that ships with scikit-learn.
iris = datasets.load_iris()
## x is the feature matrix and y the class label of every row.
## NOTE(review): x is the very same object as iris.data, so any in-place
## change of x also changes iris.data.
x = iris.data
y = iris.target
## Wrap the features in a DataFrame and show it as the cell output.
df = pd.DataFrame(x)
df

Unnamed: 0,0,1,2,3
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


In [8]:
## Convert the values of all the columns into discrete labels.
## x is rebuilt from a copy of iris.data on every run, so this cell can be
## re-executed safely: the raw data is never overwritten and already labelled
## values are never labelled a second time.
x = iris.data.copy()
print(x.shape)
for i in range(0,x.shape[-1]):
    x[:,i] = makeLabelled(x[:,i])


(150, 4)


In [9]:
from sklearn import model_selection
## Hold out 25% of the rows as the test set; random_state = 0 is passed so
## that the split is the same on every run.
x_train,x_test,y_train,y_test = model_selection.train_test_split(x,y,test_size = 0.25,random_state = 0)


In [10]:
## Build the count dictionary from the training split.
result = fit(x_train,y_train)


In [11]:
## Predict a class for every test row with the hand-written classifier and
## print the predictions followed by the true labels for comparison.
y_pred = pred(result,x_test)
print(y_pred)
print(y_test)

[2, 1, 0, 2, 0, 2, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 2, 1, 0, 0, 2, 0, 0, 1, 1, 0, 2, 1, 0, 2, 2, 1, 0, 1]
[2 1 0 2 0 2 0 1 1 1 2 1 1 1 1 0 1 1 0 0 2 1 0 0 2 0 0 1 1 0 2 1 0 2 2 1 0
 1]


In [12]:
from sklearn.metrics import confusion_matrix,classification_report
## Evaluate the hand-written classifier: confusion matrix first,
## then the per-class classification report.
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred ))

[[13  0  0]
 [ 0 16  0]
 [ 0  1  8]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        13
           1       0.94      1.00      0.97        16
           2       1.00      0.89      0.94         9

    accuracy                           0.97        38
   macro avg       0.98      0.96      0.97        38
weighted avg       0.98      0.97      0.97        38



In [14]:
from sklearn.naive_bayes import GaussianNB
## Baseline for comparison: scikit-learn's GaussianNB, trained and evaluated
## on the same train/test split as the hand-written classifier.
clf = GaussianNB()
clf.fit(x_train,y_train)
## NOTE: y_pred is overwritten here; from now on it holds the GaussianNB predictions.
y_pred = clf.predict(x_test)
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred ))

[[11  2  0]
 [ 0 16  0]
 [ 0  3  6]]
              precision    recall  f1-score   support

           0       1.00      0.85      0.92        13
           1       0.76      1.00      0.86        16
           2       1.00      0.67      0.80         9

    accuracy                           0.87        38
   macro avg       0.92      0.84      0.86        38
weighted avg       0.90      0.87      0.87        38

