In [37]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix

In [38]:
iris = datasets.load_iris()

X = iris.data[:,0:4]
y = iris.target


X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42, stratify=y
)

num_classes = len(np.unique(y))
num_features = X.shape[1]

s = np.zeros((num_features, num_classes))
u = np.zeros((num_features, num_classes))

def print_cm(y_combined, y_pred):
    cm = confusion_matrix(y_combined, y_pred, labels=[0, 1, 2])
    print(' number in each class down vs number in each known class across ')
    print(' confusion matrix \n 0 1 2\n', cm.T) ## transpose

def PGauss(mu, sig, x):
    return np.exp(-np.power(x - mu, 2.) / (2 * np.power(sig, 2.) + 1e-300) )

for c in range(num_classes): 
    for f in range(num_features):
        u[f, c] = X[np.where(y==c), f].mean()
        s[f, c] = X[np.where(y==c), f].std()
 
print("stdev:")
print(s)

print("\nMu:")
print(u)


stdev:
[[0.34894699 0.51098337 0.62948868]
 [0.37525458 0.31064449 0.31925538]
 [0.17191859 0.46518813 0.54634787]
 [0.10432641 0.19576517 0.27188968]]

Mu:
[[5.006 5.936 6.588]
 [3.428 2.77  2.974]
 [1.462 4.26  5.552]
 [0.246 1.326 2.026]]


In [39]:
priors = np.zeros(num_classes)
for c in range(num_classes):
    priors[c] = np.sum(y == c) / len(y)

# Predictions on all the data

In [40]:
y_pred = np.zeros_like(y)

for i in range(len(X)):
    scores = np.zeros(num_classes)
    
    for c in range(num_classes):
        score_c = priors[c]
        for f in range(num_features):
            score_c *= PGauss(u[f, c], s[f, c], X[i, f]) 
        scores[c] = score_c

    y_pred[i] = np.argmax(scores)

print(y_pred)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 2 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1
 1 1 1 2 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 1 2 2 2 2
 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]


In [41]:
y_pred = np.zeros_like(y_test)

for i in range(len(X_test)):
    scores = np.zeros(num_classes)
    
    for c in range(num_classes):
        score_c = priors[c]
        for f in range(num_features):
            score_c *= PGauss(u[f, c], s[f, c], X_test[i, f]) 
        scores[c] = score_c

    y_pred[i] = np.argmax(scores)

misclassified_indices = np.where(y_test != y_pred)[0]
num_misclassified = len(misclassified_indices)
accuracy = 1.0 - (num_misclassified / len(y_test))

print(f"Number of samples in train: {len(X_train)}")
print(f"Number of samples in test: {len(X_test)}")
print(f"Number features {num_features}")

print("\nTEST RESULTS")
print("Number of misclassifications:", num_misclassified)
print("Accuracy: {:.2f}%".format(accuracy * 100))

print("Indices of misclassified samples:", misclassified_indices)
print("Actual labels:", y_test[misclassified_indices])
print("Predicted labels:", y_pred[misclassified_indices])

print_cm(y_test, y_pred)


Number of samples in train: 105
Number of samples in test: 45
Number features 4

TEST RESULTS
Number of misclassifications: 4
Accuracy: 91.11%
Indices of misclassified samples: [ 2  3 39 42]
Actual labels: [2 1 1 2]
Predicted labels: [1 2 2 1]
 number in each class down vs number in each known class across 
 confusion matrix 
 0 1 2
 [[15  0  0]
 [ 0 13  2]
 [ 0  2 13]]


In [42]:
y_pred = np.zeros_like(y_train)

for i in range(len(X_train)):
    scores = np.zeros(num_classes)
    
    for c in range(num_classes):
        score_c = priors[c]
        for f in range(num_features):
            score_c *= PGauss(u[f, c], s[f, c], X_train[i, f]) 
        scores[c] = score_c

    y_pred[i] = np.argmax(scores)

print("TRAIN RESULTS")
misclassified_indices = np.where(y_train != y_pred)[0]
num_misclassified = len(misclassified_indices)
accuracy = 1.0 - (num_misclassified / len(y_train))

print("\nNumber of misclassifications :", num_misclassified)
print("Accuracy : {:.2f}%".format(accuracy * 100))

print("Indices of misclassified samples:", misclassified_indices)
print("Actual labels:", y_train[misclassified_indices])
print("Predicted labels:", y_pred[misclassified_indices])

print_cm(y_train, y_pred)

TRAIN RESULTS

Number of misclassifications : 4
Accuracy : 96.19%
Indices of misclassified samples: [19 28 42 63]
Actual labels: [1 1 2 1]
Predicted labels: [2 2 1 2]
 number in each class down vs number in each known class across 
 confusion matrix 
 0 1 2
 [[35  0  0]
 [ 0 32  1]
 [ 0  3 34]]


In [43]:
model = GaussianNB()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

misclassified_indices = np.where(y_test != y_pred)[0]
num_misclassified = len(misclassified_indices)
accuracy = 1.0 - (num_misclassified / len(y_test))

print(f"Number of samples in train: {len(X_train)}")
print(f"Number of samples in test: {len(X_test)}")
print(f"Number features {num_features}")

print("TEST RESULTS")
print("\nNumber of misclassifications:", num_misclassified)
print("Accuracy : {:.2f}%".format(accuracy * 100))

print("Indices of misclassified samples:", misclassified_indices)
print("Actual labels:", y_test[misclassified_indices])
print("Predicted labels:", y_pred[misclassified_indices])

print_cm(y_test, y_pred)

y_pred = model.predict(X_train)

misclassified_indices = np.where(y_train != y_pred)[0]
num_misclassified = len(misclassified_indices)
accuracy = 1.0 - (num_misclassified / len(y_train))

print("TRAIN RESULTS")
print("\nNumber of misclassifications test:", num_misclassified)
print("Accuracy Test: {:.2f}%".format(accuracy * 100))

print("Indices of misclassified samples:", misclassified_indices)
print("Actual labels:", y_train[misclassified_indices])
print("Predicted labels:", y_pred[misclassified_indices])

print_cm(y_train, y_pred)


Number of samples in train: 105
Number of samples in test: 45
Number features 4
TEST RESULTS

Number of misclassifications: 4
Accuracy : 91.11%
Indices of misclassified samples: [ 2 30 39 42]
Actual labels: [2 2 1 2]
Predicted labels: [1 1 2 1]
 number in each class down vs number in each known class across 
 confusion matrix 
 0 1 2
 [[15  0  0]
 [ 0 14  3]
 [ 0  1 12]]
TRAIN RESULTS

Number of misclassifications test: 2
Accuracy Test: 98.10%
Indices of misclassified samples: [19 42]
Actual labels: [1 2]
Predicted labels: [2 1]
 number in each class down vs number in each known class across 
 confusion matrix 
 0 1 2
 [[35  0  0]
 [ 0 34  1]
 [ 0  1 34]]
