In [5]:
import pandas as pd
import numpy as np
from sklearn import metrics 
import math
from sklearn.model_selection import train_test_split

# Load data

In [6]:
df = pd.read_csv("spambase.data", header = None)

# Create training and test set

In [7]:
# get the locations
X = df.iloc[:, :-1]
y = df.iloc[:, -1]
 
# split the dataset
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, stratify=y)

# Create probabilistic model

In [8]:
# from series to numpy
X_train = X_train.to_numpy()
y_train = y_train.to_numpy()
X_test = X_test.to_numpy()
y_test = y_test.to_numpy()

In [9]:
# get the prior P(0) and P(1)
y_train = y_train.reshape(len(y_train),1)
 
count0 = 0
count1 = 0
for i in range(len(y_train)):
  if y_train[i,0] == 1:
    count1 += 1
  else:
    count0 += 1

# calculate the prior probability for each class
P_class_1 = count1 / len(y_train) # 0.4
P_class_0 = count0 / len(y_train) # 0.6

print("P(0) is: ", P_class_0)
print("P(1) is: ", P_class_1)

P(0) is:  0.6060869565217392
P(1) is:  0.3939130434782609


In [10]:
class_0 = []
class_1 = []

for i in range(len(X_train)):
  if y_train[i] == 1:
    class_1.append(X_train[i,:])
  else:
    class_0.append(X_train[i,:])

In [11]:
# compute the mean and standard deviation for 57 features
class_0 = pd.DataFrame(class_0)
mean_0 = class_0.mean(axis = 0)
std_0 = class_0.std()

class_1 = pd.DataFrame(class_1)
mean_1 = class_1.mean(axis = 0)
std_1 = class_1.std()

In [12]:
# combine the mean and std for each class (each matrix should be 2 by 57) 
mean = np.vstack((mean_0, mean_1))
std = np.vstack((std_0, std_1))

In [13]:
# assign a minimal std (0.0001) if a feature has zero std
for i in range(std.shape[0]):
  for j in range(std.shape[1]):
    if std[i,j] == 0:
      std[i,j] = 0.0001 

# Run Naive Bayes on the test data

In [14]:
N0 = np.zeros((X_test.shape[0], X_test.shape[1]))
N1 = np.zeros((X_test.shape[0], X_test.shape[1]))

for i in range(X_test.shape[0]):
  N0[i,:] = (1 / (math.sqrt(2 * math.pi) * std[0,:])) * np.exp(-np.power((X_test[i,:] - mean[0,:]), 2) / (2 * np.power(std[0,:], 2)))
  N1[i,:] = (1 / (math.sqrt(2 * math.pi) * std[1,:])) * np.exp(-np.power((X_test[i,:] - mean[1,:]), 2) / (2 * np.power(std[1,:], 2)))

In [None]:
pred = np.array([0 for k in range(y_test.shape[0])]) # store the predictions

for i in range(X_test.shape[0]):
  class_NB_0 = np.log(P_class_0) + np.sum(np.log(N0[i,:]))
  class_NB_1 = np.log(P_class_1) + np.sum(np.log(N1[i,:]))
  
  if class_NB_0 >= class_NB_1:
    pred[i] = 0
  else:
    pred[i] = 1

Compute confusion matrix, accuracy, recall, precision, and F1 score

In [17]:
confusion_matrix = metrics.confusion_matrix(y_test, pred)
accuracy = metrics.accuracy_score(y_test, pred)
recall = metrics.recall_score(y_test, pred) # tp / (tp + fn)
precision = metrics.precision_score(y_test, pred) # tp / (tp + fp)
F1_score = metrics.f1_score(y_test, pred) 

print("Confusion matrix is: \n", confusion_matrix)
print("Accuracy is: ", accuracy)
print("Recall is: ", recall)
print("Precision is: ", precision)
print("F1 score is: ", F1_score)

Confusion matrix is: 
 [[1038  356]
 [  50  857]]
Accuracy is:  0.823554976097349
Recall is:  0.9448732083792724
Precision is:  0.7065127782357791
F1 score is:  0.808490566037736


Check correlations between features using the covariant matrix 

In [29]:
np.cov(X_train)

array([[9556.75138043,  519.25879723, 1488.40143646, ..., 1733.29090707,
        2808.02064388, 1582.46187712],
       [ 519.25879723,   29.56624964,   81.54368106, ...,  100.08896558,
         152.96511432,   88.4488867 ],
       [1488.40143646,   81.54368106,  232.70571564, ...,  272.66120262,
         437.46057042,  247.5945606 ],
       ...,
       [1733.29090707,  100.08896558,  272.66120262, ...,  343.96037847,
         510.93927078,  298.90557727],
       [2808.02064388,  152.96511432,  437.46057042, ...,  510.93927078,
         825.35394353,  465.70759844],
       [1582.46187712,   88.4488867 ,  247.5945606 , ...,  298.90557727,
         465.70759844,  267.00310704]])