In [1]:
import scipy.io
from typing import Callable, List, Tuple
import pandas as pd
import numpy as np
from collections import Counter


In [2]:

# Read the MATLAB file that stores the train/test arrays
mat_data = scipy.io.loadmat('data/guassianBayes.mat')

# List every key found in the file (includes the MAT-file metadata entries)
print("Variables in MAT-file:", mat_data.keys())

# Wrap each stored array in a DataFrame in a single unpacking step
X_train, X_test, Y_train, Y_test = (
    pd.DataFrame(mat_data[key])
    for key in ("xTrain", "xTest", "yTrain", "yTest")
)

# Preview the first rows of the training features and of the training labels
print(X_train.head())
print(Y_train.head())

Variables in MAT-file: dict_keys(['__header__', '__version__', '__globals__', 'yTest', 'xTrain', 'xTest', 'yTrain'])
      0     1     2     3     4
0  0.44  0.28  0.43  0.27  0.37
1  0.31  0.36  0.58  0.94  0.94
2  0.58  0.55  0.57  0.70  0.74
3  0.38  0.44  0.43  0.20  0.31
4  0.29  0.28  0.50  0.42  0.50
   0
0  1
1  2
2  2
3  1
4  1


In [3]:
# Prior
def prior(y) -> np.ndarray:
    """Estimate the prior probability of each class from its relative frequency.

    Parameters
    ----------
    y : pandas.DataFrame or array-like
        Class labels. A DataFrame is expected to hold the labels in column
        ``0``; any other array-like (Series, list, 1-D or column-vector
        ndarray) is flattened to one dimension.

    Returns
    -------
    numpy.ndarray
        One probability per distinct label, ordered by ascending label value.
        Empty input yields an empty array.
    """
    labels = np.ravel(y[0] if isinstance(y, pd.DataFrame) else y)
    # np.unique sorts the distinct labels and returns their counts in the
    # same order, so no separate counting/sorting pass is needed.
    _, counts = np.unique(labels, return_counts=True)
    return counts / labels.size

def test_calc_prior():
    """Print the class priors of the training labels, then their total."""
    class_priors = prior(Y_train)
    for shown in (class_priors, sum(class_priors)):
        print(shown)
test_calc_prior()

[0.47247706 0.23394495 0.14220183 0.10091743 0.05045872]
1.0


In [4]:
def likelihood(X, Y) -> Tuple[np.ndarray, np.ndarray]:
    """Fit the per-class Gaussian parameters of every feature.

    Parameters
    ----------
    X : pandas.DataFrame
        One row per sample, one column per feature.
    Y : pandas.DataFrame or array-like
        Class label of each row of ``X``. A DataFrame is expected to hold the
        labels in column ``0``; any other array-like is flattened to 1-D.

    Returns
    -------
    (means, variances) : tuple of numpy.ndarray
        Two arrays indexed ``[feature_position, class_position]``, where
        classes are ordered by ascending label value. ``means`` holds the
        sample mean and ``variances`` the sample variance (``ddof=1``) of the
        feature over the rows belonging to the class. A class with a single
        sample therefore has a NaN variance.

    Raises
    ------
    ValueError
        If the number of labels differs from the number of rows of ``X``.
    """
    labels = np.ravel(Y[0] if isinstance(Y, pd.DataFrame) else Y)
    if labels.size != len(X):
        raise ValueError(
            f"X has {len(X)} rows but Y has {labels.size} labels"
        )

    # Grouping by a plain array pairs labels with rows by position, so the
    # result does not depend on X and Y sharing the same index. Group keys
    # come out sorted, which fixes the class order of the columns.
    per_class = X.groupby(labels)

    # groupby yields (class, feature); transpose to (feature, class).
    means = per_class.mean().T.to_numpy()
    variances = per_class.var().T.to_numpy()
    return means, variances

def test_calc_likelihood():
    """Print the fitted means, a separator line, then the fitted variances.

    Every value is floored to two decimal places and followed by a space;
    each row of a table ends with a newline.
    """
    def show(table):
        # One printed line per row of the table.
        for values in table:
            for value in values:
                print(value * 100 // 1 / 100, end = ' ')
            print()

    mean_table, variance_table = likelihood(X_train, Y_train)
    show(mean_table)
    print("###########")
    show(variance_table)
test_calc_likelihood()

0.35 0.47 0.65 0.73 0.68 
0.4 0.5 0.71 0.47 0.69 
0.45 0.52 0.43 0.58 0.76 
0.3 0.75 0.47 0.74 0.48 
0.39 0.71 0.38 0.76 0.31 
###########
0.01 0.04 0.01 0.01 0.0 
0.0 0.0 0.02 0.0 0.01 
0.0 0.01 0.0 0.0 0.0 
0.0 0.01 0.01 0.0 0.0 
0.0 0.03 0.01 0.0 0.01 


In [5]:
def pdf(x, mu, sigma):
    """Evaluate the Gaussian probability density N(x | mu, sigma**2).

    Parameters
    ----------
    x : float or numpy.ndarray
        Point(s) at which to evaluate the density.
    mu : float or numpy.ndarray
        Mean of the distribution.
    sigma : float or numpy.ndarray
        Standard deviation of the distribution (not the variance).

    Returns
    -------
    float or numpy.ndarray
        ``exp(-0.5 * ((x - mu) / sigma)**2) / (sigma * sqrt(2 * pi))``,
        broadcast over the inputs.
    """
    z = (x - mu) / sigma
    # The 1 / sqrt(2*pi) factor is what makes the curve integrate to 1;
    # without it the value is only proportional to the density.
    return np.exp(-0.5 * z ** 2) / (sigma * np.sqrt(2.0 * np.pi))
def naive_bayes_classify(X, means, variances, priors, classes=None):
    """Assign each sample to its most probable class under Gaussian naive Bayes.

    Parameters
    ----------
    X : pandas.DataFrame or 2-D array-like
        One row per sample, one column per feature. Features are matched to
        ``means`` / ``variances`` by position, not by column label.
    means, variances : array-like
        Indexed ``[feature_position, class_position]``: per-class mean and
        variance of every feature.
    priors : array-like
        Prior probability of each class, in the same class order.
    classes : array-like, optional
        Label to report for each class position. When omitted, class
        position ``k`` is reported as label ``k + 1``.

    Returns
    -------
    numpy.ndarray
        Predicted label of every row of ``X``. Ties go to the class that
        comes first.
    """
    features = np.asarray(X, dtype=float)
    means = np.asarray(means, dtype=float)
    variances = np.asarray(variances, dtype=float)
    with np.errstate(divide="ignore"):
        # A zero prior becomes -inf, which simply can never win the argmax.
        log_priors = np.log(np.asarray(priors, dtype=float))

    # Work with log-densities: multiplying many small densities underflows to
    # 0.0 for every class, after which the first class would win by default.
    # deviation has axes (sample, feature, class).
    deviation = features[:, :, np.newaxis] - means[np.newaxis, :, :]
    log_density = -0.5 * (
        np.log(2.0 * np.pi * variances) + deviation ** 2 / variances
    )

    # Naive independence assumption: sum the per-feature log-densities.
    scores = log_priors + log_density.sum(axis=1)
    winners = np.argmax(scores, axis=1)

    if classes is None:
        return winners + 1
    return np.asarray(classes)[winners]
        
# Fit the model on the training split, then label the test split.
priors = prior(Y_train)
means, variances = likelihood(X_train, Y_train)
preds = naive_bayes_classify(X_test, means, variances, priors)

In [6]:
# Show the distinct predicted labels, then the distinct test labels, one list per line.
print(sorted(np.unique(preds)), sorted(np.unique(Y_test[0])), sep="\n")


[np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5)]
[np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5)]


In [7]:
def calc_total_correct(preds, y) -> float:
    """Return the fraction of predictions that equal the true labels.

    Parameters
    ----------
    preds : array-like
        Predicted labels.
    y : array-like
        True labels, one per prediction.

    Returns
    -------
    float
        Number of matching positions divided by the number of samples;
        ``0.0`` when there are no samples.

    Raises
    ------
    ValueError
        If ``preds`` and ``y`` hold a different number of labels.
    """
    # Flatten both sides so that a column vector compared with a flat array
    # cannot broadcast into an (n, n) matrix and inflate the count.
    predicted = np.ravel(preds)
    actual = np.ravel(y)
    if predicted.size != actual.size:
        raise ValueError(
            "preds and y must hold the same number of labels, "
            f"got {predicted.size} and {actual.size}"
        )
    if predicted.size == 0:
        return 0.0
    return np.count_nonzero(predicted == actual) / predicted.size
# Overall share of test samples whose predicted label matches the true one
print(
    "Fraction of test samples classified correctly:",
    calc_total_correct(preds, Y_test[0]),
)


Fraction of test samples classified correctly: 0.8348623853211009


In [8]:
def precision(preds, y, class_number: int) -> float:
    """Return the precision of one class: TP / (TP + FP).

    Parameters
    ----------
    preds : array-like
        Predicted labels.
    y : array-like
        True labels, one per prediction.
    class_number : int
        Label of the class being evaluated.

    Returns
    -------
    float
        Share of the samples predicted as ``class_number`` whose true label
        is ``class_number``. ``0.0`` when the class was never predicted
        (the ratio is undefined there).

    Raises
    ------
    ValueError
        If ``preds`` and ``y`` hold a different number of labels.
    """
    predicted = np.ravel(preds)
    actual = np.ravel(y)
    if predicted.size != actual.size:
        raise ValueError(
            "preds and y must hold the same number of labels, "
            f"got {predicted.size} and {actual.size}"
        )

    selected = predicted == class_number
    n_selected = np.count_nonzero(selected)  # TP + FP
    if n_selected == 0:
        return 0.0
    true_positives = np.count_nonzero(actual[selected] == class_number)
    return true_positives / n_selected
# Report the precision of every class (labels run from 1 to 5).
for class_number in range(1, 6):
    print(f"Class{class_number} Precision:", precision(preds, Y_test[0], class_number))

Class1 Precsion: 0.9512195121951219
Class2 Precsion: 0.8095238095238095
Class3 Precsion: 0.9047619047619048
Class4 Precsion: 0.5
Class5 Precsion: 0.875


In [9]:
def recall(preds, y, class_number: int) -> float:
    """Return the recall of one class: TP / (TP + FN).

    Parameters
    ----------
    preds : array-like
        Predicted labels.
    y : array-like
        True labels, one per prediction.
    class_number : int
        Label of the class being evaluated.

    Returns
    -------
    float
        Share of the samples whose true label is ``class_number`` that were
        also predicted as ``class_number``. ``0.0`` when the class does not
        occur in ``y`` (the ratio is undefined there).

    Raises
    ------
    ValueError
        If ``preds`` and ``y`` hold a different number of labels.
    """
    predicted = np.ravel(preds)
    actual = np.ravel(y)
    if predicted.size != actual.size:
        raise ValueError(
            "preds and y must hold the same number of labels, "
            f"got {predicted.size} and {actual.size}"
        )

    relevant = actual == class_number
    n_relevant = np.count_nonzero(relevant)  # TP + FN
    if n_relevant == 0:
        return 0.0
    true_positives = np.count_nonzero(predicted[relevant] == class_number)
    return true_positives / n_relevant
# Report the recall of every class (labels run from 1 to 5).
for class_number in range(1, 6):
    print(f"Class{class_number} Recall:", recall(preds, Y_test[0], class_number))

Class1 Recall: 0.975
Class2 Recall: 0.6538461538461539
Class3 Recall: 0.9047619047619048
Class4 Recall: 0.6923076923076923
Class5 Recall: 0.7777777777777778


In [10]:
# Save the headline metrics, one per line with three decimals, in this order:
# accuracy, precision of class 1, recall of class 1, precision of class 5,
# recall of class 5.
with open("evaluation.txt", "wt") as f:
    # The generator evaluates and writes one metric at a time, in order.
    f.writelines(
        f"{metric(preds, Y_test[0], *extra):0.3f}\n"
        for metric, extra in (
            (calc_total_correct, ()),
            (precision, (1,)),
            (recall, (1,)),
            (precision, (5,)),
            (recall, (5,)),
        )
    )