In [143]:
""" (Gaussian) Naive Bayes
P(A|B) = P(B|A)*P(A)/P(B)

Assumes the probability of each attribute belonging to a given class 
value is independent of all other attributes. We will model p(data|class)
as p(x_i|y) = 1/np.sqrt(2*np.pi*np.var(x in class y)) * np.exp(-(x-np.mean(x in class y))**2/(2*np.var(x in class y))).

IMPROVEMENT IDEAS: smoothing (what happens when a class never sees a feature?), how would we do this for strings,
log the probabilities so they suffer less from numerical roundoff, multinomial naive bayes, how to make the independence
assumption less strong (e.g. bigrams)
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

plt.style.use('ggplot')

In [139]:
# Load the raw table. The file has no header row, so pandas numbers the columns.
data = pd.read_csv('../data/pima.txt', header=None)

# The last column is taken to be the class label; collect its distinct
# values in ascending order.
labels = sorted(data.iloc[:, -1].unique())

# One frame of predictor columns (everything except the last column) per
# class.
# NOTE: the list follows the order of the sorted labels. The predict
# function defined below relies on that ordering to map an index back
# to a label.
class_features = [
    data.loc[data.iloc[:, -1] == label].iloc[:, :-1] for label in labels
]

# Per-class (mean, variance) of every predictor column.
class_stats = [
    (features.mean(), features.var()) for features in class_features
]

In [140]:
def p_x_given_y(x, y_stats, min_var=1e-9):
    """ Compute the Gaussian likelihood of a feature vector x under one class.

    Each feature is scored with a univariate normal density built from that
    feature's mean and variance in the class, and the per-feature densities
    are multiplied together (the naive independence assumption).

    Parameters
    ----------
    x : array-like
        Feature values of a single sample.
    y_stats : sequence
        y_stats[0] holds the per-feature means for the class and y_stats[1]
        the per-feature variances, both aligned with x.
    min_var : float, optional
        Lower bound applied to every variance before it is used. A feature
        that is constant within a class has variance 0, which would otherwise
        evaluate to 1/0 * exp(-0/0) = NaN and poison the whole product.
        Variances at or above this bound are used unchanged.

    Returns
    -------
    The product of the per-feature densities.
    """
    y_mean, y_var = y_stats[0], y_stats[1]

    # Clamp degenerate variances so the density stays finite.
    y_var = np.maximum(y_var, min_var)

    density = 1/(np.sqrt(2*np.pi*y_var)) * np.exp(-(((x-y_mean)**2)/(2*y_var)))
    return np.prod(density)

In [141]:
def predict(x, class_stats, labels, priors=None):
    """ Classify a feature vector x with Gaussian Naive Bayes and return
    the label of the highest-scoring class.

    Parameters
    ----------
    x : array-like
        Feature values of a single sample.
    class_stats : sequence
        One entry per class, each in the form expected by p_x_given_y.
    labels : sequence
        Class labels, in the same order as class_stats.
    priors : sequence of float, optional
        Prior probability P(y) of each class, in the same order as
        class_stats. When omitted, classes are ranked by the likelihood
        p(x|y) alone, which is equivalent to assuming equal priors.

    Returns
    -------
    The entry of labels whose class has the largest score.

    Raises
    ------
    ValueError
        If priors is given but its length differs from class_stats.
    """
    likelihoods = [p_x_given_y(x, y_stats) for y_stats in class_stats]

    if priors is None:
        scores = likelihoods
    else:
        if len(priors) != len(class_stats):
            raise ValueError('priors must have one entry per class')
        # Bayes' rule up to the shared normaliser P(x): P(y|x) ~ p(x|y) * P(y)
        scores = [prior * likelihood
                  for prior, likelihood in zip(priors, likelihoods)]

    return labels[np.argmax(scores)]

In [142]:
# Score every row of the dataset and report the fraction predicted correctly.
# The rows scored here are the same rows the class statistics were computed
# from, so this is an in-sample accuracy.
total = len(data)
correct = 0

for i in range(total):
    # Predictors are every column except the last; the last is the true label
    x = data.iloc[i, :-1]
    y = data.iloc[i, -1]

    # Label of the highest-scoring class for this row
    pred = predict(x, class_stats, labels)

    # A match adds 1 to the tally, a miss adds 0
    correct += int(pred == y)

print('Accuracy across dataset: %.2f'%(correct/total))

Accuracy across dataset: 0.75
