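# Naive Bayes

A from-scratch Gaussian Naive Bayes classifier built with NumPy and applied to a small two-feature, two-class toy dataset.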

In [1]:
%matplotlib inline
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import accuracy_score

In [2]:
# Toy dataset: each row is [feature_1, feature_2, class_label].
# The class label (0 or 1) is the last column; because the array has a
# single dtype, the labels are stored as floats alongside the features.
dataset = np.array([[3.393533211,2.331273381,0],
    [3.110073483,1.781539638,0],
    [1.343808831,3.368360954,0],
    [3.582294042,4.67917911,0],
    [2.280362439,2.866990263,0],
    [7.423436942,4.696522875,1],
    [5.745051997,3.533989803,1],
    [9.172168622,2.511101045,1],
    [7.792783481,3.424088941,1],
    [7.939820817,0.791637231,1]])

# Separate By Class

In [3]:
def separate_by_class(dataset):
    """Group the rows of ``dataset`` by their class label.

    The label is read from the last element of each row and converted with
    ``int`` before being used as the dictionary key.

    Args:
        dataset: Sequence of rows whose final element is the class label.

    Returns:
        dict mapping each integer label to the list of rows carrying that
        label, with rows kept in their original order.
    """
    groups = {}
    for row in dataset:
        # setdefault creates the bucket the first time a label is seen.
        groups.setdefault(int(row[-1]), []).append(row)
    return groups

In [4]:
# Group the dataset rows by class label and display the resulting dict.
separate = separate_by_class(dataset)
separate

{0: [array([ 3.39353321,  2.33127338,  0.        ]),
  array([ 3.11007348,  1.78153964,  0.        ]),
  array([ 1.34380883,  3.36836095,  0.        ]),
  array([ 3.58229404,  4.67917911,  0.        ]),
  array([ 2.28036244,  2.86699026,  0.        ])],
 1: [array([ 7.42343694,  4.69652288,  1.        ]),
  array([ 5.745052 ,  3.5339898,  1.       ]),
  array([ 9.17216862,  2.51110105,  1.        ]),
  array([ 7.79278348,  3.42408894,  1.        ]),
  array([ 7.93982082,  0.79163723,  1.        ])]}

# Summarize Dataset

In [5]:
def summarize_dataset(dataset):
    """Compute per-column statistics for every column except the last.

    Args:
        dataset: Sequence of equal-length rows; the final column is treated
            as the class label and its statistics are discarded.

    Returns:
        list of ``(mean, sample standard deviation, count)`` tuples, one per
        remaining column. The standard deviation uses ``ddof=1``.
    """
    stats = []
    # zip(*dataset) transposes the rows so each iteration sees one column.
    for values in zip(*dataset):
        stats.append((np.mean(values), np.std(values, ddof=1), np.size(values)))
    # The trailing entry belongs to the label column, not to a feature.
    stats.pop()
    return stats


In [6]:
# Per-feature (mean, sample standard deviation, count) over the whole dataset.
summarize_dataset(dataset)

[(5.1783333864999994, 2.7665845055177263, 10),
 (2.9984683241000001, 1.2185563436174469, 10)]

# Summarize Data By Class

In [7]:
def summarize_by_class(dataset):
    """Compute the per-feature statistics separately for each class.

    Args:
        dataset: Sequence of rows whose final element is the class label.

    Returns:
        dict mapping each class label to the list of per-feature summary
        tuples produced by ``summarize_dataset`` for that class's rows.
    """
    return {
        label: summarize_dataset(rows)
        for label, rows in separate_by_class(dataset).items()
    }

In [8]:
# Same per-feature statistics, computed separately for each class label.
summarize_by_class(dataset)

{0: [(2.7420144012000001, 0.92656832892980179, 5),
  (3.0054686691999999, 1.1073295894898725, 5)],
 1: [(7.6146523718000001, 1.2344321550313704, 5),
  (2.9914679790000003, 1.4541931384601618, 5)]}

# Gaussian Probability Density Function

In [9]:
def calculate_probability(x, mean, stdev, min_stdev=1e-9):
    """Evaluate the Gaussian probability density function at ``x``.

    Args:
        x: Value (or array of values) at which to evaluate the density.
        mean: Mean of the Gaussian.
        stdev: Standard deviation of the Gaussian.
        min_stdev: Lower bound applied to ``stdev`` before it is used.
            Standard deviations at or above this bound are unaffected.

    Returns:
        The density of the Gaussian with the given mean and (floored)
        standard deviation, evaluated at ``x``.
    """
    # A feature that is constant within a class has stdev == 0, which would
    # otherwise divide by zero: NaN for NumPy scalars, ZeroDivisionError for
    # plain Python floats. Flooring keeps the density finite.
    stdev = np.maximum(stdev, min_stdev)
    exponent = np.exp(-((x - mean) ** 2 / (2 * stdev ** 2)))
    return 1 / (np.sqrt(2 * np.pi) * stdev) * exponent

# Class Probabilities

In [10]:
def calculate_class_probabilities(summaries, row):
    """Score ``row`` against every class described in ``summaries``.

    Each score is the class prior (the class's row count divided by the total
    row count) multiplied by the Gaussian density of every feature value.
    The scores are not normalised, so they do not sum to one.

    Args:
        summaries: dict mapping class label to a list of
            ``(mean, stdev, count)`` tuples, one per feature.
        row: Indexable sequence of feature values; only the first
            ``len(summaries[label])`` entries are read.

    Returns:
        dict mapping each class label to its unnormalised score.
    """
    # The row count of a class is taken from its first feature summary.
    sample_total = 0
    for label in summaries:
        sample_total += summaries[label][0][2]

    scores = {}
    for label, feature_stats in summaries.items():
        score = feature_stats[0][2] / float(sample_total)
        for feature_index, (mean, stdev, _count) in enumerate(feature_stats):
            score *= calculate_probability(row[feature_index], mean, stdev)
        scores[label] = score
    return scores

# Predict

In [16]:
def predict(summaries, row):
    """Return the class label with the highest score for ``row``.

    The per-class scores are printed before the label is chosen. When
    several classes share the top score, the first one in iteration order
    wins; ``None`` is returned when there are no classes at all.

    Args:
        summaries: Per-class feature summaries, as accepted by
            ``calculate_class_probabilities``.
        row: Feature values of the sample to classify.

    Returns:
        The best-scoring class label, or ``None`` if ``summaries`` is empty.
    """
    scores = calculate_class_probabilities(summaries, row)
    print('probabilities:{}'.format(scores))
    # max() keeps the earliest entry on ties, like a strict ">" comparison.
    return max(scores, key=scores.get, default=None)

In [12]:
def naive_bayes(train, test):
    """Fit per-class summaries on ``train`` and classify every row of ``test``.

    Args:
        train: Training rows whose final element is the class label.
        test: Rows to classify, one prediction per row.

    Returns:
        list of predicted class labels, in the same order as ``test``.
    """
    model = summarize_by_class(train)
    return [predict(model, sample) for sample in test]

In [13]:
# Features are every column but the last; the last column is the class label.
X, y = dataset[:, :-1], dataset[:, -1]
kfold = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=42)

# Only one split is generated, so take it straight from the generator.
train_index, test_index = next(kfold.split(X, y))

In [14]:
# Show which row indices ended up in the train and test partitions.
train_index, test_index

(array([6, 4, 1, 8, 7, 5, 2]), array([0, 3, 9]))

In [17]:
# Fit on the training rows and predict a label for every test row.
# predict() also prints the per-class scores for each row it classifies.
nb = naive_bayes(dataset[train_index], dataset[test_index])
print(nb)

probabilities:{1: 0.00038816279329386501, 0: 0.037419036468759097}
probabilities:{1: 0.00063359686269312901, 0: 0.0014180099110437897}
probabilities:{1: 0.00062366280752299073, 0: 6.1922465930838682e-12}
[0, 0, 1]
