# Naive Bayes Learner

In [1]:
# Load libraries
import numpy as np
import pandas as pd
from NaiveBayesClassifier import NaiveBayesClassifier

## Helper Functions

In [2]:
def preprocess(csv_path):
    """Reads and processes a csv data file. Returns a tuple of:
    (<2D list of instances>, <list of class labels>, <number of unique labels>).
    """
    
    df = pd.read_csv(csv_path, header=None)

    # Add a list of each instance for each attribute (the first N-1 columns in the DataFrame)
    instance_list = []
    if ((len(df.columns) > 1)):
        for attribute_index in range(0, (len(df.columns) - 2)):
            instance_list.append(df[attribute_index].tolist())
    
    # Make sure attribute instances are in String format
    for index in range (0, len(instance_list)):
        instance_list[index] = [str(i) for i in instance_list[index]]
        
    class_list = []
    if ((len(df.columns) > 0)):
        class_list = df[(len(df.columns) - 1)].tolist()
    class_list = [str(i) for i in class_list]
    
    n_classes = len(set(class_list))
    return instance_list, class_list, n_classes

In [3]:
def evaluate_model(predicted_classes, actual_classes):
    """Evaluates the number of correct predictions made by a Multinomial Naive Bayes classifier.
    Returns an accuracy score between [0,1].
    """
    
    n_correct = 0
    for test in range(len(predicted_classes)):
        if predicted_classes[test] == actual_classes[test]:
            n_correct += 1
    return n_correct / len(predicted_classes)

## Results

In [4]:
def test_and_print_results(dataset_csv_path):
    """Trains and evaluates a Multinomial Naive Bayes learner and prints an accuracy score"""
    
    data = preprocess(dataset_csv_path)
    
    NB = NaiveBayesClassifier()
    NB.train(data[0], data[1])
    predicted_classes = NB.predict(data[0])
    acc = evaluate_model(predicted_classes, data[1])
    print('Acc: '+ '{0:.2f}'.format(acc * 100) + '% for ' + dataset_csv_path.split('/')[-1] \
          +  ' with ' + str(len(predicted_classes)) + ' instances')

In [5]:
# Print results with laplace smoothing
print('Multinomial Naive Bayes (Laplace Smoothing)')
print('------------------------------------------')
test_and_print_results('data/breast-cancer.csv')
test_and_print_results('data/car.csv')
test_and_print_results('data/hypothyroid.csv')
test_and_print_results('data/mushroom.csv')

Multinomial Naive Bayes (Laplace Smoothing)
------------------------------------------
Acc: 75.52% for breast-cancer.csv with 286 instances
Acc: 69.85% for car.csv with 1728 instances
Acc: 95.19% for hypothyroid.csv with 3163 instances
Acc: 95.53% for mushroom.csv with 8124 instances
