In [19]:
import numpy as np

In [25]:
import math

def mean(nums):
    """Return the arithmetic mean of ``nums``.

    ``nums`` must support both ``sum()`` and ``len()``. An empty collection
    raises ``ZeroDivisionError``.
    """
    total = sum(nums)
    count = len(nums)
    return total / count

def stdev(nums):
    """Return the population standard deviation of ``nums``.

    The squared deviations from the mean are averaged over ``len(nums)``
    (not ``len(nums) - 1``) before the square root is taken.
    """
    centre = mean(nums)
    squared_deviations = [(value - centre) ** 2 for value in nums]
    variance = sum(squared_deviations) / len(nums)
    return math.sqrt(variance)

def summarize(data):
    """Compute per-class feature statistics and class priors.

    ``data`` is iterated as ``(features, label)`` pairs whose label is
    ``'yes'`` or ``'no'``; any other label raises ``KeyError``.

    Returns a 3-tuple ``(summaries, prob_yes, prob_no)``. ``summaries`` maps
    each label to a list of ``(mean, stdev)`` tuples, one per feature column
    of the rows carrying that label. ``prob_yes`` and ``prob_no`` are the
    fractions of rows carrying each label.
    """
    by_label = {'yes': [], 'no': []}
    for features, label in data:
        by_label[label].append(features)

    def column_stats(rows):
        # zip(*rows) transposes the rows into per-feature columns.
        return [(mean(column), stdev(column)) for column in zip(*rows)]

    summaries = {}
    summaries['yes'] = column_stats(by_label['yes'])
    prob_yes = len(by_label['yes']) / len(data)
    summaries['no'] = column_stats(by_label['no'])
    prob_no = len(by_label['no']) / len(data)
    return summaries, prob_yes, prob_no


def f_x(summaries, input, prob_yes, prob_no):
    """Return the unnormalised Gaussian naive Bayes score of each class.

    For each class, the prior is multiplied by the normal density of every
    feature value in ``input`` under that class's ``(mean, stdev)`` summary.
    The number of features is taken from ``summaries`` itself, so the same
    function serves datasets with any number of features (e.g. the full and
    the feature-selected folds) without editing a constant.

    Args:
        summaries: dict with keys ``'yes'`` and ``'no'``, each a sequence of
            ``(mean, stdev)`` pairs, one per feature.
        input: sequence of feature values, indexed in the same order as the
            summaries. Values beyond the number of summaries are ignored.
            (The name shadows the builtin; it is kept so existing callers
            keep working.)
        prob_yes: prior probability of class ``'yes'``.
        prob_no: prior probability of class ``'no'``.

    Returns:
        dict mapping ``'yes'`` and ``'no'`` (in that order) to the prior
        multiplied by the product of the per-feature densities. A class
        with no summaries scores exactly its prior.

    Raises:
        IndexError: if ``input`` has fewer values than a class has summaries.
        ZeroDivisionError: if the stdev of any feature is zero.
    """
    priors = {'yes': prob_yes, 'no': prob_no}
    probabilities = {}
    for label, prior in priors.items():
        score = prior
        # mu/sigma rather than mean/stdev, so the module-level helpers of
        # those names are not shadowed inside this function.
        for i, (mu, sigma) in enumerate(summaries[label]):
            exponent = -(math.pow(input[i] - mu, 2) / (2 * math.pow(sigma, 2)))
            score *= math.exp(exponent) / (math.sqrt(2 * math.pi) * sigma)
        probabilities[label] = score
    return probabilities

def predict(summaries, input, prob_yes, prob_no):
    """Return the label whose score from ``f_x`` is largest.

    All arguments are forwarded unchanged to ``f_x``. On a tie, the label
    that appears first in the returned dict wins.
    """
    scores = f_x(summaries, input, prob_yes, prob_no)
    best_label = max(scores, key=lambda label: scores[label])
    return best_label

def _read_csv_rows(filename):
    """Return the comma-split fields of every non-blank line in ``filename``."""
    with open(filename, 'r') as file:
        # Skip whitespace-only lines (e.g. a trailing empty line) so they are
        # not parsed as an instance with an empty label or feature.
        return [line.strip().split(",") for line in file if line.strip()]


def classify_nb(training_filename, testing_filename):
    """Train on one file and return predicted labels for another.

    Args:
        training_filename: path to a text file with one instance per line:
            comma-separated numeric feature values followed by the class
            label as the last field.
        testing_filename: path to a text file with one instance per line:
            comma-separated numeric feature values only.

    Returns:
        list of predicted labels, one per non-blank line of the testing
        file, in file order.

    Raises:
        ValueError: if a feature field cannot be converted to ``float``.
    """
    training_data = [
        # Strip the label so "1, 2, yes" and "1,2,yes" yield the same key.
        ([float(x) for x in row[:-1]], row[-1].strip())
        for row in _read_csv_rows(training_filename)
    ]
    testing_data = [
        [float(x) for x in row]
        for row in _read_csv_rows(testing_filename)
    ]

    summaries, prob_yes, prob_no = summarize(training_data)
    return [
        predict(summaries, features, prob_yes, prob_no)
        for features in testing_data
    ]

In [17]:
def calculate_accuracy(labels_true, labels_pred):
    """Return the fraction of ``labels_true`` matched by ``labels_pred``.

    Labels are compared pairwise in order. The denominator is
    ``len(labels_true)``, so an empty ``labels_true`` raises
    ``ZeroDivisionError``.
    """
    hits = 0
    for expected, predicted in zip(labels_true, labels_pred):
        if expected == predicted:
            hits += 1
    return hits / len(labels_true)

In [23]:
results = np.zeros(10)

# Score the classifier on each of the ten folds; fold files are numbered from 1.
for i in range(10):
    fold = i + 1
    training_filename = "numerical_cv/train_fold_{}.txt".format(fold)
    test_filename = "numerical_cv/test_fold_{}.txt".format(fold)
    label_filename = "numerical_cv/label_fold_{}.txt".format(fold)

    pred = classify_nb(training_filename, test_filename)

    # One expected label per line; only the newline is removed.
    with open(label_filename, "r") as file:
        labels = file.readlines()
    true_labels = [label.strip("\n") for label in labels]

    results[i] = calculate_accuracy(true_labels, pred)

# Mean accuracy across the folds (displayed as the cell output).
np.mean(results)

0.7488209159261792

In [26]:
results = np.zeros(10)

# Score the classifier on each of the ten folds; fold files are numbered from 1.
for i in range(10):
    fold = i + 1
    training_filename = "numerical_cv_cfs/train_fold_{}.txt".format(fold)
    test_filename = "numerical_cv_cfs/test_fold_{}.txt".format(fold)
    label_filename = "numerical_cv_cfs/label_fold_{}.txt".format(fold)

    pred = classify_nb(training_filename, test_filename)

    # One expected label per line; only the newline is removed.
    with open(label_filename, "r") as file:
        labels = file.readlines()
    true_labels = [label.strip("\n") for label in labels]

    results[i] = calculate_accuracy(true_labels, pred)

# Mean accuracy across the folds (displayed as the cell output).
np.mean(results)

0.7668831168831169