In [198]:
from time import perf_counter
from math import log
from collections import defaultdict

In [199]:
def read_train(filename):
    """Read a labelled training file and tally the counts used by naive Bayes.

    The first non-blank line is the header; every following non-blank line is
    one example.  Fields are separated by ``", "`` and the last field of each
    example is the class label, which must be ``0`` or ``1``.  Blank lines
    (e.g. extra newlines at the end of the file) are ignored.

    Parameters:
        filename: path of the training file.

    Returns:
        A tuple ``(n, attr_names, class_var, class_attr_counts, class_count)``:
        n                 -- number of examples read (blank lines excluded)
        attr_names        -- header names of the attribute columns
        class_var         -- header name of the class column
        class_attr_counts -- ``class_attr_counts[c][i][v]`` is how many examples
                             of class ``c`` have value ``v`` for attribute ``i``
                             (each innermost mapping is a ``defaultdict(int)``)
        class_count       -- number of examples whose label is 1

    Raises:
        ValueError: if a class label is not an integer or is not 0 or 1.
    """
    with open(filename) as f:
        # drop blank lines up front so they are neither parsed nor counted in n
        rows = [stripped for stripped in (raw.strip() for raw in f) if stripped]

    # read column headers
    header = rows[0].split(", ")
    attr_names = header[:-1]
    class_var = header[-1]

    examples = rows[1:]
    n = len(examples)

    class_attr_counts = [[defaultdict(int) for _ in attr_names] for _ in range(2)]
    class_count = 0

    # read data and update counts
    for row in examples:
        fields = row.split(", ")
        c = int(fields[-1])
        if c not in (0, 1):
            # a negative label would otherwise silently index from the end of the list
            raise ValueError(f"class label must be 0 or 1, got {fields[-1]!r}")
        class_count += c
        for i, attr_val in enumerate(fields[:-1]):
            class_attr_counts[c][i][attr_val] += 1

    return n, attr_names, class_var, class_attr_counts, class_count

In [200]:
def smoothed(smoothing, amt, val):
    """Return ``val`` increased by ``amt`` when ``smoothing`` is truthy, else ``val`` unchanged."""
    return val + amt if smoothing else val

In [201]:
def classify(file_in, file_out, n, class_attr_counts, class_count, smoothing):
    """Classify every row of ``file_in`` and write one label per row to ``file_out``.

    The first line of ``file_in`` is treated as a header and skipped; blank
    lines are ignored.  Each remaining line is split on ``", "`` and its i-th
    field is looked up as a value of attribute ``i``.

    Parameters:
        file_in: path of the file with the rows to classify.
        file_out: path of the file the labels are written to ("0"/"1",
            newline-separated, no trailing newline).
        n: number of training examples.
        class_attr_counts: ``class_attr_counts[c][i][v]`` is the number of
            training examples of class ``c`` with value ``v`` for attribute
            ``i``.  The mappings are only read, never modified.
        class_count: number of training examples with class 1.
        smoothing: when truthy, use add-one smoothing:
            ``(count + 1) / (class_total + K_i)``, where ``K_i`` is the number
            of distinct values of attribute ``i`` seen in training across both
            classes, so both classes are smoothed over the same support.
            When falsy, a value never seen with a class gives that class zero
            likelihood.

    Without smoothing, if class 1 has zero likelihood the row is labelled "0"
    (even if class 0 has zero likelihood too); otherwise, if class 0 has zero
    likelihood, the row is labelled "1".
    """
    with open(file_in) as f:
        data = f.readlines()

    def safe_log(x):
        # a zero prior (single-class training data) becomes -inf instead of raising
        return log(x) if x > 0 else float("-inf")

    # logs are used to prevent numerical underflow
    log_priors = [safe_log(1 - class_count / n), safe_log(class_count / n)]

    # per-class, per-attribute totals do not depend on the row: compute them once
    totals = [[sum(counts.values()) for counts in class_attr_counts[c]] for c in range(2)]

    # K_i: distinct values actually observed (count > 0) for attribute i in either class
    domain_sizes = [
        len({v for counts in pair for v, k in counts.items() if k > 0})
        for pair in zip(class_attr_counts[0], class_attr_counts[1])
    ]

    res = []

    for raw in data[1:]:
        stripped = raw.strip()
        if not stripped:
            continue  # blank line: nothing to classify
        fields = stripped.split(", ")

        likelihoods = list(log_priors)
        impossible = [False, False]  # only ever set when smoothing is off
        for c in range(2):
            for i, attr_val in enumerate(fields):
                # .get() instead of [] so an unseen value is not inserted into a defaultdict
                count = class_attr_counts[c][i].get(attr_val, 0)
                if smoothing:
                    likelihoods[c] += log(count + 1)
                    likelihoods[c] -= log(totals[c][i] + domain_sizes[i])
                elif count == 0:
                    impossible[c] = True
                    break
                else:
                    likelihoods[c] += log(count)
                    likelihoods[c] -= log(totals[c][i])

        if impossible[1]:
            res.append("0")
        elif impossible[0]:
            res.append("1")
        elif likelihoods[0] > likelihoods[1]:
            res.append("0")
        else:
            res.append("1")

    with open(file_out, 'w') as f:
        f.write('\n'.join(res))

In [202]:
def model_str(n, attr_names, class_var, class_attr_counts, class_count):
    """Render the unsmoothed model probabilities as newline-separated text.

    The first two lines give the class priors (class 1, then class 0).  They
    are followed, for class 0 and then class 1, by one line per attribute
    value giving its relative frequency within that class.
    """
    prior_one = class_count / n
    lines = [
        f"P({class_var}=1) = {prior_one}",
        f"P({class_var}=0) = {1 - prior_one}",
    ]

    for label in (0, 1):
        for name, value_counts in zip(attr_names, class_attr_counts[label]):
            denom = sum(value_counts.values())
            lines.extend(
                f"P({name}={value} | {class_var}={label}) = {count / denom}"
                for value, count in value_counts.items()
            )

    return "\n".join(lines)

In [203]:
# Train the model: read the training file (path is relative to the working
# directory) and tally the per-class attribute-value counts.
file_train = "naive_bayes_train.txt"
n, attr_names, class_var, class_attr_counts, class_count = read_train(file_train)

In [204]:
# Render the trained model's relative frequencies as text.
# Only the model_str() call is timed; reading the training file is not included.
start = perf_counter()
train_res = model_str(n, attr_names, class_var, class_attr_counts, class_count)
end = perf_counter()

print(f"time elapsed (s): {end - start}\n")
print(train_res)

time elapsed (s): 0.00018279999494552612

P(C=1) = 0.5625
P(C=0) = 0.4375
P(A1=b | C=0) = 0.42857142857142855
P(A1=d | C=0) = 0.14285714285714285
P(A1=a | C=0) = 0.14285714285714285
P(A1=c | C=0) = 0.2857142857142857
P(A2=n | C=0) = 0.2857142857142857
P(A2=m | C=0) = 0.2857142857142857
P(A2=o | C=0) = 0.42857142857142855
P(A3=y | C=0) = 0.2857142857142857
P(A3=z | C=0) = 0.2857142857142857
P(A3=x | C=0) = 0.2857142857142857
P(A3=w | C=0) = 0.14285714285714285
P(A1=a | C=1) = 0.4444444444444444
P(A1=c | C=1) = 0.3333333333333333
P(A1=b | C=1) = 0.2222222222222222
P(A2=m | C=1) = 0.3333333333333333
P(A2=n | C=1) = 0.5555555555555556
P(A2=o | C=1) = 0.1111111111111111
P(A3=z | C=1) = 0.2222222222222222
P(A3=w | C=1) = 0.2222222222222222
P(A3=x | C=1) = 0.3333333333333333
P(A3=y | C=1) = 0.2222222222222222


In [205]:
# Toggle for the smoothing argument of classify(); True enables count smoothing.
smoothing = True

In [206]:
# Classify the rows of the test file and write the predicted labels to file_out.
# Both paths are relative to the working directory.
file_test = "naive_bayes_test.txt"
file_out = "naive_bayes_output.txt"

# The timed span covers reading the test file, classifying, and writing the output.
start = perf_counter()
classify(file_test, file_out, n, class_attr_counts, class_count, smoothing)
end = perf_counter()

print(f"time elapsed (s): {end - start}\n")
print(f"written to file {file_out}")

time elapsed (s): 0.0046885000192560256

written to file naive_bayes_output.txt
