In [10]:
from time import perf_counter
from math import log
from collections import defaultdict

In [11]:
def safe_log(x):
    return log(x) if x != 0 else float('-inf')

In [12]:
def read_train(filename):
    with open(filename) as f:
        data = f.readlines()

    n = len(data) - 1
    
    # read column headers
    line = data[0].strip().split(", ")
    attr_vars = line[:-1]
    class_var = line[-1]
    
    class_attr_counts = defaultdict( lambda: [defaultdict(int) for _ in attr_vars] )
    class_counts = defaultdict(int)
    
    attr_domains = [set() for _ in attr_vars]
    class_domain = set()

    # read data and update counts
    for line in data[1:]:
        line = line.strip().split(", ")
        c = line[-1]
        class_counts[c] += 1
        class_domain.add(c)
        for i, attr_val in enumerate(line[:-1]):
            class_attr_counts[c][i][attr_val] += 1
            attr_domains[i].add(attr_val)
    
    return n, attr_vars, class_var, class_attr_counts, class_counts, attr_domains, class_domain

In [13]:
def smoothed(smoothing, amt, val):
    if smoothing:
        return val + amt
    else:
        return val

In [14]:
def model_str(n, attr_vars, class_var, class_attr_counts, class_counts, attr_domains, class_domain, smoothing):
    res = []
    
    for c in class_domain:
        res.append(f"P({class_var}={c}) = {class_counts[c]/n}")
        for i, attr in enumerate(class_attr_counts[c]):
            total_s = smoothed(smoothing, len(attr_domains[i]), sum(attr.values()))
            for key in attr_domains[i]:
                val_s = smoothed(smoothing, 1, attr[key])
                res.append(f"P({attr_vars[i]}={key} | {class_var}={c}) = {val_s / total_s}")
    
    return '\n'.join(res)

In [15]:
def classify(file_in, file_out, n, class_attr_counts, class_counts, attr_domains, class_domain, smoothing):
    with open(file_in) as f:
        data = f.readlines()
    
    res = []
    
    for line in data[1:]:
        line = line.strip().split(", ")
        
        likelihoods = {c: safe_log(class_counts[c]/n) for c in class_domain}  # logs are used to prevent numerical underflow
        for c in class_domain:
            for i, attr_val in enumerate(line):
                likelihoods[c] += safe_log(smoothed(
                    smoothing, 
                    1, 
                    class_attr_counts[c][i][attr_val]))
                likelihoods[c] -= safe_log(smoothed(
                    smoothing, 
                    len(attr_domains[i]), 
                    sum(class_attr_counts[c][i].values())))
        
        res.append(max(likelihoods, key=lambda x: likelihoods[x]))
    
    with open(file_out, 'w') as f:
        f.write('\n'.join(res))

In [16]:
smoothing = True

In [17]:
start = perf_counter()
file_train = "NB-train.txt"
n, attr_vars, class_var, class_attr_counts, class_counts, attr_domains, class_domain = read_train(file_train)
train_res = model_str(n, attr_vars, class_var, class_attr_counts, class_counts, attr_domains, class_domain, smoothing)
end = perf_counter()

print(f"time elapsed (s): {end - start}\n")
print(train_res)

time elapsed (s): 0.0032063000001016917

P(C=Y) = 0.6206896551724138
P(A1=o | C=Y) = 0.38095238095238093
P(A1=y | C=Y) = 0.2857142857142857
P(A1=m | C=Y) = 0.3333333333333333
P(A2=t | C=Y) = 0.55
P(A2=f | C=Y) = 0.45
P(A3=t | C=Y) = 0.7
P(A3=f | C=Y) = 0.3
P(A4=e | C=Y) = 0.5238095238095238
P(A4=g | C=Y) = 0.3333333333333333
P(A4=f | C=Y) = 0.14285714285714285
P(C=N) = 0.3793103448275862
P(A1=o | C=N) = 0.2857142857142857
P(A1=y | C=N) = 0.35714285714285715
P(A1=m | C=N) = 0.35714285714285715
P(A2=t | C=N) = 0.15384615384615385
P(A2=f | C=N) = 0.8461538461538461
P(A3=t | C=N) = 0.07692307692307693
P(A3=f | C=N) = 0.9230769230769231
P(A4=e | C=N) = 0.21428571428571427
P(A4=g | C=N) = 0.21428571428571427
P(A4=f | C=N) = 0.5714285714285714


In [18]:
file_test = "NB-test.txt"
file_out = "NB_test_smoothing.txt"

start = perf_counter()
classify(file_test, file_out, n, class_attr_counts, class_counts, attr_domains, class_domain, smoothing)
end = perf_counter()

print(f"time elapsed (s): {end - start}\n")
print(f"written to file {file_out}")

time elapsed (s): 0.0249840999999833

written to file NB_test_smoothing.txt
