In [1]:
import pandas as pd
import numpy as np
from collections import Counter
from collections import defaultdict
from pprint import pprint

#from sklearn.naive_bayes import GaussianNB

In [2]:


class my_NB:
    """Naive Bayes classifier for categorical features with additive smoothing.

    After ``fit`` the instance exposes:
      * ``classes_``  - list of distinct class labels (sorted when the labels
        are mutually comparable, otherwise in order of first appearance);
      * ``n_classes`` - number of distinct class labels;
      * ``P_y``       - ``Counter`` holding the number of training rows per class;
      * ``P``         - nested dict, ``P[label][feature][value]`` is the smoothed
        estimate of P(feature = value | y = label).
    """

    def __init__(self, alpha=1):
        # alpha: smoothing factor
        # P(xi = t | y = c) = (N(t,c) + alpha) / (N(c) + n(i)*alpha)
        # where n(i) is the number of available categories (values) of feature i
        # Setting alpha = 1 is called Laplace smoothing
        self.alpha = alpha

    def fit(self, X, y):
        """Estimate class counts and smoothed conditional probabilities.

        Parameters
        ----------
        X : pd.DataFrame
            Independent variables, one categorical feature per column.
        y : list, np.array or pd.Series
            Dependent variable (int or str), one label per row of ``X``.
            Labels are matched to rows of ``X`` by position.

        Raises
        ------
        ValueError
            If ``y`` does not have exactly one label per row of ``X``.
        """
        # Materialise y once so that lists, arrays and Series are all handled
        # the same way (a plain list compared with `==` yields a single bool).
        labels = list(y)
        if len(labels) != len(X):
            raise ValueError(
                "X and y must have the same number of rows (got %d and %d)"
                % (len(X), len(labels))
            )

        # Deterministic class order; set() alone depends on hash randomisation.
        try:
            self.classes_ = sorted(set(labels))
        except TypeError:
            # Labels of mixed, non-comparable types: keep first-appearance order.
            self.classes_ = list(dict.fromkeys(labels))
        self.n_classes = len(self.classes_)

        # Per-class row counts; used as the (unnormalised) prior P(y).
        self.P_y = Counter(labels)

        # Every value observed for each feature, over the whole training set.
        all_possible_values = {key: set(X[key]) for key in X}

        # self.P[yj][Xi][xi] = P(xi|yj) where Xi is the feature name,
        # xi is the feature value and yj is a specific class label.
        self.P = {}
        for label in self.classes_:
            # Positional boolean mask of the rows belonging to this class.
            mask = np.fromiter(
                (value == label for value in labels), dtype=bool, count=len(labels)
            )
            class_rows = X.loc[mask]

            self.P[label] = {}
            for key in X:
                count = Counter(class_rows[key])
                denominator = (
                    self.P_y[label] + len(all_possible_values[key]) * self.alpha
                )
                self.P[label][key] = {
                    value: (count[value] + self.alpha) / denominator
                    for value in all_possible_values[key]
                }

        return

    def predict_proba(self, X):
        """Return the posterior probability of every class for every row.

        Parameters
        ----------
        X : pd.DataFrame
            Independent variables; every column must have been present in the
            frame passed to ``fit`` (otherwise a ``KeyError`` is raised).

        Returns
        -------
        pd.DataFrame
            One row per row of ``X`` (same index), one column per entry of
            ``self.classes_``; each row is normalised to sum to 1.

        Notes
        -----
        A feature value that was never seen during ``fit`` contributes a
        neutral factor of 1 for every class. Scores are accumulated in log
        space so that the product over many features does not underflow to
        zero before normalisation.
        """
        log_scores = {}
        for label in self.classes_:
            # Class counts act as the prior; the common 1/N factor cancels
            # during normalisation.
            log_p = pd.Series(np.log(self.P_y[label]), index=X.index, dtype=float)
            for key in X:
                # log(0) -> -inf is legitimate when alpha == 0; silence the warning.
                with np.errstate(divide="ignore"):
                    log_table = {
                        value: np.log(prob)
                        for value, prob in self.P[label][key].items()
                    }
                log_p = log_p + X[key].map(
                    lambda value, table=log_table: table.get(value, 0.0)
                ).astype(float)
            log_scores[label] = log_p

        log_scores = pd.DataFrame(log_scores, columns=self.classes_)

        # Shift each row by its maximum before exponentiating so the largest
        # term is exp(0) = 1 and the normalising sum cannot be zero.
        with np.errstate(invalid="ignore"):
            shifted = log_scores.sub(log_scores.max(axis=1), axis=0)
            weights = np.exp(shifted)
            probs = weights.div(weights.sum(axis=1), axis=0)
        return probs

    def predict(self, X):
        """Return the most probable class label for every row of ``X``.

        Parameters
        ----------
        X : pd.DataFrame
            Independent variables, same layout as for ``predict_proba``.

        Returns
        -------
        list
            One element of ``self.classes_`` per row of ``X``.
        """
        probs = self.predict_proba(X)
        # Columns of `probs` follow self.classes_, so the positional argmax
        # indexes directly into the class list.
        predictions = [self.classes_[np.argmax(row)] for row in probs.to_numpy()]
        return predictions

In [3]:
# Load the header-less training table.
data_train = pd.read_csv("../data/audiology_train.csv", header=None)

# Columns 0..68 are used as features and column 70 as the class label
# (column 69 is not used by this cell).
independent = range(69)
X, y = data_train[independent], data_train[70]

# Fit the categorical Naive Bayes model with its default smoothing.
clf = my_NB()
clf.fit(X, y)

# Load the test table and keep the same feature columns.
data_test = pd.read_csv("../data/audiology_test.csv", header=None)
X_test = data_test[independent]

# Hard predictions and the full class-probability table.
predictions = clf.predict(X_test)
probs = clf.predict_proba(X_test)

# One line per test row: predicted label, then the probability assigned to it.
for i, pred in enumerate(predictions):
    print("%s\t%f" % (pred, probs.at[i, pred]))


cochlear_age	0.999408
cochlear_age	0.999408
cochlear_age	0.875175
cochlear_age	0.484233
cochlear_age	0.992703
cochlear_age	0.997401
cochlear_age	0.998318
cochlear_age	0.998318
cochlear_poss_noise	0.902857
cochlear_unknown	0.611369
mixed_cochlear_unk_fixation	0.832907
mixed_cochlear_unk_fixation	0.755148
normal_ear	0.507668
normal_ear	0.990685
cochlear_age	0.997749
cochlear_age	0.992896
normal_ear	0.997311
mixed_cochlear_unk_fixation	0.930178
cochlear_age	0.982908
cochlear_age	0.996372
cochlear_age	0.959620
mixed_cochlear_unk_fixation	0.397127
normal_ear	0.997311
mixed_cochlear_unk_fixation	0.983080
cochlear_age_and_noise	0.619968
cochlear_poss_noise	0.601495
