<h1> Naive Bayes Implementation </h1>

<p> In this assignment Machine Learning Algorithms such as Naive Bayes and K Nearest Neigbors are implemented and tested </p>

In [1]:
import numpy as np
import pandas as pd

In [2]:
def get_dataframe(filename="irisTraining.txt",header=None,delimiter=" "):
    """This function reads the data from given files and return a pandas datafram object.
    
    Args:
        filename(string): The name of file that contains dataset
        header: Header of the file
        delimiter(string): the character that sperates columns of data
    Returns:
        pandas dataframe: A pandas datafram object containing the data from the filename
    """
    dataframe_df = pd.read_csv(filename,sep=delimiter,header=header) #read files
    return dataframe_df #return dataframe

In [3]:
iris_train_df = get_dataframe(filename="datasets/irisTraining.txt",header=None,delimiter=" ")
iris_train_df

Unnamed: 0,0,1,2,3,4
0,5.9,-1.0,4.2,1.5,1
1,6.9,-1.1,4.9,1.5,1
2,6.6,-1.9,4.6,1.3,1
3,4.6,-1.2,1.4,0.2,-1
4,6.0,-1.2,4.0,1.0,1
...,...,...,...,...,...
95,7.4,-1.8,6.1,1.9,-1
96,4.9,-1.4,-1.3,1.0,1
97,7.0,-1.2,4.7,1.4,1
98,5.5,-1.4,-1.7,1.0,1


## Implementation of Naive Bayes Algorithm

In [4]:
def calculate_prior_probabilities(dataset_df):
    """
    """
    columns = dataset_df.columns
    label_column = columns[len(columns)-1]
    labels = dataset_df[label_column].to_numpy()
    unique_labels = set(labels)
    total_count = len(labels)
    prior_probabilities = {}
    
    for label in unique_labels:
        label_count = len(labels[labels==label])
        prior = label_count / total_count
        log_prior = np.log(prior)
        prior_probabilities[label] = [prior,log_prior]
        
    return prior_probabilities

In [5]:
prior_probalitites = calculate_prior_probabilities(dataset_df=iris_train_df)
print(prior_probalitites)
%time 

{1: [0.34, -1.0788096613719298], -1: [0.66, -0.4155154439616658]}
Wall time: 0 ns


In [67]:
class GaussianNaiveBayes():
    def __init__(self):
        self.__features_mean = {}
        self.__features_sdevs = {}
        self.__unique_labels = []
        self.__trainset_df = None
        self.__prior_probabilities = {}
        
    def fit(self,train_data_df):
        """ THis function computes gaussian likehood 
        """
        #set train set for model
        self.__trainset_df = train_data_df.copy()
        
        #  label column
        label_column = list(self.__trainset_df.columns)
        label_column = label_column[len(label_column)-1]
        labels = train_data_df[label_column].to_numpy() #get labels for prior cacluations
        total_count = len(labels)
        
        # get list of class labels
        self.__unique_labels = set(self.__trainset_df[label_column])
        
        #caclulate gaussian parameters
        for label in self.__unique_labels:
            
            # calcuate means and standard deviations
            self.__features_mean[label] = list(self.__trainset_df[self.__trainset_df[label_column]==label].mean()) #get mean for specified classlabel  
            self.__features_sdevs[label] = list(self.__trainset_df[self.__trainset_df[label_column]==label].std()) #get standard deviation for specified classlabel    
    
            #mean and standard deviation for label coumn is not required so drop then
            self.__features_mean[label].pop()
            self.__features_sdevs[label].pop()
            
            #prior probabity calculations
            label_count = len(labels[labels==label])
            prior = (label_count * 1.0) / (total_count * 1.0)
            self.__prior_probabilities[label] = prior


    def print_model_parameters(self):
        print("Classes: ",self.__unique_labels)
        print("")
        print("Means : ", self.__features_mean)
        print("")
        print("Standard Deviations: ",self.__features_sdevs)
        print("")
        print("Prior Probabilities: ", self.__prior_probabilities)
        
    
    def predict_probabilities(self,test_set_df):
        """
        """
        column_names = test_set_df.columns
        label_column = len(column_names) - 1
        feature_matrix = test_set_df.drop([label_column],axis=1)
        feature_matrix = feature_matrix.to_numpy()
        labels_list = []
        probabilities = []
        
        # calculate gaussain likelihoods
        for label in self.__unique_labels:
            cur_matrix = feature_matrix - np.array(self.__features_mean[label])
            cur_matrix = cur_matrix **2
            cur_matrix = cur_matrix / (2 * (np.array(self.__features_sdevs[label])**2) )
            cur_matrix = np.exp((-1 * cur_matrix))
            cur_matrix = cur_matrix * (1.0/np.sqrt(2*np.pi*(np.array(self.__features_sdevs[label]))))
            
            if cur_matrix.ndim > 1:
                cur_matrix = np.prod(cur_matrix,axis=1)
            else:
                cur_matrix = np.prod(cur_matrix)
            
            #multiply by prior probabilities
            #cur_matrix = cur_matrix * self.__prior_probabilities[label]
            
            labels_list.append(label)
            probabilities.append(cur_matrix)
        
        # make probabilities into single dataframe
        probabilities_df = pd.DataFrame(probabilities) #convert to dataframe
        probabilities_df = probabilities_df.transpose()
        probabilities_df.columns = labels_list #add labels
        return probabilities_df
    
    def predict_labels(self,test_set_df):
        probabilities_df = self.predict_probabilities(test_set_df)
        labels_list = probabilities_df.columns
        probabilities_df = probabilities_df.to_numpy()
        max_index = np.argmax(probabilities_df,axis=1) #get maximum of each row, axis= 1 horizontal, max each row
        return np.array(labels_list[max_index])
        
    def evaluate(self,test_set_df):
        column_names = test_set_df.columns
        label_column = len(column_names) - 1
        actual_labels = test_set_df[label_column].to_numpy()
        predicted_labels = self.predict_labels(test_set_df)
        agreement =  ( (1.0 * sum(actual_labels == predicted_labels)) / (1.0 * len(predicted_labels)) ) * 100
        print("Model Accuracy: %0.3f%%"%agreement)
        return agreement

In [68]:
model = GaussianNaiveBayes()

In [69]:
model.fit(iris_train_df)

In [70]:
model.print_model_parameters()

Classes:  {1, -1}

Means :  {1: [5.847058823529412, -1.388235294117647, 3.005882352941176, 1.3], -1: [5.771212121212121, -1.3181818181818181, 3.548484848484849, 0.22272727272727275]}

Standard Deviations:  {1: [0.5124127681076908, 0.3198150089171844, 2.5404534523024638, 0.21602468994692867], -1: [0.9419725868157492, 0.7743799685324615, 2.080328555725489, 1.091617776801647]}

Prior Probabilities:  {1: 0.34, -1: 0.66}


In [101]:
sindex = 0
eindex = 5
test = iris_train_df.iloc[sindex:eindex]
print(test)
print("")
df = model.predict_probabilities(test)
df

     0    1    2    3  4
0  5.9 -1.0  4.2  1.5  1
1  6.9 -1.1  4.9  1.5  1
2  6.6 -1.9  4.6  1.3  1
3  4.6 -1.2  1.4  0.2 -1
4  6.0 -1.2  4.0  1.0  1



Unnamed: 0,1,-1
0,0.02345609,0.008605
1,0.003361666,0.003767
2,0.006550744,0.00545
3,7.050555e-09,0.005267
4,0.02399215,0.014317


In [102]:
model.predict_labels(test)

array([ 1, -1,  1, -1,  1], dtype=int64)

In [103]:
model.evaluate(iris_train_df)

Model Accuracy: 95.000%


95.0

In [104]:
iris_test_df = get_dataframe(filename="datasets/irisTesting.txt",header=None,delimiter=" ")

In [106]:
model.evaluate(iris_test_df)

Model Accuracy: 98.000%


98.0

In [107]:
model.predict_labels(iris_test_df)

array([ 1,  1, -1, -1,  1, -1, -1, -1, -1, -1,  1, -1,  1, -1,  1, -1, -1,
        1, -1,  1, -1, -1,  1,  1, -1, -1,  1, -1,  1, -1, -1, -1, -1, -1,
       -1, -1, -1,  1, -1,  1, -1, -1,  1,  1, -1, -1,  1, -1, -1, -1],
      dtype=int64)

In [109]:
iris_test_df[4].to_numpy()

array([ 1,  1, -1, -1,  1, -1, -1, -1, -1, -1,  1, -1,  1, -1,  1, -1, -1,
       -1, -1,  1, -1, -1,  1,  1, -1, -1,  1, -1,  1, -1, -1, -1, -1, -1,
       -1, -1, -1,  1, -1,  1, -1, -1,  1,  1, -1, -1,  1, -1, -1, -1],
      dtype=int64)