# Naive Bayes Classifier from scratch

In [16]:
import numpy as np
import pandas as pd

## Load and prepare data [Classification]

In [18]:
# Read the Iris CSV, discard the "Id" column and expose the "species"
# column under the generic name "label".
clf_df = (
    pd.read_csv("Iris.csv")
    .drop(columns="Id")
    .rename(columns={"species": "label"})
)

In [19]:
# Preview the first three rows of the prepared frame.
clf_df.head(3)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,label
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa


In [20]:
# Features are every column except the last one; the last column holds the labels.
clf_X_df, clf_y_df = clf_df.iloc[:, :-1], clf_df.iloc[:, -1]

In [21]:
# Preview the first three rows of the feature frame.
clf_X_df.head(3)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2


In [22]:
# Preview the first three entries of the label series.
clf_y_df.head(3)

0    Iris-setosa
1    Iris-setosa
2    Iris-setosa
Name: label, dtype: object

## Algorithm

In [154]:
class NaiveBayesClassifier:
    """Naive Bayes classifier for categorical and continuous features.

    Every feature is modelled independently per class: a categorical feature
    by the relative frequency of each value inside the class, a continuous
    feature by a Gaussian with the class's mean and standard deviation.

    Parameters
    ----------
    n_unique_values_threshold : int, default 15
        A non-string feature with at most this many distinct training values
        is treated as categorical, otherwise as continuous.

    Attributes
    ----------
    feature_types : list of str or None
        ``"categorical"`` or ``"continuous"`` per feature, set by ``fit``.
    summaries : dict or None
        Maps each class to a list with one entry per feature:
        ``(value_counts_dict, n_rows)`` for categorical features and
        ``(mean, stdev, n_rows)`` for continuous ones.
    classes : numpy.ndarray or None
        Sorted unique class labels seen by ``fit``.
    """

    # Floor for per-class standard deviations. A feature that is constant
    # within a class has stdev 0, which would turn the Gaussian PDF into nan.
    _MIN_STDEV = 1e-9

    def __init__(self, n_unique_values_threshold=15):
        self.n_unique_values_threshold = n_unique_values_threshold
        self.feature_types = None
        self.summaries = None
        self.classes = None
        self._class_counts = None  # class label -> number of training rows

    def fit(self, X, y):
        """Learn per-class feature summaries from the training data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,)

        Raises
        ------
        ValueError
            If ``X`` is not two-dimensional or ``X`` and ``y`` differ in length.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError("X must be a 2-D array of shape (n_samples, n_features)")
        if len(X) != len(y):
            raise ValueError("X and y must contain the same number of samples")

        # For each feature, decide its type (categorical or continuous).
        self.feature_types = self._determine_type_of_features(X)
        self.classes = np.unique(y)
        self.summaries = self._summarize_by_class(X, y)

    def _summarize_dataset(self, rows):
        """Summarize every feature column of ``rows`` (the rows of one class)."""
        n_rows = len(rows)
        summaries = []
        for i, feature_type in enumerate(self.feature_types):
            column = rows[:, i]
            if feature_type == "categorical":
                values, counts = np.unique(column, return_counts=True)
                # Plain-Python keys so lookups work for str/float/int alike.
                value_counts = dict(zip(values.tolist(), counts.tolist()))
                summaries.append((value_counts, n_rows))
            else:  # continuous
                # Cast so object-dtype inputs are handled numerically.
                column = column.astype(float)
                summaries.append((np.mean(column), np.std(column), n_rows))
        return summaries

    def _summarize_by_class(self, X, y):
        """Return ``{class: per-feature summaries}`` and record class sizes."""
        class_rows_dict = self._split_by_class(X, y)
        self._class_counts = {
            class_value: len(rows) for class_value, rows in class_rows_dict.items()
        }
        return {
            class_value: self._summarize_dataset(rows)
            for class_value, rows in class_rows_dict.items()
        }

    def _split_by_class(self, X, y):
        """Return ``{class: feature rows belonging to that class}``.

        Only the feature columns are kept: appending the label column would
        coerce numeric features to strings whenever the labels are strings.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        return {unique_class: X[y == unique_class] for unique_class in self.classes}

    def _compute_gaussian_pdf(self, x, mean, stdev):
        """Return the Gaussian density of ``x`` for the given mean and stdev."""
        stdev = max(stdev, self._MIN_STDEV)
        return (1 / (np.sqrt(2 * np.pi) * stdev)) * np.exp(
            -(x - mean) ** 2 / (2 * stdev ** 2)
        )

    def _calculate_class_probabilities(self, row):
        """Return ``{class: prior * product of feature likelihoods}`` for ``row``.

        The values are unnormalized, so they need not sum to one. A
        categorical value never seen with a class contributes a likelihood
        of zero for that class.
        """
        total_rows = float(sum(self._class_counts.values()))
        probabilities = {}
        for class_value, class_summaries in self.summaries.items():
            probability = self._class_counts[class_value] / total_rows
            for value, feature_type, summary in zip(
                row, self.feature_types, class_summaries
            ):
                if feature_type == "categorical":
                    value_counts, n_rows = summary
                    probability *= value_counts.get(value, 0) / float(n_rows)
                else:
                    mean, stdev, _ = summary
                    probability *= self._compute_gaussian_pdf(
                        float(value), mean, stdev
                    )
            probabilities[class_value] = probability
        return probabilities

    def _predict_row(self, row):
        """Return the class with the highest score for a single sample."""
        probabilities = self._calculate_class_probabilities(row)
        # Index through self.classes explicitly instead of relying on the
        # dictionary's iteration order matching it.
        scores = [probabilities[class_value] for class_value in self.classes]
        return self.classes[int(np.argmax(scores))]

    def predict(self, X):
        """Predict a class label for every row of ``X``.

        Returns
        -------
        list
            One predicted label per row, in input order.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        if self.summaries is None:
            raise RuntimeError("fit must be called before predict")
        return [self._predict_row(row) for row in X]

    def _determine_type_of_features(self, X):
        """
        Determine whether each feature is categorical or continuous.

        A feature is categorical when its values are strings or when it has
        at most ``n_unique_values_threshold`` distinct values.
        """
        X = np.asarray(X)
        feature_types = []
        for feature_i in range(X.shape[1]):
            unique_values = np.unique(X[:, feature_i])
            is_text = unique_values.size > 0 and isinstance(unique_values[0], str)
            if is_text or len(unique_values) <= self.n_unique_values_threshold:
                feature_types.append("categorical")
            else:
                feature_types.append("continuous")
        return feature_types

In [155]:
# Create a classifier instance; it holds no learned state until fit() is called.
clf = NaiveBayesClassifier()

In [156]:
# Fit on the raw numpy arrays behind the feature frame and the label series.
clf.fit(clf_X_df.values, clf_y_df.values)

In [157]:
# NOTE: predictions are made on the same rows the classifier was fitted on,
# so any score computed from them is a training-set score.
predictions = clf.predict(clf_X_df.values)

In [160]:
def accuracy_classification(y_true, y_pred):
    """Return the fraction of predictions that equal the true labels.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, same shape as ``y_true``.

    Returns
    -------
    float
        Share of matching entries along the first axis; ``nan`` when the
        inputs are empty.

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_pred`` do not have the same shape.
    """
    # Convert first: comparing two plain lists with == yields a single bool
    # instead of an element-wise result.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.shape != y_pred.shape:
        raise ValueError("y_true and y_pred must have the same shape")
    if y_true.size == 0:
        return float("nan")
    accuracy = np.sum(y_true == y_pred, axis=0) / len(y_true)
    return accuracy

In [161]:
# Share of the stored predictions that match the labels in clf_y_df.
accuracy_classification(clf_y_df.values, predictions)

0.96