# Naive Bayes Classifier

In [1]:
import numpy as np

$$f(x | \mu, \sigma) = \frac{1}{\sigma \sqrt{2 \pi}} e^{-\frac{(x - \mu)^2}{2\sigma^2}}$$

In [22]:
class naivebayes_gauss(object):
    
    def __init__(self):
        pass
    
    def _log_gauss(self, X, s):
        '''
                X - data, s - mean (0) and std (1) matrix
        '''
        return np.log(np.exp(-(((X-s[:,0])**2)/(2*s[:,1]**2)))/(s[:,1]*np.sqrt(2*np.pi)))
    
    def fit(self, X, y):
        # separate data by class
        separated = [X[y==c] for c in np.unique(y)]
        
        self.model = np.array([np.c_[c.mean(axis=0), c.std(axis=0)] for c in separated])
        return self
    
    def predict_log_prob(self, X):
        return np.array([self._log_gauss(X, s).sum(axis=1) for s in self.model])
    
    def predict(self, X):
        return np.argmax(self.predict_log_prob(X), axis=0)

    def score(self, X, y):
        return np.where(self.predict(X) == y, True, False).sum() / y.shape[0] * 100

In [23]:
def split_train_test(dataset, split_ratio=0.67):
    '''
            split dataset to train and test data
            dataset must contain features (X) and targets (y)
            return 4 arrays: X_train, X_test, y_train, y_test
    '''
    # size of arrays
    size = dataset.shape[0]
    train_size = int(size * split_ratio)
    
    # random choosed indices
    shuffled_indices = np.random.permutation(size)
    
    # split indicies for training and test set by `train_size`
    train_indices, test_indices = shuffled_indices[:train_size], shuffled_indices[train_size:]
    
    # Create training and test sets by indicies
    train, test = dataset[train_indices,:], dataset[test_indices,:]
    
    return train[:,:-1], test[:,:-1], train[:,-1], test[:,-1]

## Pick data

In [29]:
data = np.genfromtxt('data.csv', delimiter=',')
X_train, X_test, y_train, y_test = split_train_test(data, split_ratio=0.67)

In [39]:
sep = np.array([X_train[y_train==c] for c in np.unique(y_train)])
model = np.array([np.c_[c.mean(axis=0), c.std(axis=0)] for c in sep])
model.shape

((342, 8), (2, 8, 2))

## Test model

In [40]:
nb_gauss = naivebayes_gauss()
nb_gauss.fit(X_train, y_train)

print('Accuracy on train set: ', nb_gauss.score(X_train, y_train), '%')
print('Accuracy on test set: ', nb_gauss.score(X_test, y_test), '%')

Accuracy on train set:  76.4591439689 %
Accuracy on test set:  72.0472440945 %


## Sklearn

In [27]:
from sklearn.naive_bayes import GaussianNB
nb_gauss_sk = GaussianNB()
nb_gauss_sk.fit(X_train, y_train)

print('Accuracy on train set: ', nb_gauss_sk.score(X_train, y_train)*100,  '%')
print('Accuracy on test set: ', nb_gauss_sk.score(X_test, y_test)*100, '%')

Accuracy on train set:  77.4319066148 %
Accuracy on test set:  74.0157480315 %
