In [13]:
from sklearn import datasets
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load the mushroom dataset; the relative path means the CSV must sit in the
# notebook's working directory.
df = pd.read_csv('mushrooms.csv')
# Last expression of the cell: shows (rows, columns) as a quick sanity check.
df.shape

(8124, 23)

In [14]:
# Label-encode the frame: DataFrame.apply calls le.fit_transform once per column,
# so each column's categories are mapped to integer codes independently.
# NOTE(review): the single `le` is re-fitted on every column in turn, so afterwards
# it presumably only holds the mapping of the last column processed — confirm
# before relying on it for inverse_transform.
le = LabelEncoder()
ds = df.apply(le.fit_transform)
data = ds.values
# Column 0 of the encoded matrix is used as the target vector.
y = data[:,0]
y.shape

(8124,)

In [15]:
# Every column after the first (the target) is used as a feature.
x = data[:,1:]
x.shape

(8124, 22)

In [16]:
# Hold out 20% of the rows for evaluation. random_state pins the shuffle so the
# split, and every metric computed from it, is identical on each Restart & Run All.
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42)

In [17]:
# Sanity check: features and labels of each split must agree on the row count.
print(x_train.shape,y_train.shape)
print(x_test.shape,y_test.shape)

(6499, 22) (6499,)
(1625, 22) (1625,)


In [18]:
# List the distinct encoded class labels present in the training split.
np.unique(y_train)

array([0, 1])

In [19]:
def prior_prob(y_train,label):
    """Estimate the prior probability P(Y = label) from the training labels.

    Args:
        y_train: array of training labels (must expose ``.shape``).
        label: the class whose relative frequency is wanted.

    Returns:
        The fraction of entries in ``y_train`` that equal ``label``.
    """
    n_total = y_train.shape[0]
    # Summing the boolean mask counts how many labels match.
    n_matching = (y_train == label).sum()
    return n_matching / float(n_total)

In [20]:
def cond_prob(x_train,y_train,feature_col,feature_val,label,alpha=0.0):
    """Estimate P(X[feature_col] = feature_val | Y = label) from the training set.

    Args:
        x_train: 2-D training feature matrix (rows are samples).
        y_train: 1-D array of training labels aligned with the rows of ``x_train``.
        feature_col: index of the feature column to inspect.
        feature_val: the value of that feature whose likelihood is wanted.
        label: the class to condition on.
        alpha: additive (Laplace) smoothing strength. The default ``0.0`` gives
            the plain relative frequency; a positive value keeps feature values
            never seen with ``label`` from getting probability exactly zero.

    Returns:
        ``(count + alpha) / (class_count + alpha * n_values)`` where ``count`` is
        the number of ``label`` rows whose feature equals ``feature_val``,
        ``class_count`` is the number of ``label`` rows and ``n_values`` is the
        number of distinct values the feature takes in ``x_train``. Returns
        ``0.0`` when that denominator is zero (no row has ``label`` and no
        smoothing is requested) instead of ``nan``.

    Raises:
        ValueError: if ``alpha`` is negative.
    """
    if alpha < 0:
        raise ValueError("alpha must be non-negative")

    class_mask = y_train == label
    feature_column = x_train[:, feature_col]

    matches = np.sum(feature_column[class_mask] == feature_val)
    class_count = np.sum(class_mask)

    # The number of distinct feature values is only needed when smoothing.
    n_values = np.unique(feature_column).size if alpha else 0
    denominator = class_count + alpha * n_values

    # 0/0 would yield nan, which np.argmax treats as the maximum downstream.
    if denominator == 0:
        return 0.0

    return (matches + alpha) / float(denominator)

In [21]:
def predict(x_train,y_train,x_test):
    """Classify a single sample with a categorical Naive Bayes rule.

    For every class present in ``y_train`` the score
    ``log P(Y = label) + sum_f log P(X_f = x_test[f] | Y = label)`` is built from
    ``prior_prob`` and ``cond_prob``; the class with the highest score wins.
    Working with log-probabilities ranks classes the same way as the product of
    probabilities, without underflowing when there are many features.

    Args:
        x_train: 2-D training feature matrix (rows are samples, columns features).
        y_train: 1-D array of training labels aligned with the rows of ``x_train``.
        x_test: one sample, indexable by feature column.

    Returns:
        The predicted class label, i.e. an element of ``np.unique(y_train)``.
        When every class scores zero probability, the first class is returned.
    """
    classes = np.unique(y_train)
    n_features = x_train.shape[1]

    log_posteriors = []

    # A zero conditional probability legitimately maps to log(0) = -inf, which
    # simply rules that class out; silence the divide warning it would raise.
    with np.errstate(divide="ignore"):
        for label in classes:
            log_post = np.log(prior_prob(y_train, label))

            for f in range(n_features):
                log_post += np.log(cond_prob(x_train, y_train, f, x_test[f], label))

            log_posteriors.append(log_post)

    # argmax yields a position in `classes`; map it back to the actual label so
    # the result is right even when labels are not 0..k-1.
    return classes[np.argmax(log_posteriors)]

In [22]:
def score(X_train,X_test,Y_train,Y_test):
    """Compute the accuracy of ``predict`` over every row of ``X_test``.

    Args:
        X_train: training feature matrix handed to ``predict``.
        X_test: feature matrix to evaluate; rows are classified one by one.
        Y_train: training labels handed to ``predict``.
        Y_test: true labels for the rows of ``X_test``.

    Returns:
        The value of ``accuracy_score(Y_test, predictions)``.
    """
    n_samples = X_test.shape[0]
    predictions = [
        predict(X_train, Y_train, X_test[row])
        for row in range(n_samples)
    ]
    return accuracy_score(Y_test, predictions)

In [23]:
# Evaluate the hand-written classifier on the held-out split. Every test row is
# predicted by re-scanning the training data, so this cell may be slow.
score(x_train,x_test,y_train,y_test)

0.9969230769230769