In [108]:
import numpy as np
import pandas as pd
# Load the raw table and turn the "?" placeholder into a real missing value
# so that pandas' NA tooling (dropna) can see it.
mush = pd.read_csv("mushroom.csv").replace("?", np.nan)

# Report how many columns would survive dropping every column that has an NA.
n_cols_before = len(mush.columns)
n_cols_after = len(mush.dropna(axis = 1).columns)
print("Initially: ", n_cols_before, "columns. After dropping NA: ", n_cols_after, "columns.")

Initially:  20 columns. After dropping NA:  19 columns.


In [109]:
# Fit a categorical naive Bayes model by counting value frequencies per class.
#
# Names defined here and used by later cells:
#   target   - name of the label column
#   features - Index of all non-label columns
#   classes  - distinct label values (taken before the split)
#   test     - held-out 30% of the rows
#   mush     - rebound to the remaining 70% (the training rows)
#   probs    - probs[class][column][value] = P(value | class) on the training rows
#   probcl   - probcl[class] = P(class) on the training rows

# Drop every column that contains a missing value.
mush = mush.dropna(axis = 1)
target = "class"
features = mush.columns[mush.columns != target]
classes = mush[target].unique()

# A fixed seed keeps the train/test split (and therefore the reported
# accuracies) identical across kernel restarts.
test = mush.sample(frac = 0.3, random_state = 42)
mush = mush.drop(test.index)

probs = {}
probcl = {}
for x in classes:
    mushcl = mush.loc[mush[target] == x, features]
    tot = len(mushcl)
    # Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
    # If a class has no training rows, value_counts() is empty and no division
    # by tot happens.
    probs[x] = {
        col: {val: cnt / tot for val, cnt in mushcl[col].value_counts().items()}
        for col in mushcl.columns
    }
    probcl[x] = tot / len(mush)

In [110]:
def probabs(x):
    """Return the unnormalised naive Bayes score of every class for one sample.

    For each class in the module-level ``classes``, the score is the class
    prior ``probcl[cl]`` multiplied by ``probs[cl][col][val]`` for every
    (column, value) pair of ``x``.

    Parameters
    ----------
    x : pd.Series
        One sample; the index holds the feature names, the values hold the
        observed feature values.

    Returns
    -------
    dict
        Maps each class label to its score. A class scores 0 as soon as one
        (column, value) pair of ``x`` has no entry in ``probs`` for it.

    Raises
    ------
    IOError
        If ``x`` is not a ``pd.Series``. The exception type is kept as-is so
        that existing callers catching it keep working.
    """
    if not isinstance(x, pd.Series):
        raise IOError("Arg must be of type Series")
    probab = {}
    for cl in classes:
        pr = probcl[cl]
        # Series.items() replaces Series.iteritems(), removed in pandas 2.0.
        for col, val in x.items():
            try:
                pr *= probs[cl][col][val]
            except KeyError:
                # Value (or column) never seen for this class: the product is
                # zero whatever the remaining features say, so stop early.
                pr = 0
                break
        probab[cl] = pr
    return probab

In [111]:
def classify(x):
    """Return the class label with the highest score from ``probabs(x)``.

    When several classes share the highest score, the one listed first wins.
    An empty string is returned when no class has a score above zero.
    """
    scores = probabs(x)
    # max() keeps the first maximal item, the same tie-breaking as a strict ">" scan.
    label, score = max(scores.items(), key=lambda item: item[1], default=("", 0))
    if score > 0:
        return label
    return ""

In [112]:
# Score the classifier on the training rows: one True/False per row, True when
# the predicted label equals the stored label.
b = [classify(mush.loc[i,features]) == mush.loc[i,target] for i in mush.index]
train_hits = sum(b)
train_rows = len(mush)
print("Train dataset: ")
print(train_hits, " correct out of ", train_rows)
print("Accuracy: ", train_hits / train_rows)

Train dataset: 
45  correct out of  45
Accuracy:  1.0


In [113]:
# Score the classifier on the held-out rows: one True/False per row, True when
# the predicted label equals the stored label.
b = [classify(test.loc[i,features]) == test.loc[i,target] for i in test.index]
test_hits = sum(b)
test_rows = len(test)
print("Test dataset: ")
print(test_hits, " correct out of ", test_rows)
print("Accuracy: ", test_hits / test_rows)

Test dataset: 
19  correct out of  20
Accuracy:  0.95


In [None]:
#alternate with libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics

# Parse the "?" placeholder as a missing value at load time. Without this the
# NA report below can never show a difference, because label-encoded columns
# contain no NaN.
df = pd.read_csv("mushroom.csv", na_values="?")
n_cols_initial = len(df.columns)

# Drop every column that has a missing value, as the hand-written pipeline does.
df = df.dropna(axis=1).copy()
print("Initially", n_cols_initial, "columns. After dropping NA:", len(df.columns), "columns")

# Integer-encode every column. The encoder is re-fitted per column, so only the
# mapping of the last column remains available on `labelencoder` afterwards.
labelencoder=LabelEncoder()
for column in df.columns:
    df[column] = labelencoder.fit_transform(df[column])

#splitting the dataset into train and test data

X = df.drop(["class"], axis=1)
y = df["class"]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.33)

print ('\n the total number of Training Data :',y_train.shape)
print ('\n the total number of Test Data :',y_test.shape)


# Training Naive Bayes (NB) classifier on training data.
# NOTE(review): GaussianNB treats the integer category codes as continuous,
# ordered values. sklearn's CategoricalNB looks like the closer match to the
# frequency-counting classifier earlier in this notebook - consider switching.
nb = GaussianNB()
nb.fit(X_train,y_train)
predicted = nb.predict(X_test)
#printing Confusion matrix, accuracy, Precision and Recall

print('\n Confusion matrix')
print(metrics.confusion_matrix(y_test,predicted))

print('\n Accuracy of the classifier is',metrics.accuracy_score(y_test,predicted))

# Precision/recall are called with their defaults, i.e. computed for the class
# encoded as 1 - presumably the second label in sorted order; verify against
# the data.
print('\n The value of Precision', metrics.precision_score(y_test,predicted))

print('\n The value of Recall', metrics.recall_score(y_test,predicted))