## Q1

In [369]:
import numpy as np
import pandas as pd
import math
from csv import reader

Let's go on to importing the data. To do so, we'll utilize the pandas library to manipulate our data. The code for importing and previewing the data is as follows:

In [370]:
df = pd.read_csv('./Mushroom_Train.csv')
df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


The code below confirms that all columns are categorical.

In [371]:
obj_df = df.select_dtypes(include=['object']).copy()
obj_df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


Before we go any further, let's make sure there aren't any null values. 

In [372]:
obj_df[obj_df.isnull().any(axis=1)]

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat


There are no null values in the dataframe, however column "stalk-root" has some(31%) missing values ("?"). As a result, we apply the most-common value method to assign a value to this.

In [373]:
obj_df["stalk-root"].value_counts()


b    3756
?    1991
e    1120
c     541
r     192
Name: stalk-root, dtype: int64

In [374]:
obj_df["stalk-root"].replace({"?": "b"}, inplace=True)
obj_df["stalk-root"].value_counts()

b    5747
e    1120
c     541
r     192
Name: stalk-root, dtype: int64

Now that there are no null values in the data, we can consider encoding options for categorical values.

We'll create a mapping dictionary with each column to process as well as a dictionary of values to translate.

remove "veil-type" column because it has 1 unique value.

In [375]:
obj_df.pop("veil-type")

0       p
1       p
2       p
3       p
4       p
       ..
7595    p
7596    p
7597    p
7598    p
7599    p
Name: veil-type, Length: 7600, dtype: object

In [376]:
colNum = obj_df.shape[1] #number of columns(features)
print(colNum)
colNames = list(obj_df.columns) #names of columns(features)

def encodeCol(col_index):   

    names = np.unique(obj_df[colNames[col_index]].values)
    keys = list(range(0,len(names)))
    encoded_names = dict(zip(names, keys))

    return encoded_names

# n = colNames[11]
# k = encodeCol(11)
# print(k)

encoded_all = {}
# encoded_all[n] = k
# print(encoded_all)


i=0
while i < colNum:
    encoded_all[colNames[i]] = encodeCol(i)
    i+=1

print(encoded_all)

22
{'class': {'e': 0, 'p': 1}, 'cap-shape': {'b': 0, 'c': 1, 'f': 2, 'k': 3, 's': 4, 'x': 5}, 'cap-surface': {'f': 0, 'g': 1, 's': 2, 'y': 3}, 'cap-color': {'b': 0, 'c': 1, 'e': 2, 'g': 3, 'n': 4, 'p': 5, 'r': 6, 'u': 7, 'w': 8, 'y': 9}, 'bruises': {'f': 0, 't': 1}, 'odor': {'a': 0, 'c': 1, 'f': 2, 'l': 3, 'm': 4, 'n': 5, 'p': 6, 's': 7, 'y': 8}, 'gill-attachment': {'a': 0, 'f': 1}, 'gill-spacing': {'c': 0, 'w': 1}, 'gill-size': {'b': 0, 'n': 1}, 'gill-color': {'b': 0, 'e': 1, 'g': 2, 'h': 3, 'k': 4, 'n': 5, 'o': 6, 'p': 7, 'r': 8, 'u': 9, 'w': 10, 'y': 11}, 'stalk-shape': {'e': 0, 't': 1}, 'stalk-root': {'b': 0, 'c': 1, 'e': 2, 'r': 3}, 'stalk-surface-above-ring': {'f': 0, 'k': 1, 's': 2, 'y': 3}, 'stalk-surface-below-ring': {'f': 0, 'k': 1, 's': 2, 'y': 3}, 'stalk-color-above-ring': {'b': 0, 'c': 1, 'e': 2, 'g': 3, 'n': 4, 'o': 5, 'p': 6, 'w': 7, 'y': 8}, 'stalk-color-below-ring': {'b': 0, 'c': 1, 'e': 2, 'g': 3, 'n': 4, 'o': 5, 'p': 6, 'w': 7, 'y': 8}, 'veil-color': {'n': 0, 'o': 1,

Using replace, convert the columns to numbers:

In [377]:
# from collections import defaultdict
# d = defaultdict(preprocessing.LabelEncoder)

df_train = obj_df.replace(encoded_all)

df_train
# df_train = df(map(lambda x: d[x.name].fit_transform(x)))

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-above-ring,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,...,2,2,7,7,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,...,2,2,7,7,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,...,2,2,7,7,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,...,2,2,7,7,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,...,2,2,7,7,2,1,0,3,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7595,1,3,2,2,0,2,1,0,1,0,...,1,2,7,7,2,1,0,7,4,2
7596,1,3,2,4,0,7,1,0,1,0,...,2,1,6,6,2,1,0,7,4,0
7597,0,5,2,4,1,5,1,0,0,10,...,2,2,7,7,2,2,4,7,4,4
7598,1,3,2,4,0,2,1,0,1,0,...,1,1,7,6,2,1,0,7,4,4


We can see that the first column represents the class, while the remaining columns represent the features.

After encoding the dataset, we use Naive Bayes to classify it.

In [378]:
X_train = df_train.drop('class', axis=1)
y_train = df_train['class']

In [379]:
df_test = pd.read_csv('./Mushroom_Test.csv')

df_test["stalk-root"].replace({"?": "b"}, inplace=True)

colNum = df_test.shape[1] #number of columns(features)
print(colNum)
colNames = list(df_test.columns) #names of columns(features)

def encodeCol(col_index):   
    names = np.unique(df_test[colNames[col_index]].values)
    keys = list(range(0,len(names)))
    encoded_names = dict(zip(names, keys))

    return encoded_names

# n = colNames[11]
# k = encodeCol(11)
# print(k)

encoded_all = {}
# encoded_all[n] = k
# print(encoded_all)


i = 0
while i < colNum:
    encoded_all[colNames[i]]=encodeCol(i)
    i += 1

# print(encoded_all)
df_test = df_test.replace(encoded_all)
df_test.head()

# df_test = df_test.apply(lambda x: d[x.name].transform(x))
df_test.pop("veil-type")
X_test = df_test.drop('class', axis=1)
y_test = df_test['class']

23


Building the classifier:

# Naive Bayes Rule
$ P(class|data) = \frac{P(data|class)P(class)}{P(data)}$

- $P(data|class)$ : Likelihood
- $P(class)$ : Prior Probability
- $P(class|data)$ : Posterior Probability

In [380]:
class GNaiveBayesClassifier:
    def priorProb(self, X, y):

        self.prior = (X.groupby(y).apply(lambda x: len(x)) / self.rows).to_numpy()

        return self.prior
    
    def statParamteres(self, X, y):

        self.mean = X.groupby(y).apply(np.mean).to_numpy()
        self.var = X.groupby(y).apply(np.var).to_numpy()
        return self.mean, self.var

    def densGauss(self, class_i, x):     
 
        mean = self.mean[class_i]
        var = self.var[class_i]
        pmf = np.exp((-1/2)*((x-mean)**2) / (2 * var)) / np.sqrt(2 * np.pi * var)

        return pmf

    def postProb(self, x):

        posteriors = []

        for i in range(self.count):

            prior = np.log(self.prior[i]) 
            conditional = np.sum(np.log(self.densGauss(i, x))) 
            posterior = prior + conditional
            posteriors.append(posterior)

        return self.classes[np.argmax(posteriors)]
        
    def train(self, X, y):
        self.classes = np.unique(y)
        self.count = len(self.classes)
        self.num_feature = X.shape[1]
        self.rows = X.shape[0]
        
        self.statParamteres(X, y)
        self.priorProb(X, y)
        
    def predict(self, X):
        preds = [self.postProb(f) for f in X.to_numpy()]
        return preds

    def accuracy(self, y_test, y_pred):
        accuracy = np.sum(y_test == y_pred) / len(y_test)
        return accuracy
    
    def confusionMatrix(self, predicted, classes): #confusion matrix 
        mat = np.zeros((2, 2), dtype=np.int32)

        predictedNP = np.array(predicted)
        classesNP = np.array(classes)

        for i in range(len(predictedNP)):
            if predictedNP[i] == 1: 
                if classesNP[i] == 1:#TP
                    mat[0][0] += 1
                elif classesNP[i] == 0:#FP
                    mat[0][1] += 1
            elif predictedNP[i] == 0:
                if classesNP[i] == 1:#TN
                    mat[1][0] += 1
                elif classesNP[i] == 0:#FN
                    mat[1][1] += 1
    
        print(mat)

In [381]:
model = GNaiveBayesClassifier()
model.train(X_train, y_train)


In [382]:
predicted = model.predict(X_test)
predicted

[0,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,


In [383]:
model.accuracy(y_test, predicted)


0.9561068702290076

In [384]:
model.confusionMatrix(predicted, y_test)

[[260   8]
 [ 15 241]]
