## Naive Bayes-Mushroom Dataset
The goal is to predict the class of a mushroom given its features. We will use a Naive Bayes model for this classification.

### Load the Dataset

In [1]:
import pandas as pd
import numpy as np

In [10]:
data=pd.read_csv('Mushrooms/mushrooms.csv')

In [11]:
data.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [4]:
# Inspect the raw frame loaded above; `dataset` is only created later by the
# label-encoding step, so referencing it here fails on a fresh kernel.
data.shape

(8124, 23)

## Encoding the data => Categorical to Numeric

In [5]:
from sklearn.preprocessing import LabelEncoder

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
le=LabelEncoder()

In [8]:
dataset=data.apply(le.fit_transform)

In [12]:
dataset.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,...,2,7,7,0,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,0,1


In [13]:
type(dataset)

pandas.core.frame.DataFrame

In [14]:
# np.asarray yields the same array as `.values` for a DataFrame and is a no-op
# when `dataset` is already an ndarray, so this cell can be re-run safely.
dataset=np.asarray(dataset)

In [15]:
type(dataset)

numpy.ndarray

In [17]:
print(dataset[:5,:])

[[1 5 2 4 1 6 1 0 1 4 0 3 2 2 7 7 0 2 1 4 2 3 5]
 [0 5 2 9 1 0 1 0 0 4 0 2 2 2 7 7 0 2 1 4 3 2 1]
 [0 0 2 8 1 3 1 0 0 5 0 2 2 2 7 7 0 2 1 4 3 2 3]
 [1 5 3 8 1 6 1 0 1 5 0 3 2 2 7 7 0 2 1 4 2 3 5]
 [0 5 2 3 0 5 1 1 0 4 1 3 2 2 7 7 0 2 1 0 3 0 1]]


In [18]:
y=dataset[:,0]

In [20]:
X=dataset[:,1:]

In [21]:
X.shape

(8124, 22)

### Splitting into train and test sets

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [24]:
X_train.shape

(6499, 22)

In [25]:
np.unique(y_train)

array([0, 1])

### Building Our Classifier

In [27]:
def prior_prob(y_train,label):
    """Return the prior probability of `label`: the fraction of entries in
    `y_train` that are equal to it.

    y_train -- array of training labels (its first dimension is the example count)
    label   -- the class value whose relative frequency is wanted
    """
    n_total=y_train.shape[0]
    n_matching=(y_train==label).sum()
    return n_matching/float(n_total)

In [39]:
def cond_prob(X_train,y_train,feature_col,feature_val,label,alpha=0.0):
    """Estimate P(feature == feature_val | class == label) from training data.

    X_train     -- 2-D array of training features (rows are examples)
    y_train     -- 1-D array of training labels aligned with the rows of X_train
    feature_col -- column index of the feature being examined
    feature_val -- the value of that feature to compute the probability for
    label       -- the class to condition on
    alpha       -- optional Laplace/Lidstone smoothing strength. With the default
                   of 0.0 the plain relative frequency is returned (a value never
                   seen together with `label` gets probability 0). With alpha > 0
                   the estimate is (count + alpha) / (class_count + alpha * K),
                   where K is the number of distinct values the feature takes in
                   X_train, so unseen combinations get a small non-zero probability.

    Returns the (optionally smoothed) conditional probability as a float.
    """
    # Values of the requested feature for the examples belonging to `label`.
    class_column=X_train[y_train==label][:,feature_col]
    match_count=np.sum(class_column==feature_val)
    # The filtered column already has one entry per example of the class,
    # so its length is the class count; no second pass over y_train is needed.
    class_count=class_column.shape[0]

    if alpha:
        n_values=np.unique(X_train[:,feature_col]).size
        return (match_count+alpha)/float(class_count+alpha*n_values)

    return match_count/float(class_count)

### Posterior Probability

In [40]:
def predict(X_train,y_train,X_test):
    """Predict the class label of a single test point with Naive Bayes.

    For every class present in `y_train` the unnormalised log-posterior
        log P(class) + sum over features of log P(x_f | class)
    is computed, and the label with the highest score is returned.

    X_train -- 2-D array of training features (rows are examples)
    y_train -- 1-D array of training labels aligned with the rows of X_train
    X_test  -- a single testing point: one value per feature column of X_train

    Returns the predicted label itself (an element of np.unique(y_train)),
    not its position in the list of classes.
    """
    classes=np.unique(y_train)
    n_features=X_train.shape[1]
    log_posts=[] # log-posterior score of every class for the given testing point

    for label in classes:
        # Work in log space: a product of many small probabilities underflows
        # to 0.0 for every class, which would make the argmax meaningless.
        log_post=np.log(prior_prob(y_train,label))

        # A conditional probability of exactly 0 legitimately maps to -inf
        # (the class is ruled out), so silence the divide-by-zero warning.
        with np.errstate(divide='ignore'):
            for f in range(n_features):
                cond=cond_prob(X_train,y_train,f,X_test[f],label)
                log_post += np.log(cond)

        log_posts.append(log_post)

    # argmax gives a position in `classes`; map it back to the actual label so
    # the result is correct even when labels are not 0..k-1.
    pred=classes[np.argmax(log_posts)]
    return pred

In [41]:
output=predict(X_train,y_train,X_test[1])
print(output)

1


In [38]:
print(y_test[1])

1
