In [12]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

## Load the Dataset

In [8]:
df = pd.read_csv('./Dataset/mushrooms.csv')
df.head() # shows 1st 5 rows

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [10]:
df.head(n=10) # then it will show 1st 10 rows

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g
5,e,x,y,y,t,a,f,c,b,n,...,s,w,w,p,w,o,p,k,n,g
6,e,b,s,w,t,a,f,c,b,g,...,s,w,w,p,w,o,p,k,n,m
7,e,b,y,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,s,m
8,p,x,y,w,t,p,f,c,n,p,...,s,w,w,p,w,o,p,k,v,g
9,e,b,s,y,t,a,f,c,b,g,...,s,w,w,p,w,o,p,k,s,m


In [11]:
print(df.shape)

(8124, 23)


## Encode the categorical values into Numerical values

In [19]:
# like converting different class names into numerical value
le = LabelEncoder()
# This operation would be performed on each of the columns.
ds = df.apply(le.fit_transform)
print(type(ds)) # new dataframe

<class 'pandas.core.frame.DataFrame'>


In [21]:
ds.head(n=10) # converted into numerical values

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,...,2,7,7,0,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,0,1
5,0,5,3,9,1,0,1,0,0,5,...,2,7,7,0,2,1,4,2,2,1
6,0,0,2,8,1,0,1,0,0,2,...,2,7,7,0,2,1,4,2,2,3
7,0,0,3,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,3,3
8,1,5,3,8,1,6,1,0,1,7,...,2,7,7,0,2,1,4,2,4,1
9,0,0,2,9,1,0,1,0,0,2,...,2,7,7,0,2,1,4,2,3,3


In [25]:
data = ds.values # converting into numpy array
print(type(data))
print(data.shape)

<class 'numpy.ndarray'>
(8124, 23)


In [27]:
print(data[:5,:]) # 1st five rows and all columns

[[1 5 2 4 1 6 1 0 1 4 0 3 2 2 7 7 0 2 1 4 2 3 5]
 [0 5 2 9 1 0 1 0 0 4 0 2 2 2 7 7 0 2 1 4 3 2 1]
 [0 0 2 8 1 3 1 0 0 5 0 2 2 2 7 7 0 2 1 4 3 2 3]
 [1 5 3 8 1 6 1 0 1 5 0 3 2 2 7 7 0 2 1 4 2 3 5]
 [0 5 2 3 0 5 1 1 0 4 1 3 2 2 7 7 0 2 1 0 3 0 1]]


In [31]:
# carefully observing the dataset, we see that 1st column denotes the class, and the remaining column
# denotes the features.
data_x = data[:,1:] #all the rows and all columns starting from column no. 1
data_y = data[:,0]

## Break the Data into train and test

In [37]:
x_train,x_test,y_train,y_test = train_test_split(data_x,data_y,test_size=0.2)

In [40]:
print(x_train.shape,y_train.shape) #x_train includes all the features for all the 6499 training examples
# y_train includes the labels for the 6499 training examples after train_test_split

(6499, 22) (6499,)


In [41]:
print(x_test.shape,y_test.shape)

(1625, 22) (1625,)


In [43]:
# determining the no. of classes of mushrooms
np.unique(y_train)

array([0, 1])

## Building the classifier

In [46]:
a=np.array([1,1,1,0,0,0,0,0,1])
a==1

array([ True,  True,  True, False, False, False, False, False,  True])

In [48]:
print(sum(a==1)) # This works faster than a for loop

4


In [60]:
def prior_prob(y_train, label):
    total_examples=y_train.shape[0] # Along the zero axis
    class_examples=np.sum(y_train==label)
    return (class_examples)/float(total_examples)

In [54]:
#y=np.array([1,1,1,0,0,0,0,0,1,1])

#prior_prob(y,1) # out of 10 examples half belong to class 1.
# This is the use of prior_prob

0.5

In [55]:
# eg.
# P(mushroom_color=green/class=2)
# Probability of mushroom color is green in class equal to label.
def cond_prob(x_train,y_train,feature_col,feature_val,label):
    x_filtered = x_train[y_train==label]
    # only those rows from x_train where y_train = label value
    numerator = np.sum(x_filtered[:,feature_col]==feature_val)
    deno = np.sum(y_train==label)
    return numerator/float(deno)
    

## Determining Posterior probability for each test example and make predictions

In [59]:
np.unique(y_train) # It gives the classes

array([0, 1])

In [66]:
def predict(x_train,y_train,xtest):
    """Xtest is a single testing point having n no. of features"""
    """Before predicting the class, we will calc the posterior prob of each class"""
    """This means given the mushroom, what is the prob that it belongs to class i"""
    """Then we will take max of all the posterior prob and return the index for which the posterior prob is greatest"""
    """That index would denote the class"""
    
    # Determine the number of classes
    classes = np.unique(y_train)
    
    #Compute Posterior for each class
    post_probs = [] # List of probs for all classes given a single testing point
    n_features = x_train.shape[1]
    
    for label in classes:
        
        #Post_c = likelihood*prior
        
        likelihood = 1.0
        for f in range(n_features):
            cond = cond_prob(x_train,y_train,f,xtest[f],label)
            likelihood =* cond
            
        prior = prior_prob(y_train,label)
        post = likelihood*prior
        
        post_probs.append(post)
        
    pred = np.arg_max(post_probs)
    return pred



SyntaxError: can't use starred expression here (<ipython-input-66-2e2c912f1c0a>, line 25)