In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import LabelEncoder
from sklearn.cross_validation import train_test_split
from sklearn.metrics import classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier,GradientBoostingClassifier,RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier





In [2]:
mushroom_df = pd.read_csv('../datasets/mushrooms.csv')
print(mushroom_df.shape)
pd.options.display.max_columns = None

(8124, 23)


In [3]:
mushroom_df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,stalk-root,stalk-surface-above-ring,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,e,e,s,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,e,c,s,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,e,c,s,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,e,e,s,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,t,e,s,s,w,w,p,w,o,e,n,a,g


In [5]:
mushroom_df.isnull().sum()

class                       0
cap-shape                   0
cap-surface                 0
cap-color                   0
bruises                     0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
dtype: int64

In [6]:
print(mushroom_df.columns)

Index(['class', 'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
       'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
       'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
       'stalk-surface-below-ring', 'stalk-color-above-ring',
       'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
       'ring-type', 'spore-print-color', 'population', 'habitat'],
      dtype='object')


In [7]:
labelencoder = LabelEncoder()

for column in mushroom_df.columns:
    mushroom_df[column] = labelencoder.fit_transform(mushroom_df[column])

In [8]:
mushroom_df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,stalk-root,stalk-surface-above-ring,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,0,3,2,2,7,7,0,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,0,2,2,2,7,7,0,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,0,2,2,2,7,7,0,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,0,3,2,2,7,7,0,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,1,3,2,2,7,7,0,2,1,0,3,0,1


In [9]:
X = mushroom_df.drop('class',axis=1)
y = mushroom_df['class']

x_train,x_test,y_train,y_test = train_test_split(X,y,test_size=0.3)

print('X_train:',x_train.shape)
print('X_test:',x_test.shape)

print('y_train: ',y_train.shape)
print('y_test:',y_test.shape)

X_train: (5686, 22)
X_test: (2438, 22)
y_train:  (5686,)
y_test: (2438,)


In [10]:
classifiers = [LogisticRegression(),KNeighborsClassifier(n_neighbors=3),
               DecisionTreeClassifier(max_depth=5),SVC(kernel='linear',C=0.025),
              SVC(gamma=2,C=1),RandomForestClassifier(max_depth=5),
              BaggingClassifier(),GradientBoostingClassifier()]
classifier_names = ['Logistic Regression','KNearestNeighbours','DecisionTress',
                    'Linear SVM','RBF SVM','Random Forest',
                   'Bagging Classifier','Gradient Boosting Classifier']

In [11]:
classifier_compare_df = pd.DataFrame(columns=['Classifier','Mean Accuracy']) # empty dataframe for populating results
for classifier,name in zip(classifiers,classifier_names):
    model = classifier
    model.fit(x_train,y_train)
    mean_accuracy = model.score(x_test,y_test)
    classifier_compare_df.loc[len(classifier_compare_df)] = [name,mean_accuracy]
classifier_compare_df

Unnamed: 0,Classifier,Mean Accuracy
0,Logistic Regression,0.95283
1,KNearestNeighbours,0.998769
2,DecisionTress,0.979902
3,Linear SVM,0.946678
4,RBF SVM,0.965956
5,Random Forest,0.991386
6,Bagging Classifier,1.0
7,Gradient Boosting Classifier,1.0
