In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the mushrooms dataset, then keep only the rows whose 'stalk-root'
# value is known ('?' is treated as the missing-value marker here).
# Alternative that was considered: drop the whole column instead, i.e.
#   data.drop('stalk-root', axis=1, inplace=True)
data = pd.read_csv('mushrooms.csv')
data = data.loc[data['stalk-root'] != '?']

data

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7986,e,b,y,n,f,n,f,c,b,w,...,y,n,n,p,w,t,p,w,y,p
8001,e,x,y,n,f,n,f,c,b,w,...,y,n,n,p,w,t,p,w,y,p
8038,e,x,y,g,t,n,f,c,b,w,...,s,w,w,p,w,t,p,w,y,p
8095,p,x,y,c,f,m,f,c,b,y,...,y,c,c,p,w,n,n,w,c,d


In [3]:
# scikit-learn imports: train/test splitting, SVM classifier, evaluation metrics
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix

In [4]:
def greedy_forward(data=data, test_size=0.20, random_state=None, kernel='rbf'):
    """Greedy forward feature selection driven by an SVM's hold-out error.

    Starting from an empty feature set, each round tries adding every
    not-yet-selected column, fits an SVC on the training split and measures
    the misclassification rate on the hold-out split. The single column that
    lowers the error the most is kept. Selection stops as soon as no
    candidate strictly improves on the best error seen so far, or when every
    column has been selected.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a 'class' column with values 'p' / 'e'; every other
        column is treated as a categorical feature made of single-character
        strings. NOTE: the default is the module-level ``data`` as it was
        when this function was defined, not when it is called.
    test_size : float, default 0.20
        Fraction of rows held out for scoring candidate feature sets.
    random_state : int or None, default None
        Seed forwarded to ``train_test_split``. ``None`` keeps the original
        unseeded behaviour; pass an int for a reproducible selection.
    kernel : str, default 'rbf'
        Kernel forwarded to ``SVC``.

    Returns
    -------
    list of str
        Selected column names, in the order they were added.

    Notes
    -----
    The same hold-out split is used to choose the features, so an error
    measured afterwards on overlapping rows is optimistically biased.
    """
    labels = data['class'].map({'p': 0, 'e': 1})
    features = data.drop('class', axis=1)
    # Encode each single-letter category as its offset from 'a'. Column-wise
    # Series.map is used because DataFrame.applymap is deprecated in
    # pandas >= 2.1.
    features = features.apply(lambda col: col.map(lambda x: ord(x) - ord('a')))

    # split data into training and test
    train_features, test_features, train_labels, test_labels = train_test_split(
        features, labels, test_size=test_size, random_state=random_state)
    true_labels = test_labels.to_numpy()

    # Plain list so that removing a chosen attribute really shrinks the pool
    # (Index.drop returns a new Index and leaves the original untouched).
    remaining = list(features.columns)
    selected = []
    best_error = 1.0  # a candidate must do strictly better than this to be added

    while remaining:
        round_error, round_attr = best_error, None
        for attr in remaining:
            candidate = selected + [attr]
            svclassifier = SVC(kernel=kernel)
            svclassifier.fit(train_features[candidate], train_labels)
            label_pred = svclassifier.predict(test_features[candidate])

            # Misclassification rate. Equals the off-diagonal share of the
            # 2x2 confusion matrix, but does not fail when the hold-out split
            # happens to contain a single class.
            error = float(np.mean(label_pred != true_labels))

            if error < round_error:
                round_error, round_attr = error, attr

        if round_attr is None:
            # No remaining attribute improves the error: stop.
            break

        selected.append(round_attr)
        remaining.remove(round_attr)
        best_error = round_error

    return selected

In [5]:
# Run the greedy forward feature selection with the function's default arguments.
select = greedy_forward()

In [6]:
# Display the feature names returned by greedy_forward().
select

['odor',
 'spore-print-color',
 'stalk-surface-below-ring',
 'ring-number',
 'population']

In [7]:
# Evaluate the selected features with a degree-1 polynomial-kernel SVM.
labels = data['class'].map({'p':0, 'e': 1})
# Encode each single-letter category as its offset from 'a'. Column-wise
# Series.map is used because DataFrame.applymap is deprecated in pandas >= 2.1.
features = data[select].apply(lambda col: col.map(lambda x: ord(x) - ord('a')))

# training/testing split
# NOTE(review): the split is unseeded, so the figures printed below change
# from run to run; pass random_state=<int> for a reproducible evaluation.
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = 0.20)

svclassifier = SVC(kernel='poly', degree=1)
svclassifier.fit(train_features, train_labels)

label_pred = svclassifier.predict(test_features)

# Error rate = off-diagonal (misclassified) counts over all test samples.
c = confusion_matrix(test_labels, label_pred)
error = (c[0][1]+c[1][0])/np.sum(c)
print(error)
print(c)
print(classification_report(test_labels, label_pred))

0.11160318866253321
[[334  99]
 [ 27 669]]
              precision    recall  f1-score   support

           0       0.93      0.77      0.84       433
           1       0.87      0.96      0.91       696

    accuracy                           0.89      1129
   macro avg       0.90      0.87      0.88      1129
weighted avg       0.89      0.89      0.89      1129



In [8]:
# Fit an RBF-kernel SVM on the train/test variables already present in the
# kernel and report its performance on the test portion.
svclassifier = SVC(kernel='rbf').fit(train_features, train_labels)
label_pred = svclassifier.predict(test_features)

# Error rate = off-diagonal (misclassified) counts over all test samples.
c = confusion_matrix(test_labels, label_pred)
error = (c[0, 1] + c[1, 0]) / c.sum()
print(error, c, classification_report(test_labels, label_pred), sep='\n')

0.0
[[433   0]
 [  0 696]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       433
           1       1.00      1.00      1.00       696

    accuracy                           1.00      1129
   macro avg       1.00      1.00      1.00      1129
weighted avg       1.00      1.00      1.00      1129

