The purpose of this notebook is to explore several different options for model and feature selection in order to classify mushrooms as either edible or poisonous. 

Metadata: 

Attribute Information: (classes: edible=e, poisonous=p)

cap-shape: bell=b,conical=c,convex=x,flat=f, knobbed=k,sunken=s

cap-surface: fibrous=f,grooves=g,scaly=y,smooth=s

cap-color: brown=n,buff=b,cinnamon=c,gray=g,green=r,pink=p,purple=u,red=e,white=w,yellow=y

bruises: bruises=t,no=f

odor: almond=a,anise=l,creosote=c,fishy=y,foul=f,musty=m,none=n,pungent=p,spicy=s

gill-attachment: attached=a,descending=d,free=f,notched=n

gill-spacing: close=c,crowded=w,distant=d

gill-size: broad=b,narrow=n

gill-color: black=k,brown=n,buff=b,chocolate=h,gray=g, green=r,orange=o,pink=p,purple=u,red=e,white=w,yellow=y

stalk-shape: enlarging=e,tapering=t

stalk-root: bulbous=b,club=c,cup=u,equal=e,rhizomorphs=z,rooted=r,missing=?

stalk-surface-above-ring: fibrous=f,scaly=y,silky=k,smooth=s

stalk-surface-below-ring: fibrous=f,scaly=y,silky=k,smooth=s

stalk-color-above-ring: brown=n,buff=b,cinnamon=c,gray=g,orange=o,pink=p,red=e,white=w,yellow=y

stalk-color-below-ring: brown=n,buff=b,cinnamon=c,gray=g,orange=o,pink=p,red=e,white=w,yellow=y

veil-type: partial=p,universal=u

veil-color: brown=n,orange=o,white=w,yellow=y

ring-number: none=n,one=o,two=t

ring-type: cobwebby=c,evanescent=e,flaring=f,large=l,none=n,pendant=p,sheathing=s,zone=z

spore-print-color: black=k,brown=n,buff=b,chocolate=h,green=r,orange=o,purple=u,white=w,yellow=y

population: abundant=a,clustered=c,numerous=n,scattered=s,several=v,solitary=y

habitat: grasses=g,leaves=l,meadows=m,paths=p,urban=u,waste=w,woods=d


In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression, Perceptron, RidgeClassifier, SGDClassifier
from sklearn.metrics import accuracy_score,confusion_matrix
import eli5
from eli5.sklearn import PermutationImportance
from sklearn.naive_bayes import GaussianNB
import xgboost
from xgboost import XGBClassifier, plot_importance, plot_tree
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.svm import SVC
from sklearn.feature_selection import SelectKBest, chi2, RFE

First we need to load the data into a DataFrame and check the head to make sure that our data is loaded correctly. 

In [None]:
# Path of the mushroom CSV inside the Kaggle input directory.
address = '/kaggle/input/mushroom-classification/mushrooms.csv'
mushroom_data = pd.read_csv(filepath_or_buffer=address)
# Preview the first rows to confirm the file parsed as expected.
mushroom_data.head(n=5)

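We should also check for NaN or missing values in our DataFrame. Note that the metadata above lists `?` as the missing-value code for stalk-root, and `isna()` does not count that placeholder.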

In [None]:
# Count missing entries per column.
# The attribute metadata encodes a missing stalk-root as the literal string '?',
# which pandas does not treat as NaN, so isna() alone would report the data as
# complete. Both counts are shown side by side.
missing_report = pd.DataFrame({
    'NaN': mushroom_data.isna().sum(),
    "'?' placeholder": (mushroom_data == '?').sum(),
})
missing_report

Next we need to select our target column, as well as our features. 
We use the "get_dummies" approach to encode the strings as binary columns, dropping the first of any encoding to ensure that we avoid overparameterisation with redundant information.  
Finally we use sklearn's "train_test_split" to split our data into a training batch (roughly 2/3) and a testing batch (roughly 1/3).

In [None]:
# Separate the label column ('class') from the predictor columns.
X = mushroom_data.drop('class', axis=1)
y = mushroom_data['class'].values

# One-hot encode the categorical strings. drop_first removes one level per
# attribute so the indicator columns are not redundant.
X = pd.get_dummies(X, drop_first=True)
# y becomes a one-column indicator frame, which is why later cells flatten it
# with np.ravel before fitting.
y = pd.get_dummies(y, drop_first=True)

# Hold out 33% of the rows for testing. The seed is fixed so the split, and
# every metric derived from it, is reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, shuffle=True, random_state=42
)

Let's check our training DataFrame.

In [None]:
# Show the first five encoded training rows as a sanity check.
X_train.head(n=5)

There are a number of different models that can be used to solve classification problems with supervised learning, and we can compare a variety of these to see which is the most effective with this dataset. 

In [None]:
# Baseline: logistic regression on the full one-hot feature set.
model = LogisticRegression()
# Flatten the one-column label frame to 1-D, as the other model cells do.
model.fit(X_train, np.ravel(y_train))
preds = model.predict(X_test)

# scikit-learn metrics take (y_true, y_pred). With that order the confusion
# matrix rows are the actual classes and the columns the predicted classes.
y_true = np.ravel(y_test)
print(accuracy_score(y_true, preds))
print(confusion_matrix(y_true, preds))

We can test the permutation importance of the features used in the prediction.

In [None]:
# Fit eli5's PermutationImportance wrapper around the already-fitted logistic
# regression on the test split, then render the weights with column names.
perm = PermutationImportance(model, random_state=1)
perm.fit(X_test, y_test)
eli5.show_weights(perm, feature_names=list(X_test.columns))

In [None]:
# Gradient-boosted trees with early stopping.
# Early stopping needs a monitoring set. Using the test split for that would
# leak test information into the model, so a validation slice is carved out of
# the training data instead.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, shuffle=True, random_state=0
)

# early_stopping_rounds is an estimator parameter in current XGBoost releases;
# passing it to fit() is no longer supported.
xgb_model = XGBClassifier(early_stopping_rounds=5)
xgb_model.fit(X_fit, y_fit,
              eval_set=[(X_val, y_val)],
              verbose=True)

xgb_preds = xgb_model.predict(X_test)
# confusion_matrix expects (y_true, y_pred): rows = actual, columns = predicted.
confusion_matrix(np.ravel(y_test), xgb_preds)

In [None]:
# Permutation importance of the fitted XGBoost model, evaluated on the test split.
perm = PermutationImportance(xgb_model, random_state=1)
perm.fit(X_test, y_test)
eli5.show_weights(perm, feature_names=list(X_test.columns))

It is also possible to make use of the inbuilt plotting functions of XGBoost to plot the feature importances and the tree structure.

In [None]:
# Feature-importance chart from XGBoost's own plotting helper ('weight' importance).
plot_importance(xgb_model, importance_type='weight')

# Draw the tree on a large canvas so the node labels stay legible.
# xgboost.plot_tree is referenced through the module because the bare name
# plot_tree is rebound to sklearn's version by the later import.
tree_fig, tree_ax = plt.subplots(figsize=(30, 30))
xgboost.plot_tree(xgb_model, ax=tree_ax)
plt.show()

In [None]:
# Gaussian naive Bayes on the one-hot features.
nb_model = GaussianNB()
nb_model.fit(X_train, np.ravel(y_train))
preds = nb_model.predict(X_test)
# confusion_matrix expects (y_true, y_pred): rows = actual, columns = predicted.
confusion_matrix(np.ravel(y_test), preds)

In [None]:
# Permutation importance of the fitted naive Bayes model on the test split.
perm = PermutationImportance(nb_model, random_state=1)
perm.fit(X_test, y_test)
eli5.show_weights(perm, feature_names=list(X_test.columns))

In [None]:
# Single decision tree on the full one-hot feature set.
tree_model = DecisionTreeClassifier()
# Flatten the one-column label frame to 1-D, as the other model cells do.
tree_model.fit(X_train, np.ravel(y_train))
preds = tree_model.predict(X_test)
# confusion_matrix expects (y_true, y_pred): rows = actual, columns = predicted.
confusion_matrix(np.ravel(y_test), preds)

In [None]:
# Permutation importance of the fitted decision tree on the test split.
perm = PermutationImportance(tree_model, random_state=1)
perm.fit(X_test, y_test)
eli5.show_weights(perm, feature_names=list(X_test.columns))

In [None]:
# Render the fitted decision tree with sklearn's plot_tree (the bare name
# refers to sklearn's function, which was imported after xgboost's).
plt.figure(figsize=(5, 5))
tree_feature_names = list(X_test.columns)
plot_tree(tree_model, feature_names=tree_feature_names)
plt.show()

In [None]:
# Support-vector classifier with default settings.
svm_model = SVC()
svm_model.fit(X_train, np.ravel(y_train))
preds = svm_model.predict(X_test)
# confusion_matrix expects (y_true, y_pred): rows = actual, columns = predicted.
confusion_matrix(np.ravel(y_test), preds)

In [None]:
# NOTE(review): permutation importance for the SVC is deliberately left
# disabled. The reason is not recorded in the notebook (presumably run time);
# confirm before re-enabling, or delete these lines.
#perm = PermutationImportance(svm_model, random_state=1).fit(X_test, y_test)
#eli5.show_weights(perm, feature_names = X_test.columns.tolist())

In [None]:
# Perceptron with default settings.
per_model = Perceptron()
per_model.fit(X_train, np.ravel(y_train))
preds = per_model.predict(X_test)
# confusion_matrix expects (y_true, y_pred): rows = actual, columns = predicted.
confusion_matrix(np.ravel(y_test), preds)

In [None]:
# Permutation importance of the fitted perceptron on the test split.
perm = PermutationImportance(per_model, random_state=1)
perm.fit(X_test, y_test)
eli5.show_weights(perm, feature_names=list(X_test.columns))

In [None]:
# Ridge classifier with default settings.
ridge_model = RidgeClassifier()
ridge_model.fit(X_train, np.ravel(y_train))
preds = ridge_model.predict(X_test)
# confusion_matrix expects (y_true, y_pred): rows = actual, columns = predicted.
confusion_matrix(np.ravel(y_test), preds)

In [None]:
# Linear classifier trained with stochastic gradient descent (default settings).
sgd_model = SGDClassifier()
sgd_model.fit(X_train, np.ravel(y_train))
preds = sgd_model.predict(X_test)
# confusion_matrix expects (y_true, y_pred): rows = actual, columns = predicted.
confusion_matrix(np.ravel(y_test), preds)

In [None]:
# 10-fold cross-validation of the SGD classifier.
# cross_val_score re-fits a fresh copy of the estimator on each fold, so it is
# run on the training split. Running it on X_test would train on the hold-out
# data and leave nothing untouched for the final evaluation.
cvs = cross_val_score(sgd_model, X_train, np.ravel(y_train), cv=10)
# Report the mean accuracy with a two-standard-deviation band across folds.
print(f"Accuracy: {cvs.mean():.2f} (+/- {cvs.std() * 2:.2f})")

If we want to select a subset of the features in order to be able to make predictions with fewer inputs, we could try PCA, or we can look at methods of feature selection within scikit-learn, which can either be performed based on a particular model, or in a model-agnostic way.

In [None]:
from sklearn.decomposition import PCA

# Reduce the one-hot training features to three principal components and
# display the projected coordinates.
pca = PCA(n_components=3)
pca_fitted = pca.fit_transform(X=X_train)
pca_fitted

In [None]:
# First principal component against the second, coloured by class label.
pc1, pc2 = pca_fitted[:, 0], pca_fitted[:, 1]
plt.scatter(pc1, pc2, c=np.ravel(y_train))

In [None]:
# Recursive feature elimination: for each estimator, keep the ten features it
# ranks highest, removing one feature per step.
tree_model = DecisionTreeClassifier()
sgd_model = SGDClassifier()
per_model = Perceptron()

rfe_candidates = [
    ('Tree model columns: ', tree_model),
    ('SGD model columns: ', sgd_model),
    ('Perceptron model columns: ', per_model),
]

for label, estimator in rfe_candidates:
    selector = RFE(estimator, n_features_to_select=10, step=1)
    selector = selector.fit(X_train, np.ravel(y_train))
    # support_ is already a boolean mask over the columns, so it can index
    # them directly.
    print(label, X_train.columns[selector.support_])

# After the loop `selector` holds the Perceptron-based RFE. The next cell uses
# it to build the reduced feature frames, so the candidate order matters.


In [None]:
# Keep only the columns retained by the most recently fitted RFE `selector`
# (the Perceptron-based one, as the notebook is written).
# support_ is a boolean mask aligned with X_train's columns. Selecting with it
# directly is equivalent to dropping the unselected columns by name.
selected_mask = selector.support_
reduced_X_train = X_train.loc[:, selected_mask]
reduced_X_test = X_test.loc[:, selected_mask]
reduced_X_train.head()

In [None]:
# SGD classifier restricted to the RFE-selected features.
sgd_model = SGDClassifier()
sgd_model.fit(reduced_X_train, np.ravel(y_train))
preds = sgd_model.predict(reduced_X_test)
# confusion_matrix expects (y_true, y_pred): rows = actual, columns = predicted.
print(confusion_matrix(np.ravel(y_test), preds))

# Cross-validate on the training split so the test split stays held out.
# The feature subset was chosen using this same training data, so the CV score
# is somewhat optimistic. Wrap RFE and the model in a Pipeline for an unbiased
# estimate.
cvs = cross_val_score(sgd_model, reduced_X_train, np.ravel(y_train), cv=10)
print(f"Accuracy: {cvs.mean():.2f} (+/- {cvs.std() * 2:.2f})")

In [None]:
# Decision tree restricted to the RFE-selected features.
tree_model = DecisionTreeClassifier()
tree_model.fit(reduced_X_train, np.ravel(y_train))
preds = tree_model.predict(reduced_X_test)
# confusion_matrix expects (y_true, y_pred): rows = actual, columns = predicted.
print(confusion_matrix(np.ravel(y_test), preds))

# Cross-validate on the training split so the test split stays held out.
# The feature subset was chosen using this same training data, so the CV score
# is somewhat optimistic. Wrap RFE and the model in a Pipeline for an unbiased
# estimate.
cvs = cross_val_score(tree_model, reduced_X_train, np.ravel(y_train), cv=10)
print(f"Accuracy: {cvs.mean():.2f} (+/- {cvs.std() * 2:.2f})")

In [None]:
# PCA of the reduced (RFE-selected) training features.
red_pca = PCA(n_components=3)
red_pca_fitted = red_pca.fit_transform(reduced_X_train)
point_labels = np.ravel(y_train)

# Plot PC1 vs PC2 and PC1 vs PC3 on separate axes. Drawing both on one set of
# axes overlays two different y-quantities with identical colouring, which
# makes the projections impossible to tell apart.
pca_fig, (ax_pc2, ax_pc3) = plt.subplots(1, 2, figsize=(12, 5))
ax_pc2.scatter(red_pca_fitted[:, 0], red_pca_fitted[:, 1], c=point_labels)
ax_pc2.set_xlabel('PC1')
ax_pc2.set_ylabel('PC2')
ax_pc3.scatter(red_pca_fitted[:, 0], red_pca_fitted[:, 2], c=point_labels)
ax_pc3.set_xlabel('PC1')
ax_pc3.set_ylabel('PC3')
plt.show()

Here we use the 'SelectKBest' function in order to try a pre-fitting approach to feature selection, by choosing the five best features based on a chi^2 statistical test. 

In [None]:
# Rebuild the full one-hot feature matrix and the label indicator.
X = mushroom_data.drop('class', axis=1)
y = mushroom_data['class'].values
X = pd.get_dummies(X, drop_first=True)
y = pd.get_dummies(y, drop_first=True)

# Split BEFORE selecting features. Fitting SelectKBest on the full data would
# let the test labels influence which features are kept (data leakage). The
# seed is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, shuffle=True, random_state=42
)

# Choose the five best features by chi-squared score using training rows only,
# then apply that same column selection everywhere.
kbest = SelectKBest(chi2, k=5).fit(X_train, np.ravel(y_train))
X_new = kbest.transform(X)  # kept for compatibility: full data, selected columns
X_train = kbest.transform(X_train)
X_test = kbest.transform(X_test)

In [None]:
# Decision tree on the five chi-squared-selected features.
tree_model = DecisionTreeClassifier()
tree_model.fit(X_train, np.ravel(y_train))
preds = tree_model.predict(X_test)
# confusion_matrix expects (y_true, y_pred): rows = actual, columns = predicted.
print(confusion_matrix(np.ravel(y_test), preds))

# cross_val_score re-fits the estimator on each fold, so it is run on the
# training split and the test split stays held out.
cvs = cross_val_score(tree_model, X_train, np.ravel(y_train), cv=10)
print(f"Accuracy: {cvs.mean():.2f} (+/- {cvs.std() * 2:.2f})")