In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the input directory,
# then print the paths one per line.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the mushroom data set from the Kaggle input directory into a DataFrame.
df = pd.read_csv('/kaggle/input/mushroom-classification/mushrooms.csv')

In [None]:
# Preview the first rows of the raw data.
df.head()

In [None]:
# List the column names (features plus the 'class' target).
df.columns

In [None]:
# Summarise the frame: column dtypes and non-null counts.
df.info()

In [None]:
# Target variable: show the distinct labels found in the 'class' column.
df['class'].unique()

In [None]:
# Count missing values per column.
df.isnull().sum()

In [None]:
# Check the class balance of the target.
df['class'].value_counts()

In [None]:
# Cast every column to object dtype.
# NOTE(review): presumably done so the target encoder used further down treats
# every column as categorical — confirm; the cast may already be a no-op.
df = df.astype('object')

In [None]:
# Verify the dtypes after the cast above.
df.dtypes

In [None]:
# Separate the target column from the feature columns.
y = df['class']
X = df.drop(columns='class')

In [None]:
from sklearn.preprocessing import LabelEncoder
# Encode the target labels as integers.
# LabelEncoder numbers the classes in sorted order, so with the labels
# 'e' and 'p' the mapping is e -> 0, p -> 1; the binary metrics used later
# (precision / recall / f1) therefore treat class 1 as the positive class.
#
# The encoder always reads from df['class'] (not from `y`) so that re-running
# this cell never re-encodes already-encoded integers and `le.classes_` keeps
# the original label names.  The result is a 1-D Series aligned with df's
# index: scikit-learn estimators expect a 1-D target and warn on a
# one-column DataFrame.
le = LabelEncoder()
y = pd.Series(le.fit_transform(df['class']), index=df.index, name='class')

In [None]:
# Preview the encoded target.
y.head()

In [None]:
from category_encoders import target_encoder
from sklearn.model_selection import KFold

In [None]:
# Out-of-fold target encoding: every row is encoded by an encoder that was
# fitted on the other three folds, so a row's own label never contributes to
# its encoded features.
#
# NOTE(review): this runs on the full data set *before* the train/test split
# further down, so labels of rows that later land in the test split still
# contribute to the encodings of training rows. Consider splitting first and
# fitting the encoder on the training part only.
# NOTE(review): with handle_unknown='return_nan', a category that is absent
# from the three fitting folds would be encoded as NaN — verify that X_new
# contains no NaN before modelling.
kf = KFold(n_splits=4, shuffle=True, random_state=1111)
encoded_folds = []
for tra_idx, val_idx in kf.split(X):
    te = target_encoder.TargetEncoder(handle_missing='return_nan', handle_unknown='return_nan')
    te.fit(X.iloc[tra_idx], y.iloc[tra_idx])
    encoded_folds.append(te.transform(X.iloc[val_idx]))

# Concatenate once (instead of growing a frame inside the loop) and restore
# the original row order, which the shuffled folds scrambled.
train_te = pd.concat(encoded_folds, axis=0).sort_index()
X_new = train_te

In [None]:
### source: X_new
### target: y

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the encoded rows for the final evaluation (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X_new,
    y,
    test_size=0.30,
    random_state=42,
)

In [None]:
# Import required libraries for performance metrics
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_validate

In [None]:
# Scorers handed to cross_validate, keyed by the metric name that will appear
# in its result dictionary as 'test_<name>'.
_metric_functions = (
    ('accuracy', accuracy_score),
    ('precision', precision_score),
    ('recall', recall_score),
    ('f1_score', f1_score),
)
scoring = {metric_name: make_scorer(metric_fn) for metric_name, metric_fn in _metric_functions}

In [None]:
# Import required libraries for machine learning classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

In [None]:
# Instantiate the machine learning classifiers.
# The tree-based estimators are randomized; seed them (same seed as the
# train/test split) so that "Restart & Run All" reproduces the same scores,
# the same final model and the same feature importances.
log_model = LogisticRegression(max_iter=10000)
svc_model = LinearSVC(dual=False)
dtr_model = DecisionTreeClassifier(random_state=42)
rfc_model = RandomForestClassifier(random_state=42)
gnb_model = GaussianNB()

In [None]:
# Define the models evaluation function
def models_evaluation(X, y, folds, models=None):
    '''
    Cross-validate several classifiers and tabulate their mean test scores.

    X : data set features
    y : data set target (a Series, 1-D array or single-column DataFrame)
    folds : number of cross-validation folds
    models : optional mapping {display name: estimator}; when omitted, the
             five notebook-level classifiers (log_model, svc_model, dtr_model,
             rfc_model, gnb_model) are evaluated

    Returns a DataFrame with one row per metric (Accuracy, Precision, Recall,
    F1 Score), one column per model holding the mean score over the folds,
    and a final 'Best Score' column naming the best model for each metric.

    The metrics are taken from the notebook-level `scoring` dictionary.
    '''
    if models is None:
        models = {
            'Logistic Regression': log_model,
            'Support Vector Classifier': svc_model,
            'Decision Tree': dtr_model,
            'Random Forest': rfc_model,
            'Gaussian Naive Bayes': gnb_model,
        }

    # scikit-learn estimators expect a 1-D target; flattening here avoids a
    # column-vector warning on every fit when y is a one-column DataFrame.
    y = np.ravel(y)

    # Row label in the result table -> key in cross_validate's result dict
    metric_rows = {
        'Accuracy': 'test_accuracy',
        'Precision': 'test_precision',
        'Recall': 'test_recall',
        'F1 Score': 'test_f1_score',
    }

    # Perform cross-validation for each classifier and keep the fold means
    scores_by_model = {}
    for model_name, estimator in models.items():
        cv_results = cross_validate(estimator, X, y, cv=folds, scoring=scoring)
        scores_by_model[model_name] = [cv_results[key].mean() for key in metric_rows.values()]

    # Create a data frame with the models' performance metric scores
    models_scores_table = pd.DataFrame(scores_by_model, index=list(metric_rows))

    # Add 'Best Score' column (computed before the column exists, so it only
    # compares the numeric model columns)
    models_scores_table['Best Score'] = models_scores_table.idxmax(axis=1)

    return models_scores_table
  
# Compare all classifiers on the training split using 5 cross-validation folds
models_evaluation(X_train, y_train, folds=5)

In [None]:
### Random Forest Classification is the final model
# Fit on the training split (target flattened to 1-D), then predict the
# hold-out split; fit() returns the estimator, so the calls can be chained.
y_pred = (
    rfc_model
    .fit(X_train, np.ravel(y_train))
    .predict(X_test)
)

In [None]:
from sklearn.metrics import confusion_matrix
# Class names in encoder order. LabelEncoder numbers classes in sorted order
# ('e' -> 0, 'p' -> 1), and the matrix rows/columns below follow the encoded
# values, so the tick labels must follow the same order.
labels = list(le.classes_)
codes = list(range(len(labels)))
# Pin the row/column order explicitly to the encoded class values.
cm = confusion_matrix(y_test, y_pred, labels=codes)
fig, ax = plt.subplots()
cax = ax.matshow(cm)
ax.set_title('Confusion matrix of the classifier')
fig.colorbar(cax)
# Place one tick per class and label it, rather than relying on the
# automatic tick positions.
ax.set_xticks(codes)
ax.set_xticklabels(labels)
ax.set_yticks(codes)
ax.set_yticklabels(labels)
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
plt.show()

### Feature Importance

In [None]:
## Feature importances
from matplotlib import pyplot
importance = rfc_model.feature_importances_
# summarize feature importance: label each value with the column of the
# matrix the forest was fitted on (X_train), then plot the ten largest
feat_importances = pd.Series(importance, index=X_train.columns)
ax = feat_importances.nlargest(10).plot(kind='barh')
ax.set_title('Top 10 random forest feature importances')
ax.set_xlabel('Importance')
ax.set_ylabel('Feature')
pyplot.show()

### Permutation Importance

In [None]:
import eli5
from eli5.sklearn import PermutationImportance
# Permutation importance of rfc_model on the training split, scored by accuracy
perm_train = PermutationImportance(
    rfc_model,
    scoring='accuracy',
    n_iter=100,
    random_state=1,
)
# fit and see the permutation importances
perm_train.fit(X_train, y_train)
train_feature_names = X_train.columns.tolist()
# top 5 important features
eli5.explain_weights_df(perm_train, feature_names=train_feature_names).head()

In [None]:
# Permutation importance of rfc_model on the hold-out split
perm = PermutationImportance(rfc_model, random_state=0)
perm.fit(X_test, y_test)
test_feature_names = X_test.columns.tolist()
eli5.show_weights(perm, feature_names=test_feature_names)
#The most important feature is odor

Comparing the classifiers mentioned above, the classification metrics identified **Random Forest** as the best-performing classifier.
The feature importance check showed odor to be the strongest determinant of a mushroom's class.