In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # data visualization
import seaborn as sns # data visualization

# to ignore warnings
import warnings
warnings.filterwarnings('ignore')

# train-test split
from sklearn.model_selection import train_test_split

# libraries for model building
import sklearn
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from xgboost import XGBClassifier

# render every column of a dataframe (None removes the column limit)
pd.set_option('display.max_columns', None)

# render every row of a dataframe (None removes the row limit)
pd.set_option('display.max_rows', None)


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# walk the input directory tree and print the full path of every file found
for folder, _, folder_files in os.walk('/kaggle/input'):
    for file_name in folder_files:
        file_path = os.path.join(folder, file_name)
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# default size (width, height in inches) for every figure created below
plt.rc('figure', figsize=(10, 5))

### Read the Data

In [None]:
# location of the mushroom dataset inside the Kaggle input directory
mushroom_csv_path = '/kaggle/input/mushroom-classification/mushrooms.csv'

# load the dataset into a dataframe
df_mushroom = pd.read_csv(mushroom_csv_path)

# preview the first five observations
df_mushroom.head(5)

Here `class` is the target variable to be predicted: it records whether each mushroom is edible or poisonous.

## Exploratory Data Analysis

### Understand the Dataset

In [None]:
# check the dimension of the data
# 'shape' returns a tuple: (number of observations, number of variables)
df_mushroom.shape

In [None]:
# check the data type of each variable
# 'dtypes' lists one entry per column of the dataframe
df_mushroom.dtypes

The dataset information shows that the variable `stalk-root` contains a non-standard missing value marker ('?'). Check the count of each level in this variable and treat the missing data before starting the analysis.

In [None]:
# count the observations in each level of 'stalk-root'
# the '?' level marks the missing values that are treated in the next step
df_mushroom['stalk-root'].value_counts()

There are 2480 missing values in the variable `stalk-root`.  

We cannot remove the observations with missing data, as that would reduce the size of the dataset significantly. Instead, we replace '?' with the mode of the variable, i.e. 'b'.

In [None]:
# impute the missing-value marker '?' in 'stalk-root' with 'b'
# every other level is left untouched
stalk_root_imputed = df_mushroom['stalk-root'].replace({'?': 'b'})
df_mushroom['stalk-root'] = stalk_root_imputed

In [None]:
# check the count of each level in 'stalk-root'
# after the replacement the '?' level should no longer be listed
df_mushroom['stalk-root'].value_counts()

We can see that the '?' is replaced by 'b'.

### Statistical Summary

In [None]:
# statistical summary of the dataset
# NOTE(review): assumes every column is categorical, so the summary lists
# count / unique / top / freq per variable - confirm against the dtypes output
df_mushroom.describe()

The above output shows that the majority of the mushrooms in the dataset are edible. The variable `veil-type` contains only a single value throughout the dataset; thus this variable is redundant for the analysis. We remove `veil-type` before further analysis.
Also, there are no missing values in the data.

In [None]:
# remove the constant column 'veil-type' from the dataframe
df_mushroom = df_mushroom.drop(columns = ['veil-type'])

### Distribution of Variables

In [None]:
# plot the count plot for each categorical variable
# 'figsize' sets the figure size
# pass the required number of rows and columns to plot the grid of subplots
fig, ax = plt.subplots(4, 6, figsize=(28, 20))
axes = ax.flatten()

# name the plotted column through the 'x' keyword together with 'data':
# passing the Series positionally is deprecated, and newer seaborn releases
# treat the first positional argument as 'data' rather than as the variable
for variable, subplot in zip(df_mushroom.columns, axes):
    sns.countplot(data=df_mushroom, x=variable, ax=subplot)

# hide the grid cells that are left over when there are fewer variables
# than subplots, so that no empty axes are drawn
for unused_subplot in axes[len(df_mushroom.columns):]:
    unused_subplot.set_visible(False)

# set the spacing between plots
plt.tight_layout()

# display the plot
plt.show()

The plot shows that the levels of target variable 'class' are balanced. 

### Encode the Categorical Variables

Algorithms such as random forest and XGBoost require the features in numeric format. Thus, we dummy encode the independent variables.

In [None]:
# separate the independent variables from the target 'class'
independent_vars = df_mushroom.drop(columns = 'class')

# dummy encode every independent variable with pandas 'get_dummies'
dummy_var = pd.get_dummies(independent_vars)

# preview the encoded features
dummy_var.head()

In [None]:
# check the shape of the dummy encoded dataframe
# the second entry of the tuple is the number of encoded features
dummy_var.shape

Now we have 115 independent features and one target variable.

### Split the Data into Train and Test Set

In [None]:
# the target variable is the 'class' column
df_target = df_mushroom['class']

# hold out 20% of the observations as the test set ('test_size')
# a fixed 'random_state' reproduces the same partition on every run
X_train, X_test, y_train, y_test = train_test_split(
    dummy_var,
    df_target,
    test_size = 0.2,
    random_state = 1,
)

# print the dimensions of the train subsets first, followed by the test subsets
for subset_name, subset in [('X_train', X_train), ('y_train', y_train),
                            ('X_test', X_test), ('y_test', y_test)]:
    print(subset_name, subset.shape)

## Model Building

We first build the random forest on the training dataset and check the accuracy of the model.

### Build a Random Forest Model

In [None]:
# random forest with 35 trees ('n_estimators')
# a fixed 'random_state' makes the fitted forest reproducible
rf_classification = RandomForestClassifier(random_state = 1, n_estimators = 35)

# train the forest on the train set; fit() returns the fitted estimator
rf_model = rf_classification.fit(X = X_train, y = y_train)

In [None]:
# predicted class for every observation in the test set
y_test_predicted = rf_model.predict(X_test)

# build the classification report for the test set and print it
rf_report = classification_report(y_test, y_test_predicted)
print(rf_report)

The report shows that the accuracy, precision, recall and f1-score are all 1 for both classes (so the sensitivity and specificity are both 1 as well). This means that the random forest model has classified every mushroom in the test set correctly.

Let us look at the important features in the random forest. Here there are 115 dummy encoded features. Thus we will plot only the top 20 important features.

In [None]:
# pair every feature name with its gini importance ('feature_importances_')
# and order the rows from the most to the least important feature
important_features = (
    pd.DataFrame({'Features': X_train.columns,
                  'Importance': rf_model.feature_importances_})
    .sort_values(by = 'Importance', ascending = False)
)

# horizontal bar plot of the 20 most important features
top_features = important_features.head(20)
sns.barplot(data = top_features, x = 'Importance', y = 'Features')

# title and axes labels; 'fontsize' sets the text size
plt.title('Feature Importance', fontsize = 15)
plt.xlabel('Importance', fontsize = 15)
plt.ylabel('Features', fontsize = 15)

# render the figure
plt.show()

The plot shows that the encoded variable 'odor_n' is the most important feature in the random forest. Next, we use the most important variables to build a random forest again.

We cannot keep only a few levels of a categorical variable: either we keep all the dummy encoded levels of a variable or we remove the variable entirely. Thus we use the distribution plots to choose the categorical variables.

Consider the following variables: `odor, stalk-surface-below-ring, gill-size, ring-type, bruises, gill-spacing, stalk-root, population, spore-print-color`

In [None]:
# categorical variables kept as independent features for the reduced model
selected_variables = [
    'odor',
    'stalk-surface-below-ring',
    'gill-size',
    'ring-type',
    'bruises',
    'gill-spacing',
    'stalk-root',
    'population',
    'spore-print-color',
]
df_imp_feat = df_mushroom[selected_variables]

# dummy encode the selected variables
dummy_var = pd.get_dummies(df_imp_feat)

# preview the encoded features
dummy_var.head()

#### Split the data into train and test set

In [None]:
# the target variable is the 'class' column
df_target = df_mushroom['class']

# hold out 20% of the observations as the test set ('test_size')
# a fixed 'random_state' reproduces the same partition on every run
split_subsets = train_test_split(dummy_var, df_target, test_size = 0.2, random_state = 1)
X_train, X_test, y_train, y_test = split_subsets

# print the dimensions of the train subsets first, followed by the test subsets
subsets_by_name = {'X_train': X_train, 'y_train': y_train,
                   'X_test': X_test, 'y_test': y_test}
for subset_name, subset in subsets_by_name.items():
    print(subset_name, subset.shape)

Now there are 43 independent features in the dataset.

### Build a Random Forest using Significant Variables

In [None]:
# random forest with 35 trees ('n_estimators')
# a fixed 'random_state' makes the fitted forest reproducible
rf_classification = RandomForestClassifier(random_state = 1, n_estimators = 35)

# train the forest on the reduced train set; fit() returns the fitted estimator
rf_model = rf_classification.fit(X = X_train, y = y_train)

In [None]:
# predicted class for every observation in the test set
y_test_predicted = rf_model.predict(X_test)

# build the classification report for the test set and print it
rf_report = classification_report(y_test, y_test_predicted)
print(rf_report)

The accuracy and f1-score of the random forest with the significant variables are both 100%.

In [None]:
# class order used for both the matrix and its axis labels
# passing it explicitly guarantees that row/column i always refers to class_order[i],
# instead of relying on the order inferred from the data
class_order = ['e', 'p']

# create a confusion matrix (rows: actual classes, columns: predicted classes)
conf_mat = confusion_matrix(y_test, y_test_predicted, labels = class_order)

# label the confusion matrix using the same class order
conf_matrix = pd.DataFrame(data = conf_mat,
                           columns = ['Predicted:' + label for label in class_order],
                           index = ['Actual:' + label for label in class_order])

# plot a heatmap to visualize the confusion matrix
# 'annot' prints the counts in the cells and 'fmt' formats them as integers
sns.heatmap(conf_matrix, annot = True, fmt = 'd', cmap = 'Greens', cbar = False, linewidths = 0.1, annot_kws = {'size':20})

# set the font size of x-axis ticks using 'fontsize'
plt.xticks(fontsize = 15)

# set the font size of y-axis ticks using 'fontsize'
plt.yticks(fontsize = 15)

# display the plot
plt.show()

The above matrix shows that all the mushrooms in the test set are correctly classified. Thus, a model built with only 43 features instead of 115 still obtains 100% accuracy.

Let us use these 43 features to build a XGBoost model.

### Build the XGBoost Model

Let us build the XGBoost model using the 43 dummy encoded features.

In [None]:
# initialize the SGBoost classifier
xgb_model = XGBClassifier(max_depth = 8, learning_rate = 0.1, random_state = 5)

# fit the model using fit() on train data
xgb_model.fit(X_train, y_train)

In [None]:
# predict the target variable
y_test_predicted = xgb_model.predict(X_test)

# print the classification report for test set
print(classification_report(y_test, y_test_predicted))

The XGBoost model also gives 100% accuracy.


So far we have built three models:
1. Random forest on all the 115 dummy encoded features
2. Random forest on the 43 dummy encoded significant features
3. XGBoost on the 43 dummy encoded significant features

All three models have 100% accuracy on the test set, i.e. every model classified the poisonous/edible mushrooms correctly! For the future prediction of a mushroom we can use either the random forest or the XGBoost model with the 43 dummy encoded features.

### Thanks for reading!