# Perform Mushroom Classification using Random Forest Classification

![](https://wallpaperaccess.com/full/365722.jpg)

**Import the libraries**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
!pip install seaborn==0.11.0
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

**Load the data**

In [None]:
# Read the mushroom CSV from the notebook's input directory and preview the first rows.
df = pd.read_csv("../input/mushroom-classification/mushrooms.csv")
df.head()

**Check for missing data**

In [None]:
# Count null entries per column.
# NOTE(review): the attribute list below documents missing stalk-root values as '?',
# which isnull() would not count -- confirm whether those need separate handling.
df.isnull().sum()

In [None]:
# (rows, columns) of the loaded frame.
df.shape

Our data contain 8124 rows and 23 columns.

In [None]:
# Data type of every column.
df.dtypes

**Understand the data**

**About this file** :

**Attribute Information**: (classes: edible=e, poisonous=p)

**cap-shape**: bell=b,conical=c,convex=x,flat=f, knobbed=k,sunken=s

**cap-surface**: fibrous=f,grooves=g,scaly=y,smooth=s

**cap-color**: brown=n,buff=b,cinnamon=c,gray=g,green=r,pink=p,purple=u,red=e,white=w,yellow=y

**bruises**: bruises=t,no=f

**odor**: almond=a,anise=l,creosote=c,fishy=y,foul=f,musty=m,none=n,pungent=p,spicy=s

**gill-attachment**: attached=a,descending=d,free=f,notched=n

**gill-spacing**: close=c,crowded=w,distant=d

**gill-size**: broad=b,narrow=n

**gill-color**: black=k,brown=n,buff=b,chocolate=h,gray=g, green=r,orange=o,pink=p,purple=u,red=e,white=w,yellow=y

**stalk-shape**: enlarging=e,tapering=t

**stalk-root**: bulbous=b,club=c,cup=u,equal=e,rhizomorphs=z,rooted=r,missing=?

**stalk-surface-above-ring**: fibrous=f,scaly=y,silky=k,smooth=s

**stalk-surface-below-ring**: fibrous=f,scaly=y,silky=k,smooth=s

**stalk-color-above-ring**: brown=n,buff=b,cinnamon=c,gray=g,orange=o,pink=p,red=e,white=w,yellow=y

**stalk-color-below-ring**: brown=n,buff=b,cinnamon=c,gray=g,orange=o,pink=p,red=e,white=w,yellow=y

**veil-type**: partial=p,universal=u

**veil-color**: brown=n,orange=o,white=w,yellow=y

**ring-number**: none=n,one=o,two=t

**ring-type**: cobwebby=c,evanescent=e,flaring=f,large=l,none=n,pendant=p,sheathing=s,zone=z

**spore-print-color**: black=k,brown=n,buff=b,chocolate=h,green=r,orange=o,purple=u,white=w,yellow=y

**population**: abundant=a,clustered=c,numerous=n,scattered=s,several=v,solitary=y

**habitat**: grasses=g,leaves=l,meadows=m,paths=p,urban=u,waste=w,woods=d

In [None]:
# Number of rows for each value of the target column.
df['class'].value_counts()

In [None]:
# Draw one pie chart per column showing the share of each category value.
colors = [
    '#003f5c', '#2f4b7c', '#6de309', '#f5b507', '#a05195', '#d45087', '#f95d6a',
    '#ff7c43', '#ffa600', '#fcca46', '#a1c181', '#619b8a', '#15e8aa',
]
for column in df.columns:
    # Compute the frequencies once; they feed both the wedges and the legend.
    counts = df[column].value_counts()
    plt.figure()
    counts.plot.pie(
        labels=None,  # category names go in the legend instead of on the wedges
        colors=colors,
        autopct='%.2f%%',
        pctdistance=0.7,
        textprops={'color': "w"},
        figsize=(5, 5),
        subplots=True,
    )
    plt.title("Mushroom {} Distribution ".format(column), fontsize=17, ha='right')
    plt.legend(labels=counts.index, loc="best", bbox_to_anchor=(1, 0.25, 0.5, 0.5))
    plt.show()

In [None]:
# For every column, plot category counts split by the target class.
class_palette = ['#2f4b7c', '#6de309']
for column in df.columns:
    sns.countplot(data=df, x=column, hue="class", palette=class_palette)
    plt.show()


In [None]:
# Cap-shape counts drawn as one facet per class value.
sns.catplot(
    data=df,
    kind="count",
    x="cap-shape",
    col="class",
    hue="class",
    palette=['#2f4b7c', '#6de309'],
    height=4,
    aspect=.7,
);

We encode the target column `class` as integers:

- `e` (edible) → 1
- `p` (poisonous) → 0

In [None]:
# Encode the target: 'e' -> 1, 'p' -> 0.
# The explicit cast keeps the column integer-typed without relying on
# `replace` silently downcasting an object column, which pandas has deprecated.
class_codes = {'e': 1, 'p': 0}
df['class'] = df['class'].replace(class_codes).astype('int64')

In [None]:
# Preview the first two rows after re-coding the class column.
df.head(2)

In [None]:
# Build the feature matrix from every column after the first
# (assumes the target `class` is the first column -- confirm against the CSV),
# one-hot encoding each column and dropping the first level of each.
feature_frame = df.iloc[:, 1:]
X = pd.get_dummies(
    feature_frame,
    columns=feature_frame.columns,
    drop_first=True,
)
X.head(2)

In [None]:
# Heatmap of pairwise correlations between the one-hot encoded features.
plt.subplots(figsize=(20, 15))
corr = X.corr()
ax = sns.heatmap(
    corr,
    cmap=sns.diverging_palette(20, 220, n=200),
    vmin=-1,
    vmax=1,
    center=0,  # centre the diverging palette on zero correlation
    square=True,
)

# Tilt the x tick labels; the trailing semicolon suppresses the cell's echoed output.
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, horizontalalignment='right');

In [None]:
# Target labels as a NumPy array (displayed as the cell output).
y = df['class'].to_numpy()
y

# Machine Learning Models

**Split the data into training and test sets**

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=52,
)

**1. Logistic Regression**

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression with default settings and score it on the held-out rows.
lrg = LogisticRegression().fit(X_train, y_train)
y_pred = lrg.predict(X_test)
print('Logistic Regression Accuracy', accuracy_score(y_test, y_pred))

# Mushroom Classification using Random Forest Classifier

**2. Random Forest Classifier**

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with trees capped at depth 2 and a fixed seed.
rfc = RandomForestClassifier(random_state=0, max_depth=2)
rfc.fit(X_train, y_train)

# Score the forest on the held-out test rows.
y_pred = rfc.predict(X_test)
print('Random Forest Classifier Accuracy', accuracy_score(y_test, y_pred))

In [None]:
#sample prediction for csv file
Decision_Tree=pd.DataFrame({'y_test':y_test,'prediction':y_pred})#df for camparison
Decision_Tree.to_csv("Decision Tree.csv")#export to csv 

# Mushroom Classification using XGBoost

In [None]:
from xgboost import XGBClassifier

# Fit a gradient-boosted classifier with default settings and score it on the test rows.
xgb = XGBClassifier().fit(X_train, y_train)
y_xgb = xgb.predict(X_test)
print('XGBoost Accuracy', accuracy_score(y_test, y_xgb))

In [None]:
#sample prediction for csv file
XG_Boost=pd.DataFrame({'y_test':y_test,'prediction':y_pred})#df for camparison
XG_Boost.to_csv("XGBoost.csv") #export to csv 

# Mushroom Classification using Keras

In [None]:
from keras.models import Sequential
from keras import layers

# One input unit per encoded feature column.
input_dim = X_train.shape[1]

# Small feed-forward network: a 10-unit ReLU hidden layer feeding a single
# sigmoid unit for the binary edible/poisonous output.
model = Sequential([
    layers.Dense(10, input_dim=input_dim, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
])

In [None]:
# Configure training: Adam optimiser, binary cross-entropy loss, accuracy as the tracked metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model.summary()

In [None]:
# Train silently for 30 epochs in mini-batches of 10, recording per-epoch
# metrics on the test split as validation data.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=30,
    batch_size=10,
    verbose=False,
)

In [None]:
# Report accuracy on the training split, then on the test split.
# `loss` and `accuracy` end up holding the test-split values.
evaluation_runs = (
    ("Training Accuracy: {:.4f}", X_train, y_train),
    ("Testing Accuracy:  {:.4f}", X_test, y_test),
)
for report_template, features, targets in evaluation_runs:
    loss, accuracy = model.evaluate(features, targets, verbose=False)
    print(report_template.format(accuracy))

In [None]:
plt.style.use('ggplot')


def plot_history(history):
    """Plot training vs. validation accuracy and loss, side by side.

    `history` must expose a `history` mapping containing the keys
    'accuracy', 'val_accuracy', 'loss' and 'val_loss', each a per-epoch
    sequence. Epochs are numbered from 1 on the x axis.
    """
    log = history.history
    # Fetch all four series up front so a missing key fails before any drawing.
    accuracy_panel = (
        log['accuracy'], log['val_accuracy'],
        'Training acc', 'Validation acc',
        'Training and validation accuracy',
    )
    loss_panel = (
        log['loss'], log['val_loss'],
        'Training loss', 'Validation loss',
        'Training and validation loss',
    )
    epochs = range(1, len(accuracy_panel[0]) + 1)

    plt.figure(figsize=(14, 5))
    for slot, panel in enumerate((accuracy_panel, loss_panel), start=1):
        train_series, val_series, train_label, val_label, heading = panel
        plt.subplot(1, 2, slot)
        plt.plot(epochs, train_series, 'b', label=train_label)
        plt.plot(epochs, val_series, 'r', label=val_label)
        plt.title(heading)
        plt.legend()
In [None]:
# Draw the accuracy and loss curves recorded by the fit call.
plot_history(history)

In [None]:
#sample prediction for csv file
Keras=pd.DataFrame({'y_test':y_test,'prediction':y_pred})#df for camparison
Keras.to_csv("Keras.csv") #export to csv 