In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
for root, _dirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**Problem Statement:** Mushroom hunting, mushrooming, mushroom picking, mushroom foraging, and similar terms describe the activity of gathering mushrooms in the wild, typically for culinary use. This practice is popular throughout most of Europe, Australia, Japan, Korea, parts of the Middle East, and the Indian subcontinent, as well as the temperate regions of Canada and the United States.

In this kernel we explore an ensemble-based model called Random Forest and dig further into the following:

* Random Forest with Tuning
* Identifying ways to estimate Feature Importance

# About Dataset

**Attribute Information:** (classes: edible=e, poisonous=p)

* cap-shape: bell=b,conical=c,convex=x,flat=f, knobbed=k,sunken=s
* cap-surface: fibrous=f,grooves=g,scaly=y,smooth=s
* cap-color: brown=n,buff=b,cinnamon=c,gray=g,green=r,pink=p,purple=u,red=e,white=w,yellow=y
* bruises: bruises=t,no=f
* odor: almond=a,anise=l,creosote=c,fishy=y,foul=f,musty=m,none=n,pungent=p,spicy=s
* gill-attachment: attached=a,descending=d,free=f,notched=n
* gill-spacing: close=c,crowded=w,distant=d
* gill-size: broad=b,narrow=n
* gill-color: black=k,brown=n,buff=b,chocolate=h,gray=g, green=r,orange=o,pink=p,purple=u,red=e,white=w,yellow=y
* stalk-shape: enlarging=e,tapering=t
* stalk-root: bulbous=b,club=c,cup=u,equal=e,rhizomorphs=z,rooted=r,missing=?
* stalk-surface-above-ring: fibrous=f,scaly=y,silky=k,smooth=s
* stalk-surface-below-ring: fibrous=f,scaly=y,silky=k,smooth=s
* stalk-color-above-ring: brown=n,buff=b,cinnamon=c,gray=g,orange=o,pink=p,red=e,white=w,yellow=y
* stalk-color-below-ring: brown=n,buff=b,cinnamon=c,gray=g,orange=o,pink=p,red=e,white=w,yellow=y
* veil-type: partial=p,universal=u
* veil-color: brown=n,orange=o,white=w,yellow=y
* ring-number: none=n,one=o,two=t
* ring-type: cobwebby=c,evanescent=e,flaring=f,large=l,none=n,pendant=p,sheathing=s,zone=z
* spore-print-color: black=k,brown=n,buff=b,chocolate=h,green=r,orange=o,purple=u,white=w,yellow=y
* population: abundant=a,clustered=c,numerous=n,scattered=s,several=v,solitary=y
* habitat: grasses=g,leaves=l,meadows=m,paths=p,urban=u,waste=w,woods=d

# 1. Analysis

## Importing the libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the mushroom CSV into a DataFrame (path is relative to the notebook's working directory).
dataset = pd.read_csv('../input/mushroom-classification/mushrooms.csv')

In [None]:
# Display the loaded DataFrame.
dataset

Check for the Empty or NaN values
* For more details to handle missing data,
* Refer: https://www.geeksforgeeks.org/working-with-missing-data-in-pandas/

In [None]:
# Count missing entries per column in two ways:
#   * real NaN values, which is all that isna() detects, and
#   * the '?' placeholder, which the attribute list above documents as the
#     "missing" code for stalk-root and which isna() does not count.
# A column is only free of missing data when both counts are zero.
nan_counts = dataset.isna().sum()
placeholder_counts = dataset.eq('?').sum()
pd.DataFrame({'NaN': nan_counts, "'?' placeholder": placeholder_counts})

In [None]:
# Toy example: a small frame with deliberately missing scores, used to show
# what isna().sum() reports when NaN values are present.
# The mapping is named `scores` rather than `dict` so that the built-in
# dict type is not shadowed for the rest of the notebook session.
scores = {'First Score': [100, np.nan, np.nan, 95],
          'Second Score': [30, np.nan, 45, 56],
          'Third Score': [52, np.nan, 80, 98],
          'Fourth Score': [np.nan, np.nan, np.nan, 65]}

df = pd.DataFrame(scores)
df

In [None]:
# NaN count per column of the toy frame: First Score has 2 missing values,
# Second Score 1, Third Score 1 and Fourth Score 3.
df.isna().sum()

In [None]:
# Summary statistics for every column of the raw dataset.
dataset.describe()

In [None]:
# info() lists each column's non-null count and dtype, which is a quick way
# to spot columns containing NaN values.
dataset.info()

In [None]:
# Same summary for the toy frame, where the non-null counts are below the row count.
df.info()

In [None]:
# Summary of the target column 'class' (e = edible, p = poisonous).
dataset['class'].describe()

In [None]:
# Bar chart of the class balance: how many rows are edible (e) vs poisonous (p).
sns.countplot(x='class', data=dataset, palette=('#fe4a49', '#3da4ab'))
# Title spelling corrected ("Poisonous").
plt.title('Mushroom Poisonous vs Edible Count')
# Render explicitly so the cell shows the figure rather than the Text repr of the title.
plt.show()

In [None]:
# Exact number of rows per class, to go with the count plot.
dataset['class'].value_counts()

Analysing cap-shape

In [None]:
# Feature examined by the cells that follow.
col = 'cap-shape'

In [None]:
# Number of rows for each distinct value of the selected feature.
dataset[col].value_counts()

In [None]:
# The distinct values themselves (the index of the value_counts result).
dataset[col].value_counts().index

In [None]:
# Two views of the feature held in `col`: overall counts on the left,
# counts split by class on the right.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))
#plot 1
sns.countplot(x=col, data=dataset, ax=ax[0])
#plot 2
sns.countplot(x=col, hue='class', ax=ax[1], data=dataset)
# Render explicitly, as the other plotting cells do, so the cell shows only
# the figure and not the repr of the last Axes object.
plt.show()


Analysing cap-surface

In [None]:
# cap-surface: overall counts on the left, counts split by class on the right.
col = 'cap-surface'
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))
overall_ax, by_class_ax = ax
sns.countplot(data=dataset, x=col, ax=overall_ax)
sns.countplot(data=dataset, x=col, hue='class', ax=by_class_ax)
plt.show()

Analysing cap-color

In [None]:
# cap-color: overall counts on the left, counts split by class on the right.
col = 'cap-color'
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))
overall_ax, by_class_ax = ax
sns.countplot(data=dataset, x=col, ax=overall_ax)
sns.countplot(data=dataset, x=col, hue='class', ax=by_class_ax)
plt.show()

In [None]:
# Exact row counts per value of the feature currently held in `col`.
dataset[col].value_counts()

Analysing bruises

In [None]:
# bruises: overall counts on the left, counts split by class on the right.
col = 'bruises'
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))
overall_ax, by_class_ax = ax
sns.countplot(data=dataset, x=col, ax=overall_ax)
sns.countplot(data=dataset, x=col, hue='class', ax=by_class_ax)
plt.show()

In [None]:
# List every column name in the raw dataset.
dataset.columns

# 2. Splitting the whole dataset

LabelEncoder

In [None]:
# Work on a copy so the raw `dataset` frame stays untouched by the encoding step.
label_encoded_data = dataset.copy()

In [None]:
# Display the copy before encoding.
label_encoded_data

In [None]:
# Columns that the encoding loop will iterate over.
label_encoded_data.columns

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace every column's category letters with integer codes.
# Each column gets its own fitted encoder, stored in `label_encoders` under the
# column name, so the letter <-> code mapping can still be looked up afterwards
# (label_encoders[name].classes_ / .inverse_transform). Reusing one encoder
# for all columns would leave only the last column's mapping available.
# The loop variable is `column`, not `col`, so the notebook-level `col` used by
# the plotting cells is not overwritten as a side effect of running this cell.
label_encoders = {}
for column in label_encoded_data.columns:
    le = LabelEncoder()
    label_encoded_data[column] = le.fit_transform(label_encoded_data[column])
    label_encoders[column] = le

In [None]:
# Display the frame after encoding; every column now holds integer codes.
label_encoded_data

In [None]:
# Separate the encoded frame into predictors and target.
# Predictors: every column except the target 'class' and 'veil-type'
# (a later cell inspects veil-type's value counts to justify excluding it).
x = label_encoded_data.drop(columns=['class', 'veil-type'])

# Target: the encoded class label.
y = label_encoded_data['class']

In [None]:
# Display the predictor frame.
x

In [None]:
# Display the target series.
y

In [None]:
# Pairwise correlation of the label-encoded columns.
# NOTE(review): the integer codes come from LabelEncoder applied to category
# letters, so their numeric order carries no meaning for the feature itself —
# interpret these coefficients with care.
label_encoded_data.corr()

In [None]:
# NaN count per column after encoding.
label_encoded_data.isna().sum()

In [None]:
# Annotated heatmap of the pairwise correlations between the encoded columns.
# The notebook's reading of this plot: no pair of features stands out as
# correlated, so all features are kept for building the model.
# veil-type is the exception: it is left out because it has the same value in
# every row (the next cell checks this with value_counts).
corr_matrix = label_encoded_data.corr()
plt.figure(figsize=(16, 10))
sns.heatmap(corr_matrix, annot=True)

In [None]:
# Distribution of veil-type. If one value accounts for every row, the column
# carries no information, which is the reason it is excluded from x.
label_encoded_data['veil-type'].value_counts()

In [None]:
# Display the predictor frame again before converting it to an array.
x

In [None]:
#converting values of x and y from dataframe and series to array
x_model = x.values
y_model = y.values

In [None]:
# Display the predictor array.
x_model

In [None]:
# Display the target array.
y_model

Splitting data into Training set and Test set

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows as a test set; the seed is fixed so the split is repeatable.
split_parts = train_test_split(
    x_model,
    y_model,
    test_size=0.25,
    random_state=42,
)
x_train, x_test, y_train, y_test = split_parts

## Training the Model using RandomForest Model on Training Set

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 100 trees using the entropy split criterion, with a fixed
# seed; fitted on the training split only.
classifier = RandomForestClassifier(
    criterion='entropy',
    n_estimators=100,
    random_state=0,
)
classifier.fit(x_train, y_train)

## Predicting the Test set results from RandomForest

In [None]:
# Predict class labels for the held-out test features.
y_pred = classifier.predict(x_test)

In [None]:
# Display the predicted labels.
y_pred

In [None]:
# Display the true labels of the test set for comparison.
y_test

In [None]:
# Predict on the test set and print predictions next to the true labels:
# first column is the prediction, second column is the real class.
y_pred = classifier.predict(x_test)
print('predicted_results & real_results')
side_by_side = np.column_stack((y_pred, y_test))
print(side_by_side)

## Making the Confusion Matrix

* It is hard to believe that the model shows 100% accuracy; a perfect score like this would normally suggest that the model is overfitted.
    * But that cannot be the whole explanation, because Naive Bayes also gives very good results.
* I also randomized the training/test split with random_state=42, and Random Forest still gives 100% accuracy.

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Confusion matrix of true vs predicted labels on the test set, followed by
# the overall accuracy as the cell's displayed value.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
accuracy_score(y_true=y_test, y_pred=y_pred)

## Applying k-Fold Cross Validation for Random Forest

Surprisingly, even after 10-fold cross-validation the model still gives 100% accuracy.

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of whatever estimator `classifier` currently refers
# to, using the training split only; report mean and spread of the fold scores.
accuracies = cross_val_score(classifier, x_train, y_train, cv=10)
mean_pct = accuracies.mean() * 100
std_pct = accuracies.std() * 100
print("Accuracy: {:.2f} %".format(mean_pct))
print("Standard Deviation: {:.2f} %".format(std_pct))

## Training the Model using Naive Bayes Model on Training Set

In [None]:
from sklearn.naive_bayes import GaussianNB
# Fit a Gaussian Naive Bayes model on the same training split.
# NOTE(review): this rebinds `classifier`, so the random forest fitted earlier
# is no longer reachable under that name; re-running earlier evaluation cells
# after this one would evaluate Naive Bayes instead.
# NOTE(review): the inputs are label-encoded category codes, while GaussianNB
# models each feature as a continuous Gaussian — consider whether a categorical
# Naive Bayes variant would be a better fit.
classifier = GaussianNB()
classifier.fit(x_train, y_train)

In [None]:
# Predict on the test set with the current `classifier` and print predictions
# next to the true labels: first column is the prediction, second is the real class.
y_pred = classifier.predict(x_test)
print('predicted_results & real_results')
side_by_side = np.column_stack((y_pred, y_test))
print(side_by_side)

The Naive Bayes model gives about 92.6% accuracy.

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Confusion matrix of true vs predicted labels for the current predictions,
# followed by the overall accuracy as the cell's displayed value.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
accuracy_score(y_true=y_test, y_pred=y_pred)

## Applying k-Fold Cross Validation for Naive Bayes

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the current `classifier` on the training split;
# report the mean and standard deviation of the fold accuracies in percent.
accuracies = cross_val_score(classifier, x_train, y_train, cv=10)
mean_pct = accuracies.mean() * 100
std_pct = accuracies.std() * 100
print("Accuracy: {:.2f} %".format(mean_pct))
print("Standard Deviation: {:.2f} %".format(std_pct))