In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the mushroom-classification CSV into a DataFrame, then display its (rows, columns) shape.
df = pd.read_csv(filepath_or_buffer="/kaggle/input/mushroom-classification/mushrooms.csv")
df.shape

##### Let us take a look at the data.

In [None]:
# Preview the first five rows of the raw data.
df.head(n=5)

In [None]:
# Preview the last five rows of the raw data.
df.tail(n=5)

In [None]:
# Summary statistics, transposed so that each feature is a row.
df.describe().T

In [None]:
# Print the DataFrame summary (column names, non-null counts and dtypes).
df.info()

### Null Values:

In [None]:
# Count missing values per column once, then keep only the columns that have any.
null_counts = df.isnull().sum()
null_counts[null_counts > 0]

There are no null values in any column.

### Target Variable:

In [None]:
# Show how many mushrooms fall in each class, numerically and as a bar chart.
print(df["class"].value_counts())
# Pass the column through `x=` / `data=`: recent seaborn releases accept `data` as
# the only positional argument, so a bare positional Series is no longer read as `x`.
sns.countplot(x="class", data=df, palette="Reds")
plt.show()

- In our target column, we have two classes: "e" (edible) and "p" (poisonous).
- It looks like a fairly balanced data set with an almost equal number of poisonous and edible mushrooms.
- Let us check target variable vs independent variables:

In [None]:
def draw_countplot(dataframe, features, rows, cols, hue="class", figsize=(20, 20)):
    """Draw a grid of count plots, one per feature, split by the `hue` column.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Data to plot. It must contain every column named in `features`
        as well as the `hue` column.
    features : iterable of str
        Column names to plot, one subplot each, in iteration order.
    rows, cols : int
        Dimensions of the subplot grid. `rows * cols` must be at least the
        number of features, otherwise `fig.add_subplot` raises `ValueError`.
    hue : str, default "class"
        Column used to colour the bars and named in each subplot title.
    figsize : tuple of float, default (20, 20)
        Size of the whole figure in inches.
    """
    fig = plt.figure(figsize=figsize)
    for position, feature in enumerate(features, start=1):
        ax = fig.add_subplot(rows, cols, position)
        # Plot from the frame that was passed in, not from a notebook-level global,
        # so the function works on any DataFrame.
        sns.countplot(x=feature, hue=hue, data=dataframe, ax=ax, palette="Reds")
        ax.set_title(f"{feature} vs {hue}", color='DarkRed')

    fig.tight_layout()
    plt.show()


# Pass the full frame (it has to include the "class" hue column) and list every
# other column as a feature to plot.
draw_countplot(df, df.columns.drop("class"), 6, 4)

### Label Encoding the target variable:
- This approach is very simple: it converts each value in a column to a number. LabelEncoder encodes labels with a value between 0 and n_classes-1, where n_classes is the number of distinct labels.
- In our target column, we have two classes: "e" (edible) and "p" (poisonous). Let us label encode them.

In [None]:
from sklearn.preprocessing import LabelEncoder
# Fit the encoder on the raw class labels, then replace them with integer codes.
# LabelEncoder sorts its classes, so "e" becomes 0 and "p" becomes 1.
Encoder_y = LabelEncoder().fit(df["class"])
df["class"] = Encoder_y.transform(df["class"])

### Dummies value approach for Independent Columns:
- `pd.get_dummies` is a common way to create dummy variables for categorical features.
- It allows encoding as many category columns as you would like.
- In our dataset all the independent variables are categorical. Let us encode them and drop the first dummy column of each feature.

In [None]:
# One-hot encode the categorical feature columns, dropping the first level of each.
df = pd.get_dummies(data=df, drop_first=True)

In [None]:
# Preview the first five rows of the encoded frame.
df.head(n=5)

### Separating Predictors and Response:

In [None]:
# Predictors: every column except the target.
X = df.drop(columns='class')
# Response: the encoded target column.
y = df.loc[:, 'class']

### Splitting Train and Test Data:

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 33% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

### Metrics:

***
$\mathbf{\text{Confusion Matrix:}}$<br>

True Positives (TP) - These are the correctly predicted positive values which means that the value of actual class is Positive and the value of predicted class is also Positive.

True Negatives (TN) - These are the correctly predicted negative values which means that the value of actual class is negative and value of predicted class is also negative.

False Positives (FP) – When actual class is negative and predicted class is positive.

False Negatives (FN) – When actual class is positive but predicted class is negative.

***

$\mathbf{\text{Accuracy:}}$<br>

Accuracy is the ratio of number of correct predictions to the total number of predictions

$$Accuracy =  (TP+TN ) / (TP+TN+FN+FP)$$
***

$\mathbf{\text{Classification Report:}}$<br>

The classification report visualizer displays the precision, recall, F1, and support scores for the model.

Precision - Precision is the ratio of correctly predicted positive observations to the total predicted positive observations. It answers the question, "Among all positive predictions, how many are truly positive?"

$$Precision = TP/(TP+FP)$$

Recall (Sensitivity) - Recall is the ratio of correctly predicted positive observations to all observations in the actual positive class. It answers the question, "Of all the actual positive cases, how many did we predict correctly?"

$$Recall = TP/(TP+FN)$$

F1 score - F1 Score is the harmonic mean of Precision and Recall. Therefore, this score takes both false positives and false negatives into account.

$$F1\ Score = 2 \times (Recall \times Precision) / (Recall + Precision)$$

### Import the required Libraries:

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score

### DecisionTreeClassifier
- A decision tree is a flowchart-like tree structure, where each internal node denotes a test on an attribute, each branch represents an outcome of the test, and each leaf node (terminal node) holds a class label.
- Decision trees classify instances by sorting them down the tree from the root to some leaf node, which provides the classification of the instance.

In [None]:
# Fit a single decision tree and evaluate it on the held-out test split.
# random_state is fixed so the fitted tree and the reported metrics are reproducible
# on a fresh run; scikit-learn randomly permutes the candidate features at each
# split even with the default "best" splitter.
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)
# Predict once and reuse the result for every metric instead of predicting three times.
y_pred = model.predict(X_test)
print("Testing Accuracy :", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

### Random Forest:
- Random forest is a classifier that evolves from decision trees. It actually consists of many decision trees.
- To classify a new instance, each decision tree provides a classification for input data. Random forest collects the classifications and chooses the most voted prediction as the result. 

In [None]:
# Fit a random forest and evaluate it on the held-out test split.
# random_state is fixed because the forest bootstraps rows and samples features
# at random; without a seed the metrics can differ from run to run.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)
# Predict once and reuse the result for every metric instead of predicting three times.
y_pred = model.predict(X_test)
print("Testing Accuracy : ", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Both the Decision Tree and the Random Forest give us an accuracy of 100% on the test set.