### Abstract
Analyze data from Mushroom Classification dataset to classify the mushrooms and find which are edible and which are poisonous.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# Any results you write to the current directory are saved as output.

### Importing and printing dataset

In [None]:
# Read the mushroom table from the Kaggle input folder, report its
# (rows, columns) shape, then preview the first five records.
data = pd.read_csv('../input/mushroom-classification/mushrooms.csv')
print(data.shape)
data.head(n=5)


***
### Preparing the Mushroom Classification dataset
<p>I have to transform non-numerical labels (as long as they are hashable and comparable) to numerical labels.<br/>
I use sklearn.preprocessing.LabelEncoder to do that and fit label encoder and return encoded labels.</p>

In [None]:
from sklearn import preprocessing

# One encoder instance is re-fitted for every column, so after the loop it only
# remembers the label mapping of the last column processed.
le = preprocessing.LabelEncoder()

# Encode every column that is actually present instead of assuming exactly 23.
# Assigning by column name replaces the whole column, so it takes the encoder's
# integer dtype; writing through `.iloc[:, i]` sets the values into the existing
# object-dtype column on recent pandas versions, leaving the dtype as object.
for column in data.columns:
    data[column] = le.fit_transform(data[column])
data.head()

In the column "class" the values are: 0=edible, 1=poisonous (LabelEncoder numbers the labels in sorted order, so "e" becomes 0 and "p" becomes 1).
***
### Drop non-relevant column
From the table above it can be seen that the column "veil-type" is 0 and not contributing to the data so I remove it.

In [None]:
# Feature selection: discard "veil-type", which carries no information for the model
data = data.drop(columns=["veil-type"])
data.head()

#### Check attributes correlation
Now explore the relationship between variables by plotting the Pearson Correlation between all the attributes in dataset.

In [None]:
# Imports needed for the script
import seaborn as sns # making statistical graphics in Python
import matplotlib.pyplot as plt
%matplotlib inline

# Pairwise Pearson correlation between all (label-encoded) columns
correlations = data.astype(float).corr()
colormap = plt.cm.viridis

plt.figure(figsize=(14, 12))
plt.title('Pearson Correlation of Features', y=1.05, size=15)
# Annotated square heatmap, one cell per pair of attributes
sns.heatmap(
    correlations,
    annot=True,
    cmap=colormap,
    linecolor='white',
    linewidths=0.1,
    square=True,
    vmax=1.0,
)

In this case the attribute "gill-color" shows the strongest correlation (the largest value in absolute terms) with the "class" attribute: -0.53. This suggests that it is the most informative single attribute for the classification.

In [None]:
# Mean of the encoded "class" column for each gill-color code, highest first
(
    data[['class', 'gill-color']]
    .groupby(['gill-color'], as_index=False)
    .mean()
    .sort_values(by='class', ascending=False)
)

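The table shows, for each gill-color code, the mean of the encoded "class" column, i.e. the fraction of mushrooms of that type with class 1. Because LabelEncoder maps "e" (edible) to 0 and "p" (poisonous) to 1, this is the fraction that are poisonous; the rest are edible. For example: gill-color="4" (black) has 15% of mushrooms that are poisonous and 85% that are edible.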

#### Creating training set and test set

In [None]:
# Import train_test_split
from sklearn.model_selection import train_test_split

# Separate the target column from the features, then hold out 30% of the rows
# for testing; the fixed random_state makes the split repeatable.
y = data['class']
X = data.drop(columns=['class'])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=2
)

# Report how many rows ended up in each part
print(f"Total number of data examples {len(data)}")
print(f"Number of training data examples {len(X_train)}")
print(f"Number of test data examples {len(X_test)}")

#### Decision Tree Classifier
The goal of the Decision Trees learning algorithms is always to find the best split for each node of the tree.
For measuring the "goodness" of a split we try two criteria:
* entropy
* gini index

In [None]:
# Import DecisionTreeClassifier
from sklearn.tree import DecisionTreeClassifier, export_graphviz
# Import accuracy_score
from sklearn.metrics import accuracy_score

# Instantiate dt with the entropy criterion. The seed is pinned (same value as
# the Gini tree dt2) so that ties between equally good splits are broken the
# same way on every run and the fitted tree is reproducible.
dt = DecisionTreeClassifier(criterion="entropy", random_state=1)

#### Measure the impurity of a node using entropy:

In [None]:
# Train dt on the training split and predict the held-out rows
# (fit() returns the estimator itself, so the two calls can be chained)
y_pred = dt.fit(X_train, y_train).predict(X_test)
# Report the share of test rows whose predicted class matches the true class
print(f"Accuracy score on the test set: {accuracy_score(y_test, y_pred)!s}")

In [None]:
#Print decision tree
import graphviz
# Export the fitted entropy tree as Graphviz DOT text (out_file=None returns
# the text instead of writing a file) and wrap it so the notebook renders it.
dot_data = export_graphviz(
    dt,
    out_file=None,
    feature_names=X.columns,
    filled=True,
    rounded=True,
    special_characters=True,
)
graph = graphviz.Source(dot_data)
graph

The tree reaches 100% accuracy on the test set, which could be a sign of overfitting.
To check, I vary the maximum depth of the tree, record the accuracy at each depth and plot it to see at which depth the model begins to overfit. For this purpose I use the cross-validation method.

In [None]:
from sklearn.model_selection import KFold

def computeCVAccuracy(X, y, criterion="entropy", max_depths=range(1, 21),
                      n_splits=10, random_state=0):
    """Return the mean K-fold cross-validated accuracy for each tree depth.

    For every depth in ``max_depths`` a decision tree is trained and scored on
    each of the ``n_splits`` folds, and the fold scores of that depth alone are
    averaged.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table; rows are selected positionally with ``.iloc``.
    y : pandas.Series
        Target labels aligned row-by-row with ``X``.
    criterion : str, default "entropy"
        Split-quality criterion passed to ``DecisionTreeClassifier``.
    max_depths : iterable of int, default range(1, 21)
        Maximum depths to evaluate, in the order the results are returned.
    n_splits : int, default 10
        Number of cross-validation folds.
    random_state : int, default 0
        Seed for the fold shuffling and for the trees, so repeated calls give
        the same numbers.

    Returns
    -------
    list of float
        One mean accuracy per entry of ``max_depths``.
    """
    # `shuffle` must be passed by keyword; with an integer seed every call to
    # split() yields the same folds, so all depths are compared on equal terms.
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    foldAcc = []
    for depth in max_depths:
        # Start from an empty list for each depth so the reported mean is not
        # diluted by the scores of shallower trees.
        accuracy = []
        for train_index, test_index in kf.split(X):
            # Train and score on the fold produced by KFold itself.
            clf = DecisionTreeClassifier(
                criterion=criterion, max_depth=depth, random_state=random_state
            )
            clf.fit(X.iloc[train_index], y.iloc[train_index])
            accuracy.append(clf.score(X.iloc[test_index], y.iloc[test_index]))
        foldAcc.append(np.mean(accuracy))
    return foldAcc
    
cvAccuracy = computeCVAccuracy(X, y)

# cvAccuracy[k] is the score for max_depth k + 1, so the rows are labelled with
# the depth itself. Re-indexing a 0-based frame with range(1, 20) would drop
# depth 1 and draw every remaining score one step to the left of its depth.
df1 = pd.DataFrame(
    {'10-fold cv Accuracy': cvAccuracy},
    index=range(1, len(cvAccuracy) + 1),
)
df = df1
df.plot()
plt.title("Decision Tree - 10-fold Cross Validation Accuracy vs Depth of tree")
plt.xlabel("Depth of tree")
plt.ylabel("Accuracy")
plt.ylim([0.8,1])
plt.xlim([0,20])

#### Measure the impurity of a node using Gini index:

In [None]:
# Second tree: split quality is measured with the Gini index, seed fixed to 1
dt2 = DecisionTreeClassifier(random_state=1, criterion='gini')

In [None]:
# Train dt2 on the training split and predict the held-out rows
# (fit() returns the estimator itself, so the two calls can be chained)
y_pred2 = dt2.fit(X_train, y_train).predict(X_test)
# Report the share of test rows whose predicted class matches the true class
print(f"Accuracy score on the test set: {accuracy_score(y_test, y_pred2)!s}")

In [None]:
# Export the fitted Gini tree as Graphviz DOT text and wrap it so the notebook
# renders it as a diagram.
dot_data = export_graphviz(
    dt2,
    out_file=None,
    feature_names=X.columns,
    filled=True,
    rounded=True,
    special_characters=True,
)
graph = graphviz.Source(dot_data)
graph

The first line of each node (except those of the final row) shows the splitting condition in the form "feature <= value".
Next, we find the Gini Impurity of the node. "Samples" is simply the number of observations contained in the node.
"Value" shows the class distribution of the samples ([edible,poisonous]).

In [None]:
from sklearn.model_selection import KFold

def computeCVAccuracy(X, y, criterion="gini", max_depths=range(1, 21),
                      n_splits=10, random_state=0):
    """Return the mean K-fold cross-validated accuracy for each tree depth.

    For every depth in ``max_depths`` a decision tree is trained and scored on
    each of the ``n_splits`` folds, and the fold scores of that depth alone are
    averaged.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table; rows are selected positionally with ``.iloc``.
    y : pandas.Series
        Target labels aligned row-by-row with ``X``.
    criterion : str, default "gini"
        Split-quality criterion passed to ``DecisionTreeClassifier``.
    max_depths : iterable of int, default range(1, 21)
        Maximum depths to evaluate, in the order the results are returned.
    n_splits : int, default 10
        Number of cross-validation folds.
    random_state : int, default 0
        Seed for the fold shuffling and for the trees, so repeated calls give
        the same numbers.

    Returns
    -------
    list of float
        One mean accuracy per entry of ``max_depths``.
    """
    # `shuffle` must be passed by keyword; with an integer seed every call to
    # split() yields the same folds, so all depths are compared on equal terms.
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    foldAcc = []
    for depth in max_depths:
        # Start from an empty list for each depth so the reported mean is not
        # diluted by the scores of shallower trees.
        accuracy = []
        for train_index, test_index in kf.split(X):
            # Train and score on the fold produced by KFold itself.
            clf = DecisionTreeClassifier(
                criterion=criterion, max_depth=depth, random_state=random_state
            )
            clf.fit(X.iloc[train_index], y.iloc[train_index])
            accuracy.append(clf.score(X.iloc[test_index], y.iloc[test_index]))
        foldAcc.append(np.mean(accuracy))
    return foldAcc
    
cvAccuracy = computeCVAccuracy(X, y)

# cvAccuracy[k] is the score for max_depth k + 1, so the rows are labelled with
# the depth itself. Re-indexing a 0-based frame with range(1, 20) would drop
# depth 1 and draw every remaining score one step to the left of its depth.
df1 = pd.DataFrame(
    {'10-fold cv Accuracy': cvAccuracy},
    index=range(1, len(cvAccuracy) + 1),
)
df = df1
df.plot()
plt.title("Decision Tree - 10-fold Cross Validation Accuracy vs Depth of tree")
plt.xlabel("Depth of tree")
plt.ylabel("Accuracy")
plt.ylim([0.8,1])
plt.xlim([0,20])