Using a Tree to Classify Whether Mushrooms Are Poisonous
Written by:
Ethan Bartlett

Dataset: https://www.kaggle.com/uciml/mushroom-classification

In [None]:
import os

import numpy as np  # linear algebra
import pandas as pd  # data organization tool used for data processing/cleaning, CSV file I/O (e.g. pd.read_csv)

# Input data files are expected under the "/kaggle/input" directory.
# Walk that directory tree and print the full path of every file found, so the
# available datasets are visible in the cell output.
input_files = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for path in input_files:
    print(path)

# Nothing is downloaded here: the CSV is read from the local input directory
# into a pandas DataFrame.
mushroom = pd.read_csv('/kaggle/input/mushroom-classification/mushrooms.csv')

# Preview the first rows of the raw data.
mushroom.head()

In [None]:
# Importing necessary sklearn libraries
from sklearn.preprocessing import LabelEncoder

# Encode every column of the frame (features and 'class' alike) as integer codes.
# Each column gets its own fitted encoder, stored in `encoders` under the column
# name, so the integer codes can still be traced back to the original values:
#   encoders[col].classes_                  -> original values, in code order
#   encoders[col].inverse_transform(codes)  -> codes mapped back to values
# Refitting one shared encoder would silently discard every mapping but the last.
encoders = {}
for col in mushroom.columns:
    # `labelencoder` is rebound on each pass; after the loop it still refers to
    # the encoder fitted on the last column.
    labelencoder = LabelEncoder()
    mushroom[col] = labelencoder.fit_transform(mushroom[col])
    encoders[col] = labelencoder

# Preview the encoded data.
mushroom.head()

In [None]:
# Split the encoded frame into model inputs and target, both as NumPy arrays:
# X holds every column except 'class', y holds the 'class' column.
X = mushroom.drop(columns='class').values
print('X:', X)

y = mushroom['class'].values
print('y:', y)

In [None]:
# Partition X and y into training and testing sets.
from sklearn.model_selection import train_test_split

# 30% of the samples are held out for testing; stratify=y asks for the class
# proportions of y to be kept in both partitions, and the fixed random_state
# makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
    stratify=y,
)

In [None]:
# Fit a single decision tree on the training data.
from sklearn.tree import DecisionTreeClassifier as skTree

# Entropy split criterion, depth capped at 5, fixed seed for repeatability.
# fit() returns the fitted estimator, so construction and training are chained.
tree = skTree(
    criterion='entropy',
    max_depth=5,
    random_state=3,
).fit(X_train, y_train)

In [None]:
# Fit a random forest on the training data.
from sklearn.ensemble import RandomForestClassifier

# Ten trees with no depth limit, splitting nodes that hold at least two
# samples; fixed seed for repeatability. fit() returns the fitted estimator,
# so construction and training are chained.
forest = RandomForestClassifier(
    n_estimators=10,
    max_depth=None,
    min_samples_split=2,
    random_state=0,
).fit(X_train, y_train)

In [None]:
# Tree results on the testing dataset
predict = tree.predict(X_test)

# Cross-tabulate predicted labels (rows) against true labels (columns).
# Each cell counts the samples with that (predicted, actual) pair:
#   predicted 0, actual 0 -> classified right
#   predicted 0, actual 1 -> classified wrong
#   etc.
# The axes are named explicitly so the table can be read without this legend.
pd.crosstab(predict, y_test, rownames=['predicted'], colnames=['actual'])

In [None]:
# Forest results on the testing dataset
predict = forest.predict(X_test)

# Rows are the forest's predicted labels, columns the true labels; cells where
# the two labels match count correct classifications. The axes are named so
# the table is unambiguous.
pd.crosstab(predict, y_test, rownames=['predicted'], colnames=['actual'])

In [None]:
# Tree results on the training dataset
predict = tree.predict(X_train)

# Rows are the tree's predicted labels, columns the true training labels; cells
# where the two labels match count correct classifications. The axes are named
# so the table is unambiguous.
pd.crosstab(predict, y_train, rownames=['predicted'], colnames=['actual'])

In [None]:
# Forest results on the training dataset
predict = forest.predict(X_train)

# Rows are the forest's predicted labels, columns the true training labels;
# cells where the two labels match count correct classifications. The axes are
# named so the table is unambiguous.
pd.crosstab(predict, y_train, rownames=['predicted'], colnames=['actual'])