# Mushroom Classification Decision Tree

Importing Modules

In [488]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import numpy as np

Reading in dataset

In [489]:
# Load the mushroom table from the CSV file that sits next to the notebook.
df = pd.read_csv(filepath_or_buffer='mushrooms.csv')

# Preview the first five rows.
df.head(n=5)

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


Checking for missing data

In [490]:
# Per-column count of missing cells (isnull is the pandas alias of isna).
df.isnull().sum()

class                       0
cap-shape                   0
cap-surface                 0
cap-color                   0
bruises                     0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
dtype: int64

Data Preprocessing

In [491]:
# Standalone encoder instance; the transformation below does not use it.
le = LabelEncoder()

# Encode every column independently, replacing each column's category
# values with integer codes produced by its own LabelEncoder.
df = df.apply(lambda column: LabelEncoder().fit_transform(column))

df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,...,2,7,7,0,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,0,1


In [492]:
def build_decision_tree(ratio, max_depth=None, criterion='gini', df=df, random_state=None):
    '''
    Train a decision tree on a random train/test split and return its test accuracy.

    Parameters
    ----------
    ratio
        Forwarded to train_test_split as test_size.
    max_depth
        Forwarded to DecisionTreeClassifier.
    criterion
        'gini' or 'entropy'. Any other value is rejected.
    df
        Frame with a 'class' column (the target); every other column is used
        as a feature. The default is bound when this cell is executed, so
        re-run the cell after replacing the global df.
    random_state
        Optional seed forwarded to both the split and the classifier.
        The default None keeps the previous unseeded behaviour.

    Returns
    -------
    The accuracy_score on the held-out rows, or None (after printing
    'Invalid criterion') when the criterion is not supported.
    '''
    # Validate first so an unsupported criterion does not pay for a split.
    if criterion not in ('gini', 'entropy'):
        print('Invalid criterion')
        return None

    target = df['class']
    features = df.drop(columns=['class'])
    x_train, x_test, y_train, y_test = train_test_split(
        features, target, test_size=ratio, random_state=random_state
    )

    model = DecisionTreeClassifier(
        max_depth=max_depth, criterion=criterion, random_state=random_state
    )
    model.fit(x_train, y_train)

    return accuracy_score(y_test, model.predict(x_test))

In [493]:
def build_random_forest(ratio, n_estimators=130, max_depth=None, criterion='gini', df=df, random_state=None):
    '''
    Train a random forest on a random train/test split and return its test accuracy.

    Parameters
    ----------
    ratio
        Forwarded to train_test_split as test_size.
    n_estimators
        Number of trees, forwarded to RandomForestClassifier.
    max_depth
        Forwarded to RandomForestClassifier.
    criterion
        'gini' or 'entropy'. Any other value is rejected.
    df
        Frame with a 'class' column (the target); every other column is used
        as a feature. The default is bound when this cell is executed, so
        re-run the cell after replacing the global df.
    random_state
        Optional seed forwarded to both the split and the classifier.
        The default None keeps the previous unseeded behaviour.

    Returns
    -------
    The accuracy_score on the held-out rows, or None (after printing
    'Invalid criterion') when the criterion is not supported.
    '''
    # Validate first so an unsupported criterion does not pay for a split.
    if criterion not in ('gini', 'entropy'):
        print('Invalid criterion')
        return None

    target = df['class']
    features = df.drop(columns=['class'])
    x_train, x_test, y_train, y_test = train_test_split(
        features, target, test_size=ratio, random_state=random_state
    )

    model = RandomForestClassifier(
        max_depth=max_depth,
        n_estimators=n_estimators,
        criterion=criterion,
        random_state=random_state,
    )
    model.fit(x_train, y_train)

    return accuracy_score(y_test, model.predict(x_test))

In [494]:
# Column layout of the experiment log; one row is recorded per tested configuration.
columns = [
    'Model Type',
    'Training Split Ratio',
    'Max Depth',
    'Criterion',
    'Number of Trees',
    'Mean Accuracy',
]

# Start with an empty log that the sweep functions fill in.
results = pd.DataFrame(data=None, columns=columns)

In [495]:
# Values passed as the `ratio` argument of the model builders.
test_ratios = [0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45]

# Split criteria to compare.
test_criterion = ['gini', 'entropy']

# Maximum tree depths to compare (1 through 6).
test_depth = list(range(1, 7))

# Upper bound of the forest-size sweep: test_forest iterates over
# range(125, num_trees + 1), so this must be at least 125 for any run to happen.
num_trees = 130

In [496]:
def test_decision_tree(tests):
    '''
    Sweep decision-tree settings and log the mean accuracy of each one.

    For every combination of test_ratios, test_criterion and test_depth the
    model is built `tests` times via build_decision_tree; the mean accuracy
    is printed and recorded as one row in the global `results` frame.

    Parameters
    ----------
    tests
        Number of repetitions per configuration; must be a positive integer.

    Raises
    ------
    ValueError
        If `tests` is smaller than 1 (a mean over zero runs is undefined).

    Notes
    -----
    The 'Training Split Ratio' column stores the `ratio` value that is handed
    to build_decision_tree.
    '''
    global results

    if tests < 1:
        raise ValueError('tests must be a positive integer')

    # Collect rows in a plain list and build the frame once: DataFrame.append
    # was removed in pandas 2.0 and growing a frame row by row is quadratic.
    rows = []
    try:
        for ratio in test_ratios:
            for criterion in test_criterion:
                for depth in test_depth:
                    mean_accuracy = sum(
                        build_decision_tree(ratio, depth, criterion)
                        for _ in range(tests)
                    ) / tests
                    rows.append({
                        'Model Type': 'Decision Tree',
                        'Training Split Ratio': ratio,
                        'Max Depth': depth,
                        'Criterion': criterion,
                        'Number of Trees': np.nan,
                        'Mean Accuracy': mean_accuracy,
                    })
                    print('Decision Tree Test: ', ratio, criterion, depth, 'completed with an accuracy of ', mean_accuracy)
    finally:
        # Merge whatever was collected, so an interrupted sweep keeps its partial results.
        if rows:
            new_rows = pd.DataFrame(rows, columns=results.columns)
            if results.empty:
                results = new_rows
            else:
                results = pd.concat([results, new_rows], ignore_index=True)

In [497]:
def test_forest(tests):
    '''
    Sweep random-forest settings and log the mean accuracy of each one.

    For every combination of test_ratios, test_criterion, test_depth and a
    tree count in range(125, num_trees + 1) the model is built `tests` times
    via build_random_forest; the mean accuracy is printed and recorded as one
    row in the global `results` frame.

    Parameters
    ----------
    tests
        Number of repetitions per configuration; must be a positive integer.

    Raises
    ------
    ValueError
        If `tests` is smaller than 1 (a mean over zero runs is undefined).

    Notes
    -----
    The 'Training Split Ratio' column stores the `ratio` value that is handed
    to build_random_forest. The lower bound of the tree-count sweep (125) is
    hard-coded here.
    '''
    global results

    if tests < 1:
        raise ValueError('tests must be a positive integer')

    # Collect rows in a plain list and build the frame once: DataFrame.append
    # was removed in pandas 2.0 and growing a frame row by row is quadratic.
    rows = []
    try:
        for ratio in test_ratios:
            for criterion in test_criterion:
                for depth in test_depth:
                    for trees in range(125, num_trees + 1):
                        mean_accuracy = sum(
                            build_random_forest(ratio, trees, depth, criterion)
                            for _ in range(tests)
                        ) / tests
                        rows.append({
                            'Model Type': 'Random Forest',
                            'Training Split Ratio': ratio,
                            'Max Depth': depth,
                            'Criterion': criterion,
                            'Number of Trees': trees,
                            'Mean Accuracy': mean_accuracy,
                        })
                        print('Forest Test: ', ratio, criterion, depth, trees, 'completed with an accuracy of ', mean_accuracy)
    finally:
        # Merge whatever was collected, so an interrupted sweep keeps its partial results.
        if rows:
            new_rows = pd.DataFrame(rows, columns=results.columns)
            if results.empty:
                results = new_rows
            else:
                results = pd.concat([results, new_rows], ignore_index=True)

In [498]:
def test(tests):
    '''
    Run both sweeps, decision trees first and random forests second.

    `tests` is passed unchanged to test_decision_tree and test_forest.
    '''
    test_decision_tree(tests)
    test_forest(tests)

In [499]:
# Run every configuration 100 times.
test(100)

# Rank by mean accuracy (best first) and drop rows whose mean accuracy is exactly 1.0.
results = (
    results
    .sort_values(by='Mean Accuracy', ascending=False)
    .loc[lambda frame: frame['Mean Accuracy'].ne(1.0)]
)

# Persist the ranked table without the index column.
results.to_csv('results.csv', index=False)

Decision Tree Test:  0.1 gini 1 completed with an accuracy of  0.7918327183271834
Decision Tree Test:  0.1 gini 2 completed with an accuracy of  0.9108364083640833
Decision Tree Test:  0.1 gini 3 completed with an accuracy of  0.9578228782287824
Decision Tree Test:  0.1 gini 4 completed with an accuracy of  0.9764821648216483
Decision Tree Test:  0.1 gini 5 completed with an accuracy of  0.9789052890528903
Decision Tree Test:  0.1 gini 6 completed with an accuracy of  0.9953751537515377
Decision Tree Test:  0.1 entropy 1 completed with an accuracy of  0.7336039360393605
Decision Tree Test:  0.1 entropy 2 completed with an accuracy of  0.9123247232472328
Decision Tree Test:  0.1 entropy 3 completed with an accuracy of  0.956088560885609
Decision Tree Test:  0.1 entropy 4 completed with an accuracy of  0.9536900369003692
Decision Tree Test:  0.1 entropy 5 completed with an accuracy of  0.9770356703567039
Decision Tree Test:  0.1 entropy 6 completed with an accuracy of  0.991709717097171
