# Introduction

This dataset includes descriptions of hypothetical samples corresponding to 23 species of gilled mushrooms in the Agaricus and Lepiota family, drawn from The Audubon Society Field Guide to North American Mushrooms (1981). Each species is identified as definitely edible, definitely poisonous, or of unknown edibility and not recommended. This last class was combined with the poisonous one. The Guide clearly states that there is no simple rule for determining the edibility of a mushroom; no rule like "leaflets three, let it be" for Poison Oak and Ivy.

In [None]:
# including packages

import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Load the mushroom records from the CSV file into a pandas DataFrame.
df = pd.read_csv(filepath_or_buffer='mushrooms.csv')

# Preview the first rows (head shows five by default).
df.head(n=5)

In [None]:
# Remember the column order; later cells index into this list by position.
columns = list(df.columns)

# col_d maps column name -> {integer code: original value in that column}.
col_d = {}

for x in columns:
    # pd.factorize replaces each distinct value with an integer code and
    # returns the distinct values such that uniques[i] is the value that
    # received code i.
    codes, uniques = pd.factorize(df[x])
    # Pair codes with values by position in `uniques`. Going through
    # set(codes) instead would depend on set iteration order and would
    # shift every entry if the -1 "missing" sentinel appeared in `codes`.
    col_d[x] = dict(enumerate(uniques))
    # Overwrite the column in place with its integer codes.
    df[x] = codes

print(col_d)
df.head()

In [None]:
# Distinct values present in the 'veil-type' column.
set(df['veil-type'])

In [None]:
# Print a summary of the DataFrame (columns, non-null counts, dtypes).
df.info()

In [None]:
# def_d maps column name -> {single-letter value: human-readable name}.
# Entries are keyed by position in `columns`, so this table assumes the CSV
# keeps the column order it was written for (TODO confirm if the file changes).
def_d = {
    columns[0]: {'e': 'edible', 'p': 'poisonous'},
    columns[1]: {
        'b': 'bell', 'c': 'conical', 'x': 'convex',
        'f': 'flat', 'k': 'knobbed', 's': 'sunken',
    },
    columns[2]: {'f': 'fibrous', 'g': 'grooves', 'y': 'scaly', 's': 'smooth'},
    columns[3]: {
        'n': 'brown', 'b': 'buff', 'c': 'cinnamon', 'g': 'gray', 'r': 'green',
        'p': 'pink', 'u': 'purple', 'e': 'red', 'w': 'white', 'y': 'yellow',
    },
    columns[4]: {'t': 'bruises', 'f': 'no'},
    columns[5]: {
        'a': 'almond', 'l': 'anise', 'c': 'creosote', 'y': 'fishy', 'f': 'foul',
        'm': 'musty', 'n': 'none', 'p': 'pungent', 's': 'spicy',
    },
    columns[6]: {'a': 'attached', 'd': 'descending', 'f': 'free', 'n': 'notched'},
    columns[7]: {'c': 'close', 'w': 'crowded', 'd': 'distant'},
    columns[8]: {'b': 'broad', 'n': 'narrow'},
    columns[9]: {
        'k': 'black', 'n': 'brown', 'b': 'buff', 'h': 'chocolate', 'g': 'gray',
        'r': 'green', 'o': 'orange', 'p': 'pink', 'u': 'purple', 'e': 'red',
        'w': 'white', 'y': 'yellow',
    },
    columns[10]: {'e': 'enlarging', 't': 'tapering'},
    columns[11]: {
        'b': 'bulbous', 'c': 'club', 'u': 'cup', 'e': 'equal',
        'z': 'rhizomorphs', 'r': 'rooted', '?': 'missing',
    },
    columns[12]: {'f': 'fibrous', 'y': 'scaly', 'k': 'silky', 's': 'smooth'},
    columns[13]: {'f': 'fibrous', 'y': 'scaly', 'k': 'silky', 's': 'smooth'},
    columns[14]: {
        'n': 'brown', 'b': 'buff', 'c': 'cinnamon', 'g': 'gray', 'o': 'orange',
        'p': 'pink', 'e': 'red', 'w': 'white', 'y': 'yellow',
    },
    columns[15]: {
        'n': 'brown', 'b': 'buff', 'c': 'cinnamon', 'g': 'gray', 'o': 'orange',
        'p': 'pink', 'e': 'red', 'w': 'white', 'y': 'yellow',
    },
    columns[16]: {'p': 'partial', 'u': 'universal'},
    columns[17]: {'n': 'brown', 'o': 'orange', 'w': 'white', 'y': 'yellow'},
    columns[18]: {'n': 'none', 'o': 'one', 't': 'two'},
    columns[19]: {
        'c': 'cobwebby', 'e': 'evanescent', 'f': 'flaring', 'l': 'large',
        'n': 'none', 'p': 'pendant', 's': 'sheathing', 'z': 'zone',
    },
    columns[20]: {
        'k': 'black', 'n': 'brown', 'b': 'buff', 'h': 'chocolate', 'r': 'green',
        'o': 'orange', 'u': 'purple', 'w': 'white', 'y': 'yellow',
    },
    columns[21]: {
        'a': 'abundant', 'c': 'clustered', 'n': 'numerous',
        's': 'scattered', 'v': 'several', 'y': 'solitary',
    },
    columns[22]: {
        'g': 'grasses', 'l': 'leaves', 'm': 'meadows', 'p': 'paths',
        'u': 'urban', 'w': 'waste', 'd': 'woods',
    },
}

print(def_d)

In [None]:
# Heatmap of the pairwise column correlations. df.corr() uses its default
# method here; df.corr(method='spearman') is the rank-based alternative.
sns.heatmap(data=df.corr())

In [None]:
def df_label(df, column, def_d, col_d):
    """Add a readable ``<column>-label`` column to ``df`` and return ``df``.

    Each value in ``df[column]`` is looked up in ``col_d[column]`` and the
    result is then looked up in ``def_d[column]``; the final values are stored
    in a new column named ``column + '-label'``.

    Args:
        df: DataFrame holding ``column``; it is modified in place.
        column: Name of the column to translate.
        def_d: Mapping ``{column: {intermediate value: label}}``.
        col_d: Mapping ``{column: {value in df[column]: intermediate value}}``.

    Returns:
        The same ``df`` object, now containing the extra label column.

    Raises:
        KeyError: If a value has no entry in either mapping.
    """
    labels = []
    for code in df[column]:
        # Two-step lookup: stored value -> intermediate value -> label.
        labels.append(def_d[column][col_d[column][code]])
    df[column + '-label'] = labels
    return df

def plot_class(plt, df, name, column):
    """Draw a stacked histogram of ``column`` broken down by class.

    The rows of ``df`` are split by their value in ``df['class']`` and the
    ``column`` values of each group are passed to ``plt.hist`` as separate
    datasets, so each bar shows how one category divides between the classes.
    (Passing ``df['class']`` itself as the second dataset, as was done before,
    only stacks the class codes next to the feature values.)

    Args:
        plt: Plotting interface providing ``hist``, ``title``, ``xlabel``,
            ``ylabel`` and ``legend`` (e.g. ``matplotlib.pyplot``).
        df: DataFrame containing ``column`` and a ``'class'`` column.
        name: Display name used in the title and the x-axis label.
        column: Name of the column whose values are plotted.
    """
    # One dataset per class value, in ascending order of the class value.
    by_class = [
        df.loc[df['class'] == code, column]
        for code in sorted(set(df['class']))
    ]
    plt.hist(by_class, 25, stacked=True)
    plt.title(name + ' and class correlation')
    plt.xlabel(name)
    plt.ylabel('Count')
    # NOTE(review): the legend order assumes the lowest class code means
    # poisonous and the next one edible -- confirm against the class encoding.
    plt.legend(['poisonous', 'edible'])


In [None]:
# Add the readable 'odor-label' column, then plot it against the class.
df = df_label(df, 'odor', def_d, col_d)
plot_class(plt, df, 'Odor', 'odor-label')

In [None]:
# Add the readable 'veil-type-label' column, then plot it against the class.
df = df_label(df, 'veil-type', def_d, col_d)
plot_class(plt, df, 'Veil-type', 'veil-type-label')

In [None]:
# Add the readable 'bruises-label' column, then plot it against the class.
df = df_label(df, 'bruises', def_d, col_d)
plot_class(plt, df, 'Bruises', 'bruises-label')

In [None]:
# Add the readable 'gill-size-label' column, then plot it against the class.
df = df_label(df, 'gill-size', def_d, col_d)
plot_class(plt, df, 'Gill size', 'gill-size-label')

In [None]:
# Add the readable 'stalk-surface-above-ring-label' column, then plot it
# against the class.
df = df_label(df, 'stalk-surface-above-ring', def_d, col_d)
plot_class(plt, df, 'Stalk surface above ring', 'stalk-surface-above-ring-label')

In [None]:
# Add the readable 'stalk-surface-below-ring-label' column, then plot it
# against the class. The display name says "below" to match the plotted
# column (it previously repeated the "above ring" title).
df = df_label(df=df, column='stalk-surface-below-ring', def_d=def_d, col_d=col_d)
plot_class(plt=plt, df=df, name='Stalk surface below ring', column='stalk-surface-below-ring-label')

In [None]:
# Add the readable 'spore-print-color-label' column, then plot it against
# the class.
df = df_label(df, 'spore-print-color', def_d, col_d)
plot_class(plt, df, 'Spore print color', 'spore-print-color-label')

In [None]:
# Add the readable 'ring-type-label' column, then plot it against the class.
df = df_label(df, 'ring-type', def_d, col_d)
plot_class(plt, df, 'Ring type', 'ring-type-label')

In [None]:
# Columns that are not model inputs: the target itself plus every helper
# column whose name ends in '-label' (added for plotting only).
to_drop = ['class'] + [col for col in df.columns if col.endswith('-label')]

# X holds the remaining feature columns; Y is the target column.
X = df.drop(columns=to_drop)
Y = df['class']

In [None]:
# Split with 80% of the rows held out for testing; random_state fixes the
# shuffle so the split is reproducible.
# NOTE(review): the original note here read "test_size from 0.9994 to 0.1",
# presumably the range of values that was tried -- confirm.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.8, random_state=1
)
print(X_train.shape, X_test.shape, sep='\n')

In [None]:
# Fit a logistic regression on the training split.
log_r = LogisticRegression(max_iter=1000)
log_r.fit(X_train, Y_train)

# Pair each feature name with its fitted coefficient. ravel() flattens the
# coefficient array whatever the number of features is, instead of assuming
# exactly 22 of them.
print('Logistic Regression coefficients: ' + str([str(x) + ' : ' + str(y) for x, y in zip(columns[1:], log_r.coef_.ravel())]))

# score() reports accuracy on the given split.
print('Logistic Regression train  score: ' + str(log_r.score(X_train, Y_train)))
print('Logistic Regression test   score: ' + str(log_r.score(X_test,  Y_test )))

# Balanced accuracy on the held-out rows.
Y_pred = log_r.predict(X_test)
print('Logistic Regression score: ' + str(balanced_accuracy_score(Y_test, Y_pred)))

In [None]:
# Fit an ordinary least-squares model on the training split (fit returns
# the estimator itself, so it can be chained).
lin_r = LinearRegression().fit(X_train, Y_train)

# Pair each feature name with its fitted coefficient.
lin_coefficients = [
    f'{feature} : {weight}'
    for feature, weight in zip(columns[1:], lin_r.coef_)
]
print(f'Linear Regression coefficients: {lin_coefficients}')

# For a regressor, score() is the R^2 coefficient of determination.
print(f'Linear Regression train  score: {lin_r.score(X_train, Y_train)}')
print(f'Linear Regression test   score: {lin_r.score(X_test, Y_test)}')
In [None]:
# Fit a linear support-vector classifier on the training split.
lin_svc = LinearSVC(max_iter=10000).fit(X_train, Y_train)

# score() reports accuracy on the given split.
print(f'Linear SVC train score: {lin_svc.score(X_train, Y_train)}')
print(f'Linear SVC test  score: {lin_svc.score(X_test, Y_test)}')

# Balanced accuracy on the held-out rows.
Y_pred = lin_svc.predict(X_test)
print(f'Linear SVC score: {balanced_accuracy_score(Y_test, Y_pred)}')

In [None]:
# Fit a decision tree on the training split. No random_state is set, so
# results may vary slightly between runs.
dt_c = DecisionTreeClassifier().fit(X_train, Y_train)

# score() reports accuracy on the given split.
print(f'Decision Tree Classifier train score: {dt_c.score(X_train, Y_train)}')
print(f'Decision Tree Classifier test  score: {dt_c.score(X_test, Y_test)}')

# Balanced accuracy on the held-out rows.
Y_pred = dt_c.predict(X_test)
print(f'Decision Tree Classifier score: {balanced_accuracy_score(Y_test, Y_pred)}')

In [None]:
# Fit a k-nearest-neighbours classifier (default settings) on the training
# split.
knn_c = KNeighborsClassifier().fit(X_train, Y_train)

# score() reports accuracy on the given split.
print(f'KNN Classifier train score: {knn_c.score(X_train, Y_train)}')
print(f'KNN Classifier test  score: {knn_c.score(X_test, Y_test)}')

# Balanced accuracy on the held-out rows.
Y_pred = knn_c.predict(X_test)
print(f'KNN Classifier score: {balanced_accuracy_score(Y_test, Y_pred)}')