## Introduction
The [Mushroom Classification](https://www.kaggle.com/uciml/mushroom-classification) dataset has information on 23 types of mushrooms in North America, with 23 data columns relating to traits like cap, gill, spacing, bruises, stalk, surface, color, veil, ring, spores, population and habitat. There are 8,124 data rows. The mushrooms are categorized as either edible or poisonous. For this project, we will be using 5 algorithms:
* logistic regression
* decision tree classification with tree display
* XGBoost with feature importance
* neural network with keras
* neural network with pytorch

By Mary T

The questions that this project plans to answer are:
1. What features are the best features to predict whether a mushroom is poisonous or edible?
2. What is the best algorithm to use, and what algorithm is the least efficient?

### Libraries to be Imported

In [None]:
# importing libraries 

import warnings
warnings.simplefilter("ignore")
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_curve, auc, roc_curve
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn import tree
from xgboost import XGBClassifier
from numpy import loadtxt
from xgboost import plot_importance
import tensorflow as tf
from tensorflow import keras
import torch
from torch.utils.data import Dataset
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader
from torch.autograd import Variable
from torchvision import transforms, utils
from torch import nn
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
from torch.optim import Adam
import spacy
from matplotlib import pyplot
import graphviz

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

### Dataset Import

In [None]:
# loading data into variable

data = pd.read_csv("../input/mushroom-classification/mushrooms.csv")

In [None]:
# previewing the data

data.head()

### Dataset Information

In [None]:
# see how many rows and columns there are

data.shape

In [None]:
# see if there are any null and what data types there are

data.info()

In [None]:
# making sure there are no nulls

data.isnull().sum()

In [None]:
# confirming each column's data type

data.dtypes

In [None]:
# see how many mushrooms are edible or poisonous

data['class'].value_counts()

In [None]:
# see what traits are associated with edible mushrooms

datacounts = data[data['class']=='e'].describe().T
datacounts.sort_values(by=['freq'], ascending = False)

In [None]:
# function for getting column labels
def get_labels(order, a_dict):
    """Return the keys of ``a_dict`` whose values appear in ``order``.

    The result follows the sequence of ``order``; for each entry, every key
    mapped to that entry is emitted in the dictionary's own iteration order.
    Entries of ``order`` with no matching value contribute nothing.
    """
    return [
        name
        for code in order
        for name, mapped_code in a_dict.items()
        if code == mapped_code
    ]

In [None]:
# visualize population
# Human-readable name -> single-letter code lookups.
pop_dict = {"abundant":"a","clustered":"c","numerous":"n","scattered":"s","several":"v","solitary":"y"}
hab_dict = {"grasses":"g","leaves":"l","meadows":"m","paths":"p","urban":"u","waste":"w","woods":"d"}

# Count each population code once and reuse the result for both the labels
# and the wedges, so the two can never get out of step.
pop_counts = data['population'].value_counts()
order = list(pop_counts.index)
pop_labels = get_labels(order, pop_dict)
# Push each successive wedge out slightly further; sized from the data
# instead of assuming exactly six categories.
explode = tuple(0.01 * i for i in range(len(pop_counts)))

f, ax = plt.subplots(figsize=(15, 10))
pop_counts.plot.pie(explode=explode, autopct='%1.1f%%', labels=pop_labels, shadow=True, ax=ax)
ax.set_title('Mushroom Population Type Percentage');

In [None]:
# visualize odor
# Colour name -> single-letter code. Only the keys (colour names) are used
# below, as the bar palette handed to plot_col.
color_dict = {"brown":"n","yellow":"y", "blue":"w", "gray":"g", "red":"e","pink":"p",
              "orange":"b", "purple":"u", "black":"c", "green":"r"}
# Odor name -> single-letter code (presumably the codes stored in the 'odor' column).
odor_dict = {"almond":"a","anise":"l","creosote":"c","fishy":"y",
             "foul":"f","musty":"m","none":"n","pungent":"p","spicy":"s"}
# NOTE(review): hard-coded code order for the x tick labels; confirm it matches
# the order in which the bars are actually drawn.
order = ['p', 'a', 'l', 'n', 'f', 'c', 'y', 's', 'm']
labels = get_labels(order, odor_dict)      
# NOTE(review): plot_col is defined in a later cell, so that cell must be run first.
plot_col(col='odor', color=color_dict.keys(), labels=labels)

In [None]:
# see what traits are associated with poisonous mushrooms

data[data['class']=='p'].describe().T

In [None]:
def plot_col(col, hue=None, color=['red', 'lightgreen'], labels=None):
    """Draw a count plot for one column of the global ``data`` frame.

    Parameters
    ----------
    col : str
        Column of ``data`` counted along the x-axis.
    hue : str, optional
        Column used to split each bar. When given, a Poisonous/Edible
        legend is added.
    color : palette, optional
        Forwarded to seaborn as ``palette``.
    labels : sequence of str, optional
        Replacement x tick labels, in the order the bars are drawn.
    """
    fig, ax = plt.subplots(figsize=(15, 7))
    # The column is passed as ``x=`` because seaborn >= 0.12 only accepts
    # ``data`` positionally; a positional column would clash with ``data=``.
    sns.countplot(x=col, hue=hue, palette=color, saturation=0.6, data=data, dodge=True, ax=ax)
    ax.set(title = f"Mushroom {col.title()} Quantity", xlabel=f"{col.title()}", ylabel="Quantity")
    # Identity checks: ``!= None`` would compare element-wise on array-like labels.
    if labels is not None:
        ax.set_xticklabels(labels)
    if hue is not None:
        ax.legend(('Poisonous', 'Edible'), loc=0)
class_dict = ('Poisonous', 'Edible')
plot_col(col='class', labels=class_dict)

In [None]:
# data total counts and other stats

data.describe()

In [None]:
# One count plot per column, laid out on an 8 x 3 grid.
fig, axs = plt.subplots(nrows=8, ncols=3, figsize=(15, 20))
columns = list(data.columns)
ax_title_pairs = zip(axs.flat, columns)

for ax, title in ax_title_pairs:
    sns.countplot(x=title, data=data, palette='Pastel2', ax=ax)
    ax.set_title(title.title())
    ax.set_xticklabels(ax.get_xticklabels(), rotation=30)
    ax.set_xlabel('')

# Blank only the grid cells that received no column. Hard-coding axs[7][1]
# and axs[7][2] also switched off the axis of the last plotted column
# whenever the frame has 23 columns.
for unused_ax in axs.ravel()[len(columns):]:
    unused_ax.set_axis_off()
plt.tight_layout()

### Predictors vs. Response
The column we are interested in using as the response is 'class,' to see which mushroom(s) is edible or poisonous. All other columns will be predictors.

In [None]:
# setting X, y
del data['veil-type']
X = data.drop('class', axis = 1) # X will act as predictors variable
y = data['class'] # y will act as response variable

In [None]:
# previewing X

X.head()

### Label Encoding
Right now, the data is categorical. We must convert to ordinal/numeric.

In [None]:
# for correlation heatmap

labelencoder=LabelEncoder()
for column in data.columns:
    data[column] = labelencoder.fit_transform(data[column])

In [None]:
# heatmap

plt.figure(figsize=(16,16))
sns.heatmap(data.corr(),linewidths=.1,cmap="YlGnBu", annot=True)
plt.yticks(rotation=0);

In [None]:
labelencoder=LabelEncoder()
for column in data.columns:
    data[column] = labelencoder.fit_transform(data[column])

Encoder_X = LabelEncoder() 
for col in X.columns:
    X[col] = Encoder_X.fit_transform(X[col])
Encoder_y = LabelEncoder()
y = Encoder_y.fit_transform(y)

In [None]:
# preview X

X.head()

In [None]:
# preview y

y

In this case, poisonous is 1 and edible is 0.

### Splitting Training and Test sets

In [None]:
# Seed the split so the accuracies reported later can be reproduced, and
# stratify so both sets keep the same edible/poisonous ratio.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

## Logistic Regression

In [None]:
lrmodel = LogisticRegression()

In [None]:
# training set

lrmodel.fit(X_train,y_train)

In [None]:
# Predict once; both names are used by later cells and refer to the same result.
lrpredictions = lrmodel.predict(X_test)
y_pred = lrpredictions

In [None]:
print('Classes', lrmodel.classes_)
print('Intercept', lrmodel.intercept_)
print('Coefficients', lrmodel.coef_)
print(classification_report(y_test, lrmodel.predict(X_test)))

In [None]:
print("Accuracy of Logistic Regression: " + str(accuracy_score(y_test, lrpredictions)))


In [None]:
conf_mat = confusion_matrix(y_test,y_pred)
conf_mat

In [None]:
categories = [0,1] 
fig, ax = plt.subplots()
plt.xticks([0,1], categories)
plt.yticks([0,1], categories)
# create heatmap
sns.heatmap(pd.DataFrame(conf_mat), annot=True, cmap="YlGnBu" ,fmt='g')
ax.xaxis.set_label_position("top")
plt.tight_layout()
plt.title('Confusion matrix', y=1.1)
plt.ylabel('Actual label')
plt.xlabel('Predicted label')

## Decision Tree (with display of tree)

In [None]:
clf = DecisionTreeClassifier()

In [None]:
clf = clf.fit(X_train, y_train)

In [None]:
dtpredictions = clf.predict(X_test)

In [None]:
print("Accuracy of Decision Tree: " + str(accuracy_score(y_test, dtpredictions)))

In [None]:
treegraph = export_graphviz(clf, out_file = None, 
                         feature_names = X.columns, 
                         filled = True, rounded = True,  
                         special_characters = True)  
graph = graphviz.Source(treegraph)  
graph 

In [None]:
plt.figure(figsize=[20, 10])
tree.plot_tree(clf, rounded= True, filled= True)

In [None]:
fig, ax = plt.subplots(figsize=(12, 12))
tree.plot_tree(decision_tree=clf, max_depth= 2,fontsize=12);

## XGBoost with Feature Importance

In [None]:
xgbmodel = XGBClassifier()

In [None]:
xgbmodel.fit(X_train, y_train)

In [None]:
xgbpredictions = xgbmodel.predict(X_test)

In [None]:
print("Accuracy of XGBoost: " + str(accuracy_score(y_test, xgbpredictions)))

In [None]:
# feature importance

print(xgbmodel.feature_importances_)

In [None]:
features_list = X.columns.values
feature_importance = xgbmodel.feature_importances_
sorted_idx = np.argsort(feature_importance)

plt.figure(figsize=(5,7))
plt.barh(range(len(sorted_idx)), feature_importance[sorted_idx], align='center')
plt.yticks(range(len(sorted_idx)), features_list[sorted_idx])
plt.xlabel('Importance')
plt.title('Feature importances')
plt.draw()
plt.show()

According to the feature importance visualization, there are two columns that are significantly important for determining whether a mushroom is poisonous or edible. The two columns are:
* gill-color
* population

In [None]:
data_div = pd.melt(data, "class", var_name="Characteristics")
fig, ax = plt.subplots(figsize=(10,5))
p = sns.violinplot(ax = ax, x="Characteristics", y="value", hue="class", split = True, data=data_div, inner = 'quartile', palette = 'Set1')
data_no_class = data.drop(["class"],axis = 1)
p.set_xticklabels(rotation = 90, labels = list(data_no_class.columns));

## Neural Network (Keras)

In [None]:
# Size the input layer from the training data rather than hard-coding the
# feature count, so the model follows any change to the predictor columns.
n_features = X_train.shape[1]
model = keras.Sequential([
    # NOTE(review): this first layer has no activation, so it is purely linear.
    keras.layers.Dense(32,input_shape = (n_features,)),
    keras.layers.Dense(20,activation = tf.nn.relu),
    keras.layers.Dense(2,activation = "softmax")])

In [None]:
model.compile(optimizer = 'adam',
             loss = 'sparse_categorical_crossentropy',
             metrics = ['acc'])

In [None]:
prediction = model.fit(X_train, y_train, epochs = 100, validation_data = (X_test, y_test))

In [None]:
prediction_features = model.predict(X_test)
prediction_features

In [None]:
# Turn the two per-class scores of each row into a class index in one
# vectorised step. Class 0 wins only when its score is strictly larger;
# ties fall to class 1, the same rule as a row-by-row comparison.
scores = np.asarray(prediction_features)
predictions = np.where(scores[:, 0] > scores[:, 1], 0, 1).tolist()

In [None]:
print("Accuracy of Keras: " + str(accuracy_score(y_test, predictions)))

## Neural Network (Pytorch)

In [None]:
le = LabelEncoder()
for col in data.columns:
    data[col] = le.fit_transform(data[col])
data.head()

In [None]:
# Shuffle before splitting: taking the first 80% of rows in file order can
# leave the test set with a different class/feature mix than the training set.
cutoff = int(len(data)*0.8) + 1
shuffled = data.sample(frac=1, random_state=42).reset_index(drop=True)
# Copy the slices so later in-place changes never write through to a view.
train_df = shuffled.iloc[:cutoff, :].copy()
test_df = shuffled.iloc[cutoff:, :].copy()

In [None]:
len_train = (len(train_df))
len_test = (len(test_df))
print(float(len_train) / (float(len_test) + float(len_train)))

In [None]:
class MushroomDataset(Dataset):
    """Torch dataset over a mushroom dataframe.

    Column 0 is treated as the class label and every remaining column as a
    feature. Each item is an ``(inputs, label)`` pair: ``inputs`` is a float
    tensor of the feature values and ``label`` a one-element long tensor.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Source rows. It is copied, so the caller's frame is left untouched.
    transform : callable, optional
        Applied to the ``(inputs, label)`` pair before it is returned.
    """

    def __init__(self, dataframe, transform=None):
        # Work on a private copy instead of overwriting the caller's columns.
        self.mushroom_frame = dataframe.copy()
        self.transform = transform
        for col in self.mushroom_frame.columns:
            # Only non-numeric columns are encoded. Re-fitting an encoder on
            # already-numeric codes separately for each split can map the same
            # category to different integers in train and test whenever one
            # split lacks a category.
            if not pd.api.types.is_numeric_dtype(self.mushroom_frame[col]):
                self.mushroom_frame[col] = LabelEncoder().fit_transform(self.mushroom_frame[col])

    def __len__(self):
        return len(self.mushroom_frame)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        # Features: everything after the label column, as float32.
        features = np.array(self.mushroom_frame.iloc[idx, 1:], dtype=np.float32)
        inputs = torch.from_numpy(features)
        # Label: built directly as an integer tensor rather than via a float one.
        label = torch.tensor([self.mushroom_frame.iloc[idx, 0]], dtype=torch.long)
        sample = inputs, label
        if self.transform:
            sample = self.transform(sample)
        return sample

In [None]:
train_dset = MushroomDataset(train_df)
test_dset = MushroomDataset(test_df)
train_dl = torch.utils.data.DataLoader(train_dset,batch_size=50, shuffle=True,num_workers=4)
test_dl = torch.utils.data.DataLoader(test_dset, batch_size=1, shuffle=False, num_workers=4)

In [None]:
class NeuralNet(nn.Module):
    """Small fully connected classifier.

    21 input features -> 80 hidden units (ReLU) -> 2 outputs, normalised
    with softmax so each row of the output sums to 1.
    """

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(21, 80)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(80, 2)
        # Normalise over the class (last) dimension explicitly. Leaving `dim`
        # unset is deprecated and picks the axis implicitly from the input rank.
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x, test=False):
        """Return class probabilities for ``x``.

        ``test`` is accepted for interface compatibility and is not used.
        """
        x = self.relu1(self.fc1(x))
        x = self.fc2(x)
        x = self.softmax(x)
        return x

In [None]:
net = NeuralNet()
criterion = nn.BCELoss()
optimizer = Adam(net.parameters(), 0.001)

# Train for 20 passes over the training loader.
# NOTE: the loop variables x and y rebind the notebook-level names x and y.
for epoch in range(20):
    for i, (x, y) in enumerate(train_dl):
        # Build the one-hot target from the actual batch size, so a final
        # batch smaller than the loader's batch_size does not break the shapes.
        y_onehot = torch.zeros(y.size(0), 2)
        y_onehot.scatter_(1, y, 1)
        y_hat = net(x)
        loss = criterion(y_hat, y_onehot)
        if i % 500 == 0:
            print(epoch, i, loss.item())
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [None]:
correct = 0
total = 0

# Count correct predictions over the test loader without tracking gradients.
with torch.no_grad():
    for i, (x, y) in enumerate(test_dl):
        y_hat = net(x, test=True)
        _, predicted = torch.max(y_hat, 1)
        # Labels arrive as (batch, 1). Flatten them so the comparison is
        # element-wise; otherwise it broadcasts to (batch, batch) and
        # over-counts for any batch size above 1.
        targets = y.reshape(-1)
        total += targets.size(0)
        correct += (predicted == targets).sum().item()

In [None]:
print("Accuracy of Pytorch: ", float(correct)/total)

### Comparing algorithm accuracy

* **Logistic**: 0.9507631708517972
* **Decision Tree**: 1.0
* **XGBoost**: 1.0
* **Keras**: 1.0
* **Pytorch**: 0.8879310344827587

Comparing accuracies between the algorithms shows that these three algorithms are the most accurate: 
* Decision Tree
* XGBoost
* Keras 

XGBoost's feature importance chart shows that these two features are the most important in determining whether a mushroom is poisonous or edible:
* gill color
* population
