# **Setup**

In [None]:
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder  # for convert those
from sklearn.preprocessing import StandardScaler  # for scale
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

# our models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [None]:
# Read the mushroom-classification CSV into the notebook's working DataFrame.
rawData = pd.read_csv(
    filepath_or_buffer='../input/mushroom-classification/mushrooms.csv',
)

In [None]:
# Show the loaded table as the cell output (sanity check that the CSV was read).
rawData

# **Examining the Data**

In [None]:
# Preview the first few rows of the raw table.
rawData.head()

In [None]:
# Per-column summary of the raw table.
rawData.describe()

# **Unique values in the 'class' column**

In [None]:
# Distinct values that appear in the target column.
rawData['class'].unique()

In [None]:
# Number of rows for each value of the target column.
rawData['class'].value_counts()

In [None]:
# Tally the rows per class value; the index holds the class values, the values hold the counts.
count = rawData['class'].value_counts()

# Draw one bar per class and label the plot through the Axes that seaborn returns.
plt.figure(figsize=(8, 7))
class_axes = sns.barplot(x=count.index, y=count.values, alpha=0.8, palette="prism")
class_axes.set_title('Number of poisonous/edible mushrooms')
class_axes.set_xlabel('Class', fontsize=12)
class_axes.set_ylabel('Count', fontsize=12)
# To export the figure, call plt.savefig("mushrooms1.png", format='png', dpi=500) before plt.show().
plt.show()

# **Obtain total number of mushrooms for each 'cap-color'**

In [None]:
# Count the mushrooms for each cap-color code.
cap_colors = rawData['cap-color'].value_counts()
m_height = cap_colors.values.tolist()            # Bar heights: one count per cap-color code
cap_color_labels = cap_colors.index.tolist()     # Cap-color codes, in the same order as m_height

# Display name and bar color for each single-letter cap-color code of the dataset.
# Looking both up per code (rather than hard-coding ten labels and colors by position)
# keeps every bar correctly named when two colors tie in count or a color is missing.
CAP_COLOR_STYLES = {
    'n': ('brown', '#DEB887'),
    'g': ('gray', '#778899'),
    'e': ('red', '#DC143C'),
    'y': ('yellow', '#FFFF99'),
    'w': ('white', '#f8f8ff'),
    'b': ('buff', '#F0DC82'),
    'p': ('pink', '#FF69B4'),
    'c': ('cinnamon', '#D22D1E'),
    'u': ('purple', '#C000C5'),
    'r': ('green', 'g'),
}
FALLBACK_BAR_COLOR = '#999999'                   # Neutral color for codes missing from the table

# Unknown codes (e.g. if this cell is re-run after the column was label-encoded)
# fall back to the code itself as the label instead of raising.
cap_color_names = [CAP_COLOR_STYLES.get(code, (str(code), FALLBACK_BAR_COLOR))[0]
                   for code in cap_color_labels]
colors = [CAP_COLOR_STYLES.get(code, (str(code), FALLBACK_BAR_COLOR))[1]
          for code in cap_color_labels]

ind = np.arange(len(m_height))                   # the x locations for the groups
width = 0.7                                      # the width of the bars

fig, ax = plt.subplots(figsize=(10, 7))
mushroom_bars = ax.bar(ind, m_height, width, color=colors)

# Add some text for labels, title and axes ticks
ax.set_xlabel("Cap Color", fontsize=20)
ax.set_ylabel('Quantity', fontsize=20)
ax.set_title('Mushroom Cap Color Quantity', fontsize=22)
ax.set_xticks(ind)                               # Positioning on the x axis
ax.set_xticklabels(cap_color_names, fontsize=12)


# Auto-labels the number of mushrooms for each bar color.
def autolabel(rects, fontsize=14):
    """
    Attach a text label above each bar displaying its height.

    Each label is drawn on the axes that owns the bar (``rect.axes``), so the
    function does not depend on whichever ``ax`` variable happens to be bound
    in the notebook at the time of the call.

    Parameters
    ----------
    rects : iterable
        Bar patches exposing ``get_x()``, ``get_width()``, ``get_height()``
        and an ``axes`` attribute (e.g. the container returned by ``ax.bar``).
    fontsize : int, optional
        Font size of the labels (default 14).

    Returns
    -------
    None
    """
    for rect in rects:
        height = rect.get_height()
        # Horizontal centre of the bar; the label sits directly on top of it.
        center_x = rect.get_x() + rect.get_width() / 2.
        # The height is truncated to an integer for display.
        rect.axes.text(center_x, height, '%d' % int(height),
                       ha='center', va='bottom', fontsize=fontsize)


# Write each bar's count above it, then render the finished chart.
autolabel(mushroom_bars)
plt.show()

# **Preprocessing**

In [None]:
mappings = []
encoder = LabelEncoder()

# The models need numeric inputs, so every text column is replaced in place by
# integer codes. For each column (in column order) the code -> original label
# dictionary is stored in `mappings` so the codes can be translated back later.
for column_name in rawData.columns:
    rawData[column_name] = encoder.fit_transform(rawData[column_name])
    mappings_dict = dict(enumerate(encoder.classes_))
    mappings.append(mappings_dict)

In [None]:
# Show the code -> label dictionaries collected by the encoding loop, one per column in column order.
mappings

In [None]:
# Separate the encoded target column from the encoded feature columns.
feature_columns = rawData.columns[rawData.columns != 'class']
y = rawData['class']
features = rawData[feature_columns]

# Split BEFORE scaling so the test rows never influence the scaler statistics
# (fitting the scaler on the whole table leaks test-set information into training).
# random_state makes the split repeatable across runs; stratify=y keeps the class
# proportions the same in the training and test parts.
features_train, features_test, y_train, y_test = train_test_split(
    features, y, train_size=0.8, random_state=42, stratify=y)

# Learn the scaling statistics from the training rows only, then apply that
# same transform to the test rows.
scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(features_train),
                       columns=feature_columns, index=features_train.index)
X_test = pd.DataFrame(scaler.transform(features_test),
                      columns=feature_columns, index=features_test.index)

# Full feature table under the same transform, kept as a DataFrame for inspection.
X = pd.DataFrame(scaler.transform(features), columns=feature_columns, index=features.index)

In [None]:
# Show the scaled feature table (every column except 'class').
X

# **Modeling**

**Classification Methods**

In [None]:
# The two classifiers compared in this notebook: an RBF-kernel support vector
# machine and a logistic regression, both created untrained here.
svm_model = SVC(kernel='rbf', C=1.0)
log_model = LogisticRegression()

**Training**

In [None]:
# Mean of the encoded target; with 0/1 labels this is the share of rows labelled 1,
# i.e. a quick class-balance check before training.
y.sum() / y.shape[0]

In [None]:
# Train both classifiers on the training split.
log_model.fit(X_train, y_train)
svm_model.fit(X_train, y_train)

**Logistic Regression Classification**

In [None]:
# Score the trained logistic regression on the held-out test split.
print(f"---Logistic Regression Test Accuracy: {log_model.score(X_test, y_test)}")

In [None]:
# Predict the held-out rows and cross-tabulate true against predicted classes.
y_pred_lr = log_model.predict(X_test)
cm = confusion_matrix(y_test, y_pred_lr)

# Tick labels for both axes of the matrix.
# NOTE(review): assumes encoded class 0 is edible and 1 is poisonous - confirm against mappings[0].
x_axis_labels = ["Edible", "Poisonous"]
y_axis_labels = ["Edible", "Poisonous"]

# Draw the matrix as an annotated heatmap and label it through its own Axes.
f, ax = plt.subplots(figsize=(7, 7))
sns.heatmap(cm, ax=ax, annot=True, fmt=".0f", cmap="Greens",
            linewidths=0.2, linecolor="black",
            xticklabels=x_axis_labels, yticklabels=y_axis_labels)
ax.set_xlabel("PREDICTED LABEL")
ax.set_ylabel("TRUE LABEL")
ax.set_title('Confusion Matrix for Logistic Regression Classifier')
# To export the figure, call plt.savefig("lrcm.png", format='png', dpi=500, bbox_inches='tight') before plt.show().
plt.show()

**SVM Classification**

In [None]:
# Score the trained support vector machine on the held-out test split.
print(f"Support Vector Machine Test Accuracy: {svm_model.score(X_test, y_test)}")

In [None]:
# Reuse the SVM that was already trained and scored earlier in the notebook, so
# this confusion matrix describes the same model whose test accuracy is reported.
# (Training a second SVC with different settings here would make the matrix and
# the printed accuracy refer to two different models.)
svm = svm_model
y_pred_svm = svm.predict(X_test)
cm = confusion_matrix(y_test, y_pred_svm)

# Tick labels for both axes of the matrix.
# NOTE(review): assumes encoded class 0 is edible and 1 is poisonous - confirm against mappings[0].
x_axis_labels = ["Edible", "Poisonous"]
y_axis_labels = ["Edible", "Poisonous"]

# Draw the matrix as an annotated heatmap and label it through its own Axes.
f, ax = plt.subplots(figsize=(7, 7))
sns.heatmap(cm, annot=True, linewidths=0.2, linecolor="black", fmt=".0f", ax=ax, cmap="YlGnBu",
            xticklabels=x_axis_labels, yticklabels=y_axis_labels)
ax.set_xlabel("PREDICTED LABEL")
ax.set_ylabel("TRUE LABEL")
ax.set_title('Confusion Matrix for SVM Classifier')
# To export the figure, call plt.savefig("svmcm.png", format='png', dpi=500, bbox_inches='tight') before plt.show().
plt.show()

# **Visualization**

In [None]:
# Dimensions of the held-out test feature table.
X_test.shape

In [None]:
# Pairwise correlation between all columns of the (by now numeric) table, target included.
corr = rawData.corr()

# An explicit, larger figure with a title keeps the per-column tick labels legible
# and lets the plot stand on its own; plt.show() suppresses the stray Axes repr.
corr_fig, corr_ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr, ax=corr_ax)
corr_ax.set_title('Correlation between encoded mushroom attributes')
plt.show()