In [None]:
# Mushroom

In [None]:

import math
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier, export_graphviz

In [None]:
# Descriptive column names: the class label first, followed by the 22
# categorical features.
mushroom_columns = [
    "edible/poisonous",
    "cap-shape",
    "cap-surface",
    "cap-color",
    "bruises",
    "odor",
    "gill-attachment",
    "gill-spacing",
    "gill-size",
    "gill-color",
    "stalk-shape",
    "stalk-root",
    "stalk-surface-above-ring",
    "stalk-surface-below-ring",
    "stalk-color-above-ring",
    "stalk-color-below-ring",
    "veil-type",
    "veil-color",
    "ring-number",
    "ring-type",
    "spore-print-color",
    "population",
    "habitat",
]


# header=0 consumes the file's own header line and names= replaces its labels.
# With header=None that line would be loaded as a data record and later be
# encoded as an extra category in every column.
# NOTE(review): assumes the Kaggle mushrooms.csv starts with a header row.
mushrooms_df = pd.read_csv(
    "../input/mushroom-classification/mushrooms.csv",
    header=0,
    names=mushroom_columns,
)
mushrooms_df

In [None]:
# Column dtypes and non-null counts; memory_usage=False leaves out the memory footprint.
mushrooms_df.info(memory_usage=False)

In [None]:
# Summary statistics of the freshly loaded frame.
mushrooms_df.describe()

In [None]:
# Preview the first ten rows.
mushrooms_df.head(10)

## Rename attributes and plot

In [None]:
# (descriptive name, single-letter code) pairs for the 'odor' column.
odors = [
    ('almond', 'a'),
    ('anise', 'l'),
    ('creosote', 'c'),
    ('fishy', 'y'),
    ('foul', 'f'),
    ('musty', 'm'),
    ('none', 'n'),
    ('pungent', 'p'),
    ('spicy', 's'),
]

# Swap every code for its name in one pass. Only whole cell values are
# matched, so none of the inserted names can be hit by another rule.
mushrooms_df['odor'] = mushrooms_df['odor'].replace(
    {code: name for name, code in odors}
)

mushrooms_df['odor'].value_counts().plot.bar()

In [None]:
le = preprocessing.LabelEncoder()

# Replace every column by integer codes, in place. The single encoder is
# refit for each column, so afterwards `le` only remembers the classes of
# the last column it saw.
for column in mushrooms_df.columns:
    encoded_values = le.fit_transform(mushrooms_df[column])
    mushrooms_df[column] = encoded_values

In [None]:
# Summary statistics again, now that the columns hold integer label codes.
mushrooms_df.describe()

In [None]:
# Preview of the frame without 'veil-type'. The result is only displayed:
# mushrooms_df itself keeps the column, which later cells still select.
mushrooms_df.drop(columns=["veil-type"])

## Histograms

In [None]:
%matplotlib inline

# First Encode chars to ints to show in hist

# OrdinalEncoder
ordinalEncoder = OrdinalEncoder()
mushrooms_ordinal_encoder = ordinalEncoder.fit_transform(mushrooms_df)

# Mache wieder ein Dateframe daraus
mushrooms_ordinal_encoder = pd.DataFrame(mushrooms_ordinal_encoder, columns=mushroom_columns)

mushrooms_ordinal_encoder.hist(bins=10, figsize=(20,15))
plt.show()

In [None]:
# Distribution of the encoded class label, drawn with three bins.
mushrooms_df["edible/poisonous"].hist(bins=3)

In [None]:
# Number of missing (NaN) entries per column.
mushrooms_df.isna().sum()

## Information Gain

In [None]:
# Show the current state of the frame before computing per-column scores.
mushrooms_df

In [None]:
def get_info_gain(data):
    """Return the normalised Shannon entropy of the values in ``data``.

    The logarithm base is the number of distinct values. Despite the name,
    the score is computed from ``data`` alone; no target variable is involved.

    Args:
        data: Sized collection of hashable values (``len(data)`` is used).

    Returns:
        ``0.0`` when ``data`` holds exactly one distinct value, ``0`` when it
        is empty, otherwise the entropy of the value frequencies.
    """
    frequencies = Counter(data)
    n_classes = len(frequencies)
    if n_classes == 1:
        # A logarithm to base 1 is undefined; one distinct value carries no entropy.
        return 0.0
    entropy = 0
    for count in frequencies.values():
        share = count / len(data)
        entropy -= share * math.log(share, n_classes)
    return entropy
# Score of every column (class label included), keyed by column name.
info_gain = {
    col: get_info_gain(mushrooms_df[col].values)
    for col in mushroom_columns
}
info_gain

In [None]:
## Correlation Matrix

In [None]:
plt.figure(figsize=(25,25))

# Correlate the feature columns only (the class label at index 0 is skipped).
# 'veil-type' is removed by keyword: the positional axis form `drop(label, 1)`
# is rejected by current pandas releases.
# NOTE(review): presumably dropped because the column is constant - confirm.
feature_correlations = (
    mushrooms_df[mushroom_columns[1:]]
    .drop(columns='veil-type')
    .corr()
)

sns.heatmap(feature_correlations,
            vmax=1,
            annot=True,
            fmt='.1f',
            cmap='viridis',
            square=True)

## One Hot Encoder

In [None]:
from sklearn.preprocessing import OneHotEncoder

# The constructor switch for dense output was renamed from `sparse` to
# `sparse_output` between scikit-learn releases. Keeping the default sparse
# result and densifying it explicitly works on every version.
ohe = OneHotEncoder()
one_hot = ohe.fit_transform(mushrooms_df[['odor']]).toarray()

# One indicator column per odor category, in the order of ohe.categories_.
odor_df = pd.DataFrame(one_hot)
odor_df

In [None]:
# Category values the encoder learned for the 'odor' column.
ohe.categories_

## Train Test Split

In [None]:
# Every column after the first is a feature.
X = mushrooms_df.loc[:, mushroom_columns[1:]]
# Selected with a one-element list, so y stays a single-column DataFrame
# (not a Series) holding the "edible/poisonous" label.
y = mushrooms_df.loc[:, mushroom_columns[:1]]

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

X_train.shape

## Decision Tree Classifier

In [None]:
# A fixed seed makes the fitted tree, and the feature importances derived
# from it, reproducible across runs. It matches the seed of the split.
clf = DecisionTreeClassifier(random_state=42)
# The label is passed as a 1-D array, like for the other estimators.
clf = clf.fit(X_train, Y_train.values.ravel())

In [None]:
# Write the fitted tree to a Graphviz .dot file. Feature names are passed as
# a plain list because some scikit-learn releases reject a pandas Index here.
# With out_file set, export_graphviz returns None, so `data` is None.
data = export_graphviz(
    clf,
    out_file="mushroom_graph.dot",
    feature_names=list(X.columns),
    filled=True,
    rounded=True,
    special_characters=True,
)

print("Decision Tree Classifier is in mushroom_graph.dot file")

## Feature Importance

In [None]:
# Importances of the fitted decision tree, ordered ascending so the most
# important feature ends up at the top of the horizontal bar chart.
features_list = mushroom_columns[1:]
features_importance = clf.feature_importances_
index_sorted = np.argsort(features_importance)

bar_positions = range(len(index_sorted))
sorted_labels = [features_list[i] for i in index_sorted]

plt.figure(figsize=(5,7))
plt.barh(bar_positions, features_importance[index_sorted], align="center")
plt.yticks(bar_positions, sorted_labels)
plt.xlabel("Importance")
plt.title("Feature Importance")
plt.draw()
plt.show()

## Logistic Regression

In [None]:
# Generous iteration cap for the solver on the unscaled integer codes.
logistic_regression = LogisticRegression(max_iter=100000)
logistic_regression.fit(X_train, Y_train.values.ravel())

# Evaluate the logistic-regression model itself (not the decision tree `clf`)
# and only on the held-out split, so the score is not inflated by rows the
# model was trained on.
lr_predict = logistic_regression.predict(X_test)
lr_score = logistic_regression.score(X_test, Y_test)

print("Logistic Regression Score = %.3f" % lr_score)

## Random Forest

In [None]:
# A fixed seed makes the bootstrap samples and feature sub-sampling, and
# therefore the reported scores, reproducible across runs.
RC = RandomForestClassifier(random_state=42)
RCfit = RC.fit(X_train, Y_train.values.ravel())
y_predict = RC.predict(X_test)

score = accuracy_score(Y_test, y_predict)
print("Random Forest Accuracy Score = %.3f" % score)

# Train vs. test accuracy side by side; a large gap would indicate overfitting.
RC_train = RC.score(X_train, Y_train)
RC_test = RC.score(X_test, Y_test)
print("Random Forest Train Score = %.3f\nRandom Forest Test Score = %.3f\n" % (RC_train, RC_test))

## Simple Imputer

In [None]:
# Replace NaN entries of the label column with the column mean.
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
imp_mean.fit_transform(mushrooms_df[mushroom_columns[:1]])

# SimpleImputer only makes sense for numeric values. It is pointless here.

## Pipeline

In [None]:
# Standardise the features, then fit a support-vector classifier on them.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('svc', SVC())
])

# SVC expects a 1-D label array; passing the single-column DataFrame directly
# triggers a data-conversion warning, so flatten it like the other fits do.
pipeline.fit(X_train, Y_train.values.ravel())
# Three decimals, like the other score reports; one decimal would print
# every accuracy from 0.95 upwards as "1.0".
print("SVC Score = %.3f" % (pipeline.score(X_test, Y_test)))