In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Logistic Regression
from sklearn.linear_model import LogisticRegression
# Support Vector Machine
from sklearn.svm import SVC
# Neural Network
from sklearn.neural_network import MLPClassifier

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the raw mushroom dataset; the bare name on the last line renders it as the cell output.
csv_path = "../input/mushroom-classification/mushrooms.csv"
df = pd.read_csv(csv_path)
df

In [None]:
# Work on a copy so that later in-place changes to `mushroom_data` leave the raw `df` untouched.
mushroom_data = df.copy()
mushroom_data

# Preprocessing
- Encode all labels as integers so that they can be fed to our models
- Scale the features so that all values fall within similar ranges



In [None]:
# Integer-encode every column (features and target) with one shared LabelEncoder.
# Each refit overwrites `encoder.classes_`, so the code -> original label lookup
# is captured right after each fit; `mappings` ends up with one dict per column,
# in column order.
encoder = LabelEncoder()
mappings = []

for col_name in mushroom_data.columns:
    mushroom_data[col_name] = encoder.fit_transform(mushroom_data[col_name])
    mappings.append(dict(enumerate(encoder.classes_)))

In [None]:
# Reference lookup: one dict per column (in column order) mapping each
# encoded integer back to the original label it replaced.
mappings

In [None]:
# Every column now holds integer codes instead of the original labels.
mushroom_data

In [None]:
# Separate the target column from the feature columns.
target_col = 'class'
X = mushroom_data.drop(columns=target_col)
y = mushroom_data[target_col]

In [None]:
# Inspect the feature matrix (target column removed).
X

In [None]:
# Standardise every feature so that all columns take values in a similar range.
# `fit_transform` returns a bare NumPy array, so the frame is rebuilt with the
# original column names AND the original index; without `index=` the rows would
# be renumbered 0..n-1 and could silently misalign with `y` if `X` ever carried
# a non-default index (e.g. after filtering rows).
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split, so test-set statistics leak into training; consider fitting it on the
# training rows only.
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)

In [None]:
# Inspect the standardised feature matrix.
X

In [None]:
# Hold out 20% of the rows for evaluation. `train_size` must be the float 0.8:
# an integer is interpreted as an absolute number of samples, so the previous
# `train_size=80` trained on only 80 rows. A fixed `random_state` makes the
# split reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, random_state=42
)


# Model Selection

In [None]:
# Three candidate classifiers, to be fitted on the same training split for comparison.
log_model = LogisticRegression()
svs_model = SVC(C=1.0, kernel='rbf')
# The MLP's weight initialisation and batch shuffling are random, so a fixed
# seed is needed to obtain the same scores on every Restart & Run All.
nn_model = MLPClassifier(hidden_layer_sizes=(128, 128), random_state=42)

## Training

In [None]:
# Class-balance check before training: what share of the examples is positive?
# The target is encoded as 0/1 (1 = positive), so the number of positives is
# simply the sum of `y`; dividing by the number of rows gives the fraction.
y.sum() / y.size

# Result: about 48% positive, so the data is balanced
# and accuracy can be used as the evaluation metric.

In [None]:
# Fit all three classifiers on the same training split.
for estimator in (log_model, svs_model, nn_model):
    estimator.fit(X_train, y_train)

# `fit` returns the estimator itself, so ending on `nn_model` reproduces the
# cell output of a trailing `nn_model.fit(...)` call.
nn_model

In [None]:
# Report accuracy on the held-out test split. Scoring on the rows the models
# were fitted on mostly measures memorisation and overstates real performance;
# `X_test` / `y_test` were created by the split precisely for this purpose.
print(f'---Logistic Regression: {log_model.score(X_test, y_test)}')
print(f'Support Vector Machine: {svs_model.score(X_test, y_test)}')
print(f'--------Neural Network: {nn_model.score(X_test, y_test)}')

# Visualization

In [None]:
# Pairwise correlation between all encoded columns, including the target.
corr_mat = mushroom_data.corr()

# Render the matrix as a table shaded with the 'Reds' colour map.
corr_heatmap = corr_mat.style.background_gradient(cmap='Reds')
corr_heatmap

In [None]:
# Absolute correlation of every column with the target
cor_target = corr_mat['class'].abs()

# Keep only the columns whose absolute correlation with the target exceeds 0.5
relevant_features = cor_target.loc[cor_target > 0.5]
relevant_features