In [None]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt

import sklearn.model_selection as ms
import sklearn.metrics as m
import sklearn.tree as tree
import sklearn.ensemble as ensemble
import sklearn.svm as svm
import sklearn.linear_model as lm
import sklearn.preprocessing as pp

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

In [None]:
# Load the mushroom dataset into a DataFrame.
# The path is relative to the notebook's working directory.
data = pd.read_csv('../input/mushroom-classification/mushrooms.csv')

In [None]:
# Overview of the frame: column names, non-null counts and dtypes.
data.info()

In [None]:
# Preview five random rows. The seed matches the one used for the splits and
# models below, so the preview is identical on every Restart & Run All.
data.sample(5, random_state=42)

In [None]:
# Show the first ten columns (the frame is too wide to display in one piece).
data.iloc[:, :10]

In [None]:
# Show the remaining columns, from position 10 through the last one.
data.iloc[:, 10:data.shape[1]]

In [None]:
# Count missing (NaN/None) entries per column.
data.isna().sum()

In [None]:
# Distinct labels that occur in the stalk-root column.
pd.unique(data['stalk-root'])

In [None]:
# Print the distinct labels of every column and accumulate how many labels
# there are in total across the whole frame.
tol_label = 0
for col, column_values in data.items():
    distinct_labels = column_values.unique()
    print(col, distinct_labels, '', sep='\n')
    tol_label += len(distinct_labels)
# Observation: "stalk-root" seems to be the only column that contains the '?' label.

In [None]:
# Total number of distinct labels summed over all columns of `data`
# (the 'class' target column is still part of the frame at this point).
tol_label

In [None]:
# Frequency of each stalk-root label.
# Author's note: about 2480 rows carry the '?' label.
data['stalk-root'].value_counts()

In [None]:
# Class breakdown of the rows whose stalk-root is '?', selected with a single
# .loc lookup (row mask + column label).
data.loc[data['stalk-root'] == '?', 'class'].value_counts()

In [None]:
# Overall class distribution, for comparison with the breakdown above.
# Author's note: about 45% of the positive-class (p) rows have stalk-root == '?'.
data['class'].value_counts()

# Data Processing
* One-hot encoding
* The data is split such that BOTH the training and the testing datasets contain the same proportion of the positive and negative classes

In [None]:
# Encode the target as integers: poisonous ('p') -> 1, edible ('e') -> 0.
#
# The encoded column is assigned back to `data` rather than produced by an
# inplace method on the `data['class']` selection: with pandas Copy-on-Write
# that chained inplace call only changes a temporary object and leaves `data`
# (and therefore `y`) as strings.
# The `.get(label, label)` fallback passes already-encoded values through, so
# re-running this cell is harmless; `.astype('int64')` guarantees a numeric
# target and fails loudly if an unexpected label is present.
class_codes = {'p': 1, 'e': 0}
data['class'] = data['class'].map(lambda label: class_codes.get(label, label)).astype('int64')

# Feature matrix: a new frame holding every column except the target.
X = data.drop(columns='class')

# Target vector: an independent copy of the encoded class column.
y = data['class'].copy()

In [None]:
# One-hot encode every categorical feature column; the fitted encoder is kept
# in `one_hot` and the encoded matrix in `X_transform`.
one_hot = pp.OneHotEncoder()
X_transform = one_hot.fit_transform(X)

# (rows, number of one-hot columns)
X_transform.shape

In [None]:
# Hold out 25% of the rows as the testing set; stratifying on y keeps the
# class proportions equal in both parts.
X_train_val, X_test, y_train_val, y_test = ms.train_test_split(
    X_transform,
    y,
    train_size=0.75,
    shuffle=True,
    stratify=y,
    random_state=42,
)
X_train_val.shape

In [None]:
# Split the non-test rows again (75% / 25%, stratified) into the training and
# validation sets.
X_train, X_validation, y_train, y_validation = ms.train_test_split(
    X_train_val,
    y_train_val,
    train_size=0.75,
    shuffle=True,
    stratify=y_train_val,
    random_state=42,
)

In [None]:
# Report how many rows ended up in each of the three splits.
n_train, n_validation, n_test = (
    split.shape[0] for split in (X_train, X_validation, X_test)
)
print(f'Size of training set: {n_train}')
print(f'Size of validation set: {n_validation}')
print(f'Size of testing set: {n_test}')

# Candidates Models & Ensemble (BASELINE)
* My approach will be to train individual classifier models as well as an ensemble model (voting classifier)
* I will assess the individual classifier models based on the default hyperparameter values
* The voting classifier will be based on all the individual classifier models with their default hyperparameter values
* ALL features will be used for these baseline models

In [None]:
def _baseline_models(seed=42):
    """Return fresh, unfitted (name, classifier) pairs using default hyperparameters."""
    return [
        ('rf_clf', ensemble.RandomForestClassifier(random_state=seed)),
        ('dt_clf', tree.DecisionTreeClassifier(random_state=seed)),
        ('ext_clf', ensemble.ExtraTreesClassifier(random_state=seed)),
        ('svc_clf', svm.LinearSVC(random_state=seed)),
        ('log_clf', lm.LogisticRegression(random_state=seed)),
        ('gb_clf', ensemble.GradientBoostingClassifier(random_state=seed)),
    ]


# Stand-alone classifiers that are assessed individually.
rf_clf, dt_clf, ext_clf, svc_clf, log_clf, gb_clf = (clf for _, clf in _baseline_models())

# Hard-voting ensemble built from its own, separate instances of the same six classifiers.
voting_classifier = ensemble.VotingClassifier(_baseline_models(), voting='hard')

# Everything that gets fitted and scored in the cells below.
estimators = [rf_clf, dt_clf, ext_clf, svc_clf, log_clf, gb_clf, voting_classifier]

# Performance on Training Set

In [None]:
# 5-fold cross-validation repeated 3 times (15 train/test splits per
# estimator), seeded so the folds are the same on every run.
cv = ms.RepeatedKFold(n_splits=5, n_repeats=3, random_state=42)

for estimator in estimators:
    # Fit on the whole training split: the cells below score this fitted
    # estimator on the validation and testing sets (cross-validation itself
    # only works on clones and leaves `estimator` untouched).
    estimator.fit(X_train, y_train)

    # Compute both metrics in a single cross-validation pass instead of one
    # pass per metric; the folds and fitted models are the same, so the scores
    # are unchanged while the number of model fits is halved.
    cv_scores = ms.cross_validate(
        estimator, X_train, y_train, cv=cv, n_jobs=-1, scoring=['accuracy', 'f1']
    )
    cv_accuracy = cv_scores['test_accuracy']
    cv_f1 = cv_scores['test_f1']

    # Mean and standard deviation over the 15 splits, in percent.
    print(estimator.__class__.__name__)
    print(f'Avg Accuracy: {np.mean(cv_accuracy) * 100}')
    print(f'Std Accuracy: {np.std(cv_accuracy) * 100}')
    print(f'Avg F1: {np.mean(cv_f1) * 100}')
    print(f'Std F1: {np.std(cv_f1) * 100}')
    print()

# Performance on Validation Set

In [None]:
# The models have most likely overfitted the training data, so check how each
# fitted estimator performs on the validation set (score shown in percent).
for model in estimators:
    validation_score = model.score(X_validation, y_validation)
    print(type(model).__name__, validation_score * 100, '', sep='\n')

# Performance on Testing Set
* It seems that all of the individual models performed just as well as the ensemble model (voting classifier)

In [None]:
# Final check: score every fitted estimator on the held-out testing set
# (score shown in percent).
for model in estimators:
    test_score = model.score(X_test, y_test)
    print(type(model).__name__, test_score * 100, '', sep='\n')