In [None]:
import numpy as np 
import pandas as pd
import os
from subprocess import call
from IPython.display import Image

pd.plotting.register_matplotlib_converters()
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import tensorflow as tf
import tensorflow_datasets as tfds

from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.ensemble import RandomForestRegressor, AdaBoostClassifier, BaggingClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, fbeta_score, accuracy_score
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, OneHotEncoder
from sklearn.tree import export_graphviz, DecisionTreeClassifier, plot_tree


# Show every column when a DataFrame is displayed (no truncation with "...").
pd.options.display.max_columns = None

In [None]:
# Load the mushroom data; `pop` removes the target column from the feature
# frame and hands it back as a Series in one step.
data = pd.read_csv('../input/mushroom-classification/mushrooms.csv')
Y = data.pop('class')

# Stratified 80/20 split so both parts keep the class balance of `Y`.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    Y,
    test_size=0.20,
    stratify=Y,
    random_state=1,
)

# Encode the target as integers: the encoder is fitted on the training labels
# only and then re-used, unchanged, on the test labels.
encode = LabelEncoder()
y_train = encode.fit_transform(y_train)
y_test = encode.transform(y_test)

# Group the feature columns by dtype for the preprocessing step.
column_dtypes = X_train.dtypes
categorical_cols = [name for name, dtype in column_dtypes.items() if dtype == "object"]
numerical_cols = [name for name, dtype in column_dtypes.items() if dtype in ['int64', 'float64']]

In [None]:
# Preview the first five feature rows (the target column was removed above).
data.head(n=5)

In [None]:
# Odor against ring type, with each point coloured by its class label.
odor = data['odor']
ring_type = data['ring-type']
sns.scatterplot(x=odor, y=ring_type, hue=Y)

In [None]:
# Categorical features are turned into integer codes by an OrdinalEncoder
# wrapped in a single-step pipeline.
ordinal_step = ('ordinal', OrdinalEncoder())
categorical_transformer = Pipeline(steps=[ordinal_step])

# The column transformer applies that pipeline to the categorical columns;
# no other transformer is registered.
categorical_branch = ('cat', categorical_transformer, categorical_cols)
preprocessor = ColumnTransformer(transformers=[categorical_branch])

# Learn the encoding from the training split only, then apply it to both.
# NOTE: this overwrites X_train / X_test with the encoded output, so the cell
# cannot simply be re-run without re-running the split cell first.
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [None]:
# Random forest baseline.
# NOTE(review): this is a *regressor* trained on the integer-encoded class
# labels, so its predictions are continuous values rather than class labels.
model = RandomForestRegressor(n_estimators=100, max_depth=None, random_state=0)

# 10-fold cross-validation on the training split. No `scoring` is given, so
# the estimator's own default score is reported (presumably R^2 for a
# regressor - confirm against the scikit-learn docs).
scores = cross_val_score(model, X_train, y_train, cv=10)

# Final fit on the full training split, after cross-validation.
model.fit(X_train, y_train)

score_mean = np.mean(scores)
score_std = np.std(scores)
print('Score: %.3f +/- %.3f' % (score_mean, score_std))

In [None]:
# Depth-1 decision tree (a "stump") split on entropy; it is also re-used as
# the base learner of the ensembles in the following cells.
tree = DecisionTreeClassifier(criterion='entropy', max_depth=1, random_state=1)
tree.fit(X_train, y_train)

# Accuracy on the data the tree was trained on vs. the held-out test split.
y_train_pred = tree.predict(X_train)
y_test_pred = tree.predict(X_test)
tree_train = accuracy_score(y_train, y_train_pred)
tree_test = accuracy_score(y_test, y_test_pred)

print('Decision tree train/test accuracies %.3f/%.3f' % (tree_train, tree_test))

In [None]:
# Bagging ensemble: 500 copies of the depth-1 `tree`, each trained on a
# bootstrap sample of the rows (max_samples=1.0) using all features.
#
# The base learner is passed positionally on purpose: its keyword was
# `base_estimator` before scikit-learn 1.2 and is `estimator` from 1.2 on
# (the old keyword was removed in 1.4), so naming it breaks on one side or
# the other, while the first positional slot is the same in every release.
bag = BaggingClassifier(
    tree,
    n_estimators=500,
    max_samples=1.0,
    max_features=1.0,
    bootstrap=True,
    bootstrap_features=False,
    n_jobs=1,
    random_state=1)

bag = bag.fit(X_train, y_train)
y_train_pred = bag.predict(X_train)
# Kept under its own name because the evaluation cell further down re-uses it.
bag_y_test_pred = bag.predict(X_test)

bag_train = accuracy_score(y_train, y_train_pred)
bag_test = accuracy_score(y_test, bag_y_test_pred)
print('Bagging train/test accuracies %.3f/%.3f'
      % (bag_train, bag_test))

In [None]:
# AdaBoost ensemble: 500 boosting rounds over the depth-1 `tree` with a
# learning rate of 0.1.
#
# The base learner is passed positionally on purpose: its keyword was
# `base_estimator` before scikit-learn 1.2 and is `estimator` from 1.2 on
# (the old keyword was removed in 1.4), so naming it breaks on one side or
# the other, while the first positional slot is the same in every release.
ada = AdaBoostClassifier(tree,
                         n_estimators=500,
                         learning_rate=0.1,
                         random_state=1)

ada = ada.fit(X_train, y_train)
y_train_pred = ada.predict(X_train)
# Kept under its own name because the evaluation cell further down re-uses it.
ada_y_test_pred = ada.predict(X_test)

ada_train = accuracy_score(y_train, y_train_pred)
ada_test = accuracy_score(y_test, ada_y_test_pred)
print('AdaBoost train/test accuracies %.3f/%.3f'
      % (ada_train, ada_test))

In [None]:
## Bagging: evaluation on the test split
# Mean absolute error between the encoded test labels and the predicted labels.
bag_mae = mean_absolute_error(y_test, bag_y_test_pred)
print('test data MAE:', bag_mae)

# Recall is more important than accuracy in this task, so the F-beta score is
# computed with a very large beta (100), treating the class encoded as 1 as
# the positive class.
bag_is_positive = bag_y_test_pred == 1
bag_fbeta = fbeta_score(y_test == 1, bag_is_positive, beta=100)
print('test data F beta score:', bag_fbeta)

# Draw one of the fitted base trees (index 5) from the ensemble.
plot_tree(bag.estimators_[5])

In [None]:
## AdaBoost: evaluation on the test split
# Mean absolute error between the encoded test labels and the predicted labels.
ada_mae = mean_absolute_error(y_test, ada_y_test_pred)
print('test data MAE:', ada_mae)

# Recall is more important than accuracy in this task, so the F-beta score is
# computed with a very large beta (100), treating the class encoded as 1 as
# the positive class.
ada_is_positive = ada_y_test_pred == 1
ada_fbeta = fbeta_score(y_test == 1, ada_is_positive, beta=100)
print('test data F beta score:', ada_fbeta)

# Draw one of the fitted base trees (index 5) from the ensemble.
plot_tree(ada.estimators_[5])

In [None]:
##RANDOM FOREST
# `model` is a RandomForestRegressor, so these predictions are continuous
# scores (averages over the trees), not hard class labels.
preds_test = model.predict(X_test)

# MAE is computed on the raw scores against the encoded test labels.
print('test data MAE:', mean_absolute_error(y_test, preds_test))

# Recall is more important than accuracy in this task, hence the very large
# beta. The scores must be turned into labels before scoring: comparing them
# with `== 1` would count a score such as 0.99 as a negative prediction and
# understate recall, so they are thresholded at 0.5 instead (ties go to the
# positive class, the cautious choice when recall matters most).
preds_is_positive = preds_test >= 0.5
print('test data F beta score:', fbeta_score(y_test == 1, preds_is_positive, beta=100))

# Export one tree of the forest (index 6) as a Graphviz .dot file.
export_graphviz(model.estimators_[6], out_file='tree.dot',
                feature_names = categorical_cols,
                class_names = ['class'],
                rounded = True, proportion = False,
                precision = 2, filled = True)

# Render the .dot file to PNG with the Graphviz `dot` executable. Check the
# exit status so that a failed render is reported here instead of surfacing
# as a missing (or stale, from an earlier run) image below.
dot_status = call(['dot', '-Tpng', 'tree.dot', '-o', 'bag.png', '-Gdpi=600'])
if dot_status != 0:
    raise RuntimeError('Graphviz `dot` failed with exit code %d' % dot_status)

Image(filename = 'bag.png')