In [1]:
%load_ext jupyter_black

In [52]:
import os

import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [3]:
# Single seed handed to train_test_split and to the seeded classifiers so that
# re-running the notebook produces the same split and the same fitted models.
RANDOM_SEED = 42

In [5]:
# Directory prefix for the data files, ending in the host's path separator
# ("data/" on POSIX, "data\\" on Windows). Joining with an empty component lets
# the standard library pick the separator instead of branching on the OS name.
# It stays a plain string so `address + "<file>"` concatenation keeps working.
address = os.path.join("data", "")

# Load the cleaned mushroom dataset and preview its first rows.
df = pd.read_csv(os.path.join(address, "mushroom_cleaned.csv"))
df.head()

Unnamed: 0,cap-diameter,cap-shape,gill-attachment,gill-color,stem-height,stem-width,stem-color,season,class
0,1372,2,2,10,3.807467,1545,11,1.804273,1
1,1461,2,2,10,3.807467,1557,11,1.804273,1
2,1371,2,2,10,3.612496,1566,11,1.804273,1
3,1261,6,2,10,3.787572,1566,11,1.804273,1
4,1305,6,2,10,3.711971,1464,11,0.943195,1


In [45]:
# Feature columns routed to the StandardScaler branch of the preprocessor.
num_cols = [
    "cap-diameter",
    "stem-height",
    "stem-width",
]

# Feature columns routed to the OneHotEncoder branch of the preprocessor.
cat_cols = [
    "cap-shape",
    "gill-attachment",
    "gill-color",
    "stem-color",
    "season",
]

In [51]:
# Row count for each distinct value of the `season` column.
df["season"].value_counts()

season
0.943195    27210
0.888450    20387
1.804273     4219
0.027372     2219
Name: count, dtype: int64

In [46]:
# Hold out a test set: features are every column except `class`, the target is
# `class` itself. Stratifying on the target keeps the class proportions the same
# in both parts; no test_size is passed, so scikit-learn's default applies.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=["class"]),
    df["class"],
    stratify=df["class"],
    random_state=RANDOM_SEED,
)

In [47]:
# Preprocessing applied in front of every classifier:
#   * "ohe"     - one-hot encodes `cat_cols`; categories not seen during fit are
#                 ignored at transform time, and the output is a dense array.
#   * "scaling" - standardises `num_cols`.
# Any column not named in either list is passed through unchanged.
column_transformer = ColumnTransformer(
    [
        ("ohe", OneHotEncoder(sparse_output=False, handle_unknown="ignore"), cat_cols),
        ("scaling", StandardScaler(), num_cols),
    ],
    remainder="passthrough",
)

In [48]:
def _make_pipeline(classifier):
    """Wrap `classifier` in a preprocessor -> classifier pipeline.

    `Pipeline` keeps the step objects it is given rather than copying them, so
    handing the same `column_transformer` instance to several pipelines would
    make them share one preprocessor: fitting any one pipeline would overwrite
    the fitted state used by the others. `clone` builds a new, unfitted
    estimator with the same parameters, so each pipeline owns its preprocessor.

    Parameters
    ----------
    classifier : estimator
        Unfitted scikit-learn classifier used as the final step.

    Returns
    -------
    Pipeline
        Steps are named "preprocessor" and "classifier".
    """
    return Pipeline(
        steps=[
            ("preprocessor", clone(column_transformer)),
            ("classifier", classifier),
        ]
    )


pipe_gnb = _make_pipeline(GaussianNB())

pipe_rfc = _make_pipeline(
    RandomForestClassifier(
        random_state=RANDOM_SEED,
        n_estimators=100,
    )
)

pipe_hgb = _make_pipeline(
    HistGradientBoostingClassifier(
        random_state=RANDOM_SEED,
        class_weight="balanced",
    )
)

# Candidate models, in the order they are fitted and reported.
pipes = [pipe_gnb, pipe_rfc, pipe_hgb]

In [54]:
# Fit every candidate on the training split, score it on the held-out split, and
# print a per-class report. `scores` maps each classifier object to its F1 score.
scores = {}
for pipe in pipes:
    # Look the final step up once; fitting happens in place, so this is the same
    # object before and after `fit`.
    classifier = pipe["classifier"]
    y_pred = pipe.fit(X_train, y_train).predict(X_test)
    scores[classifier] = f1_score(y_test, y_pred)
    # Interpolate the local name instead of nesting `pipe["classifier"]` inside
    # the f-string: reusing the outer quote character within an f-string is a
    # SyntaxError on Python versions before 3.12.
    print(f"{classifier}:\n{classification_report(y_test, y_pred)}")

GaussianNB():
              precision    recall  f1-score   support

           0       0.50      0.96      0.66      6090
           1       0.86      0.21      0.33      7419

    accuracy                           0.55     13509
   macro avg       0.68      0.58      0.49     13509
weighted avg       0.70      0.55      0.48     13509

RandomForestClassifier(random_state=42):
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      6090
           1       0.99      0.99      0.99      7419

    accuracy                           0.99     13509
   macro avg       0.99      0.99      0.99     13509
weighted avg       0.99      0.99      0.99     13509

HistGradientBoostingClassifier(class_weight='balanced', random_state=42):
              precision    recall  f1-score   support

           0       0.97      0.98      0.98      6090
           1       0.99      0.97      0.98      7419

    accuracy                           0.98     13509