In [1]:
import numpy as np
import pandas as pd
# Show every column when a DataFrame is displayed instead of truncating wide frames.
pd.set_option('display.max_columns', None)

from sklearn.model_selection import train_test_split

from preprocess import *
from classify import *

In [2]:
# Load the raw mushroom table; the file uses ";" as its field separator.
df = pd.read_csv("data/raw/mushroom/mushrooms_raw.csv", sep=";")

# Separate the target column ("class") from the features. Both results are
# new objects, so later edits to them leave `df` untouched.
y_raw = df["class"].copy()
X_raw = df.drop(columns=["class"])

In [3]:
# Keep only the feature columns that are populated in at least this fraction
# of the rows.
MIN_NON_NULL_FRACTION = 0.80

# `DataFrame.dropna` documents `thresh` as an integer count of required
# non-null values, so the fractional product is rounded up. Because non-null
# counts are whole numbers, this is the same cut-off as comparing against the
# unrounded product.
min_non_null = int(np.ceil(len(X_raw) * MIN_NON_NULL_FRACTION))
X = X_raw.dropna(thresh=min_non_null, axis=1).copy()

# Recode the "t"/"f" flag columns as booleans. Values outside the mapping
# (including missing cells) come out of `map` as NaN.
for attribute in ["does-bruise-or-bleed", "has-ring"]:
    X[attribute] = X[attribute].map({"t": True, "f": False})

# Encode the season letter as an integer code 0-3.
# NOTE(review): presumably s/u/a/w = spring/summer/autumn/winter — confirm
# against the dataset description.
X["season"] = X["season"].map({"s": 0, "u": 1, "a": 2, "w": 3})

# Boolean target: True for class "e", False for class "p". `map` already
# returns a new Series, so no extra copy is needed.
y = y_raw.map({"e": True, "p": False})

In [4]:
# Column groups handed to preprocess(), keyed by the treatment each group gets.
# NOTE(review): "ohe" / "zsc" / "rimp" presumably stand for one-hot encoding,
# z-score scaling and imputation — confirm against preprocess.py.
attributes = {
    # Every column whose dtype is still `object` at this point.
    "ohe": [name for name, dtype in X.dtypes.items() if dtype == np.dtype("O")],
    "zsc": ["cap-diameter", "stem-height", "stem-width"],
    "rimp": ["gill-attachment", "ring-type"],
}

# Distinct string values of each "ohe" column, in column order. Non-string
# entries (such as the NaN that marks a missing cell) are left out.
# These are taken from the full X, before any train/test split.
categories = [
    [value for value in X[column].unique() if type(value) is str]
    for column in attributes["ohe"]
]

# Total number of distinct string categories across all "ohe" columns.
categories_amount = sum(len(values) for values in categories)

# Split features and target together; the fixed seed makes the split
# reproducible. Split proportions are left at scikit-learn's defaults.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Run the same preprocessing on each split, train first and then test.
# NOTE(review): each split is passed to preprocess() on its own. If preprocess()
# derives statistics (scaling, imputation) from its input, the test split is
# transformed with its own statistics rather than the training ones — confirm.
X_train_preprocessed, X_test_preprocessed = (
    preprocess(features, attributes, categories_amount)
    for features in (X_train, X_test)
)

In [6]:
# Values to try for each classifier, keyed by the classifier's abbreviation.
# An empty list is passed for the classifiers that have no values listed.
# NOTE(review): what each number controls (e.g. neighbour count for "KNN",
# tree depth for "DTC") is decided inside classify() — confirm in classify.py.
parameters = {"KNN": [1, 2, 3], "DTC": [10, 20], "RFC": [], "BNB": [], "MNB": []}

# classify() returns two objects, unpacked here as the classifiers and their
# scores.
classifiers_dict, scores_dict = classify(
    X_train_preprocessed,
    y_train,
    X_test_preprocessed,
    y_test,
    parameters,
)

In [7]:
# Display the scores returned by classify(): one list per classifier key, with
# one score for each value listed for that classifier in `parameters`.
# NOTE(review): the metric is presumably test-set accuracy — confirm in classify.py.
scores_dict

{'KNN': [0.9981006025674614, 0.9979041131778884, 0.9978386167146974],
 'DTC': [0.9226486769714436, 0.9885381189415772],
 'RFC': [],
 'BNB': [],
 'MNB': []}