In [98]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import math
import pandas as pd
import numpy as np

In [99]:
SHROOMS_DATASET_PATH = "datasets/mushrooms.csv"
TEST_SIZE = 0.3
EPSILON = 1e-12
RANDOM_SEED = 15

## Prepare data

Rename columns for easier access.

In [100]:
iris = load_iris(as_frame=True)
X = iris.data.rename(
    columns={
        "sepal length (cm)": "sl",
        "sepal width (cm)": "sw",
        "petal length (cm)": "pl",
        "petal width (cm)": "pw",
    }
)
y = iris.target

Divide into train and test datasets.

In [101]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE, random_state=RANDOM_SEED)

Check for missing values.

In [102]:
iris.frame.isna().sum()

sepal length (cm)    0
sepal width (cm)     0
petal length (cm)    0
petal width (cm)     0
target               0
dtype: int64

## Gaussian Naive Bayes Classifier

Define an enum representing possible iris types. Values correspond to target labels inside `y`.

In [103]:
from enum import Enum

In [104]:
class IrisType(Enum):
    SETOSA = 0
    VERSICOLOR = 1
    VIRGINICA = 2

Define a function for createing parametrized gaussian distribution function.

In [105]:
def create_gaussian_distrib_func(mi, sigma):
    def func(x):
        return 1 / (sigma * math.sqrt(2 * math.pi)) * math.exp(-0.5 * ((x - mi) / sigma) ** 2)

    return func

Create the classifier class.

In [106]:
class GaussianNaiveBayesClassifier:
    def __init__(self):
        # define species to be classified
        self.species = [
            IrisType.SETOSA.value,
            IrisType.VERSICOLOR.value,
            IrisType.VIRGINICA.value,
        ]
        self.species_funcs = {}

    def fit(self, X_train, y_train):
        for species in self.species:
            # get dataframe for desired species
            df_species = X_train[y_train == species]

            # calculate mean and std for each column
            mean = df_species.mean()
            std = df_species.std()

            # store functions for each column
            self.species_funcs[species] = {
                label: create_gaussian_distrib_func(mean[label], std[label])
                for label in ["sl", "sw", "pl", "pw"]
            }

            # store base probability
            self.species_funcs[species]["prob"] = len(df_species) / len(X_train)

    def predict(self, X_test):
        res = []

        for index, row in X_test.iterrows():
            sl, sw, pl, pw = row

            # get score for each species
            score = self.predict_proba(sl, sw, pl, pw)

            # get type with highest score
            iris_type = max(score, key=score.get)

            # build result
            res.append(iris_type)

        return res

    def predict_proba(self, sl, sw, pl, pw):
        score = {}

        # calculate score for each species
        for specie in self.species:
            self.species_funcs[specie]
            score[specie] = math.log(
                EPSILON # incorporate epsilon to avoid log(0)
                + self.species_funcs[specie]["prob"]
                * self.species_funcs[specie]["sl"](sl)
                * self.species_funcs[specie]["sw"](sw)
                * self.species_funcs[specie]["pl"](pl)
                * self.species_funcs[specie]["pw"](pw)
            )
        
        return score

Feed classifier with training data.

In [107]:
classifier = GaussianNaiveBayesClassifier()
classifier.fit(X_train, y_train)

Check accuracy against `y_test`.

In [108]:
y_pred = classifier.predict(X_test)
print(accuracy_score(y_test, y_pred))

1.0


Optionally validate whether the sklearn implementation gives the same result.

In [109]:
from sklearn.naive_bayes import GaussianNB

In [110]:
gnb = GaussianNB()
y_pred = gnb.fit(X_train, y_train).predict(X_test)
print(accuracy_score(y_test, y_pred))

1.0


# Prepare data

Read the mushrooms dataset from the CSV file and separate X and y.

In [111]:
df_shrooms = pd.read_csv(SHROOMS_DATASET_PATH)
df_shrooms['stalk-root'].replace('?', np.nan, inplace=True)
target_colunm = "class"

X = df_shrooms.drop(columns=[target_colunm])
y = df_shrooms[target_colunm]

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_shrooms['stalk-root'].replace('?', np.nan, inplace=True)


Divide into train and test datasets.

In [112]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE, random_state=RANDOM_SEED)

Check for missing values.

In [113]:
df_shrooms.isna().sum()

class                          0
cap-shape                      0
cap-surface                    0
cap-color                      0
bruises                        0
odor                           0
gill-attachment                0
gill-spacing                   0
gill-size                      0
gill-color                     0
stalk-shape                    0
stalk-root                  2480
stalk-surface-above-ring       0
stalk-surface-below-ring       0
stalk-color-above-ring         0
stalk-color-below-ring         0
veil-type                      0
veil-color                     0
ring-number                    0
ring-type                      0
spore-print-color              0
population                     0
habitat                        0
dtype: int64

## Multinominal Naive Bayes Classifier

Define an enum representing possible mushroom types. Values correspond to target labels inside `y`.

In [114]:
class ShroomType(Enum):
    POISONOUS = "p"
    EDIBLE = "e"

Define a constant to ensure that there will be no zero probability.

In [115]:
EMPTY_VALUE_CONSTANT = 1

Based on the analysis inside `data_analysis.ipynb`, let's use the following features to distinguish mushrooms:

`cap-shape`, `cap-surface`, `cap-color`, `ring-number`, `habitat`, `odor`, `gill-color`, `cap-color`, `population`

In [116]:
class MulitnomialNaiveBayesClassifier:
    def __init__(self, X):
        self.X = X
        self.properties = [
            "cap-shape",
            "cap-surface",
            "cap-color",
            "ring-number",
            "habitat",
            "odor",
            "gill-color",
            "cap-color",
            "population",
        ]

    def fit(self, X_train, y_train):
        # sparate dataframes for each type
        df_poisonous = X_train[y_train == ShroomType.POISONOUS.value]
        df_edible = X_train[y_train == ShroomType.EDIBLE.value]

        def add_missing(col, value_counts):
            for value in self.X[col].unique():
                if value not in value_counts:
                    value_counts[value] = 0

            return value_counts

        self.poisonous_dict = {
            col: (
                add_missing(col, df_poisonous[col].value_counts())
                + EMPTY_VALUE_CONSTANT
            )
            / np.float64(len(df_poisonous))
            for col in self.properties
        }
        self.edible_dict = {
            col: (
                add_missing(col, df_edible[col].value_counts()) + EMPTY_VALUE_CONSTANT
            )
            / np.float64(len(df_poisonous))
            for col in self.properties
        }

        self.prob_poisonous = len(df_poisonous) / len(X_train)
        self.prob_edible = len(df_edible) / len(X_train)

    def predict(self, X_test):
        res = []

        for index, row in X_test[self.properties].iterrows():
            prob_poisonous = self.prob_poisonous
            prob_edible = self.prob_edible

            for col, value in row.items():
                prob_poisonous *= self.poisonous_dict[col][value]
                prob_edible *= self.edible_dict[col][value]

            if prob_poisonous > prob_edible:
                res.append(ShroomType.POISONOUS.value)
            else:
                res.append(ShroomType.EDIBLE.value)

        return res

    def predict_proba(self):
        pass

Feed classifier with training data.

In [117]:
classifier = MulitnomialNaiveBayesClassifier(X)
classifier.fit(X_train, y_train)

Check accuracy against `y_test`.

In [119]:
y_pred = classifier.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.9766201804757998
