In [1187]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import math
import pandas as pd

In [1188]:
SHROOMS_DATASET_PATH = "datasets/mushrooms.csv"
TEST_SIZE = 0.3
EPSILON = 1e-12
RANDOM_SEED = 15

## Prepare data

Rename columns for easier access.

In [1189]:
iris = load_iris(as_frame=True)
X = iris.data.rename(
    columns={
        "sepal length (cm)": "sl",
        "sepal width (cm)": "sw",
        "petal length (cm)": "pl",
        "petal width (cm)": "pw",
    }
)
y = iris.target

Divide into train and test datasets.

In [1190]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE, random_state=RANDOM_SEED)

Check for missing values.

In [1191]:
iris.frame.isna().sum()

sepal length (cm)    0
sepal width (cm)     0
petal length (cm)    0
petal width (cm)     0
target               0
dtype: int64

## Gaussian Naive Bayes Classifier

Define an enum representing possible iris types. Values correspond to target labels inside `y`.

In [1192]:
from enum import Enum

In [1193]:
class IrisType(Enum):
    SETOSA = 0
    VERSICOLOR = 1
    VIRGINICA = 2

Define a function for createing parametrized gaussian distribution function.

In [1194]:
def create_gaussian_distrib_func(mi, sigma):
    def func(x):
        return 1 / (sigma * math.sqrt(2 * math.pi)) * math.exp(-0.5 * ((x - mi) / sigma) ** 2)

    return func

Create the classifier class.

In [1195]:
class GaussianNaiveBayesClassifier:
    def __init__(self):
        # define species to be classified
        self.categories = [
            IrisType.SETOSA.value,
            IrisType.VERSICOLOR.value,
            IrisType.VIRGINICA.value,
        ]
        self.category_data = {}

    def fit(self, X_train, y_train):
        for category in self.categories:
            # get dataframe for desired species
            df_category = X_train[y_train == category]

            # calculate mean and std for each feature
            mean = df_category.mean()
            std = df_category.std()

            # store functions for each feature
            self.category_data[category] = {
                feature: create_gaussian_distrib_func(mean[feature], std[feature])
                for feature in X_train.columns
            }

            # store base probability
            self.category_data[category]["prob"] = len(df_category) / len(X_train)

    def predict(self, X_test):
        res = []

        for index, row in X_test.iterrows():
            # get score for each species
            score = self.predict_proba(row)

            # get type with highest score
            category = max(score, key=score.get)

            # build result
            res.append(category)

        return res

    def predict_proba(self, row):
        score = {}

        # calculate score for each species
        for category in self.categories:
            score[category] = math.log(EPSILON + self.category_data[category]["prob"])

            for feature, value in row.items():
                score[category] += math.log(
                    EPSILON + self.category_data[category][feature](value)
                )

        return score

Feed classifier with training data.

In [1196]:
classifier = GaussianNaiveBayesClassifier()
classifier.fit(X_train, y_train)

Check accuracy against `y_test`.

In [1197]:
y_pred = classifier.predict(X_test)
print(accuracy_score(y_test, y_pred))

1.0


Validate whether the sklearn implementation gives the same result.

In [1198]:
from sklearn.naive_bayes import GaussianNB

In [1199]:
gnb = GaussianNB()
y_pred = gnb.fit(X_train, y_train).predict(X_test)
print(accuracy_score(y_test, y_pred))

1.0


**Conclusion**: Matching accuracy is great!

# Prepare data

Read the mushrooms dataset from the CSV file and separate X and y.

In [1200]:
df_shrooms = pd.read_csv(SHROOMS_DATASET_PATH)

In [1201]:
df_shrooms['stalk-root'].value_counts()

stalk-root
b    3776
?    2480
e    1120
c     556
r     192
Name: count, dtype: int64

Replace missing values.

In [1202]:
df_shrooms.replace('?', pd.NA, inplace=True)
# df_shrooms.dropna(inplace=True)
df_shrooms['stalk-root'].fillna(df_shrooms['stalk-root'].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_shrooms['stalk-root'].fillna(df_shrooms['stalk-root'].mode()[0], inplace=True)


In [1203]:
df_shrooms['stalk-root'].value_counts()

stalk-root
b    6256
e    1120
c     556
r     192
Name: count, dtype: int64

Based on data analysis keep only valuable properties.

In [1204]:
PROPERTIES = ["odor", "gill-color", "cap-color", "population", "cap-shape", "cap-surface", "ring-number", "habitat", "bruises"]

In [1205]:
TARGET_COLUMN = "class"

X = df_shrooms.drop(columns=[TARGET_COLUMN])
X = X[PROPERTIES]
y = df_shrooms[TARGET_COLUMN]

Divide into train and test datasets.

In [1206]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE, random_state=RANDOM_SEED)

## Multinominal Naive Bayes Classifier

Define an enum representing possible mushroom types. Values correspond to target labels inside `y`.

In [1207]:
class ShroomType(Enum):
    POISONOUS = "p"
    EDIBLE = "e"

In [1208]:
BLACK_BOX_CONSTANT = 1

In [1209]:
class MulitnomialNaiveBayesClassifier:
    def __init__(self, complete_features):
        # define categories to be classified
        self.categories = [ShroomType.POISONOUS.value, ShroomType.EDIBLE.value]
        self.complete_features = complete_features

    def fit(self, X_train, y_train):
        self.category_data = {}

        # calculate probabilities for each category
        for category in self.categories:
            df_category = X_train[y_train == category]

            # store probability for each feature
            # e.g. category_data['p']['cap-color']['n'] = 0.741
            self.category_data[category] = {
                # add missing features with zero probability
                # add BLACK_BOX_CONSTANT to avoid zero probability
                feature: (df_category[feature].value_counts().reindex(self.complete_features[feature], fill_value=0) + BLACK_BOX_CONSTANT) / (len(df_category) + df_category[feature].nunique() * BLACK_BOX_CONSTANT)
                for feature in X_train.columns
            }

            # store base probability
            self.category_data[category]["prob"] = len(df_category) / len(X_train)

    def predict(self, X_test):
        res = []

        for index, row in X_test.iterrows():
            # get score for each species
            score = self.predict_proba(row)

            # get type with highest score
            category = max(score, key=score.get)

            # build result
            res.append(category)

        return res

    def predict_proba(self, row):
        score = {}

        # calculate score for each category
        for category in self.categories:
            score[category] = math.log(EPSILON + self.category_data[category]["prob"])

            for feature, value in row.items():
                score[category] += math.log(EPSILON + self.category_data[category][feature][value])

        return score

Feed classifier with training data.

In [1210]:
classifier = MulitnomialNaiveBayesClassifier({ feature: X[feature].unique() for feature in X })
classifier.fit(X_train, y_train)

Check accuracy against `y_test`.

In [1211]:
y_pred = classifier.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.9803117309269893


Validate whether the sklearn implementation gives the same result.

In [1212]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.naive_bayes import MultinomialNB

# Encode labels
le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)

# One-hot encode features
oe = OneHotEncoder()
X_train_encoded = oe.fit_transform(X_train)

# Train MultinomialNB
mnb = MultinomialNB()
mnb.fit(X_train_encoded, y_train_encoded)

# Predict and evaluate
y_pred = mnb.predict(oe.transform(X_test))
print(accuracy_score(le.transform(y_test), y_pred))


0.9803117309269893


**Conclusion**: Great news!