In [1]:
from pathlib import Path

import pandas as pd

from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import _BaseDiscreteNB
from sklearn.naive_bayes import ComplementNB
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [2]:
# Breed labels used to filter each dataset: the evaluation cells keep only the
# rows whose `breed` column matches one of these values.
breeds_to_compare: list[str] = [
    "saint_bernard",
    "siamese",
]

In [None]:
# Hold-out evaluation: for every dataset file, fit each Naive Bayes variant on a
# stratified 70/30 train/test split and record the test accuracy, then write
# one row per dataset to a CSV file.
files: Path = Path("../datasets").resolve()

# Estimator *classes* (not instances); a fresh estimator is built per dataset.
naive_bayes_model_list: list[type] = [GaussianNB, MultinomialNB, ComplementNB]

# Column-oriented results: one list per output column, one entry per dataset.
accuracy_data: dict[str, list] = {"dataset": [], "train": []}

for model in naive_bayes_model_list:
    # Column name is the class name without its "NB" suffix, lower-cased
    # (e.g. GaussianNB -> "gaussian").
    accuracy_data[model.__name__.lower().removesuffix("nb")] = []

# Path.iterdir() yields entries in arbitrary order and includes directories.
# Sort for a reproducible row order and keep regular files only, because
# read_csv cannot open a directory (e.g. .ipynb_checkpoints).
dataset_paths: list[Path] = sorted(path for path in files.iterdir() if path.is_file())

for file in dataset_paths:
    df: pd.DataFrame = pd.read_csv(file)
    df = df[df.breed.isin(breeds_to_compare)]

    # Columns from position 3 onward are used as features; `animal` is the target.
    features_df: pd.DataFrame = df.iloc[:, 3:]
    animal_series: pd.Series = df.animal

    for model in naive_bayes_model_list:
        if model is GaussianNB:
            # Scaling and PCA live inside the pipeline so they are fitted on
            # the training rows only. Fitting them on the full dataset before
            # the split lets test rows influence the preprocessing.
            features = features_df
            naive_bayes = make_pipeline(
                StandardScaler(),
                PCA(n_components=10, random_state=42),
                model(),
            )
        else:
            # MultinomialNB / ComplementNB reject negative inputs, so negative
            # values are clipped to zero. The clip is stateless and therefore
            # safe to apply before splitting.
            features = features_df.clip(lower=0).astype(float).values
            naive_bayes = model()

        x_train, x_test, y_train, y_test = train_test_split(
            features,
            animal_series,
            test_size=0.3,
            random_state=42,
            stratify=animal_series,
        )

        naive_bayes.fit(x_train, y_train)
        y_predict = naive_bayes.predict(x_test)

        accuracy: float = accuracy_score(y_test, y_predict)
        accuracy_data[model.__name__.lower().removesuffix("nb")].append(accuracy)

    accuracy_data["dataset"].append(file.name)
    accuracy_data["train"].append("70/30")

accuracy_df: pd.DataFrame = pd.DataFrame.from_dict(accuracy_data)

output_file: Path = Path("../accuracies/naive_bayes_pca_70_30.csv").resolve()
output_file.parent.mkdir(parents=True, exist_ok=True)

accuracy_df.to_csv(output_file, index=False)

In [None]:
# Cross-validated evaluation: for every dataset file, score each Naive Bayes
# variant with shuffled 10-fold cross-validation and record the mean accuracy,
# then write one row per dataset to a CSV file.
files: Path = Path("../datasets").resolve()

# Estimator *classes* (not instances); a fresh estimator is built per dataset.
naive_bayes_model_list: list[type] = [GaussianNB, MultinomialNB, ComplementNB]

# Column-oriented results: one list per output column, one entry per dataset.
accuracy_data: dict[str, list] = {"dataset": [], "train": []}

for model in naive_bayes_model_list:
    # Column name is the class name without its "NB" suffix, lower-cased
    # (e.g. GaussianNB -> "gaussian").
    accuracy_data[model.__name__.lower().removesuffix("nb")] = []

# The splitter does not depend on the dataset or model, so build it once.
# With an integer random_state every split() call yields the same folds.
kfold: KFold = KFold(n_splits=10, random_state=42, shuffle=True)

# Path.iterdir() yields entries in arbitrary order and includes directories.
# Sort for a reproducible row order and keep regular files only, because
# read_csv cannot open a directory (e.g. .ipynb_checkpoints).
dataset_paths: list[Path] = sorted(path for path in files.iterdir() if path.is_file())

for file in dataset_paths:
    df: pd.DataFrame = pd.read_csv(file)
    df = df[df.breed.isin(breeds_to_compare)]

    # Columns from position 3 onward are used as features; `animal` is the target.
    features_df: pd.DataFrame = df.iloc[:, 3:]
    animal_series: pd.Series = df.animal

    for model in naive_bayes_model_list:
        if model is GaussianNB:
            # Scaling and PCA live inside the pipeline so cross_val_score
            # refits them on each fold's training rows only. Fitting them on
            # the full dataset first leaks every validation fold into the
            # preprocessing.
            features = features_df
            naive_bayes = make_pipeline(
                StandardScaler(),
                PCA(n_components=10, random_state=42),
                model(),
            )
        else:
            # MultinomialNB / ComplementNB reject negative inputs, so negative
            # values are clipped to zero. The clip is stateless and therefore
            # safe to apply before splitting.
            features = features_df.clip(lower=0).astype(float).values
            naive_bayes = model()

        scores = cross_val_score(
            naive_bayes,
            features,
            animal_series,
            cv=kfold,
            scoring="accuracy",
        )
        accuracy: float = scores.mean()

        accuracy_data[model.__name__.lower().removesuffix("nb")].append(accuracy)

    accuracy_data["dataset"].append(file.name)
    accuracy_data["train"].append("10-fold CV")

accuracy_df: pd.DataFrame = pd.DataFrame.from_dict(accuracy_data)

output_file: Path = Path("../accuracies/naive_bayes_pca_10_fold_cv.csv").resolve()
output_file.parent.mkdir(parents=True, exist_ok=True)

accuracy_df.to_csv(output_file, index=False)