In [2]:
import pandas as pd
from sklearn import datasets

iris = datasets.load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df['species'] = iris.target
display(df)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2


## Pre-Processing

In [3]:
df.isna().sum()

sepal length (cm)    0
sepal width (cm)     0
petal length (cm)    0
petal width (cm)     0
species              0
dtype: int64

In [4]:
df.duplicated().sum()

1

In [5]:
df.drop_duplicates(inplace=True)
df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2


In [6]:
X = df.drop(columns='species')
y = df['species']
y.value_counts()

species
0    50
1    50
2    49
Name: count, dtype: int64

In [8]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) # Tidak melakukan stratifikasi karena pembagian kelas sudah seimbang

## Gaussian

In [12]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report

gauss = GaussianNB()
gauss.fit(X_train, y_train)
gauss_pred = gauss.predict(X_test)
print(classification_report(y_test, gauss_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00         9
           2       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30



## Multinomial

In [13]:
from sklearn.naive_bayes import MultinomialNB

multi = MultinomialNB()
multi.fit(X_train, y_train)
multi_pred = multi.predict(X_test)
print(classification_report(y_test, multi_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       0.64      1.00      0.78         9
           2       1.00      0.55      0.71        11

    accuracy                           0.83        30
   macro avg       0.88      0.85      0.83        30
weighted avg       0.89      0.83      0.83        30



## Complement

In [14]:
from sklearn.naive_bayes import ComplementNB

comp = ComplementNB()
comp.fit(X_train, y_train)
comp_pred = comp.predict(X_test)
print(classification_report(y_test, comp_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       0.00      0.00      0.00         9
           2       0.55      1.00      0.71        11

    accuracy                           0.70        30
   macro avg       0.52      0.67      0.57        30
weighted avg       0.54      0.70      0.59        30



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Bernoulli

In [15]:
from sklearn.naive_bayes import BernoulliNB

bern = BernoulliNB()
bern.fit(X_train, y_train)
bern_pred = bern.predict(X_test)
print(classification_report(y_test, bern_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        10
           1       0.30      1.00      0.46         9
           2       0.00      0.00      0.00        11

    accuracy                           0.30        30
   macro avg       0.10      0.33      0.15        30
weighted avg       0.09      0.30      0.14        30



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Categorical

In [16]:
from sklearn.naive_bayes import CategoricalNB

cat = CategoricalNB()
cat.fit(X_train, y_train)
cat_pred = cat.predict(X_test)
print(classification_report(y_test, cat_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       0.75      1.00      0.86         9
           2       1.00      0.73      0.84        11

    accuracy                           0.90        30
   macro avg       0.92      0.91      0.90        30
weighted avg       0.93      0.90      0.90        30



## Out of Core Naive Bayes Modeling
Menggunakan GaussianNB yang cocok untuk fitur kontinyu

In [17]:
# Membuat batch training
batch_size = 30
num_batches = len(X_train) // batch_size

X_batch = X_train[:batch_size]
y_batch = y_train[:batch_size]

gauss_ooc = GaussianNB()
gauss_ooc.partial_fit(X_batch, y_batch, classes=[0, 1, 2])  # Iris has 3 classes (0, 1, 2)

for i in range(1, num_batches):
    start = i * batch_size
    end = start + batch_size
    X_batch = X_train[start:end]
    y_batch = y_train[start:end]
    gauss_ooc.partial_fit(X_batch, y_batch)

accuracy = gauss_ooc.score(X_test, y_test)
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 1.0000
