In [1]:
from functools import reduce
from operator import mul

import numpy as np
import sklearn.naive_bayes
from sklearn.datasets import load_iris, load_wine
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


In [2]:
# Load the iris measurements and hold out 40% of the samples for testing;
# the fixed random_state keeps the split identical between runs.
X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=0,
)


In [3]:
# 1. sepal length in cm  x1  (sepal = calyx)
# 2. sepal width in cm   x2
# 3. petal length in cm  x3  (petal = flower)
# 4. petal width in cm   x4
# 5. class:
# -- Iris Setosa      C1
# -- Iris Versicolor  C2
# -- Iris Virginica   C3

In [4]:
#https://www.youtube.com/watch?v=kufuBE6TJew&ab_channel=MaheshHuddar
class GaussianNB:
    """Gaussian naive Bayes classifier for integer labels ``0 .. n_classes - 1``.

    For every class, each feature is modelled as an independent normal
    distribution whose mean and sample variance (``ddof=1``) are estimated in
    :meth:`learn`. Prediction picks the class with the largest log-posterior;
    working in log space avoids the floating-point underflow that a raw product
    of many small densities runs into.
    """

    # Lower bound applied to estimated variances so that a feature which is
    # constant within a class does not cause a division by zero.
    _MIN_VARIANCE = 1e-9

    def __init__(self, n_features, n_classes):
        """Create an untrained model.

        Args:
            n_features: number of leading feature columns used for prediction.
            n_classes: number of classes; labels must be ``0 .. n_classes - 1``.
        """
        self.n_features = n_features
        self.n_classes = n_classes
        self.mean = {}               # class_id -> per-feature mean
        self.var = {}                # class_id -> per-feature sample variance
        self.class_probability = {}  # class_id -> prior P(class)

    def learn(self, x, y):
        """Estimate per-class feature means, variances and class priors.

        Args:
            x: 2-D array-like of shape (n_samples, >= n_features).
            y: 1-D array-like of integer labels, one per row of ``x``.

        Raises:
            ValueError: if the shapes are inconsistent or a class has fewer
                than two samples (the ``ddof=1`` variance is undefined then).
        """
        x = np.asarray(x, dtype=float)
        y = np.asarray(y)
        if x.ndim != 2 or x.shape[1] < self.n_features:
            raise ValueError(
                f"x must be 2-D with at least {self.n_features} columns, got shape {x.shape}"
            )
        if len(x) != len(y):
            raise ValueError(f"x has {len(x)} rows but y has {len(y)} labels")

        for class_id in range(self.n_classes):
            selected = x[y == class_id]
            if len(selected) < 2:
                raise ValueError(
                    f"class {class_id} has {len(selected)} training sample(s); at least 2 are required"
                )
            self.mean[class_id] = np.mean(selected, axis=0)
            # Floor the variance: a zero variance would turn the density into nan/inf.
            self.var[class_id] = np.maximum(np.var(selected, axis=0, ddof=1), self._MIN_VARIANCE)
            self.class_probability[class_id] = len(selected) / len(y)

    def predict_sample(self, x_pred):
        """Return the index of the most probable class for one sample."""
        # log is monotonic, so the argmax over log-posteriors equals the argmax
        # over posteriors, but it stays finite where the product underflows.
        return np.argmax([self._log_posterior(class_id, x_pred) for class_id in range(self.n_classes)])

    def predict(self, x_test):
        """Return a list with the predicted class index for every row of ``x_test``."""
        return [self.predict_sample(x_pred) for x_pred in x_test]

    def _log_posterior(self, class_id, x_pred):
        """Unnormalized log-posterior: log prior plus the sum of per-feature log-densities."""
        k = self.n_features
        sample = np.asarray(x_pred, dtype=float)[:k]
        mean = self.mean[class_id][:k]
        var = self.var[class_id][:k]
        log_likelihood = -0.5 * np.sum(np.log(2 * np.pi * var) + (sample - mean) ** 2 / var)
        return np.log(self.class_probability[class_id]) + log_likelihood

    def _posterior(self, class_id, x_pred):
        """Unnormalized posterior (prior times the product of feature densities).

        May underflow to 0.0 for distant samples; use :meth:`_log_posterior`
        when comparing classes.
        """
        return np.exp(self._log_posterior(class_id, x_pred))

    def _gauss(self, feature_id, class_id, predicted):
        """Normal density of ``predicted`` under the fitted (class, feature) distribution."""
        mean = self.mean[class_id][feature_id]
        var = self.var[class_id][feature_id]
        exponent = -np.power(predicted - mean, 2) / (2 * var)
        return 1 / np.sqrt(2 * np.pi * var) * np.exp(exponent)


# Train the hand-written classifier on the iris split and evaluate it.
# Feature and class counts are taken from the data instead of being hard-coded.
gnb = GaussianNB(X_train.shape[1], len(np.unique(y_train)))
gnb.learn(X_train, y_train)
y_pred = gnb.predict(X_test)
# confusion_matrix expects (y_true, y_pred): rows are true classes, columns are predictions.
confusion_matrix(y_test, y_pred)



array([[16,  0,  0],
       [ 0, 23,  0],
       [ 0,  4, 17]])

In [5]:

# Reference run: scikit-learn's GaussianNB on the same iris split.
gnb1 = sklearn.naive_bayes.GaussianNB()
y_pred1 = gnb1.fit(X_train, y_train).predict(X_test)
# confusion_matrix expects (y_true, y_pred): rows are true classes, columns are predictions.
confusion_matrix(y_test, y_pred1)

array([[16,  0,  0],
       [ 0, 23,  0],
       [ 0,  4, 17]])

In [6]:
# Repeat the 60/40 hold-out experiment on 20 different splits and summarize
# the accuracy of the hand-written classifier.
X, y = load_iris(return_X_y=True)  # the data is identical every round, so load it once

scores = []
for seed in range(20):
    # A distinct, fixed seed per round gives 20 different splits that are
    # nevertheless the same on every re-run of the notebook.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=seed)
    gnb = GaussianNB(X.shape[1], len(np.unique(y)))
    gnb.learn(X_train, y_train)
    y_pred = gnb.predict(X_test)
    scores.append(accuracy_score(y_test, y_pred))

# accuracy_score is the fraction of correct predictions, so the label must say
# "accuracy" (dokładność), not "error" (błąd).
print('średnia dokładność klasyfikacji', np.mean(scores))
print('odchylenie standardowe', np.std(scores))

średni błąd klasyfikacji 0.95
odchylenie standardowe 0.024720661623652194


## Wine

In [7]:
# Wine dataset: train and evaluate the hand-written classifier.
X, y = load_wine(return_X_y=True)
# Derive the problem size from the loaded arrays rather than loading the
# dataset a second time or hard-coding the class count.
n_features = X.shape[1]
n_classes = len(np.unique(y))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)

gnb2 = GaussianNB(n_features, n_classes)
gnb2.learn(X_train, y_train)
y_pred = gnb2.predict(X_test)
# confusion_matrix expects (y_true, y_pred): rows are true classes, columns are predictions.
confusion_matrix(y_test, y_pred)

array([[22,  0,  0],
       [ 2, 27,  2],
       [ 0,  0, 19]])

In [8]:

# Reference run: scikit-learn's GaussianNB on the same wine split.
gnb = sklearn.naive_bayes.GaussianNB()
y_pred = gnb.fit(X_train, y_train).predict(X_test)
# confusion_matrix expects (y_true, y_pred): rows are true classes, columns are predictions.
confusion_matrix(y_test, y_pred)

array([[22,  0,  0],
       [ 2, 27,  2],
       [ 0,  0, 19]])

In [9]:
# Standardization (z-score normalization): rescale every feature to mean 0 and
# standard deviation 1.
X, y = load_wine(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# The default StandardScaler() both centers and scales; with_mean=False together
# with with_std=False would leave the data unchanged. The scaler is fitted on
# the training split only so that no test-set statistics leak into the model.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_std = scaler.transform(X_train)
X_test_std = scaler.transform(X_test)

gnb = sklearn.naive_bayes.GaussianNB()
y_pred = gnb.fit(X_train_std, y_train).predict(X_test_std)
print(accuracy_score(y_test, y_pred))


0.9444444444444444


In [10]:
# Reduce the wine features to two principal components before classifying.
X, y = load_wine(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Fit PCA on the training split only and reuse the same projection for the
# test split, so the test data does not influence the learned components.
pca = PCA(n_components=2)
pca.fit(X_train)
X_train = pca.transform(X_train)
X_test = pca.transform(X_test)

gnb = sklearn.naive_bayes.GaussianNB()
y_pred = gnb.fit(X_train, y_train).predict(X_test)

print(accuracy_score(y_test, y_pred))
# Author's observation: after PCA the accuracy fell to roughly 80%, presumably
# because keeping only two components discards too much information.
# NOTE(review): PCA runs on unscaled features here; standardizing first may
# retain more class information — confirm.

0.7962962962962963
