<a href="https://colab.research.google.com/github/robingenz/htwg-machine-learning-exercises/blob/main/exercises/02_Linear_Regression/03_naive_bayes_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Naiver Bayesklassifikator

## Setup

### Einbinden von Paketen

In [94]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
import skimage
import math
%matplotlib inline

np.__version__, pd.__version__, sklearn.__version__, skimage.__version__

('1.23.4', '1.5.0', '1.1.3', '0.19.3')

In [54]:
%load_ext version_information
%version_information

The version_information extension is already loaded. To reload it, use:
  %reload_ext version_information


Software,Version
Python,3.10.8 64bit [GCC 10.2.1 20210110]
IPython,8.6.0
OS,Linux 5.10.104 linuxkit aarch64 with glibc2.31
Sun Nov 13 22:45:07 2022 UTC,Sun Nov 13 22:45:07 2022 UTC


## Aufgabe

In [55]:
import os
from urllib.request import urlretrieve
import tarfile

tgz_file_path = 'data/lfw-funneled.tgz'
data_dir_path = 'data/lfw_funneled'

# Herunterladen des Datensatzes
if not os.path.isfile(tgz_file_path):
    print("Downloading...")
    urlretrieve('http://vis-www.cs.umass.edu/lfw/lfw-funneled.tgz', filename = tgz_file_path)
    print("Download finished.")

dateDownloaded = !date #Calling Linux
print(dateDownloaded)

# Entpacken des Datensatzes
if not os.path.isdir(data_dir_path):
    print("Extracting...")
    tar = tarfile.open(tgz_file_path, 'r:gz')
    tar.extractall(path='data/')
    tar.close()
    print("Extract finished.")

['Sun Nov 13 22:45:07 UTC 2022']


Finden von Personen mit **70 oder mehr** Bildern:

In [71]:
person_dir_paths = []
for subdir, _, files in os.walk(data_dir_path):
    if len(files) >= 70:
        person_dir_paths.append(subdir)

print('Found', len(person_dir_paths), 'persons with ≥ 70 images.')

Found 7 persons with ≥ 70 images.


Datensatz in 60% Trainings- und 40% Testbilder aufteilen.

In [57]:
from os import listdir
from sklearn.model_selection import train_test_split

image_paths = []
image_name_indexes = []

for i, person_dir_path in enumerate(person_dir_paths):
    files = sorted(listdir(person_dir_path))
    for file in files:
        image_name_indexes.append(i)
        image_paths.append(f'{person_dir_path}/{file}')

# x_training_image_path -> image paths
# y_training_name_index -> 0,1,2,3,4,5,6
# x_test_image_path -> image paths
# y_test_name_index -> 0,1,2,3,4,5,6
x_training_image_path, x_test_image_path, y_training_name_index, y_test_name_index = train_test_split(image_paths, image_name_indexes, test_size=0.4, random_state=0)

print('Splitted into', len(x_test_image_path), 'test images and', len(x_training_image_path), 'training images.')

Splitted into 516 test images and 772 training images.


Vorverarbeitung der Bilder:

In [106]:
from skimage import io, util, transform

def show_image(image):
    io.imshow(image) 
    io.show()

def load_image_from_path(path, as_gray=True):
    image = io.imread(path, as_gray=as_gray)
    return image

def transform_image(image):
    image = util.crop(image, ((100,70), (80,80))) # Values are based on first image of Hugo_Chavez
    image = transform.resize(image, (32,32))
    # show_image(image)
    image = np.hstack(image)
    return image

def create_image_stack(image):
    stack = np.hstack(image)
    return stack

x_training_images = []
x_training_stacks = []
for training_image_path in x_training_image_path:
    image = load_image_from_path(training_image_path)
    x_training_images.append(image)
    image = transform_image(image)
    stack = create_image_stack(image)
    x_training_stacks.append(stack)
x_training_matrix = np.asarray(x_training_stacks)

x_test_images = []
x_test_stacks = []
for test_image_path in x_test_image_path:
    image = load_image_from_path(test_image_path)
    x_test_images.append(image)
    image = transform_image(image)
    stack = create_image_stack(image)
    x_test_stacks.append(stack)
x_test_matrix = np.asarray(x_test_stacks)

Anwendung der **Hauptkomponentenanalyse**:

In [59]:
from sklearn import decomposition

pca = decomposition.PCA(n_components=7, whiten=True)
pca.fit(x_training_matrix)

x_training_projections = pca.transform(x_training_matrix)
x_test_projections = pca.transform(x_test_matrix)

In [88]:
def label_indexes(indexes):
    return [1 if index == 3 else -1 for index in indexes] # George_W_Bush is index 3

def evaluate_prediction(prediction_indexes, observation_indexes):
    prediction_labels = label_indexes(prediction_indexes)
    observation_labels = label_indexes(observation_indexes)

    true_positives = 0
    true_negatives = 0
    false_negatives = 0
    false_positives = 0

    for i in range(len(prediction_indexes)):
        prediction_label = prediction_labels[i]
        observation_label = observation_labels[i]
        if prediction_label == 1 and observation_label == 1:
            true_positives += 1
        if prediction_label == 1 and observation_label == -1:
            false_positives += 1
        if prediction_label == -1 and observation_label == -1:
            true_negatives += 1
        if prediction_label == -1 and observation_label == 1:
            false_negatives += 1

    print('True Positives Rate:', round(100 * true_positives / (true_positives + false_negatives)), '%')
    print('False Negatives Rate:', round(100 * false_negatives / (true_positives + false_negatives)), '%')
    print('True Negatives Rate:', round(100 * true_negatives / (true_negatives + false_positives)), '%')
    print('False Positives Rate:', round(100 * false_positives / (false_positives + true_negatives)), '%')


In [89]:
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()

# Test data
print('Test data')
y_test_prediction = gnb.fit(x_training_projections, y_training_name_index).predict(x_test_projections)
evaluate_prediction(y_test_prediction, y_test_name_index)
print('\n')

# Training data
print('Training data')
y_train_prediction = gnb.fit(x_training_projections, y_training_name_index).predict(x_training_projections)
evaluate_prediction(y_train_prediction, y_training_name_index)

Test data
True Positives Rate: 81 %
False Negatives Rate: 19 %
True Negatives Rate: 49 %
False Positives Rate: 51 %


Training data
True Positives Rate: 84 %
False Negatives Rate: 16 %
True Negatives Rate: 53 %
False Positives Rate: 47 %


In [126]:
# See: https://github.com/tigju/Naive-Bayes-Classifier-from-scratch/blob/main/naive_bayes.ipynb
class NaiveBayesClassifier():

    def calc_prior(self, features, target):
        self.prior = (features.groupby(target).apply(lambda x: len(x)) / self.rows).to_numpy()
        return self.prior

    def calc_statistics(self, features, targets):
        self.mean = features.groupby(targets).apply(np.mean).to_numpy()
        self.var = features.groupby(targets).apply(np.var).to_numpy()
        return self.mean, self.var

    def gaussian_density(self, class_idx, x):
        mean = self.mean[class_idx]
        var = self.var[class_idx]
        numerator = np.exp((-1/2)*((x-mean)**2) / (2 * var))
        denominator = np.sqrt(2 * np.pi * var)
        prob = numerator / denominator
        return prob

    def calc_posterior(self, x):
        posteriors = []
        for i in range(self.count):
            prior = np.log(self.prior[i])
            conditional = np.sum(np.log(self.gaussian_density(i, x)))
            posterior = prior + conditional
            posteriors.append(posterior)
        return self.classes[np.argmax(posteriors)]

    def fit(self, features, targets):
        self.classes = np.unique(targets)
        self.count = len(self.classes)
        self.feature_nums = features.shape[1]
        self.rows = features.shape[0]
        self.calc_statistics(features, targets)
        self.calc_prior(features, targets)

    def predict(self, features):
        preds = [self.calc_posterior(f) for f in features.to_numpy()]
        return preds

nbc = NaiveBayesClassifier()

# Test data
print('Test data')
x_training_projections_df = pd.DataFrame(x_training_projections)
y_training_name_index_df = pd.DataFrame(y_training_name_index)[0]
x_test_projections_df = pd.DataFrame(x_test_projections)

nbc.fit(x_training_projections_df, y_training_name_index_df)
y_test_prediction = nbc.predict(x_test_projections_df)
evaluate_prediction(y_test_prediction, y_test_name_index)
print('\n')

# Training data
print('Training data')
x_training_projections_df = pd.DataFrame(x_training_projections)
y_training_name_index_df = pd.DataFrame(y_training_name_index)[0]
x_training_projections_df = pd.DataFrame(x_training_projections)

nbc.fit(x_training_projections_df, y_training_name_index_df)
y_training_prediction = nbc.predict(x_training_projections_df)
evaluate_prediction(y_training_prediction, y_training_name_index)

Test data
True Positives Rate: 86 %
False Negatives Rate: 14 %
True Negatives Rate: 43 %
False Positives Rate: 57 %


Training data
True Positives Rate: 88 %
False Negatives Rate: 12 %
True Negatives Rate: 45 %
False Positives Rate: 55 %


  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
