In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn

from random import sample, shuffle
from skimage.feature import hog
from sklearn.datasets import fetch_olivetti_faces
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

In [None]:
# Load the Olivetti faces dataset via scikit-learn's fetch_olivetti_faces.
# NOTE: despite the `lfw_people` name this is not the Labeled Faces in the
# Wild (LFW) dataset; the name is kept because later cells refer to it.
lfw_people = fetch_olivetti_faces()

In [None]:
# Only samples whose target id is below this cutoff take part in the experiment.
num_classes = 20

# Boolean selector over all samples: True where target id < num_classes.
m = np.less(lfw_people.target, num_classes)

In [None]:
# Apply the same mask to the image data and the targets so they stay aligned.
images = lfw_people.images[m]
labels = lfw_people.target[m]

# Number of samples that survived the class filter.
num_images = labels.shape[0]

In [None]:
# Choose a small random subset of images to use as running examples.
subcount = 5

# Draw from a dedicated, seeded generator so that "Restart & Run All" always
# shows the same example images (seed 0 mirrors random_state=0 used for the
# train/test split). Sampling is without replacement, so indices are unique;
# like random.sample, this raises ValueError if subcount > num_images.
subset_rng = np.random.default_rng(0)
ex_ixs = sorted(subset_rng.choice(num_images, size=subcount, replace=False).tolist())

In [None]:
# Show the sampled faces side by side, titled with their index and label.
# squeeze=False makes plt.subplots always return a 2-D array of Axes; without
# it a 1x1 grid (subcount == 1) comes back as a bare Axes and indexing fails.
fig, axes_grid = plt.subplots(nrows=1, ncols=subcount, squeeze=False)
ax = axes_grid[0]  # the single row, as a 1-D array with one Axes per image
for panel, image_no in zip(ax, ex_ixs):
    panel.imshow(images[image_no], cmap='gray')
    panel.axis("off")
    panel.set_title(f"Img {image_no}\n Label {labels[image_no]}")

In [None]:
# Feature extraction: turn each raw image into a fixed-length feature vector.

# Example 1: luminance histogram.
num_bins = 25

# One row per image, one column per bin. Each histogram is computed over the
# value range 0-1 and normalised to a density (density=True), so images are
# comparable regardless of pixel count.
lumhist_data = np.array([
    np.histogram(face, bins=num_bins, range=(0, 1), density=True)[0]
    for face in images
])

# Plot the luminance histogram of each sampled image, one panel per image.
# squeeze=False makes plt.subplots always return a 2-D array of Axes; without
# it a 1x1 grid (subcount == 1) comes back as a bare Axes and indexing fails.
fig, axes_grid = plt.subplots(nrows=1, ncols=subcount, figsize=(13,3), squeeze=False)
ax = axes_grid[0]  # the single row, as a 1-D array with one Axes per image
for panel, image_no in zip(ax, ex_ixs):
    # x positions are bin indices (0 .. num_bins-1), not luminance values.
    panel.bar(range(num_bins), height=lumhist_data[image_no])
    # Shared y-range so the panels can be compared at a glance; any bar
    # taller than 5 is cut off at the top of the panel.
    panel.set_ylim([0,5])
    panel.set_title(f"Img {image_no}\n Label {labels[image_no]}")

In [None]:
# X_* hold the feature vectors, y_* the matching class labels.
# 20% of the samples are held out as the test set.
# stratify=labels preserves the per-class proportions in both splits, i.e.
# classes are shared between train and test rather than separated; the two
# printed label sets below let you confirm that.
X_train, X_test, y_train, y_test = train_test_split(
    lumhist_data,
    labels,
    test_size=0.2,
    stratify=labels,
    random_state=0,
)
print("train labels:", set(y_train))
print("test labels :", set(y_test))

# Fit a 3-nearest-neighbour classifier on the training features.
neigh = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)

# Mean accuracy on each split; a large train/test gap hints at overfitting.
test_accuracy = neigh.score(X_test, y_test)
train_accuracy = neigh.score(X_train, y_train)
print(f"Overall train accuracy: {train_accuracy}")
print(f"Overall test accuracy: {test_accuracy}")

In [None]:
# Predict the held-out samples and visualise the per-class errors.
y_pred = neigh.predict(X_test)

# Pin the row/column order to the class ids present in the filtered dataset.
# Without labels=, the order is inferred from y_test/y_pred, so a class that
# is missing from both would silently shift the tick positions.
class_ids = np.unique(labels)
C = confusion_matrix(y_test, y_pred, labels=class_ids)

# confusion_matrix puts true labels on rows and predicted labels on columns;
# label the axes so the figure can be read on its own.
tick_labels = class_ids.tolist()
cm_ax = sn.heatmap(C, annot=True, cmap="Blues", xticklabels=tick_labels, yticklabels=tick_labels)
cm_ax.set(xlabel="Predicted label", ylabel="True label", title="KNN confusion matrix (test set)");