In [None]:
import os
from glob import glob
from random import sample, choice

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sn

# import cv2
from PIL import Image

from skimage.color import rgb2gray
from skimage.feature import hog
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC


In [None]:
# grab filepaths
root_path = "./Intel Training Dataset/"

# One subfolder per class label. Only directories are kept so that a stray file in
# the dataset root does not turn into a class; sorting makes the position of each
# folder in this list stable between runs.
subfolders = sorted(p for p in glob(root_path + "*") if os.path.isdir(p))

# Folder name = human-readable class name. os.path handles both "/" and "\\"
# separators, which a plain split("/") does not.
label_names = [os.path.basename(os.path.normpath(p)) for p in subfolders]


In [None]:
# Get image data

# Column-oriented store: each key maps to a list that gets one entry per
# processed image.
data = {key: [] for key in ("image_labels", "image_filenames", "feature_vectors")}


def build_feature_vector(img, grayscale=False):
    """Build one feature vector: a HOG descriptor followed by a 16-bin intensity histogram.

    Args:
        img: Image as a PIL image or array-like. Assumed to be H x W x 3 (RGB)
            when ``grayscale`` is False and H x W when it is True -- TODO confirm
            against callers.
        grayscale: True if ``img`` is already single-channel; the HOG is then
            computed without a channel axis and no RGB->gray conversion is done.

    Returns:
        1-D ``numpy.ndarray``: the HOG features with the 16 histogram densities
        appended. The length of the HOG part depends on the image size.
    """
    # Convert once so hog(), rgb2gray() and np.histogram all see the same ndarray.
    pixels = np.asarray(img)

    hog_features = hog(
        pixels,
        orientations=10,
        pixels_per_cell=(15, 15),
        cells_per_block=(1, 1),
        block_norm="L2",
        channel_axis=None if grayscale else -1,
    )

    intensity = pixels if grayscale else rgb2gray(pixels)
    # Bring integer images onto the same [0, 1] scale as float images.
    # NOTE(review): float input is assumed to already lie in [0, 1]; values
    # outside that range are ignored by the histogram below.
    if np.issubdtype(intensity.dtype, np.integer):
        intensity = intensity / np.iinfo(intensity.dtype).max

    # Fixed bin edges: without an explicit range np.histogram spans each image's
    # own min..max, so the same bin would mean a different brightness in every
    # image and the histogram features would not be comparable across samples.
    intensity_hist = np.histogram(intensity, bins=16, range=(0.0, 1.0), density=True)[0]

    return np.hstack((hog_features, intensity_hist))


# Walk every class folder and append one feature vector per image variant
# (original + transposed copy) to `data`. The folder's position in `subfolders`
# is used as the integer class label.
for i, subfolder in enumerate(subfolders):
    # get list of file paths for each subfolder
    file_paths = sorted(glob(subfolder + "/*.jpg"))
    for f in file_paths:
        # TODO: switch to cv2 for speed(?)
        # TODO: decide on whether to resize image, e.g. .resize((100, 100)).
        # NOTE(review): the HOG length depends on the image size, so images of
        # different sizes produce feature vectors of different lengths.

        # Close the file as soon as the pixels are loaded; convert("RGB") returns a
        # loaded copy and guarantees the 3 channels the colour HOG expects.
        with Image.open(f) as img_file:
            img_base = img_file.convert("RGB")
        # Transposed copy (rows <-> columns) used as a simple augmentation.
        img_transposed = img_base.transpose(Image.TRANSPOSE)

        # TODO: decide on whether or not to use grayscale images in dataset
        # (both variants below are colour images, hence grayscale=False).
        for img in (img_base, img_transposed):
            feature_vector = build_feature_vector(img, grayscale=False)
            data["feature_vectors"].append(feature_vector)
            data["image_filenames"].append(f)
            data["image_labels"].append(i)


In [None]:
# Use principal component analysis (PCA) to compress each feature vector down to
# 50 components.
# NOTE(review): the PCA is fitted on every sample, i.e. before the train/test
# split further down, so test images influence the projection. Consider moving
# it into the model pipeline so it is fitted on training data only.
# random_state pins the randomized SVD solver so reruns give the same projection.
pca = PCA(n_components=50, random_state=0)
X = np.array(data["feature_vectors"])
X = pca.fit(X).transform(X)
data["feature_vectors"] = list(X)

In [None]:
# Snapshot the collected labels, filenames and feature vectors as a DataFrame
# and write it to disk.
df = pd.DataFrame(data)
df.to_pickle("feature_backup.pkl")


In [None]:
# Reload the feature table from the pickle written by the previous cell.
# Unpickling can execute arbitrary code, so only load a file produced by this
# notebook, never one from an untrusted source.
df = pd.read_pickle("feature_backup.pkl")

In [None]:
# Show one randomly chosen sample next to a rendering of its HOG descriptor.
random_ix = choice(range(len(df)))

# .iloc is positional, so this keeps working if df ever gets a non-default index.
# The file is closed as soon as the pixels are loaded; convert("RGB") guarantees
# the channel axis that hog(..., channel_axis=-1) expects.
with Image.open(df["image_filenames"].iloc[random_ix]) as img_file:
    img = img_file.convert("RGB")

# Same HOG settings as build_feature_vector (keep the two in sync);
# visualize=True additionally returns an image of the oriented-gradient cells.
_, hog_image = hog(
    np.asarray(img),
    orientations=10,
    pixels_per_cell=(15, 15),
    cells_per_block=(1, 1),
    block_norm="L2",
    channel_axis=-1,
    visualize=True,
)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(8, 4), sharex=True, sharey=True)

ax1.axis("off")
ax1.imshow(img)
ax1.set_title("Input image")

ax2.axis("off")
ax2.imshow(hog_image, cmap=plt.cm.gray)
ax2.set_title("Histogram of Oriented Gradients")
plt.show()

print("feature vector size:", df["feature_vectors"].iloc[random_ix].shape)


In [None]:
# train/validation split
feature_vectors = np.vstack(df["feature_vectors"])
labels = df["image_labels"]

# Every image file contributes two rows (original + transposed copy). Splitting
# row by row could put one copy in train and the other in test, which leaks
# near-duplicates into the evaluation. Split the unique files instead, stratified
# by class, and send all rows of a file to the same side.
unique_files = df.drop_duplicates("image_filenames")
train_files, test_files = train_test_split(
    unique_files["image_filenames"],
    test_size=0.2,
    stratify=unique_files["image_labels"],
    random_state=0,
)
is_test = df["image_filenames"].isin(test_files).to_numpy()

X_train, X_test = feature_vectors[~is_test], feature_vectors[is_test]
y_train, y_test = labels[~is_test], labels[is_test]
print(X_train.shape, y_train.shape)

In [None]:
# fit a single classifier: standardise the features, then an RBF-kernel SVM

clf = make_pipeline(
    StandardScaler(),
    SVC(C=0.5, kernel="rbf", gamma="scale"),
)
clf.fit(X_train, y_train)

In [None]:
# report accuracy

train_pred = clf.predict(X_train)
test_pred = clf.predict(X_test)

# Integer labels are the positions of the class folders, so id k corresponds to
# label_names[k]. Passing the ids explicitly keeps report rows and matrix axes
# aligned with label_names even if a class is absent from one of the splits.
class_ids = list(range(len(label_names)))

print("Overall train accuracy:")
print(classification_report(y_train, train_pred, labels=class_ids, target_names=label_names))
print("Overall test accuracy:")
print(classification_report(y_test, test_pred, labels=class_ids, target_names=label_names))

# Rows are true classes, columns are predicted classes.
C = confusion_matrix(y_test, test_pred, labels=class_ids)
# fmt="d" prints plain integer counts; the default ".2g" switches to scientific
# notation for counts of 100 or more.
ax = sn.heatmap(
    C, annot=True, fmt="d", cmap="Blues", xticklabels=label_names, yticklabels=label_names
)
ax.set_xlabel("Predicted label")
ax.set_ylabel("True label")
plt.show()