# **1. Loading the Training dataset:**

In [None]:
import os
import cv2
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
import math
from sklearn.svm import SVC
from sklearn.metrics import classification_report

In [None]:
# Dataset root and the common size every image is resized to
path = "training"
width = height = 256
dimension = (width, height)

# Running integer class id; starts at zero
encoding = 0

# Accumulators: class names, image arrays, and the integer class id per image
label, data, encoded_labels = [], [], []

In [None]:
# Walk the dataset: every directory below `path` is treated as one class and
# every readable file inside it as one grayscale sample of that class.
for root, directory, files in os.walk(path):

    # Sort in place so os.walk descends into the class folders in a
    # reproducible order (its native order depends on the filesystem).
    directory.sort()

    # example of the root drive/MyDrive/training/bedroom
    if root == path:
        continue

    for fp in sorted(files):
        if fp == ".DS_Store":
            continue

        # example of the filepath drive/MyDrive/training/bedroom/0.jpg
        filepath = os.path.join(root, fp)

        # Images are of different shapes
        img_array = cv2.imread(filepath, cv2.IMREAD_GRAYSCALE)

        # cv2.imread returns None (it does not raise) when a file cannot be
        # decoded; skip it instead of crashing inside cv2.resize.
        if img_array is None:
            print(f"Skipping unreadable file: {filepath}")
            continue

        # All images have the dimension (256, 256) => (width, height)
        img = cv2.resize(img_array, dimension)
        data.append(img)
        encoded_labels.append(encoding)

    # Getting the label. Names are appended in the same order the encodings
    # are handed out and are NOT passed through set(), which would scramble
    # them and break the name <-> encoding correspondence.
    label.append(os.path.basename(root))
    encoding += 1

In [None]:
# Turn the accumulated Python lists into arrays.
# X holds all 1500 images      -> shape (1500, 256, 256)
# y holds the label of each X  -> shape (1500,)
X, y = np.array(data), np.array(encoded_labels)
print(X.shape)
print(y.shape)

(1500, 256, 256)
(1500,)


# 2. Splitting the Data into a Training set and a Validation set:

In [None]:
# 90% for the Training set and 10% for the Validation set.
# stratify=y keeps the class proportions identical in both subsets, so no
# class ends up under-represented in the small validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.10, random_state=42, stratify=y
)

# Shape of X_train = (1350, 256, 256)
# Shape of y_train = (1350,)
# Shape of X_val = (150, 256, 256)
# Shape of y_val = (150,)

# 3. Using only the Training set to create the Vocabulary:

## **3.1. Visual Feature extraction using Dense SIFT:**

In [None]:
def sift_features(X, step_size=8):
    """Compute dense SIFT descriptors on a regular grid for every image in X.

    Dense SIFT places key points over the entire image (including
    uninformative regions) instead of detecting them.

    Args:
        X: sequence of 2-D grayscale images, e.g. an array of shape
            (n_images, height, width).
        step_size: spacing in pixels between neighbouring grid key points;
            also used as the key-point size. For 256x256 images the default
            of 8 yields 32 * 32 = 1024 key points per image.

    Returns:
        A tuple (key_points, dense_descriptors) of two lists with one entry
        per image: key_points[i] is a tuple of cv2.KeyPoint objects and
        dense_descriptors[i] is the descriptor array returned by
        sift.compute for image i (one 128-value row per key point).
    """
    key_points = []
    dense_descriptors = []

    # Nothing to do for an empty input; avoids creating the extractor at all
    if len(X) == 0:
        return key_points, dense_descriptors

    # SIFT is part of the main OpenCV module since 4.4; older builds only
    # provide it through the contrib package (cv2.xfeatures2d).
    if hasattr(cv2, "SIFT_create"):
        sift = cv2.SIFT_create()
    else:
        sift = cv2.xfeatures2d.SIFT_create()

    for image in X:
        # Size the grid from THIS image (rows = height, cols = width) rather
        # than from the first two images of the batch.
        rows, cols = np.shape(image)[:2]

        # cv2.KeyPoint takes (x, y, size): x runs over columns, y over rows
        grid = [
            cv2.KeyPoint(float(x), float(y), float(step_size))
            for x in range(0, cols, step_size)
            for y in range(0, rows, step_size)
        ]

        # List of key-points for each image
        key_points.append(tuple(grid))

        # One descriptor row per grid key point
        _, dense_features = sift.compute(image, grid)
        dense_descriptors.append(dense_features)

    return key_points, dense_descriptors

In [None]:
# Dense SIFT key points and descriptors for every image of the training set
key_points_train, dense_descriptors_train = sift_features(X_train)

In [None]:
# Sanity check of the extraction output for the training set.
# Key points: number of images, container type, key points in the first image and their container type
print(len(key_points_train))
print(type(key_points_train))
print(len(key_points_train[0]))
print(type(key_points_train[0]))
print()
print()

# Descriptors: number of images, container type, shape and type of the first image's descriptors
print(len(dense_descriptors_train))
print(type(dense_descriptors_train))
print(dense_descriptors_train[0].shape)
print(type(dense_descriptors_train[0]))

1350
<class 'list'>
1024
<class 'tuple'>


1350
<class 'list'>
(1024, 128)
<class 'numpy.ndarray'>


## **3.2. Creating the Bag of Visual Words (BoVW) or the Vocabulary:**

### 3.2.1. Creating all the Descriptors for the data:

In [None]:
def all_descriptors(dense_descriptors):
    """Flatten per-image descriptor collections into one flat list.

    Images are visited in order and, within each image, its feature vectors
    in order; the information about which image a vector came from is dropped.
    """
    return [
        feature_vector
        for image_descriptor in dense_descriptors
        for feature_vector in image_descriptor
    ]

In [None]:
# Pool the descriptors of all training images into one flat list
list_of_descriptor_train = all_descriptors(dense_descriptors_train)

In [None]:
# Total number of pooled descriptors and the container type
print(len(list_of_descriptor_train))
print(type(list_of_descriptor_train))
print()

# Length and type of a single descriptor
print(len(list_of_descriptor_train[0]))
print(type(list_of_descriptor_train[0]))

1382400
<class 'list'>

128
<class 'numpy.ndarray'>


In [None]:
# Peek at the first two descriptors: at this point each one is still a separate np.array
print(list_of_descriptor_train[:2])

# k-means only cares about the feature vectors themselves, not about which
# image each one came from, so they are combined into a single array.
# The shape of the result is (1350*1024, 128)
list_of_descriptor_train = np.stack(list_of_descriptor_train, axis=0)
print(list_of_descriptor_train.shape)

[array([  0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   1.,   1.,   1.,   0.,   0.,   0.,   0.,   2.,  10.,
         6.,   2.,   0.,   0.,   0.,   2.,   2.,   5.,   3.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         1.,  21.,   5.,   1.,   0.,   0.,   0.,   0.,   8., 147.,  38.,
         5.,   0.,   0.,   1.,   4.,  15., 108.,  23.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   4.,  32.,   1.,   0.,  10.,
       167.,   7.,   5.,  35., 106.,  30.,   7., 151., 226.,  56.,   3.,
        37.,  46.,  12.,  17., 226., 226.,  24.], dtype=float32), array([  0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   

### 3.2.2. Using k-means to find the Vocabulary:

In [None]:
# Cluster all training descriptors with k-means so that similar descriptors
# end up in the same group; the 200 cluster centres form the vocabulary.

k_means = KMeans(n_clusters=200, n_init=1, random_state=0, verbose=0)
vocabulary = k_means.fit(list_of_descriptor_train).cluster_centers_

In [None]:
# This is our vocabulary for the training data
# Inspect it: number of visual words, container type, and array shape
print(len(vocabulary))
print(type(vocabulary))
print(vocabulary.shape)

200
<class 'numpy.ndarray'>
(200, 128)


### 3.2.3. Creating a mapping between cluster centroids (vocabulary) and the descriptors:

In [None]:
def mapping(dense_descriptors, vocab=None):
    """Map every descriptor to the index of its nearest visual word.

    After this step an image is represented by a sequence of vocabulary
    indices rather than by raw descriptors.

    Args:
        dense_descriptors: iterable with one 2-D array per image, shaped
            (n_descriptors, n_features).
        vocab: 2-D array (n_words, n_features) of cluster centres. Defaults
            to the module-level `vocabulary`, whatever its size.

    Returns:
        A list with one 1-D integer array per image; entry j is the index of
        the vocabulary row closest (Euclidean distance) to descriptor j. On
        exact ties the lowest index wins.
    """
    if vocab is None:
        vocab = vocabulary

    # float64 keeps the expanded distance formula below numerically stable
    vocab = np.asarray(vocab, dtype=np.float64)
    vocab_sq_norms = (vocab ** 2).sum(axis=1)

    visual_words = []
    for image_descriptors in dense_descriptors:
        descriptors = np.asarray(image_descriptors, dtype=np.float64)

        # ||d - v||^2 = ||d||^2 - 2 d.v + ||v||^2. The ||d||^2 term is the
        # same for every word and the square root is monotonic, so neither
        # changes which word is nearest; both are omitted.
        # Shape: (n_descriptors, n_words)
        scores = vocab_sq_norms[np.newaxis, :] - 2.0 * (descriptors @ vocab.T)

        # For each descriptor, the cluster index with the smallest distance
        visual_words.append(np.argmin(scores, axis=1))

    return visual_words

In [None]:
# For every training image, the vocabulary index assigned to each of its descriptors
visual_words = mapping(dense_descriptors_train)

In [None]:
# The variable visual_words represents the mapping to the vocabulary
# Number of images and the container type
print(len(visual_words))
print(type(visual_words))
print()

# Number of assigned indices for the first image, their container type, and the indices themselves
print(len(visual_words[0]))
print(type(visual_words[0]))
print(visual_words[0])

1350
<class 'list'>

1024
<class 'numpy.ndarray'>
[136 101  22 ...  95  95  95]


### 3.2.4. Counting the vocabulary in the image:

In [None]:
def histogram(visual_words, n_clusters=200):
    """Convert per-image visual-word indices into word-frequency histograms.

    Args:
        visual_words: iterable with one sequence of vocabulary indices per
            image.
        n_clusters: size of the vocabulary, i.e. the number of histogram
            bins. Defaults to 200.

    Returns:
        A list with one float array of shape (n_clusters,) per image; entry k
        counts how many times visual word k appears in that image.

    Raises:
        IndexError: if an index lies outside [0, n_clusters).
    """
    frequency_vec = []

    for image_visual_words in visual_words:
        indices = np.asarray(image_visual_words, dtype=np.intp)

        # Reject out-of-range indices up front; np.bincount would otherwise
        # silently return a histogram longer than n_clusters.
        if indices.size and (indices.min() < 0 or indices.max() >= n_clusters):
            raise IndexError(
                "visual word index out of range for a vocabulary of size "
                + str(n_clusters)
            )

        # Count the occurrences of every vocabulary index in one pass; cast
        # to float so the histograms are float arrays.
        image_frequency = np.bincount(indices, minlength=n_clusters).astype(np.float64)
        frequency_vec.append(image_frequency)

    return frequency_vec

In [None]:
# One visual-word frequency histogram per training image
frequency_vec = histogram(visual_words)

In [None]:
# Number of histograms and the container type, then length and type of the first histogram
print(len(frequency_vec))
print(type(frequency_vec))
print()
print(len(frequency_vec[0]))
print(type(frequency_vec[0]))

1350
<class 'list'>

200
<class 'numpy.ndarray'>


## 3.3. Training Data to feed the Classifier

In [None]:
# Stack the per-image histograms into one 2-D array (one row per training image)
train_data = np.stack(frequency_vec)

In [None]:
# Shape and type of the training matrix
print(train_data.shape)
print(type(train_data))

(1350, 200)
<class 'numpy.ndarray'>


# 4. Using the functions to create feature vectors for the Validation set:

## 4.1. Extracting Dense SIFT Features:

In [None]:
# Dense SIFT key points and descriptors for every image of the validation set
key_points_val, dense_descriptors_val = sift_features(X_val)

In [None]:
# Sanity check of the extraction output for the validation set.
# Key points: number of images, container type, key points in the first image and their container type
print(len(key_points_val))
print(type(key_points_val))
print(len(key_points_val[0]))
print(type(key_points_val[0]))
print()
print()

# Descriptors: number of images, container type, shape and type of the first image's descriptors
print(len(dense_descriptors_val))
print(type(dense_descriptors_val))
print(dense_descriptors_val[0].shape)
print(type(dense_descriptors_val[0]))

150
<class 'list'>
1024
<class 'tuple'>


150
<class 'list'>
(1024, 128)
<class 'numpy.ndarray'>


## 4.2. Mapping the features:

In [None]:
# Map the validation descriptors with the same mapping function used for the training set
visual_words_val = mapping(dense_descriptors_val)

In [None]:
# The variable visual_words_val represents the mapping of the validation set to the vocabulary
# Number of images and the container type
print(len(visual_words_val))
print(type(visual_words_val))
print()

# Number of assigned indices for the first image, their container type, and the indices themselves
print(len(visual_words_val[0]))
print(type(visual_words_val[0]))
print(visual_words_val[0])

150
<class 'list'>

1024
<class 'numpy.ndarray'>
[ 34  34  34 ... 129 129 129]


## 4.3. Creating the Histograms:

In [None]:
# Validation data input: one visual-word frequency histogram per validation image
frequency_vec_val = histogram(visual_words_val)

In [None]:
# Number of histograms and the container type, then length and type of the first histogram
print(len(frequency_vec_val))
print(type(frequency_vec_val))
print()
print(len(frequency_vec_val[0]))
print(type(frequency_vec_val[0]))

150
<class 'list'>

200
<class 'numpy.ndarray'>


## 4.4. Validation Data to feed the Classifier:

In [None]:
# Stack the per-image histograms into one 2-D array (one row per validation image)
val_data = np.stack(frequency_vec_val)

In [None]:
# Shape and type of the validation matrix
print(val_data.shape)
print(type(val_data))

(150, 200)
<class 'numpy.ndarray'>


# 5. Classification using SVM:

In [None]:
# Training the SVM
# A polynomial-kernel SVM of degree 8, fitted on the training histograms and their labels.
# NOTE(review): the histograms are raw counts fed in without any scaling, and the
# report printed further down shows 0.27 validation accuracy — a lower degree
# and/or normalised histograms may work better; confirm by experiment.
svc = SVC(kernel = "poly", degree = 8)
svc.fit(train_data, y_train)

SVC(degree=8, kernel='poly')

In [None]:
# Predicted class id for every validation histogram
predictions = svc.predict(val_data)

In [None]:
# Per-class precision / recall / f1 of the predictions against the true validation labels
print(classification_report(y_val, predictions))

              precision    recall  f1-score   support

           0       1.00      0.67      0.80        12
           1       0.00      0.00      0.00         6
           2       0.00      0.00      0.00        10
           3       1.00      0.40      0.57        10
           4       0.92      0.60      0.73        20
           5       0.50      0.17      0.25         6
           6       0.00      0.00      0.00         8
           7       0.00      0.00      0.00        11
           8       1.00      0.25      0.40         8
           9       0.08      1.00      0.14         8
          10       1.00      0.09      0.17        11
          11       1.00      0.27      0.43        11
          12       0.00      0.00      0.00        12
          13       0.25      0.12      0.17         8
          14       0.25      0.11      0.15         9

    accuracy                           0.27       150
   macro avg       0.47      0.25      0.25       150
weighted avg       0.52   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
