In [1]:
import os
import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

## Initiate varibles

In [46]:
k = 50 # number of representative vectors
feature_type = 'spatial_pyramid' # [
'dense_sift', 'pyramid_dense_sift', 'spatial_pyramid']
sift_step_size = 5
num_level = 2

## 1. Dataset preparation

In [47]:
data = {'img_path': [], 'label': []}

for root, dirs, files in os.walk("./training/"):
    label = os.path.basename(root)
    for file in files:
        if file.endswith('.jpg'):
            data['img_path'].append(os.path.join(root, file))
            data['label'].append(label)

df_data = pd.DataFrame(data)

In [48]:
df_data.head()

Unnamed: 0,img_path,label
0,./training/Forest/63.jpg,Forest
1,./training/Forest/77.jpg,Forest
2,./training/Forest/88.jpg,Forest
3,./training/Forest/89.jpg,Forest
4,./training/Forest/76.jpg,Forest


## 2. Feature extraction

In [49]:
# SIFT Visual words
def sift(image):
    # construct a SIFT object 
    sift = cv2.SIFT_create()
    # find keypoints and descriptors
    kp, des = sift.detectAndCompute(image, None)
    
    return des

def dense_sift(image, step_size=5):
    # read image from img_path then convert to gray scale
    # construct a SIFT object 
    sift = cv2.SIFT_create()
    # create dense keypoints and compute descriptors
    kp = [cv2.KeyPoint(x, y, step_size) for x in range(0, image.shape[0], step_size) 
                                        for y in range(0, image.shape[1], step_size)]
    kp, des = sift.compute(image, kp)
    return des

## 3. Bag of visual words

In [50]:
visual_words = []

for i, row in df_data.iterrows():
    # read image from img_path then convert to gray scale
    image = cv2.imread(row['img_path'])
    image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    # get descriptors for each image
    visual_words.append(dense_sift(image, sift_step_size))
    
df_data['visual_words'] = visual_words

In [51]:
df_data.head()

Unnamed: 0,img_path,label,visual_words
0,./training/Forest/63.jpg,Forest,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
1,./training/Forest/77.jpg,Forest,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
2,./training/Forest/88.jpg,Forest,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
3,./training/Forest/89.jpg,Forest,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
4,./training/Forest/76.jpg,Forest,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."


### Create representation vectors - codebook

In [52]:
# preparing bag of visual words
BoVW = df_data['visual_words'].to_list()
BoVW = np.vstack(BoVW)

print('bag of visual words size:', BoVW.shape)

bag of visual words size: (4060424, 128)


In [53]:
# K-Means clustering
criteria = (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER, 10, 1.0)
flags = cv2.KMEANS_RANDOM_CENTERS
compactness, labels, centres = cv2.kmeans(BoVW, k, None, criteria, 10, flags)

### Histograms of bags of visual words

In [54]:
def pyramid_dense_sift(image, step_size=5, num_level=2):
    # construct a SIFT object 
    sift = cv2.SIFT_create()
    all_des = []
    for l in range(num_level):
        # create dense keypoints and compute descriptors
        kp = [cv2.KeyPoint(x, y, step_size) for x in range(0, image.shape[0], step_size) 
                                            for y in range(0, image.shape[1], step_size)]
        kp, des = sift.compute(image, kp)
        all_des.append(des)
        
        # resample image to the next level
        image = cv2.pyrDown(image)
    return np.vstack(all_des)

def spatial_pyramid(image, codebook, k, step_size=5, num_level=2):
    img_h, img_w = image.shape
    concat_his = []
    for l in range(num_level):
        num_grid = 2**l
        
        grid_size_w = img_w//num_grid
        grid_co_w = np.arange(0, img_w, grid_size_w) #coordinate of each grid
        
        grid_size_h = img_h//num_grid
        grid_co_h = np.arange(0, img_h, grid_size_h) #coordinate of each grid
        
        for i in range(num_grid):
            for j in range(num_grid):
                des = dense_sift(image[grid_co_h[j]:grid_co_h[j]+grid_size_h, 
                                       grid_co_w[i]:grid_co_w[i]+grid_size_w], step_size)

                his = histogram_bovw(des, codebook, k)
                his = his * (1/2**(num_level-l)) # weight
                concat_his.append(his)

    concat_his = np.array(concat_his).ravel()
    # normalizing
    concat_his = concat_his / concat_his.sum()
    return concat_his

In [55]:
def histogram_bovw(visual_words, codebook, k):
    his = np.zeros(k)
    for vw in visual_words:
        # find distance from vw to each representation vector (codebook)
        dist = np.power(np.power(np.tile(vw, (k, 1)) - codebook, 2).sum(axis=1), 0.5)
        # min distance
        min_codebook = dist.argsort()[0]
        # calculate histogram
        his[min_codebook] += 1
    return his

In [56]:
his_bovw = []

for i, row in df_data.iterrows():
    # read image from img_path then convert to gray scale
    image = cv2.imread(row['img_path'])
    image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    
    # dense SIFT
    if feature_type == 'dense_sift':
        his_bovw.append(histogram_bovw(dense_sift(image, sift_step_size), 
                                       centres, k))
    
    # pyramid dense SIFT
    elif feature_type == 'pyramid_dense_sift':
        his_bovw.append(histogram_bovw(pyramid_dense_sift(image, sift_step_size, 
                                                          num_level), centres, k))
        
    elif feature_type == 'spatial_pyramid':
        his_bovw.append(spatial_pyramid(image, centres, k, sift_step_size, num_level))
    
his_bovw = np.array(his_bovw)

In [57]:
his_bovw = np.array(his_bovw)
his_bovw.shape

(1500, 250)

## 4. Classifier

In [58]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, top_k_accuracy_score

In [59]:
X = his_bovw
y = np.array(df_data['label'].to_list())
print(X.shape, y.shape)

(1500, 250) (1500,)


In [60]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

### SVC (kernel: rbf)

In [61]:
clf = SVC(probability=True).fit(X_train, y_train)

In [62]:
y_pred = clf.predict(X_test)
print('accuracy:', accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

accuracy: 0.67
              precision    recall  f1-score   support

       Coast       0.65      0.81      0.72        16
      Forest       0.85      0.92      0.88        24
     Highway       0.87      0.72      0.79        18
  Insidecity       0.90      0.45      0.60        20
    Mountain       0.77      0.85      0.81        20
      Office       0.67      0.64      0.65        25
 OpenCountry       0.76      0.76      0.76        17
      Street       0.72      0.76      0.74        17
      Suburb       0.79      1.00      0.88        19
TallBuilding       0.71      0.91      0.80        22
     bedroom       0.55      0.35      0.43        17
  industrial       0.61      0.42      0.50        26
     kitchen       0.53      0.40      0.46        20
  livingroom       0.36      0.43      0.39        21
       store       0.44      0.67      0.53        18

    accuracy                           0.67       300
   macro avg       0.68      0.67      0.66       300
weighted av

In [63]:
# top k accuracy
top_k_accuracy_score(y_test, clf.predict_proba(X_test), k=2)

0.83

In [64]:
confusion_matrix(y_test, y_pred)

array([[13,  1,  0,  0,  1,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 1, 22,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 3,  0, 13,  0,  0,  0,  2,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  1,  1,  9,  1,  0,  0,  3,  0,  5,  0,  0,  0,  0,  0],
       [ 0,  2,  0,  0, 17,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0, 16,  0,  0,  1,  0,  2,  1,  2,  2,  1],
       [ 2,  0,  1,  0,  0,  0, 13,  1,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  1,  1,  0,  0, 13,  0,  2,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0, 19,  0,  0,  0,  0,  0,  0],
       [ 1,  0,  0,  0,  1,  0,  0,  0,  0, 20,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  1,  0,  0,  0,  0,  6,  4,  2,  3,  1],
       [ 0,  0,  0,  0,  0,  2,  0,  1,  2,  0,  0, 11,  0,  0, 10],
       [ 0,  0,  0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  8, 10,  1],
       [ 0,  0,  0,  0,  0,  3,  0,  0,  1,  0,  3,  2,  1,  9,  2],
       [ 0,  0,  0,  0,  0,  1,  0

### Hyperparameter tuning

In [65]:
from sklearn.model_selection import GridSearchCV

In [66]:
parameters = {'kernel':['rbf'], 'C':np.linspace(1, 10, 50)}
parameters = {'kernel':['rbf'], 'C':np.linspace(0.001, 10, 50), 'gamma': [0.1, 1.0, 10, 100]}
# parameters = {'C': [1, 5, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']}

In [67]:
grid_search = GridSearchCV(SVC(), parameters)
grid_search.fit(X_train, y_train)

In [68]:
grid_search.best_score_

0.6741666666666666

In [69]:
grid_search.best_params_

{'C': 1.0213061224489797, 'gamma': 100, 'kernel': 'rbf'}

In [70]:
y_pred = grid_search.predict(X_test)
print('accuracy:', accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

accuracy: 0.69
              precision    recall  f1-score   support

       Coast       0.65      0.81      0.72        16
      Forest       0.85      0.92      0.88        24
     Highway       0.88      0.78      0.82        18
  Insidecity       0.90      0.45      0.60        20
    Mountain       0.77      0.85      0.81        20
      Office       0.72      0.72      0.72        25
 OpenCountry       0.81      0.76      0.79        17
      Street       0.72      0.76      0.74        17
      Suburb       0.83      1.00      0.90        19
TallBuilding       0.71      0.91      0.80        22
     bedroom       0.64      0.41      0.50        17
  industrial       0.58      0.42      0.49        26
     kitchen       0.62      0.40      0.48        20
  livingroom       0.41      0.52      0.46        21
       store       0.46      0.67      0.55        18

    accuracy                           0.69       300
   macro avg       0.70      0.69      0.68       300
weighted av

### Multinomial Naive Bayes

In [71]:
from sklearn.naive_bayes import MultinomialNB, GaussianNB

In [79]:
clf = GaussianNB().fit(X_train, y_train)

In [80]:
y_pred = clf.predict(X_test)
print('accuracy:', accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

accuracy: 0.6066666666666667
              precision    recall  f1-score   support

       Coast       0.45      0.56      0.50        16
      Forest       0.86      0.79      0.83        24
     Highway       0.91      0.56      0.69        18
  Insidecity       0.75      0.60      0.67        20
    Mountain       0.65      0.75      0.70        20
      Office       0.65      0.60      0.63        25
 OpenCountry       0.50      0.35      0.41        17
      Street       0.62      0.76      0.68        17
      Suburb       0.94      0.84      0.89        19
TallBuilding       0.70      0.73      0.71        22
     bedroom       0.40      0.24      0.30        17
  industrial       0.57      0.50      0.53        26
     kitchen       0.53      0.50      0.51        20
  livingroom       0.32      0.62      0.42        21
       store       0.58      0.61      0.59        18

    accuracy                           0.61       300
   macro avg       0.63      0.60      0.60       3

In [81]:
# top k accuracy
top_k_accuracy_score(y_test, clf.predict_proba(X_test), k=2)

0.7766666666666666