# Assignment 5: Scene Recognition with Bag of Words

In [5]:
import numpy as np
import os
import glob
from sklearn.cluster import KMeans

## Question 4: bags of SIFT descriptors
### Question 4a: clustering SIFT descriptors with K-means 

In [8]:
def load(ds_path):
    """ Load descriptor file paths and class labels from a training/testing dataset.

    The dataset is expected to be laid out as ``<ds_path>/<class name>/<image>.txt``.

    Parameters
    ----------
    ds_path: path to the training/testing dataset.
             e.g., sift/train or sift/test

    Returns
    -------
    image_paths: a (n_sample,) array that contains the paths to the descriptors.
    labels: a (n_sample,) float array of class indices, one per path. The index of a
            class is its position in the alphabetically sorted list of class
            directories, so two datasets with the same class folders share the
            same label encoding.
    """
    # Grab a list of paths that matches the pathname
    files = glob.glob(os.path.join(ds_path, "*", "*.txt"))
    n_files = len(files)
    # Explicit dtype keeps an empty result a string array rather than float64.
    image_paths = np.asarray(files, dtype=str)

    # glob returns entries in arbitrary (filesystem) order, so sort them to get a
    # deterministic class -> index mapping, and ignore stray non-directory entries
    # that would otherwise shift the indices.
    class_dirs = sorted(
        entry for entry in glob.glob(os.path.join(ds_path, "*"))
        if os.path.isdir(entry)
    )
    class_index = {folder: idx for idx, folder in enumerate(class_dirs)}

    # Get class labels: the parent folder of each descriptor file is its class.
    labels = np.zeros(n_files)
    for i, path in enumerate(files):
        labels[i] = class_index[os.path.dirname(path)]

    # Randomize the order, applying the same permutation to paths and labels
    order = np.random.permutation(n_files)
    return image_paths[order], labels[order]

def build_vocabulary(image_paths, vocab_size):
    """ Sample SIFT descriptors, cluster them using k-means, and return the fitted k-means model.
    NOTE: We don't necessarily need to use the entire training dataset; a subset of
    the image paths can be passed into this function instead.

    Parameters
    ----------
    image_paths: an (n_image,) array of paths to descriptor files. Each file holds one
                 comma-separated row per keypoint: (x, y) followed by the SIFT descriptor.
    vocab_size: the number of clusters desired.

    Returns
    -------
    kmeans: the fitted k-means clustering model.

    Raises
    ------
    ValueError: if image_paths is empty or none of the files contains a descriptor.
    """
    n_image = len(image_paths)
    if n_image == 0:
        raise ValueError("image_paths is empty; cannot build a vocabulary")

    # Since want to sample tens of thousands of SIFT descriptors from different images, we
    # calculate the number of SIFT descriptors we need to sample from each image.
    n_each = int(np.ceil(10000 / n_image))

    # Collect the samples in a list instead of a pre-sized zero matrix: an image with
    # fewer than n_each descriptors would otherwise leave all-zero rows behind, and
    # those would be clustered as if they were real descriptors.
    sampled = []

    for path in image_paths:
        # Load features from each image; ndmin=2 keeps a single-keypoint file 2-D
        features = np.loadtxt(path, delimiter=',', dtype=float, ndmin=2)
        if features.size == 0:
            continue
        sift_descriptors = features[:, 2:]

        # Randomly sample (without replacement) up to n_each descriptors
        n_available = sift_descriptors.shape[0]
        n_take = min(n_each, n_available)
        picks = np.random.choice(n_available, size=n_take, replace=False)
        sampled.append(sift_descriptors[picks])

    if not sampled:
        raise ValueError("no SIFT descriptors found in the given files")
    descriptors = np.vstack(sampled)

    # Cluster the sampled descriptors into vocab_size visual words.
    # Reference: https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html
    kmeans = KMeans(n_clusters=vocab_size).fit(descriptors)

    return kmeans

In [None]:
# Collect descriptor-file paths and class labels for the train and test splits.
print('Getting paths and labels for all train and test data')
train_image_paths, train_labels = load("sift/train")
test_image_paths, test_labels = load("sift/test")

# Fit the visual vocabulary on the training paths only, requesting 200 clusters.
print('Extracting SIFT features')
kmeans = build_vocabulary(train_image_paths, vocab_size=200)

### Question 4b: representing images as bags of SIFT feature histograms

In [4]:
def get_bags_of_sifts(image_paths, kmeans):
    """ Represent each image as bags of SIFT features histogram.

    Parameters
    ----------
    image_paths: an (n_image,) array of paths to descriptor files. Each file holds one
                 comma-separated row per keypoint: (x, y) followed by the SIFT descriptor.
    kmeans: k-means clustering model with vocab_size centroids. It must expose
            ``cluster_centers_`` and ``predict``.

    Returns
    -------
    image_feats: an (n_image, vocab_size) matrix, where each row is a histogram of
                 visual-word frequencies that sums to 1. The row of an image without
                 any descriptor is left as all zeros.
    """
    n_image = len(image_paths)
    vocab_size = kmeans.cluster_centers_.shape[0]

    image_feats = np.zeros((n_image, vocab_size))

    for i, path in enumerate(image_paths):
        # Load features from each image; ndmin=2 keeps a single-keypoint file 2-D
        features = np.loadtxt(path, delimiter=',', dtype=float, ndmin=2)
        if features.size == 0:
            # Nothing to count; avoid dividing by zero below.
            continue

        # Assign each descriptor to the closest cluster center. The first two
        # columns are the (x, y) keypoint location and are not part of the descriptor.
        words = np.asarray(kmeans.predict(features[:, 2:]))

        # Build a histogram normalized by the number of descriptors, so images
        # with different keypoint counts are comparable.
        counts = np.bincount(words, minlength=vocab_size)
        image_feats[i] = counts / counts.sum()

    return image_feats

In [None]:
# Compute the feature matrix of each split with the same fitted k-means model.
train_image_feats = get_bags_of_sifts(train_image_paths, kmeans)
test_image_feats = get_bags_of_sifts(test_image_paths, kmeans)

### Question 4c: average histogram for each scene category

## Question 5: scene recognition with KNN
This function will predict the category for every test image by finding the training image with most similar features. Instead of 1 nearest neighbor, you can vote based on k nearest neighbors which will increase performance (although you need to pick a reasonable value for k).

In [6]:
def nearest_neighbor_classify(train_image_feats, train_labels, test_image_feats, k=1):
    """ Predict the label of every test image by voting among its k nearest training images.

    Parameters
    ----------
    train_image_feats:  is an N x d matrix, where d is the dimensionality of the feature representation.
    train_labels: is a length-N 1-D array with the ground truth class label of each
                  training image.
    test_image_feats: is an M x d matrix, where d is the dimensionality of the
                      feature representation.
    k: number of nearest neighbours that vote (default 1). Values larger than N are
       clamped to N.

    Returns
    -------
    predicted_labels: a length-M array with the predicted label of each test image,
                      drawn from the values found in train_labels.

    Raises
    ------
    ValueError: if there is no training data, if the number of labels does not match
                the number of training rows, or if k < 1.
    """
    train_image_feats = np.asarray(train_image_feats, dtype=float)
    test_image_feats = np.asarray(test_image_feats, dtype=float)
    train_labels = np.asarray(train_labels)

    n_train = train_image_feats.shape[0]
    if n_train == 0:
        raise ValueError("train_image_feats is empty")
    if train_labels.ndim != 1 or train_labels.shape[0] != n_train:
        raise ValueError("train_labels must be a 1-D array with one label per training row")
    if k < 1:
        raise ValueError("k must be at least 1")
    k = min(int(k), n_train)

    # Squared Euclidean distances via ||a||^2 + ||b||^2 - 2 a.b, which avoids
    # materialising an (M, N, d) difference tensor. Squaring preserves the ordering.
    sq_dists = (
        (test_image_feats ** 2).sum(axis=1)[:, np.newaxis]
        + (train_image_feats ** 2).sum(axis=1)[np.newaxis, :]
        - 2.0 * test_image_feats.dot(train_image_feats.T)
    )

    # Indices of the k closest training images, nearest first.
    nearest = np.argsort(sq_dists, axis=1, kind="stable")[:, :k]

    # Work with dense class indices so votes can be counted with bincount.
    classes, class_idx = np.unique(train_labels, return_inverse=True)
    neighbor_classes = class_idx[nearest]

    winners = np.empty(neighbor_classes.shape[0], dtype=int)
    for i, row in enumerate(neighbor_classes):
        votes = np.bincount(row, minlength=len(classes))
        top = votes == votes.max()
        # Break ties in favour of the class of the closest neighbour among the tied classes.
        winners[i] = next(c for c in row if top[c])

    predicted_labels = classes[winners]
    return predicted_labels

In [None]:
# Predict a label for every test image from the training features and labels.
print('Using nearest neighbor classifier to predict test set categories')
pred_labels_knn = nearest_neighbor_classify(train_image_feats, train_labels, test_image_feats)

## Question 6: scene recognition with 1-vs-all linear SVMs
This function will train a linear SVM for every category (i.e. one vs all) and then use the learned linear classifiers to predict the category of every test image. Every test feature will be evaluated with all 15 SVMs and the most confident SVM will "win". Confidence, or distance from the margin, is W*X + B where '*' is the inner product or dot product and W and B are the learned hyperplane parameters. 

In [None]:
def svm_classify(train_image_feats, train_labels, test_image_feats, C=1.0):
    """ Predict the label of every test image with one-vs-all linear SVMs.

    One binary linear SVM is trained per class (that class against all others).
    Each test image is scored by every SVM and receives the class whose SVM
    reports the largest signed distance from its margin.

    Parameters
    ----------
    train_image_feats:  is an N x d matrix, where d is the dimensionality of the feature representation.
    train_labels: is a length-N 1-D array with the ground truth class label of each
                  training image.
    test_image_feats: is an M x d matrix, where d is the dimensionality of the
                      feature representation.
    C: regularization strength passed to every LinearSVC (default 1.0).

    Returns
    -------
    predicted_labels: a length-M array with the predicted label of each test image,
                      drawn from the values found in train_labels.

    Raises
    ------
    ValueError: if train_labels is empty.
    """
    train_labels = np.asarray(train_labels)
    test_image_feats = np.asarray(test_image_feats)

    classes = np.unique(train_labels)
    if classes.size == 0:
        raise ValueError("train_labels is empty")

    n_test = test_image_feats.shape[0]
    if n_test == 0:
        # Nothing to score; return an empty array of the label dtype.
        return classes[:0]
    if classes.size == 1:
        # A binary SVM cannot be fitted with a single class; the answer is trivial.
        return np.full(n_test, classes[0])

    # Imported here so the block is self-contained; the file only imports
    # sklearn.cluster at the top.
    from sklearn.svm import LinearSVC

    # confidences[m, j] = W_j . x_m + b_j for test image m and the SVM of class j
    confidences = np.empty((n_test, classes.size))
    for j, cls in enumerate(classes):
        svm = LinearSVC(C=C)
        # Boolean targets: True marks the current class, so a positive
        # decision value means "belongs to this class".
        svm.fit(train_image_feats, train_labels == cls)
        confidences[:, j] = svm.decision_function(test_image_feats)

    # The most confident SVM wins.
    predicted_labels = classes[np.argmax(confidences, axis=1)]
    return predicted_labels

In [None]:
# Predict a label for every test image from the training features and labels.
print('Using support vector machine to predict test set categories')
pred_labels_svm = svm_classify(train_image_feats, train_labels, test_image_feats)