# Assignment 2

# Part 2: Classic recognition 

In [29]:
# Enable IPython's autoreload extension in mode 2, so that edits to imported
# modules are reloaded before each cell runs, with no kernel restart needed.
%load_ext autoreload
%autoreload 2

import os, pickle
import numpy as np
import cv2
import matplotlib.pyplot as plt
from utils import get_CIFAR10_data, train, evaluate

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [30]:
# Load the CIFAR-10 batch metadata. The context manager closes the file handle as
# soon as unpickling finishes instead of leaving it open until garbage collection.
# NOTE: unpickling can execute arbitrary code -- only load dataset files obtained
# from a trusted source.
with open('data/cifar-10-batches-py/batches.meta', 'rb') as meta_file:
    meta = pickle.load(meta_file, encoding='bytes')

# Load the train / validation / test subsets (sizes given below) from the same
# dataset directory.
X_train, y_train, X_val, y_val, X_test, y_test = get_CIFAR10_data(cifar10_dir='data/cifar-10-batches-py',
                                                                  num_training=4500,
                                                                  num_validation=500,
                                                                  num_test=1000)

Train data shape:  (4500, 32, 32, 3)
Train labels shape:  (4500,)
Validation data shape:  (500, 32, 32, 3)
Validation labels shape:  (500,)
Test data shape:  (1000, 32, 32, 3)
Test labels shape:  (1000,)


## Question 2. Color features (5 points)

In [31]:
# Question 2: train a classifier on the features produced by
# `load_average_color_with_bias`.
from assn2 import load_average_color_with_bias

regularization_strength = [0.09, 0.10, 0.11]

# Reload the raw images first, so re-running this cell never feeds already
# featurized arrays back into the feature extractor.
X_train, y_train, X_val, y_val, X_test, y_test = get_CIFAR10_data(
    cifar10_dir='data/cifar-10-batches-py',
    num_training=4500,
    num_validation=500,
    num_test=1000,
)

# Replace each raw split with its feature representation (train, val, test order).
X_train, X_val, X_test = (
    load_average_color_with_bias(split) for split in (X_train, X_val, X_test)
)

best_color = train(
    X_train, y_train,
    X_val, y_val,
    X_test, y_test,
    regularization_strength,
)

Train data shape:  (4500, 32, 32, 3)
Train labels shape:  (4500,)
Validation data shape:  (500, 32, 32, 3)
Validation labels shape:  (500,)
Test data shape:  (1000, 32, 32, 3)
Test labels shape:  (1000,)
reg 1.000000e-01 train accuracy: 0.132444 val accuracy: 0.144000

best validation accuracy achieved during training: 0.144000

final test set accuracy: 0.134000


## Question 3. Bag of SIFT features (15 points)

In [32]:
from features import extract_sift_for_dataset
from assn2 import load_flatten, load_histogram_with_bias
from kmeans import kmeans

# Settings for this cell.
step_size = 4                      # forwarded to extract_sift_for_dataset
K = 16                             # forwarded to kmeans (presumably the number of clusters -- confirm in kmeans.py)
niter = 4                          # forwarded to kmeans (presumably the iteration count -- confirm in kmeans.py)
regularization_strengths = [1e3]

X_train, y_train, X_val, y_val, X_test, y_test = get_CIFAR10_data(
    cifar10_dir='data/cifar-10-batches-py',
    num_training=4500,
    num_validation=500,
    num_test=1000,
)

# SIFT descriptors for every split, extracted in train / val / test order.
X_train_features, X_val_features, X_test_features = (
    extract_sift_for_dataset(split, step_size=step_size)
    for split in (X_train, X_val, X_test)
)

# The centroids are computed from the training descriptors only.
X_train_features_flattened = load_flatten(X_train_features)
labels_train, centroids = kmeans(X_train_features_flattened, K, niter)

# Build the histogram representation of each split against those centroids.
train_hist, val_hist, test_hist = (
    load_histogram_with_bias(split_features, centroids)
    for split_features in (X_train_features, X_val_features, X_test_features)
)

# train() is called with skip_test=True; the test split is scored by the
# explicit evaluate() call that follows.
best_color = train(
    train_hist, y_train,
    val_hist, y_val,
    test_hist, y_test,
    regularization_strengths,
    skip_test=True,
)
evaluate(best_color, test_hist, y_test)

Train data shape:  (4500, 32, 32, 3)
Train labels shape:  (4500,)
Validation data shape:  (500, 32, 32, 3)
Validation labels shape:  (500,)
Test data shape:  (1000, 32, 32, 3)
Test labels shape:  (1000,)
reg 1.000000e+03 train accuracy: 0.237556 val accuracy: 0.246000

best validation accuracy achieved during training: 0.246000
final test set accuracy: 0.232000


## Question 4. SPM representation (15 points)

Above, we sampled feature points at uniformly spaced pixel locations.
One drawback of the bag-of-words approach is that it discards spatial information. 

Hence, we will now try encoding spatial information using Spatial Pyramid Matching (SPM), proposed in Lazebnik et al. 2006. At a high level, SPM works by breaking up an image into different regions, computing the SIFT descriptors in each region, forming a histogram of visual words in each region, and then concatenating the histograms into a single 1D vector representation.

**Do this**: Construct a SPM representation of the images and train a classifier. Specifically, implement `spatial_pyramid_matching_with_bias()` in `features.py`.

In [33]:
from features import spatial_pyramid_matching_with_bias

L = 2                              # first argument of spatial_pyramid_matching_with_bias
K = 16                             # forwarded to kmeans
niter = 4                          # forwarded to kmeans
regularization_strengths = [1e3]


def _spm_representation(images, sift_features, num_levels, centroids, descriptor_dim=128):
    """Compute the SPM vector of every image in one split and stack the results.

    Replaces three copy-pasted comprehensions that each hard-coded a
    ``(32, 32, 128)`` reshape; the spatial grid is now taken from the images.

    Args:
        images: Image array of the split. Its length gives the number of images
            and axes 1 and 2 give the height and width of the descriptor grid.
        sift_features: Per-image SIFT descriptors, indexable by image position.
        num_levels: Forwarded as the first argument of
            ``spatial_pyramid_matching_with_bias``.
        centroids: Centroids returned by ``kmeans``.
        descriptor_dim: Length of a single descriptor (128 in the original
            hard-coded reshape).

    Returns:
        The per-image outputs of ``spatial_pyramid_matching_with_bias`` stacked
        along a new leading axis.
    """
    # The reshape needs exactly one descriptor per pixel, which is why the
    # descriptors in this cell are extracted with step_size=1.
    grid_shape = (images.shape[1], images.shape[2], descriptor_dim)
    return np.stack(
        [spatial_pyramid_matching_with_bias(num_levels,
                                            sift_features[i].reshape(grid_shape),
                                            centroids)
         for i in range(len(images))],
        0,
    )


X_train, y_train, X_val, y_val, X_test, y_test = get_CIFAR10_data(cifar10_dir='data/cifar-10-batches-py',
                                                                  num_training=4500,
                                                                  num_validation=500,
                                                                  num_test=1000)

# `extract_sift_for_dataset`, `load_flatten` and `kmeans` are imported by the
# Question 3 cell, which must have been run before this one.
X_train_features = extract_sift_for_dataset(X_train, step_size=1)
X_train_features_flattened = load_flatten(X_train_features)
X_val_features = extract_sift_for_dataset(X_val, step_size=1)
X_test_features = extract_sift_for_dataset(X_test, step_size=1)

# The centroids are computed from the training descriptors only.
_, centroids = kmeans(X_train_features_flattened, K, niter)

X_train_spm = _spm_representation(X_train, X_train_features, L, centroids)
X_val_spm = _spm_representation(X_val, X_val_features, L, centroids)
X_test_spm = _spm_representation(X_test, X_test_features, L, centroids)

# train() is called with skip_test=True; the test split is scored by the
# explicit evaluate() call that follows.
best_color = train(X_train_spm, y_train, X_val_spm, y_val, X_test_spm, y_test, regularization_strengths, skip_test=True)
evaluate(best_color, X_test_spm, y_test)

Train data shape:  (4500, 32, 32, 3)
Train labels shape:  (4500,)
Validation data shape:  (500, 32, 32, 3)
Validation labels shape:  (500,)
Test data shape:  (1000, 32, 32, 3)
Test labels shape:  (1000,)
reg 1.000000e+03 train accuracy: 0.286444 val accuracy: 0.272000
reg 3.000000e+03 train accuracy: 0.278444 val accuracy: 0.268000
reg 5.000000e+03 train accuracy: 0.259333 val accuracy: 0.266000
reg 7.000000e+03 train accuracy: 0.250889 val accuracy: 0.238000
reg 1.000000e+04 train accuracy: 0.260444 val accuracy: 0.266000
reg 3.000000e+04 train accuracy: 0.217333 val accuracy: 0.220000

best validation accuracy achieved during training: 0.272000
final test set accuracy: 0.271000


## Question 5. Histogram of Oriented Gradients (10 points)

Rather than extracting local SIFT features, we can compute a global histogram of oriented gradients (HOG) image descriptor. 

**Do this**: Implement `get_differential_filter()` and `filter_image()` in `features.py`, and `load_hog_representation_with_bias()` in `assn2.py`. Then compute HOG descriptors and train a classifier. 

In [34]:
from assn2 import load_hog_representation_with_bias

# Both sizes are forwarded unchanged to load_hog_representation_with_bias.
cell_size = 2
block_size = 2
regularization_strengths = [1e3, 3e3, 5e3, 7e3, 1e4, 3e4]

X_train, y_train, X_val, y_val, X_test, y_test = get_CIFAR10_data(
    cifar10_dir='data/cifar-10-batches-py',
    num_training=4500,
    num_validation=500,
    num_test=1000,
)

# HOG representation of every split, computed in train / val / test order.
X_train_hog, X_val_hog, X_test_hog = (
    load_hog_representation_with_bias(split, cell_size, block_size)
    for split in (X_train, X_val, X_test)
)

best_color = train(
    X_train_hog, y_train,
    X_val_hog, y_val,
    X_test_hog, y_test,
    regularization_strengths,
)

Train data shape:  (4500, 32, 32, 3)
Train labels shape:  (4500,)
Validation data shape:  (500, 32, 32, 3)
Validation labels shape:  (500,)
Test data shape:  (1000, 32, 32, 3)
Test labels shape:  (1000,)
reg 1.000000e+03 train accuracy: 0.122889 val accuracy: 0.094000
reg 3.000000e+03 train accuracy: 0.105556 val accuracy: 0.088000
reg 5.000000e+03 train accuracy: 0.105556 val accuracy: 0.088000
reg 7.000000e+03 train accuracy: 0.203556 val accuracy: 0.144000
reg 1.000000e+04 train accuracy: 0.104444 val accuracy: 0.104000
reg 3.000000e+04 train accuracy: 0.148000 val accuracy: 0.154000

best validation accuracy achieved during training: 0.154000

final test set accuracy: 0.158000


## Question 6. Pixels (5 points)

Finally, let's use the pixels themselves to train a classifier. That is, just reshape a 32x32x3 image into a 32x32x3=3072 vector.

**Do this:** Process the images and train a classifier. Specifically, implement `load_vector_image_with_bias()` in `assn2.py`.

In [35]:
from assn2 import load_vector_image_with_bias

# Regularization strengths handed to train().
regularization_strengths = [1e3, 3e3, 5e3, 7e3, 1e4, 3e4]

# Reload the raw images so this cell does not depend on what earlier cells left
# in X_train / X_val / X_test.
X_train, y_train, X_val, y_val, X_test, y_test = get_CIFAR10_data(
    cifar10_dir='data/cifar-10-batches-py',
    num_training=4500,
    num_validation=500,
    num_test=1000,
)

# All three splits are converted together in a single call.
X_train, X_val, X_test = load_vector_image_with_bias(X_train, X_val, X_test)

best_color = train(
    X_train, y_train,
    X_val, y_val,
    X_test, y_test,
    regularization_strengths,
)

Train data shape:  (4500, 32, 32, 3)
Train labels shape:  (4500,)
Validation data shape:  (500, 32, 32, 3)
Validation labels shape:  (500,)
Test data shape:  (1000, 32, 32, 3)
Test labels shape:  (1000,)
reg 1.000000e+03 train accuracy: 0.257778 val accuracy: 0.242000
reg 3.000000e+03 train accuracy: 0.209111 val accuracy: 0.186000
reg 5.000000e+03 train accuracy: 0.108000 val accuracy: 0.090000
reg 7.000000e+03 train accuracy: 0.120000 val accuracy: 0.100000
reg 1.000000e+04 train accuracy: 0.144444 val accuracy: 0.120000
reg 3.000000e+04 train accuracy: 0.097778 val accuracy: 0.092000

best validation accuracy achieved during training: 0.242000

final test set accuracy: 0.207000
