In [3]:
%reload_ext autoreload
%autoreload 2

import os
import argparse
import sys
from time import time
import cv2

from joblib import Parallel, delayed

sys.path.append(os.path.join(os.environ['GORDON_REPO_DIR'], 'utilities'))
from utilities2015 import *

from matplotlib.path import Path
%matplotlib inline

import scipy.ndimage as nd
import scipy

from sklearn.cluster import KMeans
from multiprocessing import Pool
import random

In [4]:
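# Input: an RGB image file name. The k-means model `kmeans`, the vocabulary size `M`
# and the deepest pyramid level `L` are not arguments; they are read from module-level globals.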
def _pyramid_level_counts(points, labels, width, height, level, n_words):
    """Count visual words inside each cell of a (2**level x 2**level) grid.

    Parameters
    ----------
    points : (N, 2) integer array of (x, y) pixel positions.
    labels : length-N integer array of visual-word indices in [0, n_words).
    width, height : image size in pixels.
    level : pyramid level; the image is split into 2**level cells per side.
    n_words : number of histogram bins per cell.

    Returns
    -------
    1-D integer array of length (2**level)**2 * n_words. Cells are numbered
    row-major (row * cells_per_side + col) and each cell owns a contiguous
    run of n_words bins.
    """
    cells_per_side = 2 ** level
    # Explicit floor division: cell sizes must stay integral on Python 2 and 3.
    # max(1, ...) guards against a zero cell size on images smaller than the grid.
    cell_w = max(1, width // cells_per_side)
    cell_h = max(1, height // cells_per_side)
    # When the image size is not a multiple of cells_per_side, points in the
    # leftover strip would get index == cells_per_side and spill into the next
    # row (or past the last cell); clamp them into the last column / row.
    cols = np.minimum(points[:, 0] // cell_w, cells_per_side - 1)
    rows = np.minimum(points[:, 1] // cell_h, cells_per_side - 1)
    cell_ids = rows * cells_per_side + cols
    # One bincount over the combined (cell, word) index replaces the
    # per-keypoint Python loop.
    return np.bincount(cell_ids * n_words + labels,
                       minlength=cells_per_side ** 2 * n_words)


def getHistogram(fileName, patchSize=16, stride=56):
    """Compute a spatial-pyramid bag-of-visual-words histogram for one image.

    Dense SIFT descriptors are computed on a regular grid, quantized with the
    module-level `kmeans` model, and counted per cell for pyramid levels
    0..L (module-level `L`), with `M` (module-level) bins per cell.

    Parameters
    ----------
    fileName : path of the image to read with `imread`
        (assumed to yield an RGB array -- TODO confirm against utilities2015).
    patchSize : keypoint size passed to cv2.KeyPoint; half of it is also the
        margin kept between sample points and the image border.
    stride : spacing in pixels between neighbouring sample points.

    Returns
    -------
    1-D float array of length M * sum(4**l for l in 0..L): the level-0
    histogram followed by the flattened per-cell histograms of levels 1..L,
    each level scaled by its pyramid weight and the whole vector divided by
    the number of sample points.
    """
    image = imread(fileName)
    img = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
    height, width = img.shape[:2]

    # Regular grid of sample points, kept half a patch away from the border.
    half_patch = patchSize // 2
    xv, yv = np.meshgrid(np.arange(half_patch, width - half_patch, stride),
                         np.arange(half_patch, height - half_patch, stride),
                         indexing='ij')
    sample_points = np.c_[xv.flat, yv.flat]
    cv_keypoints = [cv2.KeyPoint(float(x), float(y), patchSize)
                    for x, y in sample_points]

    # SIFT descriptors at the fixed keypoints (OpenCV 2.4-style constructor).
    sift = cv2.SIFT()
    # NOTE(review): this assumes compute() returns exactly one descriptor per
    # requested keypoint, in order. If OpenCV ever drops keypoints, the shape
    # mismatch below raises instead of silently misaligning labels and points.
    _, descriptors = sift.compute(img, cv_keypoints)

    # Quantize every descriptor to its visual-word index.
    cluster_labels = kmeans.predict(descriptors)

    # Level 0 is weighted 1 / 2**L, level l >= 1 is weighted 1 / 2**(L - l + 1).
    weighted_levels = []
    for level in range(L + 1):
        exponent = L if level == 0 else L - level + 1
        counts = _pyramid_level_counts(sample_points, cluster_labels,
                                       width, height, level, M)
        weighted_levels.append(counts * (1.0 / 2 ** exponent))

    histogram = np.hstack(weighted_levels)
    return histogram / len(sample_points)  # Normalization

In [14]:
def getHistogramsForLabel(label, maxSamples=1000,
                          outputDir='/oasis/projects/nsf/csd395/ruogu/svm3/histogram/'):
    """Compute and save the histograms of the images belonging to one label.

    Lists the files in `trainingDir/label` (module-level `trainingDir`), keeps
    at most `maxSamples` of them (chosen with `random.sample` when there are
    more), runs `getHistogram` on each, and saves the stacked result as
    `<outputDir>/<label>.npy`.

    Parameters
    ----------
    label : name of the sub-directory of `trainingDir` to process; also used
        as the output file stem.
    maxSamples : cap on the number of files processed for this label.
    outputDir : directory receiving `<label>.npy`.

    Returns
    -------
    None. The result is written to disk.
    """
    dir_path = os.path.join(trainingDir, label)
    # os.listdir order is arbitrary; sorting makes the row order of the saved
    # array stable whenever no sub-sampling takes place.
    fileNames = sorted(os.listdir(dir_path))
    if len(fileNames) > maxSamples:
        fileNames = random.sample(fileNames, maxSamples)
    histograms = [getHistogram(os.path.join(dir_path, f)) for f in fileNames]
    np.save(os.path.join(outputDir, label + '.npy'), np.asarray(histograms))

In [10]:
# Root directory of the training patches; it holds one sub-directory per label.
trainingDir = '/oasis/projects/nsf/csd395/yuncong/CSHL_data_patches/MD589_byLandmark/'

# Deepest spatial-pyramid level used by getHistogram (levels 0..L).
L = 2

# Visual vocabulary, one row per visual word; M is the number of words and
# therefore the number of k-means clusters and of histogram bins per cell.
vocabulary = np.load('/oasis/projects/nsf/csd395/ruogu/svm3/vocabulary.npy')
M = len(vocabulary)

In [11]:
# Quantizer that maps a SIFT descriptor to the index of its nearest visual word.
# The model has as many clusters as the vocabulary has rows, so fitting it on
# the vocabulary only serves to install those rows as cluster centres. With an
# unseeded init='random' the order of the centres (and therefore the meaning of
# every histogram bin) depended on the random state and could differ between
# kernel sessions. Seeding the centres with the vocabulary itself is intended
# to make label i correspond to vocabulary row i on every run; n_init=1 because
# an explicit init leaves nothing to restart.
# NOTE(review): histograms saved by earlier, unseeded runs use a different bin
# order and should be regenerated before being mixed with new ones.
kmeans = KMeans(init=vocabulary, n_clusters=M, n_init=1)
kmeans.fit(vocabulary);

In [15]:
# Compute and save the histograms of every label in parallel, one task per
# label directory found under trainingDir.
label_list = os.listdir(trainingDir)
t = time()
pool = Pool(processes=8)
try:
    pool.map(getHistogramsForLabel, label_list)
finally:
    # Always shut the workers down; otherwise the 8 processes stay alive in
    # the kernel after the cell finishes (or fails).
    pool.close()
    pool.join()
print("Generating histogram for all patches takes {} sec.".format(time() - t))

Generating histogram for all patches takes 81.742166996 sec.


In [16]:
# Show the label directories that were processed.
print(label_list)

['7n_surround', '7N', 'Gr', 'SuVe', 'Pn', 'LVe_surround', 'VLL_surround', 'VLL', 'LVe', '5N_surround', '12N', '7N_surround', '7n', 'Gr_surround', 'Pn_surround', 'SuVe_surround', '12N_surround', '5N']
