In [1]:
%reload_ext autoreload
%autoreload 2

import os
import argparse
import sys
from time import time
import cv2

from joblib import Parallel, delayed

sys.path.append(os.path.join(os.environ['GORDON_REPO_DIR'], 'utilities'))
from utilities2015 import *

from matplotlib.path import Path
%matplotlib inline

import scipy.ndimage as nd
import scipy

from sklearn.cluster import KMeans
from multiprocessing import Pool
import random
import pickle

In [2]:
# Visual vocabulary: one row per visual word (cluster centre of SIFT descriptors).
vocabulary = np.load('/oasis/projects/nsf/csd395/ruogu/vocabulary/m_400/vocabulary_stride_8.npy')
# M: number of visual words, i.e. the number of bins in each per-cell histogram.
M = vocabulary.shape[0]
# L: finest level of the spatial pyramid built by getHistogram (levels 0..L).
L = 2
# Input patches live in one sub-directory per label; outputs go to trainingHistogramDir.
trainingDir = '/oasis/projects/nsf/csd395/yuncong/CSHL_data_patches/MD589_byLandmark/'
trainingHistogramDir = '/oasis/projects/nsf/csd395/ruogu/boosting/training_L2/'
# KMeans is used here only as a nearest-visual-word assigner (see kmeans.predict in
# getHistogram).  Because n_clusters equals the number of vocabulary rows, the id
# given to each word depends on the random initialisation; without a fixed seed the
# histogram bin order would change on every kernel restart and histograms produced
# in different sessions could not be compared.  random_state pins that order.
kmeans = KMeans(init='random', n_clusters=M, n_init=10, random_state=0)
kmeans.fit(vocabulary);

In [3]:
# Input: the full path of an RGB image file.
def getHistogram(fileName, patchSize=16, stride=8):
    """Compute a spatial-pyramid bag-of-visual-words histogram for one image.

    The image is read with ``imread`` (not defined in this file; presumably
    supplied by the ``utilities2015`` star import -- confirm), converted to
    gray scale, and SIFT descriptors are computed on a regular grid of key
    points.  Each descriptor is mapped to a visual word with the module-level
    ``kmeans`` model, and word counts are accumulated for pyramid levels
    ``0..L`` (module-level ``L``), where level ``l`` splits the image into
    ``2**l x 2**l`` cells.

    Parameters
    ----------
    fileName : str
        Full path of an RGB image file.
    patchSize : int
        Key point diameter passed to SIFT; grid points start half a patch
        away from the image origin and stop half a patch before its far edge.
    stride : int
        Spacing in pixels between neighbouring grid points.

    Returns
    -------
    numpy.ndarray
        1-D float array of length ``M * sum(4**l for l in range(L + 1))``:
        the weighted per-level histograms concatenated (level 0 first, cells
        in row-major order) and divided by the number of key points.

    Raises
    ------
    ValueError
        If the image is too small to hold a single key point, or if SIFT does
        not return exactly one descriptor per requested key point.
    """
    # Change image to gray scale
    image = imread(fileName)
    img = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
    height, width = img.shape[:2]

    # Generate key points on a regular grid.  '//' keeps the coordinates
    # integral regardless of the interpreter's division semantics.
    half_patch = patchSize // 2
    xv, yv = np.meshgrid(np.arange(half_patch, width - half_patch, stride),
                         np.arange(half_patch, height - half_patch, stride),
                         indexing='ij')
    sample_points = np.column_stack((xv.ravel(), yv.ravel()))
    if len(sample_points) == 0:
        raise ValueError(
            'Image {} ({}x{}) is too small for patchSize={}'.format(
                fileName, width, height, patchSize))
    cv_keypoints = [cv2.KeyPoint(float(x), float(y), float(patchSize))
                    for x, y in sample_points]

    # Get SIFT descriptor.  cv2.SIFT() only exists in OpenCV 2.4; newer
    # releases expose the same extractor through cv2.SIFT_create().
    sift = cv2.SIFT_create() if hasattr(cv2, 'SIFT_create') else cv2.SIFT()
    _, descriptors = sift.compute(img, cv_keypoints)
    if descriptors is None or len(descriptors) != len(sample_points):
        # The spatial binning below pairs descriptors with sample_points by
        # position, so a count mismatch would silently misplace the words.
        raise ValueError(
            'SIFT did not return one descriptor per key point for {}'.format(fileName))

    # Map key points with label.
    cluster_labels = np.asarray(kmeans.predict(descriptors))

    # Level 0: one histogram over the whole image.
    weight_0 = 1.0 / (2 ** L)
    level_histograms = [np.bincount(cluster_labels, minlength=M) * weight_0]

    # Other levels: one histogram per grid cell, finer levels weigh more.
    for l in range(1, L + 1):
        cells_per_side = 2 ** l
        weight = 1.0 / (2 ** (L - l + 1))
        grid_size_x = max(1, width // cells_per_side)
        grid_size_y = max(1, height // cells_per_side)
        # When width/height is not a multiple of cells_per_side, points in the
        # leftover strip would get index == cells_per_side; clamp them into
        # the last column/row instead of spilling into another cell.
        grid_cols = np.minimum(sample_points[:, 0] // grid_size_x, cells_per_side - 1)
        grid_rows = np.minimum(sample_points[:, 1] // grid_size_y, cells_per_side - 1)
        keypoint_grid_indices = grid_rows * cells_per_side + grid_cols
        # Flattened (cell, word) index -> counts laid out exactly like a
        # row-major (number_of_cells, M) table.
        hists = np.bincount(keypoint_grid_indices * M + cluster_labels,
                            minlength=cells_per_side * cells_per_side * M)
        level_histograms.append(hists * weight)

    # Normalization by the number of key points.
    return np.hstack(level_histograms) / float(len(sample_points))

In [4]:
# Generating histograms for training data
def getHistogramsForLabel(label, maxSamples=1000):
    """Compute and save the histograms of the training patches of one label.

    Lists the files in ``trainingDir/<label>/``, keeps a random subset when
    there are more than ``maxSamples`` of them, runs ``getHistogram`` on each
    kept file and writes two files into ``trainingHistogramDir``:

    * ``<label>.npy`` -- the histograms stacked into one array, one row per file;
    * ``<label>.p``   -- a pickled list of the corresponding full file paths,
      in the same order as the rows.

    Parameters
    ----------
    label : str
        Name of the sub-directory of ``trainingDir`` to process.
    maxSamples : int or None
        Upper bound on the number of files processed; ``None`` disables
        sub-sampling.

    Returns
    -------
    None
    """
    dir_path = os.path.join(trainingDir, label)
    fileNames = os.listdir(dir_path)
    # Cap the number of samples per class.
    if maxSamples is not None and len(fileNames) > maxSamples:
        fileNames = random.sample(fileNames, maxSamples)

    filename_list = [os.path.join(dir_path, f) for f in fileNames]
    his_list = [getHistogram(path) for path in filename_list]

    np.save(os.path.join(trainingHistogramDir, label + '.npy'), np.asarray(his_list))
    # Context manager so the handle is flushed and closed even inside
    # long-lived pool workers.
    with open(os.path.join(trainingHistogramDir, label + '.p'), 'wb') as fh:
        pickle.dump(filename_list, fh)

In [5]:
# Build the training histograms of every label in parallel, one task per label.
# Only sub-directories of trainingDir are treated as labels; a stray file there
# would otherwise make its worker fail inside os.listdir.
label_list = [name for name in os.listdir(trainingDir)
              if os.path.isdir(os.path.join(trainingDir, name))]
t = time()
pool = Pool(processes=16)
try:
    fileNames_label = pool.map(getHistogramsForLabel, label_list)
    pool.close()
except BaseException:
    # Do not leave worker processes behind when a task fails or is interrupted.
    pool.terminate()
    raise
finally:
    pool.join()
print("Generating histogram for all patches takes {} sec.".format(time() - t))

Generating histogram for all patches takes 956.393076181 sec.


In [None]:
# Generating histogram for testing data
# stack = 589;
# section = 161
# testingDir = '/oasis/projects/nsf/csd395/yuncong/CSHL_data_patches/MD'+str(stack)+'_byROI/'+str(section).zfill(4)+'/roi1/';
# testingHistogramDir = '/oasis/projects/nsf/csd395/ruogu/svm7/histogram/testing/'+str(stack) + '/';

# def getHistogramByName(fileName):
#     his = getHistogram(testingDir + fileName);
#     return (fileName, his);

# testing_fileNames = os.listdir(testingDir);
# t = time();
# pool = Pool(processes=8);
# testing_fileName_histograms = pool.map(getHistogramByName, testing_fileNames);
# pool.close();
# pool.join();
# print "Generating histogram for testing patches takes {} sec.".format(time() - t);

In [None]:
# testing_histograms = np.asarray([x[1] for x in testing_fileName_histograms]);
# testing_fileNames = [x[0] for x in testing_fileName_histograms];
# np.save(testingHistogramDir + str(section).zfill(4) + '.npy', testing_histograms); # Save histogram files

In [None]:
# import pickle
# # Save fileName list.
# pickle.dump(testing_fileNames, open(testingHistogramDir + str(section).zfill(4) + '.p', 'wb'));