In [55]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import skimage.io as io
from skimage import feature
from skimage.transform import resize
from sklearn.feature_selection import VarianceThreshold
from skimage.color import rgb2gray
from sklearn.decomposition import PCA
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from skimage.filters import gabor
from skimage.feature import corner_fast, corner_peaks, corner_harris
import sys
import os
sys.path.append('../')
import helpers.histogram_classifier as hc

In [56]:
def sub_grid(image, size=32):
    """
    Walk over an image row band by row band, producing square tiles that do not overlap.

    Tiles touching the bottom or right border are whatever the slice returns, so they
    come out smaller than ``size`` when an image dimension is not a multiple of it.
    :params image: image as numpy array with row, column and a third (channel) axis
    :params size: side-length of each tile in pixels; should be a power of 2
    :return: yields each tile in turn, left to right within a band, bands top to bottom
    """
    dims = image.shape
    for top in range(0, dims[0], size):
        rows = slice(top, top + size)
        for left in range(0, dims[1], size):
            yield image[rows, slice(left, left + size), :]

In [57]:
from IPython.display import HTML

# Load the table stylesheet through a context manager so the file handle is closed
# as soon as the text has been read (the bare open(...).read() left it dangling).
with open('styles/table-style.css') as css_file:
    css = css_file.read()
# The cell's last expression is the HTML object, so the notebook renders the
# <style> tag and applies it to tables displayed afterwards.
HTML('<style>{}</style>'.format(css))

In [58]:
def get_image_features(img, box_size):
    """
    Get colour, edge and corner features for a single image by splitting the image into a
    grid of non-overlapping square boxes (see sub_grid) and computing five values per box.
    :params img: image as a numpy array; channels 0, 1, 2 are read as red, green, blue
    :params box_size: the side-length in pixels of the box to be used
    :return: mean Red pixel value in each box
    :return: mean Green pixel value in each box
    :return: mean Blue pixel value in each box
    :return: sum of the skimage canny output (sigma=1) in each box, i.e. a count of
             edge pixels rather than of distinct edges
    :return: number of corner peaks in each box, from corner_peaks(corner_fast(...))
    All five are 1-D arrays with one entry per box, in the order sub_grid yields boxes.
    """
    per_box = []
    for box in sub_grid(img, size=box_size):
        # Convert to greyscale once; both the edge and the corner detector use it.
        gray = rgb2gray(box)
        per_box.append([
            np.mean(box[:, :, 0]),
            np.mean(box[:, :, 1]),
            np.mean(box[:, :, 2]),
            np.sum(feature.canny(gray, sigma=1)),
            corner_peaks(corner_fast(gray)).shape[0],
        ])
    # Rows are boxes, columns are the five features listed above.
    image_features = np.array(per_box)
    return image_features[:, 0], image_features[:, 1], image_features[:, 2], image_features[:, 3], image_features[:, 4]

In [59]:
def get_complete_features(image_classes, image_names, box_size, data_dir="../data/food-101/top_classes/"):
    """
    Get colour, edge and corner features for every image listed in a dictionary of image names
    :params image_classes: list of n image classes
    :params image_names: dictionary mapping each class to its image file names
    :params box_size: side-length in pixels of the grid boxes passed to get_image_features
    :params data_dir: directory holding one sub-directory of images per class
    :return: red_features: average red pixel value for each box in each image
    :return: green_features: average green pixel value for each box in each image
    :return: blue_features: average blue pixel value for each box in each image
    :return: edge_features: canny edge-pixel count for each box in each image
    :return: corner_features: number of corners detected for each box in each image
    :return: labels: array of true labels for each image

    Each feature array has one row per image, in class order then name order.
    NOTE(review): this assumes every image produces the same number of boxes (i.e. all
    images share one size); otherwise the rows have different lengths - TODO confirm.
    """
    # One accumulator per feature kind, in the order get_image_features returns them:
    # red, green, blue, edge, corner.
    columns = [[] for _ in range(5)]
    labels = []
    for c in image_classes:
        for name in image_names[c]:
            img = io.imread(os.path.join(data_dir, c, name))
            per_image = get_image_features(img, box_size=box_size)
            for column, values in zip(columns, per_image):
                column.append(values)
            labels.append(c)

    red_features, green_features, blue_features, edge_features, corner_features = (
        np.array(column) for column in columns
    )
    return red_features, green_features, blue_features, edge_features, corner_features, np.array(labels)

In [60]:
# Side length, in pixels, of the grid boxes used for the first feature pass.
BOX = 32
# Column count above which the PCA cells reduce a feature group, and the number of
# components they keep when they do.
# NOTE(review): 1280 looks like 256 + 1024, the box counts of a 512x512 image at box
# sizes 32 and 16 - TODO confirm. If so, no single pass exceeds it and the PCA
# branches never run.
PCA_components = 1024 + 256

In [61]:
# Table of images; the 'class' and 'name' columns are the only ones read here.
classes = pd.read_csv('../data/food-101/meta/top_classes.csv', index_col=0)
# Distinct class labels, in order of first appearance in the table.
class_list = list(classes['class'].unique())

# Map every class label to the array of image names carrying that label.
image_names = {
    label: np.array(classes.loc[classes['class'] == label, 'name'])
    for label in class_list
}

In [62]:
# Split the per-class image names into train and test dictionaries.
# presumably 0.75 is the training fraction; verify against hc.split_data
train, test = hc.split_data(image_names, 0.75)

# Keep only the first 10 names of each class in both dictionaries.
# NOTE(review): the saved metrics further down report a support of 250 per class,
# so those outputs do not appear to come from a run with this truncation - confirm.
for label in class_list:
    train[label], test[label] = train[label][:10], test[label][:10]

In [63]:
# Per-box mean colour, edge and corner features for the training images, using
# BOX-pixel grid boxes (first pass).
first_pass_train = get_complete_features(class_list, train, BOX)
(all_red_features, all_green_features, all_blue_features,
 all_edge_features, all_corner_features, train_labels) = first_pass_train

In [64]:
# Reduce the red features with PCA when they have more columns than PCA_components;
# otherwise pass them through unchanged.
if all_red_features.shape[1] > PCA_components:
    red_pca = PCA(n_components=PCA_components)
    red_pca.fit(all_red_features)
    # Fixed: this used to transform `kept_red_features`, a name that is never defined
    # in the notebook, so taking this branch raised NameError.
    final_red_features = red_pca.transform(all_red_features)
else:
    final_red_features = all_red_features

In [65]:
# Green features: used as-is when narrow enough, otherwise projected onto
# PCA_components principal components fitted on these same training rows.
if all_green_features.shape[1] <= PCA_components:
    final_green_features = all_green_features
else:
    green_pca = PCA(n_components=PCA_components).fit(all_green_features)
    final_green_features = green_pca.transform(all_green_features)

In [66]:
# Blue features: used as-is when narrow enough, otherwise projected onto
# PCA_components principal components fitted on these same training rows.
if all_blue_features.shape[1] <= PCA_components:
    final_blue_features = all_blue_features
else:
    blue_pca = PCA(n_components=PCA_components).fit(all_blue_features)
    final_blue_features = blue_pca.transform(all_blue_features)

In [67]:
# Edge features: used as-is when narrow enough, otherwise projected onto
# PCA_components principal components fitted on these same training rows.
if all_edge_features.shape[1] <= PCA_components:
    final_edge_features = all_edge_features
else:
    edge_pca = PCA(n_components=PCA_components).fit(all_edge_features)
    final_edge_features = edge_pca.transform(all_edge_features)

In [68]:
# Corner features: used as-is when narrow enough, otherwise projected onto
# PCA_components principal components fitted on these same training rows.
if all_corner_features.shape[1] <= PCA_components:
    final_corner_features = all_corner_features
else:
    corner_pca = PCA(n_components=PCA_components).fit(all_corner_features)
    final_corner_features = corner_pca.transform(all_corner_features)

In [69]:
# Join the five feature groups column-wise, in the order
# red | green | blue | edge | corner, to form the first-pass training matrix.
first_pass_groups = (
    final_red_features,
    final_green_features,
    final_blue_features,
    final_edge_features,
    final_corner_features,
)
all_features_1 = np.concatenate(first_pass_groups, axis=1)

In [70]:
# Second pass over the training images with 16-pixel grid boxes.
# This rebinds every all_*_features name (and train_labels), so the arrays from the
# BOX-sized pass are only reachable through all_features_1 from here on.
second_pass_train = get_complete_features(class_list, train, 16)
(all_red_features, all_green_features, all_blue_features,
 all_edge_features, all_corner_features, train_labels) = second_pass_train

In [71]:
# Reduce the 16-pixel-pass red features with PCA when they have more columns than
# PCA_components; otherwise pass them through unchanged.
# NOTE(review): when the branch runs it rebinds red_pca, replacing any PCA fitted on
# the BOX-sized pass earlier in the notebook.
if all_red_features.shape[1] > PCA_components:
    red_pca = PCA(n_components=PCA_components)
    red_pca.fit(all_red_features)
    # Fixed: this used to transform `kept_red_features`, a name that is never defined
    # in the notebook, so taking this branch raised NameError.
    final_red_features = red_pca.transform(all_red_features)
else:
    final_red_features = all_red_features

In [72]:
# Green features of the 16-pixel pass: kept unchanged when narrow enough, otherwise
# PCA-reduced. NOTE(review): the PCA branch rebinds green_pca, replacing any object
# fitted on the BOX-sized pass.
if all_green_features.shape[1] <= PCA_components:
    final_green_features = all_green_features
else:
    green_pca = PCA(n_components=PCA_components).fit(all_green_features)
    final_green_features = green_pca.transform(all_green_features)

In [73]:
# Blue features of the 16-pixel pass: kept unchanged when narrow enough, otherwise
# PCA-reduced. NOTE(review): the PCA branch rebinds blue_pca, replacing any object
# fitted on the BOX-sized pass.
if all_blue_features.shape[1] <= PCA_components:
    final_blue_features = all_blue_features
else:
    blue_pca = PCA(n_components=PCA_components).fit(all_blue_features)
    final_blue_features = blue_pca.transform(all_blue_features)

In [74]:
# Edge features of the 16-pixel pass: kept unchanged when narrow enough, otherwise
# PCA-reduced. NOTE(review): the PCA branch rebinds edge_pca, replacing any object
# fitted on the BOX-sized pass.
if all_edge_features.shape[1] <= PCA_components:
    final_edge_features = all_edge_features
else:
    edge_pca = PCA(n_components=PCA_components).fit(all_edge_features)
    final_edge_features = edge_pca.transform(all_edge_features)

In [75]:
# Corner features of the 16-pixel pass: kept unchanged when narrow enough, otherwise
# PCA-reduced. NOTE(review): the PCA branch rebinds corner_pca, replacing any object
# fitted on the BOX-sized pass.
if all_corner_features.shape[1] <= PCA_components:
    final_corner_features = all_corner_features
else:
    corner_pca = PCA(n_components=PCA_components).fit(all_corner_features)
    final_corner_features = corner_pca.transform(all_corner_features)

In [76]:
# Join the five feature groups of the 16-pixel pass column-wise, in the order
# red | green | blue | edge | corner.
second_pass_groups = (
    final_red_features,
    final_green_features,
    final_blue_features,
    final_edge_features,
    final_corner_features,
)
all_features_2 = np.concatenate(second_pass_groups, axis=1)

In [77]:
# Final training matrix: first-pass columns followed by second-pass columns.
both_passes = (all_features_1, all_features_2)
all_features = np.concatenate(both_passes, axis=1)

In [78]:
# Candidate hyper-parameters for the random forest; the grid search evaluates
# every (n_estimators, max_depth) combination.
depth = np.array([8, 10, 12, 14])
estimators = np.array([10, 25, 50, 100, 250, 500])
model = RandomForestClassifier()
search_space = {'n_estimators': estimators, 'max_depth': depth}
grid = GridSearchCV(estimator=model, param_grid=search_space)

In [79]:
# Run the grid search on the combined training features and their labels.
# The fitted search object is the cell's value, hence the repr shown as output.
grid.fit(all_features, train_labels)

GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': array([ 10,  25,  50, 100, 250, 500]), 'max_depth': array([ 8, 10, 12, 14])},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [80]:
# Report the best cross-validation score and the parameters that achieved it.
best_model = grid.best_estimator_
print("Best score = {}".format(grid.best_score_))
print("N_estimators: {}".format(best_model.n_estimators))
print("Max depth: {}".format(best_model.max_depth))

Best score = 0.31277777777777777
N_estimators: 500
Max depth: 14


In [81]:
%%time
rf = RandomForestClassifier(n_estimators=grid.best_estimator_.n_estimators, max_depth=grid.best_estimator_.max_depth)
rf.fit(all_features, train_labels)

CPU times: user 3min 35s, sys: 538 ms, total: 3min 35s
Wall time: 3min 35s


In [82]:
# Per-box colour, edge and corner features for the test images, using BOX-pixel
# grid boxes (first pass).
first_pass_test = get_complete_features(class_list, test, BOX)
(all_red_features_t, all_green_features_t, all_blue_features_t,
 all_edge_features_t, all_corner_features_t, test_labels) = first_pass_test

In [83]:
# Apply the training-time PCA to each BOX-pass test feature group when it is wider
# than PCA_components; otherwise use the group unchanged.
# NOTE(review): by the time this cell runs, red_pca / green_pca / blue_pca / edge_pca /
# corner_pca were last assigned in the 16-pixel training pass, so if any branch below
# is taken the PCA was fitted on a different number of columns than these arrays - confirm.
final_red_t = (
    red_pca.transform(all_red_features_t)
    if all_red_features_t.shape[1] > PCA_components
    else all_red_features_t
)
final_green_t = (
    green_pca.transform(all_green_features_t)
    if all_green_features_t.shape[1] > PCA_components
    else all_green_features_t
)
final_blue_t = (
    blue_pca.transform(all_blue_features_t)
    if all_blue_features_t.shape[1] > PCA_components
    else all_blue_features_t
)
final_edge_t = (
    edge_pca.transform(all_edge_features_t)
    if all_edge_features_t.shape[1] > PCA_components
    else all_edge_features_t
)
final_corner_t = (
    corner_pca.transform(all_corner_features_t)
    if all_corner_features_t.shape[1] > PCA_components
    else all_corner_features_t
)

In [84]:
# Join the BOX-pass test groups column-wise: red | green | blue | edge | corner.
first_pass_test_groups = (
    final_red_t,
    final_green_t,
    final_blue_t,
    final_edge_t,
    final_corner_t,
)
final_test_features_1 = np.concatenate(first_pass_test_groups, axis=1)

In [85]:
# Second pass over the test images with 16-pixel grid boxes; rebinds every
# all_*_features_t name and test_labels.
second_pass_test = get_complete_features(class_list, test, 16)
(all_red_features_t, all_green_features_t, all_blue_features_t,
 all_edge_features_t, all_corner_features_t, test_labels) = second_pass_test

In [86]:
# Apply the training-time PCA to each 16-pixel-pass test feature group when it is
# wider than PCA_components; otherwise use the group unchanged.
final_red_t = (
    red_pca.transform(all_red_features_t)
    if all_red_features_t.shape[1] > PCA_components
    else all_red_features_t
)
final_green_t = (
    green_pca.transform(all_green_features_t)
    if all_green_features_t.shape[1] > PCA_components
    else all_green_features_t
)
final_blue_t = (
    blue_pca.transform(all_blue_features_t)
    if all_blue_features_t.shape[1] > PCA_components
    else all_blue_features_t
)
final_edge_t = (
    edge_pca.transform(all_edge_features_t)
    if all_edge_features_t.shape[1] > PCA_components
    else all_edge_features_t
)
final_corner_t = (
    corner_pca.transform(all_corner_features_t)
    if all_corner_features_t.shape[1] > PCA_components
    else all_corner_features_t
)

In [87]:
# Join the 16-pixel-pass test groups column-wise: red | green | blue | edge | corner.
second_pass_test_groups = (
    final_red_t,
    final_green_t,
    final_blue_t,
    final_edge_t,
    final_corner_t,
)
final_test_features_2 = np.concatenate(second_pass_test_groups, axis=1)

In [88]:
# Final test matrix: first-pass columns followed by second-pass columns, matching
# the column order of the training matrix all_features.
both_test_passes = (final_test_features_1, final_test_features_2)
final_test_features = np.concatenate(both_test_passes, axis=1)

In [89]:
%%time
# Predict a class label for every row of the test matrix with the fitted forest.
predicted_labels = rf.predict(final_test_features)

CPU times: user 959 ms, sys: 124 ms, total: 1.08 s
Wall time: 1.09 s


In [90]:
%%time
# Per-class probability estimates for every test row.
# NOTE(review): predicted_probabilities is not read by any later cell visible in this notebook.
predicted_probabilities = rf.predict_proba(final_test_features)

CPU times: user 854 ms, sys: 75.7 ms, total: 930 ms
Wall time: 931 ms


In [91]:
# Compare true and predicted test labels with the project helper; it returns three
# objects that the following cells display one by one.
# presumably: overall summary, per-class metrics, confusion matrix - verify against hc.get_metrics
overall, metrics, confusion = hc.get_metrics(test_labels, predicted_labels, class_list)

In [92]:
# Display the first object returned by hc.get_metrics (overall scores).
overall

Unnamed: 0,Accuracy,F1,Precision,Recall
Results,0.325,0.313,0.317,0.325


In [93]:
# Display the second object returned by hc.get_metrics (one row of scores per class).
metrics

Unnamed: 0,Precision,Recall,F1-Score,Support
pork_chop,0.29,0.29,0.29,250.0
lasagna,0.31,0.22,0.25,250.0
french_toast,0.38,0.26,0.31,250.0
guacamole,0.37,0.43,0.4,250.0
apple_pie,0.18,0.15,0.16,250.0
cheesecake,0.34,0.49,0.4,250.0
hamburger,0.33,0.22,0.26,250.0
fried_rice,0.35,0.47,0.4,250.0
carrot_cake,0.25,0.12,0.16,250.0
chocolate_cake,0.38,0.48,0.42,250.0


In [94]:
# Display the third object returned by hc.get_metrics (confusion table).
# NOTE(review): the saved output has 12 class columns (including steak and pizza) but
# only 10 rows - confirm that class_list matches the labels being predicted.
confusion

Unnamed: 0,pork_chop,lasagna,french_toast,guacamole,apple_pie,cheesecake,hamburger,fried_rice,carrot_cake,chocolate_cake,steak,pizza
pork_chop,73,11,18,11,12,5,7,12,7,23,57,14
lasagna,18,54,14,17,18,19,12,39,11,4,7,37
french_toast,24,23,66,14,11,11,6,25,10,15,11,34
guacamole,8,4,6,107,17,7,18,31,7,12,15,18
apple_pie,9,19,11,25,37,55,16,26,13,12,9,18
cheesecake,6,3,7,11,22,122,11,5,8,44,5,6
hamburger,15,15,4,26,28,24,54,18,5,21,13,27
fried_rice,9,9,5,29,16,10,9,118,4,1,2,38
carrot_cake,24,7,19,11,21,55,6,16,29,26,19,17
chocolate_cake,12,3,10,8,3,30,6,3,15,120,27,13


plt.style.use('ggplot')

fig, ax = plt.subplots(figsize=(20, 5))

# rf was trained on all_features = [all_features_1 | all_features_2], and each half is
# laid out as [red | green | blue | edge | corner]. The previous version took the group
# widths from final_*_features, which only describe the second pass, so bars were
# attributed to the wrong group and the x-axis stopped short of the last features.
importances = rf.feature_importances_
groups = [
    ('Red Features', 'red'),
    ('Green Features', 'green'),
    ('Blue Features', 'blue'),
    ('Edge Features', 'brown'),
    ('Corner Features', 'grey'),
]
pass_widths = [all_features_1.shape[1], all_features_2.shape[1]]
assert sum(pass_widths) == importances.shape[0], "importances do not match the training matrix"

start = 0
for pass_index, pass_width in enumerate(pass_widths):
    # Within one pass every group has the same number of columns: one per grid box, or
    # PCA_components each when the PCA branches ran (they share the same condition).
    assert pass_width % len(groups) == 0, "feature groups within a pass are not equally wide"
    group_width = pass_width // len(groups)
    for label, colour in groups:
        stop = start + group_width
        # Label each group once; underscore-prefixed labels are left out of the legend.
        legend_label = label if pass_index == 0 else '_nolegend_'
        ax.bar(np.arange(start, stop), importances[start:stop], color=colour, label=legend_label)
        start = stop

ax.set_title("Feature Importance")
ax.set_xlim(0, importances.shape[0])
ax.legend()
plt.show()