In [None]:
%lsmagic

### Setup, exploration

In [2]:
%matplotlib inline

import os, glob

import numpy as np
import matplotlib.pyplot as plt

from scipy import stats

from skimage import io

from skimage.filters import gabor
from skimage.feature import hog

from sklearn import model_selection

In [3]:
all_categories = [os.path.split(path)[-1] for path in glob.glob("50_categories/*")]
print(all_categories)
all_filenames = glob.glob("50_categories/*/*.jpg")

['gorilla', 'raccoon', 'crab', 'blimp', 'snail', 'airplanes', 'dog', 'dolphin', 'goldfish', 'giraffe', 'bear', 'killer-whale', 'penguin', 'zebra', 'duck', 'conch', 'camel', 'owl', 'helicopter', 'starfish', 'saturn', 'galaxy', 'goat', 'iguana', 'elk', 'hummingbird', 'triceratops', 'porcupine', 'teddy-bear', 'comet', 'hot-air-balloon', 'leopards', 'toad', 'mussels', 'kangaroo', 'speed-boat', 'bat', 'swan', 'octopus', 'frog', 'cormorant', 'unicorn', 'horse', 'skunk', 'mars', 'ostrich', 'goose', 'llama', 'snake', 'elephant']


In [71]:
subset_filenames = all_filenames[::5]
n_files_to_classify = len(subset_filenames)

In [72]:
def read_image(filename):
    img = io.imread(filename)
    print(f"Image shape: {img.shape}")
    return img

def split_into_rgb_channels(image):
    r = image[:,:,0]
    g = image[:,:,1]
    b = image[:,:,2]
    return r, g, b

In [None]:
filename = "50_categories/blimp/blimp_0014.jpg"
img = read_image(filename)
plt.imshow(img[:,:,2], cmap="gray")

### Functions that take an image and return a feature

In [97]:
def feat_imsize(img):
    return img.shape[:2]

def feat_chanratio(img, c1, c2):
    return np.mean(img[:,:,c1])/np.mean(img[:,:,c2]) if len(img.shape) > 2 else 1

def feat_quadrant(img, channel, func=np.max):
    h, w = img.shape[:2]
    half_h = h//2
    half_w = w//2
    ic = img[:,:,channel] if len(img.shape) > 2 else img
    qs = {1: func(ic[:half_h,:half_w]),
         2: func(ic[half_h:,:half_w]),
         3: func(ic[:half_h,half_w:]),
         4: func(ic[half_h:,half_w:])}
    return max(qs, key=lambda key: qs[key])

def feat_histogram(img, channel, func=stats.kurtosis):
    ic = img[:,:,channel] if len(img.shape) > 2 else img
    fd, h = np.histogram(ic)
    #print(ic.shape, h)
    return func(fd)
    
# smarter features
def feat_gabor(img, channel, f, func=np.max):
    ic = img[:,:,channel] if len(img.shape) > 2 else img
    f_real, f_imag = gabor(ic, frequency=f)
    return func(f_real)

def feat_hog(img, channel, func=np.count_nonzero):
    ic = img[:,:,channel] if len(img.shape) > 2 else img
    return func(hog(ic))
 
def feat_all(img):
    h, w = feat_imsize(img)
    n_ch = img.shape[2] if len(img.shape)>2 else 1
    rgavg = feat_chanratio(img, 0, 1)
    rbavg = feat_chanratio(img, 0, 2)
    gbavg = feat_chanratio(img, 1, 2)
    #rquadmax = feat_quadrant(img, 0)
    #gquadmax = feat_quadrant(img, 1)
    #bquadmax = feat_quadrant(img, 2)
    #rquadmin = feat_quadrant(img, 0, np.min)
    #gquadmin = feat_quadrant(img, 1, np.min)
    #bquadmin = feat_quadrant(img, 2, np.min)
    r_gabor = feat_gabor(img, 0, 0.4, np.count_nonzero)
    g_gabor = feat_gabor(img, 1, 0.4, np.count_nonzero)
    b_gabor = feat_gabor(img, 2, 0.4, np.count_nonzero)
#    r_gabor_1 = feat_gabor(img, 0, 0.2, np.count_nonzero)
#    g_gabor_1 = feat_gabor(img, 1, 0.2, np.count_nonzero)
#    b_gabor_1 = feat_gabor(img, 2, 0.2, np.count_nonzero)
#     rkurt = feat_histogram(img, 0)
#     gkurt = feat_histogram(img, 1)
#     bkurt = feat_histogram(img, 2)
    rskew = feat_histogram(img, 0, stats.skew)
    gskew = feat_histogram(img, 1, stats.skew)
    bskew = feat_histogram(img, 2, stats.skew)
    return np.array([h, w, n_ch, 
                     rgavg, rbavg, gbavg, 
                     #bquadmax, bquadmin, 
                     #rkurt/gkurt, rkurt/bkurt, 
                     rskew, gskew, bskew, 
                     r_gabor/b_gabor, r_gabor/g_gabor, b_gabor/g_gabor,
                     #r_gabor_1/b_gabor_1, r_gabor_1/g_gabor_1, b_gabor_1/g_gabor_1,
                     #r_gabor/r_gabor_1, b_gabor/b_gabor_1, g_gabor/g_gabor_1
                    ])

### Calculate features of all the images, then split into train and test

In [98]:
def build_xy(files, all_categories):
    n_files = len(files)
    eg_feats = feat_all(read_image(files[np.random.randint(n_files)]))
    n_feats = len(eg_feats)
    print(f"Will calculate {n_feats} features for {n_files} images. Feature vectors look like:\n{eg_feats}")
    x = np.empty((n_files, n_feats), dtype="float16")
    y = np.zeros(n_files)
    for i,f in enumerate(files):
        img = read_image(f)
        x[i, :] = feat_all(img)
        head, tail = os.path.split(f)
        _, target = os.path.split(head)
        y[i] = all_categories.index(target)
    return x, y

In [99]:
x, y = build_xy(subset_filenames, all_categories)
print(x.shape, y.shape)
print(y)

Image shape: (665, 1000, 3)
Will calculate 12 features for 849 images. Feature vectors look like:
[  6.65000000e+02   1.00000000e+03   3.00000000e+00   1.26943462e+00
   1.83528205e+00   1.44574760e+00   5.15701483e-01   9.58837392e-01
   1.21753995e+00   9.38710345e-01   1.45342184e+00   1.54831769e+00]
Image shape: (349, 319, 3)
Image shape: (225, 169, 3)
Image shape: (756, 500, 3)
Image shape: (548, 774, 3)
Image shape: (300, 400, 3)
Image shape: (400, 334)
Image shape: (589, 549, 3)
Image shape: (160, 243, 3)
Image shape: (585, 573, 3)
Image shape: (400, 600, 3)
Image shape: (466, 350, 3)
Image shape: (230, 218, 3)
Image shape: (266, 202, 3)
Image shape: (168, 220, 3)
Image shape: (192, 283, 3)
Image shape: (312, 208, 3)
Image shape: (192, 288, 3)
Image shape: (401, 259, 3)
Image shape: (478, 720, 3)
Image shape: (237, 265, 3)
Image shape: (345, 234, 3)
Image shape: (202, 300, 3)
Image shape: (268, 346, 3)
Image shape: (268, 400, 3)
Image shape: (230, 186, 3)
Image shape: (325, 173

Image shape: (312, 266, 3)
Image shape: (382, 579, 3)
Image shape: (400, 600, 3)
Image shape: (225, 216, 3)
Image shape: (266, 190, 3)
Image shape: (164, 265, 3)
Image shape: (313, 480, 3)
Image shape: (600, 800, 3)
Image shape: (372, 250, 3)
Image shape: (484, 689, 3)
Image shape: (375, 500, 3)
Image shape: (490, 402, 3)
Image shape: (142, 213, 3)
Image shape: (400, 275, 3)
Image shape: (299, 450, 3)
Image shape: (467, 700, 3)
Image shape: (480, 640, 3)
Image shape: (219, 286, 3)
Image shape: (360, 400, 3)
Image shape: (512, 768, 3)
Image shape: (267, 400, 3)
Image shape: (456, 671, 3)
Image shape: (576, 510, 3)
Image shape: (407, 392, 3)
Image shape: (250, 381, 3)
Image shape: (441, 350, 3)
Image shape: (200, 200, 3)
Image shape: (195, 240, 3)
Image shape: (393, 422, 3)
Image shape: (352, 504, 3)
Image shape: (443, 520, 3)
Image shape: (360, 434, 3)
Image shape: (457, 640, 3)
Image shape: (383, 478, 3)
Image shape: (450, 338, 3)
Image shape: (280, 350, 3)
Image shape: (357, 400, 3)
I

Image shape: (274, 300, 3)
Image shape: (204, 300, 3)
Image shape: (300, 225, 3)
Image shape: (300, 245, 3)
Image shape: (254, 300, 3)
Image shape: (300, 258, 3)
Image shape: (466, 700, 3)
Image shape: (203, 203, 3)
Image shape: (259, 216, 3)
Image shape: (324, 500, 3)
Image shape: (398, 607, 3)
Image shape: (202, 190, 3)
Image shape: (143, 250, 3)
Image shape: (238, 274, 3)
Image shape: (184, 400, 3)
Image shape: (443, 596, 3)
Image shape: (141, 234, 3)
Image shape: (235, 350, 3)
Image shape: (450, 564, 3)
Image shape: (600, 700, 3)
Image shape: (257, 350, 3)
Image shape: (378, 243, 3)
Image shape: (213, 183, 3)
Image shape: (497, 340, 3)
Image shape: (225, 280, 3)
Image shape: (333, 460, 3)
Image shape: (288, 264, 3)
Image shape: (565, 448, 3)
Image shape: (480, 640, 3)
Image shape: (230, 304, 3)
Image shape: (346, 520, 3)
Image shape: (464, 600, 3)
Image shape: (384, 256, 3)
Image shape: (970, 529, 3)
Image shape: (308, 288, 3)
Image shape: (364, 550, 3)
Image shape: (526, 800, 3)
I

In [None]:
np.any(np.isnan(x))

### Classifier building and testing

In [100]:
from sklearn.ensemble import RandomForestClassifier

In [101]:
# Create a random forest Classifier. By convention, clf means 'Classifier'
clf = RandomForestClassifier(n_jobs=3, random_state=0)

In [102]:
model_selection.cross_val_score(clf, x, y, cv=5)

array([ 0.2       ,  0.18644068,  0.19186047,  0.20625   ,  0.26666667])

### OH notes

1. Do GridSearchCV to figure out the correct hyperparameters
2. Small things like ratio of blue in top to blue in bottom, quadrant with highest blue pixel.
3. Moments of a histogram, kurtosis of histogram
4. Make class, in __init__ you load it in, etc, build the features