In [1]:
#-----------------------------------
# GLOBAL FEATURE EXTRACTION
#-----------------------------------
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import mahotas
import cv2
import os
import h5py

#--------------------
# tunable-parameters
#--------------------
# number of images read from every class sub-folder (files 1.jpg .. N.jpg)
images_per_class = 5000
# size handed to cv2.resize for every image before feature extraction
fixed_size = (768, 768)
# root folder containing one sub-folder per class label
train_path = "dataset/train"
# HDF5 output files: scaled feature matrix and encoded labels
h5_data = 'output/data.h5'
h5_labels = 'output/labels.h5'
# histogram bins per HSV channel (histogram length is bins ** 3)
bins = 5

In [2]:
# feature-descriptor-1: Hu Moments (shape)
def fd_hu_moments(image):
    """Return the Hu moments of `image` as a flat 1-D vector.

    `image` is converted from BGR to grayscale, its image moments are
    computed with cv2.moments, and the resulting cv2.HuMoments output is
    flattened.
    """
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    raw_moments = cv2.moments(gray)
    hu = cv2.HuMoments(raw_moments)
    return hu.flatten()

In [3]:
# feature-descriptor-2: Haralick Texture
def fd_haralick(image):
    """Return the Haralick texture descriptor of `image`.

    The BGR image is converted to grayscale, passed to
    mahotas.features.haralick, and the result is averaged over axis 0
    (presumably one row per co-occurrence direction -- see mahotas docs).
    """
    grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    texture = mahotas.features.haralick(grayscale)
    return texture.mean(axis=0)

In [4]:
# feature-descriptor-3: Color Histogram
def fd_histogram(image, mask=None):
    """Return a flattened, normalized 3-D HSV color histogram of `image`.

    Parameters
    ----------
    image : BGR image as returned by cv2.imread.
    mask : optional mask forwarded to cv2.calcHist; with the default None
        every pixel is counted. (Previously this argument was accepted but
        silently ignored.)

    Returns
    -------
    1-D array of length bins ** 3, where `bins` is the module-level
    bins-per-channel setting.
    """
    # convert the image to HSV color-space
    hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
    # compute the joint histogram over all three channels, honoring `mask`
    # NOTE(review): for 8-bit images OpenCV documents hue as 0..179, so the
    # top of the 0..256 hue range looks unused; left unchanged so feature
    # values stay comparable with previously extracted data -- confirm.
    hist = cv2.calcHist([hsv], [0, 1, 2], mask, [bins, bins, bins], [0, 256, 0, 256, 0, 256])
    # normalize the histogram in place
    cv2.normalize(hist, hist)
    # return the histogram as a flat feature vector
    return hist.flatten()

In [5]:
# get the training labels: one label per sub-folder of train_path.
# Only directories are kept so stray files in train_path (hidden files,
# notebook checkpoints, ...) are not mistaken for classes.
train_labels = [
    entry for entry in os.listdir(train_path)
    if os.path.isdir(os.path.join(train_path, entry))
]

# sort the training labels so the label order is deterministic
train_labels.sort()
print(train_labels)

# empty lists to hold feature vectors and labels
global_features = []
labels          = []

['lung_aca', 'lung_n', 'lung_scc']


In [6]:
# loop over the training data sub-folders (one sub-folder per class label)
for training_name in train_labels:
    # folder holding this class's images
    # (named class_dir rather than `dir` to avoid shadowing the built-in)
    class_dir = os.path.join(train_path, training_name)

    # the folder name doubles as the training label
    current_label = training_name

    # images are expected to be named 1.jpg .. <images_per_class>.jpg
    for x in range(1, images_per_class + 1):
        # build the image file name
        image_path = os.path.join(class_dir, str(x) + ".jpg")

        # cv2.imread does not raise on a missing/unreadable file, it returns
        # None; fail here with the offending path instead of a cryptic
        # cv2.resize error
        image = cv2.imread(image_path)
        if image is None:
            raise FileNotFoundError("could not read image: {}".format(image_path))

        # resize the image to a fixed size
        image = cv2.resize(image, fixed_size)

        ####################################
        # Global Feature extraction
        ####################################
        fv_hu_moments = fd_hu_moments(image)
        fv_haralick   = fd_haralick(image)
        fv_histogram  = fd_histogram(image)

        ###################################
        # Concatenate global features
        ###################################
        global_feature = np.hstack([fv_histogram, fv_haralick, fv_hu_moments])

        # update the list of labels and feature vectors
        labels.append(current_label)
        global_features.append(global_feature)

        if x % 100 == 0:
            print("Completed processing ", x, "images from type", training_name)
            # report the shapes from lengths instead of rebuilding the whole
            # array on every status line (same printed text, no O(n) copy)
            print("[STATUS] feature vector size {}".format((len(global_features), len(global_feature))))
            print("[STATUS] training Labels {}".format((len(labels),)))

    print("[STATUS] processed folder: {}".format(current_label))

print("[STATUS] completed Global Feature Extraction...")

Completed processing  100 images from type lung_aca
[STATUS] feature vector size (100, 145)
[STATUS] training Labels (100,)
Completed processing  200 images from type lung_aca
[STATUS] feature vector size (200, 145)
[STATUS] training Labels (200,)
Completed processing  300 images from type lung_aca
[STATUS] feature vector size (300, 145)
[STATUS] training Labels (300,)
Completed processing  400 images from type lung_aca
[STATUS] feature vector size (400, 145)
[STATUS] training Labels (400,)
Completed processing  500 images from type lung_aca
[STATUS] feature vector size (500, 145)
[STATUS] training Labels (500,)
Completed processing  600 images from type lung_aca
[STATUS] feature vector size (600, 145)
[STATUS] training Labels (600,)
Completed processing  700 images from type lung_aca
[STATUS] feature vector size (700, 145)
[STATUS] training Labels (700,)
Completed processing  800 images from type lung_aca
[STATUS] feature vector size (800, 145)
[STATUS] training Labels (800,)
Complete

Completed processing  1600 images from type lung_n
[STATUS] feature vector size (6600, 145)
[STATUS] training Labels (6600,)
Completed processing  1700 images from type lung_n
[STATUS] feature vector size (6700, 145)
[STATUS] training Labels (6700,)
Completed processing  1800 images from type lung_n
[STATUS] feature vector size (6800, 145)
[STATUS] training Labels (6800,)
Completed processing  1900 images from type lung_n
[STATUS] feature vector size (6900, 145)
[STATUS] training Labels (6900,)
Completed processing  2000 images from type lung_n
[STATUS] feature vector size (7000, 145)
[STATUS] training Labels (7000,)
Completed processing  2100 images from type lung_n
[STATUS] feature vector size (7100, 145)
[STATUS] training Labels (7100,)
Completed processing  2200 images from type lung_n
[STATUS] feature vector size (7200, 145)
[STATUS] training Labels (7200,)
Completed processing  2300 images from type lung_n
[STATUS] feature vector size (7300, 145)
[STATUS] training Labels (7300,)


Completed processing  3100 images from type lung_scc
[STATUS] feature vector size (13100, 145)
[STATUS] training Labels (13100,)
Completed processing  3200 images from type lung_scc
[STATUS] feature vector size (13200, 145)
[STATUS] training Labels (13200,)
Completed processing  3300 images from type lung_scc
[STATUS] feature vector size (13300, 145)
[STATUS] training Labels (13300,)
Completed processing  3400 images from type lung_scc
[STATUS] feature vector size (13400, 145)
[STATUS] training Labels (13400,)
Completed processing  3500 images from type lung_scc
[STATUS] feature vector size (13500, 145)
[STATUS] training Labels (13500,)
Completed processing  3600 images from type lung_scc
[STATUS] feature vector size (13600, 145)
[STATUS] training Labels (13600,)
Completed processing  3700 images from type lung_scc
[STATUS] feature vector size (13700, 145)
[STATUS] training Labels (13700,)
Completed processing  3800 images from type lung_scc
[STATUS] feature vector size (13800, 145)
[S

In [7]:
# get the overall feature vector size
print("[STATUS] feature vector size {}".format(np.array(global_features).shape))

# get the overall training label size
print("[STATUS] training Labels {}".format(np.array(labels).shape))

# encode the target labels as integers
targetNames = np.unique(labels)
le          = LabelEncoder()
target      = le.fit_transform(labels)
print("[STATUS] training labels encoded...")

# scale features in the range (0-1)
# NOTE(review): the fitted scaler is not saved in this cell; scaling new
# images consistently would need the same fit -- confirm this is handled
# elsewhere.
scaler            = MinMaxScaler(feature_range=(0, 1))
rescaled_features = scaler.fit_transform(global_features)
print("[STATUS] feature vector normalized...")

print("[STATUS] target labels: {}".format(target))
print("[STATUS] target labels shape: {}".format(target.shape))

# make sure the output folders exist before opening the files for writing
for h5_path in (h5_data, h5_labels):
    out_dir = os.path.dirname(h5_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

# save the feature vector using HDF5; the context managers close the files
# even if writing a dataset fails
with h5py.File(h5_data, 'w') as h5f_data:
    h5f_data.create_dataset('dataset_1', data=np.array(rescaled_features))

with h5py.File(h5_labels, 'w') as h5f_label:
    h5f_label.create_dataset('dataset_1', data=np.array(target))

print("[STATUS] end of training..")

[STATUS] feature vector size (15000, 145)
[STATUS] training Labels (15000,)
[STATUS] training labels encoded...
[STATUS] feature vector normalized...
[STATUS] target labels: [0 0 0 ... 2 2 2]
[STATUS] target labels shape: (15000,)
[STATUS] end of training..
