**Use GPU: Runtime -> Change runtime type -> GPU (Hardware Accelerator)**

Setup

In [None]:
# print the contents of the per-user Keras configuration file
!cat ~/.keras/keras.json

{
    "epsilon": 1e-07, 
    "floatx": "float32", 
    "image_data_format": "channels_last", 
    "backend": "tensorflow"
}

HDF5

In [None]:
# import the necessary packages
import h5py
import os

class HDF5DatasetWriter:
  """Buffered writer that stores rows and their labels in an HDF5 file.

  Rows and labels are accumulated in an in-memory buffer and written to
  the file in slices once the buffer holds at least `bufSize` rows.

  The writer can be used as a context manager so the file is released
  even when the body of the `with` block raises:

      with HDF5DatasetWriter(dims, path) as writer:
          writer.add(rows, labels)
  """

  def __init__(self, dims, outputPath, dataKey="images", bufSize=1000):
    """Create the output file and its `dataKey` / "labels" datasets.

    dims       -- shape of the data dataset; dims[0] is also used as the
                  length of the labels dataset
    outputPath -- path of the HDF5 file to create
    dataKey    -- name of the dataset that receives the rows
    bufSize    -- number of buffered rows that triggers a flush

    Raises ValueError if `outputPath` already exists.
    """
    # check to see if the output path exists, and if so, raise
    # an exception
    if os.path.exists(outputPath):
      raise ValueError("The supplied `outputPath` already exists and cannot be overwritten. Manually delete the file before continuing.", outputPath)

    # open the HDF5 database for writing and create two datasets:
    # one to store the images/features and another to store the
    # class labels
    self.db = h5py.File(outputPath, "w")
    self.data = self.db.create_dataset(dataKey, dims, dtype="float")
    self.labels = self.db.create_dataset("labels", (dims[0],), dtype="int")

    # store the buffer size, then initialize the buffer itself
    # along with the index into the datasets
    self.bufSize = bufSize
    self.buffer = {"data": [], "labels": []}
    self.idx = 0

  def __enter__(self):
    """Return the writer itself so it can be bound by `with ... as`."""
    return self

  def __exit__(self, excType, excValue, traceback):
    """Release the file when leaving a `with` block.

    On a clean exit the remaining buffer is flushed via `close()`. If an
    exception is already propagating, the final flush is skipped (the
    buffer may be what failed) and only the file handle is closed. The
    exception is never suppressed.
    """
    if excType is None:
      self.close()
    else:
      self.db.close()
    return False

  def add(self, rows, labels):
    """Append `rows` and `labels` to the buffer, flushing when it is full."""
    # add the rows and labels to the buffer
    self.buffer["data"].extend(rows)
    self.buffer["labels"].extend(labels)

    # check to see if the buffer needs to be flushed to disk
    if len(self.buffer["data"]) >= self.bufSize:
      self.flush()

  def flush(self):
    """Write the buffered rows/labels at the current index and reset the buffer."""
    # the buffered entries occupy the slice [idx, idx + len(buffer))
    i = self.idx + len(self.buffer["data"])
    self.data[self.idx:i] = self.buffer["data"]
    self.labels[self.idx:i] = self.buffer["labels"]
    self.idx = i
    self.buffer = {"data": [], "labels": []}

  def storeClassLabels(self, classLabels):
    """Store the class label names in a "label_names" string dataset."""
    # create a dataset to store the actual class label names,
    # then store the class labels
    dt = h5py.special_dtype(vlen=str) # `vlen=unicode` for Py2.7
    labelSet = self.db.create_dataset("label_names", (len(classLabels),), dtype=dt)
    labelSet[:] = classLabels

  def close(self):
    """Flush any remaining buffered entries, then close the file.

    The file handle is closed even if the final flush raises, so a
    failed write does not leave the HDF5 file open.
    """
    try:
      # check to see if there are any other entries in the buffer
      # that need to be flushed to disk
      if len(self.buffer["data"]) > 0:
        self.flush()
    finally:
      # close the dataset
      self.db.close()

Extract Features (VGG16)

In [None]:
# import the necessary packages
from keras.applications import VGG16
from keras.applications import imagenet_utils
from keras.preprocessing.image import img_to_array
from keras.preprocessing.image import load_img
from sklearn.preprocessing import LabelEncoder
from imutils import paths
import numpy as np
import progressbar
import argparse
import random
import os

In [None]:
def extractFeatures_vgg16(dataset_filepath, outputFile_filepath, bs=32, buffer_size=1000):
    """Extract VGG16 features for every image under a directory into an HDF5 file.

    The image paths are shuffled before extraction, and the class label of
    each image is taken from the name of its parent directory.

    dataset_filepath    -- root directory that is searched for images
    outputFile_filepath -- HDF5 file to create (must not already exist)
    bs                  -- number of images passed through the network per batch
    buffer_size         -- number of feature rows buffered before writing to disk

    The output file is closed even if loading or prediction fails part-way
    through. A partially written file is left on disk in that case and has
    to be deleted manually before re-running.
    """
    # length of the flattened feature vector stored per image
    # (the network output reshaped to 512 * 7 * 7 values)
    featureDim = 512 * 7 * 7

    # grab the list of images that we'll be describing then randomly
    # shuffle them to allow for easy training and testing splits via
    # array slicing during training time
    print("[INFO] loading images...")
    imagePaths = list(paths.list_images(dataset_filepath))
    random.shuffle(imagePaths)

    # extract the class labels from the image paths then encode the
    # labels
    labels = [p.split(os.path.sep)[-2] for p in imagePaths]
    le = LabelEncoder()
    labels = le.fit_transform(labels)

    # load the VGG16 network
    print("[INFO] loading network...")
    model = VGG16(weights="imagenet", include_top=False)

    # initialize the HDF5 dataset writer
    dataset = HDF5DatasetWriter((len(imagePaths), featureDim), outputFile_filepath, dataKey="features", bufSize=buffer_size)

    try:
        # store the class label names in the dataset
        dataset.storeClassLabels(le.classes_)

        # initialize the progress bar
        widgets = ["Extracting Features: ", progressbar.Percentage(), " ", progressbar.Bar(), " ", progressbar.ETA()]
        pbar = progressbar.ProgressBar(maxval=len(imagePaths),
            widgets=widgets).start()

        # loop over the images in batches
        for i in np.arange(0, len(imagePaths), bs):
            # extract the batch of images and labels, then initialize the
            # list of actual images that will be passed through the network
            # for feature extraction
            batchPaths = imagePaths[i:i + bs]
            batchLabels = labels[i:i + bs]
            batchImages = []

            # loop over the images in the current batch
            for imagePath in batchPaths:
                # load the input image using the Keras helper utility
                # while ensuring the image is resized to 224x224 pixels
                image = load_img(imagePath, target_size=(224, 224))
                image = img_to_array(image)

                # preprocess the image by (1) expanding the dimensions and
                # (2) subtracting the mean RGB pixel intensity from the
                # ImageNet dataset
                image = np.expand_dims(image, axis=0)
                image = imagenet_utils.preprocess_input(image)

                # add the image to the batch
                batchImages.append(image)

            # pass the images through the network and use the outputs as
            # our actual features
            batchImages = np.vstack(batchImages)
            features = model.predict(batchImages, batch_size=bs)

            # reshape the features so that each image is represented by
            # a flattened feature vector of the `MaxPooling2D` outputs
            features = features.reshape((features.shape[0], featureDim))

            # add the features and labels to our HDF5 dataset
            dataset.add(features, batchLabels)
            pbar.update(i)
    finally:
        # close the dataset whether or not extraction succeeded, so the
        # HDF5 file handle is never leaked
        dataset.close()

    pbar.finish()

In [None]:
# extract VGG16 features for the animals dataset
extractFeatures_vgg16(
    dataset_filepath="drive/MyDrive/pyimagesearch/datasets/animals",
    outputFile_filepath="drive/MyDrive/pyimagesearch/output/21-feature-extraction/animals_features.hdf5",
)

[INFO] loading images...
[INFO] loading network...


Extracting Features: 100% |####################################| Time:  0:15:21


In [None]:
# extract VGG16 features for the flowers17 dataset
extractFeatures_vgg16(
    dataset_filepath="drive/MyDrive/pyimagesearch/datasets/flowers17",
    outputFile_filepath="drive/MyDrive/pyimagesearch/output/21-feature-extraction/flowers17_features.hdf5",
)

[INFO] loading images...
[INFO] loading network...


Extracting Features: 100% |####################################| Time:  0:06:09


Investigating the .hdf5 files (animals)

In [None]:
# open the file read-only with an explicit mode, and let the context
# manager close it once the shapes have been printed
with h5py.File("drive/MyDrive/pyimagesearch/output/21-feature-extraction/animals_features.hdf5", "r") as db:
    print("keys: {}".format(list(db.keys())))
    print("features: {}".format(db["features"].shape))
    print("labels: {}".format(db["labels"].shape))
    print("label_names: {}".format(db["label_names"].shape))

keys: ['features', 'label_names', 'labels']
features: (3000, 25088)
labels: (3000,)
label_names: (3,)


  """Entry point for launching an IPython kernel.


Train Linear Model

In [None]:
# import the necessary packages
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
import argparse
import pickle
import h5py

In [None]:
def train_linearModel_hdf5(db_filename, model_filename, jobs=-1):
    """Train and evaluate a logistic regression classifier on HDF5 features.

    The first 75% of the rows are used for training (with a 3-fold grid
    search over `C`) and the last 25% for evaluation. This assumes the rows
    were shuffled before being written to the file. A classification
    report is printed and the best estimator is pickled to disk.

    db_filename    -- HDF5 file with "features", "labels" and "label_names"
    model_filename -- path the pickled best estimator is written to
    jobs           -- `n_jobs` passed to the grid search
    """
    # open the HDF5 database for reading; the context manager closes it
    # even if training or evaluation raises
    with h5py.File(db_filename, "r") as db:
        # determine the index of the training and testing split, provided
        # that this data was already shuffled *prior* to writing it to disk
        i = int(db["labels"].shape[0] * 0.75) # initial 75% of data is training, last 25% is testing

        # define the set of parameters that we want to tune then start a
        # grid search where we evaluate our model for each value of C
        print("[INFO] tuning hyperparameters...")
        params = {"C": [0.1, 1.0, 10.0, 100.0, 1000.0, 10000.0]}
        model = GridSearchCV(LogisticRegression(solver="lbfgs", multi_class="auto"), params, cv=3, n_jobs=jobs)
        model.fit(db["features"][:i], db["labels"][:i])
        print("[INFO] best hyperparameters: {}".format(model.best_params_))

        # variable-length strings may come back from h5py as `bytes`
        # (depending on the h5py version); decode them so the report can
        # format the class names as text
        labelNames = [
            name.decode("utf-8") if isinstance(name, bytes) else str(name)
            for name in db["label_names"][:]
        ]

        # evaluate the model
        print("[INFO] evaluating...")
        preds = model.predict(db["features"][i:])
        print(classification_report(db["labels"][i:], preds, target_names=labelNames))

    # serialize the model to disk; the file is closed even if
    # pickling fails
    print("[INFO] saving model...")
    with open(model_filename, "wb") as f:
        f.write(pickle.dumps(model.best_estimator_))

In [None]:
# train and evaluate the classifier on the animals features
train_linearModel_hdf5(
    db_filename="drive/MyDrive/pyimagesearch/output/21-feature-extraction/animals_features.hdf5",
    model_filename="drive/MyDrive/pyimagesearch/output/21-feature-extraction/animals.cpickle",
)

[INFO] tuning hyperparameters...
[INFO] best hyperparameters: {'C': 100.0}
[INFO] evaluating...
              precision    recall  f1-score   support

        cats       0.97      0.99      0.98       245
        dogs       0.98      0.97      0.98       257
       panda       1.00      0.99      1.00       248

    accuracy                           0.98       750
   macro avg       0.98      0.98      0.98       750
weighted avg       0.98      0.98      0.98       750

[INFO] saving model...


In [None]:
# train and evaluate the classifier on the flowers17 features
train_linearModel_hdf5(
    db_filename="drive/MyDrive/pyimagesearch/output/21-feature-extraction/flowers17_features.hdf5",
    model_filename="drive/MyDrive/pyimagesearch/output/21-feature-extraction/flowers17.cpickle",
)

[INFO] tuning hyperparameters...
[INFO] best hyperparameters: {'C': 1000.0}
[INFO] evaluating...
              precision    recall  f1-score   support

    bluebell       0.91      1.00      0.95        20
   buttercup       0.95      0.88      0.91        24
   coltsfoot       1.00      0.84      0.91        19
     cowslip       0.62      0.76      0.68        17
      crocus       0.88      1.00      0.94        23
    daffodil       0.74      0.77      0.76        22
       daisy       1.00      0.85      0.92        13
   dandelion       0.90      1.00      0.95        19
  fritillary       0.96      0.96      0.96        23
        iris       1.00      0.88      0.93        16
  lilyvalley       0.92      0.96      0.94        24
       pansy       1.00      0.85      0.92        13
    snowdrop       0.75      0.95      0.84        19
   sunflower       1.00      1.00      1.00        26
   tigerlily       0.91      0.95      0.93        22
       tulip       0.94      0.67     