In [1]:
import os.path
import warnings

import numpy as np

import keras.backend as K
from keras.preprocessing.image import (
    Iterator, ImageDataGenerator, array_to_img, img_to_array, load_img)

Using TensorFlow backend.


In [None]:
def train_test_split(X, y, test_size=0.2, random_state=None):
    """Shuffle `X` and `y` together and split them into train and test parts.

    # Arguments
        X: Sequence of inputs; converted to a numpy array.
        y: Sequence of targets aligned with `X`; converted to a numpy array.
        test_size: Fraction of the samples placed in the test part. The test
            part holds `int(len(X) * test_size)` samples.
        random_state: Optional integer seed. When given, the shuffle is drawn
            from a private `np.random.RandomState(random_state)` so the split
            is reproducible. When `None`, numpy's global random state is used.

    # Returns
        `(X_train, y_train), (X_test, y_test)` as numpy arrays.

    # Raises
        ValueError: If `X` and `y` do not have the same length.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    if len(X) != len(y):
        raise ValueError('X and y must have the same length, got {} and {}.'
                         .format(len(X), len(y)))

    # A private RandomState keeps a seeded split from disturbing (or being
    # disturbed by) other users of numpy's global generator.
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    order = rng.permutation(len(X))
    X, y = X[order], y[order]

    # The first `n_test` shuffled samples form the test part.
    n_test = int(len(X) * test_size)
    return (X[n_test:], y[n_test:]), (X[:n_test], y[:n_test])


def get_image_inputs_targets(df, target_columns, directory):
    """Return the file names found in `directory` and their target values.

    # Arguments
        df: DataFrame whose index holds image file names relative to
            `directory`.
        target_columns: Column label, or list of column labels, to return
            as targets.
        directory: Directory the file names are resolved against.

    # Returns
        `(filenames, targets)`: the index values and the `target_columns`
        values of the rows whose file exists on disk.
    """
    # A plain boolean ndarray is accepted as a row mask by every pandas
    # version, unlike the Index that `df.index.map` returns.
    exists = np.array(
        [os.path.exists(os.path.join(directory, filename)) for filename in df.index],
        dtype=bool)

    # `.ix` was removed in pandas 1.0; `.loc` takes the boolean mask directly.
    data_df = df.loc[exists, target_columns]

    return data_df.index.values, data_df.values

def load_index_targets(labels_df, target_col, directory, is_multi_classed=False):
    """Return the usable file names and their integer labels.

    A row is kept when its file exists in `directory` and its `target_col`
    value is not null.

    # Arguments
        labels_df: DataFrame whose index holds image file names relative to
            `directory`.
        target_col: Name of the column holding the labels.
        directory: Directory the file names are resolved against.
        is_multi_classed: When true, each label is a comma-separated string
            that is parsed into a list of ints; otherwise each label is cast
            to int.

    # Returns
        `(inputs, targets)`: two tuples of equal length holding the file
        names and their labels. Both are empty when no row qualifies.
    """
    # Plain boolean ndarrays keep the `&` below element-wise on every pandas
    # version (Index `&` used to mean set intersection).
    file_exists = np.array(
        [os.path.exists(os.path.join(directory, filename))
         for filename in labels_df.index],
        dtype=bool)
    has_label = labels_df[target_col].notnull().values

    # `.ix` was removed in pandas 1.0; `.loc` takes the boolean mask directly.
    selected = labels_df.loc[file_exists & has_label, target_col]

    if is_multi_classed:
        labels = selected.map(lambda values: [int(val) for val in values.split(",")])
    else:
        labels = selected.astype(int)

    data = labels.to_dict()
    if not data:
        # `zip(*{}.items())` yields nothing, so unpacking it would raise.
        return (), ()
    inputs, targets = zip(*data.items())
    return inputs, targets


class ImageFilesIterator(Iterator):
    """Iterator yielding batches of images read from an explicit list of files.

    The files to read and their labels are passed in directly as `filenames`
    and `classes`; nothing is discovered by scanning `directory`.

    # Arguments
        directory: Path that every entry of `filenames` is joined to.
        filenames: Sequence of image file names, relative to `directory`.
        classes: Sequence of labels, where `classes[i]` is the label of
            `filenames[i]`. With `class_mode="categorical"` a label is used
            to index one row of the target matrix, so it may be a single
            integer or a list of integers (multi-label).
        image_data_generator: Object whose `random_transform` and
            `standardize` methods are applied to every loaded image.
        num_class: Number of columns of the categorical target matrix.
            Required when `class_mode="categorical"`.
        class_mode: Mode for yielding the targets:
            `"binary"`: labels cast to the backend float type,
            `"categorical"`: one row of `num_class` 0/1 integers per image,
            `"sparse"`: labels as given,
            anything else (e.g. `None`): only the images are yielded.
        target_size: tuple of integers, dimensions to resize input images to.
        color_mode: One of `"rgb"`, `"grayscale"`. Color mode to read images.
        batch_size: Integer, forwarded to the base `Iterator`.
        shuffle: Boolean, forwarded to the base `Iterator`.
        seed: Forwarded to the base `Iterator`.
        data_format: String, one of `channels_first`, `channels_last`.
            Defaults to `K.image_data_format()`.
        save_to_dir: Optional directory where every yielded picture is
            also saved, for debugging purposes.
        save_prefix: String prefix to use for saving sample
            images (if `save_to_dir` is set).
        save_format: Format to use for saving sample images
            (if `save_to_dir` is set).
        follow_links: Accepted for signature compatibility; not used.
    """

    def __init__(self, directory, filenames, classes,
                 image_data_generator,
                 num_class=None, class_mode='categorical',
                 target_size=(256, 256), color_mode='rgb',
                 batch_size=32, shuffle=True, seed=None,
                 data_format=None,
                 save_to_dir=None, save_prefix='', save_format='jpeg',
                 follow_links=False):

        if data_format is None:
            data_format = K.image_data_format()

        self.directory = directory
        self.filenames = filenames
        self.classes = classes
        self.num_class = num_class
        self.class_mode = class_mode
        self.samples = len(self.filenames)
        self.image_data_generator = image_data_generator
        self.target_size = tuple(target_size)

        if color_mode not in {'rgb', 'grayscale'}:
            raise ValueError('Invalid color mode:', color_mode,
                             '; expected "rgb" or "grayscale".')
        self.color_mode = color_mode
        self.data_format = data_format

        # The channel axis goes last or first depending on the data format.
        channels = (3,) if self.color_mode == 'rgb' else (1,)
        if self.data_format == 'channels_last':
            self.image_shape = self.target_size + channels
        else:
            self.image_shape = channels + self.target_size

        self.save_to_dir = save_to_dir
        self.save_prefix = save_prefix
        self.save_format = save_format

        super().__init__(self.samples, batch_size, shuffle, seed)

    def next(self):
        """For python 2.x.

        An image that cannot be loaded triggers a warning and leaves its row
        of the batch filled with zeros, so the batch keeps its shape and
        stays aligned with the labels.

        # Returns
            The next batch: `(batch_x, batch_y)`, or `batch_x` alone when
            `class_mode` is not one of `"sparse"`, `"binary"`,
            `"categorical"`.

        # Raises
            ValueError: If `class_mode` is `"categorical"` and `num_class`
                was not given.
        """
        # `lock` and `index_generator` are inherited from the base Iterator.
        with self.lock:
            index_array, current_index, current_batch_size = next(self.index_generator)
        # The transformation of images is not under thread lock
        # so it can be done in parallel
        batch_x = np.zeros((current_batch_size,) + self.image_shape, dtype=K.floatx())
        grayscale = self.color_mode == 'grayscale'
        # build batch of image data
        for i, idx in enumerate(index_array):
            path = os.path.join(self.directory, self.filenames[idx])
            try:
                img = load_img(path,
                               grayscale=grayscale,
                               target_size=self.target_size)
                x = img_to_array(img, data_format=self.data_format)
            except Exception as exc:
                # Best effort: keep going, but make the failure visible
                # instead of silently training on an all-zero image.
                warnings.warn('Could not load image {!r} ({}); its batch row '
                              'is left as zeros.'.format(path, exc))
                continue
            x = self.image_data_generator.random_transform(x)
            x = self.image_data_generator.standardize(x)
            batch_x[i] = x

        # optionally save augmented images to disk for debugging purposes
        if self.save_to_dir:
            for i in range(current_batch_size):
                img = array_to_img(batch_x[i], self.data_format, scale=True)
                fname = '{prefix}_{index}_{hash}.{format}'.format(prefix=self.save_prefix,
                                                                  index=current_index + i,
                                                                  hash=np.random.randint(1e4),
                                                                  format=self.save_format)
                img.save(os.path.join(self.save_to_dir, fname))

        # build batch of labels
        if self.class_mode not in ('sparse', 'binary', 'categorical'):
            return batch_x

        # Look labels up one index at a time so `classes` may be any indexable
        # sequence (tuple, list, ndarray), not only an ndarray.
        labels = [self.classes[idx] for idx in index_array]
        if self.class_mode == 'sparse':
            batch_y = np.asarray(labels)
        elif self.class_mode == 'binary':
            batch_y = np.asarray(labels).astype(K.floatx())
        else:
            if self.num_class is None:
                raise ValueError('`num_class` must be set when '
                                 'class_mode="categorical".')
            batch_y = np.zeros((len(batch_x), self.num_class), dtype=int)
            for i, label in enumerate(labels):
                # `label` may be one int or a list of ints (multi-label).
                batch_y[i, label] = 1
        return batch_x, batch_y
    

class ImageFilesGenerator(ImageDataGenerator):
    """`ImageDataGenerator` that can build an iterator over an explicit file list."""

    def flow_from_files(self, directory,
                        filenames,
                        classes, num_class=None,
                        target_size=(256, 256), color_mode='rgb',
                        class_mode='categorical',
                        batch_size=32, shuffle=True, seed=None,
                        save_to_dir=None,
                        save_prefix='',
                        save_format='jpeg',
                        follow_links=False,
                        files_iterator_cls=ImageFilesIterator):
        """Create a `files_iterator_cls` instance bound to this generator.

        `directory`, `filenames`, `classes` and this generator are passed
        positionally, in that order. Every other option is forwarded by
        keyword, together with this generator's own `data_format`.

        # Returns
            The object returned by calling `files_iterator_cls`.
        """
        # Gather the keyword options first so the call itself stays short.
        iterator_options = dict(
            num_class=num_class,
            target_size=target_size,
            color_mode=color_mode,
            class_mode=class_mode,
            data_format=self.data_format,
            batch_size=batch_size,
            shuffle=shuffle,
            seed=seed,
            save_to_dir=save_to_dir,
            save_prefix=save_prefix,
            save_format=save_format,
            follow_links=follow_links,
        )
        return files_iterator_cls(directory, filenames, classes, self,
                                  **iterator_options)


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline



In [3]:
import keras
# from keras_tqdm import TQDMNotebookCallback

# from utils import PlotLossNotebookCallback

In [None]:
from loaders import ImageFilesGenerator, train_test_split, load_index_targets
from keras_preprocessing import preprocess_input_vgg16

In [14]:
# Keep only the image name and its ground-truth category, drop rows where
# either is missing, then expose the category as an integer column.
category_labels = (
    pd.read_csv("data/labels.csv")[["image", "category_GT"]]
    .dropna()
    .rename(columns={"category_GT": "category"})
    .astype({"category": int})
)

In [15]:
# Write the cleaned image/category table to disk without the row index, so
# the cell below can read it back.
category_labels.to_csv("data/category_labels.csv", index=False)

In [1]:
# Reload the saved table and print its column/dtype summary as a sanity check.
# NOTE(review): the recorded NameError shows this cell was executed in a fresh
# kernel before the `import pandas as pd` cell; run the import cells first.
pd.read_csv("data/category_labels.csv").info()

NameError: name 'pd' is not defined