In [None]:
import os
import logging
logging.basicConfig(level=logging.INFO)
import glob
import cv2
import numpy as np
import tensorflow as tf

In [None]:
# Root folder of the dataset; the image sub-directories and the generated
# record files are all resolved relative to it.
data_dir = 'data'
# Target size handed to cv2.resize for every image.
size = (227, 227)

### Resize all images to the same size

In [None]:
def may_create_directory(path):
    """Make sure `path` exists as a directory.

    A regular file occupying `path` is deleted first. If the directory is
    still missing afterwards it is created (including missing parents).
    Every action taken is logged at INFO level.
    """
    blocked_by_file = os.path.isfile(path)
    if blocked_by_file:
        os.remove(path)
        logging.info('[create_directory] remove file: %s', path)

    # Nothing to do when the directory is already there.
    if os.path.isdir(path):
        logging.info('[create_directory] directory already exists: %s', path)
        return

    os.makedirs(path)
    logging.info('[create_directory] create directory: %s', path)

def may_resize_images(from_dir, to_dir, size):
    """Resize every ``*.jpg`` in `from_dir` and write the result into `to_dir`.

    Images whose target file already exists are skipped, so calling the
    function again only processes the files that are still missing.

    Args:
        from_dir: directory searched (non-recursively) for ``*.jpg`` files.
        to_dir: output directory; prepared with may_create_directory().
        size: target size passed to cv2.resize as its second argument.
    """
    logging.info('[resize_images] start')
    may_create_directory(to_dir)
    count = 0
    for image_path in glob.glob(os.path.join(from_dir, '*.jpg')):
        to_path = os.path.join(to_dir, os.path.basename(image_path))
        if os.path.isfile(to_path):
            continue

        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread reports an unreadable/corrupt file by returning None;
            # passing that on to cv2.resize would abort the whole run.
            logging.warning('[resize_images] skip unreadable image: %s', image_path)
            continue

        resized_image = cv2.resize(image, size)  # default interpolation: cv2.INTER_LINEAR
        if not cv2.imwrite(to_path, resized_image):
            logging.warning('[resize_images] failed to write: %s', to_path)
            continue

        count += 1
        # Report progress only when the counter actually advances; checking
        # it on every iteration repeats the same message for each skipped
        # file once count sits on a multiple of 1000.
        if count % 1000 == 0:
            logging.info('[resize_images] finish: count = %d', count)
    logging.info('[resize_images] finished: total_written_files = %d', count)

In [None]:
# Source folders holding the original training / test JPEGs.
from_train_dir = os.path.join(data_dir, 'train')
from_test_dir = os.path.join(data_dir, 'test1')

# Destination folders for the resized copies.
to_train_dir = os.path.join(data_dir, 'resized_train')
to_test_dir = os.path.join(data_dir, 'resized_test1')

# Resize the training set first, then the test set.
may_resize_images(from_train_dir, to_train_dir, size)
may_resize_images(from_test_dir, to_test_dir, size)

### Collect the image paths and their labels

In [None]:
def get_images_and_labels(dir, train_ratio, seed=None):
    """Collect the ``*.jpg`` paths in `dir`, label them and split them randomly.

    The label is derived from the file name: names starting with 'cat' get
    label 0, names starting with 'dog' get label 1.

    Args:
        dir: directory searched (non-recursively) for ``*.jpg`` files.
        train_ratio: fraction of the samples, in [0.0, 1.0], that goes to the
            training part; the remainder goes to the validation part.
        seed: optional seed for the shuffle. With the same seed and the same
            files the split is reproducible. When None (the default) the
            global NumPy random state is used.

    Returns:
        A tuple (train_images, train_labels, val_images, val_labels).
        The image lists contain file paths; the label lists contain
        np.uint8 values.

    Raises:
        ValueError: if `train_ratio` is outside [0.0, 1.0] or a file name
            starts with neither 'cat' nor 'dog'.
    """
    if not 0.0 <= train_ratio <= 1.0:
        raise ValueError('train_ratio must between 0.0 and 1.0')

    # glob returns files in filesystem order; sort them so that a fixed seed
    # always produces the same split.
    images = sorted(glob.glob(os.path.join(dir, '*.jpg')))
    labels = []
    for image_path in images:
        image_file_name = os.path.basename(image_path)
        if image_file_name.startswith('cat'):
            labels.append(0)    # cat is 0
        elif image_file_name.startswith('dog'):
            labels.append(1)    # dog is 1
        else:
            raise ValueError("image name must be starts with 'cat' or 'dog': {}".format(image_file_name))

    # Shuffle through an index permutation: paths and labels stay aligned and
    # the labels never have to be converted to strings and back.
    rng = np.random if seed is None else np.random.RandomState(seed)
    order = rng.permutation(len(images))

    # split dataset into train-part and val-part
    train_num = int(len(images) * train_ratio)
    train_idx = order[:train_num]
    val_idx = order[train_num:]

    labels_arr = np.array(labels, dtype=np.uint8)
    train_images = [images[i] for i in train_idx]
    train_labels = list(labels_arr[train_idx])
    val_images = [images[i] for i in val_idx]
    val_labels = list(labels_arr[val_idx])
    return (train_images, train_labels, val_images, val_labels)

In [None]:
# Split the resized training images: 96% go to the training part,
# the remaining samples form the validation part.
train_val_dir = to_train_dir
train_ratio = 0.96
train_images, train_labels, val_images, val_labels = get_images_and_labels(train_val_dir, train_ratio)
# Log the size of each part as a quick sanity check.
logging.info('len(train_images) = %d', len(train_images))
logging.info('len(train_labels) = %d', len(train_labels))
logging.info('len(val_images) = %d', len(val_images))
logging.info('len(val_labels) = %d', len(val_labels))

### Create the train and val TFRecord files

In [None]:
def create_record(images, labels, save_dir, file_name):
    """Serialize images and their labels into a single TFRecord file.

    Every example written holds exactly two features:
      * 'image_raw': the bytes of the pixel array returned by cv2.imread
        (the decoded image, not the encoded content of the .jpg file);
      * 'label': the label as a single int64 value.

    Args:
        images: sequence of image file paths.
        labels: sequence of labels, same length as `images`; each one is
            converted with int().
        save_dir: directory the record file is written to.
        file_name: name of the record file inside `save_dir`.

    Raises:
        IOError: if an image cannot be read by cv2.imread.
    """
    assert(len(images) == len(labels))
    path = os.path.join(save_dir, file_name)
    # Counted separately from the loop variable so the final log line also
    # works when `images` is empty.
    total = 0
    writer = tf.python_io.TFRecordWriter(path)
    logging.info('[create_record] start: write to %s', path)
    try:
        for image_path, raw_label in zip(images, labels):
            image = cv2.imread(image_path)
            if image is None:
                # cv2.imread returns None instead of raising on a bad file.
                raise IOError('[create_record] cannot read image: {}'.format(image_path))
            # tobytes() replaces the deprecated ndarray.tostring() alias.
            image_raw = image.tobytes()
            label = int(raw_label)
            example = tf.train.Example(features=tf.train.Features(
                feature={
                    'image_raw': tf.train.Feature(bytes_list=tf.train.BytesList(value=[image_raw])),
                    'label': tf.train.Feature(int64_list=tf.train.Int64List(value=[label]))
                }
            ))
            writer.write(example.SerializeToString())
            total += 1
            if total % 1000 == 0:
                logging.info('[create_record] finish: count = %d', total)
    finally:
        # Release the file handle even if reading or writing fails midway.
        writer.close()
    logging.info('[create_record] finished: total_examples = %d', total)

In [None]:
# Write one record file per split into the dataset root folder.
create_record(images=train_images, labels=train_labels,
              save_dir=data_dir, file_name='train.record')
create_record(images=val_images, labels=val_labels,
              save_dir=data_dir, file_name='val.record')