In [1]:
import numpy as np
import os
import cv2
import pickle
import matplotlib.pyplot as plt

# Directories holding the training and test images. Both end with '/'
# because file paths are built by plain string concatenation
# (TRAIN_DIR + filename), not os.path.join.
TRAIN_DIR = 'F:/dogs-vs-cats/train/'
TEST_DIR = 'F:/dogs-vs-cats/test/'

In [2]:
# --- Preprocessing configuration ---------------------------------------------
IMAGE_SIZE = 150          # side length (pixels) of the square images fed downstream
CHANNELS = 3              # number of colour channels stored per image
pixel_depth = 255.0       # divisor used when normalising raw pixel values
training_fraction = 0.8   # share of the labelled images used for training

# os.listdir() returns entries in arbitrary order, so each directory is listed
# exactly once and sorted. Every list below is then derived from the same
# snapshot, which guarantees that test_ids[k] belongs to test_images[k].
train_files = sorted(os.listdir(TRAIN_DIR))
test_files = sorted(os.listdir(TEST_DIR))

# The class of a training image is encoded in its file name.
train_dogs = [TRAIN_DIR + name for name in train_files if 'dog' in name]
train_cats = [TRAIN_DIR + name for name in train_files if 'cat' in name]
test_images = [TEST_DIR + name for name in test_files]
# splitext() removes the extension whatever its length ('.jpg', '.jpeg', ...).
test_ids = [os.path.splitext(name)[0] for name in test_files]

TRAINING_AND_VALIDATION_SIZE_DOGS = len(train_dogs)
TRAINING_AND_VALIDATION_SIZE_CATS = len(train_cats)
TRAINING_AND_VALIDATION_SIZE_ALL = TRAINING_AND_VALIDATION_SIZE_DOGS + TRAINING_AND_VALIDATION_SIZE_CATS
TRAINING_SIZE = int(training_fraction * TRAINING_AND_VALIDATION_SIZE_ALL)
# Whatever is not used for training becomes the validation set.
VALID_SIZE = TRAINING_AND_VALIDATION_SIZE_ALL - TRAINING_SIZE
TEST_SIZE_ALL = len(test_images)

# All dogs first, then all cats; train_labels follows exactly the same order.
train_images = train_dogs + train_cats
train_labels = np.array((['dogs'] * TRAINING_AND_VALIDATION_SIZE_DOGS) +
                        (['cats'] * TRAINING_AND_VALIDATION_SIZE_CATS))

In [3]:
def read_image(file_path):
    """Load an image as RGB, scaled and padded to IMAGE_SIZE x IMAGE_SIZE.

    The longer side is resized to IMAGE_SIZE while the aspect ratio is kept;
    the remaining rows/columns are filled with black pixels at the bottom and
    on the right, so the picture content sits in the top-left corner.

    Args:
        file_path: path of the image file to read.

    Returns:
        Array of shape (IMAGE_SIZE, IMAGE_SIZE, 3) with channels in RGB order.

    Raises:
        IOError: if OpenCV cannot read or decode ``file_path``.
    """
    img = cv2.imread(file_path, cv2.IMREAD_COLOR)
    if img is None:
        # cv2.imread reports failure by returning None instead of raising;
        # fail here with the offending path rather than on img.shape below.
        raise IOError('Could not read image file: {}'.format(file_path))

    height, width = img.shape[:2]
    # max(1, ...) keeps the short side from rounding down to 0 pixels for
    # extremely elongated images, which cv2.resize would reject.
    if height >= width:
        new_height = IMAGE_SIZE
        new_width = max(1, int(round(IMAGE_SIZE * (float(width) / height))))
    else:
        new_height = max(1, int(round(IMAGE_SIZE * (float(height) / width))))
        new_width = IMAGE_SIZE

    # cv2.resize takes the target size as (width, height).
    img_resized = cv2.resize(img, (new_width, new_height), interpolation=cv2.INTER_CUBIC)

    # Pad bottom and right up to the full square. The fill colour is passed by
    # keyword: the positional slot after borderType is `dst`, not `value`.
    img_padded = cv2.copyMakeBorder(img_resized,
                                    0, IMAGE_SIZE - img_resized.shape[0],
                                    0, IMAGE_SIZE - img_resized.shape[1],
                                    cv2.BORDER_CONSTANT, value=(0, 0, 0))

    return img_padded[:, :, ::-1]  # cv2 uses BGR; reverse the channel axis to get RGB

def prep_data(images):
    """Read every image file and stack the normalised pixels into one array.

    Each file is loaded with ``read_image`` and its pixel values are mapped
    with ``(value - pixel_depth / 2) / pixel_depth``, i.e. a raw value of 0
    becomes -0.5 and a raw value of ``pixel_depth`` becomes +0.5.

    Args:
        images: sequence of image file paths.

    Returns:
        float32 array of shape (len(images), IMAGE_SIZE, IMAGE_SIZE, CHANNELS).
    """
    count = len(images)
    # Every slot is overwritten in the loop, so an uninitialised buffer is fine.
    data = np.empty((count, IMAGE_SIZE, IMAGE_SIZE, CHANNELS), dtype=np.float32)
    half_depth = pixel_depth / 2

    for i, image_file in enumerate(images):
        # Normalise all channels at once in float64; the result is cast to
        # float32 when it is stored into `data`.
        pixels = read_image(image_file).astype(np.float64)
        data[i] = (pixels - half_depth) / pixel_depth
    return data

In [4]:
# Load, resize and normalise every image. Each file is read from disk, so
# this is the slow step of the notebook.
train_normalized = prep_data(train_images)
test_normalized = prep_data(test_images)

# Report the resulting array shapes (these are the lines shown in the
# recorded output of this cell).
print("Train shape: {}".format(train_normalized.shape))
print("Test shape: {}".format(test_normalized.shape))

Train shape: (25000, 150, 150, 3)
Test shape: (12500, 150, 150, 3)


In [5]:
#randomly shuffle data
def randomize(dataset, labels, seed=None):
    """Shuffle ``dataset`` and ``labels`` with one shared random permutation.

    Row ``k`` of the returned dataset still corresponds to element ``k`` of
    the returned labels. The inputs are not modified; new arrays are returned.

    Args:
        dataset: array whose first axis indexes the samples (any rank >= 1).
        labels: array with one entry per sample along its first axis.
        seed: optional integer. When given, the permutation comes from a
            private ``RandomState(seed)`` and is reproducible; when ``None``
            the global ``np.random`` state is used.

    Returns:
        Tuple ``(shuffled_dataset, shuffled_labels)``.

    Raises:
        ValueError: if ``dataset`` and ``labels`` differ in number of samples.
    """
    if dataset.shape[0] != labels.shape[0]:
        # Without this check a longer dataset would silently lose samples,
        # because the permutation only covers len(labels) indices.
        raise ValueError(
            'dataset has {} samples but labels has {}'.format(
                dataset.shape[0], labels.shape[0]))

    rng = np.random if seed is None else np.random.RandomState(seed)
    permutation = rng.permutation(labels.shape[0])
    # Indexing only the first axis works for arrays of any rank.
    shuffled_dataset = dataset[permutation]
    shuffled_labels = labels[permutation]
    return shuffled_dataset, shuffled_labels

# Seed the global RNG that randomize() draws its permutation from, so the
# train/validation split is the same on every run of the notebook.
SPLIT_SEED = 42
np.random.seed(SPLIT_SEED)
train_dataset_rand, train_labels_rand = randomize(train_normalized, train_labels)

# The unshuffled array is no longer needed; drop the name to release it.
del train_normalized

# Split the shuffled data: the first VALID_SIZE samples form the validation
# set, the following TRAINING_SIZE samples form the training set.
valid_dataset = train_dataset_rand[:VALID_SIZE]
valid_labels = train_labels_rand[:VALID_SIZE]
train_dataset = train_dataset_rand[VALID_SIZE:VALID_SIZE + TRAINING_SIZE]
# NOTE: this rebinds train_labels (previously the full, unshuffled label
# array) and train_normalized was deleted above, so the earlier cells must be
# re-run before this cell can be executed a second time.
train_labels = train_labels_rand[VALID_SIZE:VALID_SIZE + TRAINING_SIZE]

# Sanity checks: every sample lands in exactly one split and images and
# labels stay paired.
assert len(valid_dataset) + len(train_dataset) == len(train_dataset_rand)
assert len(train_dataset) == len(train_labels)
assert len(valid_dataset) == len(valid_labels)

# Drop the temporary name for the full shuffled array.
del train_dataset_rand

print('Training', train_dataset.shape, train_labels.shape)
print('Validation', valid_dataset.shape, valid_labels.shape)
print('Test', test_normalized.shape)

Training (20000, 150, 150, 3) (20000,)
Validation (5000, 150, 150, 3) (5000,)
Test (12500, 150, 150, 3)


In [6]:
# Write each prepared array to its own .npy file.
for npy_path, array in (
        ('F:/dogs-vs-cats/train_dataset.npy', train_dataset),
        ('F:/dogs-vs-cats/train_labels.npy', train_labels),
        ('F:/dogs-vs-cats/valid_dataset.npy', valid_dataset),
        ('F:/dogs-vs-cats/valid_labels.npy', valid_labels),
        ('F:/dogs-vs-cats/test_dataset.npy', test_normalized),
        ('F:/dogs-vs-cats/test_ids.npy', test_ids),
):
    np.save(npy_path, array)