## Data preprocessing and train/test splitting:

In [1]:
# import libraries
import os
from PIL import Image
import numpy as np
import shutil

First, we resize all images to 64x64x3 RGB images and save them in a new folder.

In [2]:
# locations of the original dataset and of the folder that receives the resized copies
raw_data_dir, resized_dir = (
    'raw_data/101_ObjectCategories/',
    'data/101_ObjectCategories/',
)

In [3]:
# function to resize every image found in one folder
def resize_images(input_dir, output_dir, new_size=64):
    """Resize all images in `input_dir` and save them as JPEGs in `output_dir`.

    Every regular file directly inside `input_dir` is opened with PIL,
    converted to RGB, resized to `new_size` x `new_size` pixels and written
    to `output_dir` under its original file name, encoded as JPEG.
    Subdirectories and hidden files (names starting with '.', e.g.
    '.DS_Store') are skipped.

    Parameters
    ----------
    input_dir : str
        Folder containing the source images.
    output_dir : str
        Folder that receives the resized images; created if it is missing.
    new_size : int, optional
        Edge length in pixels of the square output image (default 64).

    Raises
    ------
    OSError
        Propagated from PIL when a non-hidden file cannot be read as an
        image or the result cannot be written.
    """
    # create the destination once instead of re-checking it for every file
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    for img_file in os.listdir(input_dir):
        path = os.path.join(input_dir, img_file)
        # hidden files such as .DS_Store are not images; subfolders are ignored
        if img_file.startswith('.') or not os.path.isfile(path):
            continue

        new_path = os.path.join(output_dir, img_file)
        # the context manager closes the source file handle after each image
        with Image.open(path) as img:
            # convert first: JPEG cannot store alpha/palette modes, and
            # grayscale inputs would otherwise not end up with 3 channels
            rgb_img = img.convert('RGB')
            # LANCZOS is the filter formerly exposed as ANTIALIAS, a name
            # that was removed in Pillow 10
            img_resized = rgb_img.resize((new_size, new_size), Image.LANCZOS)
            # save() opens and writes the target path on its own
            img_resized.save(new_path, 'JPEG')

In [4]:
# resize all images to 64x64x3 RGB images
category_dirs = os.listdir(raw_data_dir)  # get folder names of each object category

# if images have not been resized, then resize them
# (the check is on the top-level folder only: delete `resized_dir` to force a
# fresh run, e.g. after an interrupted one)
if not os.path.exists(resized_dir):
    for cat in category_dirs:
        raw_cat_dir = os.path.join(raw_data_dir, cat)
        # only folders are categories; this skips .DS_Store and any other
        # stray file that would make os.listdir fail inside resize_images
        if os.path.isdir(raw_cat_dir):
            resize_images(raw_cat_dir, os.path.join(resized_dir, cat))

Next, we split the data into a training set (90%) and a test set (10%).

In [5]:
# output folders that receive the training and test images
train_dir, test_dir = 'data/train/', 'data/test/'

In [6]:
def move_files(files, origin_dir, dest_dir):
    """Copy each named file from `origin_dir` into `dest_dir`.

    Despite the name, files are copied with `shutil.copy`, so the originals
    stay in `origin_dir`. `dest_dir` is created the first time a file is
    about to be copied; when `files` is empty nothing is created.

    Parameters
    ----------
    files : iterable of str
        File names relative to `origin_dir`.
    origin_dir : str
        Folder the files are read from.
    dest_dir : str
        Folder the copies are written to.
    """
    for name in files:
        source = os.path.join(origin_dir, name)
        target = os.path.join(dest_dir, name)
        # make sure the destination folder is there before copying into it
        if os.path.exists(dest_dir) is False:
            os.makedirs(dest_dir)
        shutil.copy(source, target)

In [7]:
# split the data
SPLIT_SEED = 0        # fixed seed so the same split is produced on every run
TEST_FRACTION = 0.1   # share of each category that is held out for testing

# only folders are categories (skips stray files such as .DS_Store)
categories = sorted(cat for cat in os.listdir(resized_dir)
                    if os.path.isdir(os.path.join(resized_dir, cat)))

# Splitting on top of an existing split would copy a second assignment into
# the same folders, so images could end up in both train and test.
# Delete both folders to force a new split.
if not (os.path.exists(train_dir) or os.path.exists(test_dir)):
    rng = np.random.RandomState(SPLIT_SEED)

    for cat in categories:
        cat_dir = os.path.join(resized_dir, cat)
        # os.listdir order is arbitrary; sorting lets the seed alone
        # determine which files are selected
        image_files = sorted(os.listdir(cat_dir))

        # exact per-category split instead of an independent coin flip per file
        n_images = len(image_files)
        n_test = int(round(TEST_FRACTION * n_images))
        if n_images > 1:
            # keep at least one image on each side of the split
            n_test = min(max(n_test, 1), n_images - 1)

        order = rng.permutation(n_images)
        test_files = [image_files[i] for i in order[:n_test]]
        train_files = [image_files[i] for i in order[n_test:]]

        # move_files copies, so the resized originals stay in place
        move_files(train_files, cat_dir, os.path.join(train_dir, cat))
        move_files(test_files, cat_dir, os.path.join(test_dir, cat))