In [3]:
import glob
import os
import pickle
import shutil

import numpy as np
import pandas as pd
import skimage
from skimage import io, color, feature
from skimage import transform
from sklearn.decomposition import PCA
from tqdm import tqdm

In [5]:
# Folder of training JPEGs; passed to tidy_training_data further down.
TRAINING_DIR = '/data/smannan/train/train/'
# Expected number of images; checked by the shape assertions further down.
NUM_IMAGES = 25000
# Default number of principal components kept by extract_pca.
PCA_DIM = 100
# Output shape every image is resized to in tidy_training_data.
IMAGE_SIZE = (100, 100)
# Class names; a file's integer label is the index of its filename prefix here.
LABELS = ['cat', 'dog']
# NOTE(review): presumably a random-state seed — TODO confirm where it is consumed.
RS = 20150101

In [2]:
'''
Directory structure used to train and test neural nets
data/
    train_master/
        dog001.jpg
        dog002.jpg
        ...
        cat001.jpg
        cat002.jpg
        ...
    train/
        dog/
            dog001.jpg
            dog002.jpg
            ...
        cat/
            cat001.jpg
            cat002.jpg
            ...
    validation/
        dog/
            dog001.jpg
            dog002.jpg
            ...
        cat/
            cat001.jpg
            cat002.jpg
            ...
'''
def change_dir_structure(num_images,
                         master_dir='/data/smannan/train/train_master/',
                         seed=None):
    """Split the flat master image folder into train/ and validation/ class folders.

    The first ``num_images`` ``*.jpg`` files of ``master_dir`` (in sorted
    order) are considered.  ``int(num_images * 0.3)`` of them are drawn at
    random without replacement and copied to ``../validation/<label>/``; the
    rest are copied to ``../train/<label>/``.  ``<label>`` is ``cat`` when the
    filename prefix before the first dot is ``cat`` and ``dog`` otherwise.

    Fixes relative to the previous version:
      * validation images were copied into the *opposite* class folder
        (cats into ``validation/dog/`` and dogs into ``validation/cat/``);
      * the working directory of the kernel is no longer changed;
      * destination folders are created when missing, so ``shutil.copy``
        can never silently write a single file named ``cat``/``dog``;
      * asking for more images than exist raises a clear ``ValueError``
        up front instead of an ``IndexError`` half-way through copying.

    Parameters
    ----------
    num_images : int
        Number of images from ``master_dir`` to distribute.
    master_dir : str, optional
        Folder holding the flat ``<label>.<id>.jpg`` files.  The ``train`` and
        ``validation`` folders are siblings of this folder.
    seed : int or None, optional
        Seed for the validation draw; ``None`` gives a different split on
        every call (the previous behaviour).

    Raises
    ------
    ValueError
        If ``master_dir`` holds fewer than ``num_images`` JPEG files.
    """
    master_dir = os.path.abspath(master_dir)
    data_root = os.path.dirname(master_dir)

    # Sorted so that a given seed always selects the same files.
    files = sorted(glob.glob(os.path.join(master_dir, '*.jpg')))
    if num_images > len(files):
        raise ValueError('requested %d images but only %d found in %s'
                         % (num_images, len(files), master_dir))

    for split in ('train', 'validation'):
        for label in ('cat', 'dog'):
            dest = os.path.join(data_root, split, label)
            if not os.path.isdir(dest):
                os.makedirs(dest)

    # A permutation prefix is a uniform sample without replacement.
    n_validation = int(num_images * 0.3)
    rng = np.random.RandomState(seed)
    validation_idx = set(rng.permutation(num_images)[:n_validation].tolist())
    print(len(validation_idx))

    for idx in range(num_images):
        src = files[idx]
        prefix = os.path.basename(src).split('.')[0]
        label = 'cat' if prefix == 'cat' else 'dog'
        split = 'validation' if idx in validation_idx else 'train'
        shutil.copy(src, os.path.join(data_root, split, label))

In [4]:
def tidy_training_data(pathname, images, labels):
    """Read every JPEG in ``pathname`` as a resized grayscale array and cache it.

    Each ``*.jpg`` file is loaded in grayscale, resized to ``IMAGE_SIZE`` and
    paired with an integer label: the position of the filename prefix (text
    before the first dot) in ``LABELS``.  The image stack and label list are
    written with ``np.save`` to ``images`` and ``labels`` respectively.

    Files are processed in sorted order so that re-running produces
    identically ordered arrays, and both outputs are written inside a
    ``with`` block so the handles are closed even if saving fails.

    Parameters
    ----------
    pathname : str
        Folder containing the ``<label>.<id>.jpg`` files.  The process working
        directory is changed to this folder (as before), so relative
        ``images``/``labels`` paths resolve against it.
    images : str
        Output path for the image array.
    labels : str
        Output path for the label array.

    Raises
    ------
    ValueError
        If a filename prefix is not present in ``LABELS``.
    """
    os.chdir(pathname)
    files = sorted(glob.glob("*.jpg"))
    tidy_data = []
    labels_train = []

    for fname in tqdm(files):
        label = LABELS.index(fname.split('.')[0])
        # NOTE(review): newer scikit-image releases spell this keyword
        # `as_gray`; confirm against the installed version before upgrading.
        image = transform.resize(io.imread(fname, as_grey=True), IMAGE_SIZE)
        tidy_data.append(image)
        labels_train.append(label)

    # Open both targets before writing anything, as the original did.
    with open(images, 'wb') as images_file, open(labels, 'wb') as labels_file:
        np.save(images_file, tidy_data)
        np.save(labels_file, labels_train)
In [6]:
def extract_HOG(data, output):
    """Compute a HOG descriptor for every image in ``data`` and save them.

    Parameters
    ----------
    data : numpy.ndarray
        Stack of images indexed along the first axis.  Images may be 2-D
        (already grayscale) or 3-D (colour, converted with ``rgb2gray``).
    output : str
        Path the list of HOG vectors is written to with ``np.save``.
    """
    HOG_feat = []

    for i in tqdm(range(len(data))):
        image = data[i]
        # Only colour images need converting.  The cached training images are
        # already 2-D grayscale, and recent scikit-image versions reject 2-D
        # input to rgb2gray (older ones returned it unchanged).
        if np.ndim(image) == 3:
            image = color.rgb2gray(image)
        HOG_feat.append(feature.hog(image))

    # `with` guarantees the handle is closed even if np.save raises.
    with open(output, 'wb') as f:
        np.save(f, HOG_feat)

In [18]:
def extract_pca(data, output, ncomp=PCA_DIM, random_state=RS):
    """Project ``data`` onto its leading principal components and save the result.

    Parameters
    ----------
    data : array-like
        Samples to fit and transform, passed straight to ``PCA.fit_transform``.
    output : str
        Path the projected array is written to with ``np.save``.
    ncomp : int, optional
        Number of components to keep (defaults to ``PCA_DIM``).
    random_state : int or None, optional
        Seed forwarded to ``PCA`` (defaults to the notebook-wide ``RS``) so
        that a randomized SVD solver yields the same projection on each run.
    """
    pca = PCA(n_components=ncomp, random_state=random_state)
    projected = pca.fit_transform(data)
    # `with` guarantees the handle is closed even if np.save raises.
    with open(output, 'wb') as f:
        np.save(f, projected)

In [6]:
# Build the cached image/label arrays from the raw training JPEGs.
tidy_training_data(
    pathname=TRAINING_DIR,
    images='/home/smannan/finalproject/training_im_100x100.npy',
    labels='/home/smannan/finalproject/training_labels_100x100.npy',
)

In [8]:
# Load the cached image stack and labels.  Passing the path (rather than an
# open file object) lets NumPy close the file once the array is read.
data = np.load('/home/smannan/finalproject/training_im_100x100.npy')
labels = np.load('/home/smannan/finalproject/training_labels_100x100.npy')
# Flatten each image to one row of height * width pixels.  The previous
# shape[1] * shape[1] was only correct for square images.
data_reshape = data.reshape(data.shape[0], data.shape[1] * data.shape[2])

In [9]:
# Compute and cache HOG descriptors for the loaded image stack.
extract_HOG(
    data=data,
    output='/home/smannan/finalproject/training_hog_100x100.npy',
)

100%|██████████| 25000/25000 [02:09<00:00, 193.26it/s]


In [14]:
# Sanity-check the cached arrays against the configured dataset size.
# Both image dimensions are verified; the previous checks only compared
# against IMAGE_SIZE[0] and so never validated the width.
assert data.ndim == 3
assert data.shape[0] == NUM_IMAGES
assert tuple(data.shape[1:]) == tuple(IMAGE_SIZE)
assert data_reshape.shape == (NUM_IMAGES, IMAGE_SIZE[0] * IMAGE_SIZE[1])
assert labels.shape[0] == NUM_IMAGES

In [8]:
# Load the HOG features written by extract_HOG above.  The previous cell read
# 'training_hog_50x50.npy', a file this notebook never writes; every other
# artefact here is the 100x100 variant.  Passing the path lets NumPy close
# the file itself.
hog = np.load('/home/smannan/finalproject/training_hog_100x100.npy')

In [9]:
# One HOG row is expected per image.
n_hog_rows = hog.shape[0]
assert n_hog_rows == NUM_IMAGES

In [11]:
# Project the flattened images with PCA and cache the result.
extract_pca(
    data=data_reshape,
    output='/home/smannan/finalproject/training_pca_100x100.npy',
)

In [12]:
# Load the cached PCA projection; passing the path (not an open file object)
# lets NumPy close the file once the array is read.
pca = np.load('/home/smannan/finalproject/training_pca_100x100.npy')

In [19]:
# The projection should hold one PCA_DIM-long row per image.
n_pca_rows, n_pca_cols = pca.shape[0], pca.shape[1]
assert n_pca_rows == NUM_IMAGES
assert n_pca_cols == PCA_DIM

In [6]:
def _count_jpgs(directory):
    """Return the number of ``*.jpg`` files directly inside ``directory``."""
    return len(glob.glob(os.path.join(directory, "*.jpg")))


# Count files through absolute paths instead of hopping the working directory.
test_cats = _count_jpgs('/data/smannan/train/validation/cat')
test_dogs = _count_jpgs('/data/smannan/train/validation/dog')

# Same integer arithmetic as the split in change_dir_structure; comparing an
# int count against the float 0.3 * NUM_IMAGES depends on rounding.
expected_validation = int(NUM_IMAGES * 0.3)
assert test_cats + test_dogs == expected_validation

train_cat = _count_jpgs('/data/smannan/train/train/cat')
train_dog = _count_jpgs('/data/smannan/train/train/dog')

assert train_cat + train_dog == NUM_IMAGES - expected_validation

# The original cell ended in the project folder; keep that side effect.
os.chdir('/home/smannan/finalproject/')