In [None]:
"""
References
1. https://www.analyticsvidhya.com/blog/2017/06/architecture-of-convolutional-neural-networks-simplified-demystified/

"""
import os
import numpy as np
import pandas as pd
import scipy
import sklearn
import cv2

print ('Imported the basic libs ...')

import keras
from keras.models import Sequential

In [None]:
import os
# import scipy.misc
import scipy.ndimage as spimg
import cv2
import sys
import numpy as np
import pickle
import joblib
import random
import matplotlib.pyplot as plt
%matplotlib inline

# SPLIT DATA INTO BATCHES
def get_data(dir_filenames, dir_path, label, filename, file_splits=4, filename_extension='.gz', obj_resize = (300,300)):
    """Read, resize and dump a directory of images in batches.

    The file list is cut into at most ``file_splits`` contiguous batches whose
    sizes differ by at most one, so every file is processed exactly once and
    no empty batch is produced. Each batch is read as grayscale, resized,
    stacked into one integer array and written with ``joblib.dump`` to
    ``filename + '_' + <batch number> + filename_extension``.

    Args:
        dir_filenames: Sequence of image file names (sliceable, e.g. a list).
        dir_path: Prefix concatenated directly in front of each file name, so
            it must end with a path separator.
        label: Value recorded once per processed image.
        filename: Path prefix of the dump files.
        file_splits: Maximum number of batches; must be >= 1.
        filename_extension: Suffix appended to every dump file name.
        obj_resize: Target size handed to ``cv2.resize`` as ``dsize``.

    Returns:
        A list holding ``label`` once for every image processed, across all
        batches (empty when ``dir_filenames`` is empty).

    Raises:
        ValueError: If ``file_splits`` is smaller than 1.
    """
    if file_splits < 1:
        raise ValueError('file_splits must be a positive integer')

    # Spread the remainder over the first batches instead of flooring the
    # batch size, which would either drop trailing files or leave an empty
    # extra batch.
    total_files = len(dir_filenames)
    base_size, remainder = divmod(total_files, file_splits)
    dir_filenames_lists = []
    start = 0
    for split_idx in range(file_splits):
        stop = start + base_size + (1 if split_idx < remainder else 0)
        if stop > start:
            dir_filenames_lists.append(dir_filenames[start:stop])
        start = stop

    print ('0. Total Number of files:', total_files)
    print ('0. Batches and their sizes:', [len(each) for each in dir_filenames_lists], '\n')

    labels_all = []
    dir_filenames_done = 0

    for j, dir_filenames_list in enumerate(dir_filenames_lists):
        print ('\n 0.  ---------------> BATCH NUM:', j+1, ' Total Images:', len(dir_filenames_list))
        print ('0. Sample Filenames:', dir_filenames_list[:10])

        ## READ OBJECTS
        objs = []
        for i, file in enumerate(dir_filenames_list):
            if i % 500 == 0:
                print ('1. Reading... ', i, '/', len(dir_filenames_list))
            # NOTE: scipy.ndimage.imread was removed in SciPy 1.2; on newer
            # SciPy replace this call with imageio.imread or
            # cv2.imread(path, cv2.IMREAD_GRAYSCALE).
            objs.append(spimg.imread(dir_path + file, flatten=True, mode='L'))

        ## PRINT INFO ON OBJECTS
        # randrange yields a valid 0-based index; randint(1, n) could return n
        # (out of range) and never returned 0.
        rand_idx = random.randrange(len(objs))
        print ('2. Total Images :', len(objs))
        # sys.getsizeof is shallow, but for an array owning its data it
        # includes the pixel buffer.
        print ('2. Single Image Size (bytes):', sys.getsizeof(objs[rand_idx]), '\n')

        ## RESIZE ABOVE OBJECTS
        tot_objs = len(objs)
        for i, obj in enumerate(objs):
            if i % 500 == 0:
                print ('3. Resizing...', i, '/', tot_objs)
            objs[i] = cv2.resize(obj, obj_resize, interpolation=cv2.INTER_NEAREST)

        print ('3. Image Size (resized) (bytes):', sys.getsizeof(objs[rand_idx]))

        ## PRINT INFO ON RESIZED OBJECTS
        # Builtin int is what the removed np.int alias pointed to.
        objs_numpy = np.array(objs, dtype=int)
        print ('4. Final Object Array (ByteSize):', objs_numpy.itemsize)
        print ('4. Final Object Array:', objs_numpy.shape, '\t Memory:', objs_numpy.nbytes/1024.0/1024.0, ' MB')
        print ('4. Final Object Array (Single Sample)', objs_numpy[rand_idx])

        ## STORE RESIZED OBJECTS
        filename_tmp = filename + '_' + str(j+1) + filename_extension
        with open(filename_tmp, 'wb') as handle:
            joblib.dump(objs_numpy, handle, compress=True)
        print ('5. Finished writing file : ', filename_tmp)

        labels_all.extend([label] * len(dir_filenames_list))
        dir_filenames_done += len(dir_filenames_list)
        print ('5. Total Files Done:', dir_filenames_done, '/', total_files)

    # Return the labels of every batch, not just the last one.
    return labels_all

# SAMPLE BATCHES
def get_data_sample(dir_filenames, dir_path, label, filename, obj_resize = (300,300), idx_data = 1000):
    """Read, resize and dump the first ``idx_data`` images of a directory.

    The selected images are read as grayscale, resized, stacked into one
    integer array and written to ``filename`` with ``joblib.dump``.

    Args:
        dir_filenames: Sequence of image file names (sliceable, e.g. a list).
        dir_path: Prefix concatenated directly in front of each file name, so
            it must end with a path separator.
        label: Value recorded once per processed image.
        filename: Path of the dump file.
        obj_resize: Target size handed to ``cv2.resize`` as ``dsize``.
        idx_data: Number of leading entries of ``dir_filenames`` to process.

    Returns:
        A list holding ``label`` once per processed image. When there is
        nothing to process an empty list is returned and no file is written.
    """
    sample_filenames = dir_filenames[:idx_data]
    if not sample_filenames:
        print ('0. No files to sample, nothing written.')
        return []

    objs = []
    for i, file in enumerate(sample_filenames):
        if i % 500 == 0:
            # Report progress against the sample size, not the whole listing.
            print ('1. Reading... ', i, '/', len(sample_filenames))
        # NOTE: scipy.ndimage.imread was removed in SciPy 1.2; on newer SciPy
        # replace this call with imageio.imread or
        # cv2.imread(path, cv2.IMREAD_GRAYSCALE).
        objs.append(spimg.imread(dir_path + file, flatten=True, mode='L'))

    # randrange yields a valid 0-based index; randint(1, n) could return n
    # (out of range) and never returned 0.
    rand_idx = random.randrange(len(objs))
    print ('2. Total Images :', len(objs))
    print ('2. Single Image Size (bytes):', sys.getsizeof(objs[rand_idx]), '\n')

    tot_objs = len(objs)
    for i, obj in enumerate(objs):
        if i % 500 == 0:
            print ('3. Resizing...', i, '/', tot_objs)
        objs[i] = cv2.resize(obj, obj_resize, interpolation=cv2.INTER_NEAREST)

    print ('3. Image Size (resized) (bytes):', sys.getsizeof(objs[rand_idx]))

    # Builtin int is what the removed np.int alias pointed to.
    objs_numpy = np.array(objs, dtype=int)
    print ('4. Final Object Array (ByteSize):', objs_numpy.itemsize)
    print ('4. Final Object Array:', objs_numpy.shape, '\t Memory:', objs_numpy.nbytes/1024.0/1024.0, ' MB')
    print ('4. Final Object Array', objs_numpy[rand_idx])

    with open(filename, 'wb') as handle:
        joblib.dump(objs_numpy, handle, compress=True)

    return [label] * len(objs)

## SAMPLE DATA
def view_sample_data(dir_filenames, dir_path, label, filename, obj_resize = (300,300), idx_data = 1000):
    """Show one randomly chosen image next to its resized version.

    The image is read as grayscale, resized with nearest-neighbour
    interpolation and both versions are drawn side by side, together with a
    printed size comparison.

    Args:
        dir_filenames: Sequence of image file names to choose from.
        dir_path: Prefix concatenated directly in front of the file name, so
            it must end with a path separator.
        label: Unused; kept so the signature matches the other loaders.
        filename: Unused; kept so the signature matches the other loaders.
        obj_resize: Target size handed to ``cv2.resize`` as ``dsize``.
        idx_data: Unused; kept so the signature matches the other loaders.

    Returns:
        None. When ``dir_filenames`` is empty a message is printed and no
        figure is created.
    """
    if not dir_filenames:
        print ('No images available to preview.')
        return

    # Index directly with a valid 0-based position. randint(1, n) skipped the
    # first file and, when it returned n, matched nothing and left the figure
    # empty.
    file = dir_filenames[random.randrange(len(dir_filenames))]

    f, axarr = plt.subplots(1,2, figsize=(11,11))

    print ('Image Name:', file)
    # NOTE: scipy.ndimage.imread was removed in SciPy 1.2; on newer SciPy
    # replace this call with imageio.imread or
    # cv2.imread(path, cv2.IMREAD_GRAYSCALE).
    obj = spimg.imread(dir_path + file, flatten=True, mode='L')
    sample = np.array(obj)
    print ('Sample data (Original): ', sample, sample.shape)
    axarr[0].imshow(sample, cmap = plt.cm.gray)

    # Keep the resized image in its own name instead of overwriting the
    # obj_resize parameter.
    resized = cv2.resize(obj, obj_resize, interpolation=cv2.INTER_NEAREST)
    print ('Sample Data (Resized)', resized)
    # Builtin int is what the removed np.int alias pointed to.
    resized_numpy = np.array(resized, dtype=int)
    print ('Sample Data (resized)', resized_numpy, resized_numpy.shape)
    axarr[1].imshow(resized_numpy, cmap = plt.cm.gray)

    print ('-------- BYTES -------')
    print ('Original Array (bytes):', sys.getsizeof(obj), ' MB:', sys.getsizeof(obj)/1024.0/1024.0)
    print ('Resized Array (bytes):', sys.getsizeof(resized), ' MB:', sys.getsizeof(resized)/1024.0/1024.0)
    print ('Resized Array (bytes) (numpy) (sys.getsizeof):', sys.getsizeof(resized_numpy))
    print ('Resized Array (bytes) (numpy) (np.nbytes):', resized_numpy.nbytes)
    

## CATS YO!
# Class 0. Kept commented out so only one class is processed per run.
# dir_path = '../data/train/cats/'
# dir_filenames = sorted(os.listdir(dir_path))
# label = 0
# dump_filename = 'data/cats/cats'
# os.makedirs(os.path.dirname(dump_filename), exist_ok=True)
## view_sample_data(dir_filenames, dir_path, label, dump_filename)
## labels_cats = get_data_sample(dir_filenames, dir_path, label, dump_filename)
# labels_cats = get_data(dir_filenames, dir_path, label, dump_filename, file_splits = 4)


## DOGS YO!
dir_path = '../data/train/dogs/'
# os.listdir returns names in arbitrary order; sorting keeps the batch
# contents reproducible between runs and machines.
dir_filenames = sorted(os.listdir(dir_path))
# Dogs are class 1 so they stay distinguishable from the cats (class 0).
label = 1
dump_filename = 'data/dogs/dogs'
# Make sure the dump directory exists before the batches are written.
os.makedirs(os.path.dirname(dump_filename), exist_ok=True)
# view_sample_data(dir_filenames, dir_path, label, dump_filename)
# labels_dogs = get_data_sample(dir_filenames, dir_path, label, dump_filename)
labels_dogs = get_data(dir_filenames, dir_path, label, dump_filename, file_splits = 4)

0. Total Number of files: 12500
0. Batches and their sizes: [3125, 3125, 3125, 3125, 0]
0.  ---------------> BATCH NUM: 1  Total Images: 3125
0. Sample Filenames: ['dog.0.jpg', 'dog.1.jpg', 'dog.10.jpg', 'dog.100.jpg', 'dog.1000.jpg', 'dog.10000.jpg', 'dog.10001.jpg', 'dog.10002.jpg', 'dog.10003.jpg', 'dog.10004.jpg']
1. Reading...  0 / 3125
1. Reading...  500 / 3125
1. Reading...  1000 / 3125
1. Reading...  1500 / 3125
1. Reading...  2000 / 3125
1. Reading...  2500 / 3125
1. Reading...  3000 / 3125
2. Total Images : 3125
2. Single Image Size (bytes): 664112 

3. Resizing... 0 / 3125
3. Resizing... 500 / 3125
3. Resizing... 1000 / 3125
3. Resizing... 1500 / 3125
3. Resizing... 2000 / 3125
3. Resizing... 2500 / 3125
3. Resizing... 3000 / 3125
3. Image Size (resized) (bytes): 360112
4. Final Object Array (ByteSize): 4
4. Final Object Array: (3125, 300, 300) 	 Memory: 1072.8836059570312  MB
4. Final Object Array (Single Sample) [[ 79  70  61 ..., 159 158 158]
 [ 98  92  87 ..., 160 160 15