## Aim

GitHub Repo: [stanford_dogs](https://github.com/darthv115/ml-projects/tree/master/stanford_dogs)

This notebook pre-processes the input images from the Stanford Dogs dataset, which can be downloaded [here](http://vision.stanford.edu/aditya86/ImageNetDogs/).

The goal is to bring the data into a format that can be fed to a Convolutional Neural Network.

### Directory Structure

In [1]:
# List the contents of the data directory (IPython shell magic).
%ls ../data

[0m[01;34mAnnotation[0m/      [01;34mcropped_images[0m/  [01;34mfeatures[0m/  [01;34mlists[0m/
[01;34mcleaned_images[0m/  [01;34mdims[0m/            [01;34mImages[0m/


- Images/  
    Images of different breeds are in separate folders (The actual data)  
- Annotation/  
    Contains XML files with bounding box annotations for each image  
(Each bounding box marks the region of the image that contains a dog; one image can contain several boxes.)  
- lists/
    - file_list.mat - List of all files in the dataset
    - train_list.mat - List and labels of all training images in dataset
    - test_list.mat - List and labels of all test images in dataset
- features/  
    Contains the features of the network after training

In [1]:
# libraries
import scipy.io as sio   # for saving and loading mat files
import os
from xml.dom.minidom import parse    # XML DOM api
import matplotlib.pyplot as plt
import numpy as np
import cv2 as cv    # opencv for image-processing
import time    # for timing code blocks

# enable Optimization in case it's false with the following command
# cv.setUseOptimized(True)
print "OpenCV optimization:", cv.useOptimized()
# here, it is already true

# so that we can see the images in the notebook
%matplotlib inline

# for moving half the images
import shutil

OpenCV optimization: True


Just to show you how the xml data is formatted.

In [3]:
%cat /home/ashish/ml-projects/stanford_dogs/data/Annotation/n02104365-schipperke/n02104365_8998

<annotation>
	<folder>02104365</folder>
	<filename>n02104365_8998</filename>
	<source>
		<database>ImageNet database</database>
	</source>
	<size>
		<width>500</width>
		<height>333</height>
		<depth>3</depth>
	</size>
	<segment>0</segment>
	<object>
		<name>schipperke</name>
		<pose>Unspecified</pose>
		<truncated>0</truncated>
		<difficult>0</difficult>
		<bndbox>
			<xmin>139</xmin>
			<ymin>69</ymin>
			<xmax>305</xmax>
			<ymax>286</ymax>
		</bndbox>
	</object>
	<object>
		<name>schipperke</name>
		<pose>Unspecified</pose>
		<truncated>0</truncated>
		<difficult>0</difficult>
		<bndbox>
			<xmin>394</xmin>
			<ymin>113</ymin>
			<xmax>499</xmax>
			<ymax>230</ymax>
		</bndbox>
	</object>
</annotation>

So now, I will parse and extract the bounding box attributes, namely <code>xmin</code>, <code>xmax</code>, <code>ymin</code> and <code>ymax</code>.

For that, I will use the DOM API provided by Python. An alternative is the SAX API, which can be advantageous for very long documents. Since our documents are small and I am more familiar with the DOM tree and its peculiarities, I chose the DOM API.

In [159]:
# Demo on a single annotation file: pull out every labelled bounding box and
# store it as one "<name> <xmin> <ymin> <xmax> <ymax>" line under data/dims/.

# The data directory is resolved relative to the working directory (same
# expression as the later cells) instead of a machine-specific absolute path.
data_dir = os.path.abspath(os.path.join(os.path.dirname('Data_Preprocessing.ipynb'), '..', 'data'))
xml_path = os.path.join(data_dir, 'Annotation', 'n02104365-schipperke', 'n02104365_8998')

# Order in which the coordinates are read from <bndbox> and written to disk.
BOX_TAGS = ("xmin", "ymin", "xmax", "ymax")

domTree = parse(xml_path)
annotation = domTree.documentElement
# called `objs` because `object` is a Python builtin
objs = annotation.getElementsByTagName("object")

# One entry per <object>: [name, xmin, ymin, xmax, ymax]
breeds = []
for obj in objs:
    name = obj.getElementsByTagName("name")[0]
    bndbox = obj.getElementsByTagName("bndbox")[0]
    dims = [int(bndbox.getElementsByTagName(tag)[0].childNodes[0].data)
            for tag in BOX_TAGS]
    breeds.append([str(name.childNodes[0].data)] + dims)

print(breeds)

dir_name, file_name = os.path.split(xml_path)
breed_dir = os.path.basename(dir_name)

# data/dims/<breed folder>/ -- makedirs also creates data/dims/ itself when it
# does not exist yet (plain os.mkdir would raise OSError in that case).
dims_dir = os.path.join(data_dir, 'dims', breed_dir)
if not os.path.isdir(dims_dir):
    os.makedirs(dims_dir)

dims_file = os.path.join(dims_dir, file_name)

with open(dims_file, 'w') as outf:
    for breed in breeds:
        outf.write('{:s} {:d} {:d} {:d} {:d}\n'.format(*breed))

# Just checking if it went right: read the file back and print what it holds
print("\nFile contents:")
names, dims = [], []
with open(dims_file, 'r') as f:
    for line in f:
        e = line.rstrip('\n').split(' ')
        names.append(str(e[0]))
        dims.append([int(i) for i in e[1:]])
print("{} {}".format(names, dims))

[['schipperke', 139, 69, 305, 286], ['schipperke', 394, 113, 499, 230]]

File contents:
['schipperke', 'schipperke'] [[139, 69, 305, 286], [394, 113, 499, 230]]


In [178]:
# Looping over all the annotation files and storing the bounding boxes in a
# separate directory: data/dims/<breed folder>/<annotation file name>
data_dir = os.path.abspath(os.path.join(os.path.dirname('Data_Preprocessing.ipynb'), '..', 'data'))
annot_dir = os.path.join(data_dir, 'Annotation')
dims_root = os.path.join(data_dir, 'dims')

# Order in which the coordinates are read from <bndbox> and written to disk.
BOX_TAGS = ("xmin", "ymin", "xmax", "ymax")


def read_boxes(xml_path):
    """Return [[name, xmin, ymin, xmax, ymax], ...], one entry per <object> element.

    Raises whatever the XML parser or the int() conversion raises when the
    file is malformed or a tag is missing.
    """
    annotation = parse(xml_path).documentElement
    boxes = []
    # `object` is a Python builtin, hence the loop variable name `obj`
    for obj in annotation.getElementsByTagName("object"):
        name = obj.getElementsByTagName("name")[0]
        bndbox = obj.getElementsByTagName("bndbox")[0]
        coords = [int(bndbox.getElementsByTagName(tag)[0].childNodes[0].data)
                  for tag in BOX_TAGS]
        boxes.append([str(name.childNodes[0].data)] + coords)
    return boxes


j = 0    # number of dims files written
for breed_name in os.listdir(annot_dir):
    breed = os.path.join(annot_dir, breed_name)
    dims_dir = os.path.join(dims_root, breed_name)

    for xml in os.listdir(breed):
        xml_path = os.path.join(breed, xml)

        # Only the parsing step is best-effort. A bare `except:` around the
        # whole body used to hide file-system errors (e.g. a missing dims/
        # directory) behind the "Problem with the xml file" message.
        try:
            classes = read_boxes(xml_path)
        except Exception as err:
            print("Problem with the xml file: {} ({})".format(xml_path, err))
            continue

        # makedirs also creates data/dims/ itself on the first run
        if not os.path.isdir(dims_dir):
            os.makedirs(dims_dir)

        dims_file = os.path.join(dims_dir, xml)
        with open(dims_file, 'w') as outf:
            for cls in classes:
                outf.write('{:s} {:d} {:d} {:d} {:d}\n'.format(*cls))

        # Sanity check on the very first file: read it back and show it
        if j == 0:
            print("Case #{}".format(j))
            print("To write: {}".format(classes))

            names, dims = [], []
            with open(dims_file, 'r') as f:
                print("File contents:")
                for line in f:
                    e = line.rstrip('\n').split(' ')
                    names.append(str(e[0]))
                    dims.append([int(i) for i in e[1:]])
            print("{} {}".format(names, dims))

        j += 1

print("No. of files written: {}".format(j))

Case #0
To write: [['schipperke', 73, 10, 405, 498]]
File contents:
['schipperke'] [[73, 10, 405, 498]]
No. of files written: 20580


In [10]:
# Crop every annotated bounding box out of its source image and save it as
# data/cleaned_images/<annotation name>/<image stem>_<n>.jpg
data_dir = os.path.abspath(os.path.join(os.path.dirname('Data_Preprocessing.ipynb'), '..', 'data'))
imgs_dir = os.path.join(data_dir, 'Images')
cropped_dir = os.path.join(data_dir, 'cleaned_images')

if not os.path.exists(cropped_dir):
    os.mkdir(cropped_dir)

print('cropped dir: {}'.format(cropped_dir))

# Image counts at which a progress line is printed
PROGRESS_MARKS = (3000, 6000, 8000, 10000, 12000, 14000, 16000, 18000, 20000)

j = 0    # source images processed
k = 0    # cropped images written
st_time = time.time()

for img_dir in os.listdir(imgs_dir):
    breed = os.path.join(imgs_dir, img_dir)
    for img in os.listdir(breed):
        img_path = os.path.join(breed, img)
        image = cv.imread(img_path)
        if image is None:
            # cv.imread signals an unreadable file by returning None
            print("Could not read image: {}".format(img_path))
            continue

        img_file = img.split('.')[0]    # get rid of .jpg
        dims_file = os.path.join(data_dir, 'dims', img_dir, img_file)

        # Each line of the dims file is "<name> <xmin> <ymin> <xmax> <ymax>"
        names, boxes = [], []
        with open(dims_file, 'r') as f:
            for line in f:
                e = line.rstrip('\n').split(' ')
                names.append(str(e[0]))
                boxes.append([int(i) for i in e[1:]])

        for numImages, (name, box) in enumerate(zip(names, boxes), 1):
            xmin, ymin, xmax, ymax = box
            # image arrays are indexed [row, column], i.e. [y, x]
            cropped = image[ymin:ymax, xmin:xmax, :]

            cropped_input_file = img_file + '_' + str(numImages) + '.jpg'
            breed_dir = os.path.join(cropped_dir, name)
            if not os.path.exists(breed_dir):
                os.mkdir(breed_dir)

            cropped_file_path = os.path.join(breed_dir, cropped_input_file)
            cv.imwrite(cropped_file_path, cropped)
            k += 1

        # Count the image first so the progress line reports images that are
        # actually finished.
        j += 1
        if j in PROGRESS_MARKS:
            print("Cropped {} images into {} images.\n Elapsed time: {}" \
                  .format(j, k, time.time() - st_time))

print('Total Original images: {}'.format(j))
print('Total Cropped images: {}'.format(k))

cropped dir: /home/ashish/ml-projects/stanford_dogs/data/cleaned_images
Cropped 3000 images into 3248 images
Cropped 6000 images into 6441 images
Cropped 8000 images into 8651 images
Cropped 10000 images into 10825 images
Cropped 12000 images into 12971 images
Cropped 14000 images into 15144 images
Cropped 16000 images into 17283 images
Cropped 18000 images into 19405 images
Cropped 20000 images into 21512 images
Total Original images: 20580
Total Cropped images: 22126


In [4]:
# Write the class name of every annotated bounding box to
# data/cropped_images/classes.csv (one "<name>, " entry per box, in the order
# the images are listed).
data_dir = os.path.abspath(os.path.join(os.path.dirname('Data_Preprocessing.ipynb'), '..', 'data'))
imgs_dir = os.path.join(data_dir, 'Images')
cropped_dir = os.path.join(data_dir, 'cropped_images')

if not os.path.exists(cropped_dir):
    os.mkdir(cropped_dir)

print('cropped dir: {}'.format(cropped_dir))

cropped_output_path = os.path.join(cropped_dir, 'classes.csv')

j = 0    # images visited
k = 0    # never changed in this cell; kept only for the final report

# The file is opened once and in 'w' mode, so re-running the cell rewrites it
# instead of appending a second copy of every name. The images themselves are
# not needed here, so they are no longer decoded.
with open(cropped_output_path, 'w') as outf:
    for img_dir in os.listdir(imgs_dir):
        breed = os.path.join(imgs_dir, img_dir)
        for img in os.listdir(breed):
            img_file = img.split('.')[0]    # get rid of .jpg
            dims_file = os.path.join(data_dir, 'dims', img_dir, img_file)

            # first field of every dims line is the class name
            with open(dims_file, 'r') as f:
                for line in f:
                    name = str(line.rstrip('\n').split(' ')[0])
                    outf.write('{}, '.format(name))

            j += 1

print('Images cropped: {}'.format(j))
print('Images left: {}'.format(k))

cropped dir: /home/ashish/ml-projects/stanford_dogs/data/cropped_images
Images cropped: 20580
Images left: 0


### Creating a smaller dataset
Taking only the first 100 images from every class, because of resource constraints. :/

In [30]:
# Copy the first 100 cropped images of every class into
# data/smaller_dataset/<class>/
data_dir = os.path.abspath(os.path.join(os.path.dirname('Data_Preprocessing.ipynb'), '..', 'data'))

smaller_dataset_dir = os.path.join(data_dir, 'smaller_dataset')

if not os.path.exists(smaller_dataset_dir):
    os.mkdir(smaller_dataset_dir)

cleaned_imgs_dir = os.path.join(data_dir, 'cleaned_images')

IMAGES_PER_BREED = 100
# Copy counts at which a progress line is printed
PROGRESS_MARKS = (1000, 2000, 5000, 8000, 10000, 12000)

j = 0    # images copied
st_time = time.time()

for breed_name in sorted(os.listdir(cleaned_imgs_dir)):
    breed = os.path.join(cleaned_imgs_dir, breed_name)

    # os.listdir() returns names in arbitrary order; sorting makes
    # "the first 100" the same set of files on every run.
    imgs = sorted(os.listdir(breed))[:IMAGES_PER_BREED]

    # the class directory is only created when there is something to copy
    breed_dir = os.path.join(smaller_dataset_dir, breed_name)
    if imgs and not os.path.isdir(breed_dir):
        os.makedirs(breed_dir)

    for img in imgs:
        img_path = os.path.join(breed, img)
        out_img_path = os.path.join(breed_dir, img)
        shutil.copyfile(img_path, out_img_path)

        j += 1
        if j in PROGRESS_MARKS:
            print("Done with {}".format(j))

print("images: {}".format(j))
print("time: {}".format(time.time() - st_time))

Done with 1000
Done with 2000
Done with 5000
Done with 8000
Done with 10000
Done with 12000
images: 12000
time: 98.5753428936


### Splitting data for Keras

And now, Keras needs the data to be in another format. Wow! :/

First, to keep things fast, I split the smaller dataset created above into a training set (the first 70 images of every class) and a validation set (the remaining images).

In [37]:
# Split data/smaller_dataset into
#   data/keras_small_ds/train/<class>/       (first 70 images of each class)
#   data/keras_small_ds/validation/<class>/  (the remaining images)
data_dir = os.path.abspath(os.path.join(os.path.dirname('Data_Preprocessing.ipynb'), '..', 'data'))

keras_small_dir = os.path.join(data_dir, 'keras_small_ds')

if not os.path.exists(keras_small_dir):
    os.mkdir(keras_small_dir)

smaller_dataset_dir = os.path.join(data_dir, 'smaller_dataset')

TRAIN_PER_BREED = 70

j = 0    # images copied
st_time = time.time()

for breed_name in sorted(os.listdir(smaller_dataset_dir)):
    breed = os.path.join(smaller_dataset_dir, breed_name)

    # List each class once and sort it: os.listdir() order is arbitrary, so
    # slicing two separate listings would not guarantee a disjoint split.
    imgs = sorted(os.listdir(breed))
    splits = (('train', imgs[:TRAIN_PER_BREED]),
              ('validation', imgs[TRAIN_PER_BREED:]))

    for split_name, split_imgs in splits:
        breed_dir = os.path.join(keras_small_dir, split_name, breed_name)
        # makedirs also creates the intermediate train/ or validation/
        # directory, which plain os.mkdir cannot do
        if split_imgs and not os.path.isdir(breed_dir):
            os.makedirs(breed_dir)

        for img in split_imgs:
            img_path = os.path.join(breed, img)
            out_img_path = os.path.join(breed_dir, img)
            shutil.copyfile(img_path, out_img_path)
            j += 1

print("images: {}".format(j))
print("time: {}".format(time.time() - st_time))

images: 12000
time: 57.6521160603


### Creating a miniature dataset for testing
Because CPU :/  
With 6 train and 1 validation image per class

In [None]:
# Build a tiny dataset from data/smaller_dataset:
#   data/keras_mini_ds/train/<class>/       images [3:9] of each class
#   data/keras_mini_ds/validation/<class>/  image  [9:10] of each class
data_dir = os.path.abspath(os.path.join(os.path.dirname('Data_Preprocessing.ipynb'), '..', 'data'))

keras_mini_dir = os.path.join(data_dir, 'keras_mini_ds')

if not os.path.exists(keras_mini_dir):
    os.mkdir(keras_mini_dir)

smaller_dataset_dir = os.path.join(data_dir, 'smaller_dataset')

# Which part of each class listing goes into which split
SPLIT_SLICES = (('train', slice(3, 9)),
                ('validation', slice(9, 10)))

j = 0    # images copied
st_time = time.time()

for breed_name in sorted(os.listdir(smaller_dataset_dir)):
    breed = os.path.join(smaller_dataset_dir, breed_name)

    # List each class once and sort it: os.listdir() order is arbitrary, so
    # slicing two separate listings would not guarantee disjoint splits.
    imgs = sorted(os.listdir(breed))

    for split_name, split_slice in SPLIT_SLICES:
        split_imgs = imgs[split_slice]

        # makedirs creates the train/ or validation/ level as needed
        breed_dir = os.path.join(keras_mini_dir, split_name, breed_name)
        if split_imgs and not os.path.isdir(breed_dir):
            os.makedirs(breed_dir)

        for img in split_imgs:
            img_path = os.path.join(breed, img)
            out_img_path = os.path.join(breed_dir, img)
            shutil.copyfile(img_path, out_img_path)
            j += 1

print("images: {}".format(j))
print("time: {}".format(time.time() - st_time))

### Creating Distributed datasets
Because memory allocation constraints :/

In [5]:
# Spread the first 70 images of every class over seven training sets of 10:
#   data/keras_distributed_ds/train_1/<class>/ ... train_7/<class>/
data_dir = os.path.abspath(os.path.join(os.path.dirname('Data_Preprocessing.ipynb'), '..', 'data'))

# NOTE: the variable keeps the name used by the mini-dataset cell, but it
# points at the distributed dataset directory here.
keras_mini_dir = os.path.join(data_dir, 'keras_distributed_ds')

if not os.path.exists(keras_mini_dir):
    os.mkdir(keras_mini_dir)

smaller_dataset_dir = os.path.join(data_dir, 'smaller_dataset')

TRAIN_PER_BREED = 70
CHUNK_SIZE = 10

j = 0    # images copied
st_time = time.time()

for breed_name in sorted(os.listdir(smaller_dataset_dir)):
    breed = os.path.join(smaller_dataset_dir, breed_name)

    # List each class once and sort it: os.listdir() order is arbitrary, so
    # slicing a fresh listing per chunk would not guarantee disjoint chunks.
    imgs = sorted(os.listdir(breed))

    for i in range(0, TRAIN_PER_BREED, CHUNK_SIZE):
        chunk = imgs[i:i + CHUNK_SIZE]

        # floor division keeps the suffix an integer: train_1 ... train_7
        train_dir = 'train' + '_' + str(i // CHUNK_SIZE + 1)
        breed_dir = os.path.join(keras_mini_dir, train_dir, breed_name)
        if chunk and not os.path.isdir(breed_dir):
            os.makedirs(breed_dir)

        for img in chunk:
            img_path = os.path.join(breed, img)
            out_img_path = os.path.join(breed_dir, img)
            shutil.copyfile(img_path, out_img_path)
            j += 1

# Validation sets are not produced by this cell. The earlier notes here listed
# them as slices of each class listing: #1 = [70:80], #2 = [80:90],
# #3 = [90:100] (the last one copied into 'validation_3').

print("images: {}".format(j))
print("time: {}".format(time.time() - st_time))

images: 8400
time: 31.1456160545
