Modified from source: https://github.com/mprat/pascal-voc-python

In [1]:
%matplotlib inline

In [2]:
import os
import pandas as pd
from bs4 import BeautifulSoup
import voc_utils
from more_itertools import unique_everseen

import numpy as np
from scipy.misc import imread, imresize, imsave

In [3]:
# Dataset root. It can be overridden with the PASCAL_VOC_ROOT environment
# variable so the notebook runs on machines other than the one it was written
# on; the fallback is the original hard-coded location.
root_dir = os.environ.get(
    'PASCAL_VOC_ROOT',
    'C:/Users/xx/Downloads/PASCAL/VOCtrainval_11-May-2012/VOCdevkit/VOC2012/')
# Later cells build paths with '{}csvs/'.format(root_dir), so the root must end
# with a path separator; joining with '' appends one only when it is missing.
root_dir = os.path.join(root_dir, '')
img_dir = os.path.join(root_dir, 'JPEGImages/')
ann_dir = os.path.join(root_dir, 'Annotations/')
set_dir = os.path.join(root_dir, 'ImageSets', 'Main/')

## Exploratory Code

In [5]:
# Distinct name prefixes (the text before the first '_', with '.txt' removed)
# of the entries found in the image-set folder, in sorted order.
all_files = os.listdir(set_dir)
image_sets = sorted({
    entry.replace('.txt', '').strip().split('_')[0]
    for entry in all_files
})
print(image_sets)

['aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car', 'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse', 'motorbike', 'person', 'pottedplant', 'sheep', 'sofa', 'train', 'trainval', 'tvmonitor', 'val']


In [7]:
# NOTE(review): presumably meant to count the object classes by discounting the
# split names. 'train' is both an object class and a split name, so it appears
# only once in image_sets -- confirm that 3 (rather than 2) is the right offset.
len(image_sets)-3

19

In [4]:
# category name is one of the names listed above; dataset is one of
# "train", "val" or "trainval"
def imgs_from_category(cat_name, dataset):
    """Read the image-set listing file for one category and one split.

    The file read is `<set_dir><cat_name>_<dataset>.txt`, parsed as
    whitespace-separated columns without a header row.

    Args:
        cat_name (str): category prefix of the listing file, e.g. 'cat'.
        dataset (str): split suffix of the listing file, e.g. 'trainval'.

    Returns:
        pandas.DataFrame: one row per line of the file with the columns
        'filename' and 'true' (the flag stored next to each image name).
    """
    filename = os.path.join(set_dir, cat_name + "_" + dataset + ".txt")
    # sep=r'\s+' is the documented equivalent of delim_whitespace=True, which
    # newer pandas releases deprecate.
    return pd.read_csv(
        filename,
        sep=r'\s+',
        header=None,
        names=['filename', 'true'])

def imgs_from_category_as_list(cat_name, dataset):
    """Return the image names whose flag equals 1 for a category and split.

    Args:
        cat_name (str): category prefix of the listing file.
        dataset (str): split suffix of the listing file.

    Returns:
        numpy.ndarray: the 'filename' values of the rows whose 'true' column is 1.
    """
    listing = imgs_from_category(cat_name, dataset)
    is_positive = listing['true'] == 1
    return listing.loc[is_positive, 'filename'].values

def annotation_file_from_img(img_name):
    """Return the path of the XML annotation file for `img_name`.

    The path is `img_name` joined onto the annotation folder with '.xml'
    appended.
    """
    xml_name = img_name + '.xml'
    return os.path.join(ann_dir, xml_name)

In [5]:
# annotation operations
def load_annotation(img_filename):
    """Parse the annotation file of one image.

    Leading and trailing tab characters are stripped from every line before
    the text is handed to BeautifulSoup's "html.parser".

    Args:
        img_filename (str): image name as accepted by annotation_file_from_img.

    Returns:
        BeautifulSoup: the parsed annotation document.
    """
    with open(annotation_file_from_img(img_filename)) as xml_file:
        stripped_lines = [raw_line.strip('\t') for raw_line in xml_file]
    return BeautifulSoup(''.join(stripped_lines), "html.parser")

def get_all_obj_and_box(objname, img_set):
    """Load the annotation of every image listed for `objname` in `img_set`.

    NOTE(review): this looks unfinished -- each parsed annotation is bound to
    a local name and then discarded, and the function has no return statement
    (callers get None). Confirm the intended result before relying on it.
    """
    img_list = imgs_from_category_as_list(objname, img_set)
    
    for img in img_list:
        annotation = load_annotation(img)
        

In [6]:
from PIL import Image

# image operations
def load_img(img_filename):
    """Open an image from the image folder with PIL.

    `img_filename` is joined onto `img_dir` unchanged; no file extension is
    appended here.

    Returns:
        The object returned by `Image.open` for that path.
    """
    img_path = os.path.join(img_dir, img_filename)
    return Image.open(img_path)

In [7]:
def load_data(category, dataset):
    '''
    Build the per-image label table for one category, or reload it from its
    CSV cache when that file already exists.

    Args:
    category (str): name of category to load
    dataset  (str): name of the image set, e.g. 'train' or 'val'

    Returns: dataframe with one row per image listed for `category` and the
    columns 'fname', 'cat', 'dog', 'horse', 'person'. Each class column is 1
    when some <object> of the annotation has that class as its first <name>,
    otherwise 0. The table is cached as <root_dir>csvs/<dataset>_<category>.csv.
    '''
    csv_dir = '{}csvs/'.format(root_dir)
    if not os.path.exists(csv_dir):
        os.makedirs(csv_dir)
    csv_path = '{}{}_{}.csv'.format(csv_dir, dataset, category)

    # A table written by an earlier run is reused as-is.
    if os.path.isfile(csv_path):
        return pd.read_csv(csv_path)

    tracked = ['cat', 'dog', 'horse', 'person']
    rows = []
    for img_name in imgs_from_category_as_list(category, dataset):
        anno = load_annotation(img_name)
        objs = anno.findAll('object')
        fname = anno.findChild('filename').contents[0]

        flags = [0] * len(tracked)
        for obj in objs:
            # Only the first <name> inside the object is compared.
            label = obj.findChildren('name')[0].contents[0]
            for pos, cls in enumerate(tracked):
                if label == cls:
                    flags[pos] = 1
                    break

        rows.append([fname] + flags)

    table = pd.DataFrame(rows, columns=['fname'] + tracked)
    table.to_csv(csv_path, index=False)
    return table

## Split Train-Val-Test data (save npy array)

In [None]:
# 07/04/2018: Split (x,y) data and then save them in npy array
#=======================================

# Target proportions: train-val-test = 85-10-5
val_to_trainSplit = 0.10/0.85
testSplit = 0.05
# The test share is carved out first, so the val share must be expressed
# relative to what is left (val + train): r/(1+r) with r = val/train, i.e.
# 0.10/0.95. Applying the raw val/train ratio to the remainder would oversize
# val (about 11.2% of the data instead of 10%).
valOfRestSplit = val_to_trainSplit/(1. + val_to_trainSplit)

# Output edge length in pixels (cast to int where it is used)
imgSize = 128.
data_dir = '{}data/'.format(root_dir)
# np.save does not create missing folders
if not os.path.exists(data_dir):
    os.makedirs(data_dir)

categories = ['cat', 'dog', 'horse', 'person']
dataset = 'trainval'

trainFiles= []
valFiles  = []
testFiles = []
yTrain    = []
yVal      = []
yTest     = []
xTrain    = []
xVal      = []
xTest     = []

# File names already placed in a split. An image showing several of the
# categories is listed in each of their tables; without this filter it would be
# added once per category and could land in different splits (e.g. train via
# 'cat' and test via 'dog'), leaking test data into training.
assignedFiles = set()

for categ in categories:
    df = load_data(categ, dataset)
    df = df[~df['fname'].isin(assignedFiles)]
    assignedFiles.update(df['fname'].tolist())

    # Sum the label columns only: 'fname' holds strings, and summing it
    # together with the flags fails on recent pandas versions.
    labels = df.drop(labels='fname', axis=1)
    nClasses = labels.sum(axis=1).values

    # Stratify by the number of tracked classes in the image (3 or more,
    # exactly 2, exactly 1) and split every stratum with the same proportions.
    for inStratum in (nClasses >= 3, nClasses == 2, nClasses == 1):
        indices = np.where(inStratum)[0]
        # round() guards ceil() against float noise such as 2.0000000000000004
        nTest = int(np.ceil(np.round(testSplit*len(indices), 9)))
        nLeft = int(len(indices) - nTest)
        nVal  = int(np.ceil(np.round(valOfRestSplit*nLeft, 9)))

        # test first, then val, and whatever remains is train
        parts = [indices[:nTest], indices[nTest:(nTest+nVal)], indices[(nTest+nVal):]]
        for part, files, ys in zip(parts,
                                   [testFiles, valFiles, trainFiles],
                                   [yTest, yVal, yTrain]):
            files.extend(df.iloc[part]['fname'].tolist())
            # .values replaces as_matrix(), which newer pandas no longer has
            ys.extend(list(labels.iloc[part].values))

total = len(testFiles) + len(valFiles) + len(trainFiles)
print(':: total={}; test={}, val={}, train={}'.format(total,len(testFiles),len(valFiles),len(trainFiles)))

# Process img to npy
for xFiles, x in zip([testFiles, valFiles, trainFiles], [xTest, xVal, xTrain]):
    for imgName in xFiles:
        # Hand imread's array to imresize unchanged. imresize converts its
        # input through bytescale, which leaves uint8 data untouched but
        # stretches any other dtype to the full 0-255 range per image, so a
        # float32 cast at this point would alter each image's contrast.
        # (assumes imread yields uint8 for these JPEGs -- TODO confirm)
        img_np = imread(img_dir+imgName)
        x.append( imresize(img_np,(int(imgSize),int(imgSize)), interp='bilinear') )


#convert from list of arrays back to np.array
yTest  = np.array(yTest, dtype=np.float32)
yVal   = np.array(yVal, dtype=np.float32)
yTrain = np.array(yTrain, dtype=np.float32)

# pixel values scaled from 0-255 to 0-1
xTest  = np.array(xTest, dtype=np.float32) /255.
xVal   = np.array(xVal, dtype=np.float32)  /255.
xTrain = np.array(xTrain, dtype=np.float32)/255.

np.save('{}xTrain.npy'.format(data_dir), xTrain)
np.save('{}xVal.npy'.format(data_dir), xVal)
np.save('{}xTest.npy'.format(data_dir), xTest)
np.save('{}yTrain.npy'.format(data_dir), yTrain)
np.save('{}yVal.npy'.format(data_dir), yVal)
np.save('{}yTest.npy'.format(data_dir), yTest)

In [12]:
#Save associated filenames
import pickle

# Each split's list of file names is pickled into its own file (the content is
# binary pickle data even though the names end in .txt).
for outPath, fileNames in [("../CS6208/data/xTestFileNames.txt", testFiles),
                           ("../CS6208/data/xValFileNames.txt", valFiles),
                           ("../CS6208/data/xTrainFileNames.txt", trainFiles)]:
    with open(outPath, "wb") as fp:
        pickle.dump(fileNames, fp)

## Preprocess images to 128x128 and save into folders

In [None]:
# 07/04/2018: Split data and then place into folders
#=======================================
# Output layout: <root_dir>data/{train, val, test}/{cat,dog,horse,person}
# (folders are created on demand below)
#
# NOTE(review): every category is split independently, so an image that shows
# several of the categories can be written to different splits under different
# category folders (e.g. train/cat and test/dog) -- confirm this is acceptable.

# Target proportions: train-val-test = 85-10-5
val_to_trainSplit = 0.10/0.85
testSplit = 0.05
# The test share is carved out first, so the val share must be expressed
# relative to what is left (val + train): r/(1+r) with r = val/train, i.e.
# 0.10/0.95. Applying the raw val/train ratio to the remainder would oversize
# val (about 11.2% of the data instead of 10%).
valOfRestSplit = val_to_trainSplit/(1. + val_to_trainSplit)

# Output edge length in pixels (cast to int where it is used)
imgSize = 128.
train_dir = '{}data/train/'.format(root_dir)
val_dir = '{}data/val/'.format(root_dir)
test_dir = '{}data/test/'.format(root_dir)

categories = ['cat', 'dog', 'horse', 'person']
dataset = 'trainval'

for categ in categories:
    df = load_data(categ, dataset)

    trainFiles =[]
    valFiles   =[]
    testFiles  =[]

    # Sum the label columns only: 'fname' holds strings, and summing it
    # together with the flags fails on recent pandas versions.
    nClasses = df.drop(labels='fname', axis=1).sum(axis=1).values

    # Stratify by the number of tracked classes in the image (3 or more,
    # exactly 2, exactly 1) and split every stratum with the same proportions.
    for inStratum in (nClasses >= 3, nClasses == 2, nClasses == 1):
        indices = np.where(inStratum)[0]
        # round() guards ceil() against float noise such as 2.0000000000000004
        nTest = int(np.ceil(np.round(testSplit*len(indices), 9)))
        nLeft = int(len(indices) - nTest)
        nVal  = int(np.ceil(np.round(valOfRestSplit*nLeft, 9)))

        # test first, then val, and whatever remains is train
        testFiles += df.iloc[indices[:nTest]]['fname'].tolist()
        valFiles  += df.iloc[indices[nTest:(nTest+nVal)]]['fname'].tolist()
        trainFiles+= df.iloc[indices[(nTest+nVal):]]['fname'].tolist()

    total = len(testFiles) + len(valFiles) + len(trainFiles)
    print('{}:: total={}; test={}, val={}, train={}'.format(categ, total,len(testFiles),len(valFiles),len(trainFiles)))

    #################################################
    # Save images into folders per categ, per dataset
    #################################################
    for xFiles,dirName in zip([testFiles, valFiles, trainFiles], [test_dir, val_dir, train_dir]):
        outDir = '{}{}/'.format(dirName, categ)
        # create the destination instead of requiring it to exist beforehand
        if not os.path.exists(outDir):
            os.makedirs(outDir)
        for imgName in xFiles:
            img_np = imread(img_dir+imgName)
            img_np = imresize(img_np,(int(imgSize),int(imgSize)), interp='bilinear')
            imsave('{}{}'.format(outDir, imgName), img_np)

In [None]:
# Print the train share and the size of each split for a manual check
# (nothing is asserted). The lists are whatever the previous cell left behind.
total = sum(len(fileList) for fileList in (testFiles, valFiles, trainFiles))
print(len(trainFiles)/float(total))
print('total (incl. overlap)={};\ntest={},val={},train={}'.format(
    total,
    len(testFiles),
    len(valFiles),
    len(trainFiles)))
del total

In [None]:
# Per category (and image set): how many images show 3 or more, exactly 2, or
# exactly 1 of the tracked classes.
categories = ['cat', 'dog', 'horse', 'person']
datasets = ['trainval']#,'train','val']

summary = []
for dataset in datasets:
    for categ in categories:
        df = load_data(categ,dataset)

        # Sum the label columns only: 'fname' holds strings, and summing it
        # together with the flags fails on recent pandas versions.
        nClasses = df.drop(labels='fname', axis=1).sum(axis=1)
        clsMore3 = (nClasses >= 3).sum()
        cls2 = (nClasses == 2).sum()
        cls1 = (nClasses == 1).sum()

        # Keep the row's category and image set with its counts, so the table
        # stays valid when `datasets` lists more than one image set.
        summary.append([clsMore3, cls2, cls1, categ, dataset])
df_summary = pd.DataFrame(summary, columns=['clsMore3', 'cls2', 'cls1', 'categ', 'dataset'])
# Last expression of the cell, so the notebook renders the table
df_summary