This script is used to extract small image crops from the actual data.

In [215]:
import os
import PIL
import numpy as np
import cv2
import pickle
from collections import namedtuple
from torchvision import transforms
from tqdm import tqdm as tqdm
import time

`IndexEntry` objects store all the information about a particular bounding box; `IndexEntryCrop` objects additionally store the path of the crop extracted for that bounding box.

In [213]:
# Record describing one bounding box inside a source image.
# NOTE: the `verbose` keyword was removed from namedtuple() in Python 3.7 and
# passing it raises TypeError there; omitting it yields the same class.
IndexEntry = namedtuple(
    'IndexEntry',
    ['img_path', 'sub_idx', 'classname', 'left', 'top', 'right', 'bottom'],
)

# Same fields as IndexEntry plus `crop_path_1p0`, the path of the crop image
# written to disk for that bounding box.
IndexEntryCrop = namedtuple(
    'IndexEntryCrop',
    ['img_path', 'sub_idx', 'crop_path_1p0', 'classname', 'left', 'top', 'right', 'bottom'],
)

The following pickle file contains a list of `IndexEntry` objects holding the full information for the whole dataset.

In [211]:
# Location of the pickled dataset index.
pickle_file = '/raid/user-data/lscheucher/projects/bounding_box_classifier/full_object_index.pickle'

# NOTE(review): pickle.load executes arbitrary code from the file; only use it
# on index files you produced yourself.
with open(pickle_file, mode='rb') as index_file:
    idxs = pickle.load(index_file)

Define the directory the cropped images should be written to.

In [57]:
# Root output directory: parallelfun writes every crop to
# DATADIR/<classname>/<file name>.png.
DATADIR = '/raid/user-data/lscheucher/tmp/pytorch_classifier_data'

In [48]:
# torchvision pipeline: resize to 224x224, then convert to a tensor.
# NOTE(review): not referenced by any other cell visible in this notebook.
transformer = transforms.Compose(
    [transforms.Resize((224, 224)), transforms.ToTensor()]
)

The following function is used to process the huge list of `IndexEntry` objects in parallel.
The index objects are stored in a big list named `idxs`.
To work in parallel, the function takes a tuple `indices` as its argument, specifying which part of the full list one instance of the function should work on.

In [197]:
def parallelfun(indices):
    """Crop, letterbox and save the bounding boxes of one slice of ``idxs``.

    For every entry in ``idxs[indices[0]:indices[1]]`` the bounding box is cut
    out of the source image. Crops that fit inside 224x224 are kept at their
    original size; larger crops are scaled down so that their longer side is
    224 pixels. The crop is then centred on a black 224x224 canvas and written
    as a PNG to ``DATADIR/<classname>/``.

    Parameters
    ----------
    indices : sequence of two ints
        ``(start, stop)`` slice bounds into the module-level list ``idxs``.

    Returns
    -------
    list of IndexEntryCrop
        One record per processed entry, carrying the original fields plus the
        path of the written crop.

    Raises
    ------
    IOError
        If a source image cannot be read by OpenCV.
    """
    side = 224  # edge length of the square output canvas

    # A slice is already a new list and the entries are only read, so no deep
    # copy is needed (the previous copy.deepcopy call also relied on a module
    # that the notebook never imports).
    sublist = idxs[indices[0]:indices[1]]
    idxs_crop = []

    for count, entry in enumerate(sublist, start=1):
        img = cv2.imread(entry.img_path)
        if img is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise IOError('could not read image: {}'.format(entry.img_path))
        img = img[entry.top:entry.bottom, entry.left:entry.right, :]

        H, W, _ = img.shape
        if H > side or W > side:
            # Shrink so that the longer side becomes `side`; the shorter side
            # is clamped to one pixel because cv2.resize rejects a zero size.
            if H > W:
                new_size = (max(1, int(W * (side / H))), side)  # (width, height)
            else:
                new_size = (side, max(1, int(H * (side / W))))
            img = cv2.resize(img, new_size)
            H, W, _ = img.shape

        # Centre the crop on a black canvas.
        img_final = np.zeros(shape=(side, side, 3), dtype='uint8')
        dh = (side - H) >> 1
        dw = (side - W) >> 1
        img_final[dh:dh + H, dw:dw + W, :] = img

        # exist_ok avoids the check-then-create race between worker processes
        # and also creates DATADIR itself when it is missing.
        class_dir = os.path.join(DATADIR, entry.classname)
        os.makedirs(class_dir, exist_ok=True)

        # File name: image base name (up to the first dot) followed by sub_idx.
        crop_name = entry.img_path.split('/')[-1].split('.')[0] + str(entry.sub_idx) + '.png'
        crop_path_1p0 = os.path.join(class_dir, crop_name)
        cv2.imwrite(crop_path_1p0, img_final)

        idxs_crop.append(IndexEntryCrop(img_path=entry.img_path,
                                        sub_idx=entry.sub_idx,
                                        crop_path_1p0=crop_path_1p0,
                                        classname=entry.classname,
                                        left=entry.left,
                                        top=entry.top,
                                        right=entry.right,
                                        bottom=entry.bottom))
        # Coarse progress indicator, printed once per 1000 processed entries.
        if count % 1000 == 0:
            print(count)
    return idxs_crop

Create the `indices` tuples for a number `N` of parallel processes.
Store the result in `arg_instances`.

In [216]:
# Number of worker processes, i.e. number of contiguous chunks `idxs` is split into.
N = 40

# Chunk length; the remainder of the division is absorbed by the final chunk.
step = len(idxs) // N
arg_instances = []

for i in range(N - 1):
    print(i, i * step, (i + 1) * step)
    arg_instances.append([i * step, (i + 1) * step])

# The last chunk runs to the end of the list so that no entry is dropped.
# (This line previously printed SPLITCOUNT-1, a name that is never defined in
# the notebook; N is the value it stood for.)
print(N - 1, (N - 1) * step, len(idxs))
arg_instances.append([(N - 1) * step, len(idxs)])

0 0 12119
1 12119 24238
2 24238 36357
3 36357 48476
4 48476 60595
5 60595 72714
6 72714 84833
7 84833 96952
8 96952 109071
9 109071 121190
10 121190 133309
11 133309 145428
12 145428 157547
13 157547 169666
14 169666 181785
15 181785 193904
16 193904 206023
17 206023 218142
18 218142 230261
19 230261 242380
20 242380 254499
21 254499 266618
22 266618 278737
23 278737 290856
24 290856 302975
25 302975 315094
26 315094 327213
27 327213 339332
28 339332 351451
29 351451 363570
30 363570 375689
31 375689 387808
32 387808 399927
33 399927 412046
34 412046 424165
35 424165 436284
36 436284 448403
37 448403 460522
38 460522 472641
39 472641 484779


Create `N` independent processes working on `idxs` in parallel

In [201]:
from joblib import Parallel, delayed

# Run parallelfun once per (start, stop) pair, one worker process per pair.
# n_jobs is taken from the argument list itself; the previously used name
# SPLITCOUNT is never defined in this notebook.
results = Parallel(n_jobs=len(arg_instances), backend="multiprocessing")(
    delayed(parallelfun)(bounds) for bounds in arg_instances)

# arg_instances is the list of arguments for which parallelfun is computed in parallel.
# The main restriction is that parallelfun must be a top-level function.
# The backend parameter can be either "threading" or "multiprocessing".

1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
2000
2000
2000
2000
2000
2000
2000
2000
2000
2000
2000
2000
2000
2000
2000
2000
2000
2000
2000
2000
2000
2000
2000
2000
2000
2000
2000
2000
2000
2000
2000
2000
2000
2000
2000
2000
2000
2000
2000
2000
3000
3000
3000
3000
3000
3000
3000
3000
3000
3000
3000
3000
3000
3000
3000
3000
3000
3000
3000
3000
3000
3000
3000
3000
3000
3000
3000
3000
3000
3000
3000
3000
3000
3000
3000
3000
3000
3000
3000
3000
4000
4000
4000
4000
4000
4000
4000
4000
4000
4000
4000
4000
4000
4000
4000
4000
4000
4000
4000
4000
4000
4000
4000
4000
4000
4000
4000
4000
4000
4000
4000
4000
4000
4000
4000
4000
4000
4000
4000
4000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000


In [206]:
# with open('tempsave.pickle','wb') as f:
#     pickle.dump(results,f)

In [208]:
# Flatten the per-worker lists returned by Parallel into a single list,
# keeping the worker order.
result = []
for worker_entries in results:
    result.extend(worker_entries)

# Persist the combined crop index next to the original object index.
with open('/raid/user-data/lscheucher/projects/bounding_box_classifier/full_object_index_crop.pickle', mode='wb') as out_file:
    pickle.dump(result, out_file)