In [1]:
import pandas as pd
import pickle
from tqdm import tqdm_notebook as tqdm
from collections import defaultdict
import os
import json
import copy

In [2]:
def make_save_dir(save_dir):
    if save_dir != None:
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
            
def make_subset(annotations, cls_div_ids, dict_class_with_subset=None):
    dict_annos = defaultdict(list)
    bkgrd_div = []
    if dict_class_with_subset != None:
        bkgrd_div = get_div_background_list(dict_class_with_subset,cls_div_ids)
#     print(bkgrd_div)
    for id, anno_details in tqdm(annotations.items()):
        subset_annos = []
        hasBackground = False
        hasLabels = False
        for anno in anno_details:
#             print(anno["LabelName"])
            if anno["LabelName"] in cls_div_ids:
                subset_annos.append(anno)
                hasLabels = True
            
            if anno["LabelName"] in bkgrd_div:
#                 print("hasBackground")
                hasBackground = True
                
        # TODO: REMOVE IF BACKGROUND LABELS NEEDS TO BE REMOVED
        if hasLabels==False and hasBackground:
            for anno in anno_details:
                if anno["LabelName"] in bkgrd_div:
                    temp = copy.deepcopy(anno) #.copy()
                    temp["LabelName"]="background"
                    subset_annos.append(temp)
                    
#         if len(subset_annos)>0 or hasBackground:
        if len(subset_annos)>0:
            dict_annos[id] = subset_annos

    idx_to_ids = {i:key for i, (key, _) in enumerate(dict_annos.items())}
    return dict_annos, idx_to_ids

def make_img_id_subset(annotations, cls_div_ids, valid_img_ids):
    dict_annos = defaultdict(list)
    
    for id in valid_img_ids:
        try:
            anno_details = annotations[id]
        except:
            print(id)
        subset_annos = []
        for anno in anno_details:
            if anno["LabelName"] in cls_div_ids:
                subset_annos.append(anno)
        if len(subset_annos)>0:
            dict_annos[id] = subset_annos

    idx_to_ids = {i:key for i, (key, _) in enumerate(dict_annos.items())}
    return dict_annos, idx_to_ids

def get_class_levels(hier):
    level1=[]
    level2=[]
    level3=[]
    level4=[]
    level5=[]

    for l2 in hier['Subcategory']:
        level5.append(l2['LabelName'])
        try:
            for j in l2['Subcategory']:
                level4.append(j['LabelName'])
                try:
                    for k in j['Subcategory']:
                        level3.append(k['LabelName'])
                        try:
                            for a in k['Subcategory']:
                                level2.append(a['LabelName'])
                                try:
                                    for b in a['Subcategory']:
                                        level1.append(b['LabelName'])
                                except:
                                    pass
                        except:
                            pass
                except:
                    pass
        except:
            pass
    level1 = list(set(level1))
    level2 = list(set(level2))
    level3 = list(set(level3))
    level4 = list(set(level4))       
    level5 = list(set(level5))   
    return [level1,level2,level3,level4,level5]

def get_required_human_list(human_labels_list, cls_to_names):
    important_human_label_list = []
    for hl in human_labels_list:
        if hl in cls_to_names:
            important_human_label_list.append(hl)
    return important_human_label_list

In [3]:
def get_subsets(d):
    res = []
    def myprint(d):
        for k, v in d.items():
            if isinstance(v, dict) and v != {}:
                myprint(v)
            else:
#                 print("{0} : {1}".format(k, v))
                res.append(k)
    myprint(d)
    return res

def get_dict_class_levels(hier):
    dict_levels = {}

    for l2 in hier['Subcategory']:
        l2_name = l2['LabelName']
        dict_levels[l2_name] = {}
        try:
            for j in l2['Subcategory']:
                l2j_name = j['LabelName']
                dict_levels[l2_name][l2j_name] = {}
                try:
                    for k in j['Subcategory']:
                        l2k_name = k['LabelName']
                        dict_levels[l2_name][l2j_name][l2k_name] = {}
                        try:
                            for a in k['Subcategory']:
                                l2a_name = a['LabelName']
                                dict_levels[l2_name][l2j_name][l2k_name][l2a_name] = {}
                                try:
                                    for b in a['Subcategory']:
                                        l2b_name = b['LabelName']
                                        dict_levels[l2_name][l2j_name][l2k_name][l2a_name][l2b_name] = {}
                                except:
                                    pass
                        except:
                            pass
                except:
                    pass
        except:
            pass
    return dict_levels

def get_valid_background_for_cls(dict_class_with_subset, current_cls):
    valid_background = []
    for key, value in dict_class_with_subset.items():
        if current_cls not in value:
            valid_background.append(key)
    return valid_background

def get_div_background_list(dict_class_with_subset,current_clss):
    valid_list = []
    for current_cls in current_clss:
        valid_list.append(get_valid_background_for_cls(dict_class_with_subset, current_cls))
    sets = []
    for val in valid_list:
        sets = list(set(sets) ^ set(val))
    sets = [x for x in sets if x not in current_clss]
    return sets

In [4]:
hierarchy_dir = '../dataset/challenge-2019-label500-hierarchy.json'
clsids_to_names_dir = "../data_info/all/clsids_to_names.json"
train_anno_dir = "../data_info/all/train/annotations/train-anno.json"
valid_anno_dir = "../data_info/all/valid/annotations/valid-anno.json"
human_labels_dir = "../dataset/class-ids-human-body-parts-and-mammal.txt"
save_dir = "../data_info/subsets"

In [5]:
clsids_to_names = json.load(open(clsids_to_names_dir,'r'))
hier = json.load(open(hierarchy_dir,'r'))
train_anno = json.load(open(train_anno_dir,'r'))
valid_anno = json.load(open(valid_anno_dir,'r'))
human_labels_list = list(pd.read_csv(human_labels_dir, header=None)[0])

In [6]:
human_labels_list = get_required_human_list(human_labels_list, clsids_to_names)
list_of_divisions = get_class_levels(hier)

In [7]:
# remove human labels from other class divisions
filter_list_of_dvisions = []
for divisions in list_of_divisions:
    temp = [x for x in divisions if x not in human_labels_list]
    filter_list_of_dvisions.append(temp)

In [8]:
all_divisions = filter_list_of_dvisions+ [human_labels_list]

In [9]:
for div in all_divisions:
    print(len(div))

3
20
78
228
166
11


In [10]:
def create_multiple_subsets(anno, all_divisions, save_dir, typ, clsids_to_names, dict_class_with_subset=None):
    for i, div in enumerate(all_divisions):
        dict_annos, idx_to_ids = make_subset(anno, div,dict_class_with_subset)
        print(i,":",len(dict_annos), len(idx_to_ids))
        curr_save_dir = save_dir+"/"+ str(i)
        make_save_dir(curr_save_dir+"/annotations")
        with open(curr_save_dir +"/annotations"+"/"+typ+"-anno.json", 'w') as fp:
            json.dump(dict(dict_annos), fp)

        with open(curr_save_dir+"/annotations"+"/"+typ+"-idx_to_id.json", 'w') as handle:
            json.dump(idx_to_ids, handle)
#             pickle.dump(idx_to_ids, handle, protocol=pickle.HIGHEST_PROTOCOL)
            
        div_cls_to_names = {lb:clsids_to_names[lb] for lb in div}
        div_clsids_to_idx = {lb:i+1 for i, lb in enumerate(div)}
        div_clsids_to_idx["background"] = 0
        div_cls_to_names["background"] = "background"
        
        with open(curr_save_dir+"/clsids_to_names.json", 'w') as handle:
            json.dump(div_cls_to_names, handle)
#             pickle.dump(div_cls_to_names, handle, protocol=pickle.HIGHEST_PROTOCOL)

        with open(curr_save_dir+"/clsids_to_idx.json", 'w') as handle:
            json.dump(div_clsids_to_idx, handle)
#                 pickle.dump(div_clsids_to_idx, handle, protocol=pickle.HIGHEST_PROTOCOL)
                
        f = open(curr_save_dir+"/classes.txt", "w")
        f.write(str(len(div_cls_to_names)))
        f.close()

In [11]:
# create_multiple_subsets(train_anno, all_divisions, save_dir, "train", clsids_to_names)

In [12]:
# create_multiple_subsets(valid_anno, all_divisions, save_dir, "valid", clsids_to_names)

### Creating Background instances

In [13]:
dict_levels = get_dict_class_levels(hier)
dict_class_with_subset = defaultdict(list)
for key1, val1 in dict_levels.items():
#     print(key1, val1)
    dict_class_with_subset[key1] = get_subsets(val1)+[key1]
    for key2, val2 in val1.items():
        dict_class_with_subset[key2] = get_subsets(val2)+[key2]
#         print("-",key2, val2)
        for key3, val3 in val2.items():
            dict_class_with_subset[key3] = get_subsets(val3)+[key3]
#             print("--",key3, val3)
            for key4, val4 in val3.items():
                dict_class_with_subset[key4] = get_subsets(val4)+[key4]
#                 print("---",key4, val4)
                for key5, val5 in val4.items():
                    dict_class_with_subset[key5] = get_subsets(val5)+[key5]
#                     print("----",key5, val5)
dict_class_with_subset = dict(dict_class_with_subset)

In [14]:
for hl in human_labels_list:
    del dict_class_with_subset[hl]

In [15]:
len(dict(dict_class_with_subset))

489

In [16]:
dict_class_with_subset

{'/m/0242l': ['/m/0242l'],
 '/m/03120': ['/m/03120'],
 '/m/0h8l4fh': ['/m/0h8l4fh'],
 '/m/0138tl': ['/m/0167gd',
  '/m/01j51',
  '/m/029b3',
  '/m/02zt3',
  '/m/0kmg4',
  '/m/0138tl'],
 '/m/0167gd': ['/m/0167gd'],
 '/m/01j51': ['/m/01j51'],
 '/m/029b3': ['/m/029b3'],
 '/m/02zt3': ['/m/02zt3'],
 '/m/0kmg4': ['/m/0kmg4'],
 '/m/019dx1': ['/m/0174k2',
  '/m/01k6s3',
  '/m/029bxz',
  '/m/02pjr4',
  '/m/02wv84t',
  '/m/02x984l',
  '/m/03s_tn',
  '/m/040b_t',
  '/m/04169hn',
  '/m/063rgb',
  '/m/07xyvk',
  '/m/0fx9l',
  '/m/0llzx',
  '/m/03ldnb',
  '/m/019dx1'],
 '/m/0174k2': ['/m/0174k2'],
 '/m/01k6s3': ['/m/01k6s3'],
 '/m/029bxz': ['/m/029bxz'],
 '/m/02pjr4': ['/m/02pjr4'],
 '/m/02wv84t': ['/m/02wv84t'],
 '/m/02x984l': ['/m/02x984l'],
 '/m/03s_tn': ['/m/03s_tn'],
 '/m/040b_t': ['/m/040b_t'],
 '/m/04169hn': ['/m/04169hn'],
 '/m/063rgb': ['/m/063rgb'],
 '/m/07xyvk': ['/m/07xyvk'],
 '/m/0fx9l': ['/m/0fx9l'],
 '/m/0llzx': ['/m/0llzx'],
 '/m/03ldnb': ['/m/03ldnb'],
 '/m/02pkr5': ['/m/0130jx',
  

In [17]:
# current_clss = all_divisions[1]
# # bkgrd_div = get_div_background_list(dict_class_with_subset,current_clss)

# bkgrd_div = get_div_background_list(dict_class_with_subset,current_clss)
# bkgrd_div

In [18]:
create_multiple_subsets(train_anno, all_divisions, save_dir, "train", clsids_to_names, dict_class_with_subset)

HBox(children=(IntProgress(value=0, max=1674979), HTML(value='')))


0 : 1659071 1659071


HBox(children=(IntProgress(value=0, max=1674979), HTML(value='')))


1 : 408065 408065


HBox(children=(IntProgress(value=0, max=1674979), HTML(value='')))


2 : 447372 447372


HBox(children=(IntProgress(value=0, max=1674979), HTML(value='')))


3 : 1247608 1247608


HBox(children=(IntProgress(value=0, max=1674979), HTML(value='')))


4 : 1670396 1670396


HBox(children=(IntProgress(value=0, max=1674979), HTML(value='')))


5 : 1674979 1674979


In [19]:
create_multiple_subsets(valid_anno, all_divisions, save_dir, "valid", clsids_to_names)

HBox(children=(IntProgress(value=0, max=34917), HTML(value='')))


0 : 110 110


HBox(children=(IntProgress(value=0, max=34917), HTML(value='')))


1 : 891 891


HBox(children=(IntProgress(value=0, max=34917), HTML(value='')))


2 : 11772 11772


HBox(children=(IntProgress(value=0, max=34917), HTML(value='')))


3 : 25545 25545


HBox(children=(IntProgress(value=0, max=34917), HTML(value='')))


4 : 34795 34795


HBox(children=(IntProgress(value=0, max=34917), HTML(value='')))


5 : 7449 7449


## Other Scripts

### run in console

```cmd
python scripts/02_data_analysis.py --anno-json-dir data_info/all/train/annotations/train-anno.json --idx-to-id-dir data_info/all/train/annotations/train-idx_to_id.json --clsids-to-idx-dir data_info/all/clsids_to_idx.json --save-dir data_info/all --clsids-to-names-dir data_info/all/clsids_to_names.json

```

```
python scripts/02_data_analysis.py --anno-json-dir data_info/subsets/0/annotations/train-anno.json --idx-to-id-dir data_info/subsets/0/annotations/train-idx_to_id.json --clsids-to-idx-dir data_info/subsets/0/clsids_to_idx.json --save-dir data_info/subsets/0 --clsids-to-names-dir data_info/subsets/0/clsids_to_names.json

python scripts/02_data_analysis.py --anno-json-dir data_info/subsets/1/annotations/train-anno.json --idx-to-id-dir data_info/subsets/1/annotations/train-idx_to_id.json --clsids-to-idx-dir data_info/subsets/1/clsids_to_idx.json --save-dir data_info/subsets/1 --clsids-to-names-dir data_info/subsets/1/clsids_to_names.json

python scripts/02_data_analysis.py --anno-json-dir data_info/subsets/2/annotations/train-anno.json --idx-to-id-dir data_info/subsets/2/annotations/train-idx_to_id.json --clsids-to-idx-dir data_info/subsets/2/clsids_to_idx.json --save-dir data_info/subsets/2 --clsids-to-names-dir data_info/subsets/2/clsids_to_names.json

python scripts/02_data_analysis.py --anno-json-dir data_info/subsets/3/annotations/train-anno.json --idx-to-id-dir data_info/subsets/3/annotations/train-idx_to_id.json --clsids-to-idx-dir data_info/subsets/3/clsids_to_idx.json --save-dir data_info/subsets/3 --clsids-to-names-dir data_info/subsets/3/clsids_to_names.json

python scripts/02_data_analysis.py --anno-json-dir data_info/subsets/4/annotations/train-anno.json --idx-to-id-dir data_info/subsets/4/annotations/train-idx_to_id.json --clsids-to-idx-dir data_info/subsets/4/clsids_to_idx.json --save-dir data_info/subsets/4 --clsids-to-names-dir data_info/subsets/4/clsids_to_names.json

python scripts/02_data_analysis.py --anno-json-dir data_info/subsets/5/annotations/train-anno.json --idx-to-id-dir data_info/subsets/5/annotations/train-idx_to_id.json --clsids-to-idx-dir data_info/subsets/5/clsids_to_idx.json --save-dir data_info/subsets/5 --clsids-to-names-dir data_info/subsets/5/clsids_to_names.json
```