In [None]:
import os
import json
import logging
import structlog
import torch
import random
import pickle
from sklearn.model_selection import train_test_split

In [2]:
#workist functions for converting data

# Logger used by load_data_from_disc to report pages that could not be converted.
logger = structlog.get_logger(__name__)


def convert_nested_to_flat_bbox(bb):
    """
    Flattens a bbox given as two corner points into a single list.

    bb : [[X1,Y1],[X2,Y2]] BBOX
    return: [X1,Y1,X2,Y2] BBOX
    """
    first_corner = bb[0]
    second_corner = bb[1]
    flat_bbox = [first_corner[0], first_corner[1]]
    flat_bbox += [second_corner[0], second_corner[1]]
    return flat_bbox

def convert_relative_to_abs_bbox(bbox, image_width, image_height):
    """
    Scales a nested bbox from relative to absolute coordinates.

    bbox : [[X1,Y1],[X2,Y2]] BBOX with relative coordinates
    image_width : factor applied to both X values
    image_height : factor applied to both Y values
    return: [[X1,Y1],[X2,Y2]] BBOX with absolute coordinates
    """
    return [
        [corner[0] * image_width, corner[1] * image_height]
        for corner in (bbox[0], bbox[1])
    ]

def convert_page_to_annotation_page(page, categories):
    """
    Builds the page-annotation dict for one raw page.

    page : dict with "file_name", "page_width", "page_height" and "areas";
           every area provides a "class" name and a relative nested "bbox"
    categories : list of class names; the position of a name is its class id
    return: dict with the page size, image path / ids and one annotation per
            area (absolute, flat bbox plus the class id)
    """
    file_name = page["file_name"]
    # the part of the file name in front of the first "_" is used as document id
    document_id = file_name.split("_")[0]
    url = f"images/{file_name}"
    page_width = page["page_width"]
    page_height = page["page_height"]

    annotations = []
    for area in page["areas"]:
        class_id = categories.index(area["class"])
        absolute_bbox = convert_relative_to_abs_bbox(area["bbox"], page_width, page_height)
        annotations.append(
            {
                "class": class_id,
                "bbox": convert_nested_to_flat_bbox(absolute_bbox),
                # NOTE(review): presumably the XYXY-absolute box mode of the training framework - confirm
                "bbox_mode": 0,
                "category_id": class_id,
            }
        )

    return {
        "width": page_width,
        "height": page_height,
        "file_name": url,
        "url": url,
        "operator_job_id": document_id,
        "image_id": url,
        "annotations": annotations,
    }

def load_data_from_disc(data_dir):
    """
    Loads every page JSON below <data_dir>/training_data_json and converts it
    to the page-annotation format.

    Files are processed in sorted name order so the returned list has the same
    order on every run (os.listdir itself gives no ordering guarantee).
    Pages that cannot be converted are logged and skipped.

    data_dir : directory that contains the "training_data_json" folder
    return: (all_annotations, categories) - the list of converted pages and
            the list of class names used to derive the class ids
    """
    categories = ["table", "footer", "header", "order_line_item_header", "order_line_item"]
    all_annotations = []
    json_data_dir = f"{data_dir}/training_data_json"
    for file in sorted(os.listdir(json_data_dir)):
        if not file.endswith(".json"):
            continue
        with open(os.path.join(json_data_dir, file)) as f:
            page = json.load(f)
        try:
            all_annotations.append(convert_page_to_annotation_page(page, categories))
        except Exception as e:
            # The conversion may have failed precisely because "file_name" is
            # missing or malformed, so it must not be dereferenced unguarded
            # here; otherwise the handler itself raises and aborts the load.
            try:
                page_id = page["file_name"].split("_")[0]
            except (KeyError, TypeError, AttributeError):
                page_id = file
            logger.error(
                f"Could not convert page (id: {page_id}) to annotation page:\n {e}"
            )

    return all_annotations, categories

In [3]:
# Root directory in which the training_data_json folder is located; the cached
# all_annotations.json / categories.json files are read from here as well.
root_dir = "training_data"

# Uncomment to rebuild the annotations from the raw page JSON files:
# all_annotations, categories=load_data_from_disc(root_dir)

In [4]:
#save all_annotations and categories loaded from disc 

# with open(root_dir + '/all_annotations.json','w') as json_file:
#     json.dump(all_annotations, json_file)
# with open(root_dir + '/categories.json','w') as json_file:
#     json.dump(categories, json_file)

In [5]:
# Read the cached annotations and class names back from root_dir.

with open(os.path.join(root_dir, 'all_annotations.json'), 'r') as file:
    all_annotations = json.load(file)
with open(os.path.join(root_dir, 'categories.json'), 'r') as file:
    categories = json.load(file)

In [7]:
# Check the loaded class names; a name's position in this list is used as its class id.
categories

['table', 'footer', 'header', 'order_line_item_header', 'order_line_item']

In [8]:
# Check how many page annotations are available to sample from.
len(all_annotations)

65210

In [9]:
#a function that takes n samples from all annotations and does the train/val/test split

def n_samples_dataset_split(all_annotations, n, val_set_size, test_set_size, seed=1):
    """
    Draws n random samples from all_annotations and splits them into
    train / validation / test sets.

    The test set is split off first; the validation set is then taken from the
    remaining samples, so val_set_size is relative to what is left after the
    test split, not to n.

    The same seed drives the sampling, the permutation and both splits, so a
    call with identical arguments returns the identical result. Previously only
    torch was seeded while the sampling and the sklearn splits were not, which
    made the returned index lists impossible to reproduce.

    all_annotations : list of page annotations to sample from
    n : number of samples to draw (random.sample raises ValueError if n is
        larger than the number of annotations)
    val_set_size : validation size, passed to train_test_split as test_size
    test_set_size : test size, passed to train_test_split as test_size
    seed : seed for sampling and splitting; None gives a different result on
           every call
    return: train_annotations, val_annotations, test_annotations and
            indices = [train_ind, val_ind, test_ind]. The indices are positions
            within the drawn n-sample subset, not within all_annotations.
    """
    # A private Random instance keeps the draw repeatable without touching the
    # state of the global `random` module.
    sampler = random.Random(seed)
    n_random_annotations = sampler.sample(all_annotations, n)

    print('Created a dataset containing {0} samples out of a larger dataset with {1} samples!'.
          format(len(n_random_annotations),len(all_annotations)))

    if seed is not None:
        torch.manual_seed(seed)
    indices = torch.randperm(len(n_random_annotations)).tolist()
    # train_test_split shuffles with its own random state, so it has to be
    # seeded explicitly; seeding torch does not affect it.
    train_ind, test_ind = train_test_split(indices, test_size=test_set_size, random_state=seed)
    train_ind, val_ind = train_test_split(train_ind, test_size=val_set_size, random_state=seed)

    train_annotations = [n_random_annotations[i] for i in train_ind]
    val_annotations = [n_random_annotations[i] for i in val_ind]
    test_annotations = [n_random_annotations[i] for i in test_ind]

    print('-----------------------')
    print('Train set size: ', len(train_annotations))
    print('Val set size: ', len(val_annotations))
    print('Test set size: ', len(test_annotations))
    print('-----------------------')

    indices = [train_ind, val_ind, test_ind]

    return train_annotations, val_annotations, test_annotations, indices

In [11]:
# Sampling configuration: n is the wanted number of samples, followed by the
# validation and test sizes handed to n_samples_dataset_split.
n = 500
val_set_size = 0.1
test_set_size = 0.1

(
    train_page_annotation,
    val_page_annotation,
    test_page_annotation,
    indices,
) = n_samples_dataset_split(all_annotations, n, val_set_size, test_set_size)

Created a dataset containing 500 samples out of a larger dataset with 65210 samples!
-----------------------
Train set size:  405
Val set size:  45
Test set size:  50
-----------------------


In [12]:
# Write the train, validation and test splits (page annotation format) as JSON
# files directly into root_dir.

for split_name, split_annotations in (
    ('train', train_page_annotation),
    ('val', val_page_annotation),
    ('test', test_page_annotation),
):
    with open(root_dir + '/{0}_page_annotation_{1}.json'.format(split_name, n), 'w') as json_file:
        json.dump(split_annotations, json_file)

In [12]:
from training.faster_rcnn.helper.coco_conv_1 import convert_to_coco

# Convert the workist page annotations of every split to COCO annotations with
# the coco_conv_1 script, which assigns the indices [1,2,3,4,5] to the classes.
# The splits are converted in the order train, val, test.

train_coco_1_annotation, val_coco_1_annotation, test_coco_1_annotation = (
    convert_to_coco(page_annotations, categories)
    for page_annotations in (train_page_annotation, val_page_annotation, test_page_annotation)
)

2023-01-09 12:05.06 [info     ] Total of 152 pages skipped as no annotations were provided.
2023-01-09 12:05.06 [info     ] Total of 13 annotations skipped as size was invalid (<5px).
2023-01-09 12:05.06 [info     ] Total of 91159 annotations converted to the COCO format.
[{'supercategory': 'orders', 'id': 1, 'name': 'table'}, {'supercategory': 'orders', 'id': 2, 'name': 'footer'}, {'supercategory': 'orders', 'id': 3, 'name': 'header'}, {'supercategory': 'orders', 'id': 4, 'name': 'order_line_item_header'}, {'supercategory': 'orders', 'id': 5, 'name': 'order_line_item'}]
2023-01-09 12:05.06 [info     ] Total of 16 pages skipped as no annotations were provided.
2023-01-09 12:05.06 [info     ] Total of 1 annotations skipped as size was invalid (<5px).
2023-01-09 12:05.06 [info     ] Total of 9257 annotations converted to the COCO format.
[{'supercategory': 'orders', 'id': 1, 'name': 'table'}, {'supercategory': 'orders', 'id': 2, 'name': 'footer'}, {'supercategory': 'orders', 'id': 3, 'na

In [13]:
from training.faster_rcnn.helper.coco_conv import convert_to_coco

# Convert the workist page annotations of every split to COCO annotations with
# the coco_conv script, which assigns the indices [0,1,2,3,4] to the classes.
# The splits are converted in the order train, val, test.

train_coco_0_annotation, val_coco_0_annotation, test_coco_0_annotation = (
    convert_to_coco(page_annotations, categories)
    for page_annotations in (train_page_annotation, val_page_annotation, test_page_annotation)
)

2023-01-09 12:05.07 [info     ] Total of 152 pages skipped as no annotations were provided.
2023-01-09 12:05.07 [info     ] Total of 13 annotations skipped as size was invalid (<5px).
2023-01-09 12:05.07 [info     ] Total of 91159 annotations converted to the COCO format.
[{'supercategory': 'orders', 'id': 0, 'name': 'table'}, {'supercategory': 'orders', 'id': 1, 'name': 'footer'}, {'supercategory': 'orders', 'id': 2, 'name': 'header'}, {'supercategory': 'orders', 'id': 3, 'name': 'order_line_item_header'}, {'supercategory': 'orders', 'id': 4, 'name': 'order_line_item'}]
2023-01-09 12:05.07 [info     ] Total of 16 pages skipped as no annotations were provided.
2023-01-09 12:05.07 [info     ] Total of 1 annotations skipped as size was invalid (<5px).
2023-01-09 12:05.07 [info     ] Total of 9257 annotations converted to the COCO format.
[{'supercategory': 'orders', 'id': 0, 'name': 'table'}, {'supercategory': 'orders', 'id': 1, 'name': 'footer'}, {'supercategory': 'orders', 'id': 2, 'na

In [14]:
#save split page annotations and coco annotations in the same folder

def save_annotations(root_dir, new_dir, n):
    """
    Saves the page, coco_0 and coco_1 annotations of every split as JSON files
    below <cwd>/<root_dir>/<new_dir>.

    One sub folder is written per annotation kind ("page_annotation",
    "coco_0_annotation", "coco_1_annotation"); each receives the files
    <split>_<kind>_<n>.json for the splits train, val and test.

    Missing folders, including root_dir itself, are created; existing folders
    are reused and existing files are overwritten.

    NOTE: the annotations are not passed in. They are read from the notebook
    globals train/val/test_page_annotation, train/val/test_coco_0_annotation
    and train/val/test_coco_1_annotation, so the split and both COCO
    conversion cells have to be run before calling this function.

    root_dir : data root, joined onto the current working directory
    new_dir : name of the folder created below root_dir
    n : number of samples, only used in the file names
    """
    target_dir = os.path.join(os.getcwd(), root_dir, new_dir)

    # Keyed by the sub folder name, which is also the middle part of the file name.
    annotation_sets = {
        "page_annotation": {
            "train": train_page_annotation,
            "val": val_page_annotation,
            "test": test_page_annotation,
        },
        "coco_0_annotation": {
            "train": train_coco_0_annotation,
            "val": val_coco_0_annotation,
            "test": test_coco_0_annotation,
        },
        "coco_1_annotation": {
            "train": train_coco_1_annotation,
            "val": val_coco_1_annotation,
            "test": test_coco_1_annotation,
        },
    }

    for kind, splits in annotation_sets.items():
        kind_dir = os.path.join(target_dir, kind)
        # makedirs also creates missing parents and does not fail when the
        # folder already exists (no separate exists() check needed).
        os.makedirs(kind_dir, exist_ok=True)
        for split, annotations in splits.items():
            out_path = os.path.join(kind_dir, f"{split}_{kind}_{n}.json")
            with open(out_path, 'w') as json_file:
                json.dump(annotations, json_file)


In [18]:
#save indices of train, validation and test set

def save_index_lists(root_dir, new_dir, indices):
    """
    Pickles the train, validation and test index lists into
    <cwd>/<root_dir>/<new_dir>/pickled_index_lists.

    The target folder and any missing parent folders are created, so this
    function no longer depends on the annotation folders having been written
    first.

    root_dir : data root, joined onto the current working directory
    new_dir : name of the folder below root_dir
    indices : [train_ind, val_ind, test_ind]; written to the files
              "train_index", "val_index" and "test_index"
    """
    pickle_dir = os.path.join(os.getcwd(), root_dir, new_dir, "pickled_index_lists")
    os.makedirs(pickle_dir, exist_ok=True)

    # the position in `indices` decides which split a list belongs to
    for position, split in enumerate(("train", "val", "test")):
        with open(os.path.join(pickle_dir, f"{split}_index"), 'wb') as fp:
            pickle.dump(indices[position], fp)
            print(f'Done writing {split} index list into a binary file')
    

In [16]:
# Folder below root_dir that collects everything saved for this sample size.
new_dir = f"data_{n}_samples"

save_annotations(root_dir, new_dir, n)

In [19]:
# Pickle the [train, val, test] index lists returned by the split next to the saved annotations.
save_index_lists(root_dir, new_dir, indices)

Done writing train index list into a binary file
Done writing val index list into a binary file
Done writing test index list into a binary file


In [None]:
# check data

# coco_0_dir = os.path.join(os.getcwd(), root_dir, new_dir, "coco_0_annotation")
# with open(coco_0_dir + '/train_coco_0_annotation_{0}.json'.format(n), 'r') as file:
#      train_coco_annotation = json.load(file) 
# with open(coco_0_dir + '/val_coco_0_annotation_{0}.json'.format(n), 'r') as file:
#      val_coco_annotation = json.load(file)
# with open(coco_0_dir + '/test_coco_0_annotation_{0}.json'.format(n), 'r') as file:
#      test_coco_annotation = json.load(file)

In [26]:
#check pickled list

# cwd = os.getcwd()
# path = os.path.join(cwd, root_dir, new_dir, "pickled_index_lists" )
# with open(path + '/val_index', 'rb') as fp:
#     n_list = pickle.load(fp)
    
# n_list