In [1]:
import json
import os
import math
from tqdm.auto import tqdm
import shutil
from uuid import uuid4

In [2]:
# Paths used by the train/validation split below.
# Directory containing the source images and the COCO annotation file.
ORIGINAL_DS_PATH = "detr_app_dataset/train"
# Name of the COCO annotation file inside ORIGINAL_DS_PATH.
DS_FILE = "_annotations.coco.json"
# Destination directory for the images copied into the training split.
TRAIN_PATH = "hand-cursive-detr/train"
# Destination directory for the images copied into the validation split.
VAL_PATH = "hand-cursive-detr/valid"
# Directory where the generated train/val annotation JSON files are written.
# NOTE: the name is misspelled ("DATASE"); kept as-is because later cells reference it.
DATASE_FILE_OUT = "hand-cursive-detr"

In [None]:
# Load the source COCO annotation file. The context manager guarantees the
# file handle is closed even if JSON parsing fails.
with open(os.path.join(ORIGINAL_DS_PATH, DS_FILE), "r", encoding="utf-8") as annotations_file:
    dataset_data = json.load(annotations_file)

# List of image records from the annotation file; displayed count is a sanity check.
dataset_images = dataset_data["images"]
len(dataset_images)

In [5]:
# Deterministic 70/30 split: the first 70% of the images (in file order) go to
# training, the remainder to validation. No shuffling is performed.
total_train_items = math.floor(0.7 * len(dataset_images))
selected_train_images, selected_val_images = (
    dataset_images[:total_train_items],
    dataset_images[total_train_items:],
)
# Show the size of each split.
(len(selected_train_images), len(selected_val_images))

In [None]:
# Build the COCO dataset for the training split: copy every selected image into
# TRAIN_PATH and re-index image and annotation ids so they are contiguous from 0.
train_dataset = {
    "info": dataset_data["info"],
    "licenses": dataset_data["licenses"],
    "categories": dataset_data["categories"],
    "images": [],
    "annotations": []
}

# Group annotations by their original image id once, instead of rescanning the
# whole annotation list for every image. Per-image order is preserved.
annotations_by_image = {}
for annotation in dataset_data["annotations"]:
    annotations_by_image.setdefault(annotation["image_id"], []).append(annotation)

# shutil.copyfile does not create missing directories, so make sure the target exists.
os.makedirs(TRAIN_PATH, exist_ok=True)

for idx, image in enumerate(tqdm(selected_train_images)):
    # Copy the record so the source dataset is not mutated; idx is the new image id.
    selected_image = {**image, "id": idx}
    dst_path = os.path.join(TRAIN_PATH, selected_image["file_name"])
    shutil.copyfile(os.path.join(ORIGINAL_DS_PATH, selected_image["file_name"]), dst_path)
    # The stored file_name is prefixed with the split folder name.
    selected_image["file_name"] = os.path.join("train", selected_image["file_name"])
    train_dataset["images"].append(selected_image)

    # Re-point each annotation at the new image id and give it the next
    # sequential annotation id (its position in the output list).
    for annotation in annotations_by_image.get(image["id"], []):
        train_dataset["annotations"].append(
            {**annotation, "image_id": idx, "id": len(train_dataset["annotations"])}
        )

In [None]:
# Build the COCO dataset for the validation split: copy every selected image into
# VAL_PATH and re-index image and annotation ids so they are contiguous from 0.
val_dataset = {
    "info": dataset_data["info"],
    "licenses": dataset_data["licenses"],
    "categories": dataset_data["categories"],
    "images": [],
    "annotations": []
}

# Group annotations by their original image id once, instead of rescanning the
# whole annotation list for every image. Per-image order is preserved.
annotations_by_image = {}
for annotation in dataset_data["annotations"]:
    annotations_by_image.setdefault(annotation["image_id"], []).append(annotation)

# shutil.copyfile does not create missing directories, so make sure the target exists.
os.makedirs(VAL_PATH, exist_ok=True)

for idx, image in enumerate(tqdm(selected_val_images)):
    # Copy the record so the source dataset is not mutated; idx is the new image id.
    selected_image = {**image, "id": idx}
    dst_path = os.path.join(VAL_PATH, selected_image["file_name"])
    shutil.copyfile(os.path.join(ORIGINAL_DS_PATH, selected_image["file_name"]), dst_path)
    # The stored file_name is prefixed with the split folder name.
    selected_image["file_name"] = os.path.join("valid", selected_image["file_name"])
    val_dataset["images"].append(selected_image)

    # Re-point each annotation at the new image id and give it the next
    # sequential annotation id (its position in the output list).
    for annotation in annotations_by_image.get(image["id"], []):
        val_dataset["annotations"].append(
            {**annotation, "image_id": idx, "id": len(val_dataset["annotations"])}
        )

In [13]:
# Write both split annotation files. Context managers ensure each file is
# flushed and closed as soon as it has been written.
os.makedirs(DATASE_FILE_OUT, exist_ok=True)

with open(os.path.join(DATASE_FILE_OUT, "_train_annotations_coco.json"), "w", encoding="utf-8") as train_file:
    json.dump(train_dataset, train_file)

with open(os.path.join(DATASE_FILE_OUT, "_val_annotations_coco.json"), "w", encoding="utf-8") as val_file:
    json.dump(val_dataset, val_file)