In [4]:
import json
import os
import random
from math import ceil
from typing import Any
from typing import Dict
from typing import List
from typing import Optional
from typing import Tuple

import numpy as np
import tensorflow as tf
from constants import constants
from PIL import Image
from six import BytesIO


def aggregate_annotations_per_file_process_and_save(root: str) -> None:
    """Convert the annotation information from coco format to the format used by the data iterator.

    Read the annotations file (annotations.json) and aggregate annotations per image
    Processing: normalize and transpose the bounding boxes
        Input bounding boxes are x_min, y_min, x_max, y_max
        After processing, bounding boxes are y_min/height, x_min/width, y_max/height, x_max/width
    Save annotations in a folder with one annotation file per image. Images that have no
    annotation still get a file, holding empty boxes and labels lists.
    To read more about coco format, see https://cocodataset.org/#download

    Args:
        root: directory containing annotations.json. The output folder
            (constants.ANNOTATION_FOLDER) is created inside it if needed.

    Raises:
        KeyError: if an annotation references an image_id that is not listed in the images section,
            or if an expected key is missing from the annotations file.
        ZeroDivisionError: if a referenced image is recorded with zero width or height.
    """
    # JSON is UTF-8 by specification; do not depend on the platform default encoding.
    with open(os.path.join(root, constants.ANNOTATION_FILE), encoding="utf-8") as f:
        image_annotations = json.load(f)

    # images_info is a list of dict with keys including image_name, image_id, height and width of the image
    images_info = image_annotations[constants.IMAGES]

    # annotations_info is a list of dict with keys including image_id, bbox, and category_id
    annotations_info = image_annotations[constants.ANNOTATIONS]

    # Mapping from image_id to image_name
    id_to_images = {image[constants.ID]: image[constants.FILE_NAME] for image in images_info}

    # Mapping from image_id to (width, height), used to normalize the boxes
    id_to_width_and_height = {
        image[constants.ID]: (image[constants.WIDTH], image[constants.HEIGHT]) for image in images_info
    }

    # annotations_per_image aggregates annotations per image_name. Every image starts with empty lists.
    annotations_per_image = {
        file_name: {constants.BOXES: [], constants.LABELS: []} for file_name in id_to_images.values()
    }

    for annotation in annotations_info:
        image_id = annotation[constants.IMAGE_ID]
        image_name = id_to_images[image_id]
        width, height = id_to_width_and_height[image_id]

        bbox = annotation[constants.BBOX]
        x_min, y_min, x_max, y_max = bbox[0], bbox[1], bbox[2], bbox[3]

        # Transform the box from the input format to the format expected by the training algorithm.
        # {x_min, y_min, x_max, y_max} -> {y_min/height, x_min/width, y_max/height, x_max/width}
        bbox_normalized_and_transposed = [y_min / height, x_min / width, y_max / height, x_max / width]

        per_image = annotations_per_image[image_name]
        per_image[constants.BOXES].append(bbox_normalized_and_transposed)
        per_image[constants.LABELS].append(annotation[constants.CATEGORY_ID])

    # Create the annotations directory in the root folder; it holds one annotation file per image.
    # makedirs(exist_ok=True) tolerates an existing directory without a separate existence check
    # and also creates intermediate directories if the folder name is a nested path.
    annotation_directory_path = os.path.join(root, constants.ANNOTATION_FOLDER)
    os.makedirs(annotation_directory_path, exist_ok=True)

    for image_name, image_annotation in annotations_per_image.items():
        # NOTE(review): the output name keeps only the text before the FIRST dot, so "a.b.jpg" and
        # "a.c.jpg" would both be written to "a.json" (the later one wins). Left unchanged because the
        # consumer of these files presumably derives names the same way - confirm before changing.
        json_file_name = f"{image_name.split('.')[0]}.json"
        with open(os.path.join(annotation_directory_path, json_file_name), "w", encoding="utf-8") as nf:
            json.dump(image_annotation, nf)


In [5]:
# Generate the per-image annotation files for the PlantDoc dataset stored under this root.
aggregate_annotations_per_file_process_and_save(
    root='/root/datasets/PlantDoc-Object-Detection-Dataset/'
)

In [34]:
# Show the kernel's current working directory; relative paths in later cells resolve against it.
# Plain Python is used instead of the bare `pwd` automagic, which only works in single-line cells
# and stops working if a variable named `pwd` is ever defined.
os.getcwd()

'/root/sourcedir'

In [49]:
# Upload the local annotations.json to the input_directory prefix of the S3 bucket.
! aws s3 cp '/root/sourcedir/annotations.json' s3://plant-disease-detection-datasets/input_directory/annotations.json

upload: ./annotations.json to s3://plant-disease-detection-datasets/input_directory/annotations.json


In [12]:
import json

# Load the annotations file from the current working directory.
# JSON is UTF-8 by specification, so do not depend on the platform default encoding.
with open('annotations.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Number of image records listed under the 'images' key
image_count = len(data['images'])

# Report the count as a labelled number. Passing `{image_count}` without an f-string builds a
# one-element set, so the number would be printed wrapped in braces.
print(f"Number of images: {image_count}")


{2335}


In [17]:
# Count the .jpg files under TRAIN/ for comparison with the number of annotated images.
# The pattern is quoted so that `find` receives it literally; unquoted, the shell would expand
# *.jpg against the current directory first whenever that directory contains matching files.
! find /root/datasets/PlantDoc-Object-Detection-Dataset/TRAIN/ -name '*.jpg' | wc -l

2379
