In [None]:
# Make sure these libraries are available within your Python environment.
# The following lines install them; comment them out if they are already installed.
!pip install --upgrade pip
!pip install 'sagemaker>=2.42.0'
!pip install jsonlines
!pip install pillow
!pip install matplotlib
!pip install 'sagemaker[local]' --upgrade
!pip install boto3
!pip install pandas
!pip install s3fs

## 1 - Convert the Ground Truth annotations to a csv file


In [None]:
# Name of the S3 bucket holding the Ground Truth job output (placeholder: replace before running).
s3_bucket = "<YOUR GROUNDTRUTH S3 BUCKET>"
# Key prefix inside the bucket; it is part of every S3 path built in this notebook.
job_id = "visualsearch"
# Name of the Ground Truth labeling job (placeholder: replace before running).
# It is used in the S3 paths and as the key of the annotation entry in each manifest record.
gt_job_name = "<YOUR GROUNDTRUTH JOB NAME>"


In [None]:
from io import StringIO
import json
import s3fs
import boto3
import pandas as pd

def _rows_from_record(record, job_name):
    """
    Flattens one parsed manifest record into a list of bounding box rows

    Input:
    record: dict decoded from one line of the output manifest
    job_name: name of the Ground Truth job (key of the annotation entry)

    Returns:
    list of rows [img_file, category, box_left, box_top, box_height,
    box_width, img_width, img_height], one per annotated box
    """

    image_file_name = record["source-ref"].split("/")[-1]
    class_maps = record[f"{job_name}-metadata"]["class-map"]

    imsize_list = record[job_name]["image_size"]
    # Only the first size entry is used below, so anything other than exactly
    # one entry would silently produce wrong image dimensions.
    assert len(imsize_list) == 1, (
        f"expected exactly one image_size entry for {image_file_name}, "
        f"got {len(imsize_list)}"
    )
    image_width = imsize_list[0]["width"]
    image_height = imsize_list[0]["height"]

    return [
        [
            image_file_name,
            # the class map is looked up with the class id rendered as a string
            class_maps[str(annot["class_id"])],
            annot["left"],
            annot["top"],
            annot["height"],
            annot["width"],
            image_width,
            image_height,
        ]
        for annot in record[job_name]["annotations"]
    ]


def parse_gt_output(manifest_path, job_name):
    """
    Captures the json Ground Truth bounding box annotations into a pandas dataframe

    Input:
    manifest_path: S3 path to the annotation file (one json record per line)
    job_name: name of the Ground Truth job

    Returns:
    df_bbox: pandas dataframe with bounding box coordinates
             for each item in every image. Columns: img_file, category,
             box_left, box_top, box_height, box_width, img_width, img_height.
             The dataframe is empty when no record carries annotations.

    Raises:
    AssertionError: if a record does not have exactly one image_size entry
    """

    filesys = s3fs.S3FileSystem()
    annot_list = []
    with filesys.open(manifest_path) as fin:
        # Stream the manifest instead of loading every line into memory first.
        for line in fin:
            if not line.strip():
                # A blank line is not a json document; skip it instead of
                # letting json.loads fail on it.
                continue
            record = json.loads(line)
            # The check is needed: a record without the job's annotation entry
            # has nothing to extract and would raise a KeyError further down.
            # NOTE(review): presumably these are items the job failed to
            # label - confirm against your manifest.
            if job_name in record:
                annot_list.extend(_rows_from_record(record, job_name))

    df_bbox = pd.DataFrame(
        annot_list,
        columns=[
            "img_file",
            "category",
            "box_left",
            "box_top",
            "box_height",
            "box_width",
            "img_width",
            "img_height",
        ],
    )

    return df_bbox


def save_df_to_s3(df_local, s3_bucket, destination):
    """
    Uploads a pandas dataframe to S3 as a csv object (index column omitted)

    Input:
    df_local: Dataframe to upload
    s3_bucket: Name of the target bucket
    destination: Object key (prefix and file name) inside the bucket
    """

    target = boto3.resource("s3").Object(s3_bucket, destination)

    # Render the csv in memory so that nothing has to be written to local disk.
    with StringIO() as text_buffer:
        df_local.to_csv(text_buffer, index=False)
        payload = text_buffer.getvalue()

    target.put(Body=payload)

def save_df_to_local_csv(df_local, file_destination):
    """
    Writes a pandas dataframe to a csv file (index column omitted)

    Input:
    df_local: Dataframe to save
    file_destination: Target handed unchanged to DataFrame.to_csv
    """

    df_local.to_csv(path_or_buf=file_destination, index=False)
    

"""
Performs the following tasks:
1. Reads the bucket and job settings defined earlier in the notebook
2. Parses the Ground Truth annotations and creates a dataframe
3. Saves the dataframe to a local csv file
"""

# S3 location of the labeling job's output manifest, built from the settings above.
manifest_path = f"s3://{s3_bucket}/{job_id}/{gt_job_name}/manifests/output/output.manifest"

# One dataframe row per annotated bounding box.
df_annot = parse_gt_output(manifest_path, gt_job_name)
# Kept as a local csv file; the filtering step later in the notebook reads it back.
save_df_to_local_csv(df_annot, './annot.csv')


## 2 - Simple filter to get a similar number of annotations per object category

In [None]:
import pandas as pd
# Category that dominates the data set, and the largest number of its boxes an
# image may contain before the whole image is removed from the data set.
filtered_category = "screwdriver"
max_boxes_per_image = 3

df_ann = pd.read_csv('annot.csv')

p = df_ann[["img_file","category"]]

print('-- Before filter')
print(p.groupby(["category"]).count())

# Number of boxes of the filtered category per image (indexed by img_file).
screwdrivers = p[p["category"] == filtered_category]
screwdriverPerFile = screwdrivers.groupby(['img_file']).count()
screwdriverToRemove = screwdriverPerFile[screwdriverPerFile["category"] > max_boxes_per_image]

# Anti-join: keep only the annotations whose image is NOT in screwdriverToRemove.
# `columns=` replaces the positional axis argument (`drop(label, 1)`), which
# recent pandas versions reject.
r = (
    df_ann.merge(
        screwdriverToRemove.drop(columns='category'),
        how='outer',
        on='img_file',
        indicator=True,
    )
    .query('_merge == "left_only"')
    .drop(columns='_merge')
)
print('-- After filter')
print(r[["img_file","category"]].groupby(["category"]).count())


save_df_to_local_csv(r, './annot-filtered.csv')


## 3 - Convert the data set metadata to YOLO format and save it to S3

In [None]:
import os
import json
from io import StringIO
import boto3
import s3fs
import pandas as pd

# Key prefix (appended to job_id) under which the per-image YOLO label files are written.
yolo_output = "labels/train"

def annot_yolo(annot_file, cats):
    """
    Converts Ground Truth bounding boxes into YOLO style annotations

    Input:
    annot_file: csv file containing Ground Truth annotations
    cats: List of object categories in proper order for model training;
          the position of a category in this list becomes its integer id

    Returns:
    boxes: pandas dataframe holding every column of the csv file plus
           int_category box_center_w box_center_h
           box_width and box_height are overwritten with their scaled values

    Note:
    YOLO data format: <object-class> <x_center> <y_center> <width> <height>
    All four box values are fractions of the image width / height.
    """

    boxes = pd.read_csv(annot_file)
    img_w = boxes["img_width"]
    img_h = boxes["img_height"]

    # integer class id = position of the category name in `cats`
    boxes["int_category"] = boxes["category"].apply(cats.index)

    # box centre in pixels, then scaled by the image dimensions
    boxes["box_center_w"] = (boxes["box_left"] + boxes["box_width"] / 2) / img_w
    boxes["box_center_h"] = (boxes["box_top"] + boxes["box_height"] / 2) / img_h

    # the box size is scaled last because the centres above need it in pixels
    boxes["box_width"] = boxes["box_width"] / img_w
    boxes["box_height"] = boxes["box_height"] / img_h

    return boxes


def save_annots_to_s3(s3_bucket, prefix, df_local):
    """
    For every image in the dataset, save a text file with annotation in YOLO format

    The text file is named after the image with its extension replaced by
    ".txt" ("shelf.v2.jpg" -> "shelf.v2.txt") and holds one line per box:
    <int_category> <box_center_w> <box_center_h> <box_width> <box_height>

    Input:
    s3_bucket: S3 bucket name
    prefix: Folder name under s3_bucket where files will be written
    df_local: pandas dataframe with the following columns
              img_file int_category box_center_w box_center_h box_width box_height
    """

    s3_resource = boto3.resource("s3")
    yolo_columns = [
        "int_category",
        "box_center_w",
        "box_center_h",
        "box_width",
        "box_height",
    ]

    # One pass over the dataframe; sort=False keeps the images in order of
    # first appearance.
    for image_file, df_single_img_annots in df_local.groupby("img_file", sort=False):
        # splitext removes only the final extension, so file names containing
        # additional dots keep their full stem and cannot collide.
        annot_txt_file = os.path.splitext(image_file)[0] + ".txt"
        destination = f"{prefix}/{annot_txt_file}"

        csv_buffer = StringIO()
        df_single_img_annots.to_csv(
            csv_buffer,
            index=False,
            header=False,
            sep=" ",
            float_format="%.4f",
            columns=yolo_columns,
        )
        s3_resource.Object(s3_bucket, destination).put(Body=csv_buffer.getvalue())


def get_cats(json_file):
    """
    Lists the category names in the order in which the json file stores them

    Input:
    json_file: s3 path of the json file containing the category information

    Returns:
    category_names: List of category names
    """

    with s3fs.S3FileSystem().open(json_file) as handle:
        # only the first line of the file is read and decoded
        payload = json.loads(handle.readline())

    category_names = []
    for entry in payload["labels"]:
        category_names.append(entry["label"])

    return category_names



"""
Performs the following tasks:
1. Reads the bucket and job settings defined earlier in the notebook
2. Collects the category names from the Ground Truth job
3. Creates a dataframe with annotations in YOLO format
4. Saves a text file in S3 with YOLO annotations
   for each of the labeled images
"""


# The label definitions of the Ground Truth job; the order of the categories
# in this file determines the integer class ids used in the YOLO files.
s3_path_cats = (f"s3://{s3_bucket}/{job_id}/{gt_job_name}/annotation-tool/data.json")
categories = get_cats(s3_path_cats)
print("\n labels used in Ground Truth job: ")
print(categories, "\n")

# Filtered annotations written by the previous section of the notebook.
gt_annot_file = "annot-filtered.csv"
s3_dir = f"{job_id}/{yolo_output}"
# Plain string: the message has no placeholders, so no f-prefix is needed.
print("annotation files saved in = ", s3_dir)

df_annot = annot_yolo(gt_annot_file, categories)

print(df_annot)

# Writes one "<image name>.txt" object per image under s3://{s3_bucket}/{s3_dir}/
save_annots_to_s3(s3_bucket, s3_dir, df_annot)
