In [61]:
import json
import os
import glob

# Load the labeling manifest: one JSON document per non-empty line.
manifest = "output.manifest"
with open(manifest) as manifest_file:
    # Trim surrounding whitespace (including the trailing newline) from every line.
    content = [raw_line.strip() for raw_line in manifest_file.readlines()]
# Lines that are empty after stripping carry no record and are skipped.
json_items = [json.loads(entry) for entry in content if entry != ""]

In [62]:
# Partition the manifest entries on the presence of the 'NOAAlabelstodo' key:
# entries that carry it go to `ilabeled` (with the key removed in place),
# all others go to `prelabeled`.
prelabeled, ilabeled = [], []
for entry in json_items:
    if 'NOAAlabelstodo' not in entry:
        prelabeled.append(entry)
        continue
    entry.pop('NOAAlabelstodo')
    ilabeled.append(entry)

### Create 2016CHESS_US and 2019PB dataframes

In [63]:
import pandas as pd
annotations_path_chess = "/data/raw_data/PolarBears/2016CHESS_PolarBearAnnotations.csv"
annotations_path_2019 = "/data/raw_data/PolarBears/2019_pb_annotations.csv"
numeric_cols = ["Xmin", "Ymin", "Xmax", "Ymax"]


def _load_annotations(csv_path, frame_col):
    """Read one annotation CSV and keep only rows that name a frame.

    PB_ID is read as ``object`` so it is not coerced to a number, the four
    box-corner columns in ``numeric_cols`` are converted with
    ``pd.to_numeric``, and rows whose ``frame_col`` value is null are dropped.
    """
    table = pd.read_csv(csv_path, sep=',', header=0, dtype={'PB_ID': object})
    table[numeric_cols] = table[numeric_cols].apply(pd.to_numeric)
    return table[table[frame_col].notnull()]


data2016 = _load_annotations(annotations_path_chess, "Frame_color")
data2019 = _load_annotations(annotations_path_2019, "Frame_xml")

### Create 2016CHESS_RU

In [64]:
import xml.etree.ElementTree as ET
# Every interactively labeled entry starts out tagged as "done-by-me"; entries
# whose boxes are replaced from an XML file below are re-tagged "done-by-noaa".
for obj in ilabeled:
    obj['NOAAlabelsdone-metadata']['job-name'] = "labeling-job/done-by-me"

RU_PATH = "/data/raw_data/PolarBears/s3_images/2016_Chukchi_CHESS_Russia"


def _read_xml_boxes(xml_path):
    """Parse one annotation XML file into ``(image_name, boxes)``.

    Children are addressed by position: ``root[1]`` is taken as the image file
    name and ``root[6:]`` as the annotated objects; within each object, child 0
    is the ID and child 4 holds xmin, ymin, xmax, ymax in that order.
    NOTE(review): presumably a Pascal VOC-style layout -- confirm against the files.

    Each box is a ``(pb_id, xmin, ymin, xmax, ymax)`` tuple with int coordinates.
    """
    root = ET.parse(xml_path).getroot()
    boxes = []
    for annotation in root[6:]:
        bounds = annotation[4]
        boxes.append((
            annotation[0].text,
            int(bounds[0].text),
            int(bounds[1].text),
            int(bounds[2].text),
            # Read ymax from the current object, not from root[6] (the first one).
            int(bounds[3].text),
        ))
    return root[1].text, boxes


xml_files = glob.glob(os.path.join(RU_PATH, "*.xml"))
for xml_path in xml_files:
    img, boxes = _read_xml_boxes(xml_path)
    if not boxes:
        # An XML file without objects must not wipe existing annotations.
        continue
    for entry in ilabeled:
        src = entry['source-ref']
        category, file_name = src.replace("s3://", "").split("/")[2:]
        if file_name != img:
            continue
        annotations = []
        for pb_id, xmin, ymin, xmax, ymax in boxes:
            h = ymax - ymin
            w = xmax - xmin
            # NOTE(review): "left"/"top" are written as corner + half the size
            # (i.e. the box centre), not the corner itself -- confirm that
            # downstream consumers expect this.
            annotations.append(
                {
                    "Age_class": "UNK",
                    "PB_ID": pb_id,
                    "class_id": 0,
                    "height": h,
                    "left": xmin + w/2,
                    "top": ymin + h/2,
                    "width": w
                }
            )
        # Replace whatever boxes the entry had with the ones from the XML file.
        entry['NOAAlabelsdone']['annotations'] = annotations
        entry['NOAAlabelsdone-metadata']['job-name'] = "labeling-job/done-by-noaa"



In [65]:

# Enrich each prelabeled entry with PB_ID / Age_class from the matching CSV rows
# and record whether any of those rows is flagged for poor image quality.
for obj in prelabeled:
    src = obj["source-ref"]
    category, file = src.replace("s3://","").split("/")[2:]
    # regex=False: the file-name fragment is matched literally, so characters
    # such as "." or "(" in a name cannot change which rows are selected.
    if category == "2019_Beaufort_PolarBears":
        matches = data2019[data2019["Frame_xml"].str.contains(file[:10], regex=False)]
    elif category == "2016_Chukchi_CHESS_US":
        matches = data2016[data2016["Frame_color"].str.contains(file[:-4], regex=False)]
    else:
        # "2016_Chukchi_CHESS_Russia" (or any unknown category) has no CSV to
        # look up. Skip the entry rather than reuse the rows selected for the
        # previous entry (or hit a NameError on the very first one).
        print("?")
        continue

    widths = list(matches['Xmax'] - matches['Xmin'])
    heights = list(matches['Ymax'] - matches['Ymin'])
    ids = list(matches['PB_ID'])
    ages = list(matches['Age_class'])
    # Plain Python bool so the value stays JSON-serializable.
    badres = bool(matches['Poor_image_quality'].notnull().any())

    for det in obj['NOAAlabelsdone']['annotations']:
        h = det['height']
        w = det['width']
        # A detection is paired with a CSV row purely by identical box size;
        # when several rows share the size, the last one wins.
        for row_h, row_w, pb_id, age in zip(heights, widths, ids, ages):
            if row_h == h and row_w == w:
                det['PB_ID'] = pb_id
                det['Age_class'] = age
    obj['NOAAlabelsdone-metadata']['job-name'] = "labeling-job/done-by-noaa"
    obj['NOAAlabelsdone-metadata']['bad-res'] = badres


In [66]:
# Merge both groups. Entries whose annotation list is empty have their label
# and metadata re-keyed from 'NOAAlabelsdone*' to 'NOAAlabelsTODO*'.
combined = prelabeled + ilabeled
for record in combined:
    if len(record['NOAAlabelsdone']['annotations']) != 0:
        continue
    record['NOAAlabelsTODO'] = record.pop('NOAAlabelsdone')
    record['NOAAlabelsTODO-metadata'] = record.pop('NOAAlabelsdone-metadata')

# One JSON document per line, keys sorted, each line terminated with "\r\n".
with open("updated.manifest", "w") as out_file:
    for record in combined:
        json.dump(record, out_file, sort_keys=True)
        out_file.write("\r\n")
