# Extract data from Notes from Nature expedition and setup for label detection

## Imports and other setup

In [1]:
%reload_ext autoreload
%autoreload 2

In [2]:
import csv
import json
from collections import defaultdict
from pathlib import Path

import numpy as np
import pandas as pd

from digi_leap.subject import Subject

In [3]:
DATA_DIR = Path.cwd() / 'data'

LABEL_BABEL_1 = DATA_DIR / 'label-babel-1'
LABEL_BABEL_2 = DATA_DIR / 'label-babel-2'

SHEETS_1 = LABEL_BABEL_1 / 'herbarium-sheets'
SHEETS_2 = LABEL_BABEL_2 / 'herbarium-sheets'

UNRECONCILED = LABEL_BABEL_2 / '17633_label_babel_2.unreconciled.csv'
RECONCILED = LABEL_BABEL_2 / '17633_label_babel_2.reconciled.csv'

## Prepare to reconcile the data from Label Babel 2

### Command line arguments used to reconcile Label Babel 1

It's handy to have a record of what I did before.

```bash
./reconcile.py -u temp/8296_label_babel.unreconciled.csv -r temp/8296_label_babel.reconciled.csv -s temp/8296_label_babel.summary.html  --tool-label-hack 'a33c0ef367baa8:All typewritten,bf76cbd8a5a838:All handwritten,a18a7571a273e8:Both typewritten and handwritten' data/raw/labs_label-babel-classifications.csv
```

### Look at the data given to us by the provider (Zooniverse.org)

Oh joy! The out put from the data provider changed between Label Babel 1 & 2. Let's look at what the differences are.

In [4]:
# Constants so we can copy/pasta some JSON into the a display function

true = True
false = False

#### What annotation data looks like from Label Babel 1

In [5]:
print(json.dumps([{"task": "T0", "task_label": "Identify the primary label on the specimen and the kind of text it contains.", "value": [{"x": 2377.794677734375, "y": 4987.271484375, "tool": 0, "frame": 0,
                                                                                                                                          "width": 1543.725341796875, "height": 982.72119140625, "details": [{"value": [{"value": "a33c0ef367baa8", "option": true}]}], "tool_label": "Rectangle around the primary label."}]}], indent=2))

[
  {
    "task": "T0",
    "task_label": "Identify the primary label on the specimen and the kind of text it contains.",
    "value": [
      {
        "x": 2377.794677734375,
        "y": 4987.271484375,
        "tool": 0,
        "frame": 0,
        "width": 1543.725341796875,
        "height": 982.72119140625,
        "details": [
          {
            "value": [
              {
                "value": "a33c0ef367baa8",
                "option": true
              }
            ]
          }
        ],
        "tool_label": "Rectangle around the primary label."
      }
    ]
  }
]


#### What annotation data looks like from Label Babel 2

In [6]:
print(json.dumps([{"task": "T0", "task_label": "Identify the labels on the specimen and the kind of text they contain.", "value": [{"x": 655.1286010742188, "y": 24.654661178588867, "tool": 0, "frame": 0, "width": 411.24566650390625, "height": 61.76217079162598, "details": [{"value": 1}], "tool_label": "Box(es)"}, {"x": 453.271728515625, "y": 328.9463195800781, "tool": 0, "frame": 0, "width": 146.1202392578125, "height": 120.51153564453125, "details": [{"value": 1}], "tool_label": "Box(es)"}, {"x": 905.1900634765625, "y": 1333.7113037109375, "tool": 0, "frame": 0, "width": 149.133056640625, "height": 63.2685546875, "details": [{"value": 4}], "tool_label": "Box(es)"}, {"x": 628.0134887695312, "y": 1403.0054931640625, "tool": 0, "frame": 0, "width": 451.91827392578125, "height": 402.207275390625,
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    "details": [{"value": 1}], "tool_label": "Box(es)"}, {"x": 275.5171813964844, "y": 1571.7216796875, "tool": 0, "frame": 0, "width": 357.0154724121094, "height": 158.17138671875, "details": [{"value": 0}], "tool_label": "Box(es)"}, {"x": 99.26904296875, "y": 1634.990234375, "tool": 0, "frame": 0, "width": 125.03073120117188, "height": 58.7493896484375, "details": [{"value": 4}], "tool_label": "Box(es)"}, {"x": 88.72428894042969, "y": 1735.918701171875, "tool": 0, "frame": 0, "width": 222.9463653564453, "height": 36.1534423828125, "details": [{"value": 1}], "tool_label": "Box(es)"}, {"x": 424.6502380371094, "y": 1274.9619140625, "tool": 0, "frame": 0, "width": 93.39645385742188, "height": 149.133056640625, "details": [{"value": 1}], "tool_label": "Box(es)"}]}], indent=2))

[
  {
    "task": "T0",
    "task_label": "Identify the labels on the specimen and the kind of text they contain.",
    "value": [
      {
        "x": 655.1286010742188,
        "y": 24.654661178588867,
        "tool": 0,
        "frame": 0,
        "width": 411.24566650390625,
        "height": 61.76217079162598,
        "details": [
          {
            "value": 1
          }
        ],
        "tool_label": "Box(es)"
      },
      {
        "x": 453.271728515625,
        "y": 328.9463195800781,
        "tool": 0,
        "frame": 0,
        "width": 146.1202392578125,
        "height": 120.51153564453125,
        "details": [
          {
            "value": 1
          }
        ],
        "tool_label": "Box(es)"
      },
      {
        "x": 905.1900634765625,
        "y": 1333.7113037109375,
        "tool": 0,
        "frame": 0,
        "width": 149.133056640625,
        "height": 63.2685546875,
        "details": [
          {
            "value": 4
          }
        ],


### What a workflow entry looks like for Label Babel 2

In [7]:
print(json.dumps({"T0.help": "Locate all labels on the specimen and select the answer that best matches. Draw a box around each label until you have identified them all. If you want to start your rectangle over, just click the \"x.\"\n\nUse the zoom and pan buttons in the upper right to focus in on any smaller labels if needed. Once you have selected the label and the type, click OK to move on to next label. \n\nYou can ignore any envelopes, tags or fragment packets. Also, ignore any stamps directly on the sheet, scale bars or color chips.   \n\nSelect the option that best represents the label. Stray handwritten marks such as check marks can be ignored when deciding whether the label is typewritten, handwritten or both. If a label is mostly handwritten, and only the herbarium nameplate is typed (i.e. the herbarium name), classify this as “All handwritten.\"\n\nNote that there may be different kinds of barcodes represented such as linear and matrix (2D). All of these should be labeled \"Barcode.\" More information about different types of barcodes can be found [on Wikipedia](+tab+https://en.wikipedia.org/wiki/Barcode#Types_of_barcodes).\n\n**Blue arrows indicate labels with typewritten text. Red arrow indicates a barcode.**\n\n![Label_Finder_Ex1_ArrowsV3.jpg](https://panoptes-uploads.zooniverse.org/project_attached_image/d21033b1-c749-49c0-9814-e0acf7a2cc7f.jpeg =450x)\n\n**Blue arrows indicate labels with typewritten text. Red arrow indicate a barcode.**\n\n![Label_Finder_Ex2_ArrowsV3.jpg](https://panoptes-uploads.zooniverse.org/project_attached_image/a7f981dc-d90e-45b0-ad6e-034838d98f1c.jpeg =450x)\n\n**Blue arrows indicate labels with typewritten text. Red arrow indicates a barcode. Green arrows indicate handwritten text.**\n\n![Label_Finder_Ex3_ArrowV3.jpg](https://panoptes-uploads.zooniverse.org/project_attached_image/df1f806b-3eca-4719-99fa-146ddd15bbb8.jpeg =450x)\n\n**The example below contains a fragment packet in the upper left of the sheet. There is also a stamp in the upper right. These can be ignored and do not need to be identified.**\n\n![Other_Example.jpg](https://panoptes-uploads.zooniverse.org/project_attached_image/f4d80819-e68f-4ccc-b36e-c7f3083aa230.jpeg =450x)\n\n\n**The example below contains a tag marked by an orange arrow. These can be ignored and do not need to be identified.**\n\n![TagEx1.jpg](https://panoptes-uploads.zooniverse.org/project_attached_image/bf49dd12-0102-4223-afa8-8ac7ac6af818.jpeg =450x)\n\n**The example below contains a tag marked by an orange arrow. These can be ignored and do not need to be identified.**\n\n![TagEx2.jpg](https://panoptes-uploads.zooniverse.org/project_attached_image/3864482d-8edb-47b0-87e5-d00fc47ee3cb.jpeg =450x)",
                  "T1.help": "", "T0.instruction": "Identify the labels on the specimen and the kind of text they contain.", "T1.instruction": "Identify all the labels on the specimen, the type of label and the kind of text it contains.", "T0.tools.0.label": "Box(es)", "T1.tools.0.label": "Rectangle (secondary label)", "T0.tools.0.details.0.help": "", "T1.tools.0.details.0.help": "", "T0.tools.0.details.0.question": "Choose the option that best describes the label you just marked:", "T1.tools.0.details.0.question": "Choose the option that best describes the label you just marked:", "T0.tools.0.details.0.answers.0.label": "Label - All handwritten", "T0.tools.0.details.0.answers.1.label": "Label - All typewritten\n", "T0.tools.0.details.0.answers.2.label": "Label - Both\n", "T0.tools.0.details.0.answers.3.label": "Barcode\n", "T1.tools.0.details.0.answers.0.label": "Contains only handwritten text", "T1.tools.0.details.0.answers.1.label": "Contains only printed text", "T1.tools.0.details.0.answers.2.label": "Contains both handwritten and printed text"}, indent=2))

{
  "T0.help": "Locate all labels on the specimen and select the answer that best matches. Draw a box around each label until you have identified them all. If you want to start your rectangle over, just click the \"x.\"\n\nUse the zoom and pan buttons in the upper right to focus in on any smaller labels if needed. Once you have selected the label and the type, click OK to move on to next label. \n\nYou can ignore any envelopes, tags or fragment packets. Also, ignore any stamps directly on the sheet, scale bars or color chips.   \n\nSelect the option that best represents the label. Stray handwritten marks such as check marks can be ignored when deciding whether the label is typewritten, handwritten or both. If a label is mostly handwritten, and only the herbarium nameplate is typed (i.e. the herbarium name), classify this as \u201cAll handwritten.\"\n\nNote that there may be different kinds of barcodes represented such as linear and matrix (2D). All of these should be labeled \"Barcode.

#### Command line arguments for getting unreconciled data from Label Babel 2

```bash
./reconcile.py -u data/raw/17633_label_babel_2.unreconciled.csv --tool-label-hack '0:Handwritten,1:Typewritten,2:Both,3:Barcode' data/raw/labs_label-babel-2-classifications.csv
```

## Reconcile data from Label Babel 2

In [8]:
with open(UNRECONCILED) as csv_file:
    reader = csv.DictReader(csv_file)
    classifications = [r for r in reader]

### Group classifications by subject

In [9]:
subs: dict[str, Subject] = defaultdict(lambda: Subject())

for clsif in classifications:
    sub_id = clsif['subject_id']

    subs[sub_id].subject_id = sub_id
    subs[sub_id].subject_file_name = clsif['subject_Filename']

    coords = [v for k, v in clsif.items() if k.startswith('Box(es): box') and v]
    boxes = np.array([Subject.bbox_from_json(c) for c in coords if c])
    if len(boxes):
        subs[sub_id].boxes = np.vstack((subs[sub_id].boxes, boxes))

    selects = [(v if v else '') for k, v in clsif.items()
               if k.startswith('Box(es): select')]
    types = np.array(selects[:len(boxes)], dtype=str)
    if len(types):
        subs[sub_id].types = np.hstack((subs[sub_id].types, types))

subjects = list(subs.values())

### Merge all boxes in each group of boxes into a single bound box

There is a slight wrinkle here in that when labels are next to each other on the herbarium sheet some people lumped them into one large bounding box and others drew boxes around the individual labels. We'd prefer to have the individual bounding boxes for each label so we're going to do some extra processing to see if we can get them.

In [10]:
for_df = []

for subject in subjects:
    subject.merge_box_groups()
    for_df.append(subject.to_dict(everything=True))

medialib.naturalis.nl_file_id_L.1732428_format_large.jpg
medialib.naturalis.nl_file_id_L.1758368_format_large.jpg
medialib.naturalis.nl_file_id_L.1943278_format_large.jpg
medialib.naturalis.nl_file_id_L.2215372_format_large.jpg
medialib.naturalis.nl_file_id_L.2381614_format_large.jpg
medialib.naturalis.nl_file_id_L.2531270_format_large.jpg
medialib.naturalis.nl_file_id_L.2713449_format_large.jpg
medialib.naturalis.nl_file_id_L.2920885_format_large.jpg
medialib.naturalis.nl_file_id_L.2920885_format_large.jpg


In [11]:
df = pd.DataFrame(for_df).fillna('')
df.head()

Unnamed: 0,subject_id,subject_file_name,merged_box_1,merged_type_1,merged_box_2,merged_type_2,merged_box_3,merged_type_3,box_1,type_1,...,group_40,box_41,type_41,group_41,box_42,type_42,group_42,box_43,type_43,group_43
0,56418756,biimages.biodiversity.ku.edu_static_VascularPl...,"{""left"": 1843, ""top"": 3366, ""right"": 2688, ""bo...",Typewritten,"{""left"": 124, ""top"": 3871, ""right"": 608, ""bott...",Barcode,"{""left"": 1887, ""top"": 1888, ""right"": 2138, ""bo...",Handwritten,"{""left"": 1843, ""top"": 3366, ""right"": 2688, ""bo...",Typewritten,...,,,,,,,,,,
1,56418757,biimages.biodiversity.ku.edu_static_VascularPl...,"{""left"": 1374, ""top"": 2694, ""right"": 2056, ""bo...",Typewritten,"{""left"": 1566, ""top"": 2558, ""right"": 2024, ""bo...",Typewritten,"{""left"": 71, ""top"": 2931, ""right"": 451, ""botto...",Barcode,"{""left"": 1395, ""top"": 2694, ""right"": 2031, ""bo...",Typewritten,...,,,,,,,,,,
2,56418758,biimages.biodiversity.ku.edu_static_VascularPl...,"{""left"": 1364, ""top"": 2699, ""right"": 2035, ""bo...",Typewritten,"{""left"": 941, ""top"": 2965, ""right"": 1375, ""bot...",Typewritten,"{""left"": 188, ""top"": 2960, ""right"": 547, ""bott...",Barcode,"{""left"": 1379, ""top"": 2722, ""right"": 2027, ""bo...",Typewritten,...,,,,,,,,,,
3,56418759,biimages.biodiversity.ku.edu_static_VascularPl...,"{""left"": 1303, ""top"": 2741, ""right"": 2038, ""bo...",Typewritten,"{""left"": 96, ""top"": 2884, ""right"": 534, ""botto...",Barcode,"{""left"": 1313, ""top"": 2614, ""right"": 1747, ""bo...",Typewritten,"{""left"": 1314, ""top"": 2782, ""right"": 2003, ""bo...",Typewritten,...,,,,,,,,,,
4,56418760,biimages.biodiversity.ku.edu_static_VascularPl...,"{""left"": 1295, ""top"": 2733, ""right"": 2053, ""bo...",Typewritten,"{""left"": 812, ""top"": 5, ""right"": 2071, ""bottom...",Typewritten,"{""left"": 1284, ""top"": 2612, ""right"": 2018, ""bo...",Typewritten,"{""left"": 1301, ""top"": 2750, ""right"": 2032, ""bo...",Typewritten,...,,,,,,,,,,


In [12]:
df.to_csv(RECONCILED, index=False)