# Reconcile data from Label Babel

## Imports and other setup

In [1]:
# (Re)load IPython's autoreload extension; mode 2 re-imports changed modules
# before each cell executes, so edits to imported project code (e.g. digi_leap)
# take effect without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [2]:
import csv
import json
from collections import defaultdict
from pathlib import Path

import numpy as np
import pandas as pd
from PIL import Image, UnidentifiedImageError
from sklearn.model_selection import train_test_split
from tqdm import tqdm

from digi_leap.subject import Subject

## Data that may change for each user

In [3]:
# Everything is resolved relative to the directory the notebook is started in.
DATA_DIR = Path.cwd() / 'data'

# One directory per Label Babel expedition. Only LABEL_BABEL_2 / SHEETS_2 are
# used by the cells in this notebook.
LABEL_BABEL_1 = DATA_DIR / 'label-babel-1'
LABEL_BABEL_2 = DATA_DIR / 'label-babel-2'

# Directories holding the herbarium sheet images; SHEETS_2 is where the image
# sizes are read from below.
# NOTE(review): "-small" presumably means down-sized copies -- confirm.
SHEETS_1 = LABEL_BABEL_1 / 'herbarium-sheets'
SHEETS_2 = LABEL_BABEL_2 / 'herbarium-sheets-small'

# UNRECONCILED is the input CSV; RECONCILED, TRAIN and TEST are written by the
# last cell of this notebook.
UNRECONCILED = LABEL_BABEL_2 / '17633_label_babel_2.unreconciled.csv'
RECONCILED = LABEL_BABEL_2 / '17633_label_babel_2.reconciled.csv'
TRAIN = LABEL_BABEL_2 / '17633_label_babel_2.train.csv'
TEST = LABEL_BABEL_2 / '17633_label_babel_2.test.csv'

## Read unreconciled data

In [4]:
# Load every classification row into memory as a dict keyed by column header.
# newline='' is what the csv module asks for: it hands line endings to the
# parser untouched so newlines embedded in quoted fields are read correctly.
with open(UNRECONCILED, newline='') as csv_file:
    reader = csv.DictReader(csv_file)
    classifications = list(reader)

## Group classifications by subject

In [5]:
# One Subject per subject_id. Every classification row for the same subject
# appends its boxes and box types onto that shared Subject.
subs: dict[str, Subject] = defaultdict(Subject)

for clsif in tqdm(classifications):
    sub_id = clsif['subject_id']
    current = subs[sub_id]  # created on first sight of this subject_id

    current.subject_id = sub_id
    current.image_file = clsif['subject_Filename']

    # Non-empty "box" columns, in column order, run through bbox_from_json.
    coords = [
        value for key, value in clsif.items()
        if key.startswith('Box(es): box') and value
    ]
    boxes = np.array([Subject.bbox_from_json(c) for c in coords])
    if len(boxes) > 0:
        current.boxes = np.vstack((current.boxes, boxes))

    # "select" columns with blanks normalised to ''; keep as many as there
    # are boxes from this row.
    selects = [
        value or '' for key, value in clsif.items()
        if key.startswith('Box(es): select')
    ]
    types = np.array(selects[:len(boxes)], dtype=str)
    if len(types) > 0:
        current.types = np.hstack((current.types, types))

subjects = list(subs.values())

100%|██████████| 15027/15027 [00:00<00:00, 23964.16it/s]


## Merge all boxes in each group of boxes into a single bounding box

There is a slight wrinkle here: when labels are next to each other on the herbarium sheet, some people lumped them into one large bounding box while others drew boxes around the individual labels. We'd prefer to have an individual bounding box for each label, so we're going to do some extra processing to see if we can get them.

In [6]:
# Have each subject merge its box groups, then flatten it into a plain dict
# (everything=True) that will become one data-frame row.
for_df = []

for subject in tqdm(subjects):
    subject.merge_box_groups()
    row = subject.to_dict(everything=True)
    for_df.append(row)

100%|██████████| 4995/4995 [00:05<00:00, 932.09it/s] 


## Get image sizes

In [7]:
# Attach the pixel dimensions of each subject's sheet image and keep only the
# subjects whose image Pillow can identify.
df_rows = []

for subject in tqdm(for_df):
    path = SHEETS_2 / subject['image_file']
    try:
        # Image.open() is lazy and keeps the file open. The context manager
        # closes it as soon as the size is read, instead of leaving one open
        # handle per sheet until garbage collection.
        with Image.open(path) as image:
            width, height = image.size
    except UnidentifiedImageError:
        # Not a decodable image: leave this subject out of the data frame.
        continue
    subject['image_size'] = {'width': width, 'height': height}
    df_rows.append(subject)

# Compare with the number of subjects above to see how many were dropped.
print(len(df_rows))

100%|██████████| 4995/4995 [00:00<00:00, 7882.13it/s]


4994


## Build the data frame and sort its columns

In [8]:
# One row per subject kept in df_rows. Missing values (e.g. columns that a
# given subject's dict does not have) become '' instead of NaN.
df = pd.DataFrame(df_rows).fillna('')

In [9]:
def interleave_columns(frame, *prefixes):
    """Return the columns of frame matching each prefix, interleaved.

    Columns are gathered per prefix in their existing order and then zipped
    together: prefix1[0], prefix2[0], ..., prefix1[1], prefix2[1], ...
    As with zip(), the result stops at the shortest per-prefix list.
    """
    per_prefix = [
        [name for name in frame.columns if name.startswith(prefix)]
        for prefix in prefixes
    ]
    return [name for bundle in zip(*per_prefix) for name in bundle]


# Identifying columns first, then merged boxes, removed boxes, and finally the
# raw boxes, each box column followed by its companion column(s).
columns = """ subject_id  image_file image_size """.split()
columns += interleave_columns(df, 'merged_box_', 'merged_type_')
columns += interleave_columns(df, 'removed_box_', 'removed_type_')
columns += interleave_columns(df, 'box_', 'type_', 'group_')

# Columns not selected above are dropped from the frame.
df = df[columns]

## Write the reconciled data and split it into training and test sets

In [10]:
# Save the full reconciled table first, then an 80/20 train/test split of it.
df.to_csv(RECONCILED, index=False)

train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    shuffle=True,
    random_state=9984,  # fixed seed so the split is the same on every run
)

for frame, destination in ((train_df, TRAIN), (test_df, TEST)):
    frame.to_csv(destination, index=False)