# Examine output from the "_Rise of Machines_" expedition

## Setup

In [1]:
from collections import Counter, defaultdict
from types import SimpleNamespace

from IPython.display import display
from ipywidgets import interact
from PIL import ImageDraw

from digi_leap.pylib import consts
from digi_leap.pylib.db import db
from digi_leap.pylib.label_finder.rise_of_machines import build_expedition as build
from digi_leap.pylib.label_finder.rise_of_machines import reconcile_expedition as recon

In [2]:
SERNEC = consts.DATA_DIR / "sernec"
UNRECONCILED = (
    SERNEC
    / "rise_of_machines"
    / "label-babel-3-rise-of-the-machines-classifications.unreconciled.csv"
)

ARGS = SimpleNamespace(
    database=SERNEC / "sernec.sqlite",
    label_set="tf_efficientnetv2_s_2022-03-04",
    label_conf=0.25,
    threshold=0.4,
    increase_by=4,
)

## Process expedition data

In [3]:
classifications = recon.read_classifications(UNRECONCILED)
POINTS = recon.classifications_to_points(classifications, ARGS.increase_by)

In [4]:
with db.connect(ARGS.database) as cxn:
    SHEETS = recon.select_sheet_labels(cxn, ARGS.label_set)

In [5]:
WHIFFS = recon.points_to_annotations(SHEETS, POINTS)

## Show per label stats

In [6]:
def check_label_results(sheets, whiff):
    correct, incorrect, tied = 0, 0, 0
    per_label = defaultdict(int)
    label_count = 0
    for labels in sheets.values():
        for lb in labels:
            if lb["annotations"]:
                label_count += 1
                per_label[len(lb["annotations"])] += 1
                counts = Counter(lb["annotations"])
                counts = counts.most_common()
                if len(counts) > 1 and counts[0][1] == counts[1][1]:
                    tied += 1
                elif counts[0][0] == "correct":
                    correct += 1
                else:
                    incorrect += 1
    per_label = sorted(per_label.items())
    too_many = sum(c for n, c in per_label if n > 3)
    print(f"{label_count=}  {correct=}  {incorrect=}  {tied=}  {whiff=}  {too_many=}")
    for times, counts in per_label:
        print(f"{counts:5d} label(s) clicked {times:2d} times")


check_label_results(SHEETS, WHIFFS)

label_count=12700  correct=6812  incorrect=5697  tied=191  whiff=204  too_many=87
  650 label(s) clicked  1 times
 1852 label(s) clicked  2 times
10111 label(s) clicked  3 times
   71 label(s) clicked  4 times
    8 label(s) clicked  5 times
    5 label(s) clicked  6 times
    1 label(s) clicked  9 times
    1 label(s) clicked 15 times
    1 label(s) clicked 16 times


## Show per sheet stats

In [7]:
def check_point_results(points):
    per_sheet = defaultdict(int)
    for counts in points.values():
        total = len(counts["correct"]) + len(counts["incorrect"])
        per_sheet[total] += 1
    per_sheet = sorted(per_sheet.items())
    sheet_accum = sum(c for _, c in per_sheet)
    print(f"Number of sheets: {sheet_accum}")
    print("Number of times sheets were clicked")
    for times, counts in per_sheet:
        print(f"{counts:4d} sheet(s) clicked {times:2d} times")


check_point_results(POINTS)

Number of sheets: 3000
Number of times sheets were clicked
   1 sheet(s) clicked  0 times
   4 sheet(s) clicked  3 times
  14 sheet(s) clicked  4 times
  13 sheet(s) clicked  5 times
 193 sheet(s) clicked  6 times
  49 sheet(s) clicked  7 times
  89 sheet(s) clicked  8 times
 791 sheet(s) clicked  9 times
 100 sheet(s) clicked 10 times
 119 sheet(s) clicked 11 times
 666 sheet(s) clicked 12 times
  79 sheet(s) clicked 13 times
  97 sheet(s) clicked 14 times
 428 sheet(s) clicked 15 times
  49 sheet(s) clicked 16 times
  62 sheet(s) clicked 17 times
 139 sheet(s) clicked 18 times
  11 sheet(s) clicked 19 times
  21 sheet(s) clicked 20 times
  40 sheet(s) clicked 21 times
   3 sheet(s) clicked 22 times
   5 sheet(s) clicked 23 times
  18 sheet(s) clicked 24 times
   1 sheet(s) clicked 25 times
   2 sheet(s) clicked 26 times
   2 sheet(s) clicked 27 times
   1 sheet(s) clicked 29 times
   1 sheet(s) clicked 30 times
   1 sheet(s) clicked 34 times
   1 sheet(s) clicked 68 times


## Show sheet clicks

In [14]:
KEYS = list(POINTS.keys())


def show_clicks(idx):
    key = KEYS[idx]
    print(key)
    pad = 48
    sheet = SHEETS[key][0]
    points = POINTS[key]
    path = "../" + sheet["path"]
    with db.connect(ARGS.database) as cxn:
        labels = build.select_labels(cxn, sheet, ARGS.label_set, ARGS.label_conf)
        labels = build.filter_labels(labels, threshold=ARGS.threshold)
    image = build.create_sheet_image(path, labels)
    draw = ImageDraw.Draw(image)
    for point in points["correct"]:
        x, y = point["x"], point["y"]
        draw.ellipse(
            (x-pad, y-pad, x+pad, y+pad), fill="limegreen", outline="white", width=8
        )
    for point in points["incorrect"]:
        x, y = point["x"], point["y"]
        draw.ellipse(
            (x-pad, y-pad, x+pad, y+pad), fill="deeppink", outline="white", width=8
        )
    for box in points["missing"]:
        rec = [box["left"], box["top"], box["right"], box["bottom"]]
        draw.rectangle(rec, outline="cyan", width=pad)
    image = image.reduce(5)
    display(image)


interact(show_clicks, idx=(0, len(KEYS) - 2))
#       happy: 1380, 1503
# many clicks: 213, 351
#  good boxes: 529
#   bad boxes: 527 2999
# boxes overlap clicks: 2538

interactive(children=(IntSlider(value=1499, description='idx', max=2998), Output()), _dom_classes=('widget-intâ€¦

<function __main__.show_clicks(idx)>

## Find sheets with many clicks

In [None]:
# [(i, c) for i, p in enumerate(POINTS.items())
#  if (c := len(p[1]["correct"]) + len(p[1]["incorrect"])) > 24]