In [None]:
import sys

sys.path.append('..')

In [None]:
import sqlite3
import textwrap
from collections import namedtuple
from datetime import date
from pathlib import Path
from pprint import pp

import pandas as pd
from IPython.display import display
from ipywidgets import interact
from PIL import Image, ImageColor, ImageDraw, ImageFont
from sklearn import metrics
from tqdm import tqdm

from herbarium.pylib import db

In [None]:
# Root of the project's data directory, relative to this notebook
DATA = Path('..', 'data')

# Everything else lives directly under DATA
DB = DATA.joinpath('angiosperms.sqlite')
IMAGES = DATA.joinpath('images')
TEMP = DATA.joinpath('temp')

In [None]:
# ISO-formatted date stamp used to tag output file and directory names
TODAY = str(date.today())

# (record key, display label) pairs for the text fields printed on each image
FIELDS = [
    ('reproductivecondition', 'reproductive condition'),
    ('occurrenceremarks', 'occurrence remarks'),
    ('fieldnotes', 'field notes'),
    ('dynamicproperties', 'dynamic properties'),
]

# One set of test records per trait, fetched in flowering/fruiting/leaf-out
# order. NOTE(review): the second argument presumably names the test run to
# load -- confirm against herbarium.pylib.db.select_tests.
FLOWERING, FRUITING, LEAF_OUT = (
    db.select_tests(DB, run_name)
    for run_name in (
        'b0_flowers_all_orders_1',
        'b0_fruits_all_orders_1',
        'b0_leaf_out_all_orders_1',
    )
)

TEST_SETS = (FLOWERING, FRUITING, LEAF_OUT)

ORDERS = db.select_all_orders(DB)

# (trait name, test records) pairs iterated by the cells below
TRAITS = list(zip(('flowering', 'fruiting', 'leaf_out'), TEST_SETS))

## Confusion matrix per order

In [None]:
per_order = []

# (output column, required target, required rounded prediction) for each
# cell of the confusion matrix, in the column order written to the CSV.
outcomes = (
    ('true_pos', 1, 1),
    ('true_neg', 0, 0),
    ('false_pos', 0, 1),
    ('false_neg', 1, 0),
)

for trait_name, test_set in TRAITS:
    for order in ORDERS:
        order_recs = [r for r in test_set if r['order_'] == order]

        # Orders with no test records for this trait get no row at all
        if order_recs:
            row = {'trait': trait_name, 'order': order}
            for column, want_target, want_pred in outcomes:
                row[column] = len([
                    r for r in order_recs
                    if r['target'] == want_target
                    and round(r['pred']) == want_pred
                ])
            per_order.append(row)

In [None]:
# One row per (trait, order) with its four confusion-matrix counts
df = pd.DataFrame(per_order)

# to_csv does not create missing directories, so make sure TEMP exists
# before writing (a fresh checkout may not have it yet).
TEMP.mkdir(parents=True, exist_ok=True)

path = TEMP / f'results_per_order_{TODAY}.csv'
df.to_csv(path, index=False)

In [None]:
# Preview the first rows of the per-order results table
df.head()

## Per-trait confusion matrices

In [None]:
for trait, test_set in TRAITS:
    # Round both columns to integer class labels so the crosstab gets one
    # row/column per class rather than one per distinct score.
    targets = pd.Series([round(r['target']) for r in test_set])
    preds = pd.Series([round(r['pred']) for r in test_set])

    df_confusion = pd.crosstab(
        targets, preds, rownames=['Actual'], colnames=['Predicted'])

    print('=' * 80)
    # TRAITS holds (name, records) pairs, so `trait` is the name string
    # itself and is printed whole.
    print(trait)
    print(df_confusion)
    print()

## Display output results

In [None]:
# Vertical gap, in pixels, added below each drawn line of text
PAD = 8

# Text colour and the fill colour of the panel drawn behind the text
BLACK = 'black'
GRAY = '#eeeeee'

# One queued line of text: where to draw it, its measured box, and the string
Text = namedtuple('Text', ['x', 'y', 'bbox', 'text'])

In [None]:
# Hanging indent applied to continuation lines of wrapped text
indent = ' ' * 24


def build_text(draw, font, texts, text, x, y):
    """Wrap ``text`` and queue each resulting line for later drawing.

    Every wrapped line is measured with ``draw.textbbox`` and appended to
    ``texts`` as a ``Text`` positioned at (x, y). After each line, ``y``
    advances by the bottom coordinate of the measured box plus ``PAD``.

    Args:
        draw: Drawing object providing ``textbbox``.
        font: Font passed to ``textbbox`` for measuring.
        texts: List that receives the queued ``Text`` entries (mutated).
        text: The string to wrap; continuation lines get ``indent``.
        x: Left coordinate recorded for every queued line.
        y: Top coordinate of the first queued line.

    Returns:
        The y coordinate following the last queued line.
    """
    wrapped = textwrap.wrap(text, subsequent_indent=indent)
    for line in wrapped:
        box = draw.textbbox((0, 0), line, font, anchor='lt')
        texts.append(Text(x=x, y=y, bbox=box, text=line))
        y = y + box[3] + PAD
    return y

In [None]:
def display_image(idx, trait, dataset, confusion, dir_):
    """Annotate one record's image with its labels and save it as a JPEG.

    Despite the name, nothing is shown inline: the annotated copy is written
    to ``dir_ / '<coreid>.jpg'``.

    Args:
        idx: 1-based position of the record within ``dataset``.
        trait: Trait name used to label the NLP and model lines.
        dataset: Sequence of records for one confusion category. Each record
            must support key lookup for 'path', 'coreid', 'target', 'pred'
            and every key listed in FIELDS.
        confusion: Confusion category label printed on the image.
        dir_: Directory (a Path) that receives the annotated image; it is
            not created here.
    """
    # Take the record from the dataset that was passed in rather than from a
    # notebook-level global, so the image matches the confusion category.
    rec = dataset[idx - 1]

    font = ImageFont.truetype(
        str(DATA / 'fonts' / 'SourceCodePro-Regular.ttf'), 64)

    # The context manager closes the underlying image file once we are done.
    with Image.open(Path('..') / rec['path']) as image:
        draw = ImageDraw.Draw(image)

        # Lines are measured and queued first so the background panel can be
        # sized to fit them before any text is drawn.
        texts = []

        x, y = 10, 10

        text = f'{"confusion:":<23} {confusion}'
        y = build_text(draw, font, texts, text, x, y)

        text = f'{"coreid:":<23} {rec["coreid"]}'
        y = build_text(draw, font, texts, text, x, y)

        for field, label in FIELDS:
            text = f'{(label+":"):<23} {rec[field]}'
            y = build_text(draw, font, texts, text, x, y)

        flag = '1' if rec['target'] == 1 else '0'

        text = f'{trait + " NLP:":<23} {flag}'
        y = build_text(draw, font, texts, text, x, y)

        text = f'{trait + " model:":<23} {round(rec["pred"])} ({rec["pred"]:0.4})'
        y = build_text(draw, font, texts, text, x, y)

        # Panel spans the widest queued line and the full text height
        max_x = max(t.bbox[2] for t in texts)
        draw.rectangle((0, 0, max_x + 32, y + 32), fill=GRAY)

        for t in texts:
            draw.text((t.x, t.y), t.text, BLACK, font=font)

        path = dir_ / f'{rec["coreid"]}.jpg'
        image.save(path, 'JPEG')

In [None]:
# Maximum number of annotated images written per confusion category
COUNT = 100

for trait, test_set in TRAITS:
    # Split this trait's test records into the four confusion-matrix cells
    # by comparing the target with the rounded prediction.
    tp = [r for r in test_set if r['target'] == 1 and round(r['pred']) == 1]
    tn = [r for r in test_set if r['target'] == 0 and round(r['pred']) == 0]
    fp = [r for r in test_set if r['target'] == 0 and round(r['pred']) == 1]
    fn = [r for r in test_set if r['target'] == 1 and round(r['pred']) == 0]

    datasets = [('true_pos', tp), ('true_neg', tn),
                ('false_pos', fp), ('false_neg', fn)]

    for confusion, dataset in datasets:
        dir_ = TEMP / f'{trait}_{TODAY}' / confusion
        dir_.mkdir(exist_ok=True, parents=True)

        # Wrap the sized list (not the enumerate iterator) so tqdm knows the
        # total and can show a real progress bar.
        subset = tqdm(dataset[:COUNT], desc=f'{trait} {confusion}')
        for i, _ in enumerate(subset, 1):
            display_image(i, trait, dataset, confusion, dir_)