In [1]:
import sys

sys.path.append('..')

In [13]:
import sqlite3
import textwrap
from collections import namedtuple
from datetime import date
from pathlib import Path
from pprint import pp

import pandas as pd
from IPython.display import display
from ipywidgets import interact
from PIL import Image, ImageColor, ImageDraw, ImageFont
from sklearn import metrics
from tqdm import tqdm

from herbarium.pylib import db

In [9]:
DATA = Path('..') / 'data'

DB = DATA / 'angiosperms.sqlite'
IMAGES = DATA / 'images'
TEMP = DATA / 'temp'

In [10]:
TODAY = date.today().isoformat()

TEST_RUNS = """ b0_flowers_all_orders_1 b0_fruits_all_orders_1 """.split()

FIELDS = [
    ('reproductivecondition', 'reproductive condition'),
    ('occurrenceremarks', 'occurrence remarks'),
    ('fieldnotes', 'field notes'),
    ('dynamicproperties', 'dynamic properties'),
]

TRAITS = [
    ('flowering', 'not_flowering'),
    ('fruiting', 'not_fruiting'),
    # ('leaf_out', 'not_leaf_out'),
]

FLOWERING = db.select_test_run(DB, 'b0_flowers_all_orders_1')
FRUITING = db.select_test_run(DB, 'b0_fruits_all_orders_1')

# Confusion matrix per order

In [7]:
traits = [
    ('flowering', FLOWERING),
    ('fruiting', FRUITING),
]

orders = db.select_all_orders(DB)

In [19]:
per_order = []

for trait_name, test_recs in traits:
    for order in orders:
        order_recs = [r for r in test_recs if r['order_'] == order]
        if not order_recs:
            continue
        per_order.append({
            'trait': trait_name,
            'order': order,
            'true_pos': sum(1 for r in order_recs if r['true'] == 1 and round(r['pred']) == 1),
            'true_neg': sum(1 for r in order_recs if r['true'] == 0 and round(r['pred']) == 0),
            'false_pos': sum(1 for r in order_recs if r['true'] == 0 and round(r['pred']) == 1),
            'false_neg': sum(1 for r in order_recs if r['true'] == 1 and round(r['pred']) == 0),
        })

In [20]:
df = pd.DataFrame(per_order)
path = TEMP / f'results_per_order_{TODAY}.csv'
df.to_csv(path, index=False)

In [21]:
df.head()

Unnamed: 0,trait,order,true_pos,true_neg,false_pos,false_neg
0,flowering,acorales,0,1,0,1
1,flowering,alismatales,5,9,0,3
2,flowering,apiales,25,4,1,2
3,flowering,aquifoliales,2,1,0,0
4,flowering,arecales,3,0,1,0


# Flowering results

In [22]:
pad = 8
color = 'black'

Text = namedtuple('Text', 'x y bbox text')

In [23]:
indent = ' ' * 24

def build_text(draw, font, texts, text, x, y):
    for t in textwrap.wrap(text, subsequent_indent=indent):
        bbox = draw.textbbox((0, 0), t, font, anchor='lt')
        texts.append(Text(x, y, bbox, t))
        y += bbox[3] + pad
    return y

In [24]:
true_pos = [r for r in FLOWERING if r['true'] == 1 and r['true'] == round(r['pred'])]
true_neg = [r for r in FLOWERING if r['true'] == 0 and r['true'] == round(r['pred'])]
false_pos = [r for r in FLOWERING if r['true'] == 1 and r['true'] != round(r['pred'])]
false_neg = [r for r in FLOWERING if r['true'] == 0 and r['true'] != round(r['pred'])]

In [25]:
y_true = pd.Series([round(r['true']) for r in FLOWERING])
y_pred = pd.Series([round(r['pred']) for r in FLOWERING])

df_confusion = pd.crosstab(y_true, y_pred, rownames=['Actual'], colnames=['Predicted'])
df_confusion

Predicted,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,284,19
1,110,2186


In [39]:
DATASET = false_neg
CONFUSION = 'false_neg'

DIR = DATA / 'temp' / f'flowering_{TODAY}' / CONFUSION
DIR.mkdir(exist_ok=True, parents=True)

In [35]:
def flowering(idx):
    rec = DATASET[idx - 1]
    image = Image.open(Path('..') / rec['path'])

    font = ImageFont.truetype(str(DATA / 'fonts' / 'SourceCodePro-Regular.ttf'), 64)
    draw = ImageDraw.Draw(image)

    texts = []

    x, y = 10, 10
    w, h = image.size
    
    text = f'{"confusion:":<23} {CONFUSION}'
    y = build_text(draw, font, texts, text, x, y)

    text = f'{"coreid:":<23} {rec["coreid"]}'
    y = build_text(draw, font, texts, text, x, y)

    for field, label in FIELDS:
        text = f'{(label+":"):<23} {rec[field]}'
        y = build_text(draw, font, texts, text, x, y)

    flag = ''
    if rec['flowering']:
        flag += '1'
    elif rec['not_flowering']:
        flag += '0'

    text = f'{"flowering NLP:":<23} {flag}'
    y = build_text(draw, font, texts, text, x, y)

    text = f'{"flowering model:":<23} {round(rec["pred"])} ({rec["pred"]:0.4})'
    y = build_text(draw, font, texts, text, x, y)

    max_x = max(t.bbox[2] for t in texts)
    draw.rectangle((0, 0, max_x + 32, y + 32), fill='#eeeeee')

    for t in texts:
        draw.text((t.x, t.y), t.text, color, font=font)

    path = DIR / f'{rec["coreid"]}.jpg'
    image.save(path, 'JPEG')
    
    # display(image)


# flowering(19)
# interact(flowering, idx=(1, len(dataset)))

In [40]:
for i, _ in tqdm(enumerate(DATASET[:100], 1)):
    flowering(i)

19it [00:15,  1.22it/s]


# Fruiting results

In [41]:
true_pos = [r for r in FRUITING if r['true'] == 1 and r['true'] == round(r['pred'])]
true_neg = [r for r in FRUITING if r['true'] == 0 and r['true'] == round(r['pred'])]
false_pos = [r for r in FRUITING if r['true'] == 1 and r['true'] != round(r['pred'])]
false_neg = [r for r in FRUITING if r['true'] == 0 and r['true'] != round(r['pred'])]

In [42]:
y_true = pd.Series([round(r['true']) for r in FRUITING])
y_pred = pd.Series([round(r['pred']) for r in FRUITING])

df_confusion = pd.crosstab(y_true, y_pred, rownames=['Actual'], colnames=['Predicted'])
df_confusion

Predicted,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,267,54
1,156,797


In [52]:
DATASET = false_neg
CONFUSION = 'false_neg'

DIR = DATA / 'temp' / f'fruiting_{TODAY}' / CONFUSION
DIR.mkdir(exist_ok=True, parents=True)

In [46]:
def fruiting(idx):
    rec = DATASET[idx - 1]
    image = Image.open(Path('..') / rec['path'])

    font = ImageFont.truetype(str(DATA / 'fonts' / 'SourceCodePro-Regular.ttf'), 64)
    draw = ImageDraw.Draw(image)

    texts = []

    x, y = 10, 10
    w, h = image.size
    
    text = f'{"confusion:":<23} {CONFUSION}'
    y = build_text(draw, font, texts, text, x, y)

    text = f'{"coreid:":<23} {rec["coreid"]}'
    y = build_text(draw, font, texts, text, x, y)

    for field, label in FIELDS:
        text = f'{(label+":"):<23} {rec[field]}'
        y = build_text(draw, font, texts, text, x, y)

    flag = ''
    if rec['fruiting']:
        flag += '1'
    elif rec['not_fruiting']:
        flag += '0'

    text = f'{"fruiting NLP:":<23} {flag}'
    y = build_text(draw, font, texts, text, x, y)

    text = f'{"fruiting model:":<23} {round(rec["pred"])} ({rec["pred"]:0.4})'
    y = build_text(draw, font, texts, text, x, y)

    max_x = max(t.bbox[2] for t in texts)
    draw.rectangle((0, 0, max_x + 32, y + 32), fill='#eeeeee')

    for t in texts:
        draw.text((t.x, t.y), t.text, color, font=font)

    path = DIR / f'{rec["coreid"]}.jpg'
    image.save(path, 'JPEG')

    # display(image)


# fruiting(19)
# interact(fruiting, idx=(1, len(dataset)))

In [53]:
for i, _ in tqdm(enumerate(DATASET[:100], 1)):
    fruiting(i)

54it [00:45,  1.19it/s]
