In [1]:
import sys

# Put the parent directory on the module search path so that packages living
# one level above this notebook can be imported.
sys.path += [".."]

In [2]:
import csv
import sqlite3
from contextlib import closing
from pathlib import Path

import pandas as pd

In [3]:
# Trait columns read from every row of the splits CSV; the summary is printed
# in this order.
TRAITS = [
    "leaves",
    "fruits",
    "flowers",
]

In [4]:
# Input files, given relative to the notebook's working directory.
DB = Path("..", "data", "backups", "angiosperms.sqlite")  # SQLite database
CSV = Path("..", "splits", "splits.csv")  # CSV compared against the database

In [5]:
# Translates the trait names stored in the database's `targets` table into the
# column names used by the splits CSV (the names listed in TRAITS).
MAP = {"flowering": "flowers", "fruiting": "fruits", "leaf_out": "leaves"}

In [6]:
# Load every row of the `targets` table into a dict keyed by (coreid, trait),
# with the trait renamed through MAP so it lines up with the CSV column names.
# If the same (coreid, trait) occurs more than once, the last row wins.
#
# closing() is required here: sqlite3's own connection context manager only
# commits or rolls back the transaction, it does not close the connection.
with closing(sqlite3.connect(DB)) as cxn:
    cxn.row_factory = sqlite3.Row  # allows access to columns by name
    targets = {
        (row["coreid"], MAP[row["trait"]]): int(row["target"])
        for row in cxn.execute("select * from targets")
    }

In [7]:
# Load every row of the `angiosperms` table as a plain dict, keyed by coreid.
#
# closing() is required here: sqlite3's own connection context manager only
# commits or rolls back the transaction, it does not close the connection.
with closing(sqlite3.connect(DB)) as cxn:
    cxn.row_factory = sqlite3.Row  # allows dict(row) to pick up column names
    idigbio = {
        row["coreid"]: dict(row)
        for row in cxn.execute("select * from angiosperms")
    }

In [8]:
# Per-trait tallies, filled in by the comparison loop further down.
# Built from TRAITS so that the set of traits is declared in one place only;
# every trait gets its own fresh counters and lists.
counts = {
    trait: {
        "match": 0,  # CSV value equals the database target
        "mismatch": 0,  # CSV value differs from the database target
        "missing": 0,  # no database target for this (coreid, trait)
        "other": 0,  # CSV value is not a score (N or U)
        "coreids": [],  # coreid of every mismatching row
        "recs": [],  # copy of every mismatching CSV row
    }
    for trait in TRAITS
}

In [9]:
# Compare each trait value in the splits CSV against the database target for
# the same (coreid, trait) and tally the outcome in `counts`.
#
# newline="" is what the csv module asks for, so that line breaks embedded in
# quoted fields are handed to the parser untouched.
with CSV.open(newline="") as f:
    for row in csv.DictReader(f):
        # The coreid is the file name without its directory and suffix.
        coreid = Path(row["file"]).stem
        for trait in TRAITS:
            tally = counts[trait]
            target = targets.get((coreid, trait))
            value = row[trait]
            if target is None:
                # Checked first: a row without a target is counted as missing
                # regardless of what the CSV says.
                tally["missing"] += 1
            elif value in ("N", "U", ""):
                # Exact membership rather than a substring test against "NU",
                # which would also accept the two-character string "NU".
                # Blank cells are counted here as well so that int() below
                # only ever sees scored values.
                tally["other"] += 1
            elif target == int(value):
                tally["match"] += 1
            else:
                tally["mismatch"] += 1
                tally["coreids"].append(coreid)
                # Keep a copy of the CSV row, tagged with the disagreeing trait.
                tally["recs"].append({**row, "mismatch": trait})

In [10]:
# Print the four tallies for each trait, followed by the fraction of
# comparable rows (match + mismatch) on which the two sources agree.
for trait in TRAITS:
    tally = counts[trait]
    matches = tally["match"]
    mismatch = tally["mismatch"]
    missing = tally["missing"]
    other = tally["other"]
    print(f"{trait}: Erin's notation matches        iDigBio {matches:4d}")
    print(f"{trait}: Erin's notation does not match iDigBio {mismatch:4d}")
    print(f"{trait}: Erin's notation missing from   iDigBio {missing:4d}")
    print(f"{trait}: Erin's notation is U or N              {other:4d}")
    total = matches + mismatch
    # A trait with no comparable rows has total == 0; report "n/a" instead of
    # dividing by zero.
    fraction = f"{matches / total:0.2f}" if total else "n/a"
    print(f"{trait}: match fraction = {fraction}")
    print()

leaves: Erin's notation matches        iDigBio  922
leaves: Erin's notation does not match iDigBio   58
leaves: Erin's notation missing from   iDigBio 5025
leaves: Erin's notation is U or N                51
leaves: match fraction = 0.94

fruits: Erin's notation matches        iDigBio 1566
fruits: Erin's notation does not match iDigBio  351
fruits: Erin's notation missing from   iDigBio 3679
fruits: Erin's notation is U or N               460
fruits: match fraction = 0.82

flowers: Erin's notation matches        iDigBio 2711
flowers: Erin's notation does not match iDigBio  594
flowers: Erin's notation missing from   iDigBio 2159
flowers: Erin's notation is U or N               592
flowers: match fraction = 0.82



In [11]:
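# Disabled: copies the image files for the first 50 "flowers" mismatches into
# ../data/flowers_mismatch.
# NOTE: `shutil` is not imported in the import cell above; add `import shutil`
# before re-enabling this cell.
# for coreid in counts["flowers"]["coreids"][:50]:
#     name = f"{coreid}.jpg"
#     src = Path("../../../images/herbarium_sheets") / name
#     dst = Path("../data/flowers_mismatch") / name
#     shutil.copy(src, dst)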

In [12]:
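# Disabled: copies the image files for the first 50 "fruits" mismatches into
# ../data/fruits_mismatch.
# NOTE: `shutil` is not imported in the import cell above; add `import shutil`
# before re-enabling this cell.
# for coreid in counts["fruits"]["coreids"][:50]:
#     name = f"{coreid}.jpg"
#     src = Path("../../../images/herbarium_sheets") / name
#     dst = Path("../data/fruits_mismatch") / name
#     shutil.copy(src, dst)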

In [13]:
# Sample of mismatches to export: at most the first 50 flower records followed
# by at most the first 50 fruit records. The list is new, but its elements are
# the same dict objects stored in `counts`.
records = [
    *counts["flowers"]["recs"][:50],
    *counts["fruits"]["recs"][:50],
]

In [14]:
# Add the iDigBio columns to each sampled mismatch record, in place. Where a
# key exists on both sides, the iDigBio value replaces the CSV value.
for rec in records:
    # Derive the coreid with Path.stem, exactly as the comparison loop does.
    # Splitting on the first "." gives a different answer when the file value
    # contains a directory or the name itself contains a dot.
    coreid = Path(rec["file"]).stem
    rec |= idigbio[coreid]

In [15]:
# Turn the merged mismatch records into a table and write it out as CSV,
# leaving the DataFrame index out of the file.
df = pd.DataFrame(data=records)
df.to_csv(path_or_buf="../data/mismatch.csv", index=False)