# Build database

In [1]:
import json
import re
import sqlite3
from collections import defaultdict
from pathlib import Path
from types import SimpleNamespace

import pandas as pd
from tqdm.notebook import tqdm

In [2]:
# Root of the data tree, relative to the directory the notebook runs from.
DATA_DIR = Path("..", "data")

In [3]:
# Notebook "arguments": every input and output lives under DATA_DIR / "inat".
inat_dir = DATA_DIR / "inat"
args = SimpleNamespace(
    obs_dir=inat_dir / "obs",  # globbed for *.json observation files
    image_dir=inat_dir / "images",  # checked for already-present image files
    db=inat_dir / "inat.sqlite",  # SQLite file the tables are written to
)

## Functions for parsing iNaturalist observation JSON records

#### Get record-level data from the JSON record

In [4]:
def get_ids(rec, path):
    """Return the identifying fields for one observation record.

    Args:
        rec: Parsed observation record; only its "id" key is read.
        path: Path of the JSON file the record came from. The order name
            is the file stem with every "order_" occurrence removed.

    Returns:
        A dict with "obs_id", an empty "split", and "order".
    """
    ids = {"obs_id": rec["id"], "split": ""}
    ids["order"] = path.stem.replace("order_", "")
    return ids

#### Get phenology data from the JSON record

In [5]:
# Controlled-value IDs that carry a phenology label. Any other ID is still
# kept in the raw "annotations" string but contributes no label.
phenology = {
    13: "Flowering",
    14: "Fruiting",
    # 15: "Flower Budding",
    21: "No Evidence of Flowering",
}


def get_annotations(rec):
    """Summarize the annotations of one observation record.

    Args:
        rec: Parsed observation record; its "annotations" list is read and
            each entry must have a "controlled_value_id".

    Returns:
        A dict with "annotations" (all value IDs joined by ",") and
        "phenology" (the labels of the IDs found in `phenology`, joined
        by ", "). Both are empty strings when there is nothing to report.
    """
    value_ids = [anno["controlled_value_id"] for anno in rec["annotations"]]
    labels = [label for vid in value_ids if (label := phenology.get(vid))]
    return {
        "annotations": ",".join(map(str, value_ids)),
        "phenology": ", ".join(labels),
    }

#### Get taxon data from the JSON record

In [6]:
def get_taxon(rec):
    """Return the taxon fields of one observation record.

    Args:
        rec: Parsed observation record; its "taxon" mapping must provide
            "id", "name" and "ancestry".

    Returns:
        A dict with "taxon_id", "taxon" and "ancestry".
    """
    taxon = rec["taxon"]
    # (output column, key inside the record's "taxon" mapping)
    columns = (("taxon_id", "id"), ("taxon", "name"), ("ancestry", "ancestry"))
    return {column: taxon[key] for column, key in columns}

#### Get image data from the JSON record

In [7]:
def get_images(rec, obs_id):
    """Collect image rows for the photos of one observation record.

    Reads the notebook-level `size` string and `args.image_dir`. A photo is
    kept only if its rewritten URL matches the expected
    ".../<digits>/<name>.<ext>" tail and the corresponding file already
    exists under `args.image_dir`.

    Args:
        rec: Parsed observation record; its "photos" list is read.
        obs_id: Observation ID stored on every returned row.

    Returns:
        A list of dicts with "photo_id", "obs_id", "license", "path" and
        "url"; empty when no photo qualifies.
    """
    found = []
    for photo in rec["photos"]:
        # Point the thumbnail URL at the requested image size.
        url = photo["url"].replace("square.", size)
        hit = re.search(r"/(\d+)/[a-z]+\.([a-z]+)$", url, flags=re.I)
        if hit is None:
            continue

        number, ext = hit.groups()
        image_path = args.image_dir / f"{number}_{size}{ext}"

        # Only photos whose file is already on disk are recorded.
        if image_path.exists():
            found.append(
                {
                    "photo_id": photo["id"],
                    "obs_id": obs_id,
                    "license": photo["license_code"],
                    "path": str(image_path),
                    "url": url,
                }
            )

    return found

#### Build database records for the observations and images.

In [8]:
# Image size token spliced into photo URLs and file names by get_images();
# the trailing dot is part of the token. Other values: "large.", "original."
size = "medium."  # medium large original

json_paths = sorted(args.obs_dir.glob("*.json"))

# NOTE(review): not used anywhere in this cell; kept in case a later cell
# reads it.
annotations = defaultdict(int)

observations = []
images = []

for json_path in tqdm(json_paths):
    # JSON text is UTF-8; be explicit so the platform's default locale
    # encoding cannot mis-decode non-ASCII text or make the load fail.
    with open(json_path, encoding="utf-8") as in_file:
        data = json.load(in_file)

    for rec in data:
        obs_rec = get_ids(rec, json_path)
        obs_rec |= get_taxon(rec)
        obs_rec |= get_annotations(rec)

        # Skip observations with no recognized phenology label.
        if not obs_rec["phenology"]:
            continue

        # Skip observations with no usable image on disk.
        imgs = get_images(rec, obs_rec["obs_id"])
        if not imgs:
            continue

        images += imgs
        observations.append(obs_rec)

  0%|          | 0/64 [00:00<?, ?it/s]

#### Build the data frames and write them to the database.

In [9]:
# One frame per table: observation rows and image rows, in that order.
obs_df, img_df = (pd.DataFrame(rows) for rows in (observations, images))

In [10]:
# Replace the "obs" and "images" tables with the freshly built frames.
# A sqlite3 connection used as a context manager only commits (or rolls back)
# the transaction; it does not close the connection, so close it explicitly.
cxn = sqlite3.connect(args.db)
try:
    with cxn:
        obs_df.to_sql("obs", cxn, index=False, if_exists="replace")
        img_df.to_sql("images", cxn, index=False, if_exists="replace")
finally:
    cxn.close()

## Count records for each order and each phenology category

In [20]:
# Per-order tallies; every order starts with all five counters at zero.
counts = defaultdict(
    lambda: dict.fromkeys(
        ("both", "flowering", "fruiting", "neither", "total"), 0
    )
)
# Running total over all orders.
grand = 0

In [21]:
# Tally every kept observation under its order.
for obs in observations:
    count = counts[obs["order"]]
    count["total"] += 1
    grand += 1

    # "annotations" is a comma-joined string of controlled-value IDs. Split it
    # into exact tokens so that, e.g., an ID of 113 is not mistaken for 13 the
    # way a plain substring test would.
    value_ids = set(obs["annotations"].split(","))
    flowering = "13" in value_ids
    fruiting = "14" in value_ids

    if flowering and fruiting:
        count["both"] += 1

    if flowering:
        count["flowering"] += 1

    if fruiting:
        count["fruiting"] += 1

    # 21 is "No Evidence of Flowering".
    if "21" in value_ids:
        count["neither"] += 1

In [22]:
# Print one summary line per order, alphabetically, then show the grand total.
keys = sorted(counts)
for taxon in keys:
    count = counts[taxon]
    parts = (
        f"{taxon:20} flowering ={count['flowering']: 4}",
        f"fruiting ={count['fruiting']: 4}",
        f"neither ={count['neither']: 4}",
        f"both = {count['both']: 4}",
        f"total = {count['total']: 4}",
    )
    print(", ".join(parts))
grand

Acorales             flowering =  28, fruiting =  29, neither =   9, both =    1, total =   65
Alismatales          flowering = 340, fruiting = 269, neither = 397, both =   38, total =  968
Amborellales         flowering =   0, fruiting =   6, neither =   0, both =    0, total =    6
Apiales              flowering = 675, fruiting = 225, neither = 142, both =   88, total =  954
Aquifoliales         flowering = 117, fruiting = 719, neither = 143, both =    2, total =  977
Arecales             flowering = 126, fruiting = 375, neither = 500, both =   27, total =  974
Asparagales          flowering = 476, fruiting =  88, neither = 427, both =   17, total =  974
Asterales            flowering = 786, fruiting = 128, neither = 102, both =   46, total =  970
Austrobaileyales     flowering =  96, fruiting =  39, neither =   7, both =    4, total =  138
Berberidopsidales    flowering =   9, fruiting =   0, neither =   6, both =    0, total =   15
Boraginales          flowering = 825, fruiting = 1

44474