# Build database

In [1]:
import json
import re
import sqlite3
from collections import defaultdict
from pathlib import Path
from types import SimpleNamespace

import pandas as pd
from tqdm.notebook import tqdm

In [2]:
# Root of the data tree, given relative to the notebook's working directory.
DATA_DIR = Path("..", "data")

In [3]:
# Notebook "arguments": the observation JSON directory, the image directory,
# and the SQLite file this notebook writes, all under DATA_DIR/inat.
args = SimpleNamespace(
    obs_dir=DATA_DIR.joinpath("inat", "obs"),
    image_dir=DATA_DIR.joinpath("inat", "images"),
    db=DATA_DIR.joinpath("inat", "inat.sqlite"),
)

## Functions for parsing iNaturalist observation JSON records

#### Get the record ID from the JSON record

In [4]:
def get_ids(rec):
    """Return the record's ``id`` field as a one-entry dict keyed ``obs_id``."""
    return dict(obs_id=rec["id"])

#### Get phenology data from the JSON record

In [5]:
# Controlled-value IDs that this notebook treats as phenology annotations,
# mapped to the label stored in the database.
phenology = {
    13: "Flowering",
    14: "Fruiting",
    15: "Flower Budding",
    21: "No Evidence of Flowering",
}


def get_annotations(rec):
    """Summarize a record's annotations as two strings.

    Returns a dict with:
      "annotations": every ``controlled_value_id`` in ``rec["annotations"]``,
          comma-joined in record order.
      "phenology": the labels of those IDs that appear in ``phenology``,
          joined with ", ". Empty when no ID is a known phenology value.
    """
    value_ids = [anno["controlled_value_id"] for anno in rec["annotations"]]
    # IDs missing from the lookup table yield None and are dropped.
    labels = filter(None, map(phenology.get, value_ids))
    return {
        "annotations": ",".join(map(str, value_ids)),
        "phenology": ", ".join(labels),
    }

#### Get taxon data from the JSON record

In [6]:
def get_taxon(rec):
    """Copy the id, name and ancestry out of the record's ``taxon`` entry.

    Returns a dict with keys "taxon_id", "taxon" and "ancestry".
    """
    taxon = rec["taxon"]
    # Output column name -> key inside the record's taxon entry.
    wanted = {"taxon_id": "id", "taxon": "name", "ancestry": "ancestry"}
    return {column: taxon[key] for column, key in wanted.items()}

#### Get image data from the JSON record

In [7]:
def get_images(rec, obs_id):
    """Build one image row for each photo of the record that exists on disk.

    Each photo URL has "square." replaced by the module-level ``size``. The
    numeric path segment and the file extension are then taken from the end
    of that URL to form the expected file name under ``args.image_dir``.
    Photos whose URL does not end in that pattern, or whose file is not
    present locally, are left out.

    Returns a list of dicts with keys "photo_id", "obs_id", "license",
    "path" and "url".
    """
    rows = []
    for photo in rec["photos"]:
        sized_url = photo["url"].replace("square.", size)

        # Expect the URL to end in ".../<digits>/<name>.<extension>".
        parts = re.search(r"/(\d+)/[a-z]+\.([a-z]+)$", sized_url, flags=re.I)
        if parts is None:
            continue

        number, extension = parts.groups()
        # `size` carries its own trailing dot, so none is added before the extension.
        local_file = args.image_dir / f"{number}_{size}{extension}"

        if local_file.exists():
            rows.append(
                {
                    "photo_id": photo["id"],
                    "obs_id": obs_id,
                    "license": photo["license_code"],
                    "path": str(local_file),
                    "url": sized_url,
                }
            )

    return rows

#### Build database records for the observations and images.

In [8]:
# Image size to look for. The trailing dot is part of the value because
# get_images() substitutes it for "square." in photo URLs and file names.
size = "medium."  # medium large original

json_paths = sorted(args.obs_dir.glob("*.json"))

# NOTE(review): nothing visible in this notebook reads or updates this counter;
# it is kept in case another cell relies on it.
annotations = defaultdict(int)

observations = []
images = []

for json_path in tqdm(json_paths):
    # Read as UTF-8 explicitly: open() otherwise falls back to the platform
    # locale encoding, which can fail on or garble non-ASCII text.
    with open(json_path, encoding="utf-8") as in_file:
        data = json.load(in_file)

    for rec in data:
        obs_rec = get_ids(rec)
        obs_rec |= get_taxon(rec)
        obs_rec |= get_annotations(rec)

        # Keep only observations carrying at least one known phenology label.
        if not obs_rec["phenology"]:
            continue

        # Keep only observations with at least one image present on disk.
        imgs = get_images(rec, obs_rec["obs_id"])
        if not imgs:
            continue

        images.extend(imgs)
        observations.append(obs_rec)

  0%|          | 0/64 [00:00<?, ?it/s]

#### Build the data frames and write them to the database.

In [10]:
# One table per record type, built from the accumulated row dictionaries.
obs_df, img_df = (pd.DataFrame(rows) for rows in (observations, images))

In [11]:
# A sqlite3 connection used as a context manager only commits or rolls back
# the transaction; it does not close the connection. Close it explicitly so
# the database file handle is released once both tables are written.
# NOTE(review): to_sql() is called with its default `if_exists`, so re-running
# this cell against an existing database is expected to raise — confirm that
# is the intended behavior.
cxn = sqlite3.connect(args.db)
try:
    with cxn:
        obs_df.to_sql("obs", cxn, index=False)
        img_df.to_sql("images", cxn, index=False)
finally:
    cxn.close()