# Explore the iNaturalist API

In [1]:
import sys

# Put the notebook's parent directory on the import path.
# NOTE(review): presumably this is so project modules one level up can be
# imported; no such import appears in the cells shown here — confirm it is
# still needed.
sys.path.append("..")

In [16]:
import json
import time
from pathlib import Path
from types import SimpleNamespace

import polars as pl
import requests
from tqdm import tqdm

In [3]:
# Root of the data directory, relative to the notebook's working directory.
DATA_DIR = Path("..", "data")

In [4]:
# Everything this notebook reads or writes for iNaturalist sits under data/inat.
inat_dir = DATA_DIR / "inat"

# Notebook-wide settings, bundled so cells can refer to them as args.<field>.
args = SimpleNamespace(
    # Local copy of the gzipped iNaturalist taxa table.
    taxa=inat_dir / "taxa.csv.gz",
    # Observations endpoint; the trailing "?" is kept because get_url()
    # appends its query parameters directly to this string.
    base_url="https://api.inaturalist.org/v1/observations?",
    # Taxon ID matched against the ancestry column to select plant taxa.
    plantae=47126,
    # Directory that receives one JSON file per downloaded order.
    obs_dir=inat_dir / "obs",
    # Number of observations requested per API page.
    per_page=100,
)

In [5]:
# !aws s3 cp s3://inaturalist-open-data/taxa.csv.gz $args.taxa --no-sign-request

## Get target taxon IDs

In [6]:
# One dtype per column of the taxa table, in file order.
taxa_dtypes = [pl.Int64, pl.Utf8, pl.Float32, pl.Utf8, pl.Utf8, pl.Boolean]

# The file is tab-separated despite its .csv extension. The table is read
# eagerly and then wrapped as a lazy frame so later cells can build filter
# chains and call .collect() on them.
taxa_df = pl.read_csv(args.taxa, sep="\t", dtypes=taxa_dtypes).lazy()

### Get orders

In [7]:
# Row predicates: the ancestry path passes through Plantae, and the taxon is
# ranked as an order.
in_plantae = pl.col("ancestry").str.contains(f"/{args.plantae}/", literal=True)
is_order = pl.col("rank") == "order"

# Sort by name, then keep only the first column (the taxon IDs) as a Series.
orders = taxa_df.filter(in_plantae).filter(is_order).sort("name").collect().to_series(0)
orders.head()

taxon_id
i64
71270
152660
550940
509462
152661
48073
152141
152709
152710
56329


In [8]:
# Number of orders found under Plantae.
orders.shape

(269,)

### Map order IDs to names

In [9]:
# Same Plantae/order selection as for `orders`, but keeping the ID and name
# columns so the two can be paired up.
order_rows = (
    taxa_df.filter(pl.col("ancestry").str.contains(f"/{args.plantae}/", literal=True))
    .filter(pl.col("rank") == "order")
    .select(["taxon_id", "name"])
    .collect()
)

# Lookup table: order taxon_id -> order name.
name = dict(zip(order_rows["taxon_id"], order_rows["name"]))
len(name)

269

In [10]:
# Peek at the first five ID -> name pairs.
for taxon_id, order_name in list(name.items())[:5]:
    print(f"{taxon_id} {order_name}")

48808 Laurales
47363 Gentianales
47218 Asparagales
47195 Cornales
48232 Ranunculales


### Get species for each order

In [11]:
# The rank predicate does not depend on the order, so build it once.
is_species = pl.col("rank") == "species"

# Map each order ID to the Series of first-column values (taxon IDs) of the
# species whose ancestry path contains "/<order>/".
taxa = {}
for order in orders:
    under_order = pl.col("ancestry").str.contains(f"/{order}/", literal=True)
    taxa[order] = taxa_df.filter(under_order).filter(is_species).collect().to_series(0)

In [12]:
# Species count for the first five orders.
for order, species in list(taxa.items())[:5]:
    print(f"{order}: {len(species)}")

# Total number of species across every order.
sum(len(ids) for ids in taxa.values())

71270: 3
152660: 23
550940: 0
509462: 2
152661: 5


283665

## Make API requests

In [14]:
def get_url(taxon_id, page=1):
    """Build the observations-API URL for one taxon and one result page.

    Args:
        taxon_id: Taxon ID placed in the ``taxon_id`` query parameter.
        page: Result page to request; defaults to the first page.

    Returns:
        ``args.base_url`` followed by the query parameters, all joined with
        ``&``. Because ``args.base_url`` already ends in ``?``, the result
        contains ``?&`` before the first parameter.
    """
    query = (
        "photos=true",
        "quality_grade=research",
        f"taxon_id={taxon_id}",
        # NOTE(review): presumably an iNaturalist annotation term ID — confirm
        # which term 12 refers to.
        "term_id=12",
        f"per_page={args.per_page}",
        f"page={page}",
        # Most recently created observations first.
        "order=desc",
        "order_by=created_at",
        "photo_license=cc0,cc-by,cc-by-nc,cc-by-sa",
    )
    return "&".join((args.base_url, *query))

In [17]:
# Download the first page of matching observations for every order and save
# each non-empty response as <obs_dir>/order_<order name>.json.

# open(..., "w") does not create missing parent directories, so make sure the
# output directory exists before the loop starts.
args.obs_dir.mkdir(parents=True, exist_ok=True)

for order, species in tqdm(taxa.items()):
    # Orders with no species in the taxa table are skipped without an API call.
    if len(species) == 0:
        continue

    url = get_url(order)
    path = args.obs_dir / f"order_{name[order]}.json"

    # A timeout keeps one stalled connection from hanging the whole run.
    result = requests.get(url, timeout=60)

    # Throttle after *every* request. Sleeping only after a successful save
    # let requests that returned no results fire back to back.
    time.sleep(3)

    # Fail with a clear HTTP error instead of an opaque KeyError or JSON
    # decoding error further down.
    result.raise_for_status()

    # Parse the body once and reuse it for both the check and the dump.
    payload = result.json()
    if len(payload["results"]) == 0:
        continue

    with open(path, "w") as out_file:
        json.dump(payload, out_file)

100%|████████████████████████████████████████████████████████████████████████████████████████| 269/269 [06:04<00:00,  1.35s/it]
