# Gather iNaturalist data

In [1]:
import sys

sys.path.append("..")

In [26]:
import json
import math
import re
import time
from pathlib import Path
from types import SimpleNamespace

import polars as pl
import requests
from tqdm.notebook import tqdm

In [3]:
# Root of the data tree, one level above the notebook's working directory.
DATA_DIR = Path("..", "data")

In [4]:
# Notebook-wide settings, bundled in one namespace so cells can read them as
# attributes (args.taxa, args.per_page, ...).
args = SimpleNamespace(
    # Observations endpoint; the trailing "?" lets query parameters be appended.
    base_url="https://api.inaturalist.org/v1/observations?",
    # Gzipped taxa table read when building the taxon lists.
    taxa=DATA_DIR.joinpath("inat", "taxa.csv.gz"),
    # Directory receiving one JSON file of observations per order.
    obs_dir=DATA_DIR.joinpath("inat", "obs"),
    # Directory receiving the downloaded photos.
    images=DATA_DIR.joinpath("inat", "images"),
    # Taxon ID matched inside the "ancestry" column to select plant orders.
    plantae=47126,
    # Number of observations requested per API page.
    per_page=100,
)

In [27]:
# !aws s3 cp s3://inaturalist-open-data/taxa.csv.gz $args.taxa --no-sign-request

## Get target taxon IDs

In [6]:
# Read the tab-separated taxa table eagerly with explicit column dtypes, then
# switch to a lazy frame so the filters in later cells are only evaluated on
# .collect().
taxa_df = pl.read_csv(
    args.taxa,
    sep="\t",
    dtypes=[pl.Int64, pl.Utf8, pl.Float32, pl.Utf8, pl.Utf8, pl.Boolean],
)
taxa_df = taxa_df.lazy()

### Get orders

In [7]:
# IDs of every order-rank taxon whose ancestry passes through Plantae, sorted
# by name. Column 0 of the table is the taxon ID.
orders = (
    taxa_df.filter(
        pl.col("ancestry").str.contains(f"/{args.plantae}/", literal=True)
        & (pl.col("rank") == "order")
    )
    .sort("name")
    .collect()
    .to_series(0)
)
orders.head()

taxon_id
i64
71270
152660
550940
509462
152661
48073
152141
152709
152710
56329


In [8]:
orders.shape

(269,)

### Map order IDs to names

In [9]:
# Lookup table: order taxon ID -> order name, built from the same
# Plantae/order selection used for the ID list.
name = (
    taxa_df.filter(
        pl.col("ancestry").str.contains(f"/{args.plantae}/", literal=True)
        & (pl.col("rank") == "order")
    )
    .select(["taxon_id", "name"])
    .collect()
    .to_dict()
)

# Pair the two columns element-wise into a plain dict.
name = dict(zip(name["taxon_id"], name["name"]))
len(name)

269

In [10]:
# Show the first ten ID/name pairs as a sanity check.
for taxon_id, order_name in list(name.items())[:10]:
    print(f"{taxon_id} {order_name}")

48808 Laurales
47363 Gentianales
47218 Asparagales
47195 Cornales
48232 Ranunculales
47123 Fabales
47754 Polypodiales
47162 Poales
48700 Apiales
47729 Sapindales


### Get species for each order

In [11]:
# Map each order ID to the series of species IDs beneath it: a species belongs
# to an order when "/<order id>/" occurs in its ancestry path.
taxa = {
    order: (
        taxa_df.filter(
            pl.col("ancestry").str.contains(f"/{order}/", literal=True)
        )
        .filter(pl.col("rank") == "species")
        .collect()
        .to_series(0)
    )
    for order in orders
}

In [12]:
# Species count for the first ten orders ...
for order, species in list(taxa.items())[:10]:
    print(f"{order}: {len(species)}")

# ... and the total number of species across all orders.
sum(map(len, taxa.values()))

71270: 3
152660: 23
550940: 0
509462: 2
152661: 5
48073: 3243
152141: 1
152709: 32
152710: 1
56329: 39


283665

## Make API requests

In [13]:
def get_url(taxon_id, per_page, page=1):
    """Build the observations query URL for one taxon and result page.

    Args:
        taxon_id: Taxon ID placed in the ``taxon_id`` query parameter.
        per_page: Number of observations requested per page.
        page: 1-based page number (defaults to the first page).

    Returns:
        ``args.base_url`` followed by the query parameters, all joined
        with ``&``.
    """
    query = (
        "photos=true",
        "quality_grade=research",
        f"taxon_id={taxon_id}",
        # NOTE(review): presumably the plant-phenology annotation term --
        # confirm against iNaturalist's controlled terms.
        "term_id=12",
        f"per_page={per_page}",
        f"page={page}",
        # Newest observations first.
        "order=desc",
        "order_by=created_at",
        "photo_license=cc0,cc-by,cc-by-nc,cc-by-sa",
    )
    return "&".join((args.base_url, *query))

In [None]:
def call_api(url, results):
    """Fetch one API page and append its records to ``results``.

    Args:
        url: Fully built request URL.
        results: List extended in place with the page's ``"results"`` entries.

    Returns:
        The decoded JSON payload of the response.

    Raises:
        KeyError: If the payload has no ``"results"`` key. The payload is
            printed first so the API's error message is visible.
        requests.exceptions.RequestException: On network failure, including
            the request exceeding the timeout.
    """
    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get(url, timeout=60)
    # Decode the body once and reuse it for the append, the error report and
    # the return value.
    payload = response.json()
    try:
        page_results = payload["results"]
    except KeyError:
        print(payload)
        raise
    results.extend(page_results)
    return payload

In [21]:
# Download up to `max_pages` pages of observations for every order that has at
# least one species, writing one JSON file per order.
max_pages = 10  # cap on pages fetched per order, first page included
request_delay = 1.0  # seconds between API calls, to stay within rate guidance

# Make sure the output directory exists before the first file is written.
args.obs_dir.mkdir(parents=True, exist_ok=True)

for order, species in tqdm(taxa.items(), position=0, leave=None):
    if len(species) == 0:
        continue

    results = []
    # The first page also reports how many observations exist in total.
    response = call_api(get_url(order, args.per_page), results)
    time.sleep(request_delay)
    if not results:
        continue

    # Exclusive upper bound for range(): one past the last page, then capped.
    last = math.ceil(response["total_results"] / args.per_page) + 1
    last = min(last, max_pages + 1)
    for page in tqdm(range(2, last), position=1, leave=None):
        call_api(get_url(order, args.per_page, page=page), results)
        time.sleep(request_delay)

    path = args.obs_dir / f"order_{name[order]}.json"
    with open(path, "w") as out_file:
        json.dump(results, out_file)

  0%|          | 0/269 [00:00<?, ?it/s]

0it [00:00, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

0it [00:00, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

0it [00:00, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

0it [00:00, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

0it [00:00, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

## Download images

In [28]:
def download_image(url, path):
    """Download ``url`` to ``path`` unless the file already exists.

    Best effort: up to three attempts are made. Transient failures (timeouts,
    connection errors, HTTP 429 and 5xx) are retried after a 20-second pause.
    Any other unsuccessful HTTP status abandons the download without writing
    a file, so error pages are never saved as images. Nothing is raised when
    all attempts fail.

    Args:
        url: Address of the image.
        path: ``pathlib.Path`` of the destination file.

    Returns:
        None.
    """
    if path.exists():
        return

    # requests raises its own Timeout/ConnectionError classes; the builtin
    # TimeoutError is kept so anything the old handler caught is still caught.
    transient_errors = (
        requests.exceptions.Timeout,
        requests.exceptions.ConnectionError,
        TimeoutError,
    )

    for _ in range(3):
        try:
            # A timeout is needed for a stalled download to raise at all.
            response = requests.get(url, timeout=60)
        except transient_errors:
            time.sleep(20)
            continue

        # Rate limiting and server-side errors are worth another attempt.
        if response.status_code == 429 or response.status_code >= 500:
            time.sleep(20)
            continue

        # Other failures (e.g. 403/404) will not improve on retry.
        if not response.ok:
            return

        with open(path, "wb") as out_file:
            out_file.write(response.content)
        return

In [31]:
# Photo size to fetch. The trailing dot is deliberate: it makes the replace
# below hit only the "square." file stem, and it doubles as the separator
# before the extension in the local file name.
size = "medium."  # medium large original
json_paths = sorted(args.obs_dir.glob("*.json"))

# Captures the numeric directory (used as the photo ID) and the file extension
# from URLs ending in ".../<digits>/<name>.<ext>". Compiled once, not per photo.
photo_url_re = re.compile(r"/(\d+)/[a-z]+\.([a-z]+)$", flags=re.I)

# Make sure the image directory exists before the first download is written.
args.images.mkdir(parents=True, exist_ok=True)

for json_path in tqdm(json_paths, position=0, leave=None):
    with open(json_path) as in_file:
        data = json.load(in_file)
    for result in tqdm(data, position=1, leave=None):
        for photo in result["photos"]:
            # The stored URL points at the square thumbnail; swap in the
            # requested size.
            url = photo["url"].replace("square.", size)
            match = photo_url_re.search(url)
            if not match:
                # URL does not have the expected shape; skip this photo.
                continue
            image_path = args.images / f"{match[1]}_{size}{match[2]}"
            download_image(url, image_path)

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/900 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/900 [00:00<?, ?it/s]

  0%|          | 0/900 [00:00<?, ?it/s]

  0%|          | 0/900 [00:00<?, ?it/s]

  0%|          | 0/900 [00:00<?, ?it/s]

  0%|          | 0/900 [00:00<?, ?it/s]

  0%|          | 0/141 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/900 [00:00<?, ?it/s]

  0%|          | 0/900 [00:00<?, ?it/s]

  0%|          | 0/189 [00:00<?, ?it/s]

  0%|          | 0/323 [00:00<?, ?it/s]

  0%|          | 0/314 [00:00<?, ?it/s]

  0%|          | 0/900 [00:00<?, ?it/s]

  0%|          | 0/900 [00:00<?, ?it/s]

  0%|          | 0/54 [00:00<?, ?it/s]

  0%|          | 0/179 [00:00<?, ?it/s]

  0%|          | 0/900 [00:00<?, ?it/s]

  0%|          | 0/900 [00:00<?, ?it/s]

  0%|          | 0/900 [00:00<?, ?it/s]

  0%|          | 0/900 [00:00<?, ?it/s]

  0%|          | 0/900 [00:00<?, ?it/s]

  0%|          | 0/900 [00:00<?, ?it/s]

  0%|          | 0/900 [00:00<?, ?it/s]

  0%|          | 0/900 [00:00<?, ?it/s]

  0%|          | 0/243 [00:00<?, ?it/s]

  0%|          | 0/900 [00:00<?, ?it/s]

  0%|          | 0/900 [00:00<?, ?it/s]

  0%|          | 0/453 [00:00<?, ?it/s]

  0%|          | 0/900 [00:00<?, ?it/s]

  0%|          | 0/900 [00:00<?, ?it/s]

  0%|          | 0/206 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/21 [00:00<?, ?it/s]

  0%|          | 0/900 [00:00<?, ?it/s]

  0%|          | 0/900 [00:00<?, ?it/s]

  0%|          | 0/900 [00:00<?, ?it/s]

  0%|          | 0/900 [00:00<?, ?it/s]

  0%|          | 0/900 [00:00<?, ?it/s]

  0%|          | 0/900 [00:00<?, ?it/s]

  0%|          | 0/75 [00:00<?, ?it/s]

  0%|          | 0/900 [00:00<?, ?it/s]

  0%|          | 0/900 [00:00<?, ?it/s]

  0%|          | 0/900 [00:00<?, ?it/s]

  0%|          | 0/621 [00:00<?, ?it/s]

  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/900 [00:00<?, ?it/s]

  0%|          | 0/900 [00:00<?, ?it/s]

  0%|          | 0/900 [00:00<?, ?it/s]

  0%|          | 0/900 [00:00<?, ?it/s]

  0%|          | 0/900 [00:00<?, ?it/s]

  0%|          | 0/900 [00:00<?, ?it/s]

  0%|          | 0/900 [00:00<?, ?it/s]

  0%|          | 0/900 [00:00<?, ?it/s]

  0%|          | 0/900 [00:00<?, ?it/s]

  0%|          | 0/41 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/900 [00:00<?, ?it/s]

  0%|          | 0/900 [00:00<?, ?it/s]

  0%|          | 0/900 [00:00<?, ?it/s]