In [1]:
import csv
import json
import socket
import time
from collections import defaultdict
from pathlib import Path
from urllib.error import HTTPError, URLError

import pandas as pd
import requests
from tqdm import tqdm

In [2]:
# Exceptions treated as a recoverable download failure.
# This has to be a tuple: an `except` clause that is handed a list raises
# TypeError ("catching classes that do not inherit from BaseException is not
# allowed") as soon as it tries to match a raised exception.
ERRORS = (
    HTTPError,
    URLError,
    requests.exceptions.ConnectionError,
    requests.exceptions.Timeout,  # Parent of ReadTimeout and ConnectTimeout
)

TIMEOUT = 20  # Seconds to wait for a server reply
DELAY = 1  # Seconds to delay between attempts to download a file

# Default timeout for sockets that are created without an explicit one
socket.setdefaulttimeout(TIMEOUT)

In [3]:
# Every input and output lives in the sibling "data" directory.
DATA = Path("..", "data")

# Input: the exported observations, one row per observation.
CSV = DATA.joinpath("my_observation10.csv")

# Directory of per-observation JSON files, named "<id>.json".
OUT = DATA.joinpath("inat_obs10")

# API endpoint template; "{}" is filled with the observation ID.
URL = "https://api.inaturalist.org/v1/observations/{}"

# Output: the observations with the annotation columns added.
WITH_OBS = DATA.joinpath("my_observation10_anno_a.csv")

In [4]:
# controlled_attribute_id -> name of the column added to each observation
ATTR = {9: "sex", 12: "flowers_and_fruits", 36: "leaves"}

# controlled_value_id -> human-readable label, grouped by the attribute
# the values belong to
VAL = {
    # Values for "sex"
    10: "Female", 11: "Male", 20: "Cannot Be Determined",
    # Values for "flowers_and_fruits"
    21: "No flowers or fruits", 15: "Flower buds", 13: "Flowers", 14: "Fruits or Seeds",
    # Values for "leaves"
    37: "Breaking leaf buds", 38: "Green leaves", 39: "Colored leaves", 40: "No live leaves",
}

Get observation IDs

In [5]:
# Index the exported rows by observation ID. The IDs stay strings, exactly as
# the csv module reads them; a repeated ID keeps the last row seen.
# newline="" is what the csv module asks for so that line breaks embedded in
# quoted fields are parsed (and preserved) correctly.
with CSV.open(newline="") as inp:
    obs = {row["id"]: row for row in csv.DictReader(inp)}

# Sort by ID for a stable processing order. The keys are strings, so the
# order is lexicographic rather than numeric.
OBS = dict(sorted(obs.items()))

print(len(OBS))

6127


In [6]:
def download(id_):
    """Fetch one observation's JSON from the API and save it under OUT.

    A file that is already present is never downloaded again, so an
    interrupted run can simply be restarted.

    Args:
        id_: Observation ID; it fills the URL template and names the file
            ``<id_>.json``.

    Returns:
        1 if the file already existed or was saved successfully, 0 if the
        request failed with one of ERRORS or the reply was not valid JSON.
    """
    path: Path = OUT / f"{id_}.json"
    url: str = URL.format(id_)

    if path.exists():
        return 1

    # Be polite to the server: pause before every real request.
    time.sleep(DELAY)

    try:
        # Wait up to TIMEOUT for the reply; DELAY is only the pause above.
        response = requests.get(url, timeout=TIMEOUT)
    except tuple(ERRORS):  # `except` needs a tuple, whatever ERRORS is
        print(f"{id_} error")
        return 0

    # NOTE(review): the HTTP status is not checked, so an error reply that
    # happens to carry a JSON body is saved like a normal observation.
    text = response.text

    try:
        # Parse only to validate; the raw text is what gets stored.
        json.loads(text)
    except ValueError:
        print(f"{id_} bad json")
        print(text)
        return 0

    # Write to a temporary name and rename afterwards, so an interrupted
    # write can never leave a truncated "<id_>.json" that a later run would
    # mistake for a finished download.
    path.parent.mkdir(parents=True, exist_ok=True)
    partial = path.with_suffix(".json.tmp")
    with partial.open("w") as out:
        out.write(text)
    partial.replace(path)

    return 1


# download(list(OBS.keys())[0])

In [7]:
def download_all():
    """Download every observation in OBS, stopping at the first failure.

    A progress bar tracks the loop. The loop ends early as soon as
    download() reports 0 for an observation.
    """
    for obs_id in tqdm(OBS):
        if download(obs_id) == 0:
            break


# download_all()

In [8]:
def append_all_annotations():
    """Add the annotation columns to every row in OBS, in place.

    For each observation the saved ``<id>.json`` file in OUT is read and its
    annotations are translated through ATTR and VAL. Annotations whose
    attribute or value ID is not in those tables (or that lack either field)
    are skipped. Each attribute found becomes a key on the row, holding its
    labels joined by commas.
    """
    for id_, obs in tqdm(OBS.items()):
        json_path: Path = OUT / f"{id_}.json"

        with json_path.open() as handle:
            payload = json.load(handle)

        # Column name -> labels collected for this observation
        found = defaultdict(list)
        for result in payload["results"]:
            for anno in result["annotations"]:
                try:
                    column = ATTR[anno["controlled_attribute_id"]]
                    label = VAL[anno["controlled_value_id"]]
                except KeyError:
                    # Not an annotation this notebook tracks
                    continue
                found[column].append(label)

        for column, labels in found.items():
            obs[column] = ",".join(labels)


# Merge the annotations into the OBS rows. This reads one "<id>.json" file
# per observation from OUT, so those files must already be present.
append_all_annotations()

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6127/6127 [00:01<00:00, 3197.73it/s]


In [9]:
# One row per observation; the annotation columns are empty wherever an
# observation had no value for that attribute.
df = pd.DataFrame(list(OBS.values()))

# Write the annotated table without the DataFrame's integer index.
df.to_csv(path_or_buf=WITH_OBS, index=False)