# Explore iNaturalist plant data

Nothing in this notebook is important in itself; it is an exploration whose techniques and findings I reused later in other notebooks.

In [2]:
from pathlib import Path
from types import SimpleNamespace

import polars as pl

In [3]:
# Data root, given relative to the working directory the notebook runs in.
DATA_DIR = Path("../data")
# Folder holding the iNaturalist open-data CSVs (the name suggests a 2022-12-27 dump).
INAT = DATA_DIR.joinpath("inat", "inaturalist-open-data-20221227")

In [4]:
# Bundle the CSV paths used below; each attribute is named after its file stem.
args = SimpleNamespace(
    **{stem: INAT / f"{stem}.csv" for stem in ("observations", "photos", "taxa")}
)

## Get total number of records in CSVs

`wc -l` counts lines, so each per-file figure below also includes that file's header row.

In [5]:
!wc -l $INAT/*

   89600604 ../data/inat/inaturalist-open-data-20221227/observations.csv
     589291 ../data/inat/inaturalist-open-data-20221227/observers.csv
  153429245 ../data/inat/inaturalist-open-data-20221227/photos.csv
    1394174 ../data/inat/inaturalist-open-data-20221227/taxa.csv
  245013314 total


In [6]:
# Column types in file order: taxon_id, ancestry, rank_level, rank, name, active.
taxa_dtypes = [pl.Int64, pl.Utf8, pl.Float32, pl.Utf8, pl.Utf8, pl.Boolean]

# The file is tab-separated despite the .csv extension. It is read eagerly and
# then wrapped as a LazyFrame so the queries below can be chained and collected.
taxa_df = pl.read_csv(args.taxa, sep="\t", dtypes=taxa_dtypes).lazy()

In [7]:
# List the active kingdom-rank taxa (this is where the Plantae taxon_id comes from).
is_active_kingdom = (pl.col("rank") == "kingdom") & pl.col("active")
taxa_df.filter(is_active_kingdom).collect()

taxon_id,ancestry,rank_level,rank,name,active
i64,str,f32,str,str,bool
1,"""48460""",70.0,"""kingdom""","""Animalia""",True
47170,"""48460""",70.0,"""kingdom""","""Fungi""",True
47126,"""48460""",70.0,"""kingdom""","""Plantae""",True
48222,"""48460""",70.0,"""kingdom""","""Chromista""",True
47686,"""48460""",70.0,"""kingdom""","""Protozoa""",True
67333,"""48460""",70.0,"""kingdom""","""Bacteria""",True
131236,"""48460""",70.0,"""kingdom""","""Viruses""",True
151817,"""48460""",70.0,"""kingdom""","""Archaea""",True


## Get plant species IDs

In [8]:
# Plantae has taxon_id 47126 (see the kingdom table output above). A taxon is
# treated as a plant when "/47126/" appears in its slash-delimited ancestry path.
is_plant = pl.col("ancestry").str.contains("/47126/", literal=True)
is_species = pl.col("rank") == "species"

# No filter on `active` is applied, so inactive taxa are included.
# Column 0 of the taxa table is taxon_id, so this yields the species IDs.
species = taxa_df.filter(is_plant & is_species).collect().to_series(0)
species.head()

taxon_id
i64
47849
48280
48696
48279
49559
48278
47893
49386
49002
49526


In [9]:
# Number of plant species IDs in the series built above.
species.shape

(283844,)

## Get plant order IDs

In [10]:
# Plantae has taxon_id 47126 (see the kingdom table output above). A taxon is
# treated as a plant when "/47126/" appears in its slash-delimited ancestry path.
is_plant = pl.col("ancestry").str.contains("/47126/", literal=True)
is_order = pl.col("rank") == "order"

# No filter on `active` is applied, so inactive taxa are included.
# Column 0 of the taxa table is taxon_id, so this yields the order IDs.
orders = taxa_df.filter(is_plant & is_order).collect().to_series(0)
orders.head()

taxon_id
i64
48808
47363
47218
47195
48232
47123
47754
47162
48700
47729


In [11]:
# Number of plant order IDs in the series built above.
orders.shape

(269,)

## Link species to orders

In [14]:
# Map each plant order ID to the series of species IDs found beneath it.
#
# The rank filter is identical for every order, so it is evaluated and collected
# once here rather than re-run against the full taxa table on each iteration.
species_rows = taxa_df.filter(pl.col("rank") == "species").collect()

taxa = {}
for order in orders:
    # A species belongs to an order when "/<order id>/" occurs in its ancestry path.
    # Species whose ancestry contains none of the order IDs land in no entry.
    ancestry = f"/{order}/"
    in_order = species_rows.filter(
        pl.col("ancestry").str.contains(ancestry, literal=True)
    )
    # Column 0 of the taxa table is taxon_id.
    taxa[order] = in_order.to_series(0)

In [20]:
# Per-order breakdown, if needed. The loop variable is `ids`, not `species`,
# so uncommenting this cannot overwrite the global `species` series:
# for order, ids in taxa.items():
#     print(f"{order}: {len(ids)}")

# Total number of species IDs assigned to some order.
sum(len(ids) for ids in taxa.values())

283665

## Get plant observation records

In [10]:
# Scan the tab-separated observations file lazily. `read_csv(...).lazy()` would
# first load every row into memory; `scan_csv` defers the read until `.collect()`,
# so filters added to this LazyFrame are applied while the file is being read.
obs_df = pl.scan_csv(args.observations, sep="\t")

In [11]:
# Keep research-grade observations whose taxon is one of the plant species IDs.
#
# This cell rebinds `obs_df` from a LazyFrame to the collected DataFrame. Starting
# from `.lazy()` (a no-op on a LazyFrame) keeps the cell re-runnable: a second run
# filters the already-filtered frame again instead of failing on `.collect()`.
obs_df = (
    obs_df.lazy()
    .filter(pl.col("taxon_id").is_in(species))
    .filter(pl.col("quality_grade") == "research")
    .collect()
)
obs_df.head()

observation_uuid,observer_id,latitude,longitude,positional_accuracy,taxon_id,quality_grade,observed_on
str,i64,f64,f64,i64,i64,str,str
"""2fee38b7-f1e1-...",477,-16.214675,-56.206055,,169520,"""research""","""2009-08-10"""
"""bc2bf26b-b6cc-...",477,-23.117116,-46.550216,,120490,"""research""","""2009-08-10"""
"""d2067277-8c40-...",477,-16.262141,-56.173096,,67511,"""research""","""2009-08-10"""
"""63c6eebc-5228-...",477,-16.24632,-56.206055,,962649,"""research""","""2009-08-10"""
"""1eb58e5c-3f93-...",477,46.789153,-121.735554,,67525,"""research""","""2010-08-13"""


In [12]:
# (rows, columns) of the filtered plant observations.
obs_df.shape

(20459827, 8)

## Get plant observation IDs

In [13]:
# Observation UUIDs of the filtered plant observations (the frame's first column),
# used below to pick out the matching photo records.
obs_ids = obs_df.get_column("observation_uuid")
obs_ids.head()

observation_uuid
str
"""2fee38b7-f1e1-..."
"""bc2bf26b-b6cc-..."
"""d2067277-8c40-..."
"""63c6eebc-5228-..."
"""1eb58e5c-3f93-..."
"""a46aca09-df98-..."
"""f1fa6354-c0e7-..."
"""d7b37f38-87ae-..."
"""ef55e0f4-7a8c-..."
"""725595c8-4a2a-..."


## Get plant photo records

In [14]:
# Scan the tab-separated photos file lazily. `read_csv(...).lazy()` would first
# load every row into memory; `scan_csv` defers the read until `.collect()`,
# so filters added to this LazyFrame are applied while the file is being read.
photo_df = pl.scan_csv(args.photos, sep="\t")

In [15]:
# Keep photos that belong to one of the plant observations and whose license
# string contains "CC".
#
# This cell rebinds `photo_df` from a LazyFrame to the collected DataFrame. Starting
# from `.lazy()` (a no-op on a LazyFrame) keeps the cell re-runnable: a second run
# filters the already-filtered frame again instead of failing on `.collect()`.
photo_df = (
    photo_df.lazy()
    .filter(pl.col("observation_uuid").is_in(obs_ids))
    .filter(pl.col("license").str.contains("CC", literal=True))
    .collect()
)
photo_df.head()

photo_uuid,photo_id,observation_uuid,observer_id,extension,license,width,height,position
str,i64,str,i64,str,str,i64,i64,i64
"""0a0d0a02-eb6c-...",21330,"""2fee38b7-f1e1-...",477,"""jpg""","""CC0""",1200,1600,0
"""035eee8e-b160-...",21342,"""bc2bf26b-b6cc-...",477,"""jpg""","""CC0""",1200,1600,0
"""85fa2f9d-edaa-...",21344,"""d2067277-8c40-...",477,"""jpg""","""CC0""",1600,1200,0
"""b0cc2cfc-46d2-...",21351,"""63c6eebc-5228-...",477,"""jpg""","""CC0""",1600,1200,0
"""ecca6143-2ddb-...",20187,"""1eb58e5c-3f93-...",477,"""jpg""","""CC0""",2048,1365,0


In [16]:
# (rows, columns) of the filtered plant photo records.
photo_df.shape

(36333510, 9)