In [7]:
# Taxonomic Data Mapping using GBIF and Catalogue of Life
# --------------------------------------------------------
# This notebook fetches taxon data from GBIF and COL APIs,
# stores it in Parquet format, and demonstrates efficient taxonomic data handling.
# Author: [Your Name]
# Date: [Today's Date]

import requests
import pandas as pd
import pyarrow.parquet as pq
import pyarrow as pa

# Display markdown explanation
from IPython.display import display, Markdown

display(Markdown("## 📌 Introduction"))
display(Markdown(
"""
This notebook demonstrates how to:
- Fetch taxonomic data from **GBIF** and **Catalogue of Life (COL)**.
- Standardize taxon IDs for interoperability.
- Store the data efficiently in **Parquet format** for analysis.
"""
))

# Define API Endpoints
GBIF_API = "https://api.gbif.org/v1/species/"
COL_API = "https://api.catalogueoflife.org/dataset/3LR/taxon/"

# Example Taxon IDs
gbif_taxon_id = "5231190"  # Passer domesticus (GBIF)
col_taxon_id = "4DXXM"     # Passer domesticus (COL)

display(Markdown("### 📡 Fetching Data from GBIF and COL"))

# Function to fetch GBIF data
def fetch_gbif_data(taxon_id):
    url = GBIF_API + taxon_id
    response = requests.get(url)
    if response.status_code == 200:
        data = response.json()
        return {
            "taxonID": str(data.get("key")),  # Convert to string for consistency
            "scientificName": data.get("scientificName"),
            "canonicalName": data.get("canonicalName"),
            "rank": data.get("rank"),
            "source": "GBIF"
        }
    return None

# Function to fetch COL data
def fetch_col_data(taxon_id):
    url = COL_API + taxon_id
    response = requests.get(url)
    if response.status_code == 200:
        data = response.json()
        return {
            "taxonID": str(data.get("id")),  # Convert to string for consistency
            "scientificName": data.get("name", {}).get("scientificName"),
            "canonicalName": data.get("name", {}).get("canonicalName"),
            "rank": data.get("name", {}).get("rank"),
            "source": "COL"
        }
    return None

# Fetch data
gbif_record = fetch_gbif_data(gbif_taxon_id)
col_record = fetch_col_data(col_taxon_id)

# Show retrieved data
display(Markdown("#### 🔍 Retrieved Data from GBIF and COL"))
df_preview = pd.DataFrame([gbif_record, col_record])
display(df_preview)

# Combine into a Pandas DataFrame
df_taxon = pd.DataFrame([gbif_record, col_record])

display(Markdown("### 📂 Storing Data in Parquet Format"))
display(Markdown(
"""
Parquet is a highly efficient **columnar** storage format that:
- Reduces storage space.
- Enables fast queries and analytics.
- Works well with big data tools like **Databricks and Apache Spark**.
"""
))

# Ensure taxonID is treated as a string
df_taxon["taxonID"] = df_taxon["taxonID"].astype(str)

# Save as Parquet
parquet_file = "taxon_mapping.parquet"
table = pa.Table.from_pandas(df_taxon)
pq.write_table(table, parquet_file)

display(Markdown(f"✅ Data successfully saved as `{parquet_file}`"))

# Read back and print
display(Markdown("### 📖 Loading Data from Parquet"))
df_loaded = pd.read_parquet(parquet_file)
display(df_loaded)

display(Markdown("### 🚀 Summary & Next Steps"))
display(Markdown(
"""
- This notebook fetched **taxon information** from **GBIF** and **COL**.
- Data was standardized and stored in **Parquet format**.
- The next steps could include:
  - Expanding to **Wikidata** and **NSR** taxon IDs.
  - Integrating this with a **larger biodiversity knowledge graph**.
  - Automating periodic updates to track new taxon names.
"""
))



## 📌 Introduction


This notebook demonstrates how to:
- Fetch taxonomic data from **GBIF** and **Catalogue of Life (COL)**.
- Standardize taxon IDs for interoperability.
- Store the data efficiently in **Parquet format** for analysis.


### 📡 Fetching Data from GBIF and COL

#### 🔍 Retrieved Data from GBIF and COL

Unnamed: 0,taxonID,scientificName,canonicalName,rank,source
0,5231190,"Passer domesticus (Linnaeus, 1758)",Passer domesticus,SPECIES,GBIF
1,4DXXM,Passer domesticus,,species,COL


### 📂 Storing Data in Parquet Format


Parquet is a highly efficient **columnar** storage format that:
- Reduces storage space.
- Enables fast queries and analytics.
- Works well with big data tools like **Databricks and Apache Spark**.


✅ Data successfully saved as `taxon_mapping.parquet`

### 📖 Loading Data from Parquet

Unnamed: 0,taxonID,scientificName,canonicalName,rank,source
0,5231190,"Passer domesticus (Linnaeus, 1758)",Passer domesticus,SPECIES,GBIF
1,4DXXM,Passer domesticus,,species,COL


### 🚀 Summary & Next Steps


- This notebook fetched **taxon information** from **GBIF** and **COL**.
- Data was standardized and stored in **Parquet format**.
- The next steps could include:
  - Expanding to **Wikidata** and **NSR** taxon IDs.
  - Integrating this with a **larger biodiversity knowledge graph**.
  - Automating periodic updates to track new taxon names.
