In [32]:
import os
import requests
import pandas as pd
from tqdm import tqdm
import time
import ast

# ------------------------------------------------------------
# Map-Only Approach
# ------------------------------------------------------------

def safe_get_map(url, params=None, timeout=12):
    """Issue a GET request and return the decoded JSON body, best-effort.

    Args:
        url: Address to request.
        params: Optional query parameters forwarded to ``requests.get``.
        timeout: Timeout forwarded to ``requests.get``.

    Returns:
        The parsed JSON body when the response status is 200; ``None`` for
        any other status or when any exception is raised along the way.
    """
    try:
        response = requests.get(url, params=params, timeout=timeout)
        return response.json() if response.status_code == 200 else None
    except Exception:
        # Deliberately broad: a failed lookup is reported as "no data"
        # rather than interrupting the caller.
        return None

def fetch_macrostrat_map_only(lat, lon):
    """Fetch geological data using only the /geologic_units/map endpoint.

    Args:
        lat: Latitude, sent as the ``lat`` query parameter.
        lon: Longitude, sent as the ``lng`` query parameter.

    Returns:
        A dict of ``MACRO_*`` fields built from the first unit in the
        response, or ``None`` when the request fails, the response does not
        have the expected ``{"success": {"data": [...]}}`` shape, or the
        unit list is empty.
    """
    url = "https://macrostrat.org/api/v2/geologic_units/map"
    params = {
        "lat": lat,
        "lng": lon,
        "scale": "medium"
    }

    js = safe_get_map(url, params)
    # Guard every level of the envelope: an unexpected JSON shape (a list, or
    # a non-dict "success" value) is treated as "no data" instead of raising.
    if not isinstance(js, dict):
        return None

    payload = js.get("success")
    if not isinstance(payload, dict):
        return None

    units = payload.get("data")
    if not isinstance(units, list) or not units:
        return None

    # Take the first unit (treated as the surface unit)
    unit = units[0]
    if not isinstance(unit, dict):
        return None

    # Handle lithology - can be string, list of strings, or list of dicts
    lith = unit.get("lith")
    if isinstance(lith, (list, tuple)):
        names = []
        for item in lith:
            # Dict entries contribute their 'name'; anything else is used as-is.
            if isinstance(item, dict):
                item = item.get("name")
            if item is None or item == "":
                continue
            names.append(str(item))
        # An empty result is reported as None, matching the scalar branch.
        lithology = ", ".join(names) or None
    else:
        lithology = str(lith) if lith else None

    return {
        "MACRO_UNIT_NAME": unit.get("name"),
        "MACRO_STRAT_NAME": unit.get("strat_name"),
        "MACRO_LITHOLOGY": lithology,
        "MACRO_AGE_MIN": unit.get("t_int_age"),
        "MACRO_AGE_MAX": unit.get("b_int_age"),
        "MACRO_DESCRIPTION": unit.get("descrip"),
        "MACRO_COLOR": unit.get("color"),
        "MACRO_SOURCE_ID": unit.get("source_id"),
        "MACRO_MAP_ID": unit.get("map_id")
    }

def enrich_macrostrat_map_only(
    coord_csv_path,
    output_csv_path,
    sample_n=None,
    sleep_sec=0.1
):
    """Map-only enrichment using /geologic_units/map endpoint.

    Reads a CSV containing ``STRUCTURE_ID`` and ``COORDINATES`` columns,
    looks up every usable coordinate with ``fetch_macrostrat_map_only``,
    writes one output row per input row to ``output_csv_path`` and prints a
    one-line summary.

    Args:
        coord_csv_path: Path of the input CSV.
        output_csv_path: Path the enriched CSV is written to.
        sample_n: When not ``None``, only the first ``sample_n`` usable rows
            are processed.
        sleep_sec: Pause, in seconds, after each lookup.

    Returns:
        The enriched ``pandas.DataFrame`` that was written to disk. Rows
        whose lookup returned nothing keep ``None`` in every ``MACRO_*``
        column.
    """
    macro_columns = [
        "MACRO_UNIT_NAME",
        "MACRO_STRAT_NAME",
        "MACRO_LITHOLOGY",
        "MACRO_AGE_MIN",
        "MACRO_AGE_MAX",
        "MACRO_DESCRIPTION",
        "MACRO_COLOR",
        "MACRO_SOURCE_ID",
        "MACRO_MAP_ID",
    ]
    output_columns = ["STRUCTURE_ID", "COORDINATES"] + macro_columns

    # Load coordinates
    df = pd.read_csv(coord_csv_path)

    def parse_coord(x):
        """Return a 2-tuple for a usable coordinate cell, else ``None``."""
        if isinstance(x, str):
            try:
                x = ast.literal_eval(x)
            except (ValueError, SyntaxError, TypeError,
                    MemoryError, RecursionError):
                # These are the failures literal_eval documents for
                # malformed input; the cell is treated as missing.
                return None
        # Anything that is not a 2-item sequence cannot be unpacked into
        # (lat, lon) below, so it is dropped here instead of crashing mid-run.
        if isinstance(x, (tuple, list)) and len(x) == 2:
            return tuple(x)
        return None

    df["COORDINATES"] = df["COORDINATES"].apply(parse_coord)
    df = df[df["COORDINATES"].notna()].copy()

    # Apply sampling
    if sample_n is not None:
        df = df.head(sample_n)

    rows = []

    for _, row in tqdm(df.iterrows(), total=len(df), desc="Macrostrat Map"):

        sid = row["STRUCTURE_ID"]
        lat, lon = row["COORDINATES"]

        # Start from an all-None record so failed lookups still yield a row.
        entry = {
            "STRUCTURE_ID": sid,
            "COORDINATES": (lat, lon),
            **dict.fromkeys(macro_columns),
        }

        result = fetch_macrostrat_map_only(lat, lon)
        if result:
            entry.update(result)

        rows.append(entry)
        time.sleep(sleep_sec)

    # Save output. Passing the column list keeps the header (and the summary
    # below) working even when no rows were processed.
    df_out = pd.DataFrame(rows, columns=output_columns)
    df_out.to_csv(output_csv_path, index=False)

    # Show summary
    non_null_count = df_out['MACRO_UNIT_NAME'].notna().sum()
    print(f"Completed: {non_null_count}/{len(rows)} structures enriched")

    return df_out

# Run the map-only version over the whole coordinate file: sample_n=None
# disables the row limit, so every row with a parseable coordinate is queried.
# Both CSV paths are absolute paths on one specific machine; adjust them
# before running elsewhere. The returned DataFrame is not bound to a name.
enrich_macrostrat_map_only(
    coord_csv_path=r"C:\Users\wongb\Bridge-ML\Bridge-ML-LLM-Embedding-Architecture\enriched_data\structure_coordinates.csv",
    output_csv_path=r"C:\Users\wongb\Bridge-ML\Bridge-ML-LLM-Embedding-Architecture\enriched_data\macrostrat_map_only.csv",
    sample_n=None
)

Macrostrat Map: 100%|██████████| 4914/4914 [1:08:44<00:00,  1.19it/s]

Completed: 4754/4914 structures enriched





Unnamed: 0,STRUCTURE_ID,COORDINATES,MACRO_UNIT_NAME,MACRO_STRAT_NAME,MACRO_LITHOLOGY,MACRO_AGE_MIN,MACRO_AGE_MAX,MACRO_DESCRIPTION,MACRO_COLOR,MACRO_SOURCE_ID,MACRO_MAP_ID
0,,"(47.98571667, -122.2271222)",,,,,,,,,
1,,"(47.697975, -122.6195)",Younger glacial drift,Vashon Drift; Sumas Drift,"Major:{fine alluvium,coarse alluvium}",0.0117,2.58,"Till. Hard, blue-gray to gray concrete-like m...",#F9F97F,133.0,2752774.0
2,,"(48.21215, -121.9331306)",Older glacial drift,Admiralty Clay; Orting Drift; Puyallup Formati...,"Major:{fine alluvium,coarse alluvium}, Inciden...",0.0117,3.60,"Till, outwash, and associated interglacial dep...",#FFF0BD,133.0,2717109.0
3,,"(47.56759167, -122.5517028)",Younger glacial drift,Vashon drift; Possession Drift; Whidbey Formation,"Major:{fine alluvium,coarse alluvium}",0.0117,2.58,"Younger glacial drift, undivided. Till, outwa...",#FFF2C7,133.0,2815013.0
4,,"(47.769275, -122.707925)",Younger glacial drift,Vashon Drift; Sumas Drift,"Major:{fine alluvium,coarse alluvium}",0.0117,2.58,"Till. Hard, blue-gray to gray concrete-like m...",#F9F97F,133.0,2752774.0
...,...,...,...,...,...,...,...,...,...,...,...
4909,DAPFORLE,"(47.02788056, -122.5136)",Younger glacial drift,Vashon Drift; Steilacoom Gravel; Washtucna Gra...,"Major:{fine alluvium,coarse alluvium}",0.0117,2.58,"Advance and recessional outwash, stratified dr...",#FFF2AE,133.0,2718478.0
4910,DAPFORLE,"(47.02893056, -122.4655)",Younger glacial drift,Vashon Drift; Steilacoom Gravel; Washtucna Gra...,"Major:{fine alluvium,coarse alluvium}",0.0117,2.58,"Advance and recessional outwash, stratified dr...",#FFF2AE,133.0,2718478.0
4911,DAPFORLE,"(47.11668889, -122.5003)",Younger glacial drift,Vashon Drift; Steilacoom Gravel; Washtucna Gra...,"Major:{fine alluvium,coarse alluvium}",0.0117,2.58,"Advance and recessional outwash, stratified dr...",#FFF2AE,133.0,2718478.0
4912,DAPFORLE,"(47.10798889, -122.5898)",water,,,,,,,133.0,2764941.0


In [None]:
def _macro_field(category, kind, datatype, unit=None):
    """Build the schema entry for a column taken directly from the lookup."""
    return {
        "category": category,
        "type": kind,
        "datatype": datatype,
        "unit": unit,
        "code_map": None,
    }


def _macro_age_semantic(category, source):
    """Build the schema entry for an era label derived from an age column.

    Every call creates fresh lists and dicts, so the entries for different
    source columns never share mutable state.
    """
    # (label, lower bound, upper bound); None marks an open upper bound.
    era_bounds = (
        ("holocene", 0.0, 0.0117),
        ("pleistocene", 0.0117, 2.58),
        ("neogene", 2.58, 23.0),
        ("paleogene", 23.0, 66.0),
        ("mesozoic", 66.0, 252.2),
        ("paleozoic", 252.2, 541.0),
        ("precambrian", 541.0, None),
    )
    return {
        "category": category,
        "type": "nominal_derived",
        "datatype": "string",
        "unit": None,
        "source": source,
        "bin_rules": [
            {"category": label, "min": lower, "max": upper}
            for label, lower, upper in era_bounds
        ],
        "code_map": {
            "none": "Geologic age not available",
            "holocene": "Very young Holocene deposits (<0.0117 Ma)",
            "pleistocene": "Pleistocene glacial or interglacial deposits (0.0117–2.58 Ma)",
            "neogene": "Neogene sediments (2.58–23 Ma)",
            "paleogene": "Early Cenozoic units (23–66 Ma)",
            "mesozoic": "Mesozoic bedrock (66–252 Ma)",
            "paleozoic": "Paleozoic bedrock (252–541 Ma)",
            "precambrian": "Ancient Precambrian rock (>541 Ma)",
        },
    }


# Column-level metadata for the MACRO_* fields, including the derived
# "*_SEMANTIC" era labels computed from the two numeric age columns.
macrostrat_schema = {
    "MACRO_UNIT_NAME": _macro_field(
        "General geologic unit name", "categorical_text", "string"
    ),
    "MACRO_STRAT_NAME": _macro_field(
        "Stratigraphic formation names", "categorical_text", "string"
    ),
    "MACRO_LITHOLOGY": _macro_field(
        "Geologic lithology (detailed)", "categorical_text", "string"
    ),
    "MACRO_AGE_MIN": _macro_field(
        "Minimum geologic age", "numerical", "float", unit="Ma"
    ),
    "MACRO_AGE_MIN_SEMANTIC": _macro_age_semantic(
        "Minimum geologic age (semantic)", "MACRO_AGE_MIN"
    ),
    "MACRO_AGE_MAX": _macro_field(
        "Maximum geologic age", "numerical", "float", unit="Ma"
    ),
    "MACRO_AGE_MAX_SEMANTIC": _macro_age_semantic(
        "Maximum geologic age (semantic)", "MACRO_AGE_MAX"
    ),
    "MACRO_DESCRIPTION": _macro_field(
        "Geologic description", "nl", "string"
    ),
}
