### Application

In [98]:
# Mock CMR collection record used as this notebook's sample input; it is placed
# in the first user message further down. It is written as a Python literal, so
# JSON null/true/false appear as None/True/False.
# NOTE(review): looks like a captured CMR GraphQL `collection` response -- confirm the source query.
mock_data = {
  "data": {
    "collection": {
      "abstract": "MiCASA is an extensive revision of CASA-GFED3. CASA-GFED3 derives from Potter et al. (1993), diverging in development since Randerson et al. (1996). CASA is a light use efficiency model: NPP is expressed as the product of photosynthetically active solar radiation, a light use efficiency parameter, scalars that capture temperature and moisture limitations, and fractional absorption of photosynthetically active radiation (fPAR) by the vegetation canopy derived from satellite data. Fire parameterization was incorporated into the model by van der Werf et al. (2004) leading to CASA-GFED3 after several revisions (van der Werf et al., 2006, 2010). Development of the GFED module has continued, now at GFED5 (Chen et al., 2023) with less focus on the CASA module. MiCASA diverges from GFED development at version 3, although future reconciliation is possible. Input datasets include air temperature, precipitation, incident solar radiation, a soil classification map, and several satellite derived products. These products are primarily based on Moderate Resolution Imaging Spectroradiometer (MODIS) Terra and Aqua combined datasets including land cover classification (MCD12Q1), burned area (MCD64A1), Nadir BRDF-Adjusted Reflectance (NBAR; MCD43A4), from which fPAR is derived, and tree/herbaceous/bare vegetated fractions from Terra only (MOD44B). Emissions due to fire and burning of coarse woody debris (fuel wood) are estimated separately. ",
      "archiveAndDistributionInformation": {
        "fileArchiveInformation": [
          {
            "format": "netCDF",
            "averageFileSize": 10,
            "averageFileSizeUnit": "MB"
          }
        ]
      },
      "associatedDois": None,
      "boxes": [
        "-90 -180 90 179"
      ],
      "cloudHosted": True,
      "conceptId": "C3273639213-GES_DISC",
      "coordinateSystem": "CARTESIAN",
      "dataCenter": "GES_DISC",
      "dataCenters": [
        {
          "roles": [
            "ARCHIVER"
          ],
          "shortName": "NASA/GSFC/SED/ESD/TISL/GESDISC",
          "longName": "Goddard Earth Sciences Data and Information Services Center (formerly Goddard DAAC), Terrestrial Information Systems Laboratory, Earth Sciences Division, Science and Exploration Directorate, Goddard Space Flight Center, NASA",
          "contactInformation": {
            "relatedUrls": [
              {
                "urlContentType": "DataCenterURL",
                "type": "HOME PAGE",
                "url": "https://disc.gsfc.nasa.gov",
                "description": "NASA GES DISC Website"
              }
            ]
          },
          "contactGroups": [
            {
              "roles": [
                "Data Center Contact"
              ],
              "groupName": "GES DISC HELP DESK SUPPORT GROUP",
              "contactInformation": {
                "addresses": [
                  {
                    "streetAddresses": [
                      "Goddard Earth Sciences Data and Information Services Center",
                      "Code 610.2",
                      "NASA Goddard Space Flight Center"
                    ],
                    "city": "Greenbelt",
                    "stateProvince": "MD",
                    "postalCode": "20771",
                    "country": "USA"
                  }
                ],
                "contactMechanisms": [
                  {
                    "type": "Telephone",
                    "value": "301-614-5224"
                  },
                  {
                    "type": "Email",
                    "value": "gsfc-dl-help-disc@mail.nasa.gov"
                  }
                ]
              }
            }
          ],
          "contactPersons": [
            {
              "roles": [
                "Data Center Contact"
              ],
              "firstName": "Kristan",
              "lastName": "Morgan",
              "contactInformation": {
                "addresses": [
                  {
                    "city": "Greenbelt",
                    "stateProvince": "MD",
                    "postalCode": "20771",
                    "country": "USA"
                  }
                ],
                "contactMechanisms": [
                  {
                    "type": "Email",
                    "value": "kristan.l.morgan@nasa.gov"
                  }
                ]
              }
            }
          ]
        }
      ],
      "directDistributionInformation": {
        "region": "us-west-2",
        "s3CredentialsApiEndpoint": "https://data.gesdisc.earthdata.nasa.gov/s3credentials",
        "s3CredentialsApiDocumentationUrl": "https://data.gesdisc.earthdata.nasa.gov/s3credentialsREADME",
        "s3BucketAndObjectPrefixNames": [
          "s3://gesdisc-cumulus-prod-protected/CMS/MICASA_FLUX_D.1/"
        ]
      },
      "doi": {
        "doi": "10.5067/ZBXSA1LEN453"
      },
      "duplicateCollections": {
        "count": 0,
        "items": []
      },
      "hasGranules": True,
      "lines": None,
      "nativeDataFormats": [],
      "points": None,
      "polygons": None,
      "relatedUrls": [
        {
          "url": "https://docserver.gesdisc.eosdis.nasa.gov/public/project/CMS/micasa_v1_sample.jpg",
          "urlContentType": "VisualizationURL",
          "type": "GET RELATED VISUALIZATION"
        },
        {
          "url": "https://disc.gsfc.nasa.gov/datacollection/MICASA_FLUX_D_1.html",
          "description": "Access the dataset landing page from the GES DISC website.",
          "type": "DATA SET LANDING PAGE",
          "urlContentType": "CollectionURL"
        },
        {
          "url": "https://acdisc.gsfc.nasa.gov/data/CMS/MICASA_FLUX_D.1/",
          "description": "Access the data via HTTPS.",
          "subtype": "DATA TREE",
          "type": "GET DATA",
          "urlContentType": "DistributionURL"
        },
        {
          "url": "https://acdisc.gsfc.nasa.gov/opendap/CMS/MICASA_FLUX_D.1/",
          "description": "Access the data via the OPeNDAP protocol.",
          "subtype": "OPENDAP DATA",
          "type": "USE SERVICE API",
          "urlContentType": "DistributionURL"
        },
        {
          "url": "https://acdisc.gsfc.nasa.gov/data/CMS/MICASA_FLUX_D.1/doc/MiCASA_README.pdf",
          "description": "README Document",
          "subtype": "READ-ME",
          "type": "VIEW RELATED INFORMATION",
          "urlContentType": "PublicationURL"
        },
        {
          # This entry has no URL scheme (unlike its siblings); kept exactly as in the record.
          "url": "carbon.nasa.gov",
          "description": "The NASA Carbon Monitoring System (CMS) page.",
          "type": "PROJECT HOME PAGE",
          "urlContentType": "CollectionURL"
        },
        {
          "url": "https://search.earthdata.nasa.gov/search?q=MICASA_FLUX_D",
          "description": "Use the Earthdata Search to find and retrieve data sets across multiple data centers.",
          "subtype": "Earthdata Search",
          "type": "GET DATA",
          "urlContentType": "DistributionURL"
        }
      ],
      "relatedCollections": {
        "count": 0,
        "items": []
      },
      "scienceKeywords": [
        {
          "category": "EARTH SCIENCE",
          "topic": "CLIMATE INDICATORS",
          "term": "CARBON FLUX"
        }
      ],
      "shortName": "MICASA_FLUX_D",
      "spatialExtent": {
        "granuleSpatialRepresentation": "CARTESIAN",
        "horizontalSpatialDomain": {
          "geometry": {
            "coordinateSystem": "CARTESIAN",
            "boundingRectangles": [
              {
                "westBoundingCoordinate": -180,
                "northBoundingCoordinate": 90,
                "eastBoundingCoordinate": 179,
                "southBoundingCoordinate": -90
              }
            ]
          }
        }
      },
      "tags": {
        "edsc.extra.serverless.collection_capabilities": {
          "data": {
            "cloud_cover": False,
            "day_night_flag": False,
            "granule_online_access_flag": True,
            "orbit_calculated_spatial_domains": False,
            "updated_at": "2025-07-01T18:47:33.478Z"
          }
        }
      },
      "temporalExtents": [
        {
          "rangeDateTimes": [
            {
              "beginningDateTime": "2001-01-01T00:00:00.000Z",
              "endingDateTime": "2024-12-31T23:59:59.999Z"
            }
          ],
          "endsAtPresentFlag": False
        }
      ],
      "timeStart": "2001-01-01T00:00:00.000Z",
      "timeEnd": "2024-12-31T23:59:59.999Z",
      "tilingIdentificationSystems": None,
      "title": "MiCASA Daily NPP Rh ATMC NEE FIRE FUEL Fluxes 0.1 degree x 0.1 degree",
      "versionId": "1",
      "services": {
        "count": 0,
        "items": []
      },
      "granules": {
        # "count" reports 8766 granules, but only the 20 items below are included in this mock.
        "count": 8766,
        "items": [
          {
            "conceptId": "G3274577363-GES_DISC",
            "onlineAccessFlag": True
          },
          {
            "conceptId": "G3274574213-GES_DISC",
            "onlineAccessFlag": True
          },
          {
            "conceptId": "G3274574067-GES_DISC",
            "onlineAccessFlag": True
          },
          {
            "conceptId": "G3274577258-GES_DISC",
            "onlineAccessFlag": True
          },
          {
            "conceptId": "G3274576831-GES_DISC",
            "onlineAccessFlag": True
          },
          {
            "conceptId": "G3274576739-GES_DISC",
            "onlineAccessFlag": True
          },
          {
            "conceptId": "G3274573924-GES_DISC",
            "onlineAccessFlag": True
          },
          {
            "conceptId": "G3274576758-GES_DISC",
            "onlineAccessFlag": True
          },
          {
            "conceptId": "G3274577354-GES_DISC",
            "onlineAccessFlag": True
          },
          {
            "conceptId": "G3274577305-GES_DISC",
            "onlineAccessFlag": True
          },
          {
            "conceptId": "G3274576674-GES_DISC",
            "onlineAccessFlag": True
          },
          {
            "conceptId": "G3274577352-GES_DISC",
            "onlineAccessFlag": True
          },
          {
            "conceptId": "G3274577345-GES_DISC",
            "onlineAccessFlag": True
          },
          {
            "conceptId": "G3274573921-GES_DISC",
            "onlineAccessFlag": True
          },
          {
            "conceptId": "G3274577067-GES_DISC",
            "onlineAccessFlag": True
          },
          {
            "conceptId": "G3274574064-GES_DISC",
            "onlineAccessFlag": True
          },
          {
            "conceptId": "G3274576933-GES_DISC",
            "onlineAccessFlag": True
          },
          {
            "conceptId": "G3274577251-GES_DISC",
            "onlineAccessFlag": True
          },
          {
            "conceptId": "G3274577311-GES_DISC",
            "onlineAccessFlag": True
          },
          {
            "conceptId": "G3274576621-GES_DISC",
            "onlineAccessFlag": True
          }
        ]
      },
      "subscriptions": {
        "count": 0,
        "items": []
      },
      "tools": {
        "count": 0,
        "items": []
      },
      "variables": {
        "count": 0,
        "cursor": None,
        "items": []
      }
    }
  }
}

In [2]:
from langgraph.graph import StateGraph, END, START
from langgraph.prebuilt import create_react_agent
from langchain.chat_models import init_chat_model
from langchain_core.tools import tool


In [4]:
# Chat model shared by the extraction node and the ReAct agent below.
# The provider is passed explicitly instead of via the "provider:model" prefix.
llm = init_chat_model("gemini-2.0-flash", model_provider="google_genai")

In [None]:
# create a data class
from typing import Optional
from pydantic import BaseModel, Field

# this is the data that we will form.
# this will be used to create the config.json needed for airflow
# Note: everything is flattened, for simplicity.

class ConfigVars(BaseModel):
  """
  Always use this tool to structure your response to the user.
  """
  # NOTE(review): the docstring above is presumably forwarded to the LLM as the
  # schema description when this class is passed to `with_structured_output`,
  # so treat it as prompt text rather than developer documentation -- confirm.
  #
  # Note that:
  #   1. Each field is `Optional` AND defaults to None -- this allows the model to
  #      decline to extract it. `Optional[str]` alone only permits a None value;
  #      without an explicit default the field is still *required* in pydantic v2,
  #      so a response that omits it would fail validation.
  #   2. Each field has a `description` -- this description is used by the LLM.
  #      Having a good description can help improve extraction results.
  title: Optional[str] = Field(default=None, description="The title of the STAC Collection")
  collection: Optional[str] = Field(default=None, description="The Collection Id which is same as the concept id")
  description: Optional[str] = Field(default=None, description="The description of the STAC Collection")
   
  

In [181]:
from typing import TypedDict, Annotated
from langgraph.graph.message import add_messages
import operator

# Here, along with the messages, the graph nodes share the configs.
# Whatever is missing will be filled in along the way by nodes/agents in the graph.
class AgentState(TypedDict):
  # Conversation history. The `operator.add` reducer concatenates each node's
  # returned value onto the existing list, so nodes must return a *list* here
  # (list + non-list raises TypeError).
  messages: Annotated[list, operator.add]
  # Flattened config values; no reducer is declared for this key.
  configs: ConfigVars

In [None]:
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder

# node 1
def create_flat_config(state: AgentState) -> AgentState:
  """
  Node 1: build the pydantic-enforced ``ConfigVars`` from the CMR json.

  The CMR json is read from the ``content`` of the first message in
  ``state["messages"]``. Values that are available in CMR and correspond to
  ``ConfigVars`` fields should be filled in; values that are not available
  should come back as None.

  Todo: validate the input CMR json against a CMR pydantic schema and raise
  on mismatch instead of sending it to the model.

  Returns:
    A state update with one follow-up message (as a single-element list, so the
    list-concatenating reducer on ``messages`` can append it) and the extracted
    ``configs``.
  """
  # Local import: keeps this cell self-contained without touching the import cells.
  import json

  # The message content is expected to have the CMR json
  model = llm.with_structured_output(schema=ConfigVars)

  # Adjacent string literals are concatenated verbatim, so each sentence ends
  # with an explicit space; otherwise the prompt reads "algorithm.Only extract...".
  structuring_prompt_template = ChatPromptTemplate.from_messages(
    [
      ( "system",
        "You are an expert extraction algorithm. "
        "Only extract relevant information from the structured data. "
        "If you do not know the value of an attribute asked to extract, "
        "return null for the attribute's value."
      ),
      ( "human", "{data}" )
    ]
  )

  # Send real JSON (null/true/false) rather than the Python repr of a dict
  # (None/True/False) so the payload matches the "return null" instruction.
  # `default=str` keeps serialization from failing on non-JSON-native values.
  raw_content = state["messages"][0]["content"]
  data = raw_content if isinstance(raw_content, str) else json.dumps(raw_content, default=str)

  prompt = structuring_prompt_template.invoke({ "data": data })

  response = model.invoke(prompt)

  # TODO: validate that the response matches the pydantic definition of ConfigVars.

  return {
    # Must be a list: the `messages` reducer is `operator.add`, and list + str
    # raises TypeError. The dict shape mirrors the input messages; the "user"
    # role is used because the text is an instruction for the next node.
    "messages": [
      { "role": "user", "content": "yeah done. now find the null values using tools" }
    ],
    "configs": response
  }


In [254]:
# tools for node 2, which is a react agent

@tool
def find_dashboard_is_periodic(concept_id: str) -> bool:
  """
  Provided the concept_id from cmr, use it to get the list of granules.
  based on the granules, figure out the periodicity
  """
  # Placeholder: the granule lookup and periodicity check are not written yet.
  # Fail loudly instead of implicitly returning None, which would violate the
  # declared `bool` return type and hand the agent a meaningless result.
  raise NotImplementedError(
    f"find_dashboard_is_periodic is not implemented yet (concept_id={concept_id!r})"
  )
  
# many more tools

# Tools made available to the node-2 ReAct agent.
tools = [
  find_dashboard_is_periodic,
]

# Prebuilt ReAct agent that can call the tools above.
filler_react_agent = create_react_agent(model=llm, tools=tools)

In [255]:
# node 2
def call_filler_react_agent(state: AgentState):
  """
  Node 2: for the missing values in the state's configs,
  try to fill in the value using the available tools.
  Maybe create a template with a system message to command it to do so.

  Returns:
    A state update carrying only the agent's final message, as a
    single-element list so the ``messages`` reducer can append it.
  """
  print("node 3", state)

  # NOTE(review): the first message's content is the raw `mock_data` dict; the
  # agent may reject non-string content -- confirm before enabling the run.
  results = filler_react_agent.invoke({"messages": state["messages"]})

  # The original returned the undefined name `result` (NameError). The agent's
  # output holds the whole conversation; forward just its last message so the
  # history already in the state is not appended a second time.
  return {"messages": [results["messages"][-1]]}

In [256]:
# node 3
def formulate_stac_config(config: ConfigVars) -> dict:
  """
  Use the flattened ConfigVars to formulate the necessary config json.

  Currently a stub: it logs its input and returns an empty dict.

  NOTE(review): this function is registered as a graph node, so at runtime it
  presumably receives the whole graph state rather than a bare ConfigVars --
  confirm and adjust the signature when implementing.

  Args:
    config: the flattened config values.

  Returns:
    The STAC config as a dict (empty for now).
  """
  # The original printed the undefined name `state`, which raised NameError.
  print("node 4", config)

  return {}

In [257]:
# node 4
def create_collection(stac_config: dict) -> None:
  """
  Ask the airflow create_collection DAG to run with the complete STAC config.json.

  Stub for now: the payload is only echoed to stdout and nothing is returned.
  """
  print("node 5", stac_config)

In [258]:
# finally create a graph
graph_builder = StateGraph(AgentState)

# Register the four nodes, in pipeline order, under their function names.
for _node_name, _node_fn in (
  ("create_flat_config", create_flat_config),
  ("call_filler_react_agent", call_filler_react_agent),
  ("formulate_stac_config", formulate_stac_config),
  ("create_collection", create_collection),
):
  graph_builder.add_node(_node_name, _node_fn)

# Strictly linear flow: each stop on the route is wired to the one after it.
_route = (
  START,
  "create_flat_config",
  "call_filler_react_agent",
  "formulate_stac_config",
  "create_collection",
  END,
)
for _src, _dst in zip(_route, _route[1:]):
  graph_builder.add_edge(_src, _dst)

graph = graph_builder.compile()


In [259]:
# from IPython.display import Image, display

# try:
#   display(Image(graph.get_graph().draw_mermaid_png()))
# except Exception:
#   pass

In [260]:
# from IPython.display import Image, display

# try:
#   display(Image(filler_react_agent.get_graph().draw_mermaid_png()))
# except Exception:
#   pass

In [263]:
# Run configuration passed as the second argument to `graph.invoke`.
config = {"configurable": {"thread_id": "test123xyz"}}

# Initial graph input: one user message whose content is the mock CMR record.
messages = {
  "messages": [
    {"role": "user", "content": mock_data},
  ]
}

print(messages)

{'messages': [{'role': 'user', 'content': {'data': {'collection': {'abstract': 'MiCASA is an extensive revision of CASA-GFED3. CASA-GFED3 derives from Potter et al. (1993), diverging in development since Randerson et al. (1996). CASA is a light use efficiency model: NPP is expressed as the product of photosynthetically active solar radiation, a light use efficiency parameter, scalars that capture temperature and moisture limitations, and fractional absorption of photosynthetically active radiation (fPAR) by the vegetation canopy derived from satellite data. Fire parameterization was incorporated into the model by van der Werf et al. (2004) leading to CASA-GFED3 after several revisions (van der Werf et al., 2006, 2010). Development of the GFED module has continued, now at GFED5 (Chen et al., 2023) with less focus on the CASA module. MiCASA diverges from GFED development at version 3, although future reconciliation is possible. Input datasets include air temperature, precipitation, incid

In [264]:
# result = graph.invoke(messages, config)
