# Supervisor Agent Constructuion/Testing 

In [1]:
import sys 
import subprocess

# get root of current repo and add to our path
root_dir = subprocess.check_output(["git", "rev-parse", "--show-toplevel"], stderr=subprocess.DEVNULL).decode("utf-8").strip()

sys.path.append(root_dir)

## S3 Scanning Tool 

In [12]:
import boto3

def get_client_with_role(role_arn: str, aws_service: str, session_name: str, region: str = "us-east-1"):
    # start sts session
    sts = boto3.client("sts")

    # assume role + grab credentials
    creds = sts.assume_role(
        RoleArn=role_arn,
        RoleSessionName=session_name
    )["Credentials"]

    # create a new session with the assumed role credentials
    return boto3.client(
        aws_service,
        aws_access_key_id=creds["AccessKeyId"],
        aws_secret_access_key=creds["SecretAccessKey"],
        aws_session_token=creds["SessionToken"],
        region_name=region
    )

In [13]:
import os 

s3 = get_client_with_role(
    role_arn=os.getenv("DIGI_INNO_ROLE_ARN"), 
    aws_service="s3",
    session_name="digi-inno-s3"
)

In [19]:
from datetime import datetime, timedelta

def get_yesterdays_files(s3_client):
    yesterday = (datetime.today() - timedelta(days=1)).strftime("%m%d%Y")
    yesterdays_files = []
    paginator = s3_client.get_paginator("list_objects_v2")
    for page in paginator.paginate(Bucket="agentic-de", Prefix=f"bronze/{yesterday}/"):
        for obj in page.get("Contents", []):
            key = obj["Key"]
            yesterdays_files.append(key)
    return yesterdays_files

In [15]:
def get_file_meta(key):
    obj = s3.get_object(Bucket="agentic-de", Key=key)
    content = obj["Body"].read(500).decode("utf-8")
    return f"File: {key}\nContent Preview:\n{content}"

## Airflow Task Runner

In [16]:
from utils.airflow import AirflowClient
import os 

airflow = AirflowClient(
    username=os.getenv("AIRFLOW_USERNAME"),
    password=os.getenv("AIRFLOW_PASSWORD"),
)

# airflow_res = airflow.trigger_dag("transform_pbs_data")

## Tool-ify

In [17]:
from langchain.agents import initialize_agent, Tool
from langchain.agents import AgentType
from langchain.chat_models import ChatOpenAI

tools = [
    Tool(
        name="GetYesterdaysFiles", 
        func=get_yesterdays_files,
        description="Get a list of files from yesterday's scraping from the bronze subdirectory of the S3 bucket."
    ), 
    Tool(
        name="GetFileMeta", 
        func=get_file_meta,
        description="Get the metadata of a file in the S3 bucket. The file is specified by its key. The metadata includes the file name and a preview of its content."
    ), 
    Tool(
        name="TriggerAirflowDAG", 
        func=airflow.trigger_dag,
        description="Trigger an Airflow DAG. The DAG is specified by its ID."
    )
]

In [18]:
llm = ChatOpenAI(model="gpt-4", temperature=0)
agent = initialize_agent(
    tools=tools, 
    llm=llm, 
    agent_type=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
    verbose=True
)

In [20]:
def run_supervisor_agent():
    # setp 0: initialize s3 client 
    s3 = get_client_with_role(
        role_arn=os.getenv("DIGI_INNO_ROLE_ARN"), 
        aws_service="s3",
        session_name="digi-inno-s3"
    )
    # Step 1: Get new files
    new_files = get_yesterdays_files(s3)
    if not new_files:
        print("No new files.")
        return

    for file_key in new_files:
        # Step 2: Ask agent to choose DAG
        metadata = get_file_meta(file_key)
        prompt = f"""
You are a supervisor agent. Your task is to examine the file content and decide which Airflow DAG should be triggered.

Here is the metadata:
{metadata}

Choose the correct DAG from:
- transform_pbs_data
- transform_npr_data

Then call TriggerAirflowDAG with the correct dag name and file key.
"""
        print(agent.run(prompt))


In [21]:
run_supervisor_agent()

  print(agent.run(prompt))




[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThe content preview indicates that the file is an XML file from NPR (National Public Radio). Therefore, the correct Airflow DAG to trigger would be the one related to NPR data.

Action: TriggerAirflowDAG
Action Input: {"dag_id": "transform_npr_data", "key": "bronze/05222025/https%3A%2F%2Fwww.npr.org%2Frss%2Frss.php%3Fid%3D10001_05222025.xml"}[0m
Observation: [38;5;200m[1;3m{'status_code': 405, 'response': {'detail': 'Method Not Allowed'}}[0m
Thought:[32;1m[1;3mThe method is not allowed. It seems like I may have used the wrong format for the input or the wrong method to trigger the DAG.

Action: TriggerAirflowDAG
Action Input: "transform_npr_data"[0m
Observation: [38;5;200m[1;3m{'status_code': 200, 'response': {'dag_run_id': 'manual__2025-05-23T12:56:33.714940+00:00', 'dag_id': 'transform_npr_data', 'logical_date': '2025-05-23T12:56:33.714940Z', 'queued_at': '2025-05-23T12:56:33.721982Z', 'start_date': None, 'end_date

ValueError: An output parsing error occurred. In order to pass this error back to the agent and have it try again, pass `handle_parsing_errors=True` to the AgentExecutor. This is the error: Parsing LLM output produced both a final answer and a parse-able action:: The content preview indicates that the file is an XML file from NPR. Therefore, the correct DAG to trigger would be the one related to NPR data.

Action: TriggerAirflowDAG
Action Input: {"dag_id": "transform_npr_data", "key": "bronze/05222025/https%3A%2F%2Fwww.npr.org%2Frss%2Frss.php%3Fid%3D10002163_05222025.xml"}

Thought: I now know the final answer
Final Answer: The correct DAG to trigger is "transform_npr_data" with the file key "bronze/05222025/https%3A%2F%2Fwww.npr.org%2Frss%2Frss.php%3Fid%3D10002163_05222025.xml".
For troubleshooting, visit: https://python.langchain.com/docs/troubleshooting/errors/OUTPUT_PARSING_FAILURE 