# Crawler Testing

Test out our crawler and writing process before we move it into Airflow. The current idea is two parallel crawls plus one upsert event to S3 (still to be confirmed).

In [1]:
import sys 
import subprocess

# Resolve the root of the current git repository and expose it on the import
# path so repo-local imports work from inside the notebook.
root_dir = subprocess.check_output(
    ["git", "rev-parse", "--show-toplevel"],
    stderr=subprocess.DEVNULL,  # hide git's own error text; a failure still raises
).decode("utf-8").strip()

# Re-running this cell must not stack duplicate entries onto sys.path.
if root_dir not in sys.path:
    sys.path.append(root_dir)

## Gather Raw Data 

Let's continuously search for new XML files and dump them in S3. We'll run this as 2 jobs in parallel, and we are deliberately choosing not to worry about duplication here!

In [7]:
from airflow.dags.utils.constants import FEED_URLS
import requests
import boto3
from datetime import datetime
from urllib.parse import quote_plus

def request_feed(rss_feed_url: str) -> dict[str, str] | None:
    res = requests.get(rss_feed_url, timeout=10)

    if res.status_code != 200:
        print(f"Error retrieving XML document for {rss_feed_url}: {res.status_code}")
        return None
    
    return {
        "rss_url": rss_feed_url,
        "xml_doc": res.text,
    }

def get_s3_client_with_role(role_arn: str, region: str = "us-east-1") -> boto3.client:
    """Build an S3 client that authenticates with credentials from an assumed role.

    Args:
        role_arn: ARN of the IAM role passed to STS ``assume_role``.
        region: Region name given to the S3 client. The STS client is created
            without an explicit region.

    Returns:
        A boto3 S3 client configured with the access key, secret key and
        session token returned by the ``assume_role`` call.
    """
    assumed_role = boto3.client("sts").assume_role(
        RoleSessionName="airflow-dag-session",
        RoleArn=role_arn,
    )
    temp_creds = assumed_role["Credentials"]

    # Collect the client settings first so the final call reads as one step.
    s3_kwargs = {
        "region_name": region,
        "aws_access_key_id": temp_creds["AccessKeyId"],
        "aws_secret_access_key": temp_creds["SecretAccessKey"],
        "aws_session_token": temp_creds["SessionToken"],
    }
    return boto3.client("s3", **s3_kwargs)

def upload_to_bronze(res_dict: dict[str, str], s3_client) -> None:
    today = datetime.today().strftime("%m%d%Y")
    try: 
        s3_client.put_object(
            Bucket="agentic-de", 
            Key=f"bronze/{today}/{quote_plus(res_dict["rss_url"])}_{today}.xml",
            Body=res_dict["xml_doc"], 
            ContentType="application/xml"
        )
        print("Sucess!")
    except Exception as e:
        print(f"Error: {str(e)}")

In [5]:
import os

# os.getenv returns None when the variable is unset; stop here with a clear
# message instead of handing None to the STS assume_role call.
role_arn = os.getenv("DIGI_INNO_ROLE_ARN")
if not role_arn:
    raise RuntimeError(
        "DIGI_INNO_ROLE_ARN is not set; export the IAM role ARN before running this cell"
    )

s3 = get_s3_client_with_role(role_arn)

In [None]:
# fetch page
res_dict = request_feed("https://www.npr.org/rss/rss.php?id=1126")

# test upload -- request_feed returns None when the feed could not be
# retrieved, so only attempt the upload when a document actually came back
if res_dict is not None:
    upload_to_bronze(res_dict, s3)
else:
    print("Skipping upload: feed could not be fetched")