#  scidx_streaming URL Registration

This notebook crawls the public index at:

**https://horel.chpc.utah.edu/data/meop/data/**

It extracts file links (recursively), filters them to only include files whose **filename contains a year ≥ 2020**, and then **registers** those URLs into your **`scidx_streaming`** deployment.

> **Note:** You will need valid credentials and the correct POP/API endpoint for your `scidx_streaming` instance. This notebook includes a dry-run mode so you can verify which URLs would be registered before actually registering them.

In [33]:
# If running locally and you need these packages:
# %pip install requests beautifulsoup4 urllib3
# %pip install scidx-streaming  # uncomment if you have access to this package

import re
import queue
import urllib.parse as up
from dataclasses import dataclass
from typing import List, Set, Tuple
import requests
from bs4 import BeautifulSoup
from ndp_ep import APIClient
import os, datetime
import pandas as pd
import msgpack
import blosc
from kafka import KafkaProducer


In [None]:
# ---- Configuration ----

# The root MEOP index to crawl
BASE_URL = 'https://horel.chpc.utah.edu/data/meop/data/'

# Only register a file if its filename contains a year >= MIN_YEAR
MIN_YEAR = 2023

# Allowed file extensions to consider 
ALLOWED_EXTENSIONS = {'.csv', '.txt', '.json', '.nc'}

# Crawl settings
MAX_DEPTH = 4            # set higher if needed; beware of deep trees
TIMEOUT = 15             # seconds for HTTP requests
RESPECT_HOST = True      # only follow links on the same host as BASE_URL

# Registration settings
DRY_RUN = True           # True = don't hit the API; just show what would be registered
TOKEN = ''
API_URL = '155.101.6.191:8003'  # <-- change to your POP API base URL

# Optional: Prefix to apply to each data object name in SciDx
NAME_PREFIX = 'meop_'

# Optional: Tag(s) or metadata to attach (adjust to your POP's expected schema)
DEFAULT_DESCRIPTION = 'MEOP public data (URL-registered)'

# initializing ndp_ep APIClient
client = APIClient(base_url=API_URL, token=TOKEN)


In [4]:
def same_host(url_a: str, url_b: str) -> bool:
    """Return True if url_b is on the same hostname as url_a."""
    pa = up.urlparse(url_a)
    pb = up.urlparse(url_b)
    return pa.netloc.lower() == pb.netloc.lower()

def is_directory_link(href: str) -> bool:
    """Heuristic: treat trailing '/' as a directory link."""
    return href.endswith('/')

def is_allowed_file(href: str, allowed_ext: Set[str]) -> bool:
    """Return True if href ends with one of the allowed extensions."""
    path = up.urlparse(href).path
    for ext in allowed_ext:
        if path.lower().endswith(ext.lower()):
            return True
    return False

def extract_years_from_filename(url: str) -> List[int]:
    """Return all 4-digit years found in the filename part of the URL."""
    path = up.urlparse(url).path
    fname = path.split('/')[-1]
    years = re.findall(r'(?:19|20)\d{2}', fname)
    return [int(y) for y in years]

def should_register(url: str, min_year: int) -> bool:
    years = extract_years_from_filename(url)
    if not years:
        return False
    return max(years) >= min_year


In [5]:
@dataclass
class CrawlResult:
    visited_pages: Set[str]
    file_urls: List[str]

def crawl_index(start_url: str, max_depth: int = 3, timeout: int = 15, respect_host: bool = True,
                allowed_ext: Set[str] = None) -> CrawlResult:
    if allowed_ext is None:
        allowed_ext = set()
    start = up.urlparse(start_url)
    start_host = start.netloc

    visited_pages: Set[str] = set()
    collected_files: List[str] = []

    Q = queue.Queue()
    Q.put((start_url, 0))

    while not Q.empty():
        url, depth = Q.get()
        if url in visited_pages:
            continue
        visited_pages.add(url)

        try:
            r = requests.get(url, timeout=timeout)
            r.raise_for_status()
        except Exception as e:
            print(f"[WARN] Failed to fetch {url}: {e}")
            continue

        soup = BeautifulSoup(r.text, 'html.parser')
        anchors = soup.find_all('a', href=True)

        for a in anchors:
            href = up.urljoin(url, a['href'])
            if not href.startswith('http'):
                continue
            if respect_host and not same_host(start_url, href):
                continue
            if is_directory_link(href):
                if depth < max_depth:
                    Q.put((href, depth + 1))
                continue
            if is_allowed_file(href, allowed_ext):
                collected_files.append(href)

    return CrawlResult(visited_pages=visited_pages, file_urls=sorted(set(collected_files)))


In [6]:
crawl = crawl_index(
    start_url=BASE_URL,
    max_depth=MAX_DEPTH,
    timeout=TIMEOUT,
    respect_host=RESPECT_HOST,
    allowed_ext=ALLOWED_EXTENSIONS
)

print(f"Visited pages: {len(crawl.visited_pages)}")
print(f"Discovered candidate files: {len(crawl.file_urls)}\n")

filtered_urls = [u for u in crawl.file_urls if should_register(u, MIN_YEAR)]
print(f"Files containing year >= {MIN_YEAR}: {len(filtered_urls)}\n")
for u in filtered_urls:
    print(u)


Visited pages: 74
Discovered candidate files: 1018

Files containing year >= 2023: 559

https://horel.chpc.utah.edu/data/meop/d_20240904/BUS01_noqc_202409040000_202409050600.csv
https://horel.chpc.utah.edu/data/meop/d_20240904/BUS02_noqc_202409040000_202409050600.csv
https://horel.chpc.utah.edu/data/meop/d_20240904/BUS03_noqc_202409040000_202409050600.csv
https://horel.chpc.utah.edu/data/meop/d_20240904/BUS04_noqc_202409040000_202409050600.csv
https://horel.chpc.utah.edu/data/meop/d_20240904/BUS05_noqc_202409040000_202409050600.csv
https://horel.chpc.utah.edu/data/meop/d_20240904/BUS06_noqc_202409040000_202409050600.csv
https://horel.chpc.utah.edu/data/meop/d_20240904/BUS07_noqc_202409040000_202409050600.csv
https://horel.chpc.utah.edu/data/meop/d_20240904/BUS08_noqc_202409040000_202409050600.csv
https://horel.chpc.utah.edu/data/meop/d_20240904/BUS09_noqc_202409040000_202409050600.csv
https://horel.chpc.utah.edu/data/meop/d_20240904/BUS10_noqc_202409040000_202409050600.csv
https://hore

In [None]:
org_name = "ebus_data"

# Get the list of organizations
organizations = client.list_organizations(server="local")


# If the organization already exists, delete it
if org_name in organizations:
    print(f"Organization '{org_name}' already exists.")
else:
    print(f"Organization '{org_name}' does not exist. Proceeding to create it.")
    # registering organization
    org_data = {
        "name": org_name,
        "title": org_name,
        "description": "Sumaiya test organization for testing purposes",
    }
    try:
        client.register_organization(org_data,server="local")
        print(f"Organization '{org_name}' registered successfully.")
    except ValueError as e:
        print(e)

Organization 'ebus_data' does not exist. Proceeding to create it.
Organization 'ebus_data' registered successfully.


In [11]:
date_time_now = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")

In [12]:
resource_name_file = "names_" + date_time_now + ".txt"
print(resource_name_file)

names_20250825_194740.txt


In [13]:
payload_file = "payload_" + date_time_now + ".txt"
print(payload_file)

payload_20250825_194740.txt


In [16]:
def generate_resource_name(url: str) -> str:
    path = up.urlparse(url).path
    fname = path.split('/')[-1] or "resource"
    # lowercase + ascii-only (drop non-ascii)
    fname = fname.encode("ascii", "ignore").decode("ascii").lower()
    # replace any disallowed char with '-'
    fname = re.sub(r'[^a-z0-9_-]+', '-', fname)
    # collapse repeats and trim separators
    fname = re.sub(r'[-_]{2,}', '-', fname).strip('-_')
    # fallback if empty after sanitization
    if not fname:
        fname = "resource"
    # ensure prefix is compliant too
    prefix = (NAME_PREFIX.encode("ascii", "ignore").decode("ascii").lower()
              if 'NAME_PREFIX' in globals() else "")
    prefix = re.sub(r'[^a-z0-9_-]+', '-', prefix)
    prefix = re.sub(r'[-_]{2,}', '-', prefix).strip('-_')
    return f"{prefix}{fname}" if prefix else fname


def generate_resource_title(url: str) -> str:
    path = up.urlparse(url).path
    fname = path.split('/')[-1]
    title = fname.replace('_', ' ').replace('.csv', '')
    return f"MEOP Data – {title}"

registration_payloads = [
    {
        'resource_name': generate_resource_name(u),
        'resource_title': generate_resource_title(u),
        'type': 'url',
        'resource_url': u,
        'description': DEFAULT_DESCRIPTION,
        'owner_org': 'ebus_data',
    }
    for u in filtered_urls
]

names = []

print(f"Prepared {len(registration_payloads)} registration payload(s). Example:\n")

for meta in registration_payloads:
    print(meta)
    names.append(meta["resource_name"])
with open(resource_name_file, 'w') as f:
    for name in names:
        f.write(name + '\n')

with open(payload_file, 'w') as f:
    for payload in registration_payloads:
        f.write(str(payload) + '\n')  


Prepared 559 registration payload(s). Example:

{'resource_name': 'meopbus01_noqc_202409040000_202409050600-csv', 'resource_title': 'MEOP Data – BUS01 noqc 202409040000 202409050600', 'type': 'url', 'resource_url': 'https://horel.chpc.utah.edu/data/meop/d_20240904/BUS01_noqc_202409040000_202409050600.csv', 'description': 'MEOP public data (URL-registered)', 'owner_org': 'ebus_data'}
{'resource_name': 'meopbus02_noqc_202409040000_202409050600-csv', 'resource_title': 'MEOP Data – BUS02 noqc 202409040000 202409050600', 'type': 'url', 'resource_url': 'https://horel.chpc.utah.edu/data/meop/d_20240904/BUS02_noqc_202409040000_202409050600.csv', 'description': 'MEOP public data (URL-registered)', 'owner_org': 'ebus_data'}
{'resource_name': 'meopbus03_noqc_202409040000_202409050600-csv', 'resource_title': 'MEOP Data – BUS03 noqc 202409040000 202409050600', 'type': 'url', 'resource_url': 'https://horel.chpc.utah.edu/data/meop/d_20240904/BUS03_noqc_202409040000_202409050600.csv', 'description': '

In [None]:
resource_id_file = "ids_" + date_time_now+ ".txt"
print(resource_id_file)

In [None]:
def register_in_scidx(payloads, file_name, dry_run=True):
    """Register URL-based data objects in scidx_streaming.
    Replace the body with your actual scidx_streaming client calls.
    """
    if dry_run:
        print('[DRY RUN] Skipping API calls. These would be registered:')
        for p in payloads[:10]:
            print(f" • {p['name']} -> {p['url']}")
        if len(payloads) > 10:
            print(f" ...and {len(payloads)-10} more.")
        return
    ids = []
    
    for meta in payloads:
        try:
            response = client.register_url(meta, server="local")
            print(response)
            ids.append(response["id"])
        except Exception as e:
            print(str(e)) 
    with open(file_name, 'w') as f:
        for id in ids:
            f.write(id + '\n')

#register_in_scidx(registration_payloads, API_URL, TOKEN, DRY_RUN)
register_in_scidx(registration_payloads, resource_id_file, False)


{'id': 'b251026e-2e27-495f-90e1-d06af384034c'}
{'id': '817169f6-3a2e-49ad-a257-47f429665291'}
{'id': '485e4f2a-bd99-4596-bdbf-f947a0c854ee'}
{'id': 'c1c7160f-574c-4035-bbde-edf7b5e9c39a'}
{'id': 'fa594d89-af37-4d4e-8ea7-7503fe89108f'}
{'id': '6c10e763-b8a0-4455-8528-4b09bedd5133'}
{'id': 'ea44a39e-09fb-4fee-aefe-b0aa7992c51f'}
{'id': '442104eb-442b-4409-bae7-0c437495afd3'}
{'id': '0fa33cc0-3deb-458c-b3b7-d956b494aa5d'}
{'id': 'd40e6502-4f59-42a7-b824-1909ba95b5d7'}
{'id': '53a5b204-7648-4188-b1aa-02648eb13dc6'}
{'id': '75918521-7833-48b0-bd50-eef2b1d72c54'}
{'id': '33b2e7f0-6248-4f5e-8085-da7a6cb7e67f'}
{'id': '1738a239-83da-4391-b414-c00c9c006d09'}
{'id': '50a045a7-e6b0-4349-9b51-4e4a659fb3cd'}
{'id': 'b122c377-c29d-4189-8725-16ed4325f8ad'}
{'id': '9d5364c2-2a4c-40b3-8b26-ffdb672b74a0'}
{'id': '43ee5d0c-895f-43f5-9b35-57133f160251'}
{'id': 'c1201f59-2b27-4b6b-a731-21854ba2c66b'}
{'id': '0084216f-53ad-4456-9d28-cc26be74a520'}
{'id': 'ca7339c3-45f5-49d6-ab6f-9d61c22a51b5'}
{'id': '86a10

In [None]:
def deregister_in_scidx(file_name):
    with open(file_name, 'r') as f:
        ids = [line.strip() for line in f if line.strip()]
    for id in ids:
        try:
            response = client.delete_resource_by_id(id, server="local")
            print(response)
        except Exception as e:
            print(str(e))

deregister_in_scidx(resource_id_file)

{'message': '95b5ae2b-b5a3-41d9-90bf-b4fac657f29d deleted successfully'}
{'message': '9f0ca6ae-d68c-4c96-8ff0-c301ba69f90f deleted successfully'}
{'message': 'a7a740a5-c068-4a3c-bce0-7af7490e27ac deleted successfully'}
{'message': '88056bc3-4017-4646-bdb8-828ef03c230e deleted successfully'}
{'message': '7579691c-00ed-4984-82f0-2cb97e1be3f7 deleted successfully'}


In [None]:
def deregister_name_in_scidx(payloads):
    for meta in payloads:
        try:
            response = client.delete_resource_by_name(meta["resource_name"], server="local")
            print(response)
        except Exception as e:
            print(str(e))

deregister_name_in_scidx(registration_payloads)

meopbus01_noqc_202409040000_202409050600-csv
meopbus02_noqc_202409040000_202409050600-csv


In [None]:
# Configuration
BOOTSTRAP = "155.101.6.191:9092"
CHUNK_SIZE = 25_000  # rows per message

def compress_data(data: dict) -> bytes:
    packed = msgpack.packb(data, use_bin_type=True)
    return blosc.compress(packed, cname="zstd", clevel=5, shuffle=blosc.SHUFFLE)

def stream_register(payload):
    for meta in payload:
        # Load CSV (simple: all into memory)
        search_result = client.search_datasets([meta["resource_name"], meta["resource_url"]],server="local")
        if len(search_result) != 1:
            continue
        # print(search_result[0]["resources"][0]["url"])
        resource_url = search_result[0]["resources"][0]["url"]
        dataset_id = 
        df = pd.read_csv(resource_url)
        total_rows = len(df)
        print(f"Loaded CSV with {total_rows} rows and {len(df.columns)} columns")

        # Kafka Producer
        producer = KafkaProducer(
            bootstrap_servers=BOOTSTRAP,
            acks="all",
            linger_ms=0,
            max_request_size=5 * 1024 * 1024,
        )

        # Stable key so all chunks go to the same partition (ordered delivery)
        key = meta["resource_url"].encode("utf-8")
        topic = meta["resource_name"]
        # Chunk and send
        for i in range(0, total_rows, CHUNK_SIZE):
            chunk = df.iloc[i:i + CHUNK_SIZE]
            payload = {
                "values": chunk.to_dict(orient="list"),
                "stream_info": {
                    "source_url": meta["resource_url"],
                    "rows": len(chunk),
                    "cols": list(chunk.columns),
                    "chunk_index": i // CHUNK_SIZE,
                    "start_row": i,
                    "end_row": i + len(chunk) - 1,
                    "encoding": "msgpack+blosc(zstd5,shuffle)",
                },
            }
            blob = compress_data(payload)
            producer.send(topic, key=key, value=blob).get(timeout=30)
            print(f"✅ Sent chunk {i // CHUNK_SIZE} with {len(chunk)} rows "
                f"(compressed: {len(blob)} bytes)")

        payload = {
            "topic": topic,
            "status": "active",
            "format": "stream",
            "url": f"{resource_url}",
            "description": f"Kafka stream for topic {topic}. The stream is generated from resource url {resource_url} without any filtering. The stream status is active.",
            "name": f"derived {topic}"
        }
            
        # patch_response = client.patch_general_dataset(
        #     dataset_id=dataset['id'],
        #     server=self.server,
        #     data={"resources": [payload]}
        # )

        # logger.info("Patch response: %s", patch_response)

        producer.flush()
        producer.close()
        print(f"🎉 All chunks sent to topic '{meta["resource_url"]}'")

stream_register(registration_payloads[:1])

KeyboardInterrupt: 