#  scidx_streaming URL Registration

This notebook crawls the public index at:

**https://horel.chpc.utah.edu/data/meop/**

It extracts file links (recursively), filters them to only include files whose **filename contains a year and all years in the filename fall within `MIN_YEAR`–`MAX_YEAR`** (2020–2025 as configured below), and then **registers** those URLs into your **`scidx_streaming`** deployment.

> **Note:** You will need valid credentials and the correct POP/API endpoint for your `scidx_streaming` instance. There is no automatic dry-run switch: run the cell that prints the generated payloads first so you can verify which URLs would be registered, and only then run the registration cell.

In [4]:
# If running locally and you need these packages:
# %pip install requests beautifulsoup4 urllib3
# %pip install scidx-streaming  # uncomment if you have access to this package

import re
import queue
import urllib.parse as up
from dataclasses import dataclass
from typing import List, Set, Tuple
import requests
from bs4 import BeautifulSoup
from ndp_ep import APIClient
import os, datetime
import pandas as pd
import msgpack
import blosc
from kafka import KafkaProducer
from typing import Dict, Any, List
from pathlib import Path
from kafka.errors import MessageSizeTooLargeError

In [None]:
# ---- Configuration ----

# The root MEOP index to crawl
BASE_URL = 'https://horel.chpc.utah.edu/data/meop/'

# Only register a file if its filename contains at least one year and every
# year found in the filename lies within [MIN_YEAR, MAX_YEAR]
MIN_YEAR = 2020
MAX_YEAR = 2025

# Allowed file extensions to consider
ALLOWED_EXTENSIONS = {'.csv', '.txt', '.json', '.nc'}

# Crawl settings
MAX_DEPTH = 3            # set higher if needed; beware of deep trees
TIMEOUT = 15             # seconds for HTTP requests
RESPECT_HOST = True      # only follow links on the same host as BASE_URL

# Registration settings
# The endpoint and token are read from the environment so that the notebook
# runs on a fresh kernel and no credential is stored in the notebook itself.
API_URL = os.environ.get('SCIDX_API_URL', '')
TOKEN = os.environ.get('SCIDX_API_TOKEN', '')
if not API_URL:
    raise RuntimeError(
        "Set the SCIDX_API_URL environment variable to your API endpoint "
        "(and SCIDX_API_TOKEN to your token) before running this cell."
    )
client = APIClient(base_url=API_URL, token=TOKEN)


In [26]:
org_name = "ebus_data"

# Names of the organizations currently known to the "local" server
organizations = client.list_organizations(server="local")

# Register the organization only when it is missing; an existing one is
# reported and left untouched (nothing is deleted here).
if org_name not in organizations:
    print(f"Organization '{org_name}' does not exist. Proceeding to create it.")
    org_data = {
        "name": org_name,
        "title": org_name,
        "description": "Sumaiya test organization for testing purposes",
    }
    try:
        client.register_organization(org_data, server="local")
        print(f"Organization '{org_name}' registered successfully.")
    except ValueError as err:
        # A ValueError from registration is printed rather than re-raised
        print(err)
else:
    print(f"Organization '{org_name}' already exists.")

Organization 'ebus_data' already exists.


In [27]:
def same_host(url_a: str, url_b: str) -> bool:
    """Tell whether both URLs have the same network location, ignoring letter case.

    The comparison uses the parsed ``netloc``, so an explicit port or
    userinfo component is part of what is compared.
    """
    netloc_a, netloc_b = (up.urlparse(candidate).netloc.lower() for candidate in (url_a, url_b))
    return netloc_a == netloc_b

def is_directory_link(href: str) -> bool:
    """Heuristic directory test: a link counts as a directory when its last character is '/'."""
    # Slicing (rather than indexing) keeps the empty string safe: '' -> False
    return href[-1:] == '/'

def is_allowed_file(href: str, allowed_ext: Set[str]) -> bool:
    """Check whether the path component of ``href`` ends with any allowed extension.

    Matching is case-insensitive and ignores the query string and fragment,
    because only the parsed URL path is inspected.
    """
    lowered_path = up.urlparse(href).path.lower()
    return any(lowered_path.endswith(suffix.lower()) for suffix in allowed_ext)

def extract_years_from_filename(url: str) -> List[int]:
    """Return the 4-digit years (19xx / 20xx) found in the filename part of the URL.

    A year is only recognised at the start of a run of digits. This keeps
    compact timestamps such as ``202411200000`` from yielding spurious years
    out of their month/day/time digits (``2000`` in that example), which
    would otherwise make the year-range filter reject valid files.

    Args:
        url: URL whose last path segment is inspected.

    Returns:
        The years in order of appearance; empty when none are found.
    """
    path = up.urlparse(url).path
    fname = path.split('/')[-1]
    # (?<!\d): the candidate must not be preceded by another digit
    years = re.findall(r'(?<!\d)(?:19|20)\d{2}', fname)
    return [int(y) for y in years]

def should_register(url: str, min_year: int, max_year: int) -> bool:
    """Decide whether a file URL passes the year filter.

    A URL passes when its filename contains at least one year and both the
    smallest and the largest year found lie within ``[min_year, max_year]``.
    """
    found = extract_years_from_filename(url)
    return bool(found) and min_year <= min(found) and max(found) <= max_year


In [28]:
@dataclass
class CrawlResult:
    """Outcome of a crawl: the pages that were visited and the file URLs that were collected."""
    # Page URLs taken off the crawl queue by crawl_index (added before the
    # fetch, so pages whose request failed are included too)
    visited_pages: Set[str]
    # File URLs gathered by crawl_index, which returns them de-duplicated and sorted
    file_urls: List[str]

def crawl_index(start_url: str, max_depth: int = 3, timeout: int = 15, respect_host: bool = True,
                allowed_ext: Set[str] = None, stay_under_start: bool = True) -> CrawlResult:
    """Breadth-first crawl of an HTML directory index, collecting links to files.

    Args:
        start_url: Index page the crawl starts from (depth 0).
        max_depth: Directory links found on a page whose depth equals
            ``max_depth`` are not followed.
        timeout: Timeout in seconds passed to ``requests.get``.
        respect_host: When True, links whose network location differs from
            that of ``start_url`` are ignored.
        allowed_ext: Extensions a link must end with to be collected as a
            file. ``None`` is treated as an empty set (no files collected).
        stay_under_start: When True (the default), only directories whose
            path lies under the directory of ``start_url`` are followed. This
            keeps links that point upwards (for example a "Parent Directory"
            entry) from dragging the crawl into unrelated parts of the host.
            Pass False to follow every same-host directory link.

    Returns:
        CrawlResult with the set of visited page URLs and the sorted,
        de-duplicated list of collected file URLs.

    Pages that cannot be fetched are reported with a ``[WARN]`` line and
    skipped; they still appear in ``visited_pages``.
    """
    if allowed_ext is None:
        allowed_ext = set()

    # Directory prefix the crawl is confined to. For a start URL that does not
    # end in '/', use its parent directory, which is also the base urljoin
    # resolves that page's relative links against.
    root_path = up.urlparse(start_url).path
    if not root_path.endswith('/'):
        root_path = root_path.rsplit('/', 1)[0] + '/'

    visited_pages: Set[str] = set()
    collected_files: List[str] = []

    pending = queue.Queue()
    pending.put((start_url, 0))

    while not pending.empty():
        url, depth = pending.get()
        if url in visited_pages:
            continue
        visited_pages.add(url)

        try:
            r = requests.get(url, timeout=timeout)
            r.raise_for_status()
        except Exception as e:
            # Best effort: one unreachable page must not abort the whole crawl
            print(f"[WARN] Failed to fetch {url}: {e}")
            continue

        soup = BeautifulSoup(r.text, 'html.parser')

        for a in soup.find_all('a', href=True):
            href = up.urljoin(url, a['href'])
            if not href.startswith('http'):
                continue  # mailto:, javascript:, etc.
            if respect_host and not same_host(start_url, href):
                continue
            if is_directory_link(href):
                inside_root = up.urlparse(href).path.startswith(root_path)
                if depth < max_depth and (inside_root or not stay_under_start):
                    pending.put((href, depth + 1))
                continue
            if is_allowed_file(href, allowed_ext):
                collected_files.append(href)

    return CrawlResult(visited_pages=visited_pages, file_urls=sorted(set(collected_files)))


In [None]:
# Crawl the configured index using the settings from the configuration cell
crawl_options = dict(
    max_depth=MAX_DEPTH,
    timeout=TIMEOUT,
    respect_host=RESPECT_HOST,
    allowed_ext=ALLOWED_EXTENSIONS,
)
crawl = crawl_index(BASE_URL, **crawl_options)

# Summary of what the crawl touched and found
print(f"Visited pages: {len(crawl.visited_pages)}")
print(f"Discovered candidate files: {len(crawl.file_urls)}\n")

In [65]:
def generate_url(url_list: List[str], min_year: int, max_year: int) -> List[str]:
    """Return, in input order, the URLs that ``should_register`` accepts for the given year window."""
    kept: List[str] = []
    for candidate in url_list:
        if should_register(candidate, min_year, max_year):
            kept.append(candidate)
    return kept


def generate_resource_name(url: str) -> str:
    """Build a lowercase, ASCII-only resource name from the last path segment of ``url``.

    Characters outside ``a-z``, ``0-9``, ``_`` and ``-`` become ``-``, runs of
    two or more separators collapse to a single ``-``, and leading/trailing
    separators are removed. ``"resource"`` is used when nothing usable is left.
    """
    basename = up.urlparse(url).path.rsplit('/', 1)[-1]
    if not basename:
        basename = "resource"

    # Drop non-ASCII characters first, then lowercase
    ascii_lower = basename.encode("ascii", "ignore").decode("ascii").lower()

    slug = re.sub(r'[^a-z0-9_-]+', '-', ascii_lower)
    slug = re.sub(r'[-_]{2,}', '-', slug).strip('-_')

    return slug if slug else "resource"


def generate_resource_title(url: str) -> str:
    """Build a display title from the last path segment of ``url``.

    Underscores become spaces and every literal ``.csv`` occurrence is
    removed; other extensions are left in place.
    """
    basename = up.urlparse(url).path.split('/')[-1]
    readable = basename.replace('_', ' ')
    readable = readable.replace('.csv', '')
    return f"Sensor Data – {readable}"

def generate_file_type(url: str) -> str:
    """Return the upper-cased file extension of the URL's last path segment.

    Only the path component is inspected, so a query string or fragment never
    leaks into the result and a bare host name such as
    ``https://example.edu`` is not mistaken for a file with an extension.

    Args:
        url: URL of the file.

    Returns:
        The extension without its leading dot, upper-cased (e.g. ``"CSV"``),
        or ``"UNKNOWN"`` when the last path segment has no extension.
    """
    fname = up.urlparse(url).path.rsplit('/', 1)[-1]
    return Path(fname).suffix.lstrip(".").upper() or "UNKNOWN"


def generate_description_for_file_from_url(url: str) -> str:
    """Build a free-text description of a data file from patterns in its URL.

    The description starts with "This dataset is available at <url>. " and is
    followed by these parts, joined by single spaces:

    * Vehicle - the first match, in this order of precedence, of BUS<digits>,
      TRX[digits] (reported as "Train"), RAIL<digits>, EBUS[digits];
      otherwise "Unknown". Matching is case-insensitive and the token must be
      preceded by the start of the URL, '/' or '_' and followed by '_', '.',
      '/' or the end of the URL.
    * Data period - a ``_YYYYMMDDHHMM_YYYYMMDDHHMM`` range if present,
      otherwise a ``_YYYY_MM`` month, otherwise "Unknown".
    * File type - whatever ``generate_file_type(url)`` returns.
    * Flags - optional sentences for 'noqc', 'min' and '/meop/' in the URL.
    * Processing level - Level 2, Level 3, or "data is not modified".

    Args:
        url: URL of the file to describe.

    Returns:
        The assembled description string.
    """
    parts = []

    # VEHICLE:
    # BUS and RAIL require a numeric id (e.g. BUS01, RAIL1); TRX may appear
    # with or without an id. All three patterns are evaluated up front; the
    # if/elif chain below decides which one wins.
    m_bus  = re.search(r'(?:^|[/_])BUS(?P<id>\d+)(?=[_.\/]|$)', url, re.IGNORECASE)
    m_trx  = re.search(r'(?:^|[/_])TRX(?P<id>\d*)(?=[_.\/]|$)', url, re.IGNORECASE)  # id optional
    m_rail = re.search(r'(?:^|[/_])RAIL(?P<id>\d+)(?=[_.\/]|$)', url, re.IGNORECASE)

    if m_bus:
        parts.append(f"Vehicle: Bus {m_bus.group('id')}")
    elif m_trx:
        tid = m_trx.group('id')
        parts.append(f"Vehicle: Train {tid}" if tid else "Vehicle: Train")
    elif m_rail:
        parts.append(f"Vehicle: Rail {m_rail.group('id')}")
    else:
        # EBUS: optional number; if digits follow, include them
        m = re.search(r'(?:^|[/_])EBUS(?P<id>\d*)(?=[_.\/]|$)', url, re.IGNORECASE)
        if m:
            eid = m.group('id')
            parts.append(f"Vehicle: E-bus{(' ' + eid) if eid else ''}")
        else:
            parts.append("Vehicle: Unknown")

    # DATE: try exact range first (YYYYMMDDHHMM_YYYYMMDDHHMM), then monthly (YYYY_MM)
    m_range = re.search(r'_(\d{12})_(\d{12})(?=[^0-9]|$)', url)
    m_month = re.search(r'_(\d{4})_(\d{2})(?=[^0-9]|$)', url)

    if m_range:
        start, end = m_range.groups()
        # Re-slice the 12-digit stamps into "YYYY-MM-DD HH:MM"
        start_fmt = f"{start[:4]}-{start[4:6]}-{start[6:8]} {start[8:10]}:{start[10:12]}"
        end_fmt   = f"{end[:4]}-{end[4:6]}-{end[6:8]} {end[8:10]}:{end[10:12]}"
        parts.append(f"Data period: {start_fmt} → {end_fmt}")
    elif m_month:
        year, month = m_month.groups()
        parts.append(f"Data period: {year}-{month}")
    else:
        parts.append("Data period: Unknown")

    # FILE TYPE
    parts.append(f"File type: {generate_file_type(url)}")

    # FLAGS (each is independent; all are checked against the lower-cased URL)
    low = url.lower()
    if "noqc" in low:
        parts.append("File marked 'noqc' (no quality control).")
    # 'min' must be a whole token delimited by '/', '_' or '.' (or the URL ends)
    if re.search(r'(^|[/_])min([_/\.]|$)', low):
        parts.append("File marked 'min' appears to be minute-resolution of data.")
    if "/meop/" in low:
        parts.append("File marked 'meop' (Mobile Environment Observation Platform) where sensors are attached to UTA.")

    # PROCESSING LEVEL (Level 2 / Level 3)
    # "level" followed by an optional '-' or '_' and the digit, not embedded
    # in a longer alphanumeric token; Level 2 is checked before Level 3.
    if re.search(r'(?<![a-z0-9])level[-_]?2(?![a-z0-9])', low):
        parts.append("Data processing level: Level 2 (modified on raw data)")
    elif re.search(r'(?<![a-z0-9])level[-_]?3(?![a-z0-9])', low):
        parts.append("Data processing level: Level 3 (modified on Level 2 data)")
    else:
        parts.append("Data processing level: data is not modified.")

    return f"This dataset is available at {url}. " + " ".join(parts)



def generate_payloads(filtered_urls: List[str]) -> List[Dict[str, Any]]:
    """Build one registration payload per URL, in input order.

    Each payload carries the generated resource name, title, description
    ('notes') and file type, plus the URL itself, the fixed type 'url' and
    the fixed owner organization 'ebus_data'.
    """
    records: List[Dict[str, Any]] = []
    for resource_url in filtered_urls:
        records.append({
            'resource_name': generate_resource_name(resource_url),
            'resource_title': generate_resource_title(resource_url),
            'type': 'url',
            'resource_url': resource_url,
            'notes': generate_description_for_file_from_url(resource_url),
            'file_type': generate_file_type(resource_url),
            'owner_org': 'ebus_data',
        })
    return records

def register_in_scidx(payloads) -> List[str]:
    """Register each payload as a URL resource and return the ids that were created.

    Every payload is passed to ``client.register_url(..., server="local")``
    on the module-level ``client``. The response is printed and its ``"id"``
    is collected. If anything raises for a payload, the error text is printed
    and the loop moves on, so the returned list only covers the successes.
    """
    registered_ids = []

    for payload in payloads:
        try:
            result = client.register_url(payload, server="local")
            print(result)
            registered_ids.append(result["id"])
        except Exception as err:
            print(str(err))
    return registered_ids


In [9]:
# Timestamp (YYYYMMDD_HHMMSS, local time) used as a suffix for the files this notebook writes
date_time_now = f"{datetime.datetime.now():%Y%m%d_%H%M%S}"

In [56]:
# Load a previously saved URL list (one URL per line), skipping blank lines
with open('urls_20250828_094831.txt', 'r') as file:
    filtered_urls = [entry for entry in map(str.strip, file) if entry]

print(f"Loaded {len(filtered_urls)} URLs from file")

Loaded 880 URLs from file


In [None]:
# Keep only the crawled files whose filename years fall inside the configured window
filtered_urls = generate_url(
    url_list=crawl.file_urls,
    min_year=MIN_YEAR,
    max_year=MAX_YEAR,
)

In [66]:
payloads = generate_payloads(filtered_urls)

# Preview the generated description of every payload, one per line
for notes in (entry["notes"] for entry in payloads):
    print(notes)

This dataset is available at https://horel.chpc.utah.edu/data/meop/d_20240904/BUS01_noqc_202409040000_202409050600.csv. Vehicle: Bus 01 Data period: 2024-09-04 00:00 → 2024-09-05 06:00 File type: CSV File marked 'noqc' (no quality control). File marked 'meop' (Mobile Environment Observation Platform) where sensors are attached to UTA. Data processing level: data is not modified.
This dataset is available at https://horel.chpc.utah.edu/data/meop/d_20240904/BUS02_noqc_202409040000_202409050600.csv. Vehicle: Bus 02 Data period: 2024-09-04 00:00 → 2024-09-05 06:00 File type: CSV File marked 'noqc' (no quality control). File marked 'meop' (Mobile Environment Observation Platform) where sensors are attached to UTA. Data processing level: data is not modified.
This dataset is available at https://horel.chpc.utah.edu/data/meop/d_20240904/BUS03_noqc_202409040000_202409050600.csv. Vehicle: Bus 03 Data period: 2024-09-04 00:00 → 2024-09-05 06:00 File type: CSV File marked 'noqc' (no quality contr

In [67]:
# Register every payload; `ids` holds the ids of the registrations that succeeded
ids = register_in_scidx(payloads=payloads)

{'id': '2c0635a6-ecd8-4702-af35-7ac1fb50fefe'}
{'id': 'cb0cc82b-6896-4fa7-ac35-2e88f75388d2'}
{'id': 'b42d2409-8dd5-43c5-89f0-30bd89b55cad'}
{'id': '6637342d-e1a3-4c3e-8058-d3b089eed760'}
{'id': 'a11e746a-b071-4a36-a016-251b34114314'}
{'id': '7c06eaa7-6ead-4a0a-9483-9617db1b3dcb'}
{'id': 'e4a9dd55-17eb-4a52-9298-ebc803d4e15e'}
{'id': '46f9d5d5-a93f-4a02-b77c-d6392e9b67de'}
{'id': '9f218a71-d72f-4c39-868b-c53412c3ef7b'}
{'id': 'db6e9bc1-f51b-4047-9f15-739540c59feb'}
{'id': '5a3e52df-95e2-47a7-8e93-b604659fa6da'}
{'id': '14c40624-614b-4025-aeec-81c6506a38e6'}
{'id': '3ba199c8-f179-44ab-8d10-adfe1abca6b2'}
{'id': '572b5450-f17b-42bd-b52b-814ed2c1c304'}
{'id': '3494405e-8cbe-4340-91ce-91c5cfd05662'}
{'id': '700b3338-38a6-4c79-bcfe-7a6cbabbb9fa'}
{'id': 'a7c6d950-c43e-4075-bab6-8dc643d77078'}
{'id': '2b97c7ef-30dd-4aa7-8519-266949937498'}
{'id': 'bf82fefc-5f75-4483-9c33-954ce6ac65b9'}
{'id': '47971251-e7cd-4ca4-ba86-8ac234c22f3f'}
{'id': 'a1724e7c-3229-4aa8-9068-70be04fa738a'}
{'id': 'e0b83

In [53]:
# resource_url_file = "urls_" + date_time_now + ".txt"
# print(resource_url_file)

# with open(resource_url_file, 'w') as f:
#     for url in filtered_urls:
#         f.write(url + '\n')

# payload_file = "payloads_" + date_time_now + ".txt"
# print(payload_file)

# with open(payload_file, 'w') as f:
#     for payload in payloads:
#         f.write(str(payload) + '\n')

# resource_name_file = "names_" + date_time_now + ".txt"
# print(resource_name_file)

# with open(resource_name_file, 'w') as f:
#     for meta in payloads:
#         f.write(meta["resource_name"] + '\n')

resource_id_file = "ids_" + date_time_now + ".txt"
print(resource_id_file)

# Save one registered resource id per line. The loop variable is not named
# `id`: at notebook top level that would shadow the builtin `id()` for every
# cell run afterwards.
with open(resource_id_file, 'w') as f:
    for resource_id in ids:
        f.write(resource_id + '\n')



ids_20250829_095429.txt


In [63]:
# CAUTION: destructive cell. Every dataset returned by the search whose
# owner_org is "ebus_data" is deleted from the "local" server by name.
search_result = client.search_datasets(["ebus_data"], server="local")

for dataset in search_result:
    print(f"Found dataset: {dataset['id']} - {dataset['name']}")
    if dataset["owner_org"] != "ebus_data":
        continue  # listed above, but belongs to another organization: keep it
    client.delete_resource_by_name(dataset["name"], server="local")

Found dataset: 44af8b65-845c-471a-a378-0af3b5af2e1a - ebus_2024_08-csv
Found dataset: bc5c6e4a-faf6-4b5a-86f7-914eef4aedd2 - ebus_2024_07-csv
Found dataset: 6fbb9b33-ae5f-4e42-bea2-3c6117fe7705 - ebus_2024_06-csv
Found dataset: dab9096d-2dad-4dcd-b08f-7729e8f75d34 - ebus_2024_05-csv
Found dataset: c47b8f8e-0c3a-4bf7-9ec6-410dd97758c9 - ebus_2024_04-csv
Found dataset: 5910d77d-241e-4994-a808-f0a138447ffb - ebus_2024_03-csv
Found dataset: 18e0a51d-5be3-4036-856e-031c20bebe7c - ebus_2024_02-csv
Found dataset: 8dbcab91-38f2-4178-ba3b-b6bb0393f1cb - ebus_2024_01-csv
Found dataset: d98be9dd-73f6-4a17-8f49-35e26f61e946 - ebus_2023_12-csv
Found dataset: 396c0259-5f66-4752-8216-ff094e48d2d1 - ebus_2023_11-csv
Found dataset: e642cad5-78d5-4e60-a73e-dad403f45e18 - ebus_2023_10-csv
Found dataset: 7ce7c9a4-d671-4b15-9a64-83953d99dd51 - ebus_2023_09-csv
Found dataset: 1c7c0527-a40e-4ddf-b52a-f3fef953322a - ebus_2023_08-csv
Found dataset: 17ef1d04-29e9-4b5c-b65f-eb87841a4047 - ebus_2023_07-csv
Found 

In [2]:
# Search terms handed to client.search_datasets by stream_register
keywords = [
    'bus13',
    'data is not modified',
]

In [None]:
import msgpack
import blosc
import pandas as pd
from kafka import KafkaProducer
from kafka.errors import MessageSizeTooLargeError

# Configuration
BOOTSTRAP = "10.244.2.206:9092"  # Kafka bootstrap server (host:port) handed to KafkaProducer
CHUNK_SIZE = 25_000  # starting rows per message; stream_register lowers it when a message is too large
SOFT_CAP_BYTES = 950_000  # compressed-size cap per message; stay under common 1MB broker limit

def compress_data(data: dict) -> bytes:
    """Serialize ``data`` with msgpack, then compress the bytes with blosc (zstd, level 5, byte shuffle)."""
    return blosc.compress(
        msgpack.packb(data, use_bin_type=True),
        cname="zstd",
        clevel=5,
        shuffle=blosc.SHUFFLE,
    )

def _stream_dataframe(producer, topic: str, key: bytes, df, resource_url: str) -> None:
    """Send ``df`` to ``topic`` as compressed row chunks, shrinking chunks that are too large.

    Raises:
        MessageSizeTooLargeError: if a single row is still too large to send.
    """
    total_rows = len(df)
    chunk_size = CHUNK_SIZE
    min_rows = 1
    start_row = 0
    # Counts messages actually sent, so indices stay sequential even after
    # chunk_size has been reduced part-way through a file.
    chunk_index = 0

    while start_row < total_rows:
        end_row = min(start_row + chunk_size, total_rows)
        chunk = df.iloc[start_row:end_row]

        payload = {
            "values": chunk.to_dict(orient="list"),
            "stream_info": {
                "source_url": resource_url,
                "rows": int(len(chunk)),
                "cols": list(chunk.columns),
                "chunk_index": chunk_index,
                "start_row": int(start_row),
                "end_row": int(end_row - 1),
                "encoding": "msgpack+blosc(zstd5,shuffle)",
            },
        }
        blob = compress_data(payload)

        # Pre-shrink if the compressed chunk is over the conservative cap.
        # The factor is clamped to [0.10, 0.80], so the row count always drops.
        if len(blob) > SOFT_CAP_BYTES and len(chunk) > min_rows:
            ratio = (SOFT_CAP_BYTES * 0.85) / len(blob)
            new_size = max(min_rows, int(len(chunk) * max(0.10, min(0.80, ratio))))
            print(f"Chunk ~{len(blob)} bytes > cap; reducing rows {len(chunk)} → {new_size} and retrying.")
            chunk_size = new_size
            continue  # retry same offset

        try:
            # .get() blocks until the send is acknowledged or fails
            producer.send(topic, key=key, value=blob).get(timeout=30)
        except MessageSizeTooLargeError:
            if len(chunk) <= min_rows:
                # a single row is too large even after compression → cannot proceed
                raise
            # halve and retry same offset
            new_size = max(min_rows, len(chunk) // 2)
            print(f"Broker rejected message (too large). Reducing rows {len(chunk)} → {new_size} and retrying.")
            chunk_size = new_size
            continue

        print(f"Sent chunk {chunk_index} with {len(chunk)} rows (compressed: {len(blob)} bytes)")
        start_row = end_row
        chunk_index += 1


def stream_register(list_of_keywords: list):
    """Stream the CSV behind every dataset matching the keywords into Kafka.

    For each dataset returned by ``client.search_datasets`` the first
    resource's URL is loaded with ``pandas.read_csv`` and sent, in compressed
    row chunks, to a topic named after that resource. The resource URL is
    used as the message key. Datasets without resources are skipped.

    Args:
        list_of_keywords: Search terms passed to ``client.search_datasets``.

    Returns:
        The topic names that were written to, in processing order.

    Raises:
        MessageSizeTooLargeError: if a single row cannot be sent.
    """
    search_result = client.search_datasets(list_of_keywords, server="local")
    print(f"Search result count: {len(search_result)}")
    topics = []

    for dataset in search_result:
        resources = dataset["resources"]
        if not resources:
            print(f"Skipping dataset without resources: {dataset}")
            continue
        resource_url = resources[0]["url"]
        resource_name = resources[0]["name"]
        print(f"Found dataset: {resource_name} - {resource_url}")

        df = pd.read_csv(resource_url, low_memory=False)
        print(f"Loaded CSV with {len(df)} rows and {len(df.columns)} columns")

        # Kafka Producer
        producer = KafkaProducer(
            bootstrap_servers=BOOTSTRAP,
            acks="all",
            linger_ms=0,
            max_request_size=5 * 1024 * 1024,  # client cap; broker may be lower
        )
        # Always release the producer, also when sending fails, so each
        # dataset does not leave a connection and background thread behind.
        try:
            topic = resource_name
            topics.append(topic)
            _stream_dataframe(producer, topic, resource_url.encode("utf-8"), df, resource_url)
            producer.flush()
        finally:
            producer.close()

    return topics

# Stream every dataset matching `keywords`; `topics` lists the Kafka topics written to
topics = stream_register(list_of_keywords=keywords)


Search result count: 13
Found dataset: bus13_2025_03-csv - https://horel.chpc.utah.edu/data/meop/data/BUS13_2025_03.csv
Loaded CSV with 4255 rows and 27 columns
Sent chunk 0 with 4255 rows (compressed: 75999 bytes)
Found dataset: bus13_2025_02-csv - https://horel.chpc.utah.edu/data/meop/data/BUS13_2025_02.csv
Loaded CSV with 231650 rows and 27 columns
Sent chunk 0 with 25000 rows (compressed: 399062 bytes)
Sent chunk 1 with 25000 rows (compressed: 421468 bytes)
Sent chunk 2 with 25000 rows (compressed: 413221 bytes)
Sent chunk 3 with 25000 rows (compressed: 427736 bytes)
Sent chunk 4 with 25000 rows (compressed: 405341 bytes)
Sent chunk 5 with 25000 rows (compressed: 398850 bytes)
Sent chunk 6 with 25000 rows (compressed: 423835 bytes)
Sent chunk 7 with 25000 rows (compressed: 415323 bytes)
Sent chunk 8 with 25000 rows (compressed: 418961 bytes)
Sent chunk 9 with 6650 rows (compressed: 118609 bytes)
Found dataset: bus13_2025_01-csv - https://horel.chpc.utah.edu/data/meop/data/BUS13_202

In [10]:
resource_topic_file = "topics_" + date_time_now + ".txt"
print(resource_topic_file)

# Save the topic names, one per line
with open(resource_topic_file, 'w') as f:
    f.writelines(name + '\n' for name in topics)

topics_20250830_084318.txt


In [7]:
from confluent_kafka.admin import AdminClient

# CAUTION: destructive cell. It asks the broker to delete every topic in `topics`.
admin = AdminClient({'bootstrap.servers': '10.244.2.206:9092'})

# delete_topics returns a mapping of topic name -> future
fs = admin.delete_topics(topics, operation_timeout=30)

for topic_name, pending in fs.items():
    try:
        pending.result()  # raises exception if failed
        print(f"Topic '{topic_name}' deleted successfully.")
    except Exception as err:
        print(f"Failed to delete topic '{topic_name}': {err}")


Topic 'bus13_2025_03-csv' deleted successfully.
Topic 'bus13_2025_02-csv' deleted successfully.
Topic 'bus13_2025_01-csv' deleted successfully.
Topic 'bus13_2024_12-csv' deleted successfully.
Topic 'bus13_2024_11-csv' deleted successfully.
Topic 'bus13_2024_10-csv' deleted successfully.
Topic 'bus13_2024_09-csv' deleted successfully.
Topic 'bus13_2024_08-csv' deleted successfully.
Topic 'bus13_2024_07-csv' deleted successfully.
Topic 'bus13_2024_06-csv' deleted successfully.
Topic 'bus13_2024_05-csv' deleted successfully.
Topic 'bus13_2024_04-csv' deleted successfully.
Topic 'bus13_noqc_202409040000_202409050600-csv' deleted successfully.
