#  scidx_streaming URL Registration

This notebook crawls the public index at:

**https://horel.chpc.utah.edu/data/meop/data/**

It extracts file links (recursively), keeps only files with an allowed extension whose **filename contains a year within the configured range (`MIN_YEAR`–`MAX_YEAR`, 2020–2025 by default)**, and then **registers** those URLs into your **`scidx_streaming`** deployment.

> **Note:** You will need valid credentials and the correct POP/API endpoint for your `scidx_streaming` instance. This notebook includes a dry-run mode so you can verify which URLs would be registered before actually registering them.

In [None]:
import re
import queue
import urllib.parse as up
from dataclasses import dataclass
from typing import List, Set, Tuple
import requests
from bs4 import BeautifulSoup
from ndp_ep import APIClient
from scidx_streaming import StreamingClient
import os, datetime
import pandas as pd
import msgpack
import blosc
from kafka import KafkaProducer
from kafka import KafkaConsumer
from typing import Dict, Any, List
from pathlib import Path
from kafka.errors import MessageSizeTooLargeError
from dotenv import load_dotenv
import os


In [None]:
# ---- Configuration ----
# Load settings from a local .env file into the environment before reading them
# (load_dotenv is called with override=True).
load_dotenv(override=True)

# The root index URL to crawl, read from the BASE_URL environment variable.
# os.getenv returns None when the variable is not set.
BASE_URL = os.getenv("BASE_URL")

# Only register a file if every year found in its filename lies within the
# inclusive range [MIN_YEAR, MAX_YEAR] (see should_register).
MIN_YEAR = 2020
MAX_YEAR = 2025

# Allowed file extensions to consider (matched case-insensitively by is_allowed_file)
ALLOWED_EXTENSIONS = {'.csv', '.txt', '.json', '.nc'}

# Crawl settings
MAX_DEPTH = 3            # set higher if needed; beware of deep trees
TIMEOUT = 15             # seconds for HTTP requests
RESPECT_HOST = True      # only follow links on the same host as BASE_URL

# Registration settings
# Token, API URL and server are read from the environment (.env file);
# each is None when the corresponding variable is missing.
TOKEN = os.getenv("TOKEN")
API_URL = os.getenv("API_URL")
SERVER = os.getenv("SERVER")

# Kafka Configuration
# NOTE(review): if KAFKA_HOST / KAFKA_PORT are unset, BOOTSTRAP becomes the
# literal string "None:None" -- confirm the .env file defines both.
KAFKA_HOST = os.getenv("KAFKA_HOST")
KAFKA_PORT = os.getenv("KAFKA_PORT")
BOOTSTRAP = f"{KAFKA_HOST}:{KAFKA_PORT}"
CHUNK_SIZE = 25_000  # starting rows per message
SOFT_CAP_BYTES = 950_000  # stay under common 1MB broker limit

# Initialize the ndp_ep APIClient and wrap it in a StreamingClient
client = APIClient(base_url=API_URL, token=TOKEN)
streaming = StreamingClient(client)
print(f"Streaming Client initialized. User ID: {streaming.user_id}")
# Timestamp string (YYYYMMDD_HHMMSS, local time) captured when this cell runs
date_time_now = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
# Name of the organization that registered resources are assigned to (owner_org)
org_name = "ebus_data"


Streaming Client initialized. User ID: fc624925-ef09-447d-bf16-378066799275


In [64]:
# Get the list of organizations
# NOTE(review): the membership test below assumes this returns a collection
# of organization names -- confirm against the ndp_ep client documentation.
organizations = client.list_organizations(server=SERVER)


# If the organization already exists, leave it untouched; otherwise register it
if org_name in organizations:
    print(f"Organization '{org_name}' already exists.")
else:
    print(f"Organization '{org_name}' does not exist. Proceeding to create it.")
    # registering organization
    org_data = {
        "name": org_name,
        "title": org_name,
        "description": "Sumaiya test organization for testing purposes",
    }
    try:
        client.register_organization(org_data,server=SERVER)
        print(f"Organization '{org_name}' registered successfully.")
    except ValueError as e:
        # Only ValueError is handled here; it is printed and the cell continues
        print(e)

Organization 'kafka_stream' does not exist. Proceeding to create it.
Organization 'kafka_stream' registered successfully.


In [None]:
def same_host(url_a: str, url_b: str) -> bool:
    """Tell whether two URLs share the same network location.

    The comparison is case-insensitive and uses the full ``netloc``
    component of each URL, so an explicit port (or credentials) is part of
    what is compared.
    """
    netloc_a, netloc_b = (up.urlparse(candidate).netloc.lower() for candidate in (url_a, url_b))
    return netloc_a == netloc_b

def is_directory_link(href: str) -> bool:
    """Heuristic directory test: a link counts as a directory when its last character is '/'."""
    last_char = href[-1:]  # empty string for an empty href, so the test is simply False
    return last_char == '/'

def is_allowed_file(href: str, allowed_ext: Set[str]) -> bool:
    """Check whether the URL's path ends with any extension in ``allowed_ext``.

    Only the path component is examined (query string and fragment are
    ignored), and the match is case-insensitive. An empty ``allowed_ext``
    never matches.
    """
    lowered_path = up.urlparse(href).path.lower()
    return any(lowered_path.endswith(extension.lower()) for extension in allowed_ext)

def extract_years_from_filename(url: str) -> List[int]:
    """Return the 4-digit years (19xx / 20xx) found in the filename part of the URL.

    A year is only recognised at the *start* of a run of digits. This keeps
    the year of compact timestamps such as ``202012201930`` (-> 2020) while
    ignoring year-looking digits buried inside them (the ``2019`` formed by
    the day and hour digits), which would otherwise make year-range filters
    reject valid files.

    Args:
        url: URL (or plain path) whose last path segment is inspected.

    Returns:
        The years in order of appearance; an empty list when none is found.
    """
    path = up.urlparse(url).path
    fname = path.split('/')[-1]
    # (?<!\d) -> the match must not be preceded by another digit, i.e. it
    # starts a digit run instead of sitting in the middle of one.
    years = re.findall(r'(?<!\d)(?:19|20)\d{2}', fname)
    return [int(y) for y in years]

def should_register(url: str, min_year: int, max_year: int) -> bool:
    """Decide whether a file URL qualifies for registration.

    A URL qualifies when its filename contains at least one year and every
    year found lies within the inclusive range ``[min_year, max_year]``.
    """
    found_years = extract_years_from_filename(url)
    # No year in the filename -> never register.
    has_years = bool(found_years)
    return has_years and min_year <= min(found_years) and max(found_years) <= max_year


In [25]:
@dataclass
class CrawlResult:
    """Outcome of a crawl: the index pages that were processed and the file links found."""
    # URLs of pages taken from the crawl queue (recorded before the fetch is attempted)
    visited_pages: Set[str]
    # De-duplicated, sorted URLs of linked files that have an allowed extension
    file_urls: List[str]

def crawl_index(start_url: str, max_depth: int = 3, timeout: int = 15, respect_host: bool = True,
                allowed_ext: Set[str] = None, stay_in_subtree: bool = True) -> CrawlResult:
    """Breadth-first crawl of an HTML directory index, collecting file links.

    Each fetched page is parsed for ``<a href>`` links. Links ending in '/'
    are treated as directories and queued for crawling (up to ``max_depth``
    levels below the start page); other links are collected when they end
    with one of ``allowed_ext``.

    Args:
        start_url: URL of the index page to start from.
        max_depth: Maximum number of directory levels to descend.
        timeout: Per-request timeout in seconds.
        respect_host: When True, ignore links whose host differs from
            ``start_url``'s.
        allowed_ext: File extensions to collect. ``None`` is treated as an
            empty set, in which case no files are collected.
        stay_in_subtree: When True (default), same-host directory links that
            point outside the starting directory are not followed. Directory
            listings usually contain a "Parent Directory" link; following it
            would make the crawl climb out of the requested tree. Pass False
            to restore unrestricted same-host crawling.

    Returns:
        CrawlResult with the set of processed page URLs and the sorted,
        de-duplicated list of collected file URLs.

    Raises:
        ValueError: If ``start_url`` is missing or not a string (for example
            when the BASE_URL environment variable is not set).
    """
    if not start_url or not isinstance(start_url, str):
        raise ValueError("start_url must be a non-empty URL string (is BASE_URL set?)")
    if allowed_ext is None:
        allowed_ext = set()

    start = up.urlparse(start_url)
    root_netloc = start.netloc.lower()
    # Directory part of the starting path, always ending in '/'.
    root_path = start.path.rsplit('/', 1)[0] + '/'

    def leaves_subtree(link: str) -> bool:
        """True for a same-host link whose path is outside the starting directory."""
        parsed = up.urlparse(link)
        return parsed.netloc.lower() == root_netloc and not parsed.path.startswith(root_path)

    visited_pages: Set[str] = set()
    collected_files: List[str] = []

    pending = queue.Queue()
    pending.put((start_url, 0))

    while not pending.empty():
        url, depth = pending.get()
        if url in visited_pages:
            continue
        visited_pages.add(url)

        # Best-effort crawl: a page that cannot be fetched is reported and skipped.
        try:
            r = requests.get(url, timeout=timeout)
            r.raise_for_status()
        except Exception as e:
            print(f"[WARN] Failed to fetch {url}: {e}")
            continue

        soup = BeautifulSoup(r.text, 'html.parser')

        for a in soup.find_all('a', href=True):
            # Resolve relative links against the page they appear on.
            href = up.urljoin(url, a['href'])
            if not href.startswith('http'):
                continue
            if respect_host and not same_host(start_url, href):
                continue
            if is_directory_link(href):
                if stay_in_subtree and leaves_subtree(href):
                    continue
                if depth < max_depth:
                    pending.put((href, depth + 1))
                continue
            if is_allowed_file(href, allowed_ext):
                collected_files.append(href)

    return CrawlResult(visited_pages=visited_pages, file_urls=sorted(set(collected_files)))


In [None]:
# Crawl the index starting at BASE_URL using the settings from the
# configuration cell. This performs live HTTP requests.
crawl = crawl_index(
    start_url=BASE_URL,
    max_depth=MAX_DEPTH,
    timeout=TIMEOUT,
    respect_host=RESPECT_HOST,
    allowed_ext=ALLOWED_EXTENSIONS
)

# Summary of the crawl: pages processed and candidate files before year filtering
print(f"Visited pages: {len(crawl.visited_pages)}")
print(f"Discovered candidate files: {len(crawl.file_urls)}\n")

In [23]:
def generate_url(url_list: List[str], min_year: int, max_year: int) -> List[str]:
    """Keep only the URLs that pass ``should_register`` for the given year range.

    The input order is preserved and ``url_list`` is not modified.
    """
    selected: List[str] = []
    for candidate in url_list:
        if should_register(candidate, min_year, max_year):
            selected.append(candidate)
    return selected


def generate_resource_name(url: str) -> str:
    """Derive a slug-style resource name from the last path segment of a URL.

    The filename is lower-cased, non-ASCII characters are dropped, every run
    of characters outside ``a-z``, ``0-9``, ``_`` and ``-`` becomes a single
    '-', runs of two or more separators collapse to one '-', and leading or
    trailing separators are removed. ``"resource"`` is returned when nothing
    usable remains.
    """
    last_segment = up.urlparse(url).path.rpartition('/')[2]
    if not last_segment:
        last_segment = "resource"

    # ASCII-only, lower-case form of the filename
    ascii_lower = last_segment.encode("ascii", "ignore").decode("ascii").lower()

    # Disallowed characters -> '-', then squeeze repeated separators
    dashed = re.sub(r'[^a-z0-9_-]+', '-', ascii_lower)
    squeezed = re.sub(r'[-_]{2,}', '-', dashed)
    slug = squeezed.strip('-_')

    return slug if slug else "resource"


def generate_resource_title(url: str) -> str:
    """Build a display title from the filename part of a URL.

    The trailing file extension is removed whatever its type or letter case
    (not only a lower-case ``.csv``), and underscores are shown as spaces.
    A trailing ``.something`` is only treated as an extension when it is
    purely alphanumeric, so names such as ``v1.2_data`` are left intact.

    Args:
        url: URL (or plain path) of the file.

    Returns:
        The title, prefixed with ``"Sensor Data – "``.
    """
    path = up.urlparse(url).path
    fname = path.split('/')[-1]
    stem, ext = os.path.splitext(fname)
    # ext includes the leading dot; ''.isalnum() is False, so names without an
    # extension fall through unchanged.
    if ext[1:].isalnum():
        fname = stem
    title = fname.replace('_', ' ')
    return f"Sensor Data – {title}"

def generate_file_type(url: str) -> str:
    """Return the upper-cased file extension of a URL, or ``"UNKNOWN"``.

    Only the path component of the URL is inspected, so a query string,
    fragment or the host name (e.g. ``.com``) is never mistaken for a file
    extension.

    Args:
        url: URL (or plain path) of the file.

    Returns:
        The extension without the leading dot in upper case (``"CSV"``), or
        ``"UNKNOWN"`` when the path has no extension.
    """
    url_path = up.urlparse(url).path
    return Path(url_path).suffix.lstrip(".").upper() or "UNKNOWN"


def generate_description_for_file_from_url(url: str) -> str:
    """Compose a one-line description of a data file from patterns in its URL.

    The description always starts with the URL itself, followed by (in order):
    the vehicle, the data period, the file type, any flags found in the URL
    ('noqc', 'min', '/meop/') and the data processing level.
    """

    def _format_stamp(digits: str) -> str:
        """Render a 12-digit YYYYMMDDHHMM stamp as 'YYYY-MM-DD HH:MM'."""
        return f"{digits[:4]}-{digits[4:6]}-{digits[6:8]} {digits[8:10]}:{digits[10:12]}"

    # --- Vehicle -----------------------------------------------------------
    # Checked in priority order: BUS<id>, TRX[<id>], RAIL<id>, then EBUS[<id>].
    # BUS and RAIL need a numeric id; the id is optional for TRX and EBUS.
    bus_match = re.search(r'(?:^|[/_])BUS(?P<id>\d+)(?=[_.\/]|$)', url, re.IGNORECASE)
    if bus_match:
        vehicle = f"Vehicle: Bus {bus_match.group('id')}"
    else:
        trx_match = re.search(r'(?:^|[/_])TRX(?P<id>\d*)(?=[_.\/]|$)', url, re.IGNORECASE)
        if trx_match:
            trx_id = trx_match.group('id')
            vehicle = f"Vehicle: Train {trx_id}" if trx_id else "Vehicle: Train"
        else:
            rail_match = re.search(r'(?:^|[/_])RAIL(?P<id>\d+)(?=[_.\/]|$)', url, re.IGNORECASE)
            if rail_match:
                vehicle = f"Vehicle: Rail {rail_match.group('id')}"
            else:
                ebus_match = re.search(r'(?:^|[/_])EBUS(?P<id>\d*)(?=[_.\/]|$)', url, re.IGNORECASE)
                if ebus_match is None:
                    vehicle = "Vehicle: Unknown"
                else:
                    ebus_id = ebus_match.group('id')
                    vehicle = "Vehicle: E-bus" + (' ' + ebus_id if ebus_id else '')
    sentences = [vehicle]

    # --- Data period ---------------------------------------------------------
    # An exact range (YYYYMMDDHHMM_YYYYMMDDHHMM) wins over a monthly stamp (YYYY_MM).
    range_match = re.search(r'_(\d{12})_(\d{12})(?=[^0-9]|$)', url)
    if range_match:
        first_stamp, last_stamp = range_match.groups()
        start_fmt = _format_stamp(first_stamp)
        end_fmt = _format_stamp(last_stamp)
        sentences.append(f"Data period: {start_fmt} → {end_fmt}")
    else:
        month_match = re.search(r'_(\d{4})_(\d{2})(?=[^0-9]|$)', url)
        if month_match:
            year, month = month_match.groups()
            sentences.append(f"Data period: {year}-{month}")
        else:
            sentences.append("Data period: Unknown")

    # --- File type -----------------------------------------------------------
    sentences.append(f"File type: {generate_file_type(url)}")

    # --- Flags (matched on the lower-cased URL) ------------------------------
    lowered = url.lower()
    flag_notes = (
        ("noqc" in lowered,
         "File marked 'noqc' (no quality control)."),
        (re.search(r'(^|[/_])min([_/\.]|$)', lowered) is not None,
         "File marked 'min' appears to be minute-resolution of data."),
        ("/meop/" in lowered,
         "File marked 'meop' (Mobile Environment Observation Platform) where sensors are attached to UTA."),
    )
    sentences.extend(note for present, note in flag_notes if present)

    # --- Processing level ----------------------------------------------------
    # 'level2' / 'level-2' / 'level_2' (likewise 3), not embedded in a longer token.
    if re.search(r'(?<![a-z0-9])level[-_]?2(?![a-z0-9])', lowered):
        level_note = "Data processing level: Level 2 (modified on raw data)"
    elif re.search(r'(?<![a-z0-9])level[-_]?3(?![a-z0-9])', lowered):
        level_note = "Data processing level: Level 3 (modified on Level 2 data)"
    else:
        level_note = "Data processing level: data is not modified."
    sentences.append(level_note)

    return f"This dataset is available at {url}. " + " ".join(sentences)



def generate_payloads(filtered_urls: List[str]) -> List[Dict[str, Any]]:
    """Build one registration payload (a dict) per URL, in input order.

    Each payload carries the generated name, title, description and file
    type for the URL, the URL itself, the fixed type ``'url'`` and the
    notebook-level ``org_name`` as owner organization.
    """
    payloads: List[Dict[str, Any]] = []
    for file_url in filtered_urls:
        payload = dict(
            resource_name=generate_resource_name(file_url),
            resource_title=generate_resource_title(file_url),
            type='url',
            resource_url=file_url,
            notes=generate_description_for_file_from_url(file_url),
            file_type=generate_file_type(file_url),
            owner_org=org_name,
        )
        payloads.append(payload)
    return payloads

def register_in_scidx(payloads) -> List[str]:
    """Register each payload as a URL resource and collect the returned ids.

    Every payload is sent with ``client.register_url(meta, server=SERVER)``.
    Registration is best-effort: a failing payload is reported (with the
    resource it belongs to and the error type) and skipped, and the loop
    continues with the next one.

    Args:
        payloads: Iterable of payload dicts as produced by ``generate_payloads``.

    Returns:
        The ``"id"`` values of the successfully registered resources, in order.
    """
    ids = []

    for meta in payloads:
        # Label used only in the failure message so the offending resource is identifiable.
        label = meta.get('resource_name', '<unnamed>') if isinstance(meta, dict) else repr(meta)
        try:
            response = client.register_url(meta, server=SERVER)
            print(response)
            ids.append(response["id"])
        except Exception as e:
            # Include the exception type: e.g. a missing "id" key would otherwise print just 'id'.
            print(f"[WARN] Failed to register {label}: {type(e).__name__}: {e}")
    return ids


In [1]:
# Reuse a previously saved URL list when the file exists; otherwise build the
# list from the crawl results produced by the crawl cell above.
urls_file = Path('urls_20250828_094831.txt')

if urls_file.is_file():
    # One URL per line; blank lines are ignored.
    with urls_file.open('r') as file:
        filtered_urls = [line.strip() for line in file if line.strip()]
    print(f"Loaded {len(filtered_urls)} URLs from file")
else:
    filtered_urls = generate_url(crawl.file_urls, MIN_YEAR, MAX_YEAR)
    print(f"Generated {len(filtered_urls)} URLs from crawl results")

Loaded 880 URLs from file


In [None]:
# Build one registration payload per filtered URL (no network calls happen here)
payloads = generate_payloads(filtered_urls)

In [30]:
# Register every payload; each response is printed and the returned ids are kept in `ids`
ids = register_in_scidx(payloads)

{'id': '54f8ea14-ca7f-49ca-9246-2ff827b6d121'}
{'id': '8b69369c-ca8f-4824-8b90-36d283abac1e'}
{'id': '275468bb-237b-4a43-881c-5a3b1115f7cb'}
{'id': '612f1a74-48e5-4ba4-ac40-d32a7fdfa4aa'}
{'id': 'fb4e8b42-7b3b-480a-a050-5b2c05956ac7'}
{'id': '4dba95ec-0e6c-48a3-827a-ee134a0724e2'}
{'id': '5a9cdb00-cabd-4b06-953c-b61778db8196'}
{'id': 'a3cb75d2-c28e-4a91-9522-7ed6588caa26'}
{'id': '6bba3278-2437-44d8-911d-caeac723763d'}
{'id': '0487cf85-a3d4-4f7a-86a1-ed1de5511d25'}
{'id': 'c2f8ff13-1c24-4a3d-a283-0b5b7427e1c3'}
{'id': '2721e078-2ff1-4ba0-bd02-c7bd28bc2176'}
{'id': '10427b0f-25d2-48db-b388-af817ddedc84'}
{'id': 'ac2023a5-1a3f-4299-8980-eb0b29423484'}
{'id': '177bec3d-a5f0-4d0f-9873-7d3e8cd367a7'}
{'id': '7e63d3f5-21de-4c87-9887-978411c9425f'}
{'id': '4af50679-b881-417b-b396-a4d5edb84ce5'}
{'id': 'f0af4d0c-1f40-4919-a25a-a80422f3fcc4'}
{'id': '9481f95e-d592-4cf1-aae7-89910775dad8'}
{'id': '6992fac7-992e-4c3f-93ca-770595115fe9'}
{'id': 'a2eef579-9d21-4daa-ba5d-c5dfec3abd02'}
{'id': '81b40