#  scidx_streaming URL Registration

This notebook crawls the public index at:

**https://horel.chpc.utah.edu/data/meop/data/**

It extracts file links (recursively), filters them to only include files whose **filename contains a year ≥ 2020**, and then **registers** those URLs into your **`scidx_streaming`** deployment.

> **Note:** You will need valid credentials and the correct POP/API endpoint for your `scidx_streaming` instance. This notebook includes a dry-run mode so you can verify which URLs would be registered before actually registering them.

In [None]:
# If running locally and you need these packages:
# %pip install requests beautifulsoup4 urllib3
# %pip install scidx-streaming  # uncomment if you have access to this package

import re
import queue
import urllib.parse as up
from dataclasses import dataclass
from typing import List, Set, Tuple
import requests
from bs4 import BeautifulSoup
from ndp_ep import APIClient
from scidx_streaming import StreamingClient
import os, datetime
import pandas as pd
import msgpack
import blosc
from kafka import KafkaProducer
from kafka import KafkaConsumer
from typing import Dict, Any, List
from pathlib import Path
from kafka.errors import MessageSizeTooLargeError
from dotenv import load_dotenv
import os


In [3]:
# ---- Configuration ----
load_dotenv(override=True)

# The root MEOP index to crawl
BASE_URL = os.getenv("BASE_URL")

# Only register a file if its filename contains a year >= MIN_YEAR
MIN_YEAR = 2020
MAX_YEAR = 2025

# Allowed file extensions to consider 
ALLOWED_EXTENSIONS = {'.csv', '.txt', '.json', '.nc'}

# Crawl settings
MAX_DEPTH = 3            # set higher if needed; beware of deep trees
TIMEOUT = 15             # seconds for HTTP requests
RESPECT_HOST = True      # only follow links on the same host as BASE_URL

# Registration settings
# read token from .env file
TOKEN = os.getenv("TOKEN")
API_URL = os.getenv("API_URL")
SERVER = os.getenv("SERVER")

# Kafka Configuration
KAFKA_HOST = os.getenv("KAFKA_HOST")
KAFKA_PORT = os.getenv("KAFKA_PORT")
BOOTSTRAP = f"{KAFKA_HOST}:{KAFKA_PORT}"
CHUNK_SIZE = 25_000  # starting rows per message
SOFT_CAP_BYTES = 950_000  # stay under common 1MB broker limit

# initializing ndp_ep APIClient
client = APIClient(base_url=API_URL, token=TOKEN)
streaming = StreamingClient(client)
print(f"Streaming Client initialized. User ID: {streaming.user_id}")
date_time_now = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
org_name = "ebus_data"


Streaming Client initialized. User ID: fc624925-ef09-447d-bf16-378066799275


In [None]:
# Get the list of organizations
organizations = client.list_organizations(server=SERVER)


# If the organization already exists, delete it
if org_name in organizations:
    print(f"Organization '{org_name}' already exists.")
else:
    print(f"Organization '{org_name}' does not exist. Proceeding to create it.")
    # registering organization
    org_data = {
        "name": org_name,
        "title": org_name,
        "description": "Sumaiya test organization for testing purposes",
    }
    try:
        client.register_organization(org_data,server=SERVER)
        print(f"Organization '{org_name}' registered successfully.")
    except ValueError as e:
        print(e)

In [None]:
def same_host(url_a: str, url_b: str) -> bool:
    """Return True if url_b is on the same hostname as url_a."""
    pa = up.urlparse(url_a)
    pb = up.urlparse(url_b)
    return pa.netloc.lower() == pb.netloc.lower()

def is_directory_link(href: str) -> bool:
    """Heuristic: treat trailing '/' as a directory link."""
    return href.endswith('/')

def is_allowed_file(href: str, allowed_ext: Set[str]) -> bool:
    """Return True if href ends with one of the allowed extensions."""
    path = up.urlparse(href).path
    for ext in allowed_ext:
        if path.lower().endswith(ext.lower()):
            return True
    return False

def extract_years_from_filename(url: str) -> List[int]:
    """Return all 4-digit years found in the filename part of the URL."""
    path = up.urlparse(url).path
    fname = path.split('/')[-1]
    years = re.findall(r'(?:19|20)\d{2}', fname)
    return [int(y) for y in years]

def should_register(url: str, min_year: int, max_year: int) -> bool:
    years = extract_years_from_filename(url)
    if not years:
        return False
    return min(years) >= min_year and max(years) <= max_year


In [None]:
@dataclass
class CrawlResult:
    visited_pages: Set[str]
    file_urls: List[str]

def crawl_index(start_url: str, max_depth: int = 3, timeout: int = 15, respect_host: bool = True,
                allowed_ext: Set[str] = None) -> CrawlResult:
    if allowed_ext is None:
        allowed_ext = set()
    start = up.urlparse(start_url)
    start_host = start.netloc

    visited_pages: Set[str] = set()
    collected_files: List[str] = []

    Q = queue.Queue()
    Q.put((start_url, 0))

    while not Q.empty():
        url, depth = Q.get()
        if url in visited_pages:
            continue
        visited_pages.add(url)

        try:
            r = requests.get(url, timeout=timeout)
            r.raise_for_status()
        except Exception as e:
            print(f"[WARN] Failed to fetch {url}: {e}")
            continue

        soup = BeautifulSoup(r.text, 'html.parser')
        anchors = soup.find_all('a', href=True)

        for a in anchors:
            href = up.urljoin(url, a['href'])
            if not href.startswith('http'):
                continue
            if respect_host and not same_host(start_url, href):
                continue
            if is_directory_link(href):
                if depth < max_depth:
                    Q.put((href, depth + 1))
                continue
            if is_allowed_file(href, allowed_ext):
                collected_files.append(href)

    return CrawlResult(visited_pages=visited_pages, file_urls=sorted(set(collected_files)))


In [None]:
crawl = crawl_index(
    start_url=BASE_URL,
    max_depth=MAX_DEPTH,
    timeout=TIMEOUT,
    respect_host=RESPECT_HOST,
    allowed_ext=ALLOWED_EXTENSIONS
)

print(f"Visited pages: {len(crawl.visited_pages)}")
print(f"Discovered candidate files: {len(crawl.file_urls)}\n")

In [None]:
def generate_url(url_list: List[str], min_year: int, max_year: int) -> List[str]:
    filtered_urls = [u for u in url_list if should_register(u, min_year, max_year)]
    # print(f"Files containing {min_year} <= year <= {max_year} : {len(filtered_urls)}\n")
    return filtered_urls


def generate_resource_name(url: str) -> str:
    path = up.urlparse(url).path
    fname = path.split('/')[-1] or "resource"
    # lowercase + ascii-only (drop non-ascii)
    fname = fname.encode("ascii", "ignore").decode("ascii").lower()
    # replace any disallowed char with '-'
    fname = re.sub(r'[^a-z0-9_-]+', '-', fname)
    # collapse repeats and trim separators
    fname = re.sub(r'[-_]{2,}', '-', fname).strip('-_')
    # fallback if empty after sanitization
    if not fname:
        fname = "resource"
    return fname


def generate_resource_title(url: str) -> str:
    path = up.urlparse(url).path
    fname = path.split('/')[-1]
    title = fname.replace('_', ' ').replace('.csv', '')
    return f"Sensor Data – {title}"

def generate_file_type(url: str) -> str:
    return Path(url).suffix.lstrip(".").upper() or "UNKNOWN"


def generate_description_for_file_from_url(url: str) -> str:
    parts = []

    # VEHICLE:
    # 1) BUS/TRX/TRAIN/RAIL with a required numeric id (e.g., BUS01, TRX03, TRAIN02, RAIL1)
    # VEHICLE:
    # BUS and RAIL require numeric IDs; TRX may appear with or without an ID.
    m_bus  = re.search(r'(?:^|[/_])BUS(?P<id>\d+)(?=[_.\/]|$)', url, re.IGNORECASE)
    m_trx  = re.search(r'(?:^|[/_])TRX(?P<id>\d*)(?=[_.\/]|$)', url, re.IGNORECASE)  # id optional
    m_rail = re.search(r'(?:^|[/_])RAIL(?P<id>\d+)(?=[_.\/]|$)', url, re.IGNORECASE)

    if m_bus:
        parts.append(f"Vehicle: Bus {m_bus.group('id')}")
    elif m_trx:
        tid = m_trx.group('id')
        parts.append(f"Vehicle: Train {tid}" if tid else "Vehicle: Train")
    elif m_rail:
        parts.append(f"Vehicle: Rail {m_rail.group('id')}")
    else:
        # EBUS: optional number; if digits follow, include them
        m = re.search(r'(?:^|[/_])EBUS(?P<id>\d*)(?=[_.\/]|$)', url, re.IGNORECASE)
        if m:
            eid = m.group('id')
            parts.append(f"Vehicle: E-bus{(' ' + eid) if eid else ''}")
        else:
            parts.append("Vehicle: Unknown")

    # DATE: try exact range first (YYYYMMDDHHMM_YYYYMMDDHHMM), then monthly (YYYY_MM)
    m_range = re.search(r'_(\d{12})_(\d{12})(?=[^0-9]|$)', url)
    m_month = re.search(r'_(\d{4})_(\d{2})(?=[^0-9]|$)', url)

    if m_range:
        start, end = m_range.groups()
        start_fmt = f"{start[:4]}-{start[4:6]}-{start[6:8]} {start[8:10]}:{start[10:12]}"
        end_fmt   = f"{end[:4]}-{end[4:6]}-{end[6:8]} {end[8:10]}:{end[10:12]}"
        parts.append(f"Data period: {start_fmt} → {end_fmt}")
    elif m_month:
        year, month = m_month.groups()
        parts.append(f"Data period: {year}-{month}")
    else:
        parts.append("Data period: Unknown")

    # FILE TYPE
    parts.append(f"File type: {generate_file_type(url)}")

    # FLAGS
    low = url.lower()
    if "noqc" in low:
        parts.append("File marked 'noqc' (no quality control).")
    if re.search(r'(^|[/_])min([_/\.]|$)', low):
        parts.append("File marked 'min' appears to be minute-resolution of data.")
    if "/meop/" in low:
        parts.append("File marked 'meop' (Mobile Environment Observation Platform) where sensors are attached to UTA.")

    # PROCESSING LEVEL (Level 2 / Level 3)
    if re.search(r'(?<![a-z0-9])level[-_]?2(?![a-z0-9])', low):
        parts.append("Data processing level: Level 2 (modified on raw data)")
    elif re.search(r'(?<![a-z0-9])level[-_]?3(?![a-z0-9])', low):
        parts.append("Data processing level: Level 3 (modified on Level 2 data)")
    else:
        parts.append("Data processing level: data is not modified.")

    return f"This dataset is available at {url}. " + " ".join(parts)



def generate_payloads(filtered_urls: List[str]) -> List[Dict[str, Any]]:
    return [
        {
            'resource_name': generate_resource_name(u),
            'resource_title': generate_resource_title(u),
            'type': 'url',
            'resource_url': u,
            'notes': generate_description_for_file_from_url(u),
            'file_type': generate_file_type(u),
            'owner_org': org_name,
        }
        for u in filtered_urls
    ]

def register_in_scidx(payloads) -> List[str]:
    """Register URL-based data objects in scidx_streaming.
    Replace the body with your actual scidx_streaming client calls.
    """
    ids = []
    
    for meta in payloads:
        try:
            response = client.register_url(meta, server=SERVER)
            print(response)
            ids.append(response["id"])
        except Exception as e:
            print(str(e)) 
    return ids


In [None]:
# Read URLs from text file
with open('urls_20250828_094831.txt', 'r') as file:
    filtered_urls = [line.strip() for line in file if line.strip()]

print(f"Loaded {len(filtered_urls)} URLs from file")

In [None]:
filtered_urls = generate_url(crawl.file_urls, MIN_YEAR, MAX_YEAR)

In [None]:
payloads = generate_payloads(filtered_urls)
for payload in payloads:
    print(payload["notes"])

In [None]:
ids = register_in_scidx(payloads)

In [None]:
# resource_url_file = "urls_" + date_time_now + ".txt"
# print(resource_url_file)

# with open(resource_url_file, 'w') as f:
#     for url in filtered_urls:
#         f.write(url + '\n')

# payload_file = "payloads_" + date_time_now + ".txt"
# print(payload_file)

# with open(payload_file, 'w') as f:
#     for payload in payloads:
#         f.write(str(payload) + '\n')

# resource_name_file = "names_" + date_time_now + ".txt"
# print(resource_name_file)

# with open(resource_name_file, 'w') as f:
#     for meta in payloads:
#         f.write(meta["resource_name"] + '\n')

# resource_id_file = "ids_" + date_time_now + ".txt"
# print(resource_id_file)

# with open(resource_id_file, 'w') as f:
#         for id in ids:
#             f.write(id + '\n')



In [None]:
search_result = client.search_datasets([org_name],server=SERVER)

for dataset in search_result:
    print(f"Found dataset: {dataset['id']} - {dataset['name']}")
    if dataset["owner_org"] == org_name:
        client.delete_resource_by_name(dataset["name"], server=SERVER)

In [None]:
keywords = ['bus13', 'data is not modified']

In [None]:
def compress_data(data: dict) -> bytes:
    packed = msgpack.packb(data, use_bin_type=True)
    return blosc.compress(packed, cname="zstd", clevel=5, shuffle=blosc.SHUFFLE)

def stream_register(list_of_keywords: list):
    search_result = client.search_datasets(list_of_keywords, server=SERVER)
    print(f"Search result count: {len(search_result)}")
    topics = []

    for dataset in search_result:
        resource_id = dataset["id"]
        resource_url = dataset["resources"][0]["url"]
        resource_name = dataset["resources"][0]["name"]
        print(f"Found dataset: {resource_name} - {resource_url}")

        df = pd.read_csv(resource_url, low_memory=False)
        total_rows = len(df)
        print(f"Loaded CSV with {total_rows} rows and {len(df.columns)} columns")

        # Kafka Producer
        producer = KafkaProducer(
            bootstrap_servers=BOOTSTRAP,
            acks="all",
            linger_ms=0,
            max_request_size=5 * 1024 * 1024,  # client cap; broker may be lower
        )

        key = resource_url.encode("utf-8")
        topic = resource_name
        topics.append(topic)

        # ----- adaptive loop (replaces the for-range loop) -----
        i = 0
        chunk_size = CHUNK_SIZE
        min_rows = 1

        while i < total_rows:
            j = min(i + chunk_size, total_rows)
            chunk = df.iloc[i:j]

            payload = {
                "values": chunk.to_dict(orient="list"),
                "stream_info": {
                    "source_url": resource_url,
                    "rows": int(len(chunk)),
                    "cols": list(chunk.columns),
                    "chunk_index": int(i // max(1, chunk_size)),
                    "start_row": int(i),
                    "end_row": int(j - 1),
                    "encoding": "msgpack+blosc(zstd5,shuffle)",
                },
            }
            blob = compress_data(payload)

            # pre-shrink if we're near/over a conservative cap
            if len(blob) > SOFT_CAP_BYTES and len(chunk) > min_rows:
                ratio = (SOFT_CAP_BYTES * 0.85) / len(blob)
                new_size = max(min_rows, int(len(chunk) * max(0.10, min(0.80, ratio))))
                print(f"Chunk ~{len(blob)} bytes > cap; reducing rows {len(chunk)} → {new_size} and retrying.")
                chunk_size = new_size
                continue  # retry same offset

            try:
                producer.send(topic, key=key, value=blob).get(timeout=30)
                print(f"Sent chunk {i // max(1, chunk_size)} with {len(chunk)} rows (compressed: {len(blob)} bytes)")
                i = j  # advance
            except MessageSizeTooLargeError:
                if len(chunk) <= min_rows:
                    # a single row is too large even after compression → cannot proceed
                    raise
                # halve and retry same offset
                new_size = max(min_rows, len(chunk) // 2)
                print(f"Broker rejected message (too large). Reducing rows {len(chunk)} → {new_size} and retrying.")
                chunk_size = new_size
                # loop continues with same i

        producer.flush()

        payload = {
            "topic": topic,
            "status": "active",
            "format": "stream",
            "url": BOOTSTRAP,
            "description": f"Kafka stream for topic {topic}. This is a general stream without any filters.",
            "name": f"stream_dataset {topic}"
        }
        
        patch_response = client.patch_general_dataset(
            dataset_id=resource_id,
            server=SERVER,
            data={"resources": [payload]}
        )
        print(f"Dataset Id: {resource_id}, Dataset Name: {resource_name}, Patch response: {patch_response}")
        # producer.close()  # uncomment if you want to close per dataset
        

    return topics

# Example call
topics = stream_register(keywords)


In [None]:
resource_topic_file = "topics_" + date_time_now + ".txt"
print(resource_topic_file)

with open(resource_topic_file, 'w') as f:
    for topic in topics:
        f.write(topic + '\n')

In [None]:
def try_decompress(blob: bytes):
    try:
        unpacked = blosc.decompress(blob)
        return msgpack.unpackb(unpacked, raw=False)
    except Exception:
        return None  # Not a compressed binary message

def stream_consumption(topics: List[str]):
    for topic in topics:
        print(f"Listening to Kafka topic {topic}")
        consumer = KafkaConsumer(
            topic,
            bootstrap_servers=f"{BOOTSTRAP}",
            auto_offset_reset='earliest',
            group_id=None,
            value_deserializer=lambda x: x  # Raw bytes; decode manually
        )

        seen_chunks = set()
        for message in consumer:
            raw = message.value

            # Try decompressing as binary
            data = try_decompress(raw)
            if data:
                info = data.get("stream_info", {})
                print(info)
                print(f"Source: {info.get('source_url', 'N/A')}")
                print(f"\n Binary chunk received: {info.get('rows', '?')} rows")
                print(f"Columns: {info.get('cols', '?')}")
                chunk_index = info.get('chunk_index', 'N/A')
                print(f"Chunk index: {chunk_index}")
                if chunk_index in seen_chunks:
                    print("Duplicate chunk index detected; stopping consumption.")
                    break
                seen_chunks.add(chunk_index)

                preview = list(zip(*data["values"].values()))[:3]
                for row in preview:
                    print("→", row)
            else:
                # Fallback: treat as plain UTF-8 text
                try:
                    text = raw.decode("utf-8")
                    print(f"Text message: {text}")
                except UnicodeDecodeError as e:
                    print(f"Unrecognized message format: {e}")

# read topics from topic file
with open("topics_20250830_090935.txt", "r") as f:
    topics = [line.strip() for line in f.readlines()]
stream_consumption([topics[1]])

Listening to Kafka topic bus13_2025_02-csv
{'source_url': 'https://horel.chpc.utah.edu/data/meop/data/BUS13_2025_02.csv', 'rows': 25000, 'cols': ['Timestamp', 'Latitude', 'Longitude', 'Elevation', 'GPS_Speed', 'GPS_Direction', 'GPS_RMC_Valid', 'Battery_Voltage', 'Bus_Box_Temperature', 'Bus_Top_Temperature', 'Bus_Top_Relative_Humidity', 'ES405_PM1_Concentration', 'ES405_PM2.5_Concentration', 'ES405_PM4_Concentration', 'ES405_PM10_Concentration', 'ES405_Air_Flow_Rate', 'ES405_Internal_Air_Temperature', 'ES405_Internal_Relative_Humidity', 'ES405_Internal_Air_Pressure', 'ES405_Error_Code', '2B_Ozone_Concentration', '2B_Air_Flow_Rate', '2B_Internal_Air_Temperature', '2B_Internal_Air_Pressure', 'PM2.5_Data_Flagged', 'Ozone_Data_Flagged', 'GPS_Data_Flagged'], 'chunk_index': 0, 'start_row': 0, 'end_row': 24999, 'encoding': 'msgpack+blosc(zstd5,shuffle)'}
Source: https://horel.chpc.utah.edu/data/meop/data/BUS13_2025_02.csv

 Binary chunk received: 25000 rows
Columns: ['Timestamp', 'Latitude', '

In [7]:
keywords = ["stream_dataset","bus13"]

In [8]:
result=client.search_datasets(keywords,server=SERVER)

for dataset in result:
    print(f"Found dataset: {dataset['id']} - {dataset['name']}")


Found dataset: f78a704c-6753-4784-ba0a-468371ab88e6 - bus13_2025_03-csv
Found dataset: 6b3ebce3-dc4c-41bf-aae0-2bc2e5d1165f - bus13_2025_02-csv
Found dataset: 646e2c90-cb81-4d6e-a631-6e191d7b0aca - bus13_2025_01-csv
Found dataset: 680d4ea7-118b-4578-996e-e36b7023c237 - bus13_2024_12-csv
Found dataset: 11c70de5-61b4-49fb-9d04-8347f41099c6 - bus13_2024_11-csv
Found dataset: 9f5accf8-e78d-46a8-a493-e974dbcfb7f9 - bus13_2024_10-csv
Found dataset: 144ae773-eb90-4296-9ed3-e63ba718fe16 - bus13_2024_09-csv
Found dataset: 89778dab-ce08-4ddf-b970-ce660431a974 - bus13_2024_08-csv
Found dataset: 04ab5eea-9de2-4727-8b42-8b2eae008d48 - bus13_2024_07-csv
Found dataset: c0337729-05e6-4442-a68e-d4b194f10349 - bus13_2024_06-csv
Found dataset: 2a9b3705-49f7-439e-a28a-76ddc59ecc1a - bus13_2024_05-csv
Found dataset: 62a51e2d-9af2-4b66-a0ab-e92c96c282f4 - bus13_2024_04-csv
Found dataset: 3ba199c8-f179-44ab-8d10-adfe1abca6b2 - bus13_noqc_202409040000_202409050600-csv


In [9]:
stream = await streaming.create_kafka_stream(
    keywords=keywords,
    match_all=True,
    filter_semantics=[]
)

topic = stream.data_stream_id
print(f"Stream created: {topic}")
# Start consuming the filtered Kafka stream
consumer = streaming.consume_kafka_messages(topic)


Stream created: data_stream_fc624925-ef09-447d-bf16-378066799275_2


Error decompressing message: int is not allowed for map key when strict_map_key=True
Decompression failed: int is not allowed for map key when strict_map_key=True. Attempting UTF-8 decode.
Failed to decode message: 'utf-8' codec can't decode byte 0x91 in position 2: invalid start byte
Error decompressing message: int is not allowed for map key when strict_map_key=True
Decompression failed: int is not allowed for map key when strict_map_key=True. Attempting UTF-8 decode.
Failed to decode message: 'utf-8' codec can't decode byte 0x91 in position 2: invalid start byte
Error decompressing message: int is not allowed for map key when strict_map_key=True
Decompression failed: int is not allowed for map key when strict_map_key=True. Attempting UTF-8 decode.
Failed to decode message: 'utf-8' codec can't decode byte 0x91 in position 2: invalid start byte
Error decompressing message: int is not allowed for map key when strict_map_key=True
Decompression failed: int is not allowed for map key when

In [10]:
# Get the data from the consumer
df=consumer.dataframe
df = pd.DataFrame(df)
df = pd.DataFrame(df.iloc[0].to_dict())
df.reset_index(drop=True, inplace=True)

print(df)

IndexError: single positional indexer is out-of-bounds

In [None]:
from confluent_kafka.admin import AdminClient

admin = AdminClient({'bootstrap.servers': '10.244.2.206:9092'})


fs = admin.delete_topics(topics, operation_timeout=30)

for topic, f in fs.items():
    try:
        f.result()  # raises exception if failed
        print(f"Topic '{topic}' deleted successfully.")
    except Exception as e:
        print(f"Failed to delete topic '{topic}': {e}")
