#  scidx_streaming URL Registration

This notebook crawls the public index at:

**https://horel.chpc.utah.edu/data/meop/data/**

It extracts file links (recursively), filters them to only include files whose **filename contains a year ≥ 2020**, and then **registers** those URLs into your **`scidx_streaming`** deployment.

> **Note:** You will need valid credentials and the correct POP/API endpoint for your `scidx_streaming` instance. The configuration cell reads them from a `.env` file (`TOKEN`, `API_URL`, `SERVER`, `KAFKA_HOST`, `KAFKA_PORT`). This notebook includes a dry-run mode so you can verify which URLs would be registered before actually registering them.

In [2]:
from typing import List
from ndp_ep import APIClient
import urllib.parse as up
import csv
import requests
import io
import json
import sys
# sys.path.append("/uufs/chpc.utah.edu/common/home/u1494915")
# from scidx_streaming import StreamingClient
from scidx_streaming import StreamingClient
import os, datetime
import pandas as pd
import msgpack
import blosc
from kafka import KafkaProducer
from kafka import KafkaConsumer
from kafka.errors import MessageSizeTooLargeError
from dotenv import load_dotenv
import os
import time
import re


In [3]:
# ---- Configuration ----
# Load settings from a .env file; override=True lets values in .env replace
# variables that are already set in the process environment.
load_dotenv(override=True)

# Fail fast when a required setting is absent. Otherwise os.getenv() returns None
# and BOOTSTRAP silently becomes the unusable string "None:None".
_missing_env = [
    name
    for name in ("TOKEN", "API_URL", "KAFKA_HOST", "KAFKA_PORT")
    if not os.getenv(name)
]
if _missing_env:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_env)
    )

# Registration settings
TOKEN = os.getenv("TOKEN")      # API token handed to APIClient below
API_URL = os.getenv("API_URL")  # base URL handed to APIClient below
SERVER = os.getenv("SERVER")    # forwarded as `server=` to client calls; not validated here

# Kafka Configuration
KAFKA_HOST = os.getenv("KAFKA_HOST")
KAFKA_PORT = os.getenv("KAFKA_PORT")  # stays a string, exactly as read from the environment
BOOTSTRAP = f"{KAFKA_HOST}:{KAFKA_PORT}"
CHUNK_SIZE = 25_000  # starting rows per message
SOFT_CAP_BYTES = 950_000  # stay under common 1MB broker limit

# initializing ndp_ep APIClient
client = APIClient(base_url=API_URL, token=TOKEN)
streaming = StreamingClient(client)
print(f"Streaming Client initialized. User ID: {streaming.user_id}")
date_time_now = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
org_name = "kafka_stream"


Streaming Client initialized. User ID: fc624925-ef09-447d-bf16-378066799275


In [7]:
def generate_resource_name(url: str) -> str:
    """Derive a lowercase, ASCII-only slug from the last path segment of ``url``.

    Characters outside ``a-z``, ``0-9``, ``_`` and ``-`` become ``-``; runs of two
    or more separators collapse to a single ``-``; leading and trailing separators
    are removed. ``"resource"`` is returned when the URL has no final path segment
    or nothing survives sanitization.
    """
    last_segment = up.urlparse(url).path.rsplit('/', 1)[-1]
    if not last_segment:
        last_segment = "resource"

    # Non-ASCII characters are dropped (not transliterated) before lowercasing.
    ascii_lower = last_segment.encode("ascii", "ignore").decode("ascii").lower()

    # Every run of disallowed characters turns into one '-'.
    dashed = re.sub(r'[^a-z0-9_-]+', '-', ascii_lower)

    # Squash repeated separators, then trim separators from both ends.
    slug = re.sub(r'[-_]{2,}', '-', dashed).strip('-_')

    return slug or "resource"

def generate_resource_title(url: str) -> str:
    """Build a display title from the last path segment of ``url``.

    Underscores become spaces and every occurrence of ``.csv`` is removed; the
    result is prefixed with ``"Sensor Data – "``.
    """
    *_, basename = up.urlparse(url).path.split('/')
    spaced = basename.replace('_', ' ')
    title = spaced.replace('.csv', '')
    return "Sensor Data – " + title

def update_csv_resource(resource_id: str, topic: str) -> dict:
    """Patch a dataset with a Kafka stream resource entry for ``topic``.

    Args:
        resource_id: Passed to ``client.patch_general_dataset`` as ``dataset_id``.
        topic: Kafka topic name, used in the resource's topic, name and description.

    Returns:
        Whatever ``client.patch_general_dataset`` returns, or ``{"error": <message>}``
        if that call raises any ``Exception``.
    """
    # Resource entry pointing at the broker address from the configuration cell.
    stream_resource = dict(
        topic=topic,
        status="active",
        format="kafka",
        url=BOOTSTRAP,
        description=f"Kafka stream for topic {topic}. This is a general stream without any filters.",
        name=f"stream_dataset {topic}",
    )

    try:
        return client.patch_general_dataset(
            dataset_id=resource_id,
            server=SERVER,
            data={"resources": [stream_resource]},
        )
    except Exception as e:
        # Best effort: report the failure to the caller instead of raising.
        return {"error": str(e)}

def register_in_scidx(url: str) -> dict:
    """Register a Kafka topic named after the file that ``url`` points to.

    The topic name comes from ``generate_resource_name(url)`` and the title from
    ``generate_resource_title(url)``; the dataset is registered under ``org_name``
    with the Kafka host and port from the configuration cell.

    Returns:
        Whatever ``client.register_kafka_topic`` returns, or ``{"error": <message>}``
        when it raises ``ValueError`` (the failure is also printed).
    """
    topic = generate_resource_name(url)
    title = generate_resource_title(url)

    # Metadata describing the Kafka topic to register.
    kafka_stream_metadata = dict(
        dataset_name=f"kafka_{topic}",
        dataset_title=f"Kafka {title}",
        owner_org=org_name,
        kafka_topic=topic,
        kafka_host=KAFKA_HOST,
        kafka_port=KAFKA_PORT,
    )

    try:
        return client.register_kafka_topic(kafka_stream_metadata, server=SERVER)
    except ValueError as e:
        print("Failed to register Kafka topic:", e)
        return {"error": str(e)}


In [8]:
# Load the URL list from bus13_url.txt: one entry per line, surrounding
# whitespace trimmed, blank lines dropped.
with open("bus13_url.txt", "r") as f:
    keywords = [entry for entry in map(str.strip, f) if entry]

In [11]:
# Register only the first entry loaded into `keywords` and print the dict that
# register_in_scidx returns (the service response, or an {"error": ...} dict).
result = register_in_scidx(keywords[0])
print(result)

{'id': '3e11f423-b669-4950-8bae-341643dd85b1'}


In [None]:
# Ask the streaming client to create a stream for the given keyword.
# NOTE(review): the keyword looks like the dataset name that register_in_scidx
# builds ("kafka_" + topic) for the BUS13 2024-04 file — confirm it matches what
# was registered. Top-level `await` only works inside a notebook/async REPL.
stream = await streaming.create_kafka_stream(
    keywords=['kafka_bus13_2024_04-csv'],
    match_all=True,
    filter_semantics=[]
)

# Keep the stream id in `topic`; the next cell passes it to consume_kafka_messages.
topic = stream.data_stream_id
print(f"Stream created: {topic}")

Stream created: data_stream_fc624925-ef09-447d-bf16-378066799275_2


Group Coordinator Request failed: [Error 15] GroupCoordinatorNotAvailableError
Group Coordinator Request failed: [Error 15] GroupCoordinatorNotAvailableError
Group Coordinator Request failed: [Error 15] GroupCoordinatorNotAvailableError
Group Coordinator Request failed: [Error 15] GroupCoordinatorNotAvailableError
Group Coordinator Request failed: [Error 15] GroupCoordinatorNotAvailableError
Group Coordinator Request failed: [Error 15] GroupCoordinatorNotAvailableError
Group Coordinator Request failed: [Error 15] GroupCoordinatorNotAvailableError
Group Coordinator Request failed: [Error 15] GroupCoordinatorNotAvailableError
Group Coordinator Request failed: [Error 15] GroupCoordinatorNotAvailableError
Group Coordinator Request failed: [Error 15] GroupCoordinatorNotAvailableError
Group Coordinator Request failed: [Error 15] GroupCoordinatorNotAvailableError
Group Coordinator Request failed: [Error 15] GroupCoordinatorNotAvailableError
Group Coordinator Request failed: [Error 15] GroupCo

In [None]:
# Obtain a consumer for the stream id held in `topic` (set by the stream-creation
# cell); the following cell reads its `.dataframe` attribute.
consumer = streaming.consume_kafka_messages(topic)

In [None]:
# Get the data from the consumer and flatten it into `df`.
# NOTE(review): presumably each row of `consumer.dataframe` is one consumed message
# whose cells hold per-column sequences — confirm against scidx_streaming.
messages_df = pd.DataFrame(consumer.dataframe)

if messages_df.empty:
    # Nothing has been consumed yet: `.iloc[0]` would raise IndexError, so fall
    # back to an empty frame and tell the reader what to do.
    print("No messages consumed yet; re-run this cell once data has arrived.")
    df = pd.DataFrame()
else:
    # Expand the first row's {column: values} mapping into its own frame with a
    # fresh 0..n-1 index (no in-place mutation, so the cell can be re-run safely).
    df = pd.DataFrame(messages_df.iloc[0].to_dict()).reset_index(drop=True)

print(df)

In [None]:
from kafka.admin import KafkaAdminClient

# List the topics on the broker configured in the configuration cell (BOOTSTRAP)
# rather than a hard-coded address, so the cell follows the .env settings.
admin = KafkaAdminClient(bootstrap_servers=BOOTSTRAP)
try:
    topics = admin.list_topics()
finally:
    # Release the admin client's broker connections even if listing fails.
    admin.close()
print(f"Found {len(topics)} topics: {topics}")

In [3]:
from kafka import KafkaConsumer

# Print the messages on the raw topic, reading from the earliest available offset.
# BOOTSTRAP comes from the configuration cell, which must be run first (running
# this cell on a fresh kernel raises NameError).
# NOTE: this rebinds `consumer`, replacing the streaming consumer created earlier.
consumer = KafkaConsumer(
    "bus13_2024_04-csv",
    bootstrap_servers=BOOTSTRAP,
    auto_offset_reset="earliest",
    # Stop iterating after 10 s without new messages; without a timeout the loop
    # below never returns and blocks the kernel.
    consumer_timeout_ms=10_000,
)

try:
    for msg in consumer:
        print(msg.value)
finally:
    # Always release the broker connection, including on interrupt.
    consumer.close()


NameError: name 'BOOTSTRAP' is not defined

In [4]:
# Cleanup: search using the organization name as the search term, print every hit,
# and call delete_resource_by_id for each hit owned by `org_name`.
# WARNING: this issues delete calls against the remote service with no
# confirmation step — run it deliberately. Requires `client`, `org_name` and
# `SERVER` from the configuration cell.
result = client.search_datasets([org_name], server=SERVER)
for dataset in result:
    print(dataset)
    # Search hits owned by a different organization are left untouched.
    if dataset["owner_org"] == org_name:
        client.delete_resource_by_id(dataset["id"], server=SERVER)

{'id': '3e11f423-b669-4950-8bae-341643dd85b1', 'name': 'kafka_bus13_2024_04-csv', 'title': 'Kafka Sensor Data – BUS13 2024 04', 'owner_org': 'kafka_stream', 'notes': '', 'resources': [{'id': '5fe3a881-fd54-418b-9cbe-c801a96845f8', 'url': '', 'name': 'bus13_2024_04-csv', 'description': 'Kafka topic bus13_2024_04-csv hosted at 10.244.2.206:9092', 'format': 'kafka'}, {'id': '4882129a-16eb-4f75-a5c9-bda941bd5638', 'url': 'http://10.244.2.206:9092', 'name': 'derived data_stream_fc624925-ef09-447d-bf16-378066799275_1', 'description': 'Kafka stream for topic data_stream_fc624925-ef09-447d-bf16-378066799275_1. This stream is derived from the keywords: kafka_bus13_2024_04-csv. It is filtered by semantics: . All keywords must match: True. The stream status is active.', 'format': 'stream'}], 'extras': {'host': '10.244.2.206', 'port': '9092', 'topic': 'bus13_2024_04-csv'}}
