#  scidx_streaming URL Registration

This notebook crawls the public index at:

**https://horel.chpc.utah.edu/data/meop/data/**

It extracts file links (recursively), filters them to only include files whose **filename contains a year ≥ 2020**, and then **registers** those URLs into your **`scidx_streaming`** deployment.

> **Note:** You will need valid credentials and the correct POP/API endpoint for your `scidx_streaming` instance. This notebook includes a dry-run mode so you can verify which URLs would be registered before actually registering them.

In [1]:
from ndp_ep import APIClient
from scidx_streaming import StreamingClient
import os, datetime
import pandas as pd
from dotenv import load_dotenv
import os
from helper_func import register_kafka
import time
import json
import numpy as np
import matplotlib.pyplot as plt
import ast
# ---- Configuration ----
load_dotenv(override=True)


# Registration settings
# read token from .env file
TOKEN = os.getenv("TOKEN")
API_URL = os.getenv("API_URL")
SERVER = os.getenv("SERVER")

# Kafka Configuration
KAFKA_HOST = os.getenv("KAFKA_HOST")
KAFKA_PORT = os.getenv("KAFKA_PORT")
BOOTSTRAP = f"{KAFKA_HOST}:{KAFKA_PORT}"
CHUNK_SIZE = 25_000  # starting rows per message
SOFT_CAP_BYTES = 950_000  # stay under common 1MB broker limit

# initializing ndp_ep APIClient
client = APIClient(base_url=API_URL, token=TOKEN)
streaming = StreamingClient(client)
print(f"Streaming Client initialized. User ID: {streaming.user_id}")
date_time_now = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
org_name = "kafka_stream"

bus13_urls = [
    "https://horel.chpc.utah.edu/data/meop/level3/ebus_2024/ebus_min_2024_12.csv",
    # "https://horel.chpc.utah.edu/data/meop/level3/ebus_2025/ebus_min_2025_01.csv",
    # "https://horel.chpc.utah.edu/data/meop/level3/ebus_2025/ebus_min_2025_02.csv",
    # "https://horel.chpc.utah.edu/data/meop/data/BUS15_2025_01.csv",
    # "https://horel.chpc.utah.edu/data/meop/data/BUS15_2025_02.csv",
    # "https://horel.chpc.utah.edu/data/meop/data/BUS14_2025_01.csv",
    # "https://horel.chpc.utah.edu/data/meop/data/BUS14_2025_02.csv",
    # "https://horel.chpc.utah.edu/data/meop/data/BUS12_2024_12.csv",
    # "https://horel.chpc.utah.edu/data/meop/data/BUS12_2025_01.csv"
]

# pandas configuration
# pd.set_option('display.max_rows', None)
# pd.set_option('display.max_columns', None)
# pd.set_option('display.width', None)
# pd.set_option('display.max_colwidth', None)



Streaming Client initialized. User ID: fc624925-ef09-447d-bf16-378066799275


  client = APIClient(base_url=API_URL, token=TOKEN)
  super().__init__(


In [16]:
# list all topic of kafka
register_kafka(bus13_urls, org_name, client, BOOTSTRAP, KAFKA_HOST, KAFKA_PORT, SERVER)

{'id': '7c1ee053-537e-45b7-8ad2-e8e1522e3b98'}


In [17]:
print(client.search_datasets("kafka_ebus_min_2024_12-csv", server=SERVER))
# print(client.search_datasets("kafka_ebus_min_2025_01-csv", server=SERVER))
# print(client.search_datasets("kafka_ebus_min_2025_02-csv", server=SERVER))

[{'id': '7c1ee053-537e-45b7-8ad2-e8e1522e3b98', 'name': 'kafka_ebus_min_2024_12-csv', 'title': 'Kafka Sensor Data – ebus min 2024 12', 'owner_org': 'kafka_stream', 'notes': "The kafka stream is generated from csv dataset. This dataset is available at https://horel.chpc.utah.edu/data/meop/level3/ebus_2024/ebus_min_2024_12.csv. Vehicle: E-bus Data period: 2024-12 File type: CSV File marked 'min' appears to be minute-resolution of data. File marked 'meop' (Mobile Environment Observation Platform) where sensors are attached to UTA. Data processing level: Level 3 (modified on Level 2 data)", 'resources': [{'id': 'df4bbaa0-2d22-4580-8199-d7b2cb3366b5', 'url': '', 'name': 'ebus_min_2024_12-csv', 'description': 'Kafka topic ebus_min_2024_12-csv hosted at 10.244.2.206:9092', 'format': 'kafka'}], 'extras': {'host': '10.244.2.206', 'port': '9092', 'topic': 'ebus_min_2024_12-csv'}}]


In [14]:
await streaming.delete_stream("data_stream_fc624925-ef09-447d-bf16-378066799275_1")

Topic 'data_stream_fc624925-ef09-447d-bf16-378066799275_1' does not exist. Skipping deletion.


{'message': "Stream 'data_stream_fc624925-ef09-447d-bf16-378066799275_1' already deleted or does not exist."}

In [None]:

stream = await streaming.create_kafka_stream(
    keywords = ["kafka_ebus_min_2024_12"],
    # keywords=["kafka_ebus_min_2025_01-csv"],
    # keywords=["kafka_ebus_min_2025_02-csv"],
    match_all=True,
    filter_semantics=[]
)

topic = stream.data_stream_id
print(f"Stream created: {topic}")

Stream created: data_stream_fc624925-ef09-447d-bf16-378066799275_1


Heartbeat session expired - marking coordinator dead
Failed fetch messages from 1: [Error 7] RequestTimedOutError
Heartbeat session expired - marking coordinator dead


In [8]:
consumer = streaming.consume_kafka_messages("data_stream_fc624925-ef09-447d-bf16-378066799275_3")

In [9]:
consumer.stop()

In [2]:
# --- Topic Selection ---
try:
    topic = input("Enter new topic name e.g data_stream_fc624925-ef09-447d-bf16-378066799275_x: ").strip()
except Exception:
    exit

# --- File Path Selection ---
try:
    file_path = input("Enter new file path e.g: raw_stream/raw_data_x.csv: ").strip()
    if os.path.exists(file_path):
        os.remove(file_path)
        print(f"[INFO] Existing file '{file_path}' deleted.")
except Exception:
    exit

# --- Kafka Consumption and Writing ---
consumer = streaming.consume_kafka_messages(topic)
time.sleep(5)
df = None
row_count = 0
count = 1
max_size_bytes = 500 * 1024 * 1024  # 10 MB

try:
    while True:
        if os.path.exists(file_path) and os.path.getsize(file_path) > max_size_bytes:
            print(f"Reached file size > 200MB ({os.path.getsize(file_path) / (1024*1024):.2f} MB)")
            break

        df = consumer.dataframe
        if df is not None and not df.empty:
            df.to_csv(file_path, index=False, mode="a")
            row_count += len(df)
            if row_count % 500 == 0:
                print(f"df row_count: {row_count}")
except KeyboardInterrupt as e:
    print("Interrupted:", e)
finally:
    if df is not None and not df.empty:
        row_count += len(df)
        print(f"df row_count: {row_count}")
        df.to_csv(file_path, index=False, mode="a")
    consumer.stop()


[INFO] Existing file 'raw_stream/raw_data_2412.csv' deleted.
Reached file size > 200MB (774.53 MB)
df row_count: 4769
