#  scidx_streaming URL Registration

This notebook crawls the public index at:

**https://horel.chpc.utah.edu/data/meop/data/**

It extracts file links (recursively), filters them to only include files whose **filename contains a year â‰¥ 2020**, and then **registers** those URLs into your **`scidx_streaming`** deployment.

> **Note:** You will need valid credentials and the correct POP/API endpoint for your `scidx_streaming` instance. This notebook includes a dry-run mode so you can verify which URLs would be registered before actually registering them.

In [8]:
from typing import List
from ndp_ep import APIClient
import urllib.parse as up
import csv
import requests
import io
import json
import sys
# sys.path.append("/uufs/chpc.utah.edu/common/home/u1494915")
# from scidx_streaming import StreamingClient
from scidx_streaming import StreamingClient
import os, datetime
import pandas as pd
import msgpack
import blosc
from kafka import KafkaProducer
from kafka import KafkaConsumer
from kafka.errors import MessageSizeTooLargeError
from dotenv import load_dotenv
import os
import time
import re


In [9]:
# ---- Configuration ----
load_dotenv(override=True)

# Kafka Configuration
KAFKA_HOST = os.getenv("KAFKA_HOST")
KAFKA_PORT = os.getenv("KAFKA_PORT")
BOOTSTRAP = f"{KAFKA_HOST}:{KAFKA_PORT}"
CHUNK_SIZE = 25_000  # starting rows per message
SOFT_CAP_BYTES = 950_000  # stay under common 1MB broker limit


In [10]:
def generate_resource_name(url: str) -> str:
    path = up.urlparse(url).path
    fname = path.split('/')[-1] or "resource"
    # lowercase + ascii-only (drop non-ascii)
    fname = fname.encode("ascii", "ignore").decode("ascii").lower()
    # replace any disallowed char with '-'
    fname = re.sub(r'[^a-z0-9_-]+', '-', fname)
    # collapse repeats and trim separators
    fname = re.sub(r'[-_]{2,}', '-', fname).strip('-_')
    # fallback if empty after sanitization
    if not fname:
        fname = "resource"
    return fname

In [None]:

def stream_register(url: str): 
    topic = generate_resource_name(url)   
    # Configure Kafka producer
    producer = KafkaProducer(
        bootstrap_servers=BOOTSTRAP,
        value_serializer=lambda v: json.dumps(v).encode("utf-8")
    )

    try:
        while True:
            # Fetch and stream CSV rows from remote URL
            response = requests.get(url)
            response.raise_for_status()  # raises error if download fails
            f = io.StringIO(response.text)
            reader = csv.DictReader(f)
            for i, row in enumerate(reader):
                producer.send(topic, value=row)
                print(f"Sent: {row}")
                time.sleep(1.0)
            time.sleep(10)
    except KeyboardInterrupt:
        print("Producer interrupted. Flushing and closing...")
        producer.flush()
        producer.close()

In [20]:
# read from urls.txt file
with open("bus13_url.txt", "r") as f:
    keywords = [line.strip() for line in f if line.strip()]

In [21]:
# Example call
stream_register(keywords[0])

UnboundLocalError: cannot access local variable 'f' where it is not associated with a value