Download any Substratus-compatible dataset using simple HTTP requests

In [1]:
import urllib.request
import json
import os

In [8]:
from pathlib import Path

# Dataset parameters come from an optional JSON file. When the file is absent
# `params` stays empty and the `urls` check below raises.
params = {}
params_path = Path("/content/params.json")
if params_path.is_file():
    with params_path.open("r", encoding="UTF-8") as params_file:
        params = json.load(params_file)

urls = params.get("urls")
if urls:
    # Trim whitespace around every entry and drop empty ones, so values such
    # as "a.jsonl, b.jsonl," do not yield URLs with leading spaces or blanks.
    urls = [entry.strip() for entry in urls.split(",") if entry.strip()]

if not urls:
    raise ValueError("Missing required param `urls`. Please provide `urls` as a comma separated string of urls. "
                     "For example, set `spec.params: {urls: http://s.com/dataset.jsonl}` "
                     "in the Dataset resource")

# Display the parsed list as the cell output.
urls

['https://huggingface.co/datasets/substratusai/k8s-instructions/raw/main/k8s-instructions.jsonl']

In [3]:
import urllib.request
from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.parse import urlparse
from pathlib import Path

def get_filename(url: str) -> str:
    """Return the last path segment of `url`.

    Only the path component of the URL is considered, so the query string
    and fragment are not part of the result. A URL without a path yields an
    empty string.
    """
    url_path = urlparse(url).path
    return Path(url_path).name


def download_file(url: str) -> str:
    """Download `url` into /content/data and return the local file path.

    The local file is named after the last path segment of the URL. The
    destination directory is created when missing so the download does not
    fail on an environment where /content/data does not exist yet.

    Args:
        url: Address of the file to fetch.

    Returns:
        The path the file was written to, as a string.

    Raises:
        Any error raised by `urllib.request.urlretrieve` or by creating the
        destination directory propagates to the caller.
    """
    filename = get_filename(url)
    # `exist_ok=True` keeps this safe when several worker threads reach this
    # line at the same time.
    Path("/content/data").mkdir(parents=True, exist_ok=True)
    destination = f"/content/data/{filename}"
    print(f"Downloading {url} to {destination}")
    urllib.request.urlretrieve(url, destination)
    return destination


# Download every URL concurrently on a pool of worker threads.
processes = []
with ThreadPoolExecutor(max_workers=10) as executor:
    for url in urls:
        processes.append(executor.submit(download_file, url))

    # Report completions while still inside the `with` block: leaving the
    # block waits for all workers, so looping afterwards would only print
    # once every download had already finished.
    for task in as_completed(processes):
        # `result()` re-raises any exception raised inside `download_file`.
        print(f"Finished downloading {task.result()}")

Downloading https://huggingface.co/datasets/substratusai/k8s-instructions/raw/main/k8s-instructions.jsonl to /content/data/k8s-instructions.jsonl
Finished downloading /content/data/k8s-instructions.jsonl


In [6]:
# List everything in the download directory (long format, hidden entries,
# human-readable sizes) to confirm the files were written.
! ls -lash /content/data/

total 280K
4.0K drwxr-xr-x 1 root root 4.0K Jul 22 04:44 .
8.0K drwxr-xr-x 1 root root 4.0K Jul 22 04:42 ..
268K -rw-r--r-- 1 root root 267K Jul 22 04:44 k8s-instructions.jsonl


In [7]:
# Preview the first 10 lines of every file in the download directory.
! head -n 10  /content/data/*

{"prompt": "Write YAML that defines a Kubernetes Deployment named \"iis\" with 3 replicas. The Deployment selects pods based on the label \"app=iis\". The pod template includes a container named \"iis\" that uses the \"microsoft/iis\" image and exposes port 80 within the container. Additionally, it includes an annotation specifying the isolation type as \"hyperv\" for experimental purposes.", "completion": "```yaml\napiVersion: apps/v1\nkind: Deployment\nmetadata:\n  name: iis\nspec:\n  selector:\n    matchLabels:\n      app: iis\n  replicas: 3\n  template:\n    metadata:\n      labels:\n        app: iis\n      annotations:\n        experimental.windows.kubernetes.io/isolation-type: hyperv\n    spec:\n      containers:\n      - name: iis\n        image: microsoft/iis\n        ports:\n        - containerPort: 80\n\n\n```"}
{"prompt": "Write YAML that defines a Kubernetes Pod resource named \"iis\" with the image set to \"microsoft/iis:windowsservercore-1709\" and the container listening