In [None]:
%%configure -f
{
  "conf": {
    "spark.notebook.parameters": "{\"runs\":[{\"name\":\"BFF-10k-LH-to-Delta-Full-Refresh\",\"dataset_name\":\"10k\",\"source\":\"lakehouse\",\"format\":\"delta\",\"update_strategy\":\"Full Refresh\"},{\"name\":\"BFF-10k-SQL-to-WH-Full-Compare\",\"dataset_name\":\"10k\",\"source\":\"sql\",\"format\":\"warehouse\",\"update_strategy\":\"Full Compare\"},{\"name\":\"BFF-1M-LH-to-WH-Increment\",\"dataset_name\":\"1m\",\"source\":\"lakehouse\",\"format\":\"warehouse\",\"update_strategy\":\"Full Refresh\"}]}"
  }
}


# 4. Run Benchmarks
### 🔗 Wake up Azure SQL Database if SQL is source.

Loop over the hard-coded parameter sets and trigger the `1.IngestData` notebook in each target workspace.
This notebook contains a %%configure cell with the run list (hard-coded from config/test_parameter_sets.yml).

Note: this runner requires a notebook-managed token (mssparkutils). It must run inside a Fabric workspace.

In [None]:
# Runner cell for 4.RunBenchmarks.
# Behavior:
# - Trigger RunNotebook jobs for each parameter-set (notebook '1.IngestData')
# - Poll the returned job instance Location URL until terminal (including "Completed")
# - Keep concise progress/status prints; comment out verbose debug prints
# - Add the notebook displayName to the run summary entries
#
# NOTE: This requires notebook-managed token via mssparkutils and must run inside Fabric.

import json, time
import requests

# Base URL for every Fabric REST call made by this cell.
API_BASE = "https://api.fabric.microsoft.com/v1"
UPLOAD_TIMEOUT = 120     # seconds; request timeout for the RunNotebook POST
POLL_INTERVAL = 5        # seconds between polls
POLL_TIMEOUT_SECONDS = 900  # seconds; overall polling budget per job instance

# Read runs from the %%configure cell
try:
    params_raw = spark.conf.get('spark.notebook.parameters')
except Exception:
    # Any failure to read the setting is treated the same as "not set" and
    # reported with the explicit message below.
    params_raw = None
if not params_raw:
    raise SystemExit('spark.notebook.parameters not set. Ensure the %%configure cell is present and contains the runs list.')

# Fail with a clear message (instead of a raw traceback / AttributeError) when
# the configured value is not a JSON object.
try:
    params = json.loads(params_raw)
except ValueError as exc:
    raise SystemExit('spark.notebook.parameters is not valid JSON. Check the %%configure cell.') from exc
if not isinstance(params, dict):
    raise SystemExit('spark.notebook.parameters must be a JSON object containing a "runs" list.')

runs = params.get('runs', [])
if not runs:
    print('No runs found in spark.notebook.parameters.runs')

# Notebook-managed token (required)
try:
    from notebookutils import mssparkutils
    token = mssparkutils.credentials.getToken('https://api.fabric.microsoft.com/')
except Exception as exc:
    # Chain the original error so the root cause stays visible in the traceback.
    raise SystemExit('Failed to obtain notebook-managed token via mssparkutils. This notebook must be run inside a Fabric workspace.') from exc

if not token:
    raise SystemExit('mssparkutils returned no token. This notebook must be run inside a Fabric workspace.')

# Shared request headers used by all helper functions below.
headers = {'Authorization': f'Bearer {token}', 'Content-Type': 'application/json'}

# Helpers
def workspace_id_by_name(display_name):
    """
    Return the id of the workspace whose displayName equals *display_name*.

    Lists workspaces via GET {API_BASE}/workspaces and follows the
    'continuationUri' returned with each page, so a workspace that is not on
    the first page is still found. Returns None when no workspace matches.
    Raises requests.HTTPError (via raise_for_status) on a non-success response.
    """
    url = f'{API_BASE}/workspaces'
    while url:
        wr = requests.get(url, headers=headers, timeout=30)
        wr.raise_for_status()
        page = wr.json()
        for w in page.get('value', []):
            if w.get('displayName') == display_name:
                return w.get('id')
        # Absent/empty continuationUri means this was the last page.
        url = page.get('continuationUri')
    return None

def item_id_for_notebook(workspace_id, notebook_display):
    """
    Return the id of the Notebook item named *notebook_display* in a workspace.

    Lists items via GET {API_BASE}/workspaces/{workspace_id}/items and follows
    the 'continuationUri' returned with each page, so a notebook that is not on
    the first page is still found. Only items whose 'type' is 'Notebook' are
    considered. Returns None when nothing matches.
    Raises requests.HTTPError (via raise_for_status) on a non-success response.
    """
    items_url = f'{API_BASE}/workspaces/{workspace_id}/items'
    while items_url:
        ir = requests.get(items_url, headers=headers, timeout=30)
        ir.raise_for_status()
        page = ir.json()
        for it in page.get('value', []):
            if it.get('displayName') == notebook_display and it.get('type') == 'Notebook':
                return it.get('id')
        # Absent/empty continuationUri means this was the last page.
        items_url = page.get('continuationUri')
    return None

def build_exec_params(raw_params: dict) -> dict:
    """
    Shape a flat parameter mapping for the Fabric RunNotebook payload.

    Every entry becomes ``{"value": <text>, "type": "string"}`` keyed by the
    stringified original key. None maps to an empty string, scalars
    (str/int/float/bool) go through ``str()``, and anything else is
    JSON-encoded (non-ASCII characters kept as-is). A falsy *raw_params*
    yields an empty dict.
    """
    def _as_text(value):
        # Render one parameter value as the string sent to the API.
        if value is None:
            return ""
        if isinstance(value, (str, int, float, bool)):
            return str(value)
        return json.dumps(value, ensure_ascii=False)

    source = raw_params if raw_params else {}
    return {
        str(key): {"value": _as_text(val), "type": "string"}
        for key, val in source.items()
    }

def trigger_run_and_get_location(workspace_id, artifact_id, raw_param_obj):
    """
    POST a RunNotebook job for one item and report how the service answered.

    The parameters are converted with build_exec_params and sent under
    "executionData" together with an empty "configuration".

    Returns a tuple (status_code, body_preview, location):
      - status_code: HTTP status of the POST
      - body_preview: first 2000 characters of the response text, or None when
        the body is empty or cannot be read
      - location: first non-empty value among the 'Location',
        'Operation-Location' and 'Azure-AsyncOperation' response headers
    """
    run_url = f"{API_BASE}/workspaces/{workspace_id}/items/{artifact_id}/jobs/instances?jobType=RunNotebook"
    request_body = {
        "executionData": {
            "parameters": build_exec_params(raw_param_obj),
            "configuration": {},
        }
    }
    # High-level progress trace only; payload/header dumps are intentionally omitted.
    print('  POST run_url:', run_url)
    response = requests.post(run_url, headers=headers, json=request_body, timeout=UPLOAD_TIMEOUT)

    # Take the first header that carries a usable polling URL.
    location = None
    for header_name in ('Location', 'Operation-Location', 'Azure-AsyncOperation'):
        location = response.headers.get(header_name)
        if location:
            break

    try:
        text = response.text
        preview = text[:2000] if text else None
    except Exception:
        preview = None

    return response.status_code, preview, location

def poll_job_instance(location_url, timeout_seconds=POLL_TIMEOUT_SECONDS, interval_seconds=POLL_INTERVAL):
    """
    Repeatedly GET *location_url* until the job reports a terminal status.

    A status is terminal when, compared case-insensitively, it is one of
    Succeeded, Failed, Cancelled, Completed or SucceededWithWarnings.

    Returns the JSON body that carried the terminal status. If
    *timeout_seconds* elapses first, returns whatever the most recent 200
    response decoded to (None if that body was not JSON or no 200 response
    was ever received). Request errors and non-200 responses are printed and
    polling continues after *interval_seconds*.
    """
    stop_at = time.time() + timeout_seconds
    latest = None
    print("  Polling job instance:", location_url)
    terminal_states = frozenset(
        ("succeeded", "failed", "cancelled", "completed", "succeededwithwarnings")
    )

    while time.time() < stop_at:
        try:
            resp = requests.get(location_url, headers=headers, timeout=30)
            # Progress/status print
            print("    GET op status:", resp.status_code)
            if resp.status_code != 200:
                print("    non-200 response while polling:", resp.status_code)
            else:
                try:
                    latest = resp.json()
                except Exception:
                    # Body was not JSON; forget any earlier snapshot.
                    latest = None
                state = latest.get("status") if isinstance(latest, dict) else None
                if state:
                    print(f"    job instance status: {state}")
                    if str(state).lower() in terminal_states:
                        return latest
        except Exception as err:
            print("    poll GET failed:", err)
        time.sleep(interval_seconds)

    print("  Poll timed out after", timeout_seconds, "seconds")
    return latest

# Runner loop (with polling the returned job instance). Include notebook name in the summary.
notebook_display = "1.IngestData"


def _execute_run(run_cfg, notebook_name):
    """
    Resolve, trigger and poll one benchmark run; return its summary entry.

    The run's 'name' is used as the target workspace displayName, and the
    whole *run_cfg* is forwarded as the notebook parameters. Entries for runs
    that never reach the trigger step carry a 'status' key; entries for
    triggered runs carry 'status_code' and 'location' (plus 'polled' when a
    polling URL was returned, or 'body_preview' when it was not).
    Exceptions raised by the lookup/trigger helpers propagate to the caller.
    """
    ws_name = run_cfg.get('name')
    if not ws_name:
        return {'workspace': None, 'notebook': notebook_name, 'status': 'skipped_no_name', 'cfg': run_cfg}

    print('\n==> Processing run for workspace:', ws_name)
    ws_id = workspace_id_by_name(ws_name)
    if not ws_id:
        print('  workspace not found:', ws_name)
        return {'workspace': ws_name, 'notebook': notebook_name, 'status': 'workspace_not_found'}
    print('  workspace_id:', ws_id)

    item_id = item_id_for_notebook(ws_id, notebook_name)
    if not item_id:
        print(f'  {notebook_name} notebook not found in workspace:', ws_name)
        return {'workspace': ws_name, 'workspace_id': ws_id, 'notebook': notebook_name, 'status': 'notebook_not_found'}
    print('  found item_id:', item_id)

    status_code, body_preview, loc = trigger_run_and_get_location(ws_id, item_id, run_cfg)
    print('  trigger run response code:', status_code)
    # Surface the service's error body; without it a rejected POST is opaque.
    if status_code >= 400 and body_preview:
        print('  trigger run error body:', body_preview)

    outcome = {
        'workspace': ws_name,
        'workspace_id': ws_id,
        'notebook': notebook_name,
        'item_id': item_id,
        'status_code': status_code,
        'location': loc or None,
    }

    if not loc:
        print('  no Location/Operation header returned by run POST; cannot poll instance')
        outcome['body_preview'] = body_preview
        return outcome

    polled = poll_job_instance(loc, timeout_seconds=POLL_TIMEOUT_SECONDS, interval_seconds=POLL_INTERVAL)
    if polled is None:
        print('  polling produced no JSON result or timed out')
    else:
        # keep failureReason if present, but do not print verbose debug
        fr = polled.get('failureReason') if isinstance(polled, dict) else None
        if fr:
            print('  job failed with failureReason:', fr.get('message') if isinstance(fr, dict) else fr)
        # final state print (concise)
        print('  final job instance status:', polled.get('status') if isinstance(polled, dict) else polled)
    outcome['polled'] = polled
    return outcome


results = []
for run_cfg in runs:
    try:
        outcome = _execute_run(run_cfg, notebook_display)
    except requests.RequestException as exc:
        # One failing HTTP call must not discard the summary of the other runs.
        print('  request failed:', exc)
        outcome = {'workspace': run_cfg.get('name'), 'notebook': notebook_display, 'status': 'request_failed', 'error': str(exc)}
    results.append(outcome)

    # small pause between triggers (only when a run was actually triggered)
    if 'status_code' in outcome:
        time.sleep(1)

print('\nRun summary:')
print(json.dumps(results, indent=2))
