# ML QC Dashboard

Notebook ringkas untuk membaca log QC (`instance/logs/ml_scores_qc_*.json` dan ringkasan `ml_scores_qc_summary.json`).
Tujuan:
- Visualisasi heatmap Top-K per provinsi.
- Tren risk score & LOS ≤ 1.
- Identifikasi flag dominan.
- Definisikan ambang alert sederhana.

In [5]:
from __future__ import annotations

import json
from pathlib import Path
from typing import Any

import pandas as pd
import plotly.express as px
import plotly.graph_objects as go


def find_project_root(start: Path) -> Path:
    """Locate the project root by walking upward from *start*.

    The root is the nearest directory (``start`` itself included, after it
    has been resolved to an absolute path) that contains ``instance/logs``.

    Args:
        start: Directory from which the upward search begins.

    Returns:
        The closest ancestor-or-self directory that holds ``instance/logs``.

    Raises:
        RuntimeError: If no directory between ``start`` and the filesystem
            root contains ``instance/logs``.
    """
    origin = start.resolve()
    # Nearest match wins: the tuple is ordered from ``origin`` outward.
    match = next(
        (folder for folder in (origin, *origin.parents) if (folder / "instance/logs").exists()),
        None,
    )
    if match is None:
        raise RuntimeError("Tidak dapat menemukan direktori project (folder 'instance/logs' tidak ditemukan)")
    return match


# Resolve the project layout once, starting from the current working directory.
PROJECT_ROOT = find_project_root(Path.cwd())
LOG_DIR = PROJECT_ROOT.joinpath("instance/logs")
SUMMARY_PATH = LOG_DIR.joinpath("ml_scores_qc_summary.json")
# Glob pattern for the QC log files looked up inside LOG_DIR.
DETAIL_PATTERN = "ml_scores_qc_*.json"

# Echo the resolved locations so a wrong working directory is easy to spot.
print(f"Project root: {PROJECT_ROOT}", f"Summary path: {SUMMARY_PATH}", sep="\n")

Project root: /Users/ridomaulana/bpjs
Summary path: /Users/ridomaulana/bpjs/instance/logs/ml_scores_qc_summary.json


In [6]:
def load_summary(path: Path) -> dict[str, Any]:
    """Load the aggregated QC summary JSON stored at *path*.

    Args:
        path: Location of the summary JSON file.

    Returns:
        The decoded JSON payload.

    Raises:
        FileNotFoundError: If *path* does not exist; the message includes a
            hint on how to generate the file.
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    if not path.exists():
        raise FileNotFoundError(f"File summary tidak ditemukan: {path}\nJalankan 'python -m ml.pipelines.qc_summary' terlebih dahulu.")
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale encoding (the default for text-mode ``open``).
    with path.open(encoding="utf-8") as f:
        return json.load(f)


def load_snapshots(log_dir: Path) -> pd.DataFrame:
    """Collect the ``summary`` section of each QC log file into one table.

    Files in *log_dir* matching ``DETAIL_PATTERN`` are read in file-name
    order. Each usable ``summary`` mapping becomes one row, extended with a
    ``file`` column holding the source file name.

    Args:
        log_dir: Directory that contains the QC log JSON files.

    Returns:
        A DataFrame with one row per file that has a non-empty ``summary``
        mapping; an empty DataFrame when no file qualifies.

    Raises:
        json.JSONDecodeError: If a matching file does not contain valid JSON.
    """
    records: list[dict[str, Any]] = []
    for file in sorted(log_dir.glob(DETAIL_PATTERN)):
        # Decode explicitly as UTF-8 instead of relying on the locale default.
        data = json.loads(file.read_text(encoding="utf-8"))
        # NOTE(review): the glob pattern also matches the aggregate summary
        # file's name; it is presumably skipped here because it has no
        # top-level "summary" key -- confirm.
        summary = data.get("summary") if isinstance(data, dict) else None
        if not isinstance(summary, dict) or not summary:
            continue
        # Build a new row rather than mutating the parsed payload.
        records.append({**summary, "file": file.name})
    return pd.DataFrame(records)


# Load the aggregate payload and the per-run snapshot table.
summary_payload = load_summary(path=SUMMARY_PATH)
snapshot_df = load_snapshots(log_dir=LOG_DIR)

# Trailing expression: displays the payload's keys and the first snapshot rows.
(summary_payload.keys(), snapshot_df.head(n=5))

(dict_keys(['total_snapshots', 'latest_snapshot', 'averages', 'top_severity_in_top_k', 'top_province_in_top_k', 'top_flags_in_top_k', 'snapshots']),
           timestamp  total_rows  top_k  amount_claimed_mean  \
 0  20251104T192755Z     1176438     20             2.476058   
 1  20251104T193129Z     1176438     50             2.476058   
 2  20251104T195323Z     1176438     50             2.476058   
 3  20251105T165941Z     1176438     50             2.476058   
 4  20251106T090938Z     1176438     50             2.476058   
 
    amount_claimed_top_k_mean  cost_zscore_mean  cost_zscore_top_k_mean  \
 0                       2.15     -9.608866e-18                     NaN   
 1                       4.22     -9.608866e-18                0.749384   
 2                       4.22     -9.608866e-18                0.749384   
 3                       4.22     -1.052400e-17                0.749384   
 4                       4.22     -7.549823e-18                0.749384   
 
    los_le_1_

## Heatmap Top-K per Provinsi
Gunakan agregasi dari `summary_payload["top_province_in_top_k"]`. Saat ini data ditampilkan sebagai diagram batang (bar chart), belum sebagai heatmap.

In [7]:
# Two-column table built from the summary payload; an absent key yields an
# empty table with the same columns.
top_province = pd.DataFrame(
    data=summary_payload.get("top_province_in_top_k", []),
    columns=["province", "count"],
)
# Trailing expression: preview the first three rows.
top_province.head(n=3)

Unnamed: 0,province,count
0,DKI JAKARTA,68
1,RIAU,38
2,SUMATERA SELATAN,32


In [8]:
# Bar chart of the per-province counts; fall back to a message when there is
# nothing to plot.
if top_province.empty:
    print("Tidak ada data top province.")
else:
    fig = px.bar(
        data_frame=top_province,
        x="province",
        y="count",
        text="count",
        title="Top-K Province Count",
    )
    fig.update_layout(
        xaxis_title="Province",
        yaxis_title="Count in Top-K",
        xaxis_tickangle=-45,
    )
    fig.show()

## Tren Risk Score & LOS ≤ 1
Gunakan dataframe `snapshot_df`.

In [9]:
# Dual-axis trend of the two Top-K metrics across runs.
if snapshot_df.empty:
    print("Snapshot dataframe kosong.")
else:
    # Parse the compact run timestamps into datetimes and keep them as a new
    # column on the shared dataframe.
    snapshot_df["timestamp_dt"] = pd.to_datetime(snapshot_df["timestamp"], format="%Y%m%dT%H%M%SZ")
    fig = go.Figure(
        data=[
            go.Scatter(
                x=snapshot_df["timestamp_dt"],
                y=snapshot_df["risk_score_top_k_mean"],
                name="Risk Score Mean (Top-K)",
                mode="lines+markers",
            ),
            # Second trace is bound to the right-hand axis (y2) configured below.
            go.Scatter(
                x=snapshot_df["timestamp_dt"],
                y=snapshot_df["los_le_1_ratio_top_k"],
                name="LOS ≤ 1 Ratio (Top-K)",
                mode="lines+markers",
                yaxis="y2",
            ),
        ]
    )
    fig.update_layout(
        title="Trend Risk Score & LOS ≤ 1",
        xaxis_title="Run Timestamp",
        yaxis={"title": "Risk Score", "rangemode": "tozero"},
        yaxis2={"title": "LOS ≤ 1 Ratio", "overlaying": "y", "side": "right", "rangemode": "tozero"},
    )
    fig.show()

## Flag Dominan di Top-K

In [10]:
# Two-column table of flag counts taken from the summary payload; an absent
# key yields an empty table.
top_flags = pd.DataFrame(
    data=summary_payload.get("top_flags_in_top_k", []),
    columns=["flag", "count"],
)
if top_flags.empty:
    print("Tidak ada flag yang tercatat.")
else:
    fig = px.bar(
        data_frame=top_flags,
        x="flag",
        y="count",
        text="count",
        title="Flag Dominan di Top-K",
    )
    fig.update_layout(
        xaxis_title="Flag",
        yaxis_title="Count",
        xaxis_tickangle=-30,
    )
    fig.show()

## Ambang Alert & Notifikasi

Contoh logika alert sederhana: jika `risk_score_top_k_mean` pada snapshot terbaru turun di bawah 0.7 ATAU `los_le_1_ratio_top_k` pada snapshot terbaru turun di bawah 5%, tandai sebagai "drift". Di produksi, blok ini dapat diganti dengan pengiriman notifikasi Slack/email.

In [11]:
# Alert thresholds: the latest snapshot is flagged when either metric falls
# below its minimum.
ALERT_RISK_MIN = 0.7
ALERT_LOS_RATIO_MIN = 0.05

if snapshot_df.empty:
    # An empty frame has no timestamp column to sort on; sorting it would
    # raise KeyError before the "No data" branch could be reached.
    latest = snapshot_df
else:
    # Derive the sort key here instead of relying on another cell having
    # added the "timestamp_dt" column to the shared dataframe.
    latest = (
        snapshot_df.assign(timestamp_dt=pd.to_datetime(snapshot_df["timestamp"], format="%Y%m%dT%H%M%SZ"))
        .sort_values("timestamp_dt")
        .tail(1)
    )

if latest.empty:
    alert_status = "No data"
else:
    # Coerce to float so a missing value (None/NaN) becomes NaN: it can then
    # be formatted with ":.2f" and never raises during comparison.
    risk_mean = pd.to_numeric(latest["risk_score_top_k_mean"], errors="coerce").iloc[0]
    los_ratio = pd.to_numeric(latest["los_le_1_ratio_top_k"], errors="coerce").iloc[0]
    # Comparisons against NaN are False, so a missing metric never triggers
    # the alert on its own.
    if risk_mean < ALERT_RISK_MIN or los_ratio < ALERT_LOS_RATIO_MIN:
        alert_status = f"ALERT: Risk mean={risk_mean:.2f}, LOS<=1 ratio={los_ratio:.2f}"
    else:
        alert_status = "OK: Semua metrik dalam ambang normal."

# Trailing expression: display the resulting status.
alert_status

'ALERT: Risk mean=0.94, LOS<=1 ratio=0.02'