<a href="https://colab.research.google.com/github/sergiocostaifes/PPCOMP_DM/blob/main/notebooks/01_ingest_validate.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# =========================
# 01_ingest_validate.ipynb
# Ingestão + Sanidade
# =========================

# 0) IMPORTANTE: rode o 00_env_paths antes.
import pandas as pd
import numpy as np

# Location of the raw trace CSV. RAW_PATH and log() are not defined in this
# cell; per the header note they are expected to come from 00_env_paths.
CSV_FILE = RAW_PATH / "borg_traces_data.csv"

# Fail with an explicit exception instead of `assert`: assertions are removed
# when Python runs with -O, which would silently skip this check.
if not CSV_FILE.exists():
    raise FileNotFoundError(f"Arquivo não encontrado: {CSV_FILE}")

# 1) Read the CSV and log its shape and column names
df = pd.read_csv(CSV_FILE)
log(f"Leitura OK: {df.shape} (linhas, colunas)")
log(f"Colunas: {list(df.columns)}")

# 2) Drop the junk column if present (errors="ignore" makes this a no-op
#    when the column does not exist)
df = df.drop(columns=["Unnamed: 0"], errors="ignore")

# 3) time: coerce to numeric, then blank out sentinel values (Day 1 rule)
# Entries that cannot be parsed as numbers become NaN.
df["time"] = pd.to_numeric(df["time"], errors="coerce")
sentinel_cutoff = 1e16
n_before = df["time"].isna().sum()

# Replace every value above the cutoff with NaN. Series.mask is used instead
# of `df.loc[mask, "time"] = np.nan` because writing NaN into a column that
# to_numeric left as an integer dtype is an incompatible-dtype setitem, which
# pandas deprecated in 2.1. mask() upcasts explicitly and yields the same
# values. Existing NaNs compare False against the cutoff and stay NaN.
df["time"] = df["time"].mask(df["time"] > sentinel_cutoff)
n_after = df["time"].isna().sum()

log(f"time NaNs antes: {n_before} | depois de sentinela: {n_after}")

# 4) Quick statistics of the cleaned time column
time_col = df["time"]
log(f"time min: {time_col.min()}")
log(f"time median: {time_col.median()}")
log(f"time max: {time_col.max()}")

# 5) Validate the failure label: event == "FAIL" versus failed == 1 (Day 1)
if "event" not in df.columns or "failed" not in df.columns:
    log("Colunas event/failed não encontradas; revise dataset.")
else:
    # Row-wise boolean masks for each definition of "failure"
    event_is_fail = df["event"] == "FAIL"
    flag_is_failed = df["failed"] == 1
    # Fraction of rows on which the two masks agree
    concord = (event_is_fail == flag_is_failed).mean()
    log(f"Concordância event=='FAIL' vs failed==1: {concord:.4f}")

# 6) Short report (markdown-friendly)
# Start with the always-available fields; optional ones default to None and
# are filled in only when the corresponding column exists.
summary = {
    "rows": len(df),
    "cols": len(df.columns),
    "time_nan": int(df["time"].isna().sum()),
    "failed_1_count": None,
    "event_top": None,
}
if "failed" in df.columns:
    summary["failed_1_count"] = int(df["failed"].eq(1).sum())
if "event" in df.columns:
    # Ten most frequent event values and their counts
    summary["event_top"] = df["event"].value_counts().iloc[:10].to_dict()

# Bare expression so the notebook displays the dict as the cell output
summary