In [1]:
# -*- coding: utf-8 -*-
# Robust accuracy graph + heatmap with auto-fix of labels.csv

# If needed once:
# !pip install pandas numpy scikit-learn matplotlib

import os, re
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import (
    confusion_matrix, accuracy_score, precision_recall_fscore_support,
    roc_auc_score, roc_curve, precision_recall_curve
)

# =========================
# CONFIG
# =========================
BASE_DIR     = r"C:\Users\sagni\Downloads\Resume Ranker"
SCORES_FULL  = str(Path(BASE_DIR) / "scores_full.csv")
LABELS_CSV   = str(Path(BASE_DIR) / "labels.csv")
ALIGNED_PREV = str(Path(BASE_DIR) / "labels_aligned_preview.csv")

# =========================
# Helpers
# =========================
def read_csv_robust(path: str) -> pd.DataFrame:
    """Read a CSV file, trying several text encodings with strict decoding.

    Each candidate encoding is decoded strictly, so a wrong guess raises and
    the next candidate gets its turn. (Decoding leniently would let the very
    first candidate "succeed" by silently dropping every byte it cannot
    decode, which corrupts names containing non-ASCII characters.)

    Args:
        path: Location of the CSV file.

    Returns:
        The DataFrame parsed with the first encoding that decodes and parses
        without error.

    Raises:
        UnicodeError, pandas.errors.ParserError: The error from the last
            candidate when no encoding works.
        Exception: Anything else raised by ``pandas.read_csv`` (missing file,
            empty file, ...) propagates immediately, since retrying with a
            different encoding cannot help.
    """
    # UTF-16 is attempted only when the file starts with a UTF-16 byte-order
    # mark: without one, nearly any even-length byte string "decodes" as
    # UTF-16 into garbage instead of failing.
    try:
        with open(path, "rb") as fh:
            bom = fh.read(2)
    except OSError:
        bom = b""  # let pandas report the underlying problem

    encodings = ["utf-8"]
    if bom in (b"\xff\xfe", b"\xfe\xff"):
        encodings.append("utf-16")
    # cp1252 rejects a few undefined bytes; latin-1 accepts every byte, so it
    # must come last as the catch-all.
    encodings += ["cp1252", "latin-1"]

    last_err = None
    for enc in encodings:
        try:
            return pd.read_csv(path, encoding=enc, engine="python")
        except (UnicodeError, pd.errors.ParserError) as err:
            last_err = err
    raise last_err

# Spellings (already lower-cased and stripped) recognised as the positive /
# negative class by normalize_bool.
_TRUE_LABELS = frozenset({
    "1", "true", "yes", "y", "selected", "hire", "hired", "positive", "pos",
    "shortlisted", "good", "relevant",
})
_FALSE_LABELS = frozenset({
    "0", "false", "no", "n", "rejected", "reject", "negative", "neg",
    "not selected", "bad", "irrelevant", "non-relevant",
})


def normalize_bool(val):
    """Map a free-form label value to 1, 0, or NaN.

    Args:
        val: A raw label cell: a number, a string such as "yes"/"rejected",
            or a missing value.

    Returns:
        1 or 0 for a recognised word or a numeric value (>= 0.5 counts as 1),
        and ``np.nan`` when the value is missing or cannot be interpreted.
    """
    if pd.isna(val):
        return np.nan
    s = str(val).strip().lower()
    if s in _TRUE_LABELS:
        return 1
    if s in _FALSE_LABELS:
        return 0
    try:
        f = float(s)
    except ValueError:
        # Not a known word and not a number: treat as unlabelled.
        return np.nan
    # float("nan") parses successfully but fails every comparison; without
    # this check the text "nan" would be reported as class 0.
    if np.isnan(f):
        return np.nan
    return 1 if f >= 0.5 else 0

def unify_name(s: str) -> str:
    """Return a join key for a name: lower-cased, keeping only a-z and 0-9.

    Missing values (None / NaN) produce an empty string. Any other value is
    converted with ``str()`` first, so non-string input is accepted.
    """
    if pd.isna(s):
        return ""
    lowered = str(s).lower()
    # Strip every run of characters outside [a-z0-9] (spaces, punctuation,
    # and also any non-ASCII letters).
    return re.sub(r"[^a-z0-9]+", "", lowered)

def write_template_from_scores(scores_df: pd.DataFrame, out_path: str):
    """Write a blank labelling template derived from the scores table.

    The file contains the ``id``, ``name`` and ``score`` columns of
    *scores_df* plus an empty ``label`` column (to be filled with 1/0 by
    hand), ordered from highest to lowest score. *scores_df* itself is not
    modified.

    Args:
        scores_df: Table holding at least ``id``, ``name`` and ``score``.
        out_path: Destination CSV path; written as UTF-8 without the index.
    """
    template = (
        scores_df.loc[:, ["id", "name", "score"]]
        .assign(label="")
        .sort_values(by="score", ascending=False)
        .reset_index(drop=True)
    )
    template.to_csv(out_path, index=False, encoding="utf-8")

# =========================
# 1) Load ALL scores
# =========================
# The scores file is the source of truth for ids; nothing can be done without it.
if not os.path.exists(SCORES_FULL):
    raise SystemExit(f"Missing {SCORES_FULL}. Run your scoring pipeline first.")

# Normalise header spelling (surrounding whitespace, case) before checking
# for the required columns.
scores = read_csv_robust(SCORES_FULL).rename(columns=lambda col: col.strip().lower())

need_cols = {"id","name","score"}
if need_cols - set(scores.columns):
    raise SystemExit(f"{SCORES_FULL} must contain columns {need_cols}. Found: {list(scores.columns)}")

# Keep only the three columns the rest of the cell relies on.
scores = scores.loc[:, ["id","name","score"]].copy()

# =========================
# 2) Load/Fix labels.csv
# =========================
labels_path = Path(LABELS_CSV)
if not labels_path.exists():
    # No labels file yet: seed one from the scores so every row carries a
    # valid id, then stop so the user can fill it in.
    write_template_from_scores(scores, LABELS_CSV)
    print(f"[TEMPLATE WRITTEN] → {LABELS_CSV}")
    print("Open it, fill the 'label' column with 1/0, save, then rerun this cell.")
    raise SystemExit()

labels = read_csv_robust(LABELS_CSV)
labels.columns = [c.strip().lower() for c in labels.columns]

# Accept a few common alternative spellings of the label column; the first
# one found is renamed to "label".
if "label" not in labels.columns:
    for alt in ["labels","target","y","class"]:
        if alt in labels.columns:
            labels = labels.rename(columns={alt:"label"})
            break

has_id = "id" in labels.columns

# A row counts as labelled only when its own cell is neither missing nor
# blank. The two conditions are combined per row: checked separately over
# the whole column, a missing cell (which stringifies to "nan") would pass
# the "not blank" test and a column of blanks + NaN would look labelled.
if "label" in labels.columns:
    has_any_label = bool(
        (labels["label"].notna() & (labels["label"].astype(str).str.strip() != "")).any()
    )
else:
    has_any_label = False

if not has_id:
    # Try to recover ids by joining on the normalized name.
    # NOTE(review): if several score rows share one normalized name, the left
    # merge yields one output row per match, so a label row can be duplicated.
    if "name" in labels.columns:
        labels["name_key"] = labels["name"].astype(str).map(unify_name)
        sc = scores.copy()
        sc["name_key"] = sc["name"].astype(str).map(unify_name)
        recovered = pd.merge(labels, sc[["name_key","id"]], how="left", on="name_key")
        if recovered["id"].notna().any():
            labels = recovered.drop(columns=["name_key"])
            has_id = True
            print("[FIX] Recovered 'id' in labels.csv by matching normalized names.")
    # Still no usable id: replace the file with a clean template and stop.
    if not has_id:
        write_template_from_scores(scores, LABELS_CSV)
        print(f"[TEMPLATE REWRITTEN] → {LABELS_CSV}")
        print("Reason: labels.csv had no 'id' column and couldn't recover via name match.")
        print("Fill 'label' for the rows you want to evaluate, then rerun this cell.")
        raise SystemExit()

# Ensure the label column exists; if nothing is labelled yet, rewrite the
# file as an id-aligned template and ask the user to fill it.
if "label" not in labels.columns:
    labels["label"] = ""
if not has_any_label:
    # Only the ids come from labels.csv; name and score are taken from the
    # scores table. Selecting just "id" here also keeps this working when
    # labels.csv has no "name" column.
    cleaned = labels[["id"]].merge(scores[["id","name","score"]], how="left", on="id")
    cleaned = cleaned[["id","name","score"]].copy()
    cleaned["label"] = ""
    cleaned.to_csv(LABELS_CSV, index=False, encoding="utf-8")
    print(f"[TEMPLATE UPDATED] → {LABELS_CSV}")
    print("Please fill 'label' (1/0) for the rows you want to evaluate, save, then rerun this cell.")
    raise SystemExit()

# =========================
# 3) Align by id (now guaranteed)
# =========================
# Convert the free-form labels to 0/1 and keep only rows that produced a
# usable value.
labels = labels.assign(label_bin=labels["label"].map(normalize_bool))
labels = labels.loc[labels["label_bin"].notna()].copy()
labels["label_bin"] = labels["label_bin"].astype(int)

# Inner join: only ids present in both the scores and the labels survive.
merged = scores.merge(labels[["id","label_bin"]], on="id", how="inner")
if merged.empty:
    # Show a sample of ids from each side to help spot why nothing matched.
    print("[DEBUG] Example IDs from scores_full:", scores["id"].iloc[:5].tolist())
    print("[DEBUG] Example IDs from labels.csv :", labels["id"].iloc[:5].tolist())
    raise SystemExit("No overlap after auto-fix. Make sure you labeled rows from the template generated here.")

# Persist the aligned rows so the join can be inspected by hand.
merged.to_csv(ALIGNED_PREV, index=False, encoding="utf-8")
print(f"[OK] Aligned {len(merged)} labeled rows. Preview → {ALIGNED_PREV}")

# =========================
# 4) Plot Accuracy/F1 vs Threshold + Confusion Matrix
# =========================
y_true  = merged["label_bin"].astype(int).to_numpy()
y_score = merged["score"].astype(float).to_numpy()

# Sweep 201 evenly spaced cut-offs over [0, 1]; a row is predicted positive
# when its score is at or above the cut-off.
thresholds = np.linspace(0.0, 1.0, 201)
accs, f1s = [], []
for t in thresholds:
    y_pred = np.where(y_score >= t, 1, 0)
    prf = precision_recall_fscore_support(y_true, y_pred, average="binary", zero_division=0)
    accs.append(accuracy_score(y_true, y_pred))
    f1s.append(prf[2])

# The "best" threshold is the first one reaching maximum accuracy; the F1
# reported next to it is the F1 at that same threshold, not the maximum F1.
best_idx = int(np.argmax(accs))
best_t, best_acc, best_f1 = (float(seq[best_idx]) for seq in (thresholds, accs, f1s))

fig, ax = plt.subplots(figsize=(8,5))
ax.plot(thresholds, accs, label="Accuracy")
ax.plot(thresholds, f1s, label="F1")
ax.axvline(best_t, linestyle="--", label=f"Best T={best_t:.3f}")
ax.set_title("Accuracy / F1 vs Threshold")
ax.set_xlabel("Threshold")
ax.set_ylabel("Score")
ax.legend()
ax.grid(True)
plt.show()

print(f"Best threshold: {best_t:.3f} | Accuracy={best_acc:.4f} | F1={best_f1:.4f}")

# Confusion Matrix @ best T
# confusion_matrix is already imported with the other sklearn metrics at the
# top of the cell, so no second import is needed here.
y_pred_best = np.where(y_score >= best_t, 1, 0)

# labels=[0,1] forces a 2x2 matrix even if one class is absent:
# rows are the actual class, columns the predicted class.
cm = confusion_matrix(y_true, y_pred_best, labels=[0,1])
(tn, fp), (fn, tp) = cm

class_names = ["0 (Neg)","1 (Pos)"]
fig, ax = plt.subplots(figsize=(5,4))
im = ax.imshow(cm, aspect="auto")
ax.set_title("Confusion Matrix (Heatmap)")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_xticks([0,1])
ax.set_xticklabels(class_names)
ax.set_yticks([0,1])
ax.set_yticklabels(class_names)
# Write each count in the centre of its cell (x = column, y = row).
for i, row in enumerate(cm):
    for j, v in enumerate(row):
        ax.text(j, i, str(v), ha='center', va='center', fontsize=12)
fig.colorbar(im, ax=ax)
fig.tight_layout()
plt.show()

print(f"TN={tn}  FP={fp}  FN={fn}  TP={tp}")

# Optional: ROC & PR
# Best-effort extras: any failure in here (for example the metric functions
# rejecting the data) is reported and skipped rather than aborting the cell.
try:
    # Compute everything first so that nothing is drawn if a metric fails.
    auc = roc_auc_score(y_true, y_score)
    fpr, tpr, _ = roc_curve(y_true, y_score)
    prec, rec, _ = precision_recall_curve(y_true, y_score)

    fig, ax = plt.subplots(figsize=(6,5))
    ax.plot(fpr, tpr, label=f"AUC = {auc:.3f}")
    ax.plot([0,1],[0,1], linestyle="--")  # diagonal reference line
    ax.set_title("ROC Curve")
    ax.set_xlabel("FPR")
    ax.set_ylabel("TPR")
    ax.legend()
    ax.grid(True)
    plt.show()

    fig, ax = plt.subplots(figsize=(6,5))
    ax.plot(rec, prec)
    ax.set_title("Precision-Recall Curve")
    ax.set_xlabel("Recall")
    ax.set_ylabel("Precision")
    ax.grid(True)
    plt.show()
except Exception as e:
    print("Skipped ROC/PR:", e)


[TEMPLATE UPDATED] → C:\Users\sagni\Downloads\Resume Ranker\labels.csv
Please fill 'label' (1/0) for the rows you want to evaluate, save, then rerun this cell.


SystemExit: 

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
