# In [1]:
# -*- coding: utf-8 -*-
# If needed once:
# !pip install pandas numpy scikit-learn pdfminer.six docx2txt tqdm matplotlib

import os, re, warnings
from pathlib import Path
import numpy as np
import pandas as pd
from tqdm import tqdm

import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import (
    confusion_matrix, accuracy_score, precision_recall_fscore_support,
    roc_auc_score, roc_curve, precision_recall_curve
)

from pdfminer.high_level import extract_text as pdf_extract_text
try:
    import docx2txt
    HAS_DOCX = True
except Exception:
    HAS_DOCX = False
    warnings.warn("docx2txt not installed; DOCX parsing will be skipped.")

# =========================
# CONFIG — edit as needed
# =========================
# Root folder of the project. Every other path below is derived from it, so
# relocating the project only requires editing this one line.
BASE_DIR   = r"C:\Users\sagni\Downloads\Resume Ranker"
_ARCHIVE_DIR = Path(BASE_DIR) / "archive (1)"

# Folder scanned recursively for PDF/DOCX resumes.
PDF_DIR    = str(_ARCHIVE_DIR / "data" / "data")
# CSV file holding resume text.
CSV_PATH   = str(_ARCHIVE_DIR / "Resume" / "Resume.csv")
# Ground-truth labels; the script exits if this file is missing.
LABELS_CSV = str(Path(BASE_DIR) / "labels.csv")
RANKINGS_CSV = str(Path(BASE_DIR) / "rankings.csv")  # optional fast-path if exists

# Job description that every resume is scored against.
JD_TEXT = ("Looking for an ML/Data Scientist with strong Python, NLP, TensorFlow or PyTorch, "
           "Docker/Kubernetes, and cloud (AWS/GCP/Azure). Experience with MLOps a plus.")

# =========================
# Helpers
# =========================
def read_csv_robust(path: str) -> pd.DataFrame:
    """Read a CSV file whose text encoding is not known in advance.

    Encodings are tried with *strict* decoding so that a wrong guess fails
    and the next candidate is attempted. (Decoding leniently with the first
    candidate never raises, which would make every fallback unreachable and
    silently mangle non-UTF-8 files.)

    Candidate order:
      1. ``utf-8``
      2. ``utf-16`` -- only when the file starts with a UTF-16 byte-order
         mark, because arbitrary bytes frequently "decode" as UTF-16 garbage
      3. ``cp1252``
      4. ``latin-1`` -- maps every byte, so decoding itself cannot fail

    Args:
        path: Location of the CSV file.

    Returns:
        The parsed DataFrame from the first encoding that succeeds.

    Raises:
        Exception: The error from the last attempted encoding when every
            attempt fails (e.g. missing file, empty file, malformed CSV).
    """
    # Sniff the first two bytes for a UTF-16 BOM. If the path cannot be opened
    # here, pandas reports the real error on the first read attempt below.
    try:
        with open(path, "rb") as fh:
            head = fh.read(2)
    except (OSError, TypeError, ValueError):
        head = b""

    encodings = ["utf-8"]
    if head in (b"\xff\xfe", b"\xfe\xff"):
        encodings.append("utf-16")
    encodings += ["cp1252", "latin-1"]

    last_err = None
    for enc in encodings:
        try:
            return pd.read_csv(path, encoding=enc, engine="python")
        except Exception as err:  # deliberate: try the next encoding
            last_err = err
    raise last_err

def clean_text(text: str) -> str:
    """Normalise text for vectorisation.

    Falsy input (``None``, ``""``) is treated as an empty string. Every run
    of whitespace is collapsed to a single space, surrounding whitespace is
    removed, and the result is lower-cased.
    """
    if not text:
        text = ""
    collapsed = re.sub(r"\s+", " ", text)
    return collapsed.strip().lower()

def extract_text_from_pdf(pdf_path: Path) -> str:
    """Extract the text of a PDF, best-effort.

    Args:
        pdf_path: Path of the PDF file; it is passed to pdfminer's
            ``extract_text`` as a string.

    Returns:
        The extracted text, or ``""`` when extraction fails or yields nothing.
        A failure is reported through ``warnings.warn`` rather than being
        dropped silently, so unparseable files remain visible to the user.
    """
    try:
        return pdf_extract_text(str(pdf_path)) or ""
    except Exception as err:  # best-effort: one bad file must not stop the batch
        warnings.warn(f"Could not extract text from PDF '{pdf_path}': {err}")
        return ""

def extract_text_from_docx(docx_path: Path) -> str:
    """Extract the text of a DOCX file, best-effort.

    Args:
        docx_path: Path of the document; it is passed to ``docx2txt.process``
            as a string.

    Returns:
        The extracted text, or ``""`` when ``docx2txt`` is unavailable
        (``HAS_DOCX`` is False), when extraction fails, or when it yields
        nothing. A failure is reported through ``warnings.warn`` rather than
        being dropped silently.
    """
    if not HAS_DOCX:
        return ""
    try:
        return docx2txt.process(str(docx_path)) or ""
    except Exception as err:  # best-effort: one bad file must not stop the batch
        warnings.warn(f"Could not extract text from document '{docx_path}': {err}")
        return ""

def safe_stem_filename(name: str, max_len: int = 80) -> str:
    """Reduce *name* to a conservative, length-limited label.

    Every character other than ASCII letters, digits, ``.``, ``_``, ``-`` and
    space is removed, surrounding whitespace is stripped, and the result is
    cut to at most *max_len* characters.
    """
    cleaned = re.sub(r"[^A-Za-z0-9._\- ]+", "", name)
    cleaned = cleaned.strip()
    if len(cleaned) <= max_len:
        return cleaned
    return cleaned[:max_len]

def load_csv_resumes(csv_path: str) -> pd.DataFrame:
    """Load resumes stored as rows of a CSV file.

    Args:
        csv_path: Location of the CSV file.

    Returns:
        A DataFrame with columns ``id`` (``csv_<row position>``), ``name``,
        ``source`` (always ``"csv"``) and ``text_raw``. The frame is empty,
        with the same columns, when the file does not exist, has no rows, or
        has no textual column to use.
    """
    out_cols = ["id", "name", "source", "text_raw"]
    if not Path(csv_path).exists():
        return pd.DataFrame(columns=out_cols)
    df = read_csv_robust(csv_path)
    if df.empty:
        return pd.DataFrame(columns=out_cols)
    df.columns = [c.strip() for c in df.columns]

    def _join_columns(cols):
        # Missing cells contribute "" instead of the literal token "nan",
        # which would otherwise leak into the text that gets vectorised.
        block = df[cols]
        return block.astype(str).where(block.notna(), "").agg(" ".join, axis=1)

    # Richest combinations first: a bare ["Resume"] entry placed ahead of them
    # would always match first and make the combined variants unreachable.
    possible_text_cols_priority = [
        ["Resume", "skills", "education", "experience"],
        ["resume", "skills", "education", "experience"],
        ["Resume"],
        ["Resume_str"],
        ["resume_text"],
    ]
    text = None
    for cols in possible_text_cols_priority:
        if all(c in df.columns for c in cols):
            text = _join_columns(cols)
            break
    if text is None:
        # Fallback: concatenate every textual column. Both the classic object
        # dtype and pandas' dedicated string dtypes count as textual.
        str_cols = [
            c for c in df.columns
            if df[c].dtype == "object" or pd.api.types.is_string_dtype(df[c].dtype)
        ]
        if not str_cols:
            return pd.DataFrame(columns=out_cols)
        text = _join_columns(str_cols)

    name = None
    for nc in ["Name", "Candidate Name", "name", "full_name", "title"]:
        if nc in df.columns:
            name = df[nc].astype(str)
            break
    if name is None:
        # Share df's index so the synthetic names line up with `text` below.
        name = pd.Series([f"csv_resume_{i}" for i in range(len(df))], index=df.index)

    out = pd.DataFrame({
        "id": [f"csv_{i}" for i in range(len(df))],
        "name": name,
        "source": "csv",
        "text_raw": text,
    })
    return out

def load_pdf_dir_resumes(pdf_dir: str) -> pd.DataFrame:
    """Parse every PDF/DOC/DOCX file found recursively under *pdf_dir*.

    Args:
        pdf_dir: Directory to scan.

    Returns:
        A DataFrame with columns ``id``, ``name``, ``source`` and
        ``text_raw``. ``id`` is ``file_<k>``, where ``k`` counts the files
        kept so far in traversal order; ``name`` is the sanitised file stem;
        ``source`` is the lower-cased extension without the dot. Files whose
        extracted text is blank are skipped. The frame always carries these
        four columns, including when the directory is missing or nothing
        usable is found.
    """
    out_cols = ["id", "name", "source", "text_raw"]
    p = Path(pdf_dir)
    if not p.exists():
        return pd.DataFrame(columns=out_cols)
    records = []
    # The listing is materialised so tqdm can show a total.
    for file in tqdm(list(p.rglob("*")), desc="Parsing files"):
        if not file.is_file():
            continue
        ext = file.suffix.lower()
        if ext == ".pdf":
            text = extract_text_from_pdf(file)
        elif ext in (".docx", ".doc"):
            text = extract_text_from_docx(file)
        else:
            continue
        if not text.strip():
            continue
        records.append({
            "id": f"file_{len(records)}",
            "name": safe_stem_filename(file.stem),
            "source": ext.lstrip("."),
            "text_raw": text,
        })
    # Explicit columns keep the schema stable even when `records` is empty.
    return pd.DataFrame(records, columns=out_cols)

def compute_scores_tfidf(texts: pd.Series, jd_text: str) -> np.ndarray:
    """Score each resume text by TF-IDF cosine similarity to a job description.

    The cleaned job description is appended to the resume texts and a single
    vectoriser (unigrams and bigrams, English stop words, at most 50000
    features) is fitted on the combined corpus. The result holds one
    similarity value per entry of *texts*, in the same order.
    """
    from sklearn.metrics.pairwise import cosine_similarity

    corpus = texts.tolist()
    corpus.append(clean_text(jd_text))

    vectorizer = TfidfVectorizer(max_features=50000, ngram_range=(1,2), stop_words="english")
    tfidf = vectorizer.fit_transform(corpus)

    # The job description is the last row; everything before it is a resume.
    n_resumes = tfidf.shape[0] - 1
    resume_rows = tfidf[:n_resumes]
    jd_row = tfidf[n_resumes]

    similarities = cosine_similarity(resume_rows, jd_row)
    return similarities[:, 0]

def normalize_bool(val):
    """Map a free-form label value to 1, 0 or NaN.

    Args:
        val: A raw label cell (string, number, bool, or missing).

    Returns:
        ``1`` for recognised positive words, ``0`` for recognised negative
        words. Other values are parsed as a number and thresholded at 0.5.
        ``np.nan`` is returned for missing values, for text that is neither a
        known word nor a number, and for text that parses to NaN (such as the
        string ``"nan"``), so those rows can be dropped rather than being
        counted as negatives.
    """
    if pd.isna(val):
        return np.nan
    s = str(val).strip().lower()
    true_vals  = {"1","true","yes","y","selected","hire","hired","positive","pos","shortlisted","good","relevant"}
    false_vals = {"0","false","no","n","rejected","reject","negative","neg","not selected","bad","irrelevant","non-relevant"}
    if s in true_vals:
        return 1
    if s in false_vals:
        return 0
    try:
        f = float(s)
    except ValueError:
        return np.nan
    if np.isnan(f):
        # float("nan") compares False against 0.5 and would otherwise yield 0.
        return np.nan
    return 1 if f >= 0.5 else 0

def unify_name(s: str) -> str:
    """Build a join key from a name: lower-case, ASCII letters and digits only.

    Args:
        s: The name. ``None`` and float NaN yield ``""``; any other
            non-string value (e.g. a numeric id) is converted with ``str``
            first instead of raising.

    Returns:
        The lower-cased name with every character outside ``a-z0-9`` removed.
    """
    if s is None:
        return ""
    if not isinstance(s, str):
        if isinstance(s, float) and np.isnan(s):
            return ""
        s = str(s)
    return re.sub(r"[^a-z0-9]+", "", s.lower())

# =========================
# Load labels
# =========================
# The labels file is mandatory: stop early with a readable message if absent.
labels_file = Path(LABELS_CSV)
if not labels_file.exists():
    raise SystemExit("labels.csv not found. Create it first, then rerun.")

labels = read_csv_robust(LABELS_CSV)
if labels.empty:
    raise SystemExit("labels.csv is empty. Please fill labels and save.")

# Header matching below is case- and whitespace-insensitive.
labels.columns = [c.strip().lower() for c in labels.columns]

# Join key: 'id' is preferred, 'name' is the fallback.
key_col = None
for key_candidate in ("id", "name"):
    if key_candidate in labels.columns:
        key_col = key_candidate
        break
if key_col is None:
    raise SystemExit("labels.csv must have 'id' or 'name' column.")

# Label column: the first recognised header, in this order of preference.
label_col = next(
    (
        cand
        for cand in ["label","labels","target","y","class","gt","ground_truth","selected","hired","suitable"]
        if cand in labels.columns
    ),
    None,
)
if label_col is None:
    raise SystemExit("labels.csv needs a label column (e.g., 'label').")

# Convert raw labels to 0/1 and discard rows that could not be interpreted.
labels["label_bin"] = labels[label_col].map(normalize_bool)
labels = labels[labels["label_bin"].notna()].copy()
labels["label_bin"] = labels["label_bin"].astype(int)

# =========================
# Get/compute scores
# =========================
# Fast path: reuse rankings.csv when present and readable. Any failure while
# reading it simply falls through to recomputing the scores.
scores_df = None
if Path(RANKINGS_CSV).exists():
    try:
        rankings = read_csv_robust(RANKINGS_CSV)
        rankings.columns = [c.strip().lower() for c in rankings.columns]
        # keep minimal
        wanted = [c for c in ["id","name","score","source"] if c in rankings.columns]
        scores_df = rankings[wanted].copy()
    except Exception:
        scores_df = None

have_scores = (
    scores_df is not None
    and "score" in scores_df.columns
    and scores_df["score"].notna().any()
)
if not have_scores:
    # fallback: compute scores (slower; parses PDFs/DOCX)
    print("Computing scores from raw resumes (this may take a while)...")
    csv_df  = load_csv_resumes(CSV_PATH)
    file_df = load_pdf_dir_resumes(PDF_DIR)
    resumes = pd.concat([csv_df, file_df], ignore_index=True)
    if resumes.empty:
        raise SystemExit("No resumes found to compute scores.")
    resumes["text"] = resumes["text_raw"].astype(str).map(clean_text)
    resumes["score"] = compute_scores_tfidf(resumes["text"], JD_TEXT)
    scores_df = resumes[["id","name","score","source"]].copy()

# =========================
# Align labels & scores
# =========================
# The join key must exist on the scores side too (a rankings.csv may lack it).
if key_col not in scores_df.columns:
    raise SystemExit(f"Scores have no '{key_col}' column to join the labels on.")

if key_col == "id":
    # Compare ids as stripped strings: a labels file whose ids were parsed as
    # integers cannot be merged with string ids such as "csv_0" otherwise.
    join_key = "id"
    tmp_lab = labels[["id", "label_bin"]].copy()
    tmp_sc  = scores_df.copy()
    tmp_lab["id"] = tmp_lab["id"].astype(str).str.strip()
    tmp_sc["id"]  = tmp_sc["id"].astype(str).str.strip()
    merged = pd.merge(tmp_lab, tmp_sc, how="inner", on="id")
else:
    # name-based join with normalization for safety
    join_key = "name_key"
    tmp_lab = labels.copy()
    tmp_lab["name_key"] = tmp_lab["name"].astype(str).map(unify_name)
    tmp_sc  = scores_df.copy()
    tmp_sc["name_key"] = tmp_sc["name"].astype(str).map(unify_name)
    # Only carry over the score-side columns that actually exist.
    sc_cols = ["name_key"] + [c for c in ["name", "score", "id", "source"] if c in tmp_sc.columns]
    merged = pd.merge(tmp_lab[["name_key", "label_bin"]], tmp_sc[sc_cols], how="inner", on="name_key")

if merged.empty:
    # Show a few keys from each side so the mismatch can be diagnosed.
    print("Sample label keys:", tmp_lab[join_key].head(5).tolist())
    print("Sample score keys:", tmp_sc[join_key].head(5).tolist())
    raise SystemExit("No overlap between labels and scores. Check IDs/names.")

# =========================
# Metrics & Plots
# =========================
# Ground truth and predicted scores as plain NumPy arrays.
y_true = merged["label_bin"].astype(int).values
y_score = merged["score"].astype(float).values

# Sweep 201 evenly spaced thresholds over [0, 1]; a resume is predicted
# positive when its score is at or above the threshold.
thresholds = np.linspace(0.0, 1.0, 201)
preds_per_threshold = [(y_score >= cut).astype(int) for cut in thresholds]
accs = [accuracy_score(y_true, pred) for pred in preds_per_threshold]
f1s = [
    precision_recall_fscore_support(y_true, pred, average="binary", zero_division=0)[2]
    for pred in preds_per_threshold
]

# The "best" threshold is the one with the highest accuracy; the F1 reported
# alongside it is the F1 at that same threshold.
best_idx = int(np.argmax(accs))
best_t = float(thresholds[best_idx])
best_acc = float(accs[best_idx])
best_f1 = float(f1s[best_idx])

plt.figure(figsize=(8,5))
for curve, curve_label in ((accs, "Accuracy"), (f1s, "F1")):
    plt.plot(thresholds, curve, label=curve_label)
plt.axvline(best_t, linestyle="--", label=f"Best T={best_t:.3f}")
plt.title("Accuracy / F1 vs Threshold")
plt.xlabel("Threshold")
plt.ylabel("Score")
plt.legend()
plt.grid(True)
plt.show()

print(f"Best threshold: {best_t:.3f} | Accuracy={best_acc:.4f} | F1={best_f1:.4f}")

# Confusion matrix at best threshold
# Confusion matrix at the accuracy-maximising threshold.
from sklearn.metrics import confusion_matrix
y_pred_best = (y_score >= best_t).astype(int)
cm = confusion_matrix(y_true, y_pred_best, labels=[0,1])
# labels=[0,1] fixes the layout: rows are actual, columns are predicted.
(tn, fp), (fn, tp) = cm

class_ticks = ["0 (Neg)","1 (Pos)"]

plt.figure(figsize=(5,4))
plt.imshow(cm, aspect="auto")
plt.title("Confusion Matrix (Heatmap)")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.xticks([0,1], class_ticks)
plt.yticks([0,1], class_ticks)
# Write each count in the centre of its cell (x = column, y = row).
for row in range(cm.shape[0]):
    for col in range(cm.shape[1]):
        plt.text(col, row, str(cm[row, col]), ha='center', va='center', fontsize=12)
plt.colorbar()
plt.tight_layout()
plt.show()

print(f"TN={tn}  FP={fp}  FN={fn}  TP={tp}")

# Optional: ROC & PR curves
# Optional ROC and precision-recall curves. Any error raised while computing
# or drawing them is reported and skipped instead of aborting the run.
try:
    auc = roc_auc_score(y_true, y_score)
    fpr, tpr = roc_curve(y_true, y_score)[:2]
    prec, rec = precision_recall_curve(y_true, y_score)[:2]

    # ROC curve with the chance diagonal for reference.
    plt.figure(figsize=(6,5))
    plt.plot(fpr, tpr, label=f"AUC = {auc:.3f}")
    plt.plot([0,1],[0,1], linestyle="--")
    plt.xlabel("FPR")
    plt.ylabel("TPR")
    plt.title("ROC Curve")
    plt.legend()
    plt.grid(True)
    plt.show()

    # Precision-recall curve.
    plt.figure(figsize=(6,5))
    plt.plot(rec, prec)
    plt.xlabel("Recall")
    plt.ylabel("Precision")
    plt.title("Precision-Recall Curve")
    plt.grid(True)
    plt.show()
except Exception as e:
    print("Skipped ROC/PR (need both positive & negative labels):", e)


# Captured cell output:
# SystemExit: No overlap between labels and scores. Check IDs/names.
#
#   warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
