In [2]:
# Creates a labels template at: C:\Users\sagni\Downloads\Resume Ranker\labels.csv
# Columns: id,name,source,score,label   (fill label with 1/0 and save)

import pandas as pd
from pathlib import Path
import re

# ---- paths (same as your config) ----
# Project root; the labels template is written directly inside it.
BASE_DIR = r"C:\Users\sagni\Downloads\Resume Ranker"
# Folder scanned recursively for resume files.
PDF_DIR = r"C:\Users\sagni\Downloads\Resume Ranker\archive (1)\data\data"
# Tabular resume dataset.
CSV_PATH = r"C:\Users\sagni\Downloads\Resume Ranker\archive (1)\Resume\Resume.csv"
# Output file produced by this cell.
LABELS_CSV = str(Path(BASE_DIR).joinpath("labels.csv"))

# ---- light loaders reused from earlier ----
def read_csv_robust(path: str) -> pd.DataFrame:
    """Read a CSV file, trying several text encodings until one works.

    Each candidate encoding is tried with strict decoding so that a wrong
    guess fails and the next candidate gets a chance. (Decoding with
    ``errors="ignore"`` would let the first candidate "succeed" on any
    file by silently dropping the bytes it cannot decode.)

    Candidates, in order: ``utf-16`` (only when the file starts with a
    UTF-16 byte-order mark), ``utf-8``, ``cp1252``, ``latin-1``. latin-1
    maps every byte to a character, so it acts as the decoding catch-all.

    Args:
        path: Location of the CSV file.

    Returns:
        The parsed DataFrame from the first candidate that succeeds.

    Raises:
        Exception: The error raised by the last candidate when none of
            them could read the file (e.g. missing file, malformed CSV).
    """
    import codecs

    # Peek at the first two bytes to spot a UTF-16 BOM. UTF-16 is not tried
    # blindly because arbitrary bytes often "decode" as UTF-16 into garbage.
    try:
        with open(path, "rb") as fh:
            head = fh.read(2)
    except (OSError, TypeError):
        # Not a readable local file; let pandas report the real problem.
        head = b""

    encodings = ["utf-8", "cp1252", "latin-1"]
    if head in (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE):
        encodings.insert(0, "utf-16")

    last_err = None
    for enc in encodings:
        try:
            return pd.read_csv(path, encoding=enc, engine="python")
        except Exception as err:  # best effort: remember and try the next one
            last_err = err
    raise last_err

def clean_text(text: str) -> str:
    """Normalize text: collapse whitespace runs, trim the ends, lowercase.

    Falsy input (``None`` or an empty string) yields an empty string.
    """
    import re

    if not text:
        return ""
    single_spaced = re.sub(r"\s+", " ", text)
    return single_spaced.strip().lower()

def load_csv_resumes(csv_path: str) -> pd.DataFrame:
    """Load resumes from a CSV file into a uniform table.

    The resume text is taken from the first matching entry of a priority
    list of known column names; if none match, all string-like columns are
    joined. The display name comes from the first matching name-like
    column, otherwise a ``csv_resume_<i>`` placeholder is used.

    Missing cells are treated as empty text rather than the literal string
    ``"nan"``, and rows with a blank name fall back to the placeholder.

    Args:
        csv_path: Location of the CSV file.

    Returns:
        DataFrame with columns ``id`` (``csv_<i>``), ``name``, ``source``
        (always ``"csv"``) and ``text_raw``. An empty frame with those
        columns is returned when the file is missing, parses to nothing,
        or has no usable text column.
    """
    from pathlib import Path

    out_columns = ["id", "name", "source", "text_raw"]
    if not Path(csv_path).exists():
        return pd.DataFrame(columns=out_columns)
    df = read_csv_robust(csv_path)
    if df.empty:
        return pd.DataFrame(columns=out_columns)
    df.columns = [str(c).strip() for c in df.columns]

    def as_text(values):
        # Stringify, but blank out missing cells so they do not turn into
        # the text "nan" (or stay missing and break the join below).
        return values.astype(str).mask(values.isna(), "")

    # NOTE(review): ["Resume"] matches before the capitalized four-column
    # combination can, so that combination never wins. Order kept as-is to
    # preserve existing output -- confirm which was intended.
    possible_text_cols_priority = [
        ["Resume"], ["Resume_str"], ["resume_text"],
        ["Resume", "skills", "education", "experience"],
        ["resume", "skills", "education", "experience"],
    ]
    text = None
    for cols in possible_text_cols_priority:
        if all(c in df.columns for c in cols):
            text = as_text(df[cols]).agg(" ".join, axis=1)
            break
    if text is None:
        # Accept dedicated string dtypes as well as plain object columns;
        # an object-only check misses them.
        is_obj = pd.api.types.is_object_dtype
        is_str = pd.api.types.is_string_dtype
        str_cols = [
            c for c in df.columns
            if is_obj(df[c].dtype) or is_str(df[c].dtype)
        ]
        if not str_cols:
            return pd.DataFrame(columns=out_columns)
        text = as_text(df[str_cols]).agg(" ".join, axis=1)

    # Placeholder names share df's index so they align with `text`.
    placeholder = pd.Series(
        [f"csv_resume_{i}" for i in range(len(df))], index=df.index
    )
    name = placeholder
    for nc in ["Name", "Candidate Name", "name", "full_name", "title"]:
        if nc in df.columns:
            name = as_text(df[nc])
            name = name.where(name.str.strip() != "", placeholder)
            break

    return pd.DataFrame({
        "id": [f"csv_{i}" for i in range(len(df))],
        "name": name,
        "source": "csv",
        "text_raw": text,
    })

def load_pdf_dir_resumes(pdf_dir: str) -> pd.DataFrame:
    """List resume files (.pdf, .docx, .doc) found under a directory tree.

    Only file names are inspected; file contents are not read, so
    ``text_raw`` is always an empty string.

    Args:
        pdf_dir: Root directory to scan recursively.

    Returns:
        DataFrame with columns ``id`` (``file_<n>``, numbered in scan
        order), ``name`` (file stem restricted to letters, digits, dot,
        underscore, hyphen and space, at most 80 characters), ``source``
        (lower-cased extension without the dot) and ``text_raw``. The
        columns are present even when the directory is missing or holds
        no matching files.
    """
    from pathlib import Path

    # The progress bar is cosmetic; fall back to plain iteration without it.
    try:
        from tqdm import tqdm
    except ImportError:
        def tqdm(iterable, **_kwargs):
            return iterable

    columns = ["id", "name", "source", "text_raw"]
    root = Path(pdf_dir)
    if not root.exists():
        return pd.DataFrame(columns=columns)

    wanted = (".pdf", ".docx", ".doc")
    records = []
    for file in tqdm(list(root.rglob("*")), desc="Scanning resumes"):
        if not file.is_file():
            continue
        ext = file.suffix.lower()
        if ext not in wanted:
            continue
        records.append({
            "id": f"file_{len(records)}",
            "name": re.sub(r"[^A-Za-z0-9._\- ]+", "", file.stem).strip()[:80],
            "source": ext.lstrip("."),
            "text_raw": "",  # we don't need full text just to make labels template
        })
    # Passing `columns` keeps the schema stable when nothing matched, so
    # callers can still select ["id", "name", "source"].
    return pd.DataFrame.from_records(records, columns=columns)

# ---- build candidate list to label ----
csv_df  = load_csv_resumes(CSV_PATH)
file_df = load_pdf_dir_resumes(PDF_DIR)

# Keep only the identifying columns. reindex() tolerates a loader result that
# lacks them, and empty frames are skipped so they cannot skew concat dtypes.
id_cols = ["id", "name", "source"]
parts = [df.reindex(columns=id_cols) for df in (csv_df, file_df) if not df.empty]
if not parts:
    raise SystemExit("No resumes found. Check your paths.")
resumes = pd.concat(parts, ignore_index=True)

# If you already produced scores earlier and saved rankings.csv, we can include those scores for reference.
rankings_path = Path(BASE_DIR) / "rankings.csv"
if rankings_path.exists():
    try:
        rnk = pd.read_csv(rankings_path)
        # Join by id when available, otherwise by name.
        key = next((k for k in ("id", "name") if k in rnk.columns), None)
        if key is None or "score" not in rnk.columns:
            print(f"[WARN] {rankings_path} needs a 'score' column plus 'id' or 'name'; scores skipped.")
        else:
            # One score per key, so the left join cannot multiply template rows.
            scores = rnk[[key, "score"]].drop_duplicates(subset=key)
            resumes = resumes.merge(scores, how="left", on=key)
    except (OSError, ValueError, KeyError) as err:
        # Scores are optional: report the problem and continue without them.
        print(f"[WARN] Could not use scores from {rankings_path}: {err}")

# Add empty label column for you to fill (1/0)
if "score" not in resumes.columns:
    resumes["score"] = None
resumes["label"] = ""   # fill with 1 or 0

# Save template (sorted by score desc if available)
if resumes["score"].notna().any():
    resumes = resumes.sort_values("score", ascending=False)

resumes.to_csv(LABELS_CSV, index=False, encoding="utf-8")
print(f"[OK] Wrote labels template with {len(resumes)} rows → {LABELS_CSV}")
print("Open it, fill the 'label' column with 1 (positive) or 0 (negative), save, then re-run your accuracy/heatmap cell.")


Scanning resumes: 100%|█████████████████████████████████████████████████████████████████████████████████████████████| 2508/2508 [00:00<00:00, 10599.45it/s]


[OK] Wrote labels template with 4968 rows → C:\Users\sagni\Downloads\Resume Ranker\labels.csv
Open it, fill the 'label' column with 1 (positive) or 0 (negative), save, then re-run your accuracy/heatmap cell.
