In [2]:
# -*- coding: utf-8 -*-
# If needed, install once:
# !pip install pandas numpy scikit-learn pdfminer.six docx2txt PyYAML h5py tqdm

import os, re, json, yaml, h5py, warnings
from pathlib import Path
from datetime import datetime

import numpy as np
import pandas as pd
from tqdm import tqdm

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle

from pdfminer.high_level import extract_text as pdf_extract_text
try:
    import docx2txt
    HAS_DOCX = True
except Exception:
    HAS_DOCX = False
    warnings.warn("docx2txt not installed; DOCX parsing will be skipped.")

# =========================
# CONFIG — edit as needed
# =========================
BASE_DIR = r"C:\Users\sagni\Downloads\Resume Ranker"
PDF_DIR  = r"C:\Users\sagni\Downloads\Resume Ranker\archive (1)\data\data"
CSV_PATH = r"C:\Users\sagni\Downloads\Resume Ranker\archive (1)\Resume\Resume.csv"

JD_TEXT = ("Looking for an ML/Data Scientist with strong Python, NLP, TensorFlow or PyTorch, "
           "Docker/Kubernetes, and cloud (AWS/GCP/Azure). Experience with MLOps a plus.")
TOP_K = 20  # how many top results to save

ARTIFACT_DIR = BASE_DIR
Path(ARTIFACT_DIR).mkdir(parents=True, exist_ok=True)

# =========================
# Lightweight Skills DB
# =========================
SKILLS = {
    "programming": ["python","c++","java","c","c#","javascript","typescript","go","rust","scala","matlab","sql"],
    "data_ml": ["machine learning","deep learning","nlp","computer vision","tensorflow","pytorch","keras",
                "scikit-learn","pandas","numpy","opencv","transformers","bert","xgboost","lightgbm","catboost"],
    "cloud_devops": ["aws","gcp","azure","docker","kubernetes","ci/cd","jenkins","terraform","mlops","sagemaker","vertex ai"],
    "tools": ["git","linux","bash","jira","tableau","power bi","airflow","dbt","spark","hadoop","snowflake","bigquery","redshift"],
    "web": ["react","node","flask","django","streamlit","fastapi","html","css"]
}
SKILL_FLAT = sorted({s.lower() for v in SKILLS.values() for s in v})

# =========================
# Helpers
# =========================
def read_text_file(path: Path) -> str:
    try:
        return Path(path).read_text(encoding="utf-8", errors="ignore")
    except Exception:
        try:
            return Path(path).read_text(encoding="latin-1", errors="ignore")
        except Exception:
            return ""

def clean_text(text: str) -> str:
    text = text or ""
    text = re.sub(r"\s+", " ", text).strip().lower()
    return text

def extract_text_from_pdf(pdf_path: Path) -> str:
    try:
        return pdf_extract_text(str(pdf_path))
    except Exception:
        return ""

def extract_text_from_docx(docx_path: Path) -> str:
    if not HAS_DOCX:
        return ""
    try:
        return docx2txt.process(str(docx_path)) or ""
    except Exception:
        return ""

def extract_skills(text: str) -> list:
    txt = (text or "").lower()
    found = [s for s in SKILL_FLAT if s in txt]
    return sorted(list(set(found)))

def safe_stem_filename(name: str, max_len: int = 80) -> str:
    s = re.sub(r"[^A-Za-z0-9._\- ]+", "", name).strip()
    return s[:max_len] if len(s) > max_len else s

def read_csv_robust(path: str) -> pd.DataFrame:
    """Try multiple encodings and pandas kw combos for maximum compatibility."""
    encodings = ["utf-8", "utf-16", "latin-1", "cp1252"]
    last_err = None
    for enc in encodings:
        # Try with encoding_errors first (newer pandas)
        try:
            return pd.read_csv(path, encoding=enc, encoding_errors="ignore", engine="python")
        except TypeError:
            # Older pandas: no encoding_errors kw
            try:
                return pd.read_csv(path, encoding=enc, engine="python")
            except Exception as e2:
                last_err = e2
        except Exception as e:
            last_err = e
    if last_err:
        raise last_err
    # Fallback empty
    return pd.DataFrame()

def load_csv_resumes(csv_path: str) -> pd.DataFrame:
    if not Path(csv_path).exists():
        return pd.DataFrame(columns=["id","name","source","text_raw"])
    df = read_csv_robust(csv_path)
    if df.empty:
        return pd.DataFrame(columns=["id","name","source","text_raw"])

    df.columns = [c.strip() for c in df.columns]

    possible_text_cols_priority = [
        ["Resume"],
        ["Resume_str"],
        ["resume_text"],
        ["Resume","skills","education","experience"],
        ["resume","skills","education","experience"]
    ]

    text = None
    for cols in possible_text_cols_priority:
        if all(c in df.columns for c in cols):
            text = df[cols].astype(str).agg(" ".join, axis=1)
            break

    if text is None:
        str_cols = [c for c in df.columns if df[c].dtype == "object"]
        if not str_cols:
            return pd.DataFrame(columns=["id","name","source","text_raw"])
        text = df[str_cols].astype(str).agg(" ".join, axis=1)

    name = None
    for nc in ["Name","Candidate Name","name","full_name","title"]:
        if nc in df.columns:
            name = df[nc].astype(str)
            break
    if name is None:
        name = pd.Series([f"csv_resume_{i}" for i in range(len(df))])

    out = pd.DataFrame({
        "id": [f"csv_{i}" for i in range(len(df))],
        "name": name,
        "source": "csv",
        "text_raw": text
    })
    return out

def load_pdf_dir_resumes(pdf_dir: str) -> pd.DataFrame:
    p = Path(pdf_dir)
    if not p.exists():
        return pd.DataFrame(columns=["id","name","source","text_raw"])

    records = []
    files = list(p.rglob("*"))
    for file in tqdm(files, desc="Parsing files"):
        if not file.is_file():
            continue
        ext = file.suffix.lower()
        text = ""
        if ext == ".pdf":
            text = extract_text_from_pdf(file)
        elif ext in (".docx",".doc"):
            text = extract_text_from_docx(file)
        else:
            continue
        if not text.strip():
            continue
        records.append({
            "id": f"file_{len(records)}",
            "name": safe_stem_filename(file.stem),
            "source": file.suffix.lower().lstrip("."),
            "text_raw": text
        })
    return pd.DataFrame.from_records(records)

def build_corpus(res_df: pd.DataFrame) -> pd.DataFrame:
    if res_df.empty:
        return res_df
    df = res_df.copy()
    df["text"] = df["text_raw"].astype(str).map(clean_text)
    df["skills"] = df["text"].map(extract_skills)
    return df

def vectorize_and_rank(df: pd.DataFrame, jd_text: str):
    docs = df["text"].tolist() + [clean_text(jd_text)]
    vectorizer = TfidfVectorizer(max_features=50000, ngram_range=(1,2), stop_words="english")
    X = vectorizer.fit_transform(docs)
    X_res, X_jd = X[:-1], X[-1]
    sim = cosine_similarity(X_res, X_jd)[:, 0]
    return vectorizer, X_res, X_jd, sim

def save_artifacts(vectorizer, X_res, df_ranked, jd_text: str):
    timestamp = datetime.utcnow().strftime("%Y-%m-%d_%H-%M-%S")

    # 1) Vectorizer (.pkl)
    pkl_path = os.path.join(ARTIFACT_DIR, "vectorizer.pkl")
    with open(pkl_path, "wb") as f:
        pickle.dump(vectorizer, f)

    # 2) Embeddings (.h5)
    h5_path = os.path.join(ARTIFACT_DIR, "resume_embeddings.h5")
    with h5py.File(h5_path, "w") as h5:
        arr = X_res.toarray() if hasattr(X_res, "toarray") else np.asarray(X_res)
        h5.create_dataset("tfidf_vectors", data=arr)
        h5.attrs["resume_names_json"] = json.dumps(df_ranked["name"].astype(str).tolist())
        h5.attrs["resume_ids_json"]   = json.dumps(df_ranked["id"].astype(str).tolist())
        h5.attrs["created_utc"]       = timestamp

    # 3) Rankings (.json and .csv)
    json_path = os.path.join(ARTIFACT_DIR, "rankings.json")
    ranked_for_json = df_ranked[["rank","score","name","id","source","top_skills"]].to_dict(orient="records")
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(ranked_for_json, f, ensure_ascii=False, indent=2)

    csv_path = os.path.join(ARTIFACT_DIR, "rankings.csv")
    df_ranked.to_csv(csv_path, index=False, encoding="utf-8")

    # 4) Config (.yaml)
    cfg = {
        "created_utc": timestamp,
        "base_dir": BASE_DIR,
        "pdf_dir": PDF_DIR,
        "csv_path": CSV_PATH,
        "artifacts": {
            "vectorizer_pkl": pkl_path,
            "embeddings_h5": h5_path,
            "rankings_json": json_path,
            "rankings_csv": csv_path
        },
        "tfidf": {"max_features": 50000, "ngram_range": [1,2], "stop_words": "english"},
        "jd_preview": jd_text[:500]
    }
    with open(os.path.join(ARTIFACT_DIR, "config.yaml"), "w", encoding="utf-8") as f:
        yaml.safe_dump(cfg, f, sort_keys=False, allow_unicode=True)

    return {
        "vectorizer_pkl": pkl_path,
        "embeddings_h5": h5_path,
        "rankings_json": json_path,
        "rankings_csv": csv_path,
        "config_yaml": os.path.join(ARTIFACT_DIR, "config.yaml")
    }

# =========================
# PIPELINE (runs now)
# =========================
# 1) Load inputs
csv_df  = load_csv_resumes(CSV_PATH)
file_df = load_pdf_dir_resumes(PDF_DIR)

resumes_df = pd.concat([csv_df, file_df], ignore_index=True)
if resumes_df.empty:
    raise SystemExit("No resumes found. Check CSV/PDF paths and try again.")

# 2) Build corpus
corpus_df = build_corpus(resumes_df)

# 3) Vectorize + rank
vectorizer, X_res, X_jd, sim = vectorize_and_rank(corpus_df, JD_TEXT)
corpus_df["score"] = sim
corpus_df["top_skills"] = corpus_df["skills"].apply(lambda s: ", ".join(s[:15]) if s else "")

corpus_df = corpus_df.sort_values("score", ascending=False).reset_index(drop=True)
corpus_df.insert(0, "rank", corpus_df.index + 1)

top_df = corpus_df.head(max(1, int(TOP_K))).copy()

# 4) Save artifacts
paths = save_artifacts(vectorizer, X_res, top_df, JD_TEXT)

# 5) Display summary
print(f"[OK] Ranked {len(corpus_df)} resumes. Top {len(top_df)} saved to:")
for k, v in paths.items():
    print(f" - {k}: {v}")

# Show top 10 in notebook (preview)
display_cols = ["rank","score","name","source","top_skills"]
try:
    from IPython.display import display
    display(top_df[display_cols].head(10))
except Exception:
    print(top_df[display_cols].head(10))


Parsing files: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 2508/2508 [18:22<00:00,  2.28it/s]


[OK] Ranked 4967 resumes. Top 20 saved to:
 - vectorizer_pkl: C:\Users\sagni\Downloads\Resume Ranker\vectorizer.pkl
 - embeddings_h5: C:\Users\sagni\Downloads\Resume Ranker\resume_embeddings.h5
 - rankings_json: C:\Users\sagni\Downloads\Resume Ranker\rankings.json
 - rankings_csv: C:\Users\sagni\Downloads\Resume Ranker\rankings.csv
 - config_yaml: C:\Users\sagni\Downloads\Resume Ranker\config.yaml


Unnamed: 0,rank,score,name,source,top_skills
0,1,0.103024,20824105,pdf,"aws, bash, c, linux, python"
1,2,0.103024,csv_resume_298,csv,"aws, bash, c, linux, python"
2,3,0.095727,21297521,pdf,"aws, azure, c, c#, c++, computer vision, deep ..."
3,4,0.095727,csv_resume_2199,csv,"aws, azure, c, c#, c++, computer vision, deep ..."
4,5,0.079948,csv_resume_1737,csv,"c, node"
5,6,0.079709,55595908,pdf,"c, node"
6,7,0.074273,csv_resume_929,csv,"aws, c, go, java, jira, pandas, python, redshi..."
7,8,0.074234,csv_resume_926,csv,"bert, c, c#, css, flask, go, html, java, javas..."
8,9,0.074234,62994611,pdf,"bert, c, c#, css, flask, go, html, java, javas..."
9,10,0.073967,11813872,pdf,"aws, c, go, java, jira, pandas, python, redshi..."
