In [1]:
pip install pandas numpy scikit-learn pdfminer.six python-docx docx2txt PyYAML h5py tqdm


Collecting python-docx
  Downloading python_docx-1.2.0-py3-none-any.whl.metadata (2.0 kB)
Collecting docx2txt
  Downloading docx2txt-0.9-py3-none-any.whl.metadata (529 bytes)
Downloading python_docx-1.2.0-py3-none-any.whl (252 kB)
Downloading docx2txt-0.9-py3-none-any.whl (4.0 kB)
Installing collected packages: docx2txt, python-docx

   -------------------- ------------------- 1/2 [python-docx]
   -------------------- ------------------- 1/2 [python-docx]
   -------------------- ------------------- 1/2 [python-docx]
   ---------------------------------------- 2/2 [python-docx]

Successfully installed docx2txt-0.9 python-docx-1.2.0
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
# -*- coding: utf-8 -*-
r"""
AI Resume Ranker (TF-IDF based)
- Reads:
    1) PDFs/DOCX from: C:\Users\sagni\Downloads\Resume Ranker\archive (1)\data\data
    2) CSV from    : C:\Users\sagni\Downloads\Resume Ranker\archive (1)\Resume\Resume.csv
- Ranks resumes vs a Job Description (JD) text or file
- Saves artifacts to: C:\Users\sagni\Downloads\Resume Ranker
    - vectorizer.pkl (Pickle)
    - resume_embeddings.h5 (HDF5)
    - config.yaml (YAML)
    - rankings.json (JSON)
    - rankings.csv (CSV)
"""

import os
import re
import json
import yaml
import h5py
import argparse
import warnings
from pathlib import Path
from datetime import datetime

import numpy as np
import pandas as pd
from tqdm import tqdm

# Text & ML
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle

# File parsing
from pdfminer.high_level import extract_text as pdf_extract_text
try:
    import docx2txt
    HAS_DOCX = True
except ImportError:
    HAS_DOCX = False
    warnings.warn("docx2txt not installed; DOCX parsing will be skipped.")

# -----------------------
# Paths (Windows raw strings)
# -----------------------
BASE_DIR = r"C:\Users\sagni\Downloads\Resume Ranker"
PDF_DIR  = r"C:\Users\sagni\Downloads\Resume Ranker\archive (1)\data\data"
CSV_PATH = r"C:\Users\sagni\Downloads\Resume Ranker\archive (1)\Resume\Resume.csv"

ARTIFACT_DIR = BASE_DIR  # saving here as requested

# -----------------------
# Light skill dictionary (extend as you like)
# -----------------------
SKILLS = {
    "programming": [
        "python", "c++", "java", "c", "c#", "javascript", "typescript", "go", "rust", "scala", "matlab", "sql"
    ],
    "data_ml": [
        "machine learning", "deep learning", "nlp", "computer vision", "tensorflow", "pytorch",
        "keras", "scikit-learn", "pandas", "numpy", "opencv", "transformers", "bert", "xgboost"
    ],
    "cloud_devops": [
        "aws", "gcp", "azure", "docker", "kubernetes", "ci/cd", "jenkins", "terraform"
    ],
    "tools": [
        "git", "linux", "bash", "jira", "tableau", "power bi"
    ],
    "web": [
        "react", "node", "flask", "django", "streamlit", "fastapi", "html", "css"
    ]
}
SKILL_FLAT = sorted({s.lower() for v in SKILLS.values() for s in v})

# -----------------------
# Helpers
# -----------------------
def read_text_file(path: Path) -> str:
    try:
        return Path(path).read_text(encoding="utf-8", errors="ignore")
    except Exception:
        try:
            return Path(path).read_text(encoding="latin-1", errors="ignore")
        except Exception:
            return ""

def clean_text(text: str) -> str:
    text = text or ""
    text = re.sub(r"\s+", " ", text).strip().lower()
    return text

def extract_text_from_pdf(pdf_path: Path) -> str:
    try:
        return pdf_extract_text(str(pdf_path))
    except Exception:
        return ""

def extract_text_from_docx(docx_path: Path) -> str:
    if not HAS_DOCX:
        return ""
    try:
        return docx2txt.process(str(docx_path)) or ""
    except Exception:
        return ""

def extract_skills(text: str) -> list:
    text_low = text.lower()
    found = []
    for s in SKILL_FLAT:
        if s in text_low:
            found.append(s)
    return sorted(list(set(found)))

def safe_stem_filename(name: str, max_len: int = 80) -> str:
    s = re.sub(r"[^A-Za-z0-9._\- ]+", "", name).strip()
    return s[:max_len] if len(s) > max_len else s

def load_csv_resumes(csv_path: str) -> pd.DataFrame:
    """
    Robust loading for common Kaggle Resume.csv schemas.
    Returns columns: ['id','name','source','text_raw']
    """
    df = pd.read_csv(csv_path, encoding="utf-8", errors="ignore")
    df.columns = [c.strip() for c in df.columns]

    possible_text_cols_priority = [
        ["Resume"],
        ["Resume_str"],
        ["resume_text"],
        ["Resume", "skills", "education", "experience"],
        ["resume", "skills", "education", "experience"]
    ]

    text = None
    for cols in possible_text_cols_priority:
        if all(c in df.columns for c in cols):
            text = df[cols].astype(str).agg(" ".join, axis=1)
            break

    if text is None:
        str_cols = [c for c in df.columns if df[c].dtype == "object"]
        if not str_cols:
            raise ValueError("Could not find textual columns in CSV.")
        text = df[str_cols].astype(str).agg(" ".join, axis=1)

    name = None
    for nc in ["Name", "Candidate Name", "name", "full_name", "title"]:
        if nc in df.columns:
            name = df[nc].astype(str)
            break
    if name is None:
        name = pd.Series([f"csv_resume_{i}" for i in range(len(df))])

    out = pd.DataFrame({
        "id": [f"csv_{i}" for i in range(len(df))],
        "name": name,
        "source": "csv",
        "text_raw": text
    })
    return out

def load_pdf_dir_resumes(pdf_dir: str) -> pd.DataFrame:
    """
    Walk PDF_DIR and DOCX files, parse text.
    Returns columns: ['id','name','source','text_raw']
    """
    records = []
    p = Path(pdf_dir)
    if not p.exists():
        return pd.DataFrame(columns=["id", "name", "source", "text_raw"])

    for file in tqdm(list(p.rglob("*")), desc="Parsing files"):
        if not file.is_file():
            continue
        ext = file.suffix.lower()
        text = ""
        if ext == ".pdf":
            text = extract_text_from_pdf(file)
        elif ext in (".docx", ".doc"):
            text = extract_text_from_docx(file)
        else:
            continue

        if not text.strip():
            continue

        records.append({
            "id": f"file_{len(records)}",
            "name": safe_stem_filename(file.stem),
            "source": file.suffix.lower().lstrip("."),
            "text_raw": text
        })
    return pd.DataFrame.from_records(records)

def build_corpus(res_df: pd.DataFrame) -> pd.DataFrame:
    df = res_df.copy()
    df["text"] = df["text_raw"].astype(str).map(clean_text)
    df["skills"] = df["text"].map(extract_skills)
    return df

def vectorize_and_rank(df: pd.DataFrame, jd_text: str):
    # TF-IDF on resumes + JD
    docs = df["text"].tolist() + [clean_text(jd_text)]
    vectorizer = TfidfVectorizer(
        max_features=50000,
        ngram_range=(1,2),
        stop_words="english"
    )
    X = vectorizer.fit_transform(docs)  # shape: (N_resumes+1, V)
    X_res = X[:-1]
    X_jd  = X[-1]

    sim = cosine_similarity(X_res, X_jd)[:, 0]  # (N_resumes, )
    return vectorizer, X_res, X_jd, sim

def save_artifacts(vectorizer, X_res, df_ranked, jd_text: str, config_path_yaml: str):
    timestamp = datetime.utcnow().strftime("%Y-%m-%d_%H-%M-%S")

    # 1) Save vectorizer as .pkl
    pkl_path = os.path.join(ARTIFACT_DIR, "vectorizer.pkl")
    with open(pkl_path, "wb") as f:
        pickle.dump(vectorizer, f)

    # 2) Save embeddings to HDF5 .h5
    h5_path = os.path.join(ARTIFACT_DIR, "resume_embeddings.h5")
    with h5py.File(h5_path, "w") as h5:
        if hasattr(X_res, "toarray"):
            arr = X_res.toarray()
        else:
            arr = np.asarray(X_res)
        h5.create_dataset("tfidf_vectors", data=arr)

        # metadata
        names = df_ranked["name"].astype(str).tolist()
        ids   = df_ranked["id"].astype(str).tolist()
        h5.attrs["resume_names_json"] = json.dumps(names)
        h5.attrs["resume_ids_json"]   = json.dumps(ids)
        h5.attrs["created_utc"]       = timestamp

    # 3) Save rankings as JSON
    json_path = os.path.join(ARTIFACT_DIR, "rankings.json")
    ranked_for_json = df_ranked[["rank", "score", "name", "id", "source", "top_skills"]].to_dict(orient="records")
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(ranked_for_json, f, ensure_ascii=False, indent=2)

    # 4) Save config as YAML
    cfg = {
        "created_utc": timestamp,
        "base_dir": BASE_DIR,
        "pdf_dir": PDF_DIR,
        "csv_path": CSV_PATH,
        "artifacts": {
            "vectorizer_pkl": pkl_path,
            "embeddings_h5": h5_path,
            "rankings_json": json_path,
            "rankings_csv": os.path.join(ARTIFACT_DIR, "rankings.csv")
        },
        "tfidf": {
            "max_features": 50000,
            "ngram_range": [1, 2],
            "stop_words": "english"
        },
        "jd_preview": jd_text[:500]
    }
    with open(config_path_yaml, "w", encoding="utf-8") as f:
        yaml.safe_dump(cfg, f, sort_keys=False, allow_unicode=True)

def main():
    parser = argparse.ArgumentParser(description="AI Resume Ranker (TF-IDF)")
    parser.add_argument("--jd", type=str, default=None, help="Job description text (quoted)")
    parser.add_argument("--jd_file", type=str, default=None, help="Path to a JD text file")
    parser.add_argument("--top_k", type=int, default=50, help="How many top results to save")
    args = parser.parse_args()

    Path(ARTIFACT_DIR).mkdir(parents=True, exist_ok=True)

    # Read JD
    if args.jd_file and Path(args.jd_file).exists():
        jd_text = read_text_file(Path(args.jd_file))
    elif args.jd is not None:
        jd_text = args.jd
    else:
        jd_text = "We are looking for an ML Engineer with Python, scikit-learn, NLP, and deployment experience."

    # Load CSV resumes
    csv_df = pd.DataFrame()
    if Path(CSV_PATH).exists():
        try:
            csv_df = load_csv_resumes(CSV_PATH)
        except Exception as e:
            warnings.warn(f"Error reading CSV resumes: {e}")

    # Load PDF/DOCX resumes
    file_df = load_pdf_dir_resumes(PDF_DIR)

    # Combine
    resumes_df = pd.concat([csv_df, file_df], ignore_index=True)
    if resumes_df.empty:
        raise SystemExit("No resumes found. Please check CSV/PDF paths.")

    # Build corpus
    corpus_df = build_corpus(resumes_df)

    # Vectorize + similarity
    vectorizer, X_res, X_jd, sim = vectorize_and_rank(corpus_df, jd_text)

    # Attach scores & skills
    corpus_df["score"] = sim
    corpus_df["top_skills"] = corpus_df["skills"].apply(lambda s: ", ".join(s[:15]) if s else "")

    # Rank
    corpus_df = corpus_df.sort_values("score", ascending=False).reset_index(drop=True)
    corpus_df.insert(0, "rank", corpus_df.index + 1)

    # Trim to top_k for outputs
    top_k = max(1, int(args.top_k))
    top_df = corpus_df.head(top_k).copy()

    # Save CSV rankings
    rankings_csv_path = os.path.join(ARTIFACT_DIR, "rankings.csv")
    top_df.to_csv(rankings_csv_path, index=False, encoding="utf-8")

    # Save artifacts
    config_yaml_path = os.path.join(ARTIFACT_DIR, "config.yaml")
    save_artifacts(vectorizer, X_res, top_df, jd_text, config_yaml_path)

    # Console summary
    print(f"\n[OK] Ranked {len(corpus_df)} resumes. Top {top_k} saved.")
    print(f"- Rankings CSV : {rankings_csv_path}")
    print(f"- Rankings JSON: {os.path.join(ARTIFACT_DIR, 'rankings.json')}")
    print(f"- Vectorizer   : {os.path.join(ARTIFACT_DIR, 'vectorizer.pkl')}")
    print(f"- Embeddings   : {os.path.join(ARTIFACT_DIR, 'resume_embeddings.h5')}")
    print(f"- Config YAML  : {config_yaml_path}")

if __name__ == "__main__":
    main()


usage: ipykernel_launcher.py [-h] [--jd JD] [--jd_file JD_FILE] [--top_k TOP_K]
ipykernel_launcher.py: error: unrecognized arguments: -f C:\Users\sagni\AppData\Roaming\jupyter\runtime\kernel-8c6446d5-4033-434c-b916-26bbbef478b2.json


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
