In [1]:
from pathlib import Path
from uuid import UUID

import chromadb
import httpx
import logfire
import pymupdf4llm
from chromadb.utils import embedding_functions
from loguru import logger
from markitdown import MarkItDown

from hiring_force_app import AgentRequest, Resume

logfire.configure()

%load_ext autoreload
%autoreload 2


In [2]:
async def send_request(agent_request: AgentRequest) -> Resume:
    async with httpx.AsyncClient(timeout=120) as client:
        response = await client.post("http://localhost:8000/run_agent", json=agent_request.model_dump())
        response.raise_for_status()
        return Resume(**response.json())

In [3]:
def doc_to_md(doc: Path | str, md_path: Path | str = "") -> str:
    if not Path(doc).exists():
        raise FileNotFoundError(f"Document not found: {doc}")
    md = ""
    try:
        doc = Path(doc)
        if doc.suffix == ".md":
            md = doc.read_text()
        elif doc.suffix == ".pdf":
            md = pymupdf4llm.to_markdown(doc=str(doc))
        else:
            marker = MarkItDown()
            md = marker.convert(source=str(doc)).text_content
    except Exception:
        logger.error(f"Error converting {doc} to markdown")
    if md_path and md:
        md_path = Path(md_path)
        md_path.parent.mkdir(parents=True, exist_ok=True)
        md_path.write_text(md)
    return md

In [4]:
async def create_ideal_candidate(
    user_id: UUID | str,
    job_desc: str,
    path: Path | str = "",
    memorize: bool = True,
    memories_dir: Path | str = "",
) -> Resume:
    agent_request = AgentRequest(
        user_prompt=job_desc,
        agent_name="ideal_candidate_agent",
        user_id=user_id,
        memorize=memorize,
        memories_dir=memories_dir,
    )
    ideal_candidate = await send_request(agent_request=agent_request)
    if path:
        Path(path).write_text(ideal_candidate.model_dump_json())
    return ideal_candidate

In [11]:
async def save_resume_objects(
    user_id: UUID | str,
    resumes_path: Path,
    resume_objects_path: Path | str,
    suffix: str = "",
    num_resumes: int | None = None,
    memorize: bool = True,
    memories_dir: Path | str = "",
):
    resume_objects_path = Path(resume_objects_path)
    resume_objects_path.mkdir(parents=True, exist_ok=True)
    n = 0
    for resume in resumes_path.glob(f"*{suffix}"):
        if num_resumes and n >= num_resumes:
            break
        object_path = resume_objects_path.joinpath(resume.name).with_suffix(".json")
        if object_path.exists():
            logger.warning(f"Skipping {resume.name} because it already exists")
            continue
        agent_request = AgentRequest(
            user_prompt=doc_to_md(doc=resume),
            agent_name="resume_agent",
            user_id=user_id,
            memorize=memorize,
            memories_dir=memories_dir,
        )
        resume_object = await send_request(agent_request=agent_request)
        object_path.write_text(resume_object.model_dump_json())
        logger.success(f"Saved resume object for {resume.name}")
        n += 1

In [6]:
def index_resume_objects(collection: chromadb.Collection, resume_objects_path: Path | str):
    resume_objects_path = Path(resume_objects_path)
    for resume in resume_objects_path.iterdir():
        collection.add(ids=[resume.name], documents=[resume.read_text()])
        logger.success(f"Added resume {resume.name} to collection")


In [7]:
# user_id = uuid4()
user_id = UUID("7b94f43d-570b-456c-a8cf-51221eec4797")
memories_dir = Path("grc/memories")
user_id

UUID('7b94f43d-570b-456c-a8cf-51221eec4797')

In [8]:
ideal_candidate = await create_ideal_candidate(
    user_id=user_id,
    job_desc=Path("grc/job_posting.md").read_text(),
    path="grc/ideal_candidate.json",
    memories_dir=memories_dir,
)
ideal_candidate = Resume.model_validate_json(Path("grc/ideal_candidate.json").read_text())
ideal_candidate

Resume(years_of_experience=10.0, summary='Seasoned Director of Governance, Risk, and Compliance with over a decade of experience in GRC and information security. Proven expertise in designing and implementing effective compliance frameworks aligned with regulatory standards. Strong leadership in cross-functional teams and adept at enhancing organizational security and trust-building strategies.', work_experience=[WorkExperience(company='CyberShield Inc.', title='Director of Governance, Risk, and Compliance (GRC)', employment_type=<EmploymentType.FULL_TIME: 'full_time'>, location='Remote', is_current=True, description='Lead enterprise-wide GRC initiatives ensuring compliance with regulatory standards, overseeing third-party vendor risks, and executing incident response strategies. Develop risk metrics and report to executive leadership while collaborating with cross-functional teams.', achievements=['Successfully implemented a GRC framework that improved compliance scores by 35% within 

In [9]:
chroma_client = chromadb.PersistentClient(path="chroma_db")
sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(  # type: ignore
    model_name="Alibaba-NLP/gte-modernbert-base",
)
chroma_client.delete_collection(name="grc")
collection = chroma_client.get_or_create_collection(name="grc", embedding_function=sentence_transformer_ef)


In [13]:
await save_resume_objects(
    user_id=user_id,
    resumes_path=Path("grc/resumes"),
    resume_objects_path=Path("grc/resume_objects"),
    suffix=".docx",
    memories_dir=memories_dir,
)

[32m2025-01-26 19:10:15.109[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36msave_resume_objects[0m:[36m30[0m - [32m[1mSaved resume object for emily.docx[0m
[32m2025-01-26 19:10:35.583[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36msave_resume_objects[0m:[36m30[0m - [32m[1mSaved resume object for jane.docx[0m
[32m2025-01-26 19:11:00.423[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36msave_resume_objects[0m:[36m30[0m - [32m[1mSaved resume object for john.docx[0m
[32m2025-01-26 19:11:32.942[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36msave_resume_objects[0m:[36m30[0m - [32m[1mSaved resume object for lisa.docx[0m


In [14]:
index_resume_objects(collection=collection, resume_objects_path=Path("grc/resume_objects"))

[32m2025-01-26 19:13:04.276[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36mindex_resume_objects[0m:[36m5[0m - [32m[1mAdded resume jane.json to collection[0m
[32m2025-01-26 19:13:04.817[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36mindex_resume_objects[0m:[36m5[0m - [32m[1mAdded resume john.json to collection[0m
[32m2025-01-26 19:13:05.026[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36mindex_resume_objects[0m:[36m5[0m - [32m[1mAdded resume emily.json to collection[0m
[32m2025-01-26 19:13:06.211[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36mindex_resume_objects[0m:[36m5[0m - [32m[1mAdded resume greg.json to collection[0m
[32m2025-01-26 19:13:06.679[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36mindex_resume_objects[0m:[36m5[0m - [32m[1mAdded resume lisa.json to collection[0m


In [15]:
top_candidates = collection.query(query_texts=[ideal_candidate.model_dump_json()], n_results=3)

In [16]:
top_candidates

{'ids': [['greg.json', 'jane.json', 'emily.json']],
 'embeddings': None,
 'documents': [['{"years_of_experience":15.0,"summary":"Accomplished executive with extensive experience leading large organizations while building strong relationships with clients and business partners to achieve strategic goals and deliver value. Proven leader in operations and information security with a focus on planning and implementing key initiatives, assessing, and mitigating business risk, and leading continuous process improvement programs that drive operational efficiency and quality.","work_experience":[{"company":"Cyderes, LLC","title":"Director, Governance Risk and Compliance","employment_type":"full_time","location":"","is_current":true,"description":"Lead enterprise-wide GRC initiatives to ensure alignment with security, regulatory, and organizational trust objectives including GDPR, ISO 27001, PCI, CMMC, CSA, CIS, and NIST. Develop and maintain a comprehensive governance framework, enhancing trus

In [17]:
top_candidates2 = collection.query(query_texts=[Path("grc/job_posting.md").read_text()], n_results=3)

In [18]:
top_candidates2

{'ids': [['greg.json', 'jane.json', 'emily.json']],
 'embeddings': None,
 'documents': [['{"years_of_experience":15.0,"summary":"Accomplished executive with extensive experience leading large organizations while building strong relationships with clients and business partners to achieve strategic goals and deliver value. Proven leader in operations and information security with a focus on planning and implementing key initiatives, assessing, and mitigating business risk, and leading continuous process improvement programs that drive operational efficiency and quality.","work_experience":[{"company":"Cyderes, LLC","title":"Director, Governance Risk and Compliance","employment_type":"full_time","location":"","is_current":true,"description":"Lead enterprise-wide GRC initiatives to ensure alignment with security, regulatory, and organizational trust objectives including GDPR, ISO 27001, PCI, CMMC, CSA, CIS, and NIST. Develop and maintain a comprehensive governance framework, enhancing trus