# Load System

In [1]:
import faiss
import json
import pickle
import numpy as np

from dtos.services.search.hard_criteria import HardCriteria
from services.search.profiles import SearchProfilesService

from start_utils import voyageai_client

  from .autonotebook import tqdm as notebook_tqdm
[32m2025-07-30 21:26:47.392[0m | [1mINFO    [0m | [36mstart_utils[0m:[36m<module>[0m:[36m29[0m - [1mLoading .env file and environment variables[0m
[32mJuly-30-2025[0m | [30m21:26:47[0m | [1mINFO[0m | [36mLoading .env file and environment variables[0m | [35mstart_utils:<module>:29[0m | [33m{}[0m
[32m2025-07-30 21:26:47.394[0m | [1mINFO    [0m | [36mstart_utils[0m:[36m<module>[0m:[36m32[0m - [1mLoading Configurations[0m
[32mJuly-30-2025[0m | [30m21:26:47[0m | [1mINFO[0m | [36mLoading Configurations[0m | [35mstart_utils:<module>:32[0m | [33m{}[0m
[32m2025-07-30 21:26:47.395[0m | [34m[1mDEBUG   [0m | [36mconfigurations.db[0m:[36mload_config[0m:[36m32[0m - [34m[1mDB config loaded successfully.[0m
[32mJuly-30-2025[0m | [30m21:26:47[0m | [34m[1mDEBUG[0m | [36mDB config loaded successfully.[0m | [35mconfigurations.db:load_config:32[0m | [33m{}[0m
[32m2025-07-30 21:26:47

In [2]:
def load_system(filepath: str):
    """
    Load the clustering system from disk
    """
    with open(filepath, 'rb') as f:
        system_data = pickle.load(f)
    
    profiles = system_data.get('profiles')
    clusters = system_data.get('clusters')
    cluster_centers = system_data.get('cluster_centers')
    cluster_metadata = system_data.get('cluster_metadata')
    embeddings = system_data.get('embeddings')
    cluster_labels = system_data.get('cluster_labels')
    
    # Load FAISS indices
    faiss_indices = {}
    for cluster_id in np.unique(clusters):
        faiss_path = f"{filepath}_cluster_{cluster_id}.faiss"
        try:
            index = faiss.read_index(faiss_path)
            cluster_mask = clusters == cluster_id
            profile_indices = np.where(cluster_mask)[0]
            
            faiss_indices[cluster_id] = {
                'index': index,
                'profile_indices': profile_indices,
                'size': len(profile_indices)
            }
        except FileNotFoundError:
            print(f"Warning: FAISS index for cluster {cluster_id} not found")

    print(f"System loaded from {filepath}")

    return profiles, clusters, cluster_centers, cluster_metadata, cluster_labels, embeddings, faiss_indices

In [3]:
pickel_file_path = "data/clustering_system_v1.pkl"
profiles, clusters, cluster_centers, cluster_metadata, cluster_labels, embeddings, faiss_indices = load_system(
    filepath=pickel_file_path
)

System loaded from data/clustering_system_v1.pkl


# Load test dataset

In [4]:
file_path = "test_dataset.json"
with open(file_path, "r") as f:
    dataset = json.load(f)

# Search Profiles

In [5]:
from services.search.filter import FilterProfilesService
import collections

In [6]:
model_name = "voyage-3"

In [None]:
from ast import Dict
import json
from constants.prompt import Prompt
from dtos.services.etl.profile import ProfileData
from dtos.services.search.hard_criteria import HardCriteria
from services.llm.generate import LLMGenerateService
from services.search.abstraction import ISearchService


class FilterProfilesService(ISearchService):
    def __init__(self):
        self.llm_generate_service = LLMGenerateService()

    def generate_prompt(
        self,
        job_title: str,
        job_description: str,
        profile: ProfileData,
        hard_criteria: HardCriteria,
        soft_criteria={},
    ) -> str:
        prompt = Prompt.LLM_ANALYSIS_PROMPT.format(
            profile=profile,
            hard_criteria=hard_criteria,
            soft_criteria=soft_criteria,
            job_title=job_title,
            job_description=job_description,
        )
        return prompt

    def run(
        self,
        job_title: str,
        job_description: str,
        profile: ProfileData,
        hard_criteria: HardCriteria,
        soft_criteria={},
    ) -> Dict:

        prompt = self.generate_prompt(
            job_title=job_title,
            job_description=job_description,
            profile=profile,
            hard_criteria=hard_criteria,
            soft_criteria=soft_criteria,
        )
        response = self.llm_generate_service.run(prompt)

        # Parse the JSON response from the LLM
        try:
            response_dict = json.loads(response.choices[0].message.content)
        except json.JSONDecodeError:
            # If JSON parsing fails, return a default response
            response_dict = {
                "overall_score": 0,
                "criterion_breakdown": {},
                "profile": profile,
            }

        return {
            "overall_score": response_dict.get("overall_score"),
            "criterion_breakdown": response_dict.get("criterion_breakdown"),
            "profile": profile,
        }


In [9]:
results = collections.defaultdict(list)
for data in dataset:
    hard_cirteria = HardCriteria.parse_obj(data)
    search_profiles_service = SearchProfilesService(
        profiles=profiles,
        faiss_indices=faiss_indices,
        cluster_centers=cluster_centers,
        cluster_labels=cluster_labels,
        embedding_model=voyageai_client,
        model_name=model_name,
    )
    
    selected_profiles = search_profiles_service.run(
        query=data.get("natural_language_description", hard_cirteria.model_dump_json()),
        hard_criteria=None,
        top_k=15,
        top_k_clusters=3
    )

    filter_profiles_service = FilterProfilesService()
    for profile in selected_profiles:
        filter_result = filter_profiles_service.run(
            job_title=data.get("job_title"),
            job_description=data.get("job_description"),
            profile=profile,
            hard_criteria=hard_cirteria.model_dump_json(),
        )
        results[data.get("file_name")].append(filter_result)

/var/folders/df/xkqqlpfj5xl4m4v3kbnz70zm0000gn/T/ipykernel_30709/3205945039.py:3: PydanticDeprecatedSince20: The `parse_obj` method is deprecated; use `model_validate` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  hard_cirteria = HardCriteria.parse_obj(data)



Searching for: 'Seasoned attorney with a JD from a top U.S. law school and over three years of legal practice, specializing in corporate tax structuring and compliance. Has represented clients in IRS audits and authored legal opinions on federal tax code matters.'


[32m2025-07-30 21:28:45.410[0m | [1mINFO    [0m | [36mservices.search.profiles[0m:[36mfind_relevant_clusters[0m:[36m185[0m - [1mSelected clusters for query: [3, 12, 4][0m
[32mJuly-30-2025[0m | [30m21:28:45[0m | [1mINFO[0m | [36mSelected clusters for query: [3, 12, 4][0m | [35mservices.search.profiles:find_relevant_clusters:185[0m | [33m{}[0m
[32m2025-07-30 21:28:45.773[0m | [1mINFO    [0m | [36mservices.llm.generate[0m:[36mrun[0m:[36m24[0m - [1mGenerating response from LLM[0m
[32mJuly-30-2025[0m | [30m21:28:45[0m | [1mINFO[0m | [36mGenerating response from LLM[0m | [35mservices.llm.generate:run:24[0m | [33m{}[0m
[32m2025-07-30 21:28:52.258[0m | [1mINFO    [0m | [36mservices.llm.generate[0m:[36mrun[0m:[36m31[0m - [1mResponse from LLM: ChatCompletion(id='chatcmpl-Bz3UU9zWzVXjzenkeo3fFJ0tJ7eyx', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content="Overall Compatibility Score: 3\n\nHa

TypeError: the JSON object must be str, bytes or bytearray, not ChatCompletion

In [36]:
results

{'tax_lawyer.yml': ['YQHNdUaRpL9tqnaxZWi5ei8cbewGnRf',
  'HTnEBnunVuDX90jJWqwKPSL103ggeID',
  '_N3o8s8A7uauZ5P5qfDCUiILuD2-3Jb',
  'Sdc6Spp0p6R0f9JRd10q_dKbkQzjgYk',
  'nMDYogKerkrwTibPWFteilppaC-xnMk'],
 'junior_corporate_lawyer.yml': ['phvtrw9oo0CY9QuFQfLXmtvxadGmHum',
  'D1zLQ4RxIyFZELZD9En3S3S7VIL5tk0',
  'SsxkcbEm0g1zOUzl6aCnj0_Piy_Zsc1',
  'ikZPdcLu_nyhVrgFwjcE32D1J0uCpDY',
  'VAgR1QkXb7McyksRHALY2tibY2_AP8Q'],
 'radiology.yml': ['e19Rh75DwkqLjm1CG5v9s_PjwkfRh9G',
  'wf0eb5JVc_kN76VSJ65-1dcJx6NHi4X',
  'HUKTaQ6on1YYH3frg99GsNPl9lrz0oM',
  'bL8ByyYNOvuCXQho7soRat1_zaKUZl2',
  'BKSGOGz2Zefo'],
 'doctors_md.yml': ['uqGTO7Rqw3XZ4ZNeB__muevXTMsdh18',
  'ppO-whpTg0grqQ9O0LqNmLDkmF-RSyu',
  'WPT8Y7hkneuAj6DuPTz-mg7DUpiqjmm',
  'kOmgIbzMhh2-DsM2c4cr_Jm5V1lCYF8',
  'mWsiNf0ZNmCW3jO0qIhIqgaHDbFJCcL'],
 'biology_expert.yml': ['fmC7p0BRnuFeqr-96pgyI3IdKEK1Z3J',
  'wULtTx9e1rFNM0qJ2k-GXMsL2FFLLtF',
  'UNxtgJKmpz3nAIcrqMoCNEaj8aqM5gi',
  'DZtWq99bhEozctFxjE_tzsAoLjuYe_r',
  'wKLtxASNxV4bsvOlAt

In [8]:
selected_profiles

[{'profile': ProfileData(profile_id='7sWcQgPU-9viFtqhqKXH1NWyEyLcl51', name='Festum Q.', current_title='Backend Software Developer', current_company='Ppro', profile_location='Bavaria, Germany', experience_years=22.7, experiences=[], skills=['Python', 'gRPC', 'Amazon Web Services', 'Kubernetes', 'ETL', 'ETL', 'ETL', 'GoLang', 'GoLang', 'GoLang', 'Python', 'Python', 'Python', 'Kafka', 'Kafka', 'Kafka', 'Amazon Web Services', 'Amazon Web Services', 'Amazon Web Services', 'Docker', 'Docker', 'Docker', 'Java', 'Java', 'Java', '.NET', '.NET', '.NET', 'C', 'C', 'UI/UX Design', 'Machine Learning', 'PHP', 'UI/UX Design', 'PHP', 'JavaScript', 'Photoshop', 'Python', 'TypeScript', 'GoLang', 'C', 'Kotlin', 'Terraform', 'Docker', 'Kubernetes', 'gRPC', 'PostgreSQL', 'Microservices', 'Amazon Web Services', 'Python', 'Docker', 'Kubernetes', 'Elasticsearch', 'React', 'Kafka', 'RabbitMQ', 'Python', 'Docker', 'Amazon Web Services', 'NoSQL', 'Python', 'PHP', 'Docker', 'Angular', 'PostgreSQL', 'Jenkins', 'K