# In [2]:
import os
import numpy as np
import json
from sklearn.metrics.pairwise import cosine_similarity

class JobMatcher:
    """Rank CV embeddings against job-description embeddings.

    Embeddings are read from ``.npy`` files found recursively under the two
    input folders. Each job description is compared with every CV using
    cosine similarity, and the best matches are written to a JSON file in
    the output folder.
    """

    def __init__(self, job_description_folder, cv_folder, output_folder):
        """Store the folder paths and create the output folder if needed.

        Args:
            job_description_folder: Folder searched for job-description
                ``.npy`` files.
            cv_folder: Folder searched for CV ``.npy`` files.
            output_folder: Folder that receives the JSON results; created
                (including parents) when it does not exist.
        """
        self.job_description_folder = job_description_folder
        self.cv_folder = cv_folder
        self.output_folder = output_folder

        os.makedirs(self.output_folder, exist_ok=True)

    def load_embeddings(self, folder):
        """Load every ``.npy`` file below *folder*.

        Args:
            folder: Root folder to walk recursively.

        Returns:
            A dict mapping each file name (final extension stripped) to the
            array content converted to a (possibly nested) Python list.
            Files sharing a base name in different sub-folders share a key;
            the one visited last wins.
        """
        embeddings = {}
        for root, dirs, files in os.walk(folder):
            # os.walk yields entries in arbitrary filesystem order. Sorting
            # (in place for dirs, so the traversal itself is affected) makes
            # the result order, and therefore tie-breaking, reproducible.
            dirs.sort()
            for file in sorted(files):
                if not file.endswith(".npy"):
                    continue
                key = os.path.splitext(file)[0]  # Remove file extension from the key
                embeddings[key] = np.load(os.path.join(root, file)).tolist()
        return embeddings

    def match_candidates_to_jobs(self, top_n=5):
        """Rank CVs for every job description and save the ranking as JSON.

        Args:
            top_n: Maximum number of CVs kept per job description. ``None``
                keeps all of them; values below 1 keep none.

        Returns:
            A dict ``{job_name: {"Top_Candidates": [...]}}`` where each
            candidate is ``{"CV_File": name, "Similarity": score}``, ordered
            from most to least similar. When no CV embeddings are found the
            candidate lists are empty. The same structure is written to
            ``candidate_job_matching_results.json`` in the output folder.
        """
        # Load embeddings for job descriptions and CVs
        job_description_embeddings = self.load_embeddings(self.job_description_folder)
        cv_embeddings = self.load_embeddings(self.cv_folder)

        # Built once: both are invariant across jobs, and index j in the
        # similarity row refers to cv_names[j].
        cv_names = list(cv_embeddings)
        cv_vectors = list(cv_embeddings.values())
        limit = None if top_n is None else max(top_n, 0)

        # Match CVs to job descriptions and rank based on similarity
        result_data = {}
        for job_name, job_embedding in job_description_embeddings.items():
            top_candidates = []
            # cosine_similarity rejects an empty CV matrix, so skip the call
            # and report no candidates instead of raising.
            if cv_vectors:
                scores = cosine_similarity([job_embedding], cv_vectors)[0]
                ranked_indices = np.argsort(scores)[::-1][:limit]  # Descending order
                top_candidates = [
                    # float() keeps NumPy scalar types out of the result so
                    # it is always JSON-serializable.
                    {"CV_File": cv_names[j], "Similarity": float(scores[j])}
                    for j in ranked_indices
                ]
            result_data[job_name] = {"Top_Candidates": top_candidates}

        # Save the results as JSON
        output_json_path = os.path.join(self.output_folder, 'candidate_job_matching_results.json')
        with open(output_json_path, 'w', encoding='utf-8') as json_file:
            json.dump(result_data, json_file, ensure_ascii=False, indent=4)

        return result_data

if __name__ == "__main__":
    # Where the JSON report goes, and where the .npy embeddings for the CVs
    # and the job descriptions are read from.
    output_folder = "matching_results"
    cv_folder = "word_embeddings/resume_preprocessed_embeddedings"
    job_description_folder = "word_embeddings/job_description_embeddedings"

    matcher = JobMatcher(
        job_description_folder=job_description_folder,
        cv_folder=cv_folder,
        output_folder=output_folder,
    )

    # Rank the CVs for each job description (top 5 per job); the ranking is
    # also written to disk by the matcher.
    matching_results = matcher.match_candidates_to_jobs()

    # Show the ranking on stdout.
    print(matching_results)


{' Business Development Project Manager_preprocessed.txt': {'Top_Candidates': [{'CV_File': '38650096.pdf_tokenized', 'Similarity': 0.7619707792445218}, {'CV_File': '11232471.pdf_tokenized', 'Similarity': 0.7480307522618559}, {'CV_File': '37818861.pdf_tokenized', 'Similarity': 0.7439784658938098}, {'CV_File': '79041971.pdf_tokenized', 'Similarity': 0.7410386415356964}, {'CV_File': '25930778.pdf_tokenized', 'Similarity': 0.7385965231957199}]}, ' Construction Project Manager_preprocessed.txt': {'Top_Candidates': [{'CV_File': '25930778.pdf_tokenized', 'Similarity': 0.7659567743339635}, {'CV_File': '37818861.pdf_tokenized', 'Similarity': 0.7657579124377455}, {'CV_File': '34419403.pdf_tokenized', 'Similarity': 0.7623062025726006}, {'CV_File': '11232471.pdf_tokenized', 'Similarity': 0.7583225395707489}, {'CV_File': '36621169.pdf_tokenized', 'Similarity': 0.7578789659396633}]}, '$17.50hr - Patient Advocate - Part Time Position - Mon - Fri ..._preprocessed.txt': {'Top_Candidates': [{'CV_File': 