In [3]:
from pypdf import PdfReader

def extract_text_from_pdf(pdf_file: str) -> str:
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_file: Path of the PDF file to read; it is opened in binary mode.

    Returns:
        The strings returned by ``extract_text()`` for each page, joined
        with no separator between pages.

    Raises:
        OSError: If the file cannot be opened (raised by ``open``).
    """
    with open(pdf_file, 'rb') as file:
        pdf_reader = PdfReader(file)
        # Keep extraction inside the ``with`` block so the reader always
        # works on an open file. Joining once avoids rebuilding the
        # accumulated string for every page.
        return ''.join(page.extract_text() for page in pdf_reader.pages)

In [4]:
# Resume PDFs to rank. Entries in `documents` follow the order of this list.
org_docs = [
    "Data/Resumes/alfred_pennyworth_pm.pdf",
    "Data/Resumes/barry_allen_fe.pdf",
    "Data/Resumes/bruce_wayne_fullstack.pdf",
    "Data/Resumes/john_doe.pdf",
    "Data/Resumes/James_Coal.pdf",
    "Data/Resumes/Johny_Yes.pdf"
]
# Extract the text of each resume, one string per path in org_docs.
documents = [extract_text_from_pdf(resume_path) for resume_path in org_docs]
# Display the extracted text for inspection.
documents


['Alfred Pennyworth\nProduct ManagerSilicon Valley, CA, USA\n♂¶obile-alt(123) 456-7890\n/envel⌢pealfred.pennyworth@email.com\n/linkedin-inapennyworth\n/githubapennyworth\nProfessional Summary\nSeasoned Product Manager with over 20 years of experience in software development and product\nmanagement, having worked at all FAANG companies. Exceptional leadership skills, strategic\nthinking, and a track record of managing products from conception to market success.\nSkills\nProduct management, Agile methodologies, Leadership, Communication, Project\nmanagement, User Experience Design, Market Research, Data Analysis, Java,\nPython, JavaScript, HTML/CSS, SQL, AWS\nExperience\n2017 –\nPresentProduct Manager ,Google , Mountain View, CA, USA\nLeading cross-functional teams to design, develop, and launch innovative products. Devel-\noping product strategies and making data-driven decisions to improve user experience and\nmeet business goals.\n2012 – 2017 Software Development Engineer III ,Amazon 

In [5]:
# Text of the job description that the resumes are compared against.
target_document = extract_text_from_pdf(
    "Data/JobDescription/job_desc_front_end_engineer.pdf"
)

In [13]:
from sentence_transformers import SentenceTransformer, util
# Identifier of the pretrained model used to embed the resumes and the job description.
MODEL_NAME = "all-MiniLM-L12-v2"
model = SentenceTransformer(MODEL_NAME)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [14]:
# Embed the job description and the resumes with the same model so their
# vectors can be compared directly.
# NOTE(review): sentence-transformers models truncate input beyond their
# max_seq_length, so long resumes may be only partly embedded. Confirm the
# limit for this model.
target_document_embedding = model.encode(target_document)
documents_embeddings = model.encode(documents)

In [16]:
from sklearn.metrics.pairwise import cosine_similarity
# Score each resume embedding against the job-description embedding.
# Each util.cos_sim call yields a tensor holding one similarity value.
similarities = []
for resume_embedding in documents_embeddings:
    similarities.append(util.cos_sim(target_document_embedding, resume_embedding))

# Pair every resume path with its score and order the pairs best match first.
ranked_resumes = sorted(
    zip(org_docs, similarities),
    key=lambda pair: pair[1],
    reverse=True,
)

# Print a 1-based ranking with the score rounded to four decimal places.
for rank, (resume_path, similarity) in enumerate(ranked_resumes, start=1):
    print(f"{rank}. {resume_path} - Similarity: {float(similarity[0]):.4f}")

1. Data/Resumes/barry_allen_fe.pdf - Similarity: 0.7259
2. Data/Resumes/john_doe.pdf - Similarity: 0.4285
3. Data/Resumes/alfred_pennyworth_pm.pdf - Similarity: 0.3215
4. Data/Resumes/bruce_wayne_fullstack.pdf - Similarity: 0.2946
5. Data/Resumes/James_Coal.pdf - Similarity: 0.2331
6. Data/Resumes/Johny_Yes.pdf - Similarity: 0.1296
