Downloading the sentence comparison model from huggingface (only done once by Petar - it will be saved and accessable through Git from now on)

In [1]:
from huggingface_hub import snapshot_download

# Download the model and store it in the models directory
snapshot_download(
    repo_id="sentence-transformers/all-MiniLM-L6-v2",
    local_dir="models/all-MiniLM-L6-v2",
    local_dir_use_symlinks=False  # This ensures full files are copied, not symlinked
)


  from .autonotebook import tqdm as notebook_tqdm
For more details, check out https://huggingface.co/docs/huggingface_hub/main/en/guides/download#download-files-to-local-folder.
Fetching 30 files: 100%|██████████| 30/30 [00:00<00:00, 707.95it/s]


'C:\\Repos\\TSM_MLOps\\resume_matcher\\notebooks\\models\\all-MiniLM-L6-v2'

Load the model to use it - you run this cell before you want to use the model

In [2]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("../models/all-MiniLM-L6-v2")


In [3]:
import pdfplumber

def extract_text_from_pdf(pdf_path):
    with pdfplumber.open(pdf_path) as pdf:
        return "\n".join([page.extract_text() or "" for page in pdf.pages])

# Load your own resume file here
resume_path = "../data/resumes/sample_resume.pdf"
resume_text = extract_text_from_pdf(resume_path)

print(resume_text[:10000])  # Preview the first 1000 characters


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox


CYNTHIA DWAYNE
Software Developer
CONTACT WORK EXPERIENCE
cynthia@beamjobs.com Software Developer
(123) 456-7890
QuickBooks
Brooklyn, NY
January 2017 - current / New York, NY
LinkedIn
· Worked on the payments team to save time and improve cash
Github
flow for over 50,000 through the development of modern,
responsive customer experiences
· Led the migration from AWS to GCP for the team to reduce
CAREER OBJECTIVE
cloud costs by $260,000 per year
Throughout my 7-year-plus · Worked closely with the product team to re-configure the
career as a software developer, I processing of invoices, saving customers over 125,000 manual
have focused on developing hours of work per month
scalable and well-documented · Mentored 3 junior front-end developers on the team on React,
code. I enjoy working
and documented best practices within the organization
collaboratively but can also run
with projects independently.
Front-End Developer
Excited about the prospect of
joining a product-driven AMR
company like

In [4]:
job_description = """Construction is a multi-trillion-dollar industry used by every human across the planet. It needs your help: we need to invent new software systems so construction becomes as efficient, quick, automated and green as possible. The team at Benetics (60+ years of engineering experience, 2 founding engineers from Google, and a CEO with a successful exit in tech.) is laser-focused on a vision of understanding each building’s DNA: How are people on-site communicating? How was the building constructed compared to what the plan said? How can we automate planning, implementation, logistics, and maintenance into a seamless process flow?
Join the fastest-moving company in this space.

Tasks

Build reliable, performant, and scalable software solutions.
Follow data-driven experimentation to iterate on features.
Bring machine learning models for construction process flows.
Work closely and collaborate with engineers, product managers, designers, and other team members.
Share expertise, hold tech talks, contribute to design meetings, and enrich our engineering culture.
Tackle greenfield algorithmic, scaling, and performance challenges.
Code using primarily TypeScript and Python
Requirements

\[required] 2+ years of software engineering experience
\[required] Master’s or bachelor’s degree (or equivalent) in computer science, mathematics, or related fields (ETH Zurich or equivalent)
Your A-Game: top-notch software engineering, a curious mind looking to learn constantly, and an empathic style that wants to teach others.
Benefits

Experienced and powerful team; a start-up without the chaos.
Quick pacing while paying attention to all the details: we build a delightful, bullet-proof product.
Performance-driven compensation, including equity.
A place to learn and develop yourself.
A once-in-a-generation chance to reshape an industry of crucial importance for humanity and the planet.
We are hiring the best people around the world and offer visa sponsorship.

"""

In [5]:
import re

def clean_text(text):
    # lowercase the text
    text = text.lower()
    #remove special characters 
    text = re.sub(r"[\n•\-]+", " ", text)
    # Remove extra spaces
    text = re.sub(r"\s+", " ", text).strip()
    return text


In [6]:
resume_clean = clean_text(resume_text)
job_clean = clean_text(job_description)


In [7]:
from sentence_transformers.util import cos_sim

# encode texts to get their embeddings
embeddings = model.encode([resume_text, job_description], convert_to_tensor=True)

# cosine similarity score
similarity = cos_sim(embeddings[0], embeddings[1]).item()
print(f"Resume Fit Score: {similarity * 100:.2f}/100")


Resume Fit Score: 47.22/100


In [8]:
# basic keyword overlap
resume_words = set(clean_text(resume_text).split())
job_words = set(clean_text(job_description).split())
overlap = len(resume_words & job_words) / len(job_words)
print(f"Keyword Overlap: {overlap:.2%}")


Keyword Overlap: 14.76%


In [9]:
from transformers import pipeline

summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")

summary_job_desc = summarizer("This is a job description - summarizing in order to fit potential resumes:" + job_description, max_length=450, min_length=300, do_sample=False)[0]['summary_text']
print("Summarized Job Description:\n", summary_job_desc)

summary_resume_text = summarizer("This is a resume - summarizing in order to fit potential job descriptions:" + resume_text, max_length=450, min_length=300, do_sample=False)[0]['summary_text']
print("Summarized Resume:\n", summary_resume_text)


Device set to use cpu
Your max_length is set to 450, but your input_length is only 432. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=216)


Summarized Job Description:
  This is a job description - summarizing in order to fit potential resumes . The team at Benetics (60+ years of engineering experience, 2 founding engineers from Google, and a CEO with a successful exit in tech.) is laser-focused on a vision of understanding each building’s DNA . How are people on-site communicating? How was the building constructed compared to what the plan said? How can we automate planning, implementation, logistics, and maintenance into a seamless process flow? We need to invent new software systems so construction becomes as efficient, quick, automated and green as possible . The company is hiring the best people around the world and offer visa sponsorship. It is a once-in-a-generation chance to reshape an industry of crucial importance for humanity and the planet. It's a place to learn and develop yourself. We build a delightful, bullet-proof product. The job description is available for $100,000 to $300,000 per year. We are hiring fo

In [10]:
embeddings = model.encode([summary_resume_text, summary_job_desc], convert_to_tensor=True)

# cosine similarity score
similarity = cos_sim(embeddings[0], embeddings[1]).item()
print(f"Summarized Resume Fit Score: {similarity * 100:.2f}/100")

Summarized Resume Fit Score: 49.18/100


In [11]:
model2 = SentenceTransformer("sentence-transformers/msmarco-distilbert-base-v4") #maybe more task specific
embeddings2 = model2.encode([resume_clean, job_clean], convert_to_tensor=True)
similarity2 = cos_sim(embeddings2[0], embeddings2[1]).item()
print(f"Resume Fit Score (MS MARCO): {similarity2 * 100:.2f}/100")

Resume Fit Score (MS MARCO): 28.49/100


In [12]:
print(f"Resume Fit Score (MS MARCO): {similarity2 * 100:.2f}/100")

Resume Fit Score (MS MARCO): 28.49/100


Another approach: creating objects out of both texts and comparing them "manually" using ollama. does not work yet

In [13]:
%pip install ollama

Note: you may need to restart the kernel to use updated packages.


In [18]:
import ollama
ollama.pull('tinyllama')

ProgressResponse(status='success', completed=None, total=None, digest=None)

In [19]:
from ollama import Client
client = Client()
response = client.chat(model='tinyllama', messages=[
    {"role": "user", "content": "Extract skills and experience from this resume: <your text>"}
])
print(response['message']['content'])


1. Sales Experience: As an experienced sales manager with over six years of experience in the retail industry, you have demonstrated success in managing a team of professionals responsible for achieving sales targets and generating revenue.

2. Project Management Skills: As a project manager, you have extensive experience in leading complex projects across various industries. You are skilled in managing budgets, stakeholders, timelines, and deliverables.

3. Time Management and Prioritization: As a time-management expert with more than 10 years of experience, you excel at prioritizing tasks and ensuring that deadlines are met. You have also been trained in effective time management strategies to manage workloads effectively.

4. Leadership Skills: As an experienced leader in the sales department, you possess leadership skills that enable you to motivate, inspire and train your team. You have led successful sales campaigns, developed strong relationships with clients and built a dedicat

In [20]:
def extract_resume_info(resume_text: str, model: str = 'mistral') -> dict:
    from ollama import Client
    client = Client()

    prompt = f"""
Extract the following fields from the resume below and return them as a valid JSON object:

- name
- years_of_experience
- skills
- past_job_titles
- education
- soft_skills

Example output:
{{
  "name": "Emma Liu",
  "years_of_experience": 3,
  "skills": ["Go", "Kubernetes", "PostgreSQL"],
  "past_job_titles": ["Platform Engineer", "Backend Developer"],
  "education": "MSc in Software Engineering",
  "soft_skills": ["teamplayer", "problem solver"]
}}

Resume:
{resume_text}

Respond only with valid JSON. Do not include any extra text or explanation.
"""

    response = client.chat(
        model=model,
        messages=[{"role": "user", "content": prompt}]
    )

    return response['message']['content']


In [22]:
json_output = extract_resume_info(resume_clean, model='tinyllama')
print(json_output)

[
{
   "name": "Emma Liu",
   "years_of_experiencce": 3,
   "skills": ["Go", "Kubernetes", "PostgreSQL"],
   "past_job_title": ["Platform Engineer", "Backend Developer"],
   "eduation": "MSc in Software Engineering",
   "soft_skills": ["Teamplayer", "Problem Solver"]
}]


In [23]:
def extract_job_description_info(jd_text: str, model: str = 'mistral') -> dict:
    from ollama import Client
    client = Client()

    prompt = f"""
Extract the following fields from the job description and return them as a valid JSON object:

- title
- required_experience
- required_skills
- preferred_skills
- education
- soft_skills

Example output:
{{
  "title": "Backend Engineer",
  "required_experience": 2,
  "required_skills": ["Python", "AWS", "PostgreSQL"],
  "preferred_skills": ["Kubernetes", "Docker"],
  "education": "Bachelor's in Computer Science",
  "soft_skills": ["communication", "problem-solving"]
}}

Job Description:
{jd_text}

Respond only with valid JSON. Do not include any additional explanation or formatting.
"""

    response = client.chat(
        model=model,
        messages=[{"role": "user", "content": prompt}]
    )

    return response['message']['content']


In [24]:
json_output_job = extract_job_description_info(job_clean)
print(json_output_job)

 {
      "title": "Software Engineer",
      "required_experience": 2,
      "required_skills": ["TypeScript", "Python"],
      "preferred_skills": [],
      "education": "Bachelor's or Master's degree in Computer Science, Mathematics, or related fields",
      "soft_skills": ["curious mind", "empathic style"]
   }


In [25]:
resume_data = extract_resume_info(resume_clean)
job_data = extract_job_description_info(job_clean)

In [26]:
import json

resume = json.loads(resume_data)
job = json.loads(job_data)

# Compare skills
common_skills = set(resume["skills"]).intersection(job["required_skills"])
match_score = len(common_skills) / len(job["required_skills"]) * 100

print(f"🔍 Resume matches {match_score:.2f}% of required job skills.")

🔍 Resume matches 50.00% of required job skills.


In [27]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('models/all-MiniLM-L6-v2')  

In [28]:
from sentence_transformers import util

def compare_resume_and_job(resume: dict, job: dict, model, explain: bool = False) -> float:
    def sim(a, b):
        return util.cos_sim(model.encode(a, convert_to_tensor=True),
                            model.encode(b, convert_to_tensor=True)).item()

    def sim_list(list1, list2, threshold=0.7):
        matches = []
        for item1 in list1:
            for item2 in list2:
                if sim(item1, item2) >= threshold:
                    matches.append((item1, item2))
        return matches

    # --- 1. Skills ---
    resume_skills = resume.get("skills", [])
    required_skills = job.get("required_skills", [])
    preferred_skills = job.get("preferred_skills", [])

    required_matches = sim_list(resume_skills, required_skills)
    preferred_matches = sim_list(resume_skills, preferred_skills)

    required_score = len(required_matches) / max(1, len(required_skills))
    preferred_score = len(preferred_matches) / max(1, len(preferred_skills))
    skill_score = (required_score * 0.8) + (preferred_score * 0.2)

    # --- 2. Experience (rule-based) ---
    resume_exp = resume.get("years_of_experience", 0)
    job_exp = job.get("required_experience", 0)

    if isinstance(resume_exp, str):
        resume_exp = int(''.join(filter(str.isdigit, resume_exp)) or 0)

    exp_score = 1.0 if resume_exp >= job_exp else 0.5 if resume_exp >= job_exp * 0.75 else 0.0

    # --- 3. Job Title (semantic similarity) ---
    resume_titles = resume.get("past_job_titles", [])
    job_title = job.get("title", "")
    title_score = 0.0

    for rt in resume_titles:
        if sim(rt, job_title) > 0.75:
            title_score = 1.0
            break

    # --- 4. Education (semantic match) ---
    import re

    def simplify_education(text):
        text = text.lower()
        text = re.sub(r"bachelor(?:'s)?|bsc", "bachelor", text)
        text = re.sub(r"master(?:'s)?|msc", "master", text)
        text = re.sub(r"(in|of)", "", text)
        text = re.sub(r"[,\.]", "", text)
        # remove university names, keep degree + subject
        keywords = ["bachelor", "master", "computer science", "mathematics", "data science", "engineering"]
        return ' '.join([kw for kw in keywords if kw in text])

    resume_edu_clean = simplify_education(resume.get("education", ""))
    job_edu_clean = simplify_education(job.get("education", ""))

    edu_score = sim(resume_edu_clean, job_edu_clean) if resume_edu_clean and job_edu_clean else 0.0
    edu_score = 1.0 if edu_score > 0.7 else 0.0

    # --- 5. Final Weighted Score ---
    final_score = (
        skill_score * 0.5 +
        exp_score * 0.2 +
        title_score * 0.2 +
        edu_score * 0.1
    )

    percent_score = round(final_score * 100, 2)

    if explain:
        print("📊 Match Breakdown:")
        print(f"✔️ Required skills matched: {[m[1] for m in required_matches]}")
        print(f"➕ Preferred skills matched: {[m[1] for m in preferred_matches]}")
        print(f"🛠️ Skill score: {round(skill_score * 100, 2)}%")
        print(f"📈 Experience score: {round(exp_score * 100, 2)}%")
        print(f"🎯 Title score: {round(title_score * 100, 2)}%")
        print(f"🎓 Education score: {round(edu_score * 100, 2)}%")

    return percent_score


In [29]:
import json


resume_raw = extract_resume_info(resume_clean)
job_raw = extract_job_description_info(job_clean)

resume_dict = json.loads(resume_raw)
job_dict = json.loads(job_raw)


model = SentenceTransformer('models/all-MiniLM-L6-v2')  # or just use HuggingFace name
score = compare_resume_and_job(resume_dict, job_dict, model, True)

print(f"\n🔍 Final Resume Fit Score: {score}%")


📊 Match Breakdown:
✔️ Required skills matched: ['Typescript']
➕ Preferred skills matched: []
🛠️ Skill score: 40.0%
📈 Experience score: 100.0%
🎯 Title score: 100.0%
🎓 Education score: 100.0%

🔍 Final Resume Fit Score: 70.0%
