In [1]:
!pip install sentence-transformers pdfminer.six python-docx


Collecting pdfminer.six
  Downloading pdfminer.six-20240706-py3-none-any.whl.metadata (4.1 kB)
Collecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transfo

In [2]:
import os
import torch
import pdfminer.high_level
from sentence_transformers import SentenceTransformer, util
import re


In [3]:

# Sentence-embedding model shared as a notebook-level global; rank_resumes()
# calls model.encode() on the job description and on every resume.
model = SentenceTransformer('all-MiniLM-L6-v2')

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [4]:
def extract_text_from_pdf(pdf_path):
    """Return the text of a PDF, or a single space when nothing usable comes back.

    Args:
        pdf_path: Path handed to ``pdfminer.high_level.extract_text``.

    Returns:
        The extracted text with leading/trailing whitespace stripped. A
        single space ``" "`` is returned when the extractor yields an empty
        value or when any exception is raised; the exception is reported
        with ``print`` rather than propagated.
    """
    fallback = " "
    try:
        extracted = pdfminer.high_level.extract_text(pdf_path)
        if not extracted:
            return fallback
        return extracted.strip()
    except Exception as e:
        # Best-effort: one unreadable file must not abort a whole batch.
        print(f"Error extracting {pdf_path}: {e}")
        return fallback


In [13]:
def extract_relevant_sections(resume_text):
    """Pull the text that follows the "Skills" and "Experience" headings.

    For each heading, the first case-insensitive occurrence of the word
    followed by one ``:`` or whitespace character is located, and the rest
    of that line is captured (``.`` does not cross newlines). When the
    separator consumed is itself a newline, the captured text is the
    following line.

    Args:
        resume_text: Resume contents as a string.

    Returns:
        A ``(skills, experience)`` tuple of strings; a heading that is not
        found contributes an empty string.
    """
    section_patterns = (r"Skills[:\s](.+)", r"Experience[:\s](.+)")

    captured = []
    for pattern in section_patterns:
        hit = re.search(pattern, resume_text, re.IGNORECASE)
        if hit:
            captured.append(hit.group(1))
        else:
            captured.append("")

    return tuple(captured)


In [5]:
def clean_text(text):
    """Normalise text: collapse whitespace, drop symbols, lowercase, trim.

    Every whitespace run becomes one space first; afterwards any character
    other than ASCII letters, digits, comma, period or space is deleted.
    Because symbols are removed after the whitespace collapse, deleting a
    symbol that sat between two spaces leaves a double space behind.

    NOTE(review): a later cell defines another ``clean_text``; once that
    cell has run, this definition is no longer the one in effect.

    Args:
        text: Input string.

    Returns:
        The lowercased, stripped result.
    """
    single_spaced = re.sub(r'\s+', ' ', text)
    allowed_only = re.sub(r'[^a-zA-Z0-9,. ]', '', single_spaced)
    return allowed_only.strip().lower()


In [10]:
# Job posting that resumes are ranked against. It is passed to
# rank_resumes(), which encodes the string as written here (including the
# Markdown markers) without running it through clean_text().
job_description = """
We are seeking a highly skilled **Information Technology Specialist** with expertise in **network security, system administration, and IT infrastructure management**.

### **Key Responsibilities:**
- **System Administration & Network Security:** Manage **Active Directory, Group Policy Objects (GPO), and Microsoft Exchange**.
- **Cloud & Virtualization:** Experience with **VMware, Azure, AWS, and Office 365**.
- **Backup & Disaster Recovery:** Implement enterprise **backup solutions, data recovery, and storage management**.
- **Hardware & Software Maintenance:** Troubleshoot and manage **network devices, servers, and enterprise software solutions**.
- **Project Management:** Oversee **IT infrastructure upgrades, process automation, and security compliance**.
- **Programming & Scripting:** Proficiency in **PowerShell, Python, C++, and SQL**.
- **Cybersecurity & Compliance:** Hands-on experience with **firewalls, VPNs, and IT security protocols**.

### **Preferred Qualifications:**
- Bachelor's/Master’s degree in **Information Technology, Computer Science, or Network Administration**.
- Certifications such as **CompTIA Network+, Security+, MCP, or CISSP**.
- Strong **troubleshooting and problem-solving skills**.

We are looking for a **detail-oriented, proactive IT professional** with experience in **managing enterprise IT environments, security policies, and business continuity strategies**.
"""


In [11]:
def rank_resumes(resume_folder, job_description, min_chars=50):
    """Rank the PDF resumes in a folder by similarity to a job description.

    Each PDF is converted to text with ``extract_text_from_pdf``, normalised
    with ``clean_text``, embedded with the notebook-level ``model`` and
    compared to the job-description embedding via cosine similarity.

    NOTE(review): the all-MiniLM-L6-v2 model card states that input longer
    than 256 word pieces is truncated, so long resumes are presumably scored
    on their opening portion only -- confirm before relying on the ranking.

    Args:
        resume_folder: Directory whose entries are scanned (not recursive).
        job_description: Text every resume is compared against. It is
            encoded as given; ``clean_text`` is not applied to it.
        min_chars: Resumes whose cleaned text is shorter than this many
            characters are skipped. Defaults to 50.

    Returns:
        A list of ``(file_name, similarity_score)`` tuples sorted by score,
        highest first. Equal scores keep file-name order.

    Raises:
        OSError: Propagated from ``os.listdir`` when ``resume_folder``
            cannot be listed.
    """
    resume_scores = []

    # Encode the job description once; it is reused for every comparison.
    job_desc_embedding = model.encode(job_description, convert_to_tensor=True)

    # os.listdir() returns entries in arbitrary order. Sorting them, together
    # with the stable sort below, makes the order of tied scores reproducible.
    for resume_file in sorted(os.listdir(resume_folder)):
        # Case-insensitive match so files such as "CV.PDF" are not dropped.
        if not resume_file.lower().endswith(".pdf"):
            continue

        resume_path = os.path.join(resume_folder, resume_file)
        resume_text = clean_text(extract_text_from_pdf(resume_path))

        # Too little text to score meaningfully (this also filters out files
        # for which extraction produced nothing).
        if len(resume_text) < min_chars:
            continue

        resume_embedding = model.encode(resume_text, convert_to_tensor=True)
        similarity_score = util.pytorch_cos_sim(job_desc_embedding, resume_embedding).item()

        resume_scores.append((resume_file, similarity_score))

    resume_scores.sort(key=lambda x: x[1], reverse=True)

    return resume_scores


In [16]:
import nltk
from nltk.corpus import stopwords

# Download the NLTK "stopwords" corpus, then keep the English list as a set
# so the clean_text() defined below can test word membership against it.
nltk.download("stopwords")
stop_words = set(stopwords.words("english"))

def clean_text(text):
    """Strip digits, lowercase, and drop stop words from ``text``.

    Digit runs are deleted, the remainder is lowercased and split on
    whitespace, and tokens found in the notebook-level ``stop_words`` set
    are discarded. Tokens are compared as-is, so a stop word with attached
    punctuation (e.g. ``"the."``) is kept.

    NOTE(review): this redefines the ``clean_text`` from an earlier cell;
    whichever definition ran last is the one ``rank_resumes`` will call.

    Args:
        text: Input string.

    Returns:
        The surviving tokens joined by single spaces.
    """
    digitless = re.sub(r"\d+", "", text)
    tokens = digitless.lower().split()
    return " ".join(token for token in tokens if token not in stop_words)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [17]:
# Folder containing the candidate resumes as PDF files.
resume_folder = "/content/resume_datashit"

# Score every resume against the job description, best match first.
ranked_resumes = rank_resumes(resume_folder, job_description)

print("Resume ranking based on job description:")
for rank, (resume, score) in enumerate(ranked_resumes, start=1):
    print(f"{rank}. {resume} - Score: {score:.2f}")


Resume ranking based on jjob description:
1. 10089434.pdf - Score: 0.67
2. 10840430.pdf - Score: 0.65
3. 10641230.pdf - Score: 0.63
4. 10839851.pdf - Score: 0.61
5. 10247517.pdf - Score: 0.58
6. 10553553.pdf - Score: 0.47
7. 10265057.pdf - Score: 0.32
