In [29]:
!pip install PyPDF2
!pip install PyMuPDF
!pip install --upgrade PyPDF2
import os
import nltk
import PyPDF2
import re
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

nltk.download('stopwords')

def extract_skills_from_text(text):
    """Return the skills section of *text*, or *text* itself if none is found.

    The section starts at the first case-insensitive whole-word match of
    "Skills" and runs up to (but not including) the next whole-word match of
    "Education", "Experience", "Work" or "Summary". If no such closing
    heading follows, everything from "Skills" to the end is returned.
    """
    heading = re.search(r"(?i)\bSkills\b", text)
    if heading is None:
        # No skills heading at all: fall back to the full text
        return text

    from_heading = text[heading.start():]
    next_heading = re.search(
        r"(?i)\bEducation\b|\bExperience\b|\bWork\b|\bSummary\b", from_heading
    )
    if next_heading is None:
        return from_heading
    return from_heading[:next_heading.start()]

def extract_text_from_pdf(file_path):
    """Return the text of every page of the PDF at *file_path*, concatenated.

    Pages are joined in document order with no separator between them.
    """
    with open(file_path, 'rb') as pdf_file:
        # Extraction must happen while the file is still open
        pages = PyPDF2.PdfReader(pdf_file).pages
        return "".join(page.extract_text() for page in pages)

def calculate_similarity(resume1, resume2):
    """Return the TF-IDF cosine similarity between two texts.

    Both texts are vectorized together with a ``TfidfVectorizer`` that drops
    NLTK's English stop words, and the cosine similarity of the two resulting
    rows is returned.

    Args:
        resume1: First text to compare.
        resume2: Second text to compare.

    Returns:
        The cosine similarity score, or ``0.0`` when neither text contains a
        usable term (for example, both are empty or consist only of stop
        words).
    """
    stop_words = stopwords.words('english')
    tfidf_vectorizer = TfidfVectorizer(stop_words=stop_words)

    try:
        tfidf_matrix = tfidf_vectorizer.fit_transform([resume1, resume2])
    except ValueError:
        # scikit-learn raises ValueError ("empty vocabulary") when no term
        # survives tokenization and stop-word removal in either document.
        # Two texts with nothing comparable are scored as not similar
        # instead of aborting the caller's whole run.
        return 0.0

    # Row 0 is resume1 and row 1 is resume2; the result is a 1x1 matrix
    return cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])[0][0]

# Path to the reference PDF (the job listing) that every resume is compared against
job_listing_resume_path = "/content/Accountant.pdf"

# Folder containing the candidate resume PDFs to score
resumes_folder = "/content/drive/MyDrive/Accounting"

# Read the job listing once and keep only its skills section
# (extract_skills_from_text returns the full text when no "Skills" heading is found)
job_listing_resume = extract_text_from_pdf(job_listing_resume_path)
job_listing_resume_skills = extract_skills_from_text(job_listing_resume)

# Accumulates (filename, similarity score) tuples, one per processed resume
similarity_results = []

# Score every PDF resume in the folder against the job listing's skills.
# os.listdir() returns names in arbitrary order, so sort them first: the
# stable sort at the end then orders tied scores by filename, reproducibly.
for filename in sorted(os.listdir(resumes_folder)):
    # Match the extension case-insensitively so files like "resume.PDF"
    # are not silently skipped
    if not filename.lower().endswith(".pdf"):
        continue

    # Read the worker's resume and isolate its skills section
    resume_file = os.path.join(resumes_folder, filename)
    worker_resume = extract_text_from_pdf(resume_file)
    worker_resume_skills = extract_skills_from_text(worker_resume)

    # Calculate similarity against the job listing's skills
    similarity = calculate_similarity(job_listing_resume_skills, worker_resume_skills)

    # Add the filename and similarity score to the results list
    similarity_results.append((filename, similarity))

# Sort the results in descending order based on similarity score
similarity_results.sort(key=lambda x: x[1], reverse=True)





# Report each resume's score as a percentage with 2 decimal places,
# in the order the results list is already sorted
for filename, similarity in similarity_results:
    # Round the raw score to 4 places first, then scale to a percentage
    percent = round(similarity, 4) * 100
    print(f"{filename} - {percent:.2f}% similarity")


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Chad_Gibbons_02.pdf - 23.21% similarity
Howard_Ong_01.pdf - 14.08% similarity
Howard_Ong_02.pdf - 8.59% similarity
Marceline_Anderson_04.pdf - 8.16% similarity
Rachelle_Beaudry_04.pdf - 7.35% similarity
Claudia_Alves_04.pdf - 6.51% similarity
Juliana_Silva_04.pdf - 6.51% similarity
Donna_Stroupe_01.pdf - 5.66% similarity
Harper_Russo_03.pdf - 5.48% similarity
Samira_Hadid_02.pdf - 4.86% similarity
Harper_Russo_07.pdf - 4.78% similarity
Greta_Mae_Evans_01.pdf - 4.06% similarity
Hannah_Morales_01.pdf - 4.01% similarity
Korina_Villanueva_05.pdf - 3.58% similarity
Korina_Villanueva_06.pdf - 3.14% similarity
Jonathan_Patterson_03.pdf - 2.98% similarity
Marceline_Anderson_08.pdf - 2.56% similarity
Korina_Villanueva_02.pdf - 2.52% similarity
Samira_Hadid_01.pdf - 1.68% similarity
Olivia_Wilson_10.pdf - 1.63% similarity
Helene_Paquet_01.pdf - 0.68% similarity
Korina_Villanueva_08.pdf - 0.45% similarity
Juliana_Silva_05.pdf - 0.21% similarity
Isabel_Mercado_01.pdf - 0.20% similarity
Bailey_Dupo