## Function to Extract Text from PDF
___

In [1]:
import PyPDF2

def extract_text_from_pdf(pdf_path):
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        text = ''
        for page in reader.pages:
            text += page.extract_text()
    return text

## Preprocessing Function
___

In [2]:
import re

def preprocess(text):
    # Remove non-ASCII characters
    text = re.sub(r'[^\x00-\x7f]', r' ', text)

    # Lowercase the text for uniformity
    text = text.lower()

    # Remove URLs
    text = re.sub(r'http\S+', ' ', text)

    # Remove email addresses
    text = re.sub(r'\S*@\S*\s?', '', text)

    # Remove Twitter-specific artifacts like RT (retweet) and cc
    text = re.sub(r'\brt\b|cc', ' ', text)

    # Remove hashtags and mentions
    text = re.sub(r'#\S+|@\S+', ' ', text)

    # Remove punctuations and special characters
    text = re.sub(r'[^\w\s]', ' ', text)
    
    # Replace multiple spaces with a single space
    text = re.sub(r'\s+', ' ', text).strip()

    return text


## Convert Text to Bag of Words Vectors
___

In [3]:
from sklearn.feature_extraction.text import CountVectorizer

def text_to_bow_vector(texts):
    vectorizer = CountVectorizer()
    vectors = vectorizer.fit_transform(texts)
    return vectors, vectorizer

## Similarity Calculation Function
___

In [4]:
from sklearn.metrics.pairwise import cosine_similarity
def calculate_similarity(vector1, vector2):
    return cosine_similarity(vector1, vector2)

## Function to Process PDFs in a Folder
___

In [5]:
import os
def process_pdfs_in_folder(folder_path):
    resume_paths = [os.path.join(folder_path, file) for file in os.listdir(folder_path) if file.endswith('.pdf')]
    resumes_text = [preprocess(extract_text_from_pdf(resume_path)) for resume_path in resume_paths]
    return resumes_text, resume_paths

## Function to Match Job Description with Resumes
___

In [6]:
def match_resumes_with_job(job_desc_text, folder_path):
    job_desc_text = preprocess(job_desc_text)
    resumes_text, resume_paths = process_pdfs_in_folder(folder_path)

    all_text = [job_desc_text] + resumes_text
    vectors, vectorizer = text_to_bow_vector(all_text)

    job_vector = vectors[0]
    resume_vectors = vectors[1:]

    similarities = [calculate_similarity(job_vector, resume_vector)[0][0] for resume_vector in resume_vectors]

    matched_resumes = sorted(zip(resume_paths, similarities), key=lambda x: x[1], reverse=True)
    
    return matched_resumes


## Implementation
___

In [7]:
job_description = input("Your job description text here")
folder_path = "/Users/pranavkhot/Documents/INTERNSHIP/TASK2/PROJECT/CV"

matched_resumes = match_resumes_with_job(job_description, folder_path)
for resume, score in matched_resumes:
    rounded_score = round(score * 100)  # Convert to percentage and round
    print(f"Resume: {resume}, Similarity Score: {rounded_score}%")

Resume: /Users/pranavkhot/Documents/INTERNSHIP/TASK2/PROJECT/CV/RESUME9.pdf, Similarity Score: 58%
Resume: /Users/pranavkhot/Documents/INTERNSHIP/TASK2/PROJECT/CV/RESUME18.pdf, Similarity Score: 58%
Resume: /Users/pranavkhot/Documents/INTERNSHIP/TASK2/PROJECT/CV/RESUME11.pdf, Similarity Score: 57%
Resume: /Users/pranavkhot/Documents/INTERNSHIP/TASK2/PROJECT/CV/RESUME12.pdf, Similarity Score: 56%
Resume: /Users/pranavkhot/Documents/INTERNSHIP/TASK2/PROJECT/CV/RESUME10.pdf, Similarity Score: 56%
Resume: /Users/pranavkhot/Documents/INTERNSHIP/TASK2/PROJECT/CV/RESUME4.pdf, Similarity Score: 56%
Resume: /Users/pranavkhot/Documents/INTERNSHIP/TASK2/PROJECT/CV/RESUME5.pdf, Similarity Score: 56%
Resume: /Users/pranavkhot/Documents/INTERNSHIP/TASK2/PROJECT/CV/RESUME6.pdf, Similarity Score: 55%
Resume: /Users/pranavkhot/Documents/INTERNSHIP/TASK2/PROJECT/CV/RESUME8.pdf, Similarity Score: 53%
Resume: /Users/pranavkhot/Documents/INTERNSHIP/TASK2/PROJECT/CV/RESUME19.pdf, Similarity Score: 51%
Resum