In [6]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [9]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pdfplumber

def extract_text_from_pdf(file_path):
    with pdfplumber.open(file_path) as pdf:
        text = ""
        for page in pdf.pages:
            text += page.extract_text()
    return text

def preprocess_text(text):

    tokens = word_tokenize(text.lower())
    words = [word for word in tokens if word.isalnum()]
    stop_words = set(stopwords.words("english"))
    filtered_words = [word for word in words if word not in stop_words]


    lemmatizer = nltk.stem.WordNetLemmatizer()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in filtered_words]

    return " ".join(lemmatized_words)

if __name__ == "__main__":

    job_description = "Software Engineer with experience in Bootstrap"


    resumes = [
        "resume 1.pdf",
        "resume 2.pdf",
        "resume 3.pdf",

    ]

    preprocessed_job_desc = preprocess_text(job_description)

    candidate_scores = []
    vectorizer = TfidfVectorizer()

    for resume_path in resumes:
        resume_text = extract_text_from_pdf(resume_path)
        preprocessed_resume = preprocess_text(resume_text)


        corpus = [preprocessed_resume, preprocessed_job_desc]
        tfidf_matrix = vectorizer.fit_transform(corpus)


        similarity_matrix = cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])
        similarity_score = similarity_matrix[0][0]

        candidate_scores.append((resume_path, similarity_score))


    candidate_scores.sort(key=lambda x: x[1], reverse=True)


    for rank, (resume_path, score) in enumerate(candidate_scores, 1):
        print(f"Rank {rank}: {resume_path}, Similarity Score: {score}")


Rank 1: resume 3.pdf, Similarity Score: 0.16320139082325497
Rank 2: resume 1.pdf, Similarity Score: 0.1568605939388511
Rank 3: resume 2.pdf, Similarity Score: 0.1557121520926502
