In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from pdfminer.high_level import extract_text
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

# Fetch the NLTK resources used below: the "punkt" tokenizer data
# (presumably what word_tokenize relies on -- confirm for the installed NLTK
# version) and the "stopwords" corpus read by stopwords.words("english").
# NOTE: these calls run every time this module/cell is executed; when a
# package is already installed NLTK just reports it as up-to-date rather
# than downloading it again.
nltk.download("punkt")
nltk.download("stopwords")


class ResumeMatcherPipeline:
    """Score resumes against a single job description (JD).

    The JD document is read once at construction time. Each call to
    ``process_resume`` extracts the text of one resume and compares it with
    the JD using TF-IDF cosine similarity, once on the raw text and once on
    the stop-word-filtered keyword lists.
    """

    # Weights used to blend the two similarity scores into "average_score".
    FULL_TEXT_WEIGHT = 0.3
    SKILLS_WEIGHT = 0.7

    # English stop words, loaded from NLTK on first use and shared by all
    # instances so the corpus is not re-read for every document.
    _stop_words = None

    def __init__(self, jd_path):
        """Load the job description at ``jd_path`` and pre-compute its keywords.

        Args:
            jd_path: Path of the job-description document, passed to
                ``extract_text``.
        """
        self.jd_path = jd_path
        self.jd_text = self.extract_text(jd_path)
        self.jd_keywords = self.extract_keywords(self.jd_text)

    @classmethod
    def _english_stop_words(cls):
        """Return the cached set of NLTK English stop words."""
        if cls._stop_words is None:
            cls._stop_words = frozenset(stopwords.words("english"))
        return cls._stop_words

    def extract_text(self, path):
        """Return the text extracted from the document at ``path``.

        Extraction is best-effort: if the document cannot be read or parsed,
        the failure is logged and an empty string is returned so that one bad
        file does not abort a batch run.

        Args:
            path: Path of the document to read.

        Returns:
            The extracted text, or ``""`` when extraction fails.
        """
        try:
            return extract_text(path)
        except Exception:
            # Catch Exception rather than using a bare ``except`` so that
            # KeyboardInterrupt / SystemExit still propagate, and record the
            # failure instead of hiding it.
            import logging

            logging.getLogger(__name__).warning(
                "Could not extract text from %r", path, exc_info=True
            )
            return ""

    def extract_keywords(self, text):
        """Return the lower-cased alphanumeric, non-stop-word tokens of ``text``.

        Tokens containing any non-alphanumeric character (for example
        punctuation) are dropped by the ``isalnum`` filter.

        Args:
            text: Raw document text.

        Returns:
            A list of tokens in document order; empty for blank input.
        """
        if not text or not text.strip():
            # Nothing to tokenize.
            return []
        stop_words = self._english_stop_words()
        return [
            word
            for word in word_tokenize(text.lower())
            if word.isalnum() and word not in stop_words
        ]

    def calculate_cosine_similarity(self, text1, text2):
        """Return the TF-IDF cosine similarity of two texts.

        Args:
            text1: First document text.
            text2: Second document text.

        Returns:
            The similarity as a ``float``. ``0.0`` is returned when either
            text is blank or when neither text yields any TF-IDF term.
        """
        if not text1 or not text2 or not text1.strip() or not text2.strip():
            # A blank document (e.g. failed extraction) has nothing in common
            # with anything; without this guard the vectorizer would raise.
            return 0.0
        try:
            tfidf_matrix = TfidfVectorizer().fit_transform([text1, text2])
        except ValueError:
            # Raised by TfidfVectorizer when the vocabulary is empty, i.e. no
            # token in either text survives its tokenization rules.
            return 0.0
        cosine_sim = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])
        return float(cosine_sim[0][0])

    def process_resume(self, resume_path):
        """Score the resume at ``resume_path`` against the job description.

        Args:
            resume_path: Path of the resume document.

        Returns:
            A dict of percentages rounded to two decimals:
            ``"full_resume_score"`` (raw-text similarity),
            ``"skills_score"`` (keyword similarity) and ``"average_score"``,
            which is the weighted blend ``FULL_TEXT_WEIGHT * full +
            SKILLS_WEIGHT * skills`` rather than a plain mean.
        """
        resume_text = self.extract_text(resume_path)
        resume_keywords = self.extract_keywords(resume_text)

        full_score = self.calculate_cosine_similarity(self.jd_text, resume_text)
        skill_score = self.calculate_cosine_similarity(
            " ".join(self.jd_keywords), " ".join(resume_keywords)
        )

        final_score = round(
            (self.FULL_TEXT_WEIGHT * full_score + self.SKILLS_WEIGHT * skill_score)
            * 100,
            2,
        )

        return {
            "full_resume_score": round(full_score * 100, 2),
            "skills_score": round(skill_score * 100, 2),
            "average_score": final_score,
        }


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
