In [None]:
pip install pymupdf pillow pytesseract scikit-learn


In [None]:
import fitz              # PyMuPDF
import pytesseract
from PIL import Image
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import sys


# -------------------------------------------------
# Clean text
# -------------------------------------------------
def clean_text(text):
    """Normalize raw text into lowercase, space-separated ASCII tokens.

    The input is lower-cased, every character that is not an ASCII letter,
    digit, or space is replaced by a space, runs of whitespace are collapsed
    to a single space, and leading/trailing whitespace is removed.
    """
    lowered = text.lower()
    # Punctuation, newlines, tabs, and non-ASCII characters all become spaces.
    alnum_only = re.sub(r"[^a-zA-Z0-9 ]", " ", lowered)
    single_spaced = re.sub(r"\s+", " ", alnum_only)
    return single_spaced.strip()


# -------------------------------------------------
# Extract text from PDF
# -------------------------------------------------
def extract_from_pdf(path):
    """Return the cleaned text of every page of the PDF at *path*.

    Each page's text is read with ``page.get_text()``, the pages are joined
    with newlines, and the result is passed through ``clean_text``.
    """
    # Open the document as a context manager so the file handle is released
    # even when reading a page raises; previously it was never closed.
    with fitz.open(path) as doc:
        raw = "\n".join(page.get_text() for page in doc)

    return clean_text(raw)


# -------------------------------------------------
# Extract text from image (OCR)
# -------------------------------------------------
def extract_from_image(path):
    """Run OCR on the image at *path* and return the cleaned text.

    The image is opened with Pillow, passed to
    ``pytesseract.image_to_string``, and the recognized text is normalized
    with ``clean_text``.
    """
    # Use the image as a context manager so its file handle is closed once
    # OCR has finished (or failed); previously it was left open.
    with Image.open(path) as img:
        text = pytesseract.image_to_string(img)
    return clean_text(text)


# -------------------------------------------------
# Extract text from TXT file
# -------------------------------------------------
def extract_from_text(path):
    """Read the text file at *path* and return its cleaned contents.

    The file is decoded as UTF-8; bytes that cannot be decoded are dropped
    (``errors="ignore"``) rather than raising.
    """
    with open(path, "r", encoding="utf-8", errors="ignore") as handle:
        raw = handle.read()
    return clean_text(raw)


# -------------------------------------------------
# Detect file type + route to correct extractor
# -------------------------------------------------
def extract_resume(path):
    """Extract cleaned resume text, choosing the extractor by file extension.

    Supported extensions (matched case-insensitively): ``.pdf``, ``.png``,
    ``.jpg``, ``.jpeg`` and ``.txt``. For any other extension an error
    message is printed and the process exits with status 1.
    """
    # Compare against a lower-cased copy so "CV.PDF" and "notes.TXT" are
    # accepted just like the image extensions already were.
    lowered = path.lower()

    if lowered.endswith(".pdf"):
        return extract_from_pdf(path)

    if lowered.endswith((".png", ".jpg", ".jpeg")):
        return extract_from_image(path)

    if lowered.endswith(".txt"):
        return extract_from_text(path)

    print("‚ùå Unsupported resume file type.")
    sys.exit(1)


def extract_job_description(path):
    """Extract cleaned job-description text from a PDF or a text file.

    Paths ending in ``.pdf`` (matched case-insensitively) are read as PDFs;
    every other path is read as a UTF-8 text file.
    """
    # Case-insensitive match so "JD.PDF" is not misread as plain text.
    if path.lower().endswith(".pdf"):
        return extract_from_pdf(path)
    return extract_from_text(path)


# -------------------------------------------------
# Calculate ATS Score
# -------------------------------------------------
def calculate_ats_score(resume_text, job_text):
    """Score how closely a resume matches a job description.

    Both texts are converted to word-count vectors with English stop words
    removed and compared with cosine similarity.

    Args:
        resume_text: Resume text (expected to be already cleaned).
        job_text: Job-description text (expected to be already cleaned).

    Returns:
        A ``(score, missing)`` tuple. ``score`` is the cosine similarity
        expressed as a percentage rounded to two decimals. ``missing`` is an
        alphabetically sorted list of job-description terms that do not
        appear in the resume.
    """
    cv = CountVectorizer(stop_words="english")

    # Tokenize with the vectorizer's own analyzer so the keyword lists use
    # the same terms as the score (stop words and one-character tokens are
    # excluded from both).
    analyzer = cv.build_analyzer()
    resume_terms = set(analyzer(resume_text))
    job_terms = set(analyzer(job_text))

    if resume_terms and job_terms:
        vectors = cv.fit_transform([resume_text, job_text])
        similarity = cosine_similarity(vectors[0], vectors[1])[0][0]
        score = float(round(similarity * 100, 2))
    else:
        # With no usable terms on one side the similarity is zero; with none
        # on either side fit_transform would raise "empty vocabulary".
        score = 0.0

    # Sorted so the output is stable from run to run.
    missing = sorted(job_terms - resume_terms)

    return score, missing


# -------------------------------------------------
# Main program
# -------------------------------------------------
if __name__ == "__main__":
    # Interactive entry point: ask for both file paths, extract their text,
    # then report the similarity score and the keywords the resume lacks.
    resume_path = input("Enter path to resume (pdf/image/txt): ")
    job_path = input("Enter path to job description (pdf/txt): ")

    print("\nExtracting resume text...")
    resume_text = extract_resume(resume_path)

    print("Extracting job description text...")
    job_text = extract_job_description(job_path)

    print("\nCalculating ATS score...")
    result = calculate_ats_score(resume_text, job_text)
    score, missing_keywords = result

    # Score banner.
    print("\n-----------------------------------------")
    print(f"‚úÖ ATS Score: {score}%")
    print("-----------------------------------------")

    # Keyword report: a comma-separated list, or a success message when the
    # list is empty.
    print("\n‚ùó Missing Keywords:")
    report = (
        ", ".join(missing_keywords)
        if missing_keywords
        else "No missing keywords! Great match üéâ"
    )
    print(report)

    print("\nDone.")
