Installing dependencies

In [1]:
%pip install pdfplumber

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.1.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
%pip install PyMuPDF

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.1.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [4]:
# %pip install PyMuPDF
import re
import pdfplumber
import fitz  # PyMuPDF
from collections import Counter
 

Extracting the text from the pdf

In [5]:
def extract_text_from_pdf(pdf_path):
    """Extract the text of every page in a PDF as a single string.

    Args:
        pdf_path: Path to the PDF file; passed straight to ``pdfplumber.open``.

    Returns:
        str: The text of all pages joined with newlines. A page for which
        ``extract_text()`` returns nothing contributes an empty string.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # Join with a newline so the last word of one page cannot fuse with
        # the first word of the next (e.g. "...python" + "sql..." -> "pythonsql"),
        # which would corrupt every downstream keyword and section check.
        return "\n".join(page.extract_text() or "" for page in pdf.pages)


Cleaning The Text

In [6]:
import re

def clean_text(text):
    """Normalise text for matching: collapse whitespace and lowercase.

    Every run of whitespace (spaces, tabs, newlines) becomes a single space,
    leading and trailing whitespace is dropped, and the result is lowercased.
    Digits and punctuation are left untouched.
    """
    words = text.split()
    return " ".join(words).lower()

Feature Extraction

In [7]:
def extract_features(cleaned_text):
    """Detect which standard resume elements appear in the text.

    Args:
        cleaned_text: Lowercased, whitespace-normalised resume text.

    Returns:
        dict[str, bool]: One flag per element, in this order: ``email``,
        ``phone``, ``has_experience``, ``has_education``, ``has_skills``,
        ``has_projects``, ``has_certifications``,
        ``has_extracurricular_activities``, ``has_summary``, ``has_metrics``.
        Section flags are plain substring checks on the section name.
    """
    email_pattern = r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"
    phone_pattern = r"\+?\d[\d -]{8,}\d"
    # Percentages ("20%"), dollar amounts ("$500") and counts such as "1,000+".
    # The count must start with a digit so a bare ",+" is not taken as a metric.
    metrics_pattern = r"\d+%|\$\d+|\d[\d,]*\+"
    # The phone pattern alone also accepts year ranges like "2019 - 2021"
    # (8 digits), so a candidate must carry at least this many digits.
    min_phone_digits = 9

    has_phone = any(
        sum(ch.isdigit() for ch in candidate) >= min_phone_digits
        for candidate in re.findall(phone_pattern, cleaned_text)
    )

    features = {
        "email": re.search(email_pattern, cleaned_text) is not None,
        "phone": has_phone,
        "has_experience": "experience" in cleaned_text,
        "has_education": "education" in cleaned_text,
        "has_skills": "skills" in cleaned_text,
        "has_projects": "projects" in cleaned_text,
        "has_certifications": "certifications" in cleaned_text,
        "has_extracurricular_activities": "extracurricular activities" in cleaned_text,
        "has_summary": "summary" in cleaned_text,
        "has_metrics": re.search(metrics_pattern, cleaned_text) is not None,
    }

    return features
      

# The has_metrics check returns True if the text contains things such as:
# Percentages → "20%", "75%"
# Dollar amounts → "$500", "$12000"
# Numbers followed by a + sign → "100+", "1,000+"

Computing Content Score

In [8]:
def calculate_content_score(features):
    """Convert detected resume features into a weighted content score.

    Args:
        features: Mapping of feature name to a truthy/falsy flag.

    Returns:
        int: Sum of the weights of every feature whose flag is truthy. The
        weights total 100, so a resume with every feature scores 100.

    Raises:
        KeyError: If a truthy feature has no entry in the weight table.
    """
    points_by_feature = dict(
        email=5,
        phone=5,
        has_experience=20,
        has_education=15,
        has_skills=15,
        has_projects=10,
        has_certifications=5,
        has_extracurricular_activities=5,
        has_summary=10,
        has_metrics=10,
    )

    total = 0
    for name, present in features.items():
        if present:
            total += points_by_feature[name]
    return total

Format Checking

In [9]:
def check_format(pdf_path, resume_text, cleaned_text):
    """Score how ATS-friendly a resume's layout is and list the problems found.

    The score starts at 100 and loses points per problem: more than 3 fonts
    (-10), more than 3 font sizes (-5), embedded images (-15), tables (-10),
    a suspected multi-column layout (-10), no bullet points (-10), each
    missing required section (-10), and a word count below 300 or above
    2000 (-10).

    Args:
        pdf_path: Path of the PDF, opened with PyMuPDF (``fitz``).
        resume_text: Raw extracted text with line breaks intact; used for
            bullet detection.
        cleaned_text: Lowercased, whitespace-collapsed text; used for the
            section and length checks.

    Returns:
        tuple[list[str], int]: The issue messages in detection order and the
        score, clamped so it never drops below 0.
    """
    issues = []
    score = 100

    fonts = set()
    font_sizes = set()
    has_images = False
    has_tables = False
    has_columns = False

    # Open via the context manager so the document handle is always released.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            for block in page.get_text("dict")["blocks"]:
                # Image blocks carry no "lines" entry.
                for line in block.get("lines", ()):
                    for span in line["spans"]:
                        fonts.add(span["font"])
                        font_sizes.add(span["size"])

            if not has_images and page.get_images():
                has_images = True

            if not has_tables:
                # find_tables() returns a TableFinder, which is truthy even
                # when it found nothing; the detected tables are in .tables.
                has_tables = bool(page.find_tables().tables)

            if not has_columns:
                text_blocks = page.get_text("blocks")
                if text_blocks:
                    left_edges = [b[0] for b in text_blocks]  # left x-coordinates
                    # Heuristic: blocks starting more than 300 units apart
                    # horizontally are taken as a second column.
                    has_columns = max(left_edges) - min(left_edges) > 300

    if len(fonts) > 3:
        issues.append("Inconsistent fonts - use max 1-2")
        score -= 10

    if len(font_sizes) > 3:
        issues.append("Inconsistent font sizes - use max 2-3 sizes")
        score -= 5

    if has_images:
        issues.append("Contains images (ATS may not read content)")
        score -= 15

    if has_tables:
        issues.append("Contains tables (ATS may skip structured data)")
        score -= 10

    if has_columns:
        issues.append("Multi-column layout detected (ATS may misread order)")
        score -= 10

    # Bullet Points: a bullet glyph (U+2022, U+00B7) anywhere, or a "-" / "*"
    # that opens a line. Hyphens inside words, dates or phone numbers do not
    # count, otherwise nearly every resume would pass this check.
    bullet_pattern = r"[\u2022\u00b7]|^[ \t]*[-*][ \t]"
    if not re.search(bullet_pattern, resume_text, re.MULTILINE):
        issues.append("No bullet points used")
        score -= 10

    # Missing Sections
    required_sections = ["experience", "education", "skills", "projects"]
    missing_sections = [section for section in required_sections if section not in cleaned_text]
    if missing_sections:
        issues.append(f"Missing sections: {', '.join(missing_sections)}")
        score -= 10 * len(missing_sections)

    # Length Check
    word_count = len(cleaned_text.split())
    if word_count < 300:
        issues.append("Resume too short")
        score -= 10
    elif word_count > 2000:
        issues.append("Resume too long")
        score -= 10

    return issues, max(0, score)

Keyword Match

In [10]:
def match_job_description(cleaned_text, job_description):
    """Score how well a resume matches a job description.

    Half of the score is keyword coverage (the share of known technical
    keywords from the job description that also occur in the resume as whole
    words); the other half is the TF-IDF cosine similarity of the two texts.

    Args:
        cleaned_text: Lowercased, whitespace-normalised resume text.
        job_description: Raw job description; may be empty or ``None``.

    Returns:
        tuple[float, list[str]]: The score (0-100, rounded to 2 decimals) and
        the matched keywords in the order they first appear in the job
        description. ``(0, [])`` when no job description is given, so callers
        can always unpack two values.
    """
    if not job_description or not job_description.strip():
        return 0, []

    cleaned_job_description = re.sub(r"\s+", " ", job_description).strip().lower()

    found_keywords = re.findall(
        r'\b(?:python|sql|power bi|azure|aws|machine learning|data analysis|excel|r|git|'
        r'spark|hadoop|tableau|tensorflow|pytorch|nlp|deep learning|statistics)\b',
        cleaned_job_description
    )
    # De-duplicate while keeping first-appearance order, so the result is
    # stable between runs (set iteration order is not).
    tech_keywords = list(dict.fromkeys(found_keywords))

    # Whole-word matching: a plain substring test would find "r" in almost
    # any text and "git" inside "digital".
    matched = [
        keyword for keyword in tech_keywords
        if re.search(rf"\b{re.escape(keyword)}\b", cleaned_text)
    ]

    # Tf-idf similarity
    try:
        vector_matrix = TfidfVectorizer().fit_transform([cleaned_text, cleaned_job_description])
        similarity = float(cosine_similarity(vector_matrix[0:1], vector_matrix[1:2])[0][0])
    except ValueError:
        # Raised when the vocabulary is empty (e.g. no usable tokens).
        similarity = 0.0

    keyword_coverage = (len(matched) / max(len(tech_keywords), 1)) * 50
    semantic_similarity = similarity * 50

    keyword_score = round(min(100, keyword_coverage + semantic_similarity), 2)

    return keyword_score, matched


Final Score

In [11]:
def calculate_final_score(content_score, format_score, keyword_score):
    """Blend the three partial scores into the overall resume score.

    Content counts for 40%, format for 30% and keyword match for 30%.

    Returns:
        The weighted total rounded to two decimal places.
    """
    content_part = 0.4 * content_score
    format_part = 0.3 * format_score
    keyword_part = 0.3 * keyword_score
    weighted_total = content_part + format_part + keyword_part
    return round(weighted_total, 2)

Final Report

In [12]:
def generate_report(total_score, content_score, format_score, keyword_score, features, format_issues, matched_keywords):
    """Render the analysis results as a plain-text report.

    Args:
        total_score: Overall score out of 100.
        content_score: Content score out of 100.
        format_score: Format score out of 100.
        keyword_score: Keyword-match score out of 100.
        features: Mapping of feature name to a flag; falsy entries are listed
            as missing elements.
        format_issues: Formatting issue messages, one bullet line each.
        matched_keywords: Matched job keywords; at most the first 15 are shown.

    Returns:
        str: The report, lines joined with newlines. Scores are shown rounded
        to two decimal places.
    """
    # Symbols are written as escapes so the report survives files saved or
    # re-read with the wrong encoding.
    bullet = "\u2022"
    rule = "=" * 60

    report = [
        rule,
        "RESUME ATS ANALYSIS REPORT",
        rule,
        f"\n\U0001F4CA OVERALL SCORE: {round(total_score, 2)}/100",
        f"\n\U0001F4CB Content Score: {round(content_score, 2)}/100",
        f"\U0001F3A8 Format Score: {round(format_score, 2)}/100",
        f"\U0001F511 Keyword Match: {round(keyword_score, 2)}/100",
    ]

    # Missing features: "has_extracurricular_activities" -> "Extracurricular Activities"
    missing = [
        k.replace("has_", "").replace("_", " ").title()
        for k, v in features.items() if not v
    ]
    if missing:
        report.append(f"\n\u274C Missing Elements: {', '.join(missing)}")

    if format_issues:
        report.append("\n\u26A0\uFE0F Format Issues:")
        for issue in format_issues:
            report.append(f"  {bullet} {issue}")

    if matched_keywords:
        report.append(f"\n\u2705 Matched Keywords: {', '.join(matched_keywords[:15])}")

    report.append("\n\U0001F4A1 Recommendations:")
    if total_score < 60:
        report.append(f"  {bullet} Improve resume quality significantly.")
    if not features.get("has_metrics"):
        report.append(f"  {bullet} Add metrics to achievements (e.g., 'Improved accuracy by 20%').")
    if not features.get("has_summary"):
        report.append(f"  {bullet} Add a professional summary.")
    if format_issues:
        report.append(f"  {bullet} Fix formatting issues.")
    if keyword_score < 50:
        report.append(f"  {bullet} Add more job-relevant keywords.")

    report.append(rule)
    return "\n".join(report)

Usage

In [None]:
# Interactive driver: ask for a resume PDF and a job description, run every
# analysis stage defined above in order, and print the resulting report.
if __name__ == "__main__":
    pdf_path = input("Enter the full path of your resume PDF: ").strip()
    print("\nPaste the Job Description below (finish input and press Enter):")
    # input() reads a single line, so the job description has to be pasted
    # as one line; anything after the first line break is not read here.
    job_description = input().strip()

    # Raw text keeps line breaks; the cleaned copy is lowercased with
    # whitespace collapsed and feeds the feature/section/keyword checks.
    resume_text = extract_text_from_pdf(pdf_path)
    cleaned_text = clean_text(resume_text)

    # Content: which resume elements are present, turned into a weighted score.
    features = extract_features(cleaned_text)
    content_score = calculate_content_score(features)

    # Format: layout checks on the PDF itself plus text-based checks.
    format_issues, format_score = check_format(pdf_path, resume_text, cleaned_text)

    # Keyword match: unpacks two values, so match_job_description must hand
    # back a (score, matched keywords) pair.
    keyword_score, match = match_job_description(cleaned_text, job_description)

    # Weighted blend of the three partial scores.
    total_score = calculate_final_score(content_score, format_score, keyword_score)

    report = generate_report(total_score, content_score, format_score, keyword_score,
                             features, format_issues, match)

    print(report)