In [1]:
import pdfplumber
import re
import pandas as pd

def load_pdf(file_path):
    """Loads the PDF file and extracts text."""
    with open(file_path, 'rb') as pdf_file:
        with pdfplumber.open(pdf_file) as pdf:
            text = ''
            for page in pdf.pages:
                text += page.extract_text()  # Concatenate text from all pages
        return text

def parse_resume_text(text):
    """Parses the extracted text to extract key information."""
    # Initialize a dictionary to store parsed data
    parsed_data = {
        "contact": {
            "email": None,
            "phone": None
        },
        "education": [],
        "experience": [],
        "skills": [],
        "projects": [],
        "certifications": []
    }

    # Extract contact information (email and phone)
    email_match = re.search(r"([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})", text)
    if email_match:
        parsed_data["contact"]["email"] = email_match.group(0)

    phone_match = re.search(r"(\+?\d{10,15})", text)
    if phone_match:
        parsed_data["contact"]["phone"] = phone_match.group(0)

    # Extract education details (typically starts with the keyword "Education")
    education_pattern = re.compile(r"(Education|Academic Qualifications|Educational Background)(.*?)(?=(Experience|Skills|Projects|Certifications|$))", re.DOTALL | re.IGNORECASE)
    education_matches = education_pattern.findall(text)
    for match in education_matches:
        parsed_data["education"].append(match[1].strip())

    # Extract experience details (typically starts with the keyword "Experience")
    experience_pattern = re.compile(r"(Experience|Work Experience)(.*?)(?=(Skills|Education|Projects|Certifications|$))", re.DOTALL | re.IGNORECASE)
    experience_matches = experience_pattern.findall(text)
    for match in experience_matches:
        parsed_data["experience"].append(match[1].strip())

    # Extract skills section (usually starts with "Skills")
    skills_pattern = re.compile(r"(Skills|Technical Skills)(.*?)(?=(Experience|Education|Projects|Certifications|$))", re.DOTALL | re.IGNORECASE)
    skills_matches = skills_pattern.findall(text)
    for match in skills_matches:
        parsed_data["skills"].append(match[1].strip())

    # Extract projects section (usually starts with "Projects")
    projects_pattern = re.compile(r"(Projects|Personal Projects|Project Experience)(.*?)(?=(Experience|Education|Skills|Certifications|$))", re.DOTALL | re.IGNORECASE)
    projects_matches = projects_pattern.findall(text)
    for match in projects_matches:
        parsed_data["projects"].append(match[1].strip())

    # Extract certifications section (usually starts with "Certifications")
    certifications_pattern = re.compile(r"(Certifications|Certifications and Training)(.*?)(?=(Experience|Education|Skills|Projects|$))", re.DOTALL | re.IGNORECASE)
    certifications_matches = certifications_pattern.findall(text)
    for match in certifications_matches:
        parsed_data["certifications"].append(match[1].strip())

    # Return the parsed data
    return parsed_data

def create_dataframe(parsed_data):
    """Converts parsed data into a DataFrame."""
    # Create a DataFrame from the parsed data
    df = pd.DataFrame({
        "Contact Email": [parsed_data["contact"]["email"]],
        "Contact Phone": [parsed_data["contact"]["phone"]],
        "Education": [", ".join(parsed_data["education"])],
        "Experience": [", ".join(parsed_data["experience"])],
        "Skills": [", ".join(parsed_data["skills"])],
        "Projects": [", ".join(parsed_data["projects"])],
        "Certifications": [", ".join(parsed_data["certifications"])]
    })
    return df

def save_to_csv(df, file_name="resume_data.csv"):
    """Saves the DataFrame to a CSV file."""
    df.to_csv(file_name, index=False)
    print(f"Data saved to {file_name}")

# Example usage:
file_path = "resume.pdf"  # Replace with the actual file path
text = load_pdf(file_path)
parsed_data = parse_resume_text(text)
df = create_dataframe(parsed_data)

# Save to CSV
save_to_csv(df, "resume_data.csv")  # You can specify a different name if needed


Data saved to resume_data.csv


In [2]:
import re
import pandas as pd

def load_pdf(file_path):
    """Loads the PDF file and extracts text."""
    with open(file_path, 'rb') as pdf_file:
        with pdfplumber.open(pdf_file) as pdf:
            text = ''
            for page in pdf.pages:
                text += page.extract_text()  # Concatenate text from all pages
        return text

def extract_keywords(text):
    """Extracts keywords (important terms) from a given text."""
    # Remove non-alphabetic characters and split text into words
    words = re.findall(r'\b\w+\b', text.lower())
    return set(words)

def match_keywords(resume_keywords, job_description_keywords):
    """Matches keywords from the resume with the job description."""
    common_keywords = resume_keywords.intersection(job_description_keywords)
    return common_keywords

# Example usage:
resume_file_path = "My Resume.pdf"
job_description = """
    We are looking for a backend developer with strong experience in Java, Python, react and MySQL.
    Familiarity with AWS is a plus. Excellent problem-solving skills and experience in developing scalable applications are required.
"""

# Load and parse the resume
resume_text = load_pdf(resume_file_path)

# Extract keywords from both the resume and job description
resume_keywords = extract_keywords(resume_text)
job_description_keywords = extract_keywords(job_description)

# Find the common keywords
common_keywords = match_keywords(resume_keywords, job_description_keywords)

print(f"Common Keywords: {common_keywords}")


Common Keywords: {'scalable', 'a', 'in', 'backend', 'skills', 'aws', 'java', 'with', 'experience', 'python', 'for', 'developer', 'and', 'react', 'applications'}


In [3]:
import pdfplumber
import re
import pandas as pd

def load_pdf(file_path):
    """Loads the PDF file and extracts text."""
    with open(file_path, 'rb') as pdf_file:
        with pdfplumber.open(pdf_file) as pdf:
            text = ''
            for page in pdf.pages:
                text += page.extract_text()  # Concatenate text from all pages
        return text

def parse_resume_text(text):
    """Parses the extracted text to extract key information."""
    parsed_data = {
        "contact": {
            "email": None,
            "phone": None
        },
        "education": [],
        "experience": [],
        "skills": [],
        "projects": [],
        "certifications": []
    }

    # Extract contact information (email and phone)
    email_match = re.search(r"([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})", text)
    if email_match:
        parsed_data["contact"]["email"] = email_match.group(0)

    phone_match = re.search(r"(\+?\d{10,15})", text)
    if phone_match:
        parsed_data["contact"]["phone"] = phone_match.group(0)

    # Extract education details
    education_pattern = re.compile(r"(Education|Academic Qualifications|Educational Background)(.*?)(?=(Experience|Skills|Projects|Certifications|$))", re.DOTALL | re.IGNORECASE)
    education_matches = education_pattern.findall(text)
    for match in education_matches:
        parsed_data["education"].append(match[1].strip())

    # Extract experience details
    experience_pattern = re.compile(r"(Experience|Work Experience)(.*?)(?=(Skills|Education|Projects|Certifications|$))", re.DOTALL | re.IGNORECASE)
    experience_matches = experience_pattern.findall(text)
    for match in experience_matches:
        parsed_data["experience"].append(match[1].strip())

    # Extract skills section
    skills_pattern = re.compile(r"(Skills|Technical Skills)(.*?)(?=(Experience|Education|Projects|Certifications|$))", re.DOTALL | re.IGNORECASE)
    skills_matches = skills_pattern.findall(text)
    for match in skills_matches:
        parsed_data["skills"].append(match[1].strip())

    # Extract projects section
    projects_pattern = re.compile(r"(Projects|Personal Projects|Project Experience)(.*?)(?=(Experience|Education|Skills|Certifications|$))", re.DOTALL | re.IGNORECASE)
    projects_matches = projects_pattern.findall(text)
    for match in projects_matches:
        parsed_data["projects"].append(match[1].strip())

    # Extract certifications section
    certifications_pattern = re.compile(r"(Certifications|Certifications and Training)(.*?)(?=(Experience|Education|Skills|Projects|$))", re.DOTALL | re.IGNORECASE)
    certifications_matches = certifications_pattern.findall(text)
    for match in certifications_matches:
        parsed_data["certifications"].append(match[1].strip())

    return parsed_data

def create_dataframe(parsed_data):
    """Converts parsed data into a DataFrame."""
    df = pd.DataFrame({
        "Contact Email": [parsed_data["contact"]["email"]],
        "Contact Phone": [parsed_data["contact"]["phone"]],
        "Education": [", ".join(parsed_data["education"])],
        "Experience": [", ".join(parsed_data["experience"])],
        "Skills": [", ".join(parsed_data["skills"])],
        "Projects": [", ".join(parsed_data["projects"])],
        "Certifications": [", ".join(parsed_data["certifications"])]
    })
    return df

def save_to_csv(df, file_name="resume_data.csv"):
    """Saves the DataFrame to a CSV file."""
    df.to_csv(file_name, index=False)
    print(f"Data saved to {file_name}")
def evaluate_candidate_skills(candidate_skills, required_skills):
    """Evaluates how well a candidate's skills match the required skills."""
    # Join the list of candidate skills into a single string
    candidate_skills_str = ", ".join(candidate_skills).lower()
    
    # Convert the required skills to a lower case string and split by commas
    required_skill_set = set(required_skills.lower().split(", "))
    
    # Match the candidate's skills with the required skills
    candidate_skill_set = set(candidate_skills_str.split(", "))
    
    matched_skills = candidate_skill_set.intersection(required_skill_set)
    match_percentage = (len(matched_skills) / len(required_skill_set)) * 100
    return matched_skills, match_percentage


# Example usage:
file_path = "science-cs-egr-resumes-7.pdf"  # Replace with the actual file path
required_skills = "Java, Python, MySQL, React, Django, JavaScript, HTML, CSS"  # Example required skills for the position

# Load and parse the resume
text = load_pdf(file_path)
parsed_data = parse_resume_text(text)
df = create_dataframe(parsed_data)

# Evaluate the candidate's skills against the required skills
matched_skills, match_percentage = evaluate_candidate_skills(parsed_data["skills"], required_skills)

# Print the match details
print(f"Matched Skills: {matched_skills}")
print(f"Skill Match Percentage: {match_percentage:.2f}%")

# Save candidate data to CSV
save_to_csv(df, "resumeee_data.csv")


Matched Skills: set()
Skill Match Percentage: 0.00%
Data saved to resumeee_data.csv


In [4]:
import pdfplumber
import re
import csv

def load_pdf(file_path):
    """Extracts text from the PDF."""
    with open(file_path, 'rb') as file:
        with pdfplumber.open(file) as pdf:
            return ''.join(page.extract_text() for page in pdf.pages)

def extract_skills(text):
    """Extracts skills from the text based on the 'Skills' section."""
    # Match the Skills section and extract its content
    skills_match = re.search(r"(?i)(TECHNICAL SUMMARY|SKILLS)\s*(.*?)(?=(COURSEWORK|EXPERIENCE|EDUCATION|PROJECTS|$))", text, re.DOTALL | re.IGNORECASE)
    if skills_match:
        # Clean and split the skills
        skills_text = skills_match.group(1)
        skills = re.split(r"[,;/\n]+", skills_text)
        return clean_skills([skill.strip() for skill in skills if skill.strip()])  # Clean and normalize skills
    return []

def clean_skills(raw_skills):
    """Clean and normalize extracted skills."""
    cleaned = []
    for skill in raw_skills:
        # Remove descriptors like "(Proficient)" or "(Familiar)" and extra text
        cleaned_skill = re.sub(r"\(.*?\)", "", skill).strip()
        # Split compound entries like "Java, Python" into separate skills
        cleaned.extend(re.split(r"[:,;/]+", cleaned_skill))
        """
        if '/' in cleaned_skill:
            cleaned.extend(cleaned_skill.split('/'))
        else:
            cleaned.append(cleaned_skill)  """
    # Return a clean, deduplicated list with all skills in lowercase
    return list(set(skill.strip().lower() for skill in cleaned if skill.strip()))


def extract_info(text):
    """Extracts key information (email, phone, etc.) and invokes skill extraction."""
    data = {
        "email": re.search(r"([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})", text),
        "phone": re.search(r"(\+?\d{1,4}[-.\s]?)?(\(?\d{3}\)?[-.\s]?)?\d{3}[-.\s]?\d{4}"
, text),
        "skills": extract_skills(text),
    }
    return {key: (value.group(0) if value else "Not available") if key != "skills" else value for key, value in data.items()}

def save_to_csv(data, file_name="resume_dataa.csv"):
    """Saves extracted data to CSV."""
    with open(file_name, mode='w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(data.keys())  # Write headers
        writer.writerow(data.values())  # Write values
    print(f"Data saved to {file_name}")

def evaluate_skills(candidate_skills, required_skills):
    """Matches candidate's skills with required skills."""
    if not candidate_skills or not required_skills:
        return set(), 0  # No skills to evaluate
    candidate_set = set(map(str.lower, candidate_skills))
    required_set = set(map(str.lower, required_skills.split(", ")))
    matched_skills = candidate_set.intersection(required_set)
    match_percentage = (len(matched_skills) / len(required_set)) * 100 if required_set else 0
    return matched_skills, match_percentage
# Example usage:
file_path = "science-cs-egr-resumes-8.pdf"  # Replace with the actual file path
required_skills = "Python, SQL, React, Django, Java, JavaScript, HTML, CSS"  # Example required skills

# Load and parse the resume
text = load_pdf(file_path)
data = extract_info(text)

# Evaluate candidate skills
matched_skills, match_percentage = evaluate_skills(data["skills"], required_skills)

# Debugging extracted text
#print("Extracted Text:")
#print(text)

# Debugging extracted skills
#print("Extracted Skills:")
#print(data["skills"])

# Output the matched skills and percentage
print(f"Matched Skills: {matched_skills}")
print(f"Skill Match Percentage: {match_percentage:.2f}%")

# Save extracted data to CSV
save_to_csv(data) 

Matched Skills: set()
Skill Match Percentage: 0.00%
Data saved to resume_dataa.csv


In [5]:
import pdfplumber
import re
import csv
import os

def load_pdf(file_path):
    """Extracts text from the PDF."""
    with open(file_path, 'rb') as file:
        with pdfplumber.open(file) as pdf:
            return ''.join(page.extract_text() for page in pdf.pages)

def extract_skills(text):
    """Extracts skills from the text based on the 'Skills' section."""
    # Match the Skills section and extract its content
    skills_match = re.search(r"SKILLS\s*(.*?)(?=(COURSEWORK|EXPERIENCE|EDUCATION|PROJECTS|$))", text, re.DOTALL | re.IGNORECASE)
    if skills_match:
        # Clean and split the skills
        skills_text = skills_match.group(1)
        skills = re.split(r"[\u2022\n,]+", skills_text)  # Split by bullet points, newlines, or commas
        return clean_skills([skill.strip() for skill in skills if skill.strip()])  # Clean and normalize skills
    return []

def clean_skills(raw_skills):
    """Clean and normalize extracted skills."""
    cleaned = []
    for skill in raw_skills:
        # Remove descriptors like "(Proficient)" or "(Familiar)" and extra text
        cleaned_skill = re.sub(r"\(.*?\)", "", skill).strip()
        # Split compound entries like "Java, Python" into separate skills
        cleaned.extend(re.split(r"[:,;/]+", cleaned_skill))
    # Return a clean, deduplicated list
    return list(set(skill.strip().lower() for skill in cleaned if skill.strip()))

def extract_info(text):
    """Extracts key information (email, phone, etc.) and invokes skill extraction."""
    data = {
        "email": re.search(r"([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})", text),
        "phone": re.search(r"(\+?\d{1,4}[-.\s]?)?(\(?\d{3}\)?[-.\s]?)?\d{3}[-.\s]?\d{4}", text),
        "skills": extract_skills(text),
    }
    return {key: (value.group(0) if value else "Not available") if key != "skills" else value for key, value in data.items()}

def save_to_csv(data, file_name="resume_data1.csv"):
    """Saves extracted data to CSV."""
    with open(file_name, mode='a', newline='') as file:
        writer = csv.writer(file)
        if file.tell() == 0:  # Check if file is empty to write headers
            writer.writerow(data.keys())  # Write headers
        writer.writerow(data.values())  # Write values
    print(f"Data saved to {file_name}")

def evaluate_skills(candidate_skills, required_skills):
    """Matches candidate's skills with required skills."""
    candidate_set = set(map(str.lower, candidate_skills))
    required_set = set(map(str.lower, required_skills.split(", ")))
    matched_skills = candidate_set.intersection(required_set)
    match_percentage = (len(matched_skills) / len(required_set)) * 100 if required_set else 0
    return matched_skills, match_percentage

def process_pdf_files_in_directory(directory_path, required_skills):
    """Processes all PDF files in the given directory."""
    for filename in os.listdir(directory_path):
        if filename.lower().endswith('.pdf'):
            file_path = os.path.join(directory_path, filename)
            print(f"Processing {filename}...")
            
            # Load and parse the resume
            text = load_pdf(file_path)
            data = extract_info(text)

            # Evaluate candidate skills
            matched_skills, match_percentage = evaluate_skills(data["skills"], required_skills)

            # Debugging extracted text
            #print("Extracted Text:")
            #print(text)

            # Debugging extracted skills
            #print("Extracted Skills:")
            #print(data["skills"])

            # Output the matched skills and percentage
            print(f"Matched Skills: {matched_skills}")
            print(f"Skill Match Percentage: {match_percentage:.2f}%")

            # Save extracted data to CSV
            save_to_csv(data)

# Example usage:
directory_path = "resumes"  # Replace with the directory containing your PDF files
required_skills = "Python, SQL, React, Django, Java, JavaScript, HTML, CSS"  # Example required skills

# Process all PDF files in the directory
process_pdf_files_in_directory(directory_path, required_skills)


Processing computer-science-resume-example.pdf...
Matched Skills: {'css', 'html', 'javascript'}
Skill Match Percentage: 37.50%
Data saved to resume_data1.csv
Processing John Doe.pdf...
Matched Skills: set()
Skill Match Percentage: 0.00%
Data saved to resume_data1.csv
Processing My Resume.pdf...
Matched Skills: {'python', 'java', 'javascript'}
Skill Match Percentage: 37.50%
Data saved to resume_data1.csv
Processing Resume-Sample-2.pdf...
Matched Skills: {'java', 'html'}
Skill Match Percentage: 25.00%
Data saved to resume_data1.csv
Processing resume1.pdf...
Matched Skills: set()
Skill Match Percentage: 0.00%
Data saved to resume_data1.csv
Processing resume12.pdf...
Matched Skills: {'java', 'django', 'python'}
Skill Match Percentage: 37.50%
Data saved to resume_data1.csv
Processing resume2.pdf...
Matched Skills: set()
Skill Match Percentage: 0.00%
Data saved to resume_data1.csv
Processing resume3.pdf...
Matched Skills: set()
Skill Match Percentage: 0.00%
Data saved to resume_data1.csv
Pro

In [6]:
import pdfplumber
import re
import csv
import os

def load_pdf(file_path):
    """Extracts text from the PDF."""
    with open(file_path, 'rb') as file:
        with pdfplumber.open(file) as pdf:
            return ''.join(page.extract_text() for page in pdf.pages)

def extract_skills(text):
    """Extracts skills from the text based on the 'Skills' section."""
    skills_match = re.search(r"SKILLS\s*(.*?)(?=(COURSEWORK|EXPERIENCE|EDUCATION|PROJECTS|$))", text, re.DOTALL | re.IGNORECASE)
    if skills_match:
        skills_text = skills_match.group(1)
        skills = re.split(r"[\u2022\n,]+", skills_text)
        return clean_skills([skill.strip() for skill in skills if skill.strip()])
    return []

def clean_skills(raw_skills):
    """Clean and normalize extracted skills."""
    cleaned = []
    for skill in raw_skills:
        cleaned_skill = re.sub(r"\(.*?\)", "", skill).strip()
        cleaned.extend(re.split(r"[:,;/]+", cleaned_skill))
    return list(set(skill.strip().lower() for skill in cleaned if skill.strip()))

def extract_info(text):
    """Extracts key information (email, phone, etc.) and invokes skill extraction."""
    data = {
        "email": re.search(r"([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})", text),
        "phone": re.search(r"(\+?\d{1,4}[-.\s]?)?(\(?\d{3}\)?[-.\s]?)?\d{3}[-.\s]?\d{4}", text),
        "skills": extract_skills(text),
    }
    return {key: (value.group(0) if value else "Not available") if key != "skills" else value for key, value in data.items()}

def save_to_csv(data, file_name="resume_data1.csv"):
    """Saves extracted data to CSV."""
    with open(file_name, mode='a', newline='') as file:
        writer = csv.writer(file)
        if file.tell() == 0:
            writer.writerow(data.keys())  # Write headers
        writer.writerow(data.values())  # Write values
    print(f"Data saved to {file_name}")

def evaluate_skills(candidate_skills, required_skills):
    """Matches candidate's skills with required skills."""
    candidate_set = set(map(str.lower, candidate_skills))
    required_set = set(map(str.lower, required_skills.split(", ")))
    matched_skills = candidate_set.intersection(required_set)
    match_percentage = (len(matched_skills) / len(required_set)) * 100 if required_set else 0
    return matched_skills, match_percentage

def process_pdf_files_in_directory(directory_path, required_skills):
    """Processes all PDF files in the given directory."""
    candidates = []
    for filename in os.listdir(directory_path):
        if filename.lower().endswith('.pdf'):
            file_path = os.path.join(directory_path, filename)
            print(f"Processing {filename}...")
            
            # Load and parse the resume
            text = load_pdf(file_path)
            data = extract_info(text)

            # Evaluate candidate skills
            matched_skills, match_percentage = evaluate_skills(data["skills"], required_skills)

            # Debugging extracted text
           # print("Extracted Text:")
            #print(text)

            # Debugging extracted skills
           # print("Extracted Skills:")
           # print(data["skills"])

            # Output the matched skills and percentage
            print(f"Matched Skills: {matched_skills}")
            print(f"Skill Match Percentage: {match_percentage:.2f}%")

            # Store the candidate data with match percentage for ranking
            candidate_data = data.copy()
            candidate_data["match_percentage"] = match_percentage
            candidates.append(candidate_data)

            # Save extracted data to CSV
            save_to_csv(data)

    # Rank candidates based on match percentage
    ranked_candidates = sorted(candidates, key=lambda x: x["match_percentage"], reverse=True)

    print("\nRanking Candidates:")
    for i, candidate in enumerate(ranked_candidates, start=1):
        print(f"{i}. {candidate.get('email', 'No Email')} - {candidate['match_percentage']:.2f}% Match")

    return ranked_candidates

# Example usage:
directory_path = "resumes"  # Replace with the directory containing your PDF files
required_skills = "Python, SQL, React, Django, Java, JavaScript, HTML, CSS"  # Example required skills

# Process all PDF files in the directory
ranked_candidates = process_pdf_files_in_directory(directory_path, required_skills)


Processing computer-science-resume-example.pdf...
Matched Skills: {'css', 'html', 'javascript'}
Skill Match Percentage: 37.50%
Data saved to resume_data1.csv
Processing John Doe.pdf...
Matched Skills: set()
Skill Match Percentage: 0.00%
Data saved to resume_data1.csv
Processing My Resume.pdf...
Matched Skills: {'python', 'java', 'javascript'}
Skill Match Percentage: 37.50%
Data saved to resume_data1.csv
Processing Resume-Sample-2.pdf...
Matched Skills: {'java', 'html'}
Skill Match Percentage: 25.00%
Data saved to resume_data1.csv
Processing resume1.pdf...
Matched Skills: set()
Skill Match Percentage: 0.00%
Data saved to resume_data1.csv
Processing resume12.pdf...
Matched Skills: {'java', 'django', 'python'}
Skill Match Percentage: 37.50%
Data saved to resume_data1.csv
Processing resume2.pdf...
Matched Skills: set()
Skill Match Percentage: 0.00%
Data saved to resume_data1.csv
Processing resume3.pdf...
Matched Skills: set()
Skill Match Percentage: 0.00%
Data saved to resume_data1.csv
Pro

In [7]:
import pdfplumber
import re
import csv
import os

def load_pdf(file_path):
    """Extracts text from the PDF."""
    with open(file_path, 'rb') as file:
        with pdfplumber.open(file) as pdf:
            return ''.join(page.extract_text() for page in pdf.pages)

def extract_skills(text):
    """Extracts skills from the text based on the 'Skills' section."""
    skills_match = re.search(r"SKILLS\s*(.*?)(?=(COURSEWORK|EXPERIENCE|EDUCATION|PROJECTS|$))", text, re.DOTALL | re.IGNORECASE)
    if skills_match:
        skills_text = skills_match.group(1)
        skills = re.split(r"[\u2022\n,]+", skills_text)
        return clean_skills([skill.strip() for skill in skills if skill.strip()])
    return []

def clean_skills(raw_skills):
    """Clean and normalize extracted skills."""
    cleaned = []
    for skill in raw_skills:
        cleaned_skill = re.sub(r"\(.*?\)", "", skill).strip()
        cleaned.extend(re.split(r"[:,;/]+", cleaned_skill))
    return list(set(skill.strip().lower() for skill in cleaned if skill.strip()))

def extract_info(text):
    """Extracts key information (name, email, phone, etc.) and invokes skill extraction."""
    name = re.search(r"(?:name|full\s*name|first\s*name|last\s*name)\s*[:\-]?\s*(.*?)(?=\n|email|$)", text, re.IGNORECASE)
    email = re.search(r"([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})", text)
    phone = re.search(r"(\+?\d{1,4}[-.\s]?)?(\(?\d{3}\)?[-.\s]?)?\d{3}[-.\s]?\d{4}", text)
    
    data = {
        "name": name.group(1).strip() if name else "Not available",
        "email": email.group(0) if email else "Not available",
        "phone": phone.group(0) if phone else "Not available",
        "skills": extract_skills(text),
    }
    return data

def save_to_csv(data, file_name="ranked_candidates.csv"):
    """Saves ranked candidate data to CSV."""
    with open(file_name, mode='w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow([ "Phone", "Email", "Matched Skills", "Match Percentage"])  # Header
        for candidate in data:
            writer.writerow([candidate["phone"], candidate["email"],
                             ', '.join(candidate["skills"]), f"{candidate['match_percentage']:.2f}%"])
    print(f"Ranked data saved to {file_name}")

def evaluate_skills(candidate_skills, required_skills):
    """Matches candidate's skills with required skills."""
    candidate_set = set(map(str.lower, candidate_skills))
    required_set = set(map(str.lower, required_skills.split(", ")))
    matched_skills = candidate_set.intersection(required_set)
    match_percentage = (len(matched_skills) / len(required_set)) * 100 if required_set else 0
    return matched_skills, match_percentage

def process_pdf_files_in_directory(directory_path, required_skills):
    """Processes all PDF files in the given directory."""
    candidates = []
    for filename in os.listdir(directory_path):
        if filename.lower().endswith('.pdf'):
            file_path = os.path.join(directory_path, filename)
            print(f"Processing {filename}...")
            
            # Load and parse the resume
            text = load_pdf(file_path)
            data = extract_info(text)

             # Debugging extracted text
            #print("Extracted Text:")
            #print(text)

            # Evaluate candidate skills
            matched_skills, match_percentage = evaluate_skills(data["skills"], required_skills)

            # Store the candidate data with match percentage for ranking
            candidate_data = data.copy()
            candidate_data["match_percentage"] = match_percentage
            candidate_data["skills"] = list(matched_skills)
            candidates.append(candidate_data)

    # Rank candidates based on match percentage
    ranked_candidates = sorted(candidates, key=lambda x: x["match_percentage"], reverse=True)

    # Print the ranking in the desired format
    print("\nRanking Candidates:")
    for i, candidate in enumerate(ranked_candidates, start=1):
        match_percentage = f"{candidate['match_percentage']:.2f}%"
        email = candidate['email'] if candidate['email'] != "Not available" else "Not available"
        print(f"{i}. {email} - {match_percentage} Match")

    # Save the ranked candidates to CSV
    save_to_csv(ranked_candidates)

    return ranked_candidates

# Example usage:
directory_path = "resumes"  # Replace with the directory containing your PDF files
required_skills = "Python, SQL, React, Django, Java, JavaScript, HTML, CSS"  # Example required skills

# Process all PDF files in the directory
ranked_candidates = process_pdf_files_in_directory(directory_path, required_skills)


Processing computer-science-resume-example.pdf...
Processing John Doe.pdf...
Processing My Resume.pdf...
Processing Resume-Sample-2.pdf...
Processing resume1.pdf...
Processing resume12.pdf...
Processing resume2.pdf...
Processing resume3.pdf...
Processing sample-resumes_scs-3.pdf...
Processing sample-resumes_scs-5.pdf...
Processing sample-resumes_scs-6.pdf...
Processing science-cs-egr-resumes-5.pdf...
Processing science-cs-egr-resumes-8.pdf...

Ranking Candidates:
1. mackcrol@gmail.com - 62.50% Match
2. mtrix@andrew.cmu.edu - 50.00% Match
3. msmith@smith.edu - 50.00% Match
4. bellatrevino@email.com - 37.50% Match
5. ravigupta.2140@gmail.com - 37.50% Match
6. ajaybkedare@gmail.com - 37.50% Match
7. uxsi@gmail.com - 37.50% Match
8. cindylou@nova.edu - 25.00% Match
9. c1phan@smith.edu - 12.50% Match
10. john.doe@example.com - 0.00% Match
11. emily.williams@example.com - 0.00% Match
12. jane.smith@example.com - 0.00% Match
13. alex.johnson@example.com - 0.00% Match
Ranked data saved to rank

In [8]:
import pdfplumber
import re
import csv
import os

def load_pdf(file_path):
    """Extracts text from the PDF."""
    with open(file_path, 'rb') as file:
        with pdfplumber.open(file) as pdf:
            return ''.join(page.extract_text() for page in pdf.pages)

def extract_skills(text):
    """Extracts skills from the text based on the 'Skills' section."""
    skills_match = re.search(r"SKILLS\s*(.*?)(?=(COURSEWORK|EXPERIENCE|EDUCATION|PROJECTS|$))", text, re.DOTALL | re.IGNORECASE)
    if skills_match:
        skills_text = skills_match.group(1)
        skills = re.split(r"[\u2022\n,]+", skills_text)
        return clean_skills([skill.strip() for skill in skills if skill.strip()])
    return []

def clean_skills(raw_skills):
    """Clean and normalize extracted skills."""
    cleaned = []
    for skill in raw_skills:
        cleaned_skill = re.sub(r"\(.*?\)", "", skill).strip()
        cleaned.extend(re.split(r"[:,;/]+", cleaned_skill))
    return list(set(skill.strip().lower() for skill in cleaned if skill.strip()))

def extract_info(text):
    """Extracts key information (email, phone, etc.) and invokes skill extraction."""
    data = {
        "email": re.search(r"([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})", text),
        "phone": re.search(r"(\+?\d{1,4}[-.\s]?)?(\(?\d{3}\)?[-.\s]?)?\d{3}[-.\s]?\d{4}", text),
        "skills": extract_skills(text),
    }
    return {key: (value.group(0) if value else "Not available") if key != "skills" else value for key, value in data.items()}

def save_to_csv(data, file_name="resume_data1.csv"):
    """Saves extracted data to CSV."""
    with open(file_name, mode='a', newline='') as file:
        writer = csv.writer(file)
        if file.tell() == 0:
            writer.writerow(data.keys())  # Write headers
        writer.writerow(data.values())  # Write values
    print(f"Data saved to {file_name}")

def evaluate_skills(candidate_skills, required_skills):
    """Matches candidate's skills with required skills."""
    candidate_set = set(map(str.lower, candidate_skills))
    required_set = set(map(str.lower, required_skills.split(", ")))
    matched_skills = candidate_set.intersection(required_set)
    match_percentage = (len(matched_skills) / len(required_set)) * 100 if required_set else 0
    return matched_skills, match_percentage

def process_single_pdf(file_path, required_skills):
    """Processes a single PDF resume file."""
    print(f"Processing {file_path}...")
    
    # Load and parse the resume
    text = load_pdf(file_path)
    data = extract_info(text)

    # Evaluate candidate skills
    matched_skills, match_percentage = evaluate_skills(data["skills"], required_skills)

    # Output the matched skills and percentage
    print(f"Matched Skills: {matched_skills}")
    print(f"Skill Match Percentage: {match_percentage:.2f}%")

    # Store the candidate data with match percentage for ranking
    candidate_data = data.copy()
    candidate_data["match_percentage"] = match_percentage

    # Save extracted data to CSV
    save_to_csv(data)

    return candidate_data, match_percentage

# Example usage for one resume:
file_path = "resume12.pdf"  # Replace with the path to the resume PDF
required_skills = "Python, SQL, React, Django, Java, JavaScript, HTML, CSS"  # Example required skills

# Process a single PDF resume
candidate_data, match_percentage = process_single_pdf(file_path, required_skills)

# Output the result for the candidate
print(f"Candidate Data: {candidate_data}")
print(f"Match Percentage: {match_percentage:.2f}%")


Processing resume12.pdf...
Matched Skills: {'java', 'django', 'python'}
Skill Match Percentage: 37.50%
Data saved to resume_data1.csv
Candidate Data: {'email': 'ajaybkedare@gmail.com', 'phone': '9082168876', 'skills': ['hadoop mapreduce framework', 'shell', 'ojet', 'spring mvc', 'hibernate', 'jsf', 'sql server management studio', 'postgresql', 'django', 'mysql', 'c++', 'git', 'c', 'sed', 'ibatis', 'python', 'awk', 'programming & scripting languages', 'eclipse', 'tools & other technologies', 'pycharm', 'java', 'golang', 'jsp', 'puppet', 'angularjs', 'web technologies'], 'match_percentage': 37.5}
Match Percentage: 37.50%


In [9]:
import pdfplumber
import re
import csv
import os

def load_pdf(file_path):
    """Extracts text from the PDF."""
    with open(file_path, 'rb') as file:
        with pdfplumber.open(file) as pdf:
            return ''.join(page.extract_text() for page in pdf.pages)

def extract_skills(text):
    """Extracts skills from the text based on the 'Skills' section."""
    skills_match = re.search(r"SKILLS\s*(.*?)(?=(COURSEWORK|EXPERIENCE|EDUCATION|PROJECTS|$))", text, re.DOTALL | re.IGNORECASE)
    if skills_match:
        skills_text = skills_match.group(1)
        skills = re.split(r"[\u2022\n,]+", skills_text)
        return clean_skills([skill.strip() for skill in skills if skill.strip()])
    return []

def clean_skills(raw_skills):
    """Clean and normalize extracted skills."""
    cleaned = []
    for skill in raw_skills:
        cleaned_skill = re.sub(r"\(.*?\)", "", skill).strip()
        cleaned.extend(re.split(r"[:,;/]+", cleaned_skill))
    return list(set(skill.strip().lower() for skill in cleaned if skill.strip()))

def extract_info(text):
    """Extracts key information (email, phone, etc.) and invokes skill extraction."""
    data = {
        "email": re.search(r"([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})", text),
        "phone": re.search(r"(\+?\d{1,4}[-.\s]?)?(\(?\d{3}\)?[-.\s]?)?\d{3}[-.\s]?\d{4}", text),
        "skills": extract_skills(text),
    }
    return {key: (value.group(0) if value else "Not available") if key != "skills" else value for key, value in data.items()}

def save_to_csv(data, file_name="resume_data1.csv"):
    """Saves extracted data to CSV."""
    with open(file_name, mode='a', newline='') as file:
        writer = csv.writer(file)
        if file.tell() == 0:
            writer.writerow(data.keys())  # Write headers
        writer.writerow(data.values())  # Write values
    print(f"Data saved to {file_name}")

def evaluate_skills(candidate_skills, required_skills):
    """Matches candidate's skills with required skills."""
    candidate_set = set(map(str.lower, candidate_skills))
    required_set = set(map(str.lower, required_skills.split(", ")))
    matched_skills = candidate_set.intersection(required_set)
    match_percentage = (len(matched_skills) / len(required_set)) * 100 if required_set else 0
    return matched_skills, match_percentage

def process_single_pdf(file_path, required_skills):
    """Processes a single PDF resume file."""
    print(f"Processing {file_path}...")
    
    # Load and parse the resume
    text = load_pdf(file_path)
    data = extract_info(text)

    # Evaluate candidate skills
    matched_skills, match_percentage = evaluate_skills(data["skills"], required_skills)

    # Output the matched skills and percentage
    print(f"Matched Skills: {matched_skills}")
    print(f"Skill Match Percentage: {match_percentage:.2f}%")

    # Store the candidate data with match percentage for ranking
    candidate_data = data.copy()
    candidate_data["match_percentage"] = match_percentage

    # Save extracted data to CSV
    save_to_csv(data)

    return candidate_data, match_percentage

def process_multiple_pdfs(pdf_folder, required_skills):
    """Processes multiple PDF resumes in a folder."""
    all_candidates_data = []

    for file_name in os.listdir(pdf_folder):
        if file_name.endswith('.pdf'):
            file_path = os.path.join(pdf_folder, file_name)
            candidate_data, match_percentage = process_single_pdf(file_path, required_skills)
            all_candidates_data.append(candidate_data)

    return all_candidates_data

# Example usage:
pdf_folder = "resumes"  # Replace with the folder containing the PDF resumes
required_skills = "Python, SQL, React, Django, Java, JavaScript, HTML, CSS"  # Example required skills

# Process all PDF resumes in the folder
all_candidates_data = process_multiple_pdfs(pdf_folder, required_skills)

# Output the results for all candidates
for candidate_data in all_candidates_data:
    print(f"Candidate Data: {candidate_data}")
    print(f"Match Percentage: {candidate_data['match_percentage']:.2f}%")


Processing resumes\computer-science-resume-example.pdf...
Matched Skills: {'css', 'html', 'javascript'}
Skill Match Percentage: 37.50%
Data saved to resume_data1.csv
Processing resumes\John Doe.pdf...
Matched Skills: set()
Skill Match Percentage: 0.00%
Data saved to resume_data1.csv
Processing resumes\My Resume.pdf...
Matched Skills: {'python', 'java', 'javascript'}
Skill Match Percentage: 37.50%
Data saved to resume_data1.csv
Processing resumes\Resume-Sample-2.pdf...
Matched Skills: {'java', 'html'}
Skill Match Percentage: 25.00%
Data saved to resume_data1.csv
Processing resumes\resume1.pdf...
Matched Skills: set()
Skill Match Percentage: 0.00%
Data saved to resume_data1.csv
Processing resumes\resume12.pdf...
Matched Skills: {'java', 'django', 'python'}
Skill Match Percentage: 37.50%
Data saved to resume_data1.csv
Processing resumes\resume2.pdf...
Matched Skills: set()
Skill Match Percentage: 0.00%
Data saved to resume_data1.csv
Processing resumes\resume3.pdf...
Matched Skills: set()


In [10]:
import pdfplumber
import re
import csv
import os

def load_pdf(file_path):
    """Extracts text from the PDF."""
    with open(file_path, 'rb') as file:
        with pdfplumber.open(file) as pdf:
            return ''.join(page.extract_text() for page in pdf.pages)

def extract_skills(text):
    """Extracts skills from the text based on the 'Skills' section."""
    skills_match = re.search(r"SKILLS\s*(.*?)(?=(COURSEWORK|EXPERIENCE|EDUCATION|PROJECTS|$))", text, re.DOTALL | re.IGNORECASE)
    if skills_match:
        skills_text = skills_match.group(1)
        skills = re.split(r"[\u2022\n,]+", skills_text)
        return clean_skills([skill.strip() for skill in skills if skill.strip()])
    return []


def clean_skills(raw_skills):
    """Clean and normalize extracted skills."""
    #print(f"Cleaning skills: {raw_skills}")
    cleaned = []
    for skill in raw_skills:
        cleaned_skill = re.sub(r"^\s*-?\s*", "", skill)
        cleaned_skill = re.sub(r"\(.*?\)", "", cleaned_skill).strip()# text enclosed in parenthsis
        cleaned.extend(re.split(r"[:,;/]+", cleaned_skill))
    cleaned_skills = list(dict.fromkeys(skill.strip().lower() for skill in cleaned if skill.strip()))#original order maintained
    #print(f"Cleaned skills: {cleaned_skills}")
    return cleaned_skills

def extract_info(text):
    """Extracts key information (email, phone, etc.) and invokes skill extraction."""
    data = {
        "email": re.search(r"([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})", text),
        "phone": re.search(r"(\+?\d{1,4}[-.\s]?)?(\(?\d{3}\)?[-.\s]?)?\d{3}[-.\s]?\d{4}", text),
        "skills": extract_skills(text),
    }
    return {key: (value.group(0) if value else "Not available") if key != "skills" else value for key, value in data.items()}

def save_to_csv(data, file_name="resume_data1.csv"):
    """Saves extracted data to CSV."""
    with open(file_name, mode='a', newline='') as file:
        writer = csv.writer(file)
        if file.tell() == 0:
            writer.writerow(data.keys())  # Write headers
        writer.writerow(data.values())  # Write values
    print(f"Data saved to {file_name}")

def evaluate_skills(candidate_skills, required_skills):
    """Matches candidate's skills with required skills."""
    candidate_set = set(map(str.lower, candidate_skills))
    required_set = set(map(str.lower, required_skills.split(", ")))
    matched_skills = candidate_set.intersection(required_set)
    match_percentage = (len(matched_skills) / len(required_set)) * 100 if required_set else 0
    return matched_skills, match_percentage

def process_single_pdf(file_path, required_skills):
    """Processes a single PDF resume file."""
    print(f"Processing {file_path}...")
    
    # Load and parse the resume
    text = load_pdf(file_path)
    data = extract_info(text)

    # Evaluate candidate skills
    matched_skills, match_percentage = evaluate_skills(data["skills"], required_skills)

    # Output the matched skills and percentage
    print(f"Matched Skills: {matched_skills}")
    print(f"Skill Match Percentage: {match_percentage:.2f}%")

    # Store the candidate data with match percentage for ranking
    candidate_data = data.copy()
    candidate_data["match_percentage"] = match_percentage
    # Add matched_skills to the candidate data
    candidate_data["matched_skills"] = ", ".join(matched_skills)
    # Save extracted data to CSV
    save_to_csv(data)

    return candidate_data, match_percentage

def process_multiple_pdfs(pdf_folder, required_skills):
    """Processes multiple PDF resumes in a folder."""
    all_candidates_data = []

    for file_name in os.listdir(pdf_folder):
        if file_name.endswith('.pdf'):
            file_path = os.path.join(pdf_folder, file_name)
            candidate_data, match_percentage = process_single_pdf(file_path, required_skills)
            all_candidates_data.append(candidate_data)

    return all_candidates_data

def save_sorted_candidates(all_candidates_data, output_file="sorted_resume_data.csv"):
    """Sort candidates by match percentage and save to CSV."""
    sorted_candidates = sorted(all_candidates_data, key=lambda x: x['match_percentage'], reverse=True)

    # Save the sorted candidates to a new CSV file
    with open(output_file, mode='w', newline='',encoding='utf-8') as file:
        writer = csv.writer(file)
        if len(sorted_candidates) > 0:
            headers=list(sorted_candidates[0].keys())
            writer.writerow(headers)# Write headers
            for candidate in sorted_candidates:
                # Add matched skills as a new field in candidate data
                matched_skills = ", ".join(candidate['skills'])  # Convert list of matched skills to a comma-separated string
                writer.writerow(list(candidate.values()))  # Write candidate data

    print(f"Sorted data saved to {output_file}")

# Example usage:
pdf_folder = "resumes"  # Replace with the folder containing the PDF resumes
required_skills = "Python, SQL, React, Django, Java, JavaScript, HTML, CSS"  # Example required skills

# Process all PDF resumes in the folder
all_candidates_data = process_multiple_pdfs(pdf_folder, required_skills)
for candidate_data in all_candidates_data:
    print(f"Candidate Data: {candidate_data}")
    print(f"Match Percentage: {candidate_data['match_percentage']:.2f}%")


# Save the sorted candidates to a new CSV file
save_sorted_candidates(all_candidates_data)


Processing resumes\computer-science-resume-example.pdf...
Matched Skills: {'css', 'html', 'javascript'}
Skill Match Percentage: 37.50%
Data saved to resume_data1.csv
Processing resumes\John Doe.pdf...
Matched Skills: {'sql', 'django', 'python', 'javascript'}
Skill Match Percentage: 50.00%
Data saved to resume_data1.csv
Processing resumes\My Resume.pdf...
Matched Skills: {'python', 'java', 'javascript'}
Skill Match Percentage: 37.50%
Data saved to resume_data1.csv
Processing resumes\Resume-Sample-2.pdf...
Matched Skills: {'java', 'html'}
Skill Match Percentage: 25.00%
Data saved to resume_data1.csv
Processing resumes\resume1.pdf...
Matched Skills: {'python'}
Skill Match Percentage: 12.50%
Data saved to resume_data1.csv
Processing resumes\resume12.pdf...
Matched Skills: {'java', 'django', 'python'}
Skill Match Percentage: 37.50%
Data saved to resume_data1.csv
Processing resumes\resume2.pdf...
Matched Skills: {'css', 'html', 'javascript'}
Skill Match Percentage: 37.50%
Data saved to resum

In [11]:
import re
import csv

def load_pdf(file_path):
    """Extracts text from the PDF."""
    print(f"Loading PDF: {file_path}")
    with open(file_path, 'rb') as file:
        with pdfplumber.open(file) as pdf:
            text = ''.join(page.extract_text() for page in pdf.pages)
    print(f"Extracted text from PDF: {file_path}\n{text}")
    return text

def extract_skills(text):
    """Extracts skills from the text based on the 'Skills' section."""
    print("Extracting skills...")
    skills_match = re.search(r"SKILLS\s*([\s\S]*?)(?=(COURSEWORK|EXPERIENCE|EDUCATION|PROJECTS|$))", text, re.DOTALL | re.IGNORECASE)
    if skills_match:
        skills_text = skills_match.group(1)
        skills = re.split(r"[\u2022\n,]+", skills_text)#bullet operators unicode
        print(f"Raw skills extracted: {skills}")
        return clean_skills([skill.strip() for skill in skills if skill.strip()])
    print("No skills section found.")
    return []

def clean_skills(raw_skills):
    """Clean and normalize extracted skills."""
    print(f"Cleaning skills: {raw_skills}")
    cleaned = []
    for skill in raw_skills:
        cleaned_skill = re.sub(r"^\s*-?\s*", "", skill)
        cleaned_skill = re.sub(r"\(.*?\)", "", cleaned_skill).strip()# text enclosed in parenthsis
        
        words = re.split(r"[,\s:/;]+", cleaned_skill)
        cleaned.extend(word.strip() for word in words if word.strip())
    # Remove duplicates and normalize case
    cleaned_skills = list(dict.fromkeys(word.lower() for word in cleaned))
    print(f"Cleaned skills: {cleaned_skills}")
    return cleaned_skills

def extract_info(text):
    """Extracts key information (email, phone, etc.) and invokes skill extraction."""
    print("Extracting key information (email, phone, etc.)...")
    data = {
        "email": re.search(r"([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})", text),
        "phone": re.search(r"(\+?\d{1,4}[-.\s]?)?(\(?\d{3}\)?[-.\s]?)?\d{3}[-.\s]?\d{4}", text),
        "skills": extract_skills(text),
    }
    result_data = {}

    for key, value in data.items():
        if key != "skills":
            result_data[key] = value.group(0) if value else "Not available"
        else:
            result_data[key] = value
    print(f"Extracted information: {result_data}")
    return result_data

def save_to_csv(data, file_name="resume_data1.csv"):
    """Saves extracted data to CSV."""
    print(f"Saving data to {file_name}...")
    with open(file_name, mode='a', newline='') as file:
        writer = csv.writer(file)
        if file.tell() == 0:  # Check if the file is empty to write headers
            writer.writerow(data.keys())  # Write headers("name", "email", "phone")
        writer.writerow(data.values())  # Write values
    print(f"Data saved to {file_name}")

def evaluate_skills(candidate_skills, required_skills):
    """Matches candidate's skills with required skills."""
    print(f"Evaluating skills... Required skills: {required_skills}")
    candidate_set = set(map(str.lower, candidate_skills))
    required_set = set(map(str.lower, required_skills.split(", ")))
    matched_skills = candidate_set.intersection(required_set)
    match_percentage = (len(matched_skills) / len(required_set)) * 100 if required_set else 0
    print(f"Matched skills: {matched_skills}")
    print(f"Skill match percentage: {match_percentage:.2f}%")
    return matched_skills, match_percentage

def process_single_pdf(file_path, required_skills):
    """Processes a single PDF resume file."""
    print(f"Processing {file_path}...")
    
    # Load and parse the resume
    text = load_pdf(file_path)
    data = extract_info(text)

    # Evaluate candidate skills
    matched_skills, match_percentage = evaluate_skills(data["skills"], required_skills)

    # Output the matched skills and percentage
    print(f"Matched Skills: {matched_skills}")
    print(f"Skill Match Percentage: {match_percentage:.2f}%")

    # Store the candidate data with match percentage for ranking
    candidate_data = data.copy()
    candidate_data["match_percentage"] = match_percentage

    # Save extracted data to CSV
    save_to_csv(data)

    return candidate_data, match_percentage

# Example usage for one resume:
#file_path = "John Doe.pdf"# Replace with the path to the resume PDF
#file_path = "computer-science-resume-example.pdf"
file_path = "My Resume.pdf"
required_skills = "Python, SQL, React, Django, Java, JavaScript, HTML, CSS"  # Example required skills

# Process a single PDF resume
candidate_data, match_percentage = process_single_pdf(file_path, required_skills)

# Output the result for the candidate
print(f"Candidate Data: {candidate_data}")
print(f"Match Percentage: {match_percentage:.2f}%")

Processing My Resume.pdf...
Loading PDF: My Resume.pdf
Extracted text from PDF: My Resume.pdf
Ravi Gupta
AI/ML Enthusiast | Computer Science Engineer
Innovative and solution-driven computer science student passionate about artificial intelligence and
machine learning. Adept at building and deploying AI-driven applications to solve real-world problems.
ravigupta.2140@gmail.com 98342451271
EDUCATION SKILLS
B.Tech in Computer Science Engineering
Programming Languages: Python, Java, C++, JavaScript
Indian Institute of Technology, Delhi (IIT Delhi)
Frameworks & Libraries: TensorFlow, PyTorch, Scikit-learn,
08/2020 - 08/2024, CGPA: 9.2/10
Flask
Courses
Relevant Courses: Machine
Cloud Platforms: AWS, Google Cloud, Azure
Learning, Artificial
Intelligence, Data
Tools & Technologies: Git, Docker, Kubernetes, Jupyter,
Structures, Deep Learning,
REST APIs
Cloud Computing
Other Skills: Data Analysis, Model Optimization,
Debugging
WORK EXPERIENCE
Machine Learning Intern
PERSONAL PROJECTS
Workplace/C

In [12]:
import pdfplumber
import re
import csv
import os

def load_pdf(file_path):
    """Extracts text from the PDF."""
    print(f"Loading PDF: {file_path}")
    with open(file_path, 'rb') as file:
        with pdfplumber.open(file) as pdf:
            text = ''.join(page.extract_text() for page in pdf.pages)
    print(f"Extracted text from PDF: {file_path}")
    return text

def extract_skills(text):
    """Extracts skills from the text based on the 'Skills' section."""
    print("Extracting skills...")
    skills_match = re.search(r"SKILLS\s*(.*?)(?=(COURSEWORK|EXPERIENCE|EDUCATION|PROJECTS|$))", text, re.DOTALL | re.IGNORECASE)
    if skills_match:
        skills_text = skills_match.group(1)
        skills = re.split(r"[\u2022\n,]+", skills_text)  # Bullet operators unicode
        print(f"Raw skills extracted: {skills}")
        return clean_skills([skill.strip() for skill in skills if skill.strip()])
    print("No skills section found.")
    return []

def clean_skills(raw_skills):
    """Clean and normalize extracted skills."""
    print(f"Cleaning skills: {raw_skills}")
    cleaned = []
    for skill in raw_skills:
        cleaned_skill = re.sub(r"^\s*-?\s*", "", skill)
        cleaned_skill = re.sub(r"\(.*?\)", "", cleaned_skill).strip()# text enclosed in parenthsis
        
        words = re.split(r"[,\s:/;]+", cleaned_skill)
        cleaned.extend(word.strip() for word in words if word.strip())
    # Remove duplicates and normalize case
    cleaned_skills = list(dict.fromkeys(word.lower() for word in cleaned))
    print(f"Cleaned skills: {cleaned_skills}")
    return cleaned_skills
def extract_info(text):
    """Extracts key information (email, phone, etc.) and invokes skill extraction."""
    print("Extracting key information (email, phone, etc.)...")
    data = {
        "email": re.search(r"([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})", text),
        "phone": re.search(r"(\+?\d{1,4}[-.\s]?)?(\(?\d{3}\)?[-.\s]?)?\d{3}[-.\s]?\d{4}", text),
        "skills": extract_skills(text),
    }
    result_data = {}

    for key, value in data.items():
        if key != "skills":
            result_data[key] = value.group(0) if value else "Not available"
        else:
            result_data[key] = value
    print(f"Extracted information: {result_data}")
    return result_data




def save_to_csv(data, file_name):
    """Saves extracted data to CSV."""
    print(f"Saving data to {file_name}...")
    with open(file_name, mode='a', newline='', encoding='utf-8' ,errors='ignore') as file:
        writer = csv.writer(file)
        if file.tell() == 0:  # Check if the file is empty to write headers
            writer.writerow(data.keys())  # Write headers("name", "email", "phone")
        writer.writerow(data.values())  # Write values
    print(f"Data saved to {file_name}")

def evaluate_skills(candidate_skills, required_skills):
    """Matches candidate's skills with required skills."""
    print(f"Evaluating skills... Required skills: {required_skills}")
    candidate_set = set(map(str.lower, candidate_skills))
    required_set = set(map(str.lower, required_skills.split(", ")))
    matched_skills = candidate_set.intersection(required_set)
    match_percentage = (len(matched_skills) / len(required_set)) * 100 if required_set else 0
    print(f"Matched skills: {matched_skills}")
    print(f"Skill match percentage: {match_percentage:.2f}%")
    return matched_skills, match_percentage

def process_single_pdf(file_path, required_skills):
    """Processes a single PDF resume file."""
    print(f"Processing {file_path}...")
    
    # Load and parse the resume
    text = load_pdf(file_path)
    data = extract_info(text)

    # Evaluate candidate skills
    matched_skills, match_percentage = evaluate_skills(data["skills"], required_skills)

    # Output the matched skills and percentage
    print(f"Matched Skills: {matched_skills}")
    print(f"Skill Match Percentage: {match_percentage:.2f}%")

    # Store the candidate data with match percentage for ranking
    candidate_data = data.copy()
    candidate_data["matched_skills"] = ", ".join(matched_skills)
    candidate_data["match_percentage"] = match_percentage

    return candidate_data, match_percentage

def process_multiple_resumes(folder_path, required_skills, output_csv="ranked_resumes.csv"):
    """Process all resumes in a folder, rank them by match percentage, and save to CSV."""
    print(f"Processing resumes in folder: {folder_path}")
    all_candidates = []
    
    for filename in os.listdir(folder_path):
        if filename.endswith(".pdf"):
            file_path = os.path.join(folder_path, filename)
            candidate_data, match_percentage = process_single_pdf(file_path, required_skills)
            candidate_data["filename"] = filename
            all_candidates.append(candidate_data)
    
    # Sort candidates by match percentage
    ranked_candidates = sorted(all_candidates, key=lambda x: x["match_percentage"], reverse=True)

    # Save the ranked candidates to the new CSV
    print(f"Saving ranked candidates to {output_csv}...")
    with open(output_csv, mode='w', newline='',encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(["email", "phone", "match_percentage", "matched_skills", "skills","filename"])  # CSV header
        for candidate in ranked_candidates:
            writer.writerow([
                candidate["email"],
                candidate["phone"],
                candidate["match_percentage"],
                candidate["matched_skills"],
                ", ".join(candidate["skills"]), 
                candidate["filename"]
            ])
    print(f"Ranked resumes saved to {output_csv}")

# Example usage:
folder_path = "resumes"  # Replace with the path to the folder containing the resumes
required_skills = "Python, SQL, React, Django, Java, JavaScript, HTML, CSS"  # Example required skills

# Process multiple resumes and save the ranked list to CSV
process_multiple_resumes(folder_path, required_skills)


Processing resumes in folder: resumes
Processing resumes\computer-science-resume-example.pdf...
Loading PDF: resumes\computer-science-resume-example.pdf
Extracted text from PDF: resumes\computer-science-resume-example.pdf
Extracting key information (email, phone, etc.)...
Extracting skills...
Raw skills extracted: ['Social Media Scheduler', 'JavaScript (Angular)', 'Creator', 'HTML/ CSS', '·', 'Python (Django) Built responsive app using Django and Node that allowed users', 'to schedule social media posts across Instagram and Twitter', 'SQL (PostgreSQL', ' Oracle)', '·', 'Built features using scikit-learn in Python that learned the time', 'REST APIs', 'of day of maximum engagement with social media posts', 'Git', 'which increased overall engagement rate by 23% for users', '·', 'Released it for free for University of Illinois Chicago students', 'and it quickly grew to over 500 monthly active users', '·', 'Featured across 7 local newspapers', ' radio stations', ' and news', 'networks with 

In [13]:
import re
import csv

def load_pdf(file_path):
    """Extracts text from the PDF."""
    print(f"Loading PDF: {file_path}")
    with open(file_path, 'rb') as file:
        with pdfplumber.open(file) as pdf:
            text = ''.join(page.extract_text() for page in pdf.pages)
    print(f"Extracted text from PDF: {file_path}\n{text}")
    return text

def extract_skills(text):
    """Extracts skills from the text based on the 'Skills' section."""
    print("Extracting skills...")
    skills_match = re.search(r"SKILLS\s*([\s\S]*?)(?=(COURSEWORK|EXPERIENCE|EDUCATION|PROJECTS|$))", text, re.DOTALL | re.IGNORECASE)
    if skills_match:
        skills_text = skills_match.group(1)
        skills = re.split(r"[\u2022\n,]+", skills_text)#bullet operators unicode
        print(f"Raw skills extracted: {skills}")
        return clean_skills([skill.strip() for skill in skills if skill.strip()])
    print("No skills section found.")
    return []

def clean_skills(raw_skills):
    """Clean and normalize extracted skills."""
    print(f"Cleaning skills: {raw_skills}")
    cleaned = []
    for skill in raw_skills:
        cleaned_skill = re.sub(r"^\s*-?\s*", "", skill)
        cleaned_skill = re.sub(r"\(.*?\)", "", cleaned_skill).strip()# text enclosed in parenthsis
        
        words = re.split(r"[,\s:/;]+", cleaned_skill)
        cleaned.extend(word.strip() for word in words if word.strip())
    # Remove duplicates and normalize case
    cleaned_skills = list(dict.fromkeys(word.lower() for word in cleaned))
    print(f"Cleaned skills: {cleaned_skills}")
    return cleaned_skills

def extract_info(text):
    """Extracts key information (email, phone, etc.) and invokes skill extraction."""
    print("Extracting key information (email, phone, etc.)...")
    data = {
        "email": re.search(r"([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})", text),
        "phone": re.search(r"(\+?\d{1,4}[-.\s]?)?(\(?\d{3}\)?[-.\s]?)?\d{3}[-.\s]?\d{4}", text),
        "skills": extract_skills(text),
    }
    result_data = {}

    for key, value in data.items():
        if key != "skills":
            result_data[key] = value.group(0) if value else "Not available"
        else:
            result_data[key] = value
    print(f"Extracted information: {result_data}")
    return result_data

def save_to_csv(data, file_name="resume_data1.csv"):
    """Saves extracted data to CSV."""
    print(f"Saving data to {file_name}...")
    with open(file_name, mode='a', newline='') as file:
        writer = csv.writer(file)
        if file.tell() == 0:  # Check if the file is empty to write headers
            writer.writerow(data.keys())  # Write headers("name", "email", "phone")
        writer.writerow(data.values())  # Write values
    print(f"Data saved to {file_name}")

def evaluate_skills(candidate_skills, required_skills, good_to_have_skills):
    """Matches candidate's skills with required and good-to-have skills."""
    print(f"Evaluating skills... Required skills: {required_skills}")
    
    candidate_set = set(map(str.lower, candidate_skills))
    required_set = set(map(str.lower, required_skills.split(", ")))
    good_to_have_set = set(map(str.lower, good_to_have_skills.split(", ")))

    matched_required_skills = candidate_set.intersection(required_set)
    matched_good_to_have_skills = candidate_set.intersection(good_to_have_set)

    # Calculate match percentages
    match_percentage = (len(matched_required_skills) / len(required_set)) * 100 if required_set else 0
    bonus_percentage = (len(matched_good_to_have_skills) / len(good_to_have_set)) * 30 if good_to_have_set else 0
    
    total_match_percentage = match_percentage + bonus_percentage
    
    print(f"Matched required skills: {matched_required_skills}")
    print(f"Matched good-to-have skills: {matched_good_to_have_skills}")
    print(f"Skill match percentage: {match_percentage:.2f}%")
    print(f"Bonus percentage for good-to-have skills: {bonus_percentage:.2f}%")
    print(f"Total match percentage: {total_match_percentage:.2f}%")
    
    return matched_required_skills, matched_good_to_have_skills, total_match_percentage


def process_single_pdf(file_path, required_skills, good_to_have_skills):
    """Processes a single PDF resume file."""
    print(f"Processing {file_path}...")
    
    # Load and parse the resume
    text = load_pdf(file_path)
    data = extract_info(text)

    # Evaluate candidate skills
    matched_required_skills, matched_good_to_have_skills, total_match_percentage = evaluate_skills(data["skills"], required_skills, good_to_have_skills)

    # Output the matched skills and percentage
    print(f"Matched Required Skills: {matched_required_skills}")
    print(f"Matched Good-to-Have Skills: {matched_good_to_have_skills}")
    print(f"Total Match Percentage: {total_match_percentage:.2f}%")

    # Store the candidate data with match percentage for ranking
    candidate_data = data.copy()
    candidate_data["total_match_percentage"] = total_match_percentage

    # Save extracted data to CSV
    save_to_csv(data)

    return candidate_data, total_match_percentage

# Example usage for one resume:
file_path = "sample-resumes_scs-6.pdf"
required_skills = "Python, SQL, React, Django, Java, JavaScript, HTML, CSS"  # Example required skills
good_to_have_skills = "Node.js, Docker, Perl, AWS, TypeScript,Git"  # Example good-to-have skills

# Process a single PDF resume
candidate_data, total_match_percentage = process_single_pdf(file_path, required_skills, good_to_have_skills)

# Output the result for the candidate
print(f"Candidate Data: {candidate_data}")
print(f"Total Match Percentage: {total_match_percentage:.2f}%")

Processing sample-resumes_scs-6.pdf...
Loading PDF: sample-resumes_scs-6.pdf
Extracted text from PDF: sample-resumes_scs-6.pdf
MACK CROLANGUAGE
844-555-2626 | mackcrol@gmail.com
EDUCATION
Carnegie Mellon University, Pittsburgh, PA
Master of Science, Computer Science, December 2015
Selected Coursework: Introduction to Machine Learning (10-601, Fall 2014), Distributed Systems (15-440/640, Fall 2014),
Algorithm Design and Analysis (15-451/651, Fall 2014), Web Apps Development (15-637, Spring 2015), Machine Learning
with Large Datasets (10-605, Spring 2015), Graduate Artificial Intelligence (15-780, Spring 2015)
Birla Institute of Technology and Science, Pilani, India
Bachelor of Engineering (Hons.), Computer Science (Minor: M.Sc. Economics), July 2014
SKILLS
Programming/Scripting Languages: (Proficient) Java; (Familiar) Python, C, SQL, Javascript, MATLAB, Perl
Frameworks and tools: Hadoop, Django, DKPro for NLP, Maven, Git
EXPERIENCE
Software Engineering Intern
Yahoo! Inc., Sunnyvale, CA,

In [14]:
import pdfplumber
import re
import csv
import os
import io
import hashlib

# Functions from your previous code
def load_pdf(file):
    """Extracts text from the PDF."""
    # Use BytesIO to read the uploaded file as a file object
    with io.BytesIO(file.read()) as byte_file:
        with pdfplumber.open(byte_file) as pdf:
            text = ''.join(page.extract_text() for page in pdf.pages)
    return text

def extract_skills(text):
    """Extracts skills from the text based on the 'Skills' section."""
    skills_match = re.search(r"SKILLS\s*(.*?)(?=(COURSEWORK|EXPERIENCE|EDUCATION|PROJECTS|$))", text, re.DOTALL | re.IGNORECASE)
    if skills_match:
        skills_text = skills_match.group(1)
        skills = re.split(r"[\u2022\n,]+", skills_text)  # Bullet operators unicode
        return clean_skills([skill.strip() for skill in skills if skill.strip()])
    return []

def clean_skills(raw_skills):
    """Clean and normalize extracted skills."""
    cleaned = []
    for skill in raw_skills:
        cleaned_skill = re.sub(r"^\s*-?\s*", "", skill)
        cleaned_skill = re.sub(r"\(.*?\)", "", cleaned_skill).strip()  # Text enclosed in parentheses
        words = re.split(r"[,\s:/;]+", cleaned_skill)
        cleaned.extend(word.strip() for word in words if word.strip())
    # Remove duplicates and normalize case
    return list(dict.fromkeys(word.lower() for word in cleaned))

def extract_info(text):
    """Extracts key information (email, phone, etc.) and invokes skill extraction."""
    data = {
        "email": re.search(r"([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})", text),
        "phone": re.search(r"(\+?\d{1,4}[-.\s]?)?(\(?\d{3}\)?[-.\s]?)?\d{3}[-.\s]?\d{4}", text),
        "skills": extract_skills(text),
    }
    result_data = {}
    for key, value in data.items():
        if key != "skills":
            result_data[key] = value.group(0) if value else "Not available"
        else:
            result_data[key] = value
    return result_data

def evaluate_skills(candidate_skills, required_skills, good_to_have_skills):
    """Matches candidate's skills with required and good-to-have skills."""
    candidate_set = set(map(str.lower, candidate_skills))
    required_set = set(map(str.lower, required_skills.split(", ")))
    good_to_have_set = set(map(str.lower, good_to_have_skills.split(", ")))

    matched_required_skills = candidate_set.intersection(required_set)
    matched_good_to_have_skills = candidate_set.intersection(good_to_have_set)

    # Calculate match percentages
    match_percentage = (len(matched_required_skills) / len(required_set)) * 80 if required_set else 0
    bonus_percentage = (len(matched_good_to_have_skills) / len(good_to_have_set)) * 20 if good_to_have_set else 0  # Increase weightage for good-to-have

    total_match_percentage = match_percentage + bonus_percentage

    return matched_required_skills, matched_good_to_have_skills, total_match_percentage

def process_single_pdf(file, required_skills, good_to_have_skills, processed_hashes):
    """Processes a single PDF resume file and prevents duplicate processing."""
    file_hash = generate_file_hash(file)
    
    # Skip file if it's already processed
    if file_hash in processed_hashes:
        return None
    
    text = load_pdf(file)
    data = extract_info(text)
    matched_required_skills, matched_good_to_have_skills, total_match_percentage = evaluate_skills(data["skills"], required_skills, good_to_have_skills)

    candidate_data = data.copy()
    candidate_data["matched_skills"] = ", ".join(matched_required_skills)
    candidate_data["matched_good_to_have_skills"] = ", ".join(matched_good_to_have_skills)  # Add matched good-to-have skills
    candidate_data["match_percentage"] = total_match_percentage
    processed_hashes.add(file_hash)  # Add file hash to processed set

    return candidate_data

def generate_file_hash(file):
    """Generates a unique hash for the uploaded file based on its content."""
    file_content = file.read()  # Read the file content
    file.seek(0)  # Reset file pointer after reading
    return hashlib.md5(file_content).hexdigest()  # Generate MD5 hash of the content

def save_to_csv(data, file_name):
    """Saves extracted data to CSV."""
    with open(file_name, mode='a', newline='', encoding='utf-8', errors='ignore') as file:
        writer = csv.writer(file)
        if file.tell() == 0:  # Check if the file is empty to write headers
            writer.writerow(data.keys())  # Write headers
        writer.writerow(data.values())  # Write values

# Main function to process PDFs
def process_resumes(uploaded_files, required_skills, good_to_have_skills):
    processed_hashes = set()
    
    all_candidates = []
    for uploaded_file in uploaded_files:
        candidate_data = process_single_pdf(uploaded_file, required_skills, good_to_have_skills, processed_hashes)
        
        # Skip processing if the file was already processed
        if candidate_data is None:
            continue
        
        candidate_data["filename"] = uploaded_file.name
        all_candidates.append(candidate_data)

    # Rank candidates based on match percentage
    ranked_candidates = sorted(all_candidates, key=lambda x: x["match_percentage"], reverse=True)
    
    # Display ranked candidates
    if ranked_candidates:
        print("**Ranked Candidates:**")
        table_data = []
        for candidate in ranked_candidates:
            table_data.append({
                "Filename": candidate["filename"],
                "Email": candidate["email"],
                "Phone": candidate["phone"],
                "Matched Required Skills": candidate["matched_skills"],
                "Matched Good-to-Have Skills": candidate["matched_good_to_have_skills"],
                "Match Percentage": f"{candidate['match_percentage']:.2f}%",
            })
        
        # Display the table
        for row in table_data:
            print(row)
    else:
        print("No valid candidates found.")

    # Option to save ranked candidates to CSV
    output_csv = "ranked_resumes.csv"
    with open(output_csv, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(["email", "phone", "match_percentage", "matched_skills", "skills", "filename"])  # CSV header
        for candidate in ranked_candidates:
            writer.writerow([  
                candidate["email"],
                candidate["phone"],
                candidate["match_percentage"],
                candidate["matched_skills"],
                candidate["matched_good_to_have_skills"],
                ", ".join(candidate["skills"]),
                candidate["filename"]
            ])
    print(f"CSV file saved as {output_csv}. You can download it from the filesystem.")

# Example usage:
uploaded_files = [open('resume12.pdf', 'rb'), open('sample-resumes_scs-6.pdf', 'rb')]  # Replace with your actual PDF files
required_skills = "Python, SQL, Java"
good_to_have_skills = "AWS, Docker, git, djnango, javascript"
process_resumes(uploaded_files, required_skills, good_to_have_skills)


**Ranked Candidates:**
{'Filename': 'sample-resumes_scs-6.pdf', 'Email': 'mackcrol@gmail.com', 'Phone': '844-555-2626', 'Matched Required Skills': 'sql, python, java', 'Matched Good-to-Have Skills': 'git, javascript', 'Match Percentage': '88.00%'}
{'Filename': 'resume12.pdf', 'Email': 'ajaybkedare@gmail.com', 'Phone': '9082168876', 'Matched Required Skills': 'sql, python, java', 'Matched Good-to-Have Skills': 'git', 'Match Percentage': '84.00%'}
CSV file saved as ranked_resumes.csv. You can download it from the filesystem.


In [15]:
import re
import csv

def load_pdf(file_path):
    """Extracts text from the PDF."""
    print(f"Loading PDF: {file_path}")
    with open(file_path, 'rb') as file:
        with pdfplumber.open(file) as pdf:
            text = ''.join(page.extract_text() for page in pdf.pages)
    print(f"Extracted text from PDF: {file_path}\n{text}")
    return text

import re
import csv

def load_pdf(file_path):
    """Extracts text from the PDF."""
    print(f"Loading PDF: {file_path}")
    with open(file_path, 'rb') as file:
        with pdfplumber.open(file) as pdf:
            text = ''.join(page.extract_text() for page in pdf.pages)
    print(f"Extracted text from PDF: {file_path}\n{text}")
    return text

def extract_skills(text):
    """Extracts skills from the text based on the 'Skills' section."""
    print("Extracting skills...")
    skills_match = re.search(r"SKILLS\s*([\s\S]*?)(?=(COURSEWORK|EXPERIENCE|EDUCATION|PROJECTS|$))", text, re.DOTALL | re.IGNORECASE)
    if skills_match:
        skills_text = skills_match.group(1)
        skills = re.split(r"[\u2022\n,]+", skills_text)#bullet operators unicode
        print(f"Raw skills extracted: {skills}")
        return clean_skills([skill.strip() for skill in skills if skill.strip()])
    print("No skills section found.")
    return []

def clean_skills(raw_skills):
    """Clean and normalize extracted skills."""
    print(f"Cleaning skills: {raw_skills}")
    cleaned = []
    for skill in raw_skills:
        cleaned_skill = re.sub(r"^\s*-?\s*", "", skill)
        cleaned_skill = re.sub(r"\(.*?\)", "", cleaned_skill).strip()# text enclosed in parenthsis
        
        words = re.split(r"[,\s:/;]+", cleaned_skill)
        cleaned.extend(word.strip() for word in words if word.strip())
    # Remove duplicates and normalize case
    cleaned_skills = list(dict.fromkeys(word.lower() for word in cleaned))
    print(f"Cleaned skills: {cleaned_skills}")
    return cleaned_skills

def extract_info(text):
    """Extracts key information (email, phone, etc.) and invokes skill extraction."""
    print("Extracting key information (email, phone, etc.)...")
    data = {
        "email": re.search(r"([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})", text),
        "phone": re.search(r"(\+?\d{1,4}[-.\s]?)?(\(?\d{3}\)?[-.\s]?)?\d{3}[-.\s]?\d{4}", text),
        "skills": extract_skills(text),
    }
    result_data = {}

    for key, value in data.items():
        if key != "skills":
            result_data[key] = value.group(0) if value else "Not available"
        else:
            result_data[key] = value
    print(f"Extracted information: {result_data}")
    return result_data

def save_to_csv(data, file_name="resume_data1.csv"):
    """Saves extracted data to CSV."""
    print(f"Saving data to {file_name}...")
    with open(file_name, mode='a', newline='') as file:
        writer = csv.writer(file)
        if file.tell() == 0:  # Check if the file is empty to write headers
            writer.writerow(data.keys())  # Write headers("name", "email", "phone")
        writer.writerow(data.values())  # Write values
    print(f"Data saved to {file_name}")

def evaluate_skills(candidate_skills, required_skills):
    """Matches candidate's skills with required skills."""
    print(f"Evaluating skills... Required skills: {required_skills}")
    candidate_set = set(map(str.lower, candidate_skills))
    required_set = set(map(str.lower, required_skills.split(", ")))
    matched_skills = candidate_set.intersection(required_set)
    match_percentage = (len(matched_skills) / len(required_set)) * 100 if required_set else 0
    print(f"Matched skills: {matched_skills}")
    print(f"Skill match percentage: {match_percentage:.2f}%")
    return matched_skills, match_percentage

def process_single_pdf(file_path, required_skills):
    """Processes a single PDF resume file."""
    print(f"Processing {file_path}...")
    
    # Load and parse the resume
    text = load_pdf(file_path)
    data = extract_info(text)

    # Evaluate candidate skills
    matched_skills, match_percentage = evaluate_skills(data["skills"], required_skills)

    # Output the matched skills and percentage
    print(f"Matched Skills: {matched_skills}")
    print(f"Skill Match Percentage: {match_percentage:.2f}%")

    # Store the candidate data with match percentage for ranking
    candidate_data = data.copy()
    candidate_data["match_percentage"] = match_percentage

    # Save extracted data to CSV
    save_to_csv(data)

    return candidate_data, match_percentage

# Example usage for one resume:
#file_path = "John Doe.pdf"# Replace with the path to the resume PDF
#file_path = "computer-science-resume-example.pdf"
file_path = "My Resume.pdf"
required_skills = "Python, SQL, React, Django, Java, JavaScript, HTML, CSS"  # Example required skills

# Process a single PDF resume
candidate_data, match_percentage = process_single_pdf(file_path, required_skills)

# Output the result for the candidate
print(f"Candidate Data: {candidate_data}")
print(f"Match Percentage: {match_percentage:.2f}%")

def extract_info(text):
    """Extracts key information (email, phone, etc.) and invokes skill extraction."""
    print("Extracting key information (email, phone, etc.)...")
    data = {
        "email": re.search(r"([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})", text),
        "phone": re.search(r"(\+?\d{1,4}[-.\s]?)?(\(?\d{3}\)?[-.\s]?)?\d{3}[-.\s]?\d{4}", text),
        "skills": extract_skills(text),
    }
    result_data = {}

    for key, value in data.items():
        if key != "skills":
            result_data[key] = value.group(0) if value else "Not available"
        else:
            result_data[key] = value
    print(f"Extracted information: {result_data}")
    return result_data

def save_to_csv(data, file_name="resume_data1.csv"):
    """Saves extracted data to CSV."""
    print(f"Saving data to {file_name}...")
    with open(file_name, mode='a', newline='') as file:
        writer = csv.writer(file)
        if file.tell() == 0:  # Check if the file is empty to write headers
            writer.writerow(data.keys())  # Write headers("name", "email", "phone")
        writer.writerow(data.values())  # Write values
    print(f"Data saved to {file_name}")



def process_single_pdf(file_path, required_skills):
    """Processes a single PDF resume file."""
    print(f"Processing {file_path}...")
    
    # Load and parse the resume
    text = load_pdf(file_path)
    data = extract_info(text)

    # Evaluate candidate skills
    matched_skills, match_percentage = evaluate_skills(data["skills"], required_skills)

    # Output the matched skills and percentage
    print(f"Matched Skills: {matched_skills}")
    print(f"Skill Match Percentage: {match_percentage:.2f}%")

    # Store the candidate data with match percentage for ranking
    candidate_data = data.copy()
    candidate_data["match_percentage"] = match_percentage

    # Save extracted data to CSV
    save_to_csv(data)

    return candidate_data, match_percentage

# Example usage for one resume:
#file_path = "John Doe.pdf"# Replace with the path to the resume PDF
#file_path = "computer-science-resume-example.pdf"
file_path = "My Resume.pdf"
required_skills = "Python, SQL, React, Django, Java, JavaScript, HTML, CSS"  # Example required skills

# Process a single PDF resume
candidate_data, match_percentage = process_single_pdf(file_path, required_skills)

# Output the result for the candidate
print(f"Candidate Data: {candidate_data}")
print(f"Match Percentage: {match_percentage:.2f}%")

Processing My Resume.pdf...
Loading PDF: My Resume.pdf
Extracted text from PDF: My Resume.pdf
Ravi Gupta
AI/ML Enthusiast | Computer Science Engineer
Innovative and solution-driven computer science student passionate about artificial intelligence and
machine learning. Adept at building and deploying AI-driven applications to solve real-world problems.
ravigupta.2140@gmail.com 98342451271
EDUCATION SKILLS
B.Tech in Computer Science Engineering
Programming Languages: Python, Java, C++, JavaScript
Indian Institute of Technology, Delhi (IIT Delhi)
Frameworks & Libraries: TensorFlow, PyTorch, Scikit-learn,
08/2020 - 08/2024, CGPA: 9.2/10
Flask
Courses
Relevant Courses: Machine
Cloud Platforms: AWS, Google Cloud, Azure
Learning, Artificial
Intelligence, Data
Tools & Technologies: Git, Docker, Kubernetes, Jupyter,
Structures, Deep Learning,
REST APIs
Cloud Computing
Other Skills: Data Analysis, Model Optimization,
Debugging
WORK EXPERIENCE
Machine Learning Intern
PERSONAL PROJECTS
Workplace/C

In [16]:
import re
import csv

def load_pdf(file_path):
    """Extracts text from the PDF."""
    print(f"Loading PDF: {file_path}")
    with open(file_path, 'rb') as file:
        with pdfplumber.open(file) as pdf:
            text = ''.join(page.extract_text() for page in pdf.pages)
    print(f"Extracted text from PDF: {file_path}\n{text}")
    return text

import re
import csv

def load_pdf(file_path):
    """Extracts text from the PDF."""
    print(f"Loading PDF: {file_path}")
    with open(file_path, 'rb') as file:
        with pdfplumber.open(file) as pdf:
            text = ''.join(page.extract_text() for page in pdf.pages)
    print(f"Extracted text from PDF: {file_path}\n{text}")
    return text

def extract_skills(text):
    """Extracts skills from the text based on the 'Skills' section."""
    # Match the SKILLS section up to the next heading or end of text
    skills_match = re.search(r"SKILLS\s*([\s\S]*?)(?=(WORK EXPERIENCE|PERSONAL PROJECTS|$))", text, re.IGNORECASE)
    if skills_match:
        skills_text = skills_match.group(1)
        # Split by bullets, commas, or whitespace and clean the skills
        skills = re.split(r"[\u2022\n,]+", skills_text)
        return clean_skills([skill.strip() for skill in skills if skill.strip()])
    return []

def clean_skills(raw_skills):
    """Clean, normalize, and correct extracted skills."""
    typo_corrections = {
        "pyhton": "python",  # Fix common typos
    }
    cleaned = []
    for skill in raw_skills:
        # Remove extra symbols and normalize text
        cleaned_skill = re.sub(r"^\s*-?\s*", "", skill)
        cleaned_skill = re.sub(r"\(.*?\)", "", cleaned_skill).strip()
        cleaned_skill = typo_corrections.get(cleaned_skill.lower(), cleaned_skill.lower())
        words = re.split(r"[,\s:/;]+", cleaned_skill)
        cleaned.extend(word for word in words if word)
    # Remove duplicates
    return list(dict.fromkeys(cleaned))

def evaluate_skills(candidate_skills, required_skills):
    """Matches candidate's skills with required skills."""
    from rapidfuzz import fuzz  # Use fuzzy matching
    candidate_set = set(candidate_skills)
    required_set = set(map(str.lower, required_skills.split(", ")))
    matched_skills = set()
    for required in required_set:
        for candidate in candidate_set:
            # Fuzzy match threshold for similar skills
            if fuzz.ratio(required, candidate) > 80:
                matched_skills.add(candidate)
    match_percentage = (len(matched_skills) / len(required_set)) * 100 if required_set else 0
    return matched_skills, match_percentage


def extract_info(text):
    """Extracts key information (email, phone, etc.) and invokes skill extraction."""
    print("Extracting key information (email, phone, etc.)...")
    data = {
        "email": re.search(r"([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})", text),
        "phone": re.search(r"(\+?\d{1,4}[-.\s]?)?(\(?\d{3}\)?[-.\s]?)?\d{3}[-.\s]?\d{4}", text),
        "skills": extract_skills(text),
    }
    result_data = {}

    for key, value in data.items():
        if key != "skills":
            result_data[key] = value.group(0) if value else "Not available"
        else:
            result_data[key] = value
    print(f"Extracted information: {result_data}")
    return result_data

def save_to_csv(data, file_name="resume_data1.csv"):
    """Saves extracted data to CSV."""
    print(f"Saving data to {file_name}...")
    with open(file_name, mode='a', newline='') as file:
        writer = csv.writer(file)
        if file.tell() == 0:  # Check if the file is empty to write headers
            writer.writerow(data.keys())  # Write headers("name", "email", "phone")
        writer.writerow(data.values())  # Write values
    print(f"Data saved to {file_name}")



def process_single_pdf(file_path, required_skills):
    """Processes a single PDF resume file."""
    print(f"Processing {file_path}...")
    
    # Load and parse the resume
    text = load_pdf(file_path)
    data = extract_info(text)

    # Evaluate candidate skills
    matched_skills, match_percentage = evaluate_skills(data["skills"], required_skills)

    # Output the matched skills and percentage
    print(f"Matched Skills: {matched_skills}")
    print(f"Skill Match Percentage: {match_percentage:.2f}%")

    # Store the candidate data with match percentage for ranking
    candidate_data = data.copy()
    candidate_data["match_percentage"] = match_percentage

    # Save extracted data to CSV
    save_to_csv(data)

    return candidate_data, match_percentage

# Example usage for one resume:
#file_path = "John Doe.pdf"# Replace with the path to the resume PDF
#file_path = "computer-science-resume-example.pdf"
file_path = "My Resume.pdf"
required_skills = "Python, SQL, React, Django, Java, JavaScript, HTML, CSS"  # Example required skills

# Process a single PDF resume
candidate_data, match_percentage = process_single_pdf(file_path, required_skills)

# Output the result for the candidate
print(f"Candidate Data: {candidate_data}")
print(f"Match Percentage: {match_percentage:.2f}%")



Processing My Resume.pdf...
Loading PDF: My Resume.pdf
Extracted text from PDF: My Resume.pdf
Ravi Gupta
AI/ML Enthusiast | Computer Science Engineer
Innovative and solution-driven computer science student passionate about artificial intelligence and
machine learning. Adept at building and deploying AI-driven applications to solve real-world problems.
ravigupta.2140@gmail.com 98342451271
EDUCATION SKILLS
B.Tech in Computer Science Engineering
Programming Languages: Python, Java, C++, JavaScript
Indian Institute of Technology, Delhi (IIT Delhi)
Frameworks & Libraries: TensorFlow, PyTorch, Scikit-learn,
08/2020 - 08/2024, CGPA: 9.2/10
Flask
Courses
Relevant Courses: Machine
Cloud Platforms: AWS, Google Cloud, Azure
Learning, Artificial
Intelligence, Data
Tools & Technologies: Git, Docker, Kubernetes, Jupyter,
Structures, Deep Learning,
REST APIs
Cloud Computing
Other Skills: Data Analysis, Model Optimization,
Debugging
WORK EXPERIENCE
Machine Learning Intern
PERSONAL PROJECTS
Workplace/C

In [13]:
!pip install rapidfuzz

Defaulting to user installation because normal site-packages is not writeable


In [17]:
!pip install transformers


Defaulting to user installation because normal site-packages is not writeable


In [20]:
!pip install transformers torch


Defaulting to user installation because normal site-packages is not writeable


ERROR: Could not find a version that satisfies the requirement torch (from versions: none)
ERROR: No matching distribution found for torch


In [43]:
import re
import csv
from datetime import datetime
import pdfplumber
from rapidfuzz import fuzz

# Helper function to calculate months difference
def calculate_months(start_date, end_date):
    """Calculates the number of months between two dates."""
    return (end_date.year - start_date.year) * 12 + (end_date.month - start_date.month)

# Function to load and extract text from PDF
def load_pdf(file_path):
    """Extracts text from the PDF."""
    with open(file_path, 'rb') as file:
        with pdfplumber.open(file) as pdf:
            text = ''.join(page.extract_text() for page in pdf.pages)
    return text

# Extract skills from text based on SKILLS section
def extract_skills(text):
    """Extracts skills from the text based on the 'Skills' section."""
    skills_match = re.search(r"SKILLS\s*([\s\S]*?)(?=(WORK EXPERIENCE|PERSONAL PROJECTS|EDUCATION|$))", text, re.IGNORECASE)
    if skills_match:
        skills_text = skills_match.group(1)
        skills = re.split(r"[\u2022\n,]+", skills_text)
        return clean_skills([skill.strip() for skill in skills if skill.strip()])
    return []

# Clean extracted skills
def clean_skills(raw_skills):
    """Clean, normalize, and correct extracted skills."""
    typo_corrections = {
        "pyhton": "python",  # Fix common typos
    }
    cleaned = []
    for skill in raw_skills:
        cleaned_skill = re.sub(r"^\s*-?\s*", "", skill)
        cleaned_skill = re.sub(r"\(.*?\)", "", cleaned_skill).strip()
        cleaned_skill = typo_corrections.get(cleaned_skill.lower(), cleaned_skill.lower())
        words = re.split(r"[,\s:/;]+", cleaned_skill)
        cleaned.extend(word for word in words if word)
    cleaned = list(dict.fromkeys(cleaned))
    return cleaned

# Function to evaluate skills using fuzzy matching
def evaluate_skills(candidate_skills, required_skills):
    """Matches candidate's skills with required skills."""
    candidate_set = set(candidate_skills)
    required_set = set(map(str.lower, required_skills.split(", ")))
    matched_skills = set()
    for required in required_set:
        for candidate in candidate_set:
            if fuzz.ratio(required, candidate) > 80:
                matched_skills.add(candidate)
    match_percentage = (len(matched_skills) / len(required_set)) * 100 if required_set else 0
    return matched_skills, match_percentage

# Extract work experience from text
def extract_experience(text):
    """Extracts work experience (roles, start, and end dates) from the resume."""
    experiences = []
    experience_pattern = re.compile(r"(?:WORK EXPERIENCE|EXPERIENCE)[\s\S]+?(?=(EDUCATION|SKILLS|$))", re.IGNORECASE)
    experience_section = experience_pattern.search(text)
    if experience_section:
        experience_text = experience_section.group(0)
        role_pattern = re.compile(r"([A-Za-z\s]+)\s+(\d{4}-\d{2}-\d{2})\s*[-–]?\s*(\d{4}-\d{2}-\d{2}|Present)", re.IGNORECASE)
        roles = role_pattern.findall(experience_text)
        for role in roles:
            role_title = role[0].strip()
            start_date_str = role[1]
            end_date_str = role[2] if role[2].lower() != "present" else datetime.now().strftime('%Y-%m-%d')
            experiences.append({
                "role": role_title,
                "start": start_date_str,
                "end": end_date_str
            })
    return experiences

# Extract education details
def extract_education(text):
    """Extracts education details (degree, university, year, percentage)."""
    education_data = []
    education_pattern = re.compile(r"EDUCATION[\s\S]+?(?=(WORK EXPERIENCE|SKILLS|$))", re.IGNORECASE)
    education_section = education_pattern.search(text)
    if education_section:
        education_text = education_section.group(0)
        edu_pattern = re.compile(r"([A-Za-z\s]+)\s+(\d{4})\s+([-+]?\d*\.\d+|\d+)%?", re.IGNORECASE)
        education_details = edu_pattern.findall(education_text)
        for edu in education_details:
            degree = edu[0].strip()
            year = edu[1].strip()
            percentage = edu[2].strip()
            education_data.append({
                "degree": degree,
                "year": year,
                "percentage": percentage
            })
    return education_data

# Function to extract key information (email, phone, etc.) from text
def extract_info(text):
    """Extracts key information (email, phone, etc.) and invokes skill and experience extraction."""
    data = {
        "email": re.search(r"([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})", text),
        "phone": re.search(r"(\+?\d{1,4}[-.\s]?)?(\(?\d{3}\)?[-.\s]?)?\d{3}[-.\s]?\d{4}", text),
        "skills": extract_skills(text),
        "experience": extract_experience(text),
        "education": extract_education(text)
    }
    result_data = {}
    for key, value in data.items():
        if key != "skills" and key != "experience" and key != "education":
            result_data[key] = value.group(0) if value else "Not available"
        else:
            result_data[key] = value
    return result_data

# Function to save extracted data to CSV
def save_to_csv(data, file_name="resume_data.csv"):
    """Saves extracted data to CSV."""
    with open(file_name, mode='a', newline='') as file:
        writer = csv.writer(file)
        if file.tell() == 0:  # Write headers if file is empty
            writer.writerow(data.keys())
        writer.writerow(data.values())

# Function to process a single PDF resume and extract information
def process_single_pdf(file_path, required_skills):
    """Processes a single PDF resume file."""
    # Load and parse the resume
    text = load_pdf(file_path)
    data = extract_info(text)

    # Evaluate candidate skills
    matched_skills, match_percentage = evaluate_skills(data["skills"], required_skills)
    
    # Output matched skills and percentage
    candidate_data = data.copy()
    candidate_data["match_percentage"] = match_percentage

    # Save extracted data to CSV
    save_to_csv(data)

    return candidate_data, match_percentage

# Function to calculate total experience from work experiences
def calculate_total_experience(experiences):
    """Calculates the total work experience in months from a list of experiences."""
    total_months = 0
    for exp in experiences:
        start_date = datetime.strptime(exp["start"], "%Y-%m-%d")
        end_date = datetime.strptime(exp["end"], "%Y-%m-%d")
        total_months += calculate_months(start_date, end_date)
    
    years = total_months // 12
    months = total_months % 12
    return years, months

# Example usage
file_path = "My Resume.pdf"
required_skills = "Python, SQL, React, Django, Java, JavaScript, HTML, CSS"  # Example required skills

# Process resume PDF
candidate_data, match_percentage = process_single_pdf(file_path, required_skills)

# Process work experience dates
years, months = calculate_total_experience(candidate_data["experience"])

# Print everything
print("------------- Extracted Resume Data -------------")
print(f"Candidate Data: {candidate_data}")
print(f"Match Percentage: {match_percentage:.2f}%")
print("------------- Education Data -------------")
print(f"Education: {candidate_data['education']}")
print("------------- Total Experience -------------")
print(f"Total Experience: {years} years, {months} months")


------------- Extracted Resume Data -------------
Candidate Data: {'email': 'ravigupta.2140@gmail.com', 'phone': '98342451271', 'skills': ['b.tech', 'in', 'computer', 'science', 'engineering', 'programming', 'languages', 'python', 'java', 'c++', 'javascript', 'indian', 'institute', 'of', 'technology', 'delhi', 'frameworks', '&', 'libraries', 'tensorflow', 'pytorch', 'scikit-learn', '08', '2020', '-', '2024', 'cgpa', '9.2', '10', 'flask', 'courses', 'relevant', 'machine', 'cloud', 'platforms', 'aws', 'google', 'azure', 'learning', 'artificial', 'intelligence', 'data', 'tools', 'technologies', 'git', 'docker', 'kubernetes', 'jupyter', 'structures', 'deep', 'rest', 'apis', 'computing', 'other', 'skills', 'analysis', 'model', 'optimization', 'debugging'], 'experience': [], 'education': [], 'match_percentage': 37.5}
Match Percentage: 37.50%
------------- Education Data -------------
Education: []
------------- Total Experience -------------
Total Experience: 0 years, 0 months


Extracted Details:
