In [8]:
import spacy
import re
import os
import requests
from io import BytesIO
from PyPDF2 import PdfReader
from docx import Document
from word2number import w2n

In [9]:
pip install PyPDF2


Note: you may need to restart the kernel to use updated packages.


In [10]:
pip install word2number


Note: you may need to restart the kernel to use updated packages.


In [11]:
pip install python-docx


Note: you may need to restart the kernel to use updated packages.


In [12]:
# Load the English language model for spaCy
nlp = spacy.load("en_core_web_sm")

In [13]:
# Helper functions to extract text from PDF and DOCX files

def extract_text_from_pdf(file):
    """Extract text from a PDF file."""
    pdf_reader = PdfReader(file)
    text = ""
    for page in pdf_reader.pages:
        text += page.extract_text()
    return text

def extract_text_from_docx(file):
    """Extract text from a DOCX file."""
    doc = Document(file)
    text = ""
    for paragraph in doc.paragraphs:
        text += paragraph.text + "\n"
    return text


In [14]:
# Function to extract experience details like '10 years' or 'more than thirty years'
def extract_experience(text):
    text = text.lower()
    numeric_pattern = r"(?:more than|over|at least|around|approximately|nearly|up to)?\s*(\d+)\+?\s*years?"
    numeric_match = re.search(numeric_pattern, text)
    if numeric_match:
        years = numeric_match.group(1)
        return f"{years} years"
    
    word_pattern = r"(?:more than|over|at least|around|approximately|nearly|up to)?\s*(\b(?:one|two|three|four|five|six|seven|eight|nine|ten|eleven|twelve|thirteen|fourteen|fifteen|sixteen|seventeen|eighteen|nineteen|twenty|thirty|forty|fifty|sixty|seventy|eighty|ninety|hundred)\b)\s*years?"
    word_match = re.search(word_pattern, text)
    if word_match:
        try:
            word_years = word_match.group(1)
            numeric_years = w2n.word_to_num(word_years)
            return f"{numeric_years} years"
        except ValueError:
            return "Experience not found"
    return "Not found"


In [15]:
# Function to count the number of certifications mentioned in the resume
def extract_certifications_count(text):
    certification_keywords = [
        r"certification", r"certifications", r"certified", r"certificate", r"certificates"
    ]
    pattern = r"|".join(certification_keywords)
    matches = re.findall(pattern, text, re.IGNORECASE)
    return len(matches)


In [16]:
# Function to extract name from resume text
def extract_name_from_text(text):
    text = text.strip()
    lines = text.split("\n")
    for line in lines[:3]:
        line = line.strip()
        if len(line) > 1:
            name = re.sub(r'[^a-zA-Z\s]', '', line)
            if len(name.split()) > 1:
                return name.title()
    return "Name not found"


In [17]:
# Function to extract skills without predefined keywords
def extract_skills_without_keywords(text):
    doc = nlp(text)
    skills = []
    for ent in doc.ents:
        if ent.label_ in ["ORG", "PRODUCT", "WORK_OF_ART"]:
            skills.append(ent.text)
    skill_patterns = re.findall(r"(proficient in|experienced with|skilled in|knowledge of)\s+([a-zA-Z0-9\s\+\-]+)", text, re.IGNORECASE)
    for pattern in skill_patterns:
        skills.append(pattern[1].strip())
    return list(set(skills))


In [18]:
# Function to extract visa status
def extract_visa_status(text):
    visa_keywords = {
        "H1B": ["h1b"],
        "Green Card": ["green card", "permanent resident"],
        "US Citizen": ["usc", "us citizen", "citizenship: us"],
        "OPT": ["opt"],
        "CPT": ["cpt"],
        "L2": ["l2 visa"],
        "EAD": ["ead"],
        "TN Visa": ["tn visa"],
        "Study Visa": ["study visa"]
    }
    visa_status = []
    for visa, patterns in visa_keywords.items():
        for pattern in patterns:
            if re.search(pattern, text.lower()):
                visa_status.append(visa)
                break
    return ", ".join(visa_status) if visa_status else "Not found"


In [19]:
# Function to extract all details and calculate a matching score based on job description
def extract_details(text, job_description):
    name = extract_name_from_text(text)
    experience = extract_experience(text)
    skills = extract_skills_without_keywords(text)
    email = re.search(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', text)
    email = email.group(0) if email else "Not found"
    phone = re.search(r'(\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}|\+\d{1,3}\s?\(?\d{1,4}\)?[-.\s]?\d{3}[-.\s]?\d{4})', text)
    phone = phone.group(0) if phone else "Not found"
    location = re.search(r'\b(?:[A-Z][a-z]+(?:\s[A-Z][a-z]+)*,\s(?:TX|CA|NY|FL|WA|IL|PA|GA|NC|OH|NJ|VA|CO|AZ|MA|MD|TN|MO|IN|WI|MN|SC|AL|LA|KY|OR|OK|CT|IA|MS|KS|AR|NV|UT|NM|NE|WV|ID|HI|ME|NH|MT|RI|DE|SD|ND|AK|VT|WY))\b|\b\d{5}(?:-\d{4})?\b', text)
    location = location.group(0) if location else "Not found"
    visa_status = extract_visa_status(text)
    certificates_count = extract_certifications_count(text)

    # Calculate score based on job description (matching skills)
    jd_skills = set(re.findall(r'[a-zA-Z0-9\+\-]+', job_description.lower()))
    resume_skills = set([skill.lower() for skill in skills])
    matching_skills = resume_skills.intersection(jd_skills)
    score = len(matching_skills)  # Score is the number of matching skills

    return {
        "Name": name,
        "Email": email,
        "Phone": phone,
        "Experience": experience,
        "Location": location,
        "Visa Status": visa_status,
        "Skills": skills,
        "Certificates Count": certificates_count,
        "Score": score,  # Add score to the details
    }


In [21]:
import os
import requests
import pandas as pd
from io import BytesIO

# Function to extract text from PDF (use your previously defined functions)
def extract_text_from_pdf(file):
    """Extract text from a PDF file."""
    from PyPDF2 import PdfReader
    pdf_reader = PdfReader(file)
    text = ""
    for page in pdf_reader.pages:
        text += page.extract_text()
    return text

# Function to extract text from DOCX (use your previously defined functions)
def extract_text_from_docx(file):
    """Extract text from a DOCX file."""
    from docx import Document
    doc = Document(file)
    text = ""
    for paragraph in doc.paragraphs:
        text += paragraph.text + "\n"
    return text

# The process_resumes function
def process_resumes(resume_input, job_description_input):
    resume_files = []
    
    # Check if resume_input is a URL
    if resume_input.lower().startswith("http"):
        try:
            response = requests.get(resume_input)
            response.raise_for_status()
            resume_files = [BytesIO(response.content)]
        except requests.exceptions.RequestException as e:
            print(f"Error fetching resume from URL: {e}")
            return None  # Return None to indicate failure to fetch resume
    else:
        # Check if resume_input is a directory and add PDFs and DOCXs
        if os.path.isdir(resume_input):
            resume_files = [os.path.join(resume_input, f) for f in os.listdir(resume_input) if f.lower().endswith(('.pdf', '.docx'))]
        else:
            print("The provided path is neither a folder nor a valid URL.")
            return None  # Return None to indicate invalid input

    if not resume_files:
        print("No valid resume files found.")
        return None  # Return None if no valid resume files are found

    results = []
    for resume_file in resume_files:
        if isinstance(resume_file, BytesIO):
            file_content = resume_file.read()
            if resume_input.lower().endswith(".pdf"):
                resume_text = extract_text_from_pdf(BytesIO(file_content))
            elif resume_input.lower().endswith(".docx"):
                resume_text = extract_text_from_docx(BytesIO(file_content))
        else:
            if resume_file.endswith(".pdf"):
                with open(resume_file, "rb") as file:
                    resume_text = extract_text_from_pdf(file)
            elif resume_file.endswith(".docx"):
                with open(resume_file, "rb") as file:
                    resume_text = extract_text_from_docx(file)
            else:
                print("Unsupported file type!")
                continue

        resume_details = extract_details(resume_text, job_description_input)
        results.append(resume_details)

    # Sort results based on matching skills (Score)
    sorted_results = sorted(results, key=lambda x: x['Score'], reverse=True)
    for idx, result in enumerate(sorted_results, start=1):
        result['Ranking'] = idx  # Assign Rank

    return sorted_results

# Function to save results to a CSV file and display DataFrame
def save_results_to_csv_and_display(results, file_name="resume_analysis_results_2.csv"):
    # Convert results to a pandas DataFrame
    df = pd.DataFrame(results)

    # Save the DataFrame to a CSV file
    df.to_csv(file_name, index=False)
    print(f"Results saved to {file_name}")

    # Display the DataFrame
    return df

# Example usage
resume_input = r"C:\Users\andre\Desktop\Resumes"  # Path to your resume folder
job_description_input = '''Hybrid/Local Cloud Solutions Architect (AWS/Azure/GCP/15+) with networking, IAM, GDPR, HIPAA, CI/CD, Docker, Kubernetes, IaC/Terraform/CloudFormation, Python/Java/C#, RESTful, SOAP, SageMaker, Redshift, EC2, Kinesis, EMR and AI experience
Location: Austin, TX (HHSC)
Duration: 6 Months
Position: 1
Position will be 2 days onsite (Tues & Thurs) at the location listed above and 3 days remote (Mon, Wed, Fri). Program will allow candidates who are within the State of Texas.
Skills:
10 Required experience architecting enterprise application solutions across on-premises and cloud infrastructure
10 Required architecting solutions in one or more of the cloud platforms: GCP, Azure, and AWS.
8 Required developing cloud native application architectures across leading cloud platforms.
8 Required architectural solutions that utilize fit-for-purpose service models such as IaaS, PaaS, and SaaS
8 Required experience in all phases of Machine Learning, Artificial Intelligence and Deep Learning solutions using Azure or AWS or google technologies.
8 Preferred Strong understanding of virtual networks, VPNs, DNS, load balancers, and firewalls.
8 Preferred Understanding of cloud security frameworks, encryption, identity and access management (IAM), and compliance standards like GDPR, HIPAA, etc.
8 Preferred Familiarity with CI/CD pipelines, containerization (Docker, Kubernetes), and infrastructure as code (IaC) tools like Terraform or CloudFormation.
8 Preferred Proficiency in languages such as Python, Java, C#, and scripting languages for automation.
8 Preferred Knowledge of API design, development, and management, including RESTful and SOAP APIs.
8 Preferred Knowledge of microservices, serverless architectures, and cloud-native application development.
8 Preferred Ability to analyze and optimize cloud costs while maintaining performance and reliability.
8 Preferred Excellent verbal and written communication skills to articulate cloud strategies and solutions to stakeholders.
8 Preferred Ability to work effectively with cross-functional teams, including developers, administrators, and business leaders.
8 Preferred Skills in monitoring and analyzing system performance to ensure optimal cloud operations.
6 Preferred Experience with Azure Machine Learning, Azure Cognitive Services, or AWS technologies like SageMaker, Redshift, EC2, Data Pipeline, Kinesis, EMR, and AI services like Transcribe etc or Google technologies like Vertex AI platform, Agent Builder, and others.'''

# Process the resumes and display results
processed_results = process_resumes(resume_input, job_description_input)

# Check if the results are valid and then display/save them
if processed_results:
    df_output = save_results_to_csv_and_display(processed_results, file_name="resume_analysis_results_2.csv")
    display(df_output)  # Use this to display the DataFrame in Jupyter Notebook
else:
    print("No valid results to display or save.")

incorrect startxref pointer(3)


Results saved to resume_analysis_results_2.csv


Unnamed: 0,Name,Email,Phone,Experience,Location,Visa Status,Skills,Certificates Count,Score,Ranking
0,Kumar K,Not found,Not found,14 years,"Cloud, MS","OPT, EAD","[RDP, Cloud ecosystems, IaC, Service Bus, Stor...",3,13,1
1,Naveen Kumar,Vinod.kumar@cloudzenix.com,+1 469-505-0687,16 years,Not found,"OPT, EAD","[Azure Data Factory, Bachelor of Engineering, ...",4,13,2
2,Karl Kimball \t\t\t ...,karl@karlkimball.com,(817) 845-0463,16 years,78665,"OPT, EAD","[Azure AI Cognitive Services, Racal Corp, Data...",6,12,3
3,Anish M,Not found,+1 510-854-1314,12 years,Not found,"OPT, EAD","[CPU, Docker Containers, Kubernetes Clusters, ...",7,12,4
4,Bhabani Pani,bhabani.aws881@gmail.com,+1 805 300 7443,24 years,"The Open Group, CA","OPT, EAD","[Serverless Price Setting Management Tool, Inf...",4,12,5
5,Desire Chedjieu,chedjieu@yahoo.de,(210) 209-1018,Not found,"Cormorant, TX","OPT, EAD","[NJ, Lessons Learned, ISO, CPU, CodeStar, EKS,...",5,11,6
6,Vikram Kancharla,vikramarchitect6@gmail.com,929 561 6508,15 years,"Princeton, TX","H1B, OPT, EAD","[Information Technology, Computer Science & Te...",7,10,7
7,Srinivas Madhavarao,Not found,Not found,15 years,"Cloud, MS","OPT, EAD","[Azure Data Factory, Power BI, Apps\nS/W Metho...",5,8,8
8,Naeem Ravat,Not found,(442)245-7876,Not found,77024,OPT,"[Sametime and Quickplace\nEnvironment, IaC, Lo...",8,8,9
9,Richard J Zhang\t\tSr Enterprise Solution Arch...,Not found,Not found,Not found,"Marshfield Clinic, WI","OPT, EAD","[Interoperability Platform, SAP/Ariba, NPI, Fl...",10,8,10


: 