# In [2]:
import re
import pdfplumber
import pytesseract
from PIL import Image
import os
import spacy
import smtplib
from email.mime.text import MIMEText
import glob
import torch
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

# Set the path for Tesseract executable
# NOTE(review): this is a machine- and user-specific Windows install path and
# has to be edited on any other machine. The literal is a raw string, so each
# doubled backslash stays as two characters — presumably Windows tolerates the
# repeated separators; confirm.
pytesseract.pytesseract.tesseract_cmd = r'C:\\Users\\praveenkumar.s\\AppData\\Local\\Programs\\Tesseract-OCR\\tesseract.exe'

# Load spaCy's English model
# NOTE(review): `nlp` is not referenced by any function in the code visible
# here — confirm it is still needed before paying the load cost.
nlp = spacy.load("en_core_web_sm")

# Initialize BERT tokenizer and model
# Both are module-level globals that get_bert_embedding() reads directly.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Skill improvement and role mapping (as an example)
# Schema: skill name -> {"improvements": [advice strings],
#                        "roles": [job-title strings]}.
# Consumed by generate_suggestions(), which looks skills up by these keys and
# merges the two lists across every skill it finds.
SKILL_IMPROVEMENT_MAPPING = {
    "Python": {
        "improvements": ["Practice advanced Python concepts", "Contribute to open-source projects"],
        "roles": ["Data Scientist", "Backend Developer"]
    },
    "Machine Learning": {
        "improvements": ["Study ML algorithms", "Work on Kaggle competitions"],
        "roles": ["Machine Learning Engineer", "Data Analyst"]
    },
}

# Helper Functions

def get_bert_embedding(text):
    """Return a mean-pooled BERT embedding for *text*.

    The module-level ``tokenizer`` and ``model`` are used; input is truncated
    to 512 tokens and the forward pass runs without gradient tracking.

    Args:
        text: A string, or a list of strings to embed together as one batch.

    Returns:
        A tensor holding one pooled vector per input text (the token axis of
        ``last_hidden_state`` is averaged away).
    """
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    token_states = outputs.last_hidden_state
    # Weight by the attention mask so that padding tokens, which appear as soon
    # as a batch holds texts of different lengths, do not dilute the average.
    # For a single string the mask is all ones and this equals a plain mean.
    mask = inputs['attention_mask'].unsqueeze(-1).to(token_states.dtype)
    token_counts = mask.sum(dim=1).clamp(min=1)
    embeddings = (token_states * mask).sum(dim=1) / token_counts
    return embeddings

def calculate_similarity(text1, text2):
    """Return the cosine similarity between the BERT embeddings of two texts.

    Each text is embedded with get_bert_embedding(), converted to a NumPy
    array, and compared with scikit-learn's ``cosine_similarity``; the single
    score is read out of the resulting matrix.
    """
    first_vector, second_vector = (
        get_bert_embedding(passage).numpy() for passage in (text1, text2)
    )
    score_matrix = cosine_similarity(first_vector, second_vector)
    return score_matrix[0][0]

def extract_phone_numbers(text):
    """Extract phone numbers from free text.

    Recognised shapes: ``+91 6380 293 207``, ``(638) 186-6577``,
    ``638-186-6577`` and a bare run of exactly ten digits.

    Args:
        text: Text to scan; ``None`` or an empty string yields no numbers.

    Returns:
        A list of the matched numbers as written in the text. Each distinct
        ten-digit national number appears once, in its most specific written
        form, and the order is deterministic.
    """
    if not text:
        return []

    # Most specific formats first, so that when the same number is matched by
    # several patterns the richest spelling (e.g. with "+91") is the one kept.
    phone_patterns = [
        r'\+91[\s\-.]?\d{4}[\s\-.]?\d{3}[\s\-.]?\d{3}',  # +91 6380 293 207
        r'\(\d{3}\)[\s\-.]?\d{3}[\s\-.]?\d{4}',  # (638) 186-6577
        r'\b\d{3}[-.\s]?\d{3}[-.\s]?\d{4}\b',  # 638-186-6577
        r'\b\d{10}\b',  # exactly 10 digits
    ]

    numbers_by_digits = {}
    for pattern in phone_patterns:
        for match in re.findall(pattern, text):
            digits = re.sub(r'\D', '', match)
            # A "+91" match carries 12 digits; drop the country code before
            # the length check, otherwise every such number is rejected.
            if len(digits) == 12 and digits.startswith('91'):
                digits = digits[2:]
            if len(digits) == 10:
                numbers_by_digits.setdefault(digits, match.strip())
    return list(numbers_by_digits.values())

def extract_name_from_text(text):
    """Guess the candidate's name from resume text.

    Lines are scanned top to bottom. A line is skipped when it contains a
    digit or one of the words "Street", "Avenue" or "Road"; on every other
    line the name patterns are tried in order and the first hit is returned
    with surrounding whitespace removed.

    Returns:
        ``None`` when *text* is ``None``, the matched name when one is found,
        otherwise the string "Name not found".
    """
    if text is None:
        return None

    candidate_patterns = (
        re.compile(r'^[A-Z][a-zA-Z\s\-\.]+$', re.MULTILINE),
        re.compile(r'\b[A-Z][a-zA-Z\s\.\-]+\s+[A-Z][a-zA-Z\s\.\-]+\b', re.MULTILINE),
        re.compile(r'\b[A-Z]+\s+[A-Z]+\.[A-Z]+\b', re.IGNORECASE),
    )
    address_words = ('Street', 'Avenue', 'Road')

    for row in text.split('\n'):
        has_digit = re.search(r'\d', row)
        if has_digit or any(word in row for word in address_words):
            continue
        # Lazily try each pattern and stop at the first one that matches.
        hits = (regex.search(row) for regex in candidate_patterns)
        first_hit = next((hit for hit in hits if hit), None)
        if first_hit is not None:
            return first_hit.group(0).strip()

    return "Name not found"

def extract_emails(text):
    """Return every email address found in *text*, in order of appearance.

    The pattern tolerates whitespace around the "@" and before the top-level
    domain; the captured pieces are glued back together, so an address written
    as "jane @ mail.org" comes back as "jane@mail.org".
    """
    email_pattern = r'([a-zA-Z0-9._%+-]+)(?:\s*|\s*)(@)(?:\s*|\s*)([a-zA-Z0-9.-]+)(?:\s*|\s*)(\.[a-zA-Z]{2,})'
    addresses = []
    for found in re.finditer(email_pattern, text):
        joined = "".join(found.groups())
        addresses.append(joined.replace(" ", ""))
    return addresses

# Skills the extractor recognises, in their canonical spelling.
_KNOWN_SKILLS = (
    "JavaScript", "Python", "Java", "C++", "C", "HTML", "CSS", "React.js",
    "Node.js", "Git", "SQL", "Tableau", "Machine Learning", "Keras",
    "TensorFlow", "Photoshop", "PowerPoint", "Visual Studio", "Premiere Pro",
    "MySQL", "Excel",
)
_SKILL_LOOKUP = {name.lower(): name for name in _KNOWN_SKILLS}


def _skill_alternative(name):
    """Return the regex fragment (with its trailing guard) for one skill."""
    escaped = re.escape(name)
    if name == "C":
        # Bare "C" must not fire on the first letter of "C++" or "C#".
        return escaped + r'(?![\w+#])'
    if name[-1].isalnum():
        return escaped + r'(?!\w)'
    # Names ending in punctuation ("C++") cannot use a word-boundary guard:
    # there is no boundary between "+" and a following space or full stop.
    return escaped


_SKILL_REGEX = re.compile(
    r'(?<!\w)(?:' + '|'.join(_skill_alternative(name) for name in _KNOWN_SKILLS) + r')',
    re.IGNORECASE,
)


def extract_skills(text):
    """Return the known skills mentioned in *text*.

    Matching is case-insensitive, but every hit is reported in its canonical
    spelling ("python" -> "Python"). This keeps the result free of case
    duplicates and makes it comparable across documents.

    Args:
        text: Text to scan; ``None`` or an empty string yields no skills.

    Returns:
        A list of distinct canonical skill names in order of first appearance.
    """
    if not text:
        return []
    canonical_hits = (
        _SKILL_LOOKUP[match.group(0).lower()] for match in _SKILL_REGEX.finditer(text)
    )
    # dict.fromkeys de-duplicates while preserving first-seen order.
    return list(dict.fromkeys(canonical_hits))

def process_pdf(file_path):
    """Return the text of every page of a PDF, one page per line group.

    Args:
        file_path: Path of the PDF to read with pdfplumber.

    Returns:
        The text of all pages concatenated, each page followed by a newline.
        Pages without extractable text contribute only the newline.
    """
    with pdfplumber.open(file_path) as pdf:
        # Depending on the pdfplumber version, extract_text() can return None
        # for a page with no text layer (e.g. a scanned page); fall back to ""
        # so concatenation never raises TypeError.
        page_texts = [page.extract_text() or "" for page in pdf.pages]
    return "".join(page_text + "\n" for page_text in page_texts)

def process_image(file_path):
    """Return the text that Tesseract OCR recognises in an image file.

    Args:
        file_path: Path of the image to open with Pillow.

    Returns:
        The string produced by ``pytesseract.image_to_string``.
    """
    # The context manager closes the underlying file handle once OCR is done;
    # otherwise one handle stays open per processed image.
    with Image.open(file_path) as image:
        return pytesseract.image_to_string(image)

def generate_suggestions(extracted_skills):
    """Collect improvement tips and candidate roles for the given skills.

    Skills are looked up in SKILL_IMPROVEMENT_MAPPING without regard to case,
    because skill extraction is case-insensitive and may hand over "python"
    where the mapping is keyed "Python". Skills absent from the mapping are
    ignored.

    Args:
        extracted_skills: Iterable of skill-name strings.

    Returns:
        A ``(improvement_suggestions, suggested_roles)`` tuple of two sets of
        strings.
    """
    mapping_by_key = {
        name.casefold(): entry for name, entry in SKILL_IMPROVEMENT_MAPPING.items()
    }
    improvement_suggestions = set()
    suggested_roles = set()
    for skill in extracted_skills:
        entry = mapping_by_key.get(skill.casefold())
        if entry is None:
            continue
        improvement_suggestions.update(entry["improvements"])
        suggested_roles.update(entry["roles"])
    return improvement_suggestions, suggested_roles

# File extensions the resume pipeline knows how to read.
_PDF_EXTENSIONS = (".pdf",)
_IMAGE_EXTENSIONS = (".jpg", ".jpeg", ".png")


def _email_section(heading, items):
    """Return body lines for a labelled, comma-separated list (nothing if empty)."""
    if not items:
        return []
    # Sorted because callers pass sets; keeps the email text deterministic.
    return [heading, ', '.join(sorted(items)), ""]


def _build_email_body(candidate_name, match_percentage, improvement_suggestions, suggested_roles, job_title, matched_skills, selected):
    """Compose the plain-text shortlist (selected) or rejection message."""
    # extract_name_from_text() reports a missing name with this sentinel
    # string; never greet a candidate with it.
    has_name = candidate_name and candidate_name != "Name not found"
    lines = [f"Dear {candidate_name if has_name else 'Candidate'},", ""]

    if selected:
        lines += [
            f"Congratulations! Based on your resume and our job description for the role of {job_title}, we are pleased to inform you that you have been shortlisted for the next round of our hiring process.",
            "",
            f"Match Percentage: {match_percentage * 100:.2f}%",
            "",
        ]
        if matched_skills is not None:
            lines += _email_section("Skills you have in common with the job description:", matched_skills)
        else:
            # Callers that do not pass matched skills still get the roles
            # list, but under a heading that says what it actually is.
            lines += _email_section("Roles that fit your profile:", suggested_roles)
        lines += [
            "We were impressed with your background and look forward to discussing your qualifications further.",
            "",
        ]
    else:
        lines += [
            f"Thank you for applying for the {job_title} position at [Company Name].",
            "",
            "After reviewing your resume, unfortunately, we cannot move forward with your application at this time. However, we encourage you to enhance your skill set for future opportunities.",
            "",
            f"Your Match Percentage: {match_percentage * 100:.2f}%",
            "",
        ]
        lines += _email_section("Suggestions for improvement:", improvement_suggestions)
        lines += _email_section("Suggested roles for you to explore:", suggested_roles)
        lines += [
            "We appreciate your interest and wish you all the best in your career growth.",
            "",
        ]

    lines += ["Best regards,", "[Company Name]"]
    return "\n".join(lines) + "\n"


def send_email(recipient_email, candidate_name, match_percentage, improvement_suggestions, suggested_roles, job_title, matched_skills=None, selected=None):
    """Email a candidate their shortlist or rejection notice via Gmail SMTP.

    Args:
        recipient_email: Address the message is sent to.
        candidate_name: Name used in the greeting.
        match_percentage: Match score as a fraction (0.0-1.0); shown as a
            percentage in the message.
        improvement_suggestions: Iterable of advice strings (rejection only).
        suggested_roles: Iterable of role names.
        job_title: Title of the role applied for.
        matched_skills: Optional iterable of skills shared with the job
            description, listed in the shortlist message.
        selected: Whether to send the shortlist message. When ``None`` the
            decision falls back to ``match_percentage >= 0.5``; callers with
            their own threshold should pass it so the email agrees with the
            recorded status.

    Returns:
        "Yes" when the message was handed to the SMTP server, "No" otherwise.
    """
    # Credentials may be supplied through the environment so that real secrets
    # need not be written into the source; the placeholders remain as fallback.
    sender_email = os.environ.get("SENDER_EMAIL", "your_email@gmail.com")
    sender_password = os.environ.get("SENDER_PASSWORD", "your_email_password")

    if selected is None:
        selected = match_percentage >= 0.5

    body = _build_email_body(
        candidate_name,
        match_percentage,
        improvement_suggestions,
        suggested_roles,
        job_title,
        matched_skills,
        selected,
    )

    msg = MIMEText(body)
    msg['Subject'] = "Job Application Update - Interview Status"
    msg['From'] = sender_email
    msg['To'] = recipient_email

    try:
        with smtplib.SMTP_SSL('smtp.gmail.com', 465) as server:
            server.login(sender_email, sender_password)
            server.sendmail(sender_email, recipient_email, msg.as_string())
    except Exception as e:
        # Deliberately broad: one undeliverable message must not abort a whole
        # batch run; the failure is reported and recorded as "No".
        print(f"Failed to send email to {recipient_email}: {e}")
        return "No"  # If email failed
    print(f"Email sent to {recipient_email} successfully!")
    return "Yes"  # If email sent successfully


def extract_skills_from_job_description(job_description_file):
    """Return the skills mentioned in a plain-text job description file.

    Args:
        job_description_file: Path of the text file to read.

    Returns:
        The result of extract_skills() on the file's text, or an empty list
        (after printing an error) when the file does not exist.
    """
    if not os.path.isfile(job_description_file):
        print(f"Error: The file '{job_description_file}' does not exist!")
        return []
    # Explicit encoding so the result does not depend on the platform default;
    # undecodable bytes are replaced rather than aborting the run.
    with open(job_description_file, 'r', encoding='utf-8', errors='replace') as file:
        job_description_text = file.read()
    return extract_skills(job_description_text)


def match_skills(resume_skills, job_skills):
    """Compare resume skills with job skills, ignoring letter case.

    Args:
        resume_skills: Iterable of skill names found in the resume.
        job_skills: Iterable of skill names required by the job.

    Returns:
        ``(common_skills, match_percentage)``: the set of shared skills,
        spelled as in *job_skills*, and the share of distinct job skills
        covered, on a 0-100 scale (0 when there are no job skills).
    """
    # Key by case-folded name so "python" and "Python" count as one skill and
    # duplicates in the job list do not inflate the denominator.
    job_by_key = {}
    for skill in job_skills:
        job_by_key.setdefault(skill.casefold(), skill)
    if not job_by_key:
        return set(), 0

    resume_keys = {skill.casefold() for skill in resume_skills}
    common_skills = {name for key, name in job_by_key.items() if key in resume_keys}
    match_percentage = (len(common_skills) / len(job_by_key)) * 100
    return common_skills, match_percentage


def extract_resume_details(resume_path):
    """Read one resume file and pull out contact details and skills.

    PDFs go through process_pdf(), JPEG/PNG images through process_image().

    Returns:
        A dict with the keys "Name", "Emails", "Skills" and "Phone Numbers",
        or ``None`` when the file extension is not supported.
    """
    file_extension = os.path.splitext(resume_path)[1].lower()
    if file_extension in _PDF_EXTENSIONS:
        text = process_pdf(resume_path)
    elif file_extension in _IMAGE_EXTENSIONS:
        text = process_image(resume_path)
    else:
        return None

    return {
        "Name": extract_name_from_text(text),
        "Emails": extract_emails(text),
        "Skills": extract_skills(text),
        "Phone Numbers": extract_phone_numbers(text),
    }


def _list_resume_files(resume_folder_path):
    """Return the supported resume files directly inside the folder, sorted."""
    # glob.escape keeps folder names containing "[", "]", "*" or "?" from
    # being read as wildcards. The extension test is case-insensitive so
    # "CV.PDF" is picked up on case-sensitive file systems as well.
    pattern = os.path.join(glob.escape(resume_folder_path), "*")
    supported = _PDF_EXTENSIONS + _IMAGE_EXTENSIONS
    return sorted(
        path for path in glob.glob(pattern)
        if os.path.isfile(path) and os.path.splitext(path)[1].lower() in supported
    )


def process_resumes(resume_folder_path, job_description_file, threshold=50, job_title="Data Scientist", output_path='resume_matching_results.xlsx'):
    """Score every resume in a folder against a job description.

    Each resume is matched on skills, the candidate is emailed (when an
    address was found), and the outcome is written to an Excel workbook with
    a "Selected" and a "Non Selected" sheet.

    Args:
        resume_folder_path: Folder holding the PDF/JPEG/PNG resumes.
        job_description_file: Path of the plain-text job description.
        threshold: Minimum match percentage (0-100) for selection.
        job_title: Role name used in the emails.
        output_path: Where the Excel workbook is written.

    Returns:
        ``None``. Nothing is written when the job description yields no
        skills.
    """
    job_skills = extract_skills_from_job_description(job_description_file)
    if not job_skills:
        print("No skills found in the job description; nothing to match.")
        return

    selected_results = []  # Rows for selected candidates
    non_selected_results = []  # Rows for non-selected candidates

    for resume_path in _list_resume_files(resume_folder_path):
        resume_details = extract_resume_details(resume_path)
        if not resume_details:
            continue

        candidate_name = resume_details['Name']
        extracted_skills = resume_details['Skills']
        recipient_email = resume_details['Emails'][0] if resume_details['Emails'] else None

        matched_skills, match_percentage = match_skills(extracted_skills, job_skills)
        improvement_suggestions, suggested_roles = generate_suggestions(extracted_skills)

        print(f"Processing resume: {resume_path}")
        print(f"Match Percentage: {match_percentage:.2f}%")

        is_selected = match_percentage >= threshold
        if recipient_email:
            # Pass the decision along so the email sent always agrees with the
            # status recorded below, whatever threshold the user chose.
            email_sent = send_email(
                recipient_email,
                candidate_name,
                match_percentage / 100,
                improvement_suggestions,
                suggested_roles,
                job_title=job_title,
                matched_skills=matched_skills,
                selected=is_selected,
            )
        else:
            email_sent = "No Email"

        row = {
            'Resume Name': resume_path,
            'Candidate Name': candidate_name,
            'Match Percentage': match_percentage,
            'Status': "Selected" if is_selected else "Not Selected",
            'Skills Matched': ', '.join(sorted(matched_skills)),
            'Improvement Suggestions': ', '.join(sorted(improvement_suggestions)),
            'Suggested Roles': ', '.join(sorted(suggested_roles)),
            'Email Sent': email_sent
        }
        (selected_results if is_selected else non_selected_results).append(row)

        print("----------")  # Separator for clarity

    # Save the results to an Excel file with two sheets
    with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
        pd.DataFrame(selected_results).to_excel(writer, sheet_name='Selected', index=False)
        pd.DataFrame(non_selected_results).to_excel(writer, sheet_name='Non Selected', index=False)

    print(f"Results saved to {output_path}")

# User input for the paths and threshold
# Guarded so that importing this module (e.g. to reuse the helper functions)
# does not prompt for input or start a run; executing it as a script or as a
# notebook cell behaves as before.
if __name__ == "__main__":
    # Pasted paths often carry surrounding whitespace or quotes; strip them so
    # the file/folder lookups do not fail on an otherwise valid path.
    job_description_file = input("Enter the path for the job description file (e.g., job_description.txt): ").strip().strip('"\'')
    resume_folder = input("Enter the path for the resume folder: ").strip().strip('"\'')
    threshold = float(input("Enter the match percentage threshold (e.g., 50): "))

    process_resumes(resume_folder, job_description_file, threshold)

# Output: Results saved to resume_matching_results.xlsx
