In [8]:
pip install -r requirements.txt




In [9]:
!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0-py3-none-any.whl (400.7 MB)
     ---------------------------------------- 0.0/400.7 MB ? eta -:--:--
     ---------------------------------------- 0.3/400.7 MB ? eta -:--:--
      -------------------------------------- 5.2/400.7 MB 21.3 MB/s eta 0:00:19
     --- ---------------------------------- 39.1/400.7 MB 88.8 MB/s eta 0:00:05
     ------- ----------------------------- 82.1/400.7 MB 127.8 MB/s eta 0:00:03
     ----------- ------------------------ 124.3/400.7 MB 147.1 MB/s eta 0:00:02
     -------------- --------------------- 166.7/400.7 MB 159.0 MB/s eta 0:00:02
     ------------------ ----------------- 208.4/400.7 MB 166.5 MB/s eta 0:00:02
     ---------------------- ------------- 248.5/400.7 MB 170.9 MB/s eta 0:00:01
     -------------------------- --------- 292.3/400.7 MB 206.9 MB/s eta 0:00:01
     ------------------------------ -

In [10]:
import re
import os
import io  # Added for potential memory issues with large files
import docx
import pdfminer.high_level  # Explicit import for high-level functions

# Optional: Install spaCy's English model for better name recognition.  Large model recommended.
# python -m spacy download en_core_web_lg  # Or en_core_web_trf for transformer-based, even better
import spacy  #  Requires: pip install spacy

# Added for robustness
from typing import List, Optional

# Requires: pip install pdfminer.six python-docx spacy
def extract_information_from_resume(resume_path: str, use_spacy: bool = True) -> dict:
    """
    Extracts personal information (name, email, phone number) from a resume file (PDF or DOCX).

    Args:
        resume_path (str): The path to the resume file.
        use_spacy (bool, optional): Whether to use spaCy for name extraction. Defaults to True.

    Returns:
        dict: A dictionary containing the extracted information. Keys are:
              'name', 'email', 'phone', 'filepath'.  If a piece of info is not found, its value will be None.
    """

    extracted_data = {
        'name': None,
        'email': None,
        'phone': None,
        'filepath': resume_path
    }

    try:
        text = extract_text_from_resume(resume_path)
        if text:
            extracted_data['email'] = extract_email(text)
            extracted_data['phone'] = extract_phone_number(text)

            if use_spacy:
                extracted_data['name'] = extract_name_spacy(text)
            else:
                extracted_data['name'] = extract_name_regex(text) # Fallback if spaCy not used
        else:
            print(f"Warning: Could not extract text from resume at {resume_path}")

    except Exception as e:
        print(f"Error processing {resume_path}: {e}")

    return extracted_data



def extract_text_from_resume(resume_path: str) -> Optional[str]:
    """
    Extracts text from a resume file (PDF or DOCX).

    Args:
        resume_path (str): The path to the resume file.

    Returns:
        Optional[str]: The extracted text, or None if an error occurred.
    """
    try:
        if resume_path.endswith('.pdf'):
            return extract_text_from_pdf(resume_path)
        elif resume_path.endswith('.docx'):
            return extract_text_from_docx(resume_path)
        else:
            print(f"Unsupported file format: {resume_path}")
            return None
    except Exception as e:
        print(f"Error extracting text from {resume_path}: {e}")
        return None


def extract_text_from_pdf(pdf_path: str) -> Optional[str]:
    """Extracts text from a PDF file using pdfminer.six.

    Args:
        pdf_path (str): The path to the PDF file.

    Returns:
        Optional[str]: The extracted text, or None if an error occurred.  Handles potential memory issues.
    """
    try:
        with open(pdf_path, 'rb') as file:
            text = pdfminer.high_level.extract_text(file)
            return text
    except Exception as e:
        print(f"Error extracting text from PDF {pdf_path}: {e}")
        return None


def extract_text_from_docx(docx_path: str) -> Optional[str]:
    """Extracts text from a DOCX file using python-docx.

    Args:
        docx_path (str): The path to the DOCX file.

    Returns:
        Optional[str]: The extracted text, or None if an error occurred.
    """
    try:
        doc = docx.Document(docx_path)
        full_text = []
        for paragraph in doc.paragraphs:
            full_text.append(paragraph.text)
        return '\n'.join(full_text)
    except Exception as e:
        print(f"Error extracting text from DOCX {docx_path}: {e}")
        return None


def extract_name_regex(text: str) -> Optional[str]:
    """Extracts a name from text using regular expressions.  Simple heuristic.

    Args:
        text (str): The text to extract from.

    Returns:
        Optional[str]: The extracted name, or None if not found.
    """
    name = None
    name_pattern = r"([A-Z][a-z]+)\s+([A-Z][a-z]+)"  # Matches "First Last"

    match = re.search(name_pattern, text)
    if match:
        name = match.group()  # Return the full match
    return name


def extract_name_spacy(text: str) -> Optional[str]:
    """Extracts a name from text using spaCy's Named Entity Recognition.  Requires `spacy` and an English model.

    Args:
        text (str): The text to extract from.

    Returns:
        Optional[str]: The extracted name, or None if not found.
    """
    try:
        nlp = spacy.load("en_core_web_lg")  # Or a similar model
        doc = nlp(text)
        for ent in doc.ents:
            if ent.label_ == "PERSON":
                return ent.text
        return None  # No PERSON entity found
    except OSError:
        print("Error: spaCy model (en_core_web_lg) not found. Please download it using: python -m spacy download en_core_web_lg")
        return None # Or raise the exception if you want the program to stop.
    except Exception as e:
        print(f"Error using spaCy for name extraction: {e}")
        return None


def extract_email(text: str) -> Optional[str]:
    """Extracts an email address from text using regular expressions.

    Args:
        text (str): The text to extract from.

    Returns:
        Optional[str]: The extracted email address, or None if not found.
    """
    email = None
    email_pattern = r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"  # More robust email regex
    match = re.search(email_pattern, text)
    if match:
        email = match.group()
    return email


def extract_phone_number(text: str) -> Optional[str]:
    """Extracts a phone number from text using regular expressions.

    Args:
        text (str): The text to extract from.

    Returns:
        Optional[str]: The extracted phone number, or None if not found.
    """
    phone = None
    phone_pattern = r"\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}"  # Handles various phone number formats
    match = re.search(phone_pattern, text)
    if match:
        phone = match.group()
    return phone


def process_directory(directory_path: str, use_spacy: bool = True) -> List[dict]:
    """Processes all PDF and DOCX files in a directory and returns a list of extracted information.

    Args:
        directory_path (str): The path to the directory containing resume files.
        use_spacy (bool, optional): Whether to use spaCy for name extraction. Defaults to True.

    Returns:
        List[dict]: A list of dictionaries, where each dictionary contains the extracted information from a resume.
    """
    results = []
    for filename in os.listdir(directory_path):
        if filename.endswith(('.pdf', '.docx')):
            filepath = os.path.join(directory_path, filename)
            data = extract_information_from_resume(filepath, use_spacy)
            results.append(data)
    return results


if __name__ == '__main__':
    # Example usage:

    # 1. Process a single resume file:
    resume_file = "Abhishek Dasandhi CV.pdf"  # Replace with your file
    '''
    # Create a dummy PDF file for testing purposes
    with open(resume_file, "w") as f:
        f.write("John Doe\njohn.doe@example.com\n(123) 456-7890")
    extracted_info = extract_information_from_resume(resume_file)
    print("Extracted information from single file:", extracted_info)

    # 2. Process all resumes in a directory:
    resume_directory = "resumes"  # Replace with your directory. Create the directory if it doesn't exist!
    if not os.path.exists(resume_directory):
      os.makedirs(resume_directory) # create the directory if it doesn't exist.
    # Create a dummy PDF file for testing purposes
    resume_file_2 = "resumes/example_resume2.pdf"
    with open(resume_file_2, "w") as f:
        f.write("Jane Smith\njane.smith@example.com\n(987) 654-3210")

    extracted_info_list = process_directory(resume_directory)
    print("Extracted information from directory:")
    for info in extracted_info_list:
        print(info)
    '''
    # Example with spaCy disabled:
    extracted_info = extract_information_from_resume(resume_file, use_spacy=False)
    print("Extracted information (no spaCy):", extracted_info)
    extracted_info = extract_information_from_resume(resume_file, use_spacy=True)
    print("Extracted information (with spaCy):", extracted_info)

Extracted information (no spaCy): {'name': 'Abhishek \nDasandhi', 'email': 'shyamaldasandhi@gmail.com', 'phone': '9646920015', 'filepath': 'Abhishek Dasandhi CV.pdf'}
Extracted information (with spaCy): {'name': 'Postman', 'email': 'shyamaldasandhi@gmail.com', 'phone': '9646920015', 'filepath': 'Abhishek Dasandhi CV.pdf'}
