## Block 2: extract_text(file_path)

In [19]:
# ✅ Core imports for resume file reading and text extraction
import os
import textract
from PyPDF2 import PdfReader
import docx

In [20]:
# ✅ Main function to extract text based on file extension
def extract_text(file_path):
    """
    Extracts plain text from a given resume file (.pdf or .docx).
    Returns the extracted text as a string.
    """
    ext = os.path.splitext(file_path)[-1].lower()

    if ext == '.pdf':
        return extract_from_pdf(file_path)
    elif ext == '.docx':
        return extract_from_docx(file_path)
    else:
        raise ValueError(f"Unsupported file type: {ext}")

## Block 3 & 4: PDF and DOCX extractors

In [21]:
# ✅ Extract text from PDF files using PyPDF2
def extract_from_pdf(file_path):
    """
    Extracts text from a PDF using PyPDF2, one page at a time.
    """
    try:
        reader = PdfReader(file_path)
        text = ''
        for page in reader.pages:
            text += page.extract_text() or ''
        return text.strip()
    except Exception as e:
        print(f"[ERROR] PDF parsing failed: {e}")
        return ""

In [22]:
# ✅ Extract text from DOCX files using python-docx
def extract_from_docx(file_path):
    """
    Extracts text from a DOCX file by reading all paragraphs.
    """
    try:
        doc = docx.Document(file_path)
        return "\n".join([para.text for para in doc.paragraphs])
    except Exception as e:
        print(f"[ERROR] DOCX parsing failed: {e}")
        return ""

In [23]:
##Block 5: extract_with_textract

# ✅ Fallback: Textract handles many file types and OCR 
def extract_with_textract(file_path):
    """
    Uses textract to extract text from any file (supports OCR).
    """
    try:
        return textract.process(file_path).decode('utf-8')
    except Exception as e:
        print(f"[ERROR] Textract failed: {e}")
        return ""

## Block 6: Dynamic Resume Loader

In [24]:
import os

# Base folder where all domain folders are stored
base_resume_dir = '../data/raw_resumes/resume-dataset/data/data/'

# Walk through each domain folder
domain_resume_paths = []

for domain in os.listdir(base_resume_dir):
    domain_path = os.path.join(base_resume_dir, domain)
    if os.path.isdir(domain_path):
        # ✅ Sort file names alphabetically for consistent output
        pdfs = sorted([f for f in os.listdir(domain_path) if f.endswith('.pdf')])
        if pdfs:
            domain_resume_paths.append(os.path.join(domain_path, pdfs[0]))

# Preview what you found
print(f"✅ Found {len(domain_resume_paths)} sample resumes from {len(os.listdir(base_resume_dir))} domains")

# Extract text from the first resume found
if domain_resume_paths:
    text = extract_text(domain_resume_paths[0])
    print(f"🧾 Preview from: {domain_resume_paths[0]}")
    print("\n" + "-"*40)
    print(text[:1000])
else:
    print("No resumes found across domains.")

#Automatically walks through 25 domains
#Picks 1 PDF per domain for testing
#Handles missing or empty folders
#Displays count and file path clearly

✅ Found 24 sample resumes from 25 domains
🧾 Preview from: ../data/raw_resumes/resume-dataset/data/data/AGRICULTURE/10953078.pdf

----------------------------------------
RN STAFF NURSE
Professional Experience
RN Staff Nurse
 
August 2008
 
to 
April 2014
 
Company Name
 
ï¼​ 
City
 
, 
State
Participate in multidisciplinary plan of care.
Follow Best Practice Protocols, Evidence based practice as well as patient-centered and team-based care.
Identify barriers to successful treatment and share with team.
Monitor,measure and report/document progress of interventions and outcomes.
Timely and effective communication of newly implemented changes to unit based methods of practice.
Accomplishments Management of Aggressive Behavior (MOAB) in house Certified Instructor, Patient Experience Champion, Shared
Governance Chair of Policy & Procedure Committee, Oncology Certification, Implementation of unit Bedside Report and hourly rounding.
Skills Used Therapeutic touch/communication, teach back comm

In [26]:
# ✅ Save the last extracted resume's text to a file for parsing
output_path = '../data/parsed_output/sample_resume.txt'
os.makedirs(os.path.dirname(output_path), exist_ok=True)

with open(output_path, "w", encoding="utf-8") as f:
    f.write(text)

print(f"✅ Resume text saved to: {output_path}")

✅ Resume text saved to: ../data/parsed_output/sample_resume.txt
