Import Libraries

In [1]:
import pdfplumber
import docx
import os

Define Extraction Functions

In [2]:
def extract_pdf_text(file_path):
    """Return the text of every page of a PDF, joined by single spaces.

    Pages for which ``extract_text()`` yields nothing (``None`` or ``''``)
    are skipped. Leading and trailing whitespace is trimmed from the result.

    Args:
        file_path: Path of the PDF handed to ``pdfplumber.open``.

    Returns:
        The combined page text, or ``''`` if no page produced any text.
        If anything raises, the exception is not propagated; instead the
        string ``"Error: <message>"`` is returned.
    """
    try:
        with pdfplumber.open(file_path) as pdf:
            page_texts = (page.extract_text() for page in pdf.pages)
            # Falsy results (None / empty string) are dropped before joining.
            combined = ' '.join(chunk for chunk in page_texts if chunk)
        return combined.strip()
    except Exception as exc:
        # Failures are reported in-band as a string so batch runs keep going.
        return f"Error: {exc!s}"

def extract_docx_text(file_path):
    """Return the paragraph text of a DOCX file, joined by single spaces.

    Only ``doc.paragraphs`` is read. Paragraphs whose text is empty or
    whitespace-only are skipped, and the final result is stripped of
    leading and trailing whitespace.

    Args:
        file_path: Path of the document handed to ``docx.Document``.

    Returns:
        The combined paragraph text, or ``''`` if no paragraph had content.
        If anything raises, the exception is not propagated; instead the
        string ``"Error: <message>"`` is returned.
    """
    try:
        document = docx.Document(file_path)
        kept = [para.text for para in document.paragraphs if para.text.strip()]
        return ' '.join(kept).strip()
    except Exception as exc:
        # Failures are reported in-band as a string so batch runs keep going.
        return f"Error: {exc!s}"

Test with Single Files

In [17]:
# Smoke test: run each extractor on one known sample before the batch run.
sample_pdf, sample_docx = "../uploads/job1.pdf", "../uploads/job5.docx"

pdf_text = extract_pdf_text(sample_pdf)
docx_text = extract_docx_text(sample_docx)

# Print both results with the same "<label> <text>" layout.
for label, content in (("PDF Text:", pdf_text), ("DOCX Text:", docx_text)):
    print(label, content)

CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox


PDF Text: Job 1: Legal Analyst position requiring 2 years of experience in contract law and negotiation skills.
DOCX Text: Job 5: Attorney with 1 years of practice in intellectual property law and client counseling.


Test with Multiple Files

In [None]:
# Build the list of sample files: job1..job20 in both PDF and DOCX form.
upload_folder = "../uploads/"
pdf_files = [f"{upload_folder}job{i}.pdf" for i in range(1, 21)]
docx_files = [f"{upload_folder}job{i}.docx" for i in range(1, 21)]
all_files = pdf_files + docx_files

# One record per processed file: {'filename': <basename>, 'text': <result>}.
# 'text' is either the extracted text or an "Error: ..." string returned by
# the extractor.
extracted_texts = []
for file_path in all_files:
    # Pick the extractor from the file suffix.
    if file_path.endswith('.pdf'):
        extractor = extract_pdf_text
    elif file_path.endswith('.docx'):
        extractor = extract_docx_text
    else:
        # Skip explicitly: otherwise the progress line below would report a
        # stale `text` from the previous iteration (or raise NameError on
        # the first one).
        print(f"Skipped unsupported file type: {file_path}")
        continue

    text = extractor(file_path)
    extracted_texts.append({'filename': os.path.basename(file_path), 'text': text})
    print(f"Extracted from {file_path}: {text[:50]}...")

Analyze Results

In [10]:
# Summarise the batch run collected in `extracted_texts`.
total_files = len(extracted_texts)

# A record counts as successful when its text is non-empty and does not start
# with 'Error' (the extractors return failures as "Error: ..." strings).
successful_extractions = len([
    record for record in extracted_texts
    if record['text'] and not record['text'].startswith('Error')
])

print(f"Total Files Processed: {total_files}")
print(f"Successful Extractions: {successful_extractions}")
print("Sample Extracted Texts:", extracted_texts[:5])

Total Files Processed: 40
Successful Extractions: 40
Sample Extracted Texts: [{'filename': 'job1.pdf', 'text': 'Job 1: Legal Analyst position requiring 2 years of experience in contract law and negotiation skills.'}, {'filename': 'job2.pdf', 'text': 'Job 2: Paralegal role with expertise in legal research and 3 years of drafting contracts.'}, {'filename': 'job3.pdf', 'text': 'Job 3: Corporate Lawyer needed with 4 years in compliance and litigation management.'}, {'filename': 'job4.pdf', 'text': 'Job 4: Legal Assistant position offering 5 years of experience in case preparation and analysis.'}, {'filename': 'job5.pdf', 'text': 'Job 5: Attorney with 1 years of practice in intellectual property law and client counseling.'}]
