In [1]:
import pdfplumber
from PIL import Image
import pytesseract
import io
import os


In [15]:


def extract_text_from_pdf_bytes(pdf_path: bytes)->str:
    """Extract the text of a PDF supplied as raw bytes.

    Every page is first read with ``page.extract_text()``. When a page yields
    no text, the page is rendered with ``page.to_image(resolution=300)`` and
    the resulting image is passed to ``pytesseract.image_to_string`` instead.

    Args:
        pdf_path: The raw bytes of the PDF. Despite its name this is the file
            content, not a filesystem path; the name is kept so existing
            keyword callers keep working.

    Returns:
        The per-page texts joined with newlines and stripped of surrounding
        whitespace. An empty string is returned for empty input or when no
        page produced any text.

    Warns:
        RuntimeWarning: when the OCR fallback fails for a page. The page is
            skipped and the remaining pages are still processed.
    """
    # Local import: keeps this cell self-contained without touching the
    # notebook's import cell.
    import warnings

    # Nothing to parse; avoid handing an empty stream to the PDF parser.
    if not pdf_path:
        return ''

    text=[]
    with pdfplumber.open(io.BytesIO(pdf_path)) as pdf:
        for page_number, page in enumerate(pdf.pages, start=1):
            ptext=page.extract_text()
            if ptext:
                text.append(ptext)
                continue

            # No embedded text on this page: rasterize it and try OCR.
            try:
                im=page.to_image(resolution=300).original
                ocr_text=pytesseract.image_to_string(im)
            except Exception as e:
                # Best-effort: one bad page must not abort the whole document,
                # but the failure should be visible rather than silently lost.
                warnings.warn(
                    f"OCR fallback failed on page {page_number}: {e!r}",
                    RuntimeWarning,
                    stacklevel=2,
                )
                continue
            if ocr_text:
                text.append(ocr_text)

    return '\n'.join(text).strip()

In [None]:
# Read the sample PDF inside a context manager so the file handle is closed
# promptly, then end the cell with the call so its return value is displayed.
with open('pdf_src/Sanwal_Diya_influencer.pdf', 'rb') as sample_pdf:
    sample_pdf_bytes = sample_pdf.read()
extract_text_from_pdf_bytes(sample_pdf_bytes)

'DIYA SANWAL\nNainital | 7820028492 | Gmail | LinkedIn\nPROFESSIONAL SUMMARY\nPassionate and data-driven Digital Marketing Specialist with a Master’s Certification, skilled in crafting engaging strategies,\nuser-friendly websites, and AI-powered growth solutions. Backed by 2 years of experience in healthcare analytics at R1\nRCM, I bring a unique blend of creativity and analytical thinking. Proven success across industries like e-commerce,\nhospitality, media, with expertise in SEO, social media, and campaign execution. Committed to continuous learning and\ndelivering impactful digital outcomes.\nEXPERIENCE\nINFLUENCER MARKETING EXECUTIVE | R1RCM | Sep 2023 – Apr 2025\n● Analysed influencer partnership data to ensure compliance with marketing regulations (e.g., FTC guidelines, platform policies),\nreducing campaign discrepancies by 20% through accurate verification and disclosure tracking.\n● Analysed campaign performance metrics (reach, engagement rate, conversions) to refine influenc

In [3]:
from src.pdf_extractor import extract_text_from_pdf



ModuleNotFoundError: No module named 'src'

In [None]:

from src.utils.validators import validate_pdf_content,validate_pdf_file
# Quick check: open the sample PDF in binary mode and print the file object's repr.
with open('pdf_src/Sanwal_Diya_influencer.pdf', 'rb') as pdf_file:
    print(pdf_file)

<_io.BufferedReader name='pdf_src/Sanwal_Diya_influencer.pdf'>


In [None]:
# Replace/overwrite the previous function with a more robust implementation.
class _OcrUnavailableError(RuntimeError):
    """OCR cannot run at all here (as opposed to failing on a single page)."""


def extract_text_from_pdf_bytes(pdf_path: bytes) -> str:
    """
    Robust PDF -> text extractor. Accepts raw PDF bytes and:
      - tries pdfplumber.extract_text()
      - falls back to OCR via pytesseract on a rasterized page image

    Args:
        pdf_path: Raw bytes of the PDF. Despite its name this is the file
            content, not a filesystem path; the name is kept for existing
            callers.

    Returns:
        The stripped text of every page that produced any, joined with
        newlines. An empty string is returned when ``pdf_path`` is empty or
        when no page yielded text.

    Raises:
        RuntimeError: if the PDF cannot be opened or processed, or if a page
            needs OCR and Tesseract is not installed / pytesseract is not
            usable. Other per-page rasterization or OCR failures are skipped.
    """
    if not pdf_path:
        return ""

    text_parts = []
    try:
        with pdfplumber.open(io.BytesIO(pdf_path)) as pdf:
            for page in pdf.pages:
                try:
                    ptext = page.extract_text()
                except Exception:
                    # Treat a failed text extraction like an empty page so the
                    # OCR fallback below still gets a chance.
                    ptext = None

                if ptext and ptext.strip():
                    text_parts.append(ptext.strip())
                    continue

                # If no text, rasterize page and OCR
                try:
                    page_image = page.to_image(resolution=300).original
                    if page_image.mode != "RGB":
                        page_image = page_image.convert("RGB")
                except Exception:
                    # If rasterization fails for this page, skip it and continue with others
                    continue

                # The OCR call lives outside the rasterization try-block so
                # that "OCR is unavailable" errors are not swallowed by the
                # per-page skip logic.
                try:
                    ocr_text = pytesseract.image_to_string(page_image)
                except AttributeError as exc:
                    # Older pytesseract versions might raise different errors; re-raise as informative
                    raise _OcrUnavailableError(
                        "pytesseract is not usable in this environment."
                    ) from exc
                except pytesseract.pytesseract.TesseractNotFoundError as exc:
                    raise _OcrUnavailableError(
                        "Tesseract executable not found. Install Tesseract OCR or set "
                        "pytesseract.pytesseract.tesseract_cmd to the tesseract binary path."
                    ) from exc
                except Exception:
                    # Any other OCR failure is page-specific: skip this page.
                    continue

                if ocr_text and ocr_text.strip():
                    text_parts.append(ocr_text.strip())
    except _OcrUnavailableError:
        # Already a clear, caller-facing message; do not re-wrap it.
        raise
    except Exception as e:
        # Bubble up a clear error for file-level problems
        raise RuntimeError(f"Failed to open/process PDF bytes: {e}") from e

    return "\n".join(text_parts).strip()


# Example usage: the definition above replaces the earlier extract_text_from_pdf_bytes.
# The file is read inside a context manager so the handle is closed promptly.
with open("pdf_src/Sanwal_Diya_influencer.pdf", "rb") as pdf_file:
    pdf_bytes = pdf_file.read()
print(extract_text_from_pdf_bytes(pdf_bytes)[:1000])  # print first 1000 chars for quick check