In [3]:
import PyPDF2
import pdfplumber
import fitz

In [7]:
def _extract_text_from_pdf(file_path):
        """Extract text from PDF file using pdfplumber for better text extraction"""
        text = ""
        try:
            # Use pdfplumber for better formatting preservation
            with pdfplumber.open(file_path) as pdf:
                for page in pdf.pages:
                    page_text = page.extract_text(x_tolerance=3, y_tolerance=3)
                    if page_text:
                        text += page_text + "\n"
                    
            # If pdfplumber didn't get good results, try PyPDF2 as fallback
            if not text.strip():
                with open(file_path, 'rb') as file:
                    pdf_reader = PyPDF2.PdfReader(file)
                    for page_num in range(len(pdf_reader.pages)):
                        text += pdf_reader.pages[page_num].extract_text() + "\n"
        except Exception as e:
            print(f"Error extracting text from PDF: {str(e)}")
        
        # print(text)
        return text

    
def extract_hyperlinks_from_pdf(file_path):
    """Collect the URI hyperlinks of a PDF together with their visible text.

    The document is opened with PyMuPDF (``fitz``) and is always closed again,
    even when an error occurs while pages or links are being processed.

    Args:
        file_path: Path of the PDF file to inspect.

    Returns:
        list[dict]: One ``{'text': ..., 'url': ...}`` entry per link that has
        a ``"uri"`` value, in page order. ``'text'`` is the space-joined words
        found inside the link rectangle, or ``""`` when the link has no
        rectangle or no words in it. Errors are printed rather than raised;
        links gathered before the error are still returned.
    """
    hyperlinks = []

    try:
        doc = fitz.open(file_path)
        try:
            for page in doc:
                for link in page.get_links():
                    uri = link.get("uri")
                    if not uri:
                        # Skip entries without a URI (nothing to report).
                        continue

                    # Rectangle on the page that the link occupies.
                    rect = link.get("from")

                    link_text = ""
                    if rect:
                        # Each "words" entry is a tuple whose element 4 is
                        # used as the word's text.
                        words = page.get_text("words", clip=rect)
                        link_text = " ".join(word[4] for word in words)

                    hyperlinks.append({
                        'text': link_text,
                        'url': uri
                    })
        finally:
            # Release the document even if link extraction failed part-way.
            doc.close()
    except Exception as e:
        print(f"PyMuPDF extraction error: {str(e)}")

    return hyperlinks

In [8]:
# Manual check: run the hyperlink extractor on the local file "ad.pdf" and
# print the resulting list of {'text', 'url'} dictionaries.
print(extract_hyperlinks_from_pdf("ad.pdf"))

[{'text': 'deopurkar.a@northeastern.edu', 'url': 'mailto:deopurkar.a@northeastern.edu'}, {'text': 'LinkedIn', 'url': 'https://www.linkedin.com/in/ankit-deopurkar/'}]


In [10]:
# Manual check: run the text extractor on the local file "tanmay.pdf"; as the
# last expression in the cell, the returned string is shown as the cell output.
_extract_text_from_pdf("tanmay.pdf")

CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox


'Tanmay Pawar\n(857) 339-8799 | pawar.ta@northeastern.edu | Boston, MA, 02120 | LinkedIn | GitHub\nEDUCATION\nNortheastern University Boston, MA\nMaster of Science in Information Systems Expected May 2026\nRelated Coursework: Data Science, Big Data Intelligence, Database Management GPA: 4\nPune University Pune, India\nBachelor of Engineering in Computer Engineering Mar 2018 - Jun 2022\nRelated Coursework: Computer Organization and Architecture, Machine Learning, DBMS, Cloud\nSKILLS\nProgramming Languages: Python, C#, Java, HTML, CSS, JavaScript\nDatabases & Tools: Oracle SQL, MSSQL, Snowflake, dbt, Git, Apache Airflow\nCloud Platforms: AWS, GCP, Azure\nLibraries & Frameworks: Pandas, NumPy, scikit-learn, ReactJS, .NET, Streamlit, LangChain\nCertification: AZ-900(Azure Fundamentals)\nWORK EXPERIENCE\nPersistent Systems Ltd | Software Engineer (Jul 2022 - Jul 2024) Pune, India\n• Cleaned and managed datasets using Python (Pandas, NumPy) and SQL, improving data quality\n• Conducted data a