In [3]:
from langchain_core.documents import Document
from langchain_community.document_loaders import PyPDFLoader, TextLoader, UnstructuredWordDocumentLoader
import os

def load_document(file_path: str) -> list[Document]:
    """Load a file with the loader that matches its extension.

    The extension is compared case-insensitively, so ``.PDF`` and ``.pdf``
    are treated the same.

    Args:
        file_path: Path to a ``.pdf``, ``.docx`` or ``.txt`` file.

    Returns:
        Whatever the selected loader's ``load()`` call returns.

    Raises:
        ValueError: If the extension is not one of the three handled here.
    """
    _, raw_suffix = os.path.splitext(file_path)
    suffix = raw_suffix.lower()

    # One guard per supported format; anything else falls through to the error.
    if suffix == ".pdf":
        return PyPDFLoader(file_path).load()
    if suffix == ".docx":
        return UnstructuredWordDocumentLoader(file_path).load()
    if suffix == ".txt":
        return TextLoader(file_path).load()

    raise ValueError(f"Unsupported file format: {suffix}")

def text_from_doc(document):
    """Concatenate the ``page_content`` of every item into one string.

    Args:
        document: Iterable of objects exposing a ``page_content`` string
            attribute (for example the result of ``load_document``).

    Returns:
        The contents joined in order, separated by a blank line. An empty
        iterable yields an empty string.
    """
    separator = "\n\n"
    return separator.join(part.page_content for part in document)

if __name__ == "__main__":
    # Manual smoke test: load one reference PDF and show the start of its text.
    sample_path = "backend/reference_docs/ref2.pdf"
    # Only the first 1000 characters are kept so the printout stays short.
    preview = text_from_doc(load_document(sample_path))[:1000]

    print("Document loaded successfully!")
    print("---- Preview ----")
    print(preview)


Document loaded successfully!
---- Preview ----
T echNova 
Executive Summary 
This Environmental, Social, and Governance (ESG) report presents a 
comprehensive overview of TechNova International’s sustainability performance 
for the fiscal year ending December 31, 2024. Prepared in accordance with the 
Global Reporting Initiative (GRI) Standards, this report details our progress, 
challenges, and future commitments across key ESG dimensions. As a global 
technology leader, TechNova International recognizes its responsibility to drive 
positive change, foster innovation, and uphold ethical standards in all aspects of 
its operations. This year, we achieved a 17 percent reduction in Scope 1 and 2 
greenhouse gas emissions, increased the representation of women and 
underrepresented minorities in our workforce by 8 percent, and strengthened our 
governance framework to enhance transparency and accountability. We engaged 
with over 2,000 stakeholders worldwide to identify material topics a