In [1]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from pathlib import Path
from tqdm import tqdm
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from pypdf import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.schema import Document

  from .autonotebook import tqdm as notebook_tqdm


In [28]:
DATA_DIR = "F:\Multi-User Document Search and Conversational Q&A System\data"
INDEX_NAME = "company-docs"
EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 100

In [29]:
def load_pdf_file(data):
  loader = DirectoryLoader(data, glob="**/*.pdf", loader_cls=PyPDFLoader)
  documents = loader.load()
  return documents

data = load_pdf_file(DATA_DIR)
print(f"Loaded {len(data)} documents from {DATA_DIR}")

Loaded 652 documents from F:\Multi-User Document Search and Conversational Q&A System\data


In [31]:
def filter_to_minimal_document(docs: list) -> list:
  minimal_docs = []
  for doc in docs:
    src = doc.metadata.get("source")
    minimal_docs.append(
      Document(
              page_content=doc.page_content,
              metadata={"source": src}
            )
        )
  return minimal_docs

data = filter_to_minimal_document(data)
data

[Document(metadata={'source': 'F:\\Multi-User Document Search and Conversational Q&A System\\data\\Meta\\Meta.pdf'}, page_content='UNITED ST ATES\nSECURITIES AND EXCHANGE COMMISSION\nWashington, D.C. 20549\n__________________________\nFORM 10-K\n__________________________\n(Mark One)\n☒    ANNUAL  REPOR T PURSUANT  TO SECTION 13 OR 15(d) OF  THE SECURITIES EXCHANGE ACT  OF 1934\nFor the fiscal year  ended December  31, 2024\nor\n☐    TRANSITION REPOR T PURSUANT  TO SECTION 13 OR 15(d) OF  THE SECURITIES EXCHANGE ACT  OF 1934\nFor the transition period fr om            to            \nCommission File Number: 001-35551\n__________________________\nMeta Platforms, Inc.\n(Exact name of r egistrant as specified in its charter)\n__________________________\nDelawar e 20-1665019\n(State or other jurisdiction of incorporation or organization) (I.R.S. Employer Identification Number)\n1 Meta Way, Menlo Park, California 94025\n(Address of principal executive offices and Zip Code)\n(650) 543-4800\n

In [32]:
import re

def clean_text(text: str) -> str:
  # Remove multiple newlines
  text = re.sub(r'\n+', '\n', text)
  # Remove multiple spaces
  text = re.sub(r' +', ' ', text)
  # Strip leading/trailing whitespace
  text = text.strip()
  # Remove common artifacts like page numbers
  text = re.sub(r"Page \d+ of \d+", "", text)
  
  return text

In [33]:
def clean_documents(docs: list) -> list:
  cleaned_docs = []
  for doc in docs:
    cleaned_text = clean_text(doc.page_content)
    cleaned_docs.append(
      Document(
              page_content=cleaned_text,
              metadata=doc.metadata
            )
        )
  return cleaned_docs

In [34]:
print("Before cleaning:", data[0].page_content[:500])
clean_data = clean_documents(data)
print("After cleaning:", clean_data[0].page_content[:500])

Before cleaning: UNITED ST ATES
SECURITIES AND EXCHANGE COMMISSION
Washington, D.C. 20549
__________________________
FORM 10-K
__________________________
(Mark One)
☒    ANNUAL  REPOR T PURSUANT  TO SECTION 13 OR 15(d) OF  THE SECURITIES EXCHANGE ACT  OF 1934
For the fiscal year  ended December  31, 2024
or
☐    TRANSITION REPOR T PURSUANT  TO SECTION 13 OR 15(d) OF  THE SECURITIES EXCHANGE ACT  OF 1934
For the transition period fr om            to            
Commission File Number: 001-35551
__________________
After cleaning: UNITED ST ATES
SECURITIES AND EXCHANGE COMMISSION
Washington, D.C. 20549
__________________________
FORM 10-K
__________________________
(Mark One)
☒ ANNUAL REPOR T PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934
For the fiscal year ended December 31, 2024
or
☐ TRANSITION REPOR T PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934
For the transition period fr om to 
Commission File Number: 001-35551
_____________________

In [35]:
def text_split(data):
  text_splitter = RecursiveCharacterTextSplitter(
      chunk_size=CHUNK_SIZE,
      chunk_overlap=CHUNK_OVERLAP,
      separators=["\n\n", "\n", " ", ""]
  )
  docs = text_splitter.split_documents(data)
  return docs

In [36]:
split_data = text_split(clean_data)

In [37]:
embeddings = HuggingFaceBgeEmbeddings(model_name=EMBED_MODEL)



In [41]:
vs = FAISS.from_documents(split_data, embeddings)
save_path = r"F:\Multi-User Document Search and Conversational Q&A System\VDB_index"
vs.save_local(save_path)

In [None]:
vs = FAISS.load_local("F:\Multi-User Document Search and Conversational Q&A System\VDB_index\faiss_index", embeddings, allow_dangerous_deserialization=True)

In [42]:
query = "What are the key governance policies mentioned?"
results = vs.similarity_search(query, k=3)  # k = number of top results

for i, doc in enumerate(results, 1):
    print(f"Result {i}")
    print("Source:", doc.metadata)
    print(doc.page_content[:300])  # first 300 characters
    print("-" * 80)


Result 1
Source: {'source': 'F:\\Multi-User Document Search and Conversational Q&A System\\data\\Meta\\Meta.pdf'}
Table of Contents
content moderation, data localization, data protection, competition, e-commerce and payments, and regulatory oversight;
• reduced protection for intellectual property rights in some countries;
• difficulties in staffing, managing, and overseeing global operations and the increased 
--------------------------------------------------------------------------------
Result 2
Source: {'source': 'F:\\Multi-User Document Search and Conversational Q&A System\\data\\Reliance\\RIL-Integrated-Annual-Report-2024-25.pdf'}
files/reports/Vigil-Mechanism-and-
Whistle-Blower-Policy.pdf .
Anti-Bribery and Anti-
Corruption Policy
The	Company	is	committed	in	doing 	
business	with	integrity	&	transparency 	
and	has	a	zero-tolerance	approach	to 	
non-compliance	with	the	Anti-Bribery 	
&	Anti-Corruption	Policy.	The	Company 	
p
-----------------------------------------------------

In [43]:
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain_community.llms import Ollama

In [44]:
retriever = vs.as_retriever(search_kwargs={"k": 3})
llm = Ollama(model="mistral")
memory = ConversationBufferMemory(
                  memory_key="chat_history", 
                  return_messages=True,
                  output_key="answer"
                  )

In [45]:
qa = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
    return_source_documents=True,
    output_key="answer"
)

In [27]:
print("🤖 Local Conversational Q&A with Ollama Ready! Type 'exit' to quit.\n")
while True:
    query = input("You: ")
    if query.lower() in ["exit", "quit"]:
        print("Goodbye! 👋")
        break

    result = qa.invoke({"question": query})
    print("\nBot:", result["answer"])
    print("Sources:", [doc.metadata for doc in result["source_documents"]])
    print()


🤖 Local Conversational Q&A with Ollama Ready! Type 'exit' to quit.


Bot:  The question asks for the profit, but the provided context only includes income (loss) from operations by segment. To find the profit, we would need to know the total revenue and expenses for each segment, which are not explicitly stated in the given context. However, the income from operations (before accounting for expenses) for the Family of Apps segment in 2024 is $87,109 million, so their profit (if there's a surplus) would be at least that amount if there were no other expenses. For the Reality Labs segment, there's a loss of ($17,729) million in 2024. Therefore, the total profit for both segments combined would be $69,380 (Family of Apps income) - ($17,729) (Reality Labs loss) = $51,669 million. Keep in mind that this is an approximation as we don't have the full financial information to account for other expenses and potential losses in the Family of Apps segment or gains/losses in other areas.
Sources: 

KeyboardInterrupt: 