In [2]:
import os
from pypdf import PdfReader
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain.vectorstores import Chroma
import google.generativeai as genai
from google.generativeai import GenerativeModel

pdf_path = input("Please enter the full path to the PDF file (or press Enter to use the default):\n").strip()

if not pdf_path:
    pdf_path = r"C:\Users\shira\OneDrive\שולחן העבודה\BOT_Elad_Sistem\international_agreements_uae_bit-eng.pdf"

if not os.path.exists(pdf_path):
    print("❌ File not found. Please check the path and try again.")
    exit(1)

API_KEY = "AIzaSyBelcUdSVHsJZfU2RKWdBS3rGFl6MWOFFA"
os.environ["GOOGLE_API_KEY"] = API_KEY

reader = PdfReader(pdf_path)
text = ""
for page in reader.pages:
    text += page.extract_text() + "\n"

print(f"Total number of characters in the file: {len(text)}")

def split_text(text, max_len=1000, overlap=200):
    start = 0
    chunks = []
    while start < len(text):
        end = start + max_len
        chunk = text[start:end]
        chunks.append(chunk)
        start += max_len - overlap
    return chunks

chunks = split_text(text)
print(f"Number of chunks after splitting: {len(chunks)}")
print("Sample chunk:\n", chunks[0][:500])

embeddings = GoogleGenerativeAIEmbeddings(
    model="models/embedding-001",
    google_api_key=API_KEY
)
print("Embeddings ready")

print("Computing embeddings and creating vector database...")
vectordb = Chroma.from_texts(chunks, embeddings)
print("The vector database is ready.")

genai.configure(api_key=API_KEY)
model = GenerativeModel("models/gemini-2.5-flash")
print("Gemini model is ready.")

def gemini_chat(prompt: str):
    response = model.generate_content(prompt)
    if hasattr(response, "text") and response.text:
        return response.text
    if hasattr(response, "output") and response.output:
        first = response.output[0]
        if hasattr(first, "content") and first.content:
            c = first.content[0]
            if hasattr(c, "text"):
                return c.text
        if hasattr(first, "text"):
            return first.text
    return str(response)

def answer_question_with_context(question, k=3):
    retriever = vectordb.as_retriever(search_type="similarity", search_kwargs={"k": k})
    relevant_docs = retriever.invoke(question)

    if isinstance(relevant_docs, dict):
        if "documents" in relevant_docs:
            relevant_docs = relevant_docs["documents"]
        elif "results" in relevant_docs:
            relevant_docs = relevant_docs["results"]

    if not isinstance(relevant_docs, (list, tuple)):
        relevant_docs = list(relevant_docs)

    print(f"\n[INFO] {len(relevant_docs)} relevant chunks found. Using the following context:")

    context_combined = ""
    used_indices = []
    for idx, doc in enumerate(relevant_docs, 1):
        snippet = getattr(doc, "page_content", str(doc)).strip().replace("\n", " ")
        print(f"  Chunk {idx}: {snippet[:300]}...")
        context_combined += f"\nChunk {idx}:\n{snippet}\n"
        used_indices.append(str(idx))

    prompt = (
        "You are an assistant that explains the content of an agreement. "
        "You have received a user question and some relevant excerpts from the document. "
        "Please answer clearly and briefly, in the **same language** as the question. "
        "Also mention which chunks your answer is based on.\n\n"
        f"Question: {question}\n\n"
        f"Context from the document: {context_combined}\n\n"
        "Answer:"
    )

    answer_text = gemini_chat(prompt)

    attribution = f"(Based on chunks {', '.join(used_indices)})"
    print("\n[Gemini Answer]:")
    print(answer_text.strip())
    print(attribution)
    return answer_text

while True:
    question = input("\nWhat would you like to ask about the agreement? (type 'סיום' or 'exit' to quit)\n")
    if question.lower() in ["סיום", "exit", "quit"]:
        print("Goodbye, Hope I was helpful :) ")
        break
    try:
        answer_question_with_context(question)
    except Exception as e:
        print("An error occurred:", e)


Please enter the full path to the PDF file (or press Enter to use the default):
 


Total number of characters in the file: 60092
Number of chunks after splitting: 76
Sample chunk:
  
 
1 
 
 
AGREEMENT 
 
between 
 
The Government of the State of Israel 
 
and 
 
The Government of the United Arab Emirates 
 
on 
 
Promotion and Protection of Investments 
 
The Government of the State of Israel and The Government of The United Arab 
Emirates (hereinafter, “the Parties”) 
Further to the Treaty of Peace, Diplomatic Relations and Full Normalization between 
the United Arab Emirates and the State of Israel, signed in Washington , DC on  15 
September 2020 (hereinafter, “the Pea
Embeddings ready
Computing embeddings and creating vector database...
The vector database is ready.
Gemini model is ready.



What would you like to ask about the agreement? (type 'סיום' or 'exit' to quit)
 סיום


Goodbye, Hope I was helpful :) 
