### OpenParse

In [1]:
import json
import time
import os
import pandas as pd
from dotenv import load_dotenv
from langchain.retrievers.contextual_compression import ContextualCompressionRetriever
from langchain_cohere import CohereRerank
from langchain_community.vectorstores import PathwayVectorClient
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains import LLMChain
import tiktoken  # For token counting

# Load environment variables
def load_environment_variables():
    """Load environment variables from a .env file."""
    print("Loading environment variables...")
    load_dotenv()
    os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")
    print("Environment variables loaded.")

# Initialize Pathway retriever
def create_pathway_client(host="127.0.0.1", port=8011, k=10):
    """Initialize PathwayVectorClient and return as retriever."""
    print(f"Initializing PathwayVectorClient on host {host} and port {port}...")
    client = PathwayVectorClient(host=host, port=port)
    print("PathwayVectorClient initialized.")
    return client.as_retriever(search_kwargs={"k": k})

# Retrieve relevant documents
def retrieve_relevant_documents(retriever, query):
    """Retrieve relevant documents based on the given query."""
    print(f"Retrieving relevant documents for query: {query}")
    try:
        docs = retriever.get_relevant_documents(query)
        print(f"Retrieved {len(docs)} chunks.")
        return docs
    except Exception as e:
        print(f"Error retrieving documents: {e}")
        return []

# Create document compressor
def create_compressor(model="rerank-english-v3.0"):
    """Create and return a CohereRerank compressor."""
    print("Creating CohereRerank compressor...")
    compressor = CohereRerank(model=model, top_n=5)
    print("Compressor created.")
    return compressor

# Compress retrieved documents
def compress_documents(retriever, query, compressor):
    """Compress relevant documents using the provided compressor."""
    print("Compressing retrieved chunks...")
    compression_retriever = ContextualCompressionRetriever(
        base_compressor=compressor, base_retriever=retriever
    )
    try:
        compressed_docs = compression_retriever.invoke(query)
        print(f"Compressed to {len(compressed_docs)} chunks.")
        return compressed_docs
    except Exception as e:
        print(f"Error during compression: {e}")
        return []

# Initialize OpenAI LLM
def create_llm():
    """Create and return an OpenAI LLM instance."""
    print("Initializing OpenAI LLM...")
    llm = ChatOpenAI(
        model="gpt-4o-mini",
        temperature=0,
        max_tokens=None,
        timeout=45,
        max_retries=2,
    )
    print("OpenAI LLM initialized.")
    return llm

# Create ChatPromptTemplate for Q&A
def create_chat_prompt():
    """Create and return a chat prompt template for Q&A."""
    print("Creating chat prompt template...")
    chat_prompt = ChatPromptTemplate.from_messages(
        [
            (
                "system",
                "You are a helpful assistant. Answer the following question based only on the provided context.\n"
                "Provide a step-by-step explanation, and reference the source details using metadata.\n\n"
                "<context>\n{context}\n</context>",
            ),
            ("human", "{question}"),
        ]
    )
    print("Chat prompt template created.")
    return chat_prompt

# Token counting function
def count_tokens(text, model="o200k_base"):
    """Estimate token count for given text using tiktoken."""
    tokenizer = tiktoken.get_encoding(model)
    tokens = tokenizer.encode(text)
    return len(tokens)

# Generate answer
def get_answer(context, question, chain):
    """Generate an answer based on the context and question using the LLM chain."""
    print("Generating answer for the question...")
    try:
        input_tokens = count_tokens(context) + count_tokens(question)
        print(f"Estimated input tokens: {input_tokens}")

        # Invoke the LLM chain with structured input
        answer = chain.invoke({"context": context, "question": question})
        
        output_tokens = count_tokens(answer.content)
        print(f"Estimated output tokens: {output_tokens}")
        
        return answer.content
    except Exception as e:
        print(f"Error generating answer: {e}")
        return None

# Main execution logic
def main():
    # Load environment variables
    load_environment_variables()

    # Load the dataset
    file_path = 'AAPL_10Q_dataset/aapl_qna_data.csv'
    print(f"Loading dataset from {file_path}...")
    data = pd.read_csv(file_path)
    print(f"Loaded {len(data)} questions.")

    # Initialize Pathway retriever, compressor, LLM, and chat prompt
    retriever = create_pathway_client()
    compressor = create_compressor()
    llm = create_llm()
    chat_prompt = create_chat_prompt()

    # Create the LLM chain
    chain = chat_prompt | llm

    # Prepare results storage
    results = []
    print("Total number of rows: ", len(data))
    # Loop over each question in the dataset
    for idx, row in data.iterrows():
        question = row['Question']
        gt_answer = row.get('Answer', "Ground truth answer not provided")
        
        print(f"\nProcessing question ID {idx}: {question}")

        # Step 1: Retrieve relevant documents
        retrieved_docs = retrieve_relevant_documents(retriever, question)
        if not retrieved_docs:
            print(f"No documents retrieved for query ID {idx}.")
            continue

        # Step 2: Compress retrieved documents
        compressed_docs = compress_documents(retriever, question, compressor)
        # compressed_docs = retrieved_docs

        # Prepare the context from compressed documents
        context = "\n".join([doc.page_content for doc in compressed_docs if doc.page_content])

        # Step 3: Generate an answer with retry logic
        answer = get_answer(context, question, chain)
        if answer is None:
            print(f"Retrying for question ID {idx} due to no initial response...")
            time.sleep(50)  # Wait 50 seconds before retrying
            answer = get_answer(context, question, chain)

        # Structure retrieved context in a list of dictionaries as per your format
        retrieved_context = [
            {"doc_id": doc.metadata.get("doc_id", "unknown"), "text": doc.page_content}
            for doc in compressed_docs
        ]
        
        # Append to results
        results.append({
            "query_id": str(idx).zfill(3),
            "question": question,
            "gt_answer": gt_answer,
            "response": answer,
            "retrieved_context": retrieved_context
        })

    # Save the results to a JSON file
    output_path = 'eval_results/basic_rag_openParser.json'
    print(f"Saving evaluation results to {output_path}...")
    with open(output_path, 'w') as f:
        json.dump({"results": results}, f, indent=4)

    print(f"Evaluation results saved to {output_path}")

# Run main function
if __name__ == "__main__":
    main()

Loading environment variables...
Environment variables loaded.
Loading dataset from AAPL_10Q_dataset/aapl_qna_data.csv...
Loaded 39 questions.
Initializing PathwayVectorClient on host 127.0.0.1 and port 8011...
PathwayVectorClient initialized.
Creating CohereRerank compressor...
Compressor created.
Initializing OpenAI LLM...
OpenAI LLM initialized.
Creating chat prompt template...
Chat prompt template created.
Total number of rows:  39

Processing question ID 0: How has Apple's total net sales changed over time?
Retrieving relevant documents for query: How has Apple's total net sales changed over time?


  docs = retriever.get_relevant_documents(query)


Retrieved 10 chunks.
Compressing retrieved chunks...
Compressed to 5 chunks.
Generating answer for the question...
Estimated input tokens: 2005
Estimated output tokens: 465

Processing question ID 1: What are the major factors contributing to the change in Apple's gross margin in the most recent 10-Q compared to the previous quarters?
Retrieving relevant documents for query: What are the major factors contributing to the change in Apple's gross margin in the most recent 10-Q compared to the previous quarters?
Retrieved 10 chunks.
Compressing retrieved chunks...
Compressed to 5 chunks.
Generating answer for the question...
Estimated input tokens: 446
Estimated output tokens: 434

Processing question ID 2: Has there been any significant change in Apple's operating expenses over the reported quarters? If so, what are the key drivers for this change?
Retrieving relevant documents for query: Has there been any significant change in Apple's operating expenses over the reported quarters? If s

### Our Parser

In [7]:
import json
import time
import os
import pandas as pd
from dotenv import load_dotenv
from langchain.retrievers.contextual_compression import ContextualCompressionRetriever
from langchain_cohere import CohereRerank
from langchain_community.vectorstores import PathwayVectorClient
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains import LLMChain
import tiktoken  # For token counting

# Load environment variables
def load_environment_variables():
    """Load environment variables from a .env file."""
    print("Loading environment variables...")
    load_dotenv()
    os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")
    print("Environment variables loaded.")

# Initialize Pathway retriever
def create_pathway_client(host="127.0.0.1", port=8011, k=10):
    """Initialize PathwayVectorClient and return as retriever."""
    print(f"Initializing PathwayVectorClient on host {host} and port {port}...")
    client = PathwayVectorClient(host=host, port=port)
    print("PathwayVectorClient initialized.")
    return client.as_retriever(search_kwargs={"k": k})

# Retrieve relevant documents
def retrieve_relevant_documents(retriever, query):
    """Retrieve relevant documents based on the given query."""
    print(f"Retrieving relevant documents for query: {query}")
    try:
        docs = retriever.get_relevant_documents(query)
        print(f"Retrieved {len(docs)} chunks.")
        return docs
    except Exception as e:
        print(f"Error retrieving documents: {e}")
        return []

# Create document compressor
def create_compressor(model="rerank-english-v3.0"):
    """Create and return a CohereRerank compressor."""
    print("Creating CohereRerank compressor...")
    compressor = CohereRerank(model=model, top_n=5)
    print("Compressor created.")
    return compressor

# Compress retrieved documents
def compress_documents(retriever, query, compressor):
    """Compress relevant documents using the provided compressor."""
    print("Compressing retrieved chunks...")
    compression_retriever = ContextualCompressionRetriever(
        base_compressor=compressor, base_retriever=retriever
    )
    try:
        compressed_docs = compression_retriever.invoke(query)
        print(f"Compressed to {len(compressed_docs)} chunks.")
        return compressed_docs
    except Exception as e:
        print(f"Error during compression: {e}")
        return []

# Initialize OpenAI LLM
def create_llm():
    """Create and return an OpenAI LLM instance."""
    print("Initializing OpenAI LLM...")
    llm = ChatOpenAI(
        model="gpt-4o-mini",
        temperature=0,
        max_tokens=None,
        timeout=45,
        max_retries=2,
    )
    print("OpenAI LLM initialized.")
    return llm

# Create ChatPromptTemplate for Q&A
def create_chat_prompt():
    """Create and return a chat prompt template for Q&A."""
    print("Creating chat prompt template...")
    chat_prompt = ChatPromptTemplate.from_messages(
        [
            (
                "system",
                "You are a helpful assistant. Answer the following question based only on the provided context.\n"
                "Provide a step-by-step explanation, and reference the source details using metadata.\n\n"
                "<context>\n{context}\n</context>",
            ),
            ("human", "{question}"),
        ]
    )
    print("Chat prompt template created.")
    return chat_prompt

# Token counting function
def count_tokens(text, model="o200k_base"):
    """Estimate token count for given text using tiktoken."""
    tokenizer = tiktoken.get_encoding(model)
    tokens = tokenizer.encode(text)
    return len(tokens)

# Generate answer
def get_answer(context, question, chain):
    """Generate an answer based on the context and question using the LLM chain."""
    print("Generating answer for the question...")
    try:
        input_tokens = count_tokens(context) + count_tokens(question)
        print(f"Estimated input tokens: {input_tokens}")

        # Invoke the LLM chain with structured input
        answer = chain.invoke({"context": context, "question": question})
        
        output_tokens = count_tokens(answer.content)
        print(f"Estimated output tokens: {output_tokens}")
        
        return answer.content
    except Exception as e:
        print(f"Error generating answer: {e}")
        return None

# Main execution logic
def main():
    # Load environment variables
    load_environment_variables()

    # Load the dataset
    file_path = 'AAPL_10Q_dataset/aapl_qna_data.csv'
    print(f"Loading dataset from {file_path}...")
    data = pd.read_csv(file_path)
    print(f"Loaded {len(data)} questions.")

    # Initialize Pathway retriever, compressor, LLM, and chat prompt
    retriever = create_pathway_client()
    compressor = create_compressor()
    llm = create_llm()
    chat_prompt = create_chat_prompt()

    # Create the LLM chain
    chain = chat_prompt | llm

    # Prepare results storage
    results = []
    print("Total number of rows: ", len(data))
    # Loop over each question in the dataset
    for idx, row in data.iterrows():
        question = row['Question']
        gt_answer = row.get('Answer', "Ground truth answer not provided")
        
        print(f"\nProcessing question ID {idx}: {question}")

        # Step 1: Retrieve relevant documents
        retrieved_docs = retrieve_relevant_documents(retriever, question)
        if not retrieved_docs:
            print(f"No documents retrieved for query ID {idx}.")
            continue

        # Step 2: Compress retrieved documents
        compressed_docs = compress_documents(retriever, question, compressor)
        # compressed_docs = retrieved_docs

        # Prepare the context from compressed documents
        context = "\n".join([doc.page_content for doc in compressed_docs if doc.page_content])

        # Step 3: Generate an answer with retry logic
        answer = get_answer(context, question, chain)
        if answer is None:
            print(f"Retrying for question ID {idx} due to no initial response...")
            time.sleep(50)  # Wait 50 seconds before retrying
            answer = get_answer(context, question, chain)

        # Structure retrieved context in a list of dictionaries as per your format
        retrieved_context = [
            {"doc_id": doc.metadata.get("doc_id", "unknown"), "text": doc.page_content}
            for doc in compressed_docs
        ]
        
        # Append to results
        results.append({
            "query_id": str(idx).zfill(3),
            "question": question,
            "gt_answer": gt_answer,
            "response": answer,
            "retrieved_context": retrieved_context
        })

    # Save the results to a JSON file
    output_path = 'eval_results/basic_rag_enhancedParser.json'
    print(f"Saving evaluation results to {output_path}...")
    with open(output_path, 'w') as f:
        json.dump({"results": results}, f, indent=4)

    print(f"Evaluation results saved to {output_path}")

# Run main function
if __name__ == "__main__":
    main()

Loading environment variables...
Environment variables loaded.
Loading dataset from AAPL_10Q_dataset/aapl_qna_data.csv...
Loaded 39 questions.
Initializing PathwayVectorClient on host 127.0.0.1 and port 8011...
PathwayVectorClient initialized.
Creating CohereRerank compressor...
Compressor created.
Initializing OpenAI LLM...
OpenAI LLM initialized.
Creating chat prompt template...
Chat prompt template created.
Total number of rows:  39

Processing question ID 0: How has Apple's total net sales changed over time?
Retrieving relevant documents for query: How has Apple's total net sales changed over time?
Retrieved 10 chunks.
Compressing retrieved chunks...
Compressed to 5 chunks.
Generating answer for the question...
Estimated input tokens: 1008
Estimated output tokens: 565

Processing question ID 1: What are the major factors contributing to the change in Apple's gross margin in the most recent 10-Q compared to the previous quarters?
Retrieving relevant documents for query: What are the

### Unstructured HiRes Parser

In [1]:
import json
import time
import os
import pandas as pd
from dotenv import load_dotenv
from langchain.retrievers.contextual_compression import ContextualCompressionRetriever
from langchain_cohere import CohereRerank
from langchain_community.vectorstores import PathwayVectorClient
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains import LLMChain
import tiktoken  # For token counting

# Load environment variables
def load_environment_variables():
    """Load environment variables from a .env file."""
    print("Loading environment variables...")
    load_dotenv()
    os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")
    print("Environment variables loaded.")

# Initialize Pathway retriever
def create_pathway_client(host="127.0.0.1", port=8011, k=10):
    """Initialize PathwayVectorClient and return as retriever."""
    print(f"Initializing PathwayVectorClient on host {host} and port {port}...")
    client = PathwayVectorClient(host=host, port=port)
    print("PathwayVectorClient initialized.")
    return client.as_retriever(search_kwargs={"k": k})

# Retrieve relevant documents
def retrieve_relevant_documents(retriever, query):
    """Retrieve relevant documents based on the given query."""
    print(f"Retrieving relevant documents for query: {query}")
    try:
        docs = retriever.get_relevant_documents(query)
        print(f"Retrieved {len(docs)} chunks.")
        return docs
    except Exception as e:
        print(f"Error retrieving documents: {e}")
        return []

# Create document compressor
def create_compressor(model="rerank-english-v3.0"):
    """Create and return a CohereRerank compressor."""
    print("Creating CohereRerank compressor...")
    compressor = CohereRerank(model=model, top_n=5)
    print("Compressor created.")
    return compressor

# Compress retrieved documents
def compress_documents(retriever, query, compressor):
    """Compress relevant documents using the provided compressor."""
    print("Compressing retrieved chunks...")
    compression_retriever = ContextualCompressionRetriever(
        base_compressor=compressor, base_retriever=retriever
    )
    try:
        compressed_docs = compression_retriever.invoke(query)
        print(f"Compressed to {len(compressed_docs)} chunks.")
        return compressed_docs
    except Exception as e:
        print(f"Error during compression: {e}")
        return []

# Initialize OpenAI LLM
def create_llm():
    """Create and return an OpenAI LLM instance."""
    print("Initializing OpenAI LLM...")
    llm = ChatOpenAI(
        model="gpt-4o-mini",
        temperature=0,
        max_tokens=None,
        timeout=45,
        max_retries=2,
    )
    print("OpenAI LLM initialized.")
    return llm

# Create ChatPromptTemplate for Q&A
def create_chat_prompt():
    """Create and return a chat prompt template for Q&A."""
    print("Creating chat prompt template...")
    chat_prompt = ChatPromptTemplate.from_messages(
        [
            (
                "system",
                "You are a helpful assistant. Answer the following question based only on the provided context.\n"
                "Provide a step-by-step explanation, and reference the source details using metadata.\n\n"
                "<context>\n{context}\n</context>",
            ),
            ("human", "{question}"),
        ]
    )
    print("Chat prompt template created.")
    return chat_prompt

# Token counting function
def count_tokens(text, model="o200k_base"):
    """Estimate token count for given text using tiktoken."""
    tokenizer = tiktoken.get_encoding(model)
    tokens = tokenizer.encode(text)
    return len(tokens)

# Generate answer
def get_answer(context, question, chain):
    """Generate an answer based on the context and question using the LLM chain."""
    print("Generating answer for the question...")
    try:
        input_tokens = count_tokens(context) + count_tokens(question)
        print(f"Estimated input tokens: {input_tokens}")

        # Invoke the LLM chain with structured input
        answer = chain.invoke({"context": context, "question": question})
        
        output_tokens = count_tokens(answer.content)
        print(f"Estimated output tokens: {output_tokens}")
        
        return answer.content
    except Exception as e:
        print(f"Error generating answer: {e}")
        return None

# Main execution logic
def main():
    # Load environment variables
    load_environment_variables()

    # Load the dataset
    file_path = 'AAPL_10Q_dataset/aapl_qna_data.csv'
    print(f"Loading dataset from {file_path}...")
    data = pd.read_csv(file_path)
    print(f"Loaded {len(data)} questions.")

    # Initialize Pathway retriever, compressor, LLM, and chat prompt
    retriever = create_pathway_client()
    compressor = create_compressor()
    llm = create_llm()
    chat_prompt = create_chat_prompt()

    # Create the LLM chain
    chain = chat_prompt | llm

    # Prepare results storage
    results = []
    print("Total number of rows: ", len(data))
    # Loop over each question in the dataset
    for idx, row in data.iterrows():
        question = row['Question']
        gt_answer = row.get('Answer', "Ground truth answer not provided")
        
        print(f"\nProcessing question ID {idx}: {question}")

        # Step 1: Retrieve relevant documents
        retrieved_docs = retrieve_relevant_documents(retriever, question)
        if not retrieved_docs:
            print(f"No documents retrieved for query ID {idx}.")
            continue

        # Step 2: Compress retrieved documents
        compressed_docs = compress_documents(retriever, question, compressor)
        # compressed_docs = retrieved_docs

        # Prepare the context from compressed documents
        context = "\n".join([doc.page_content for doc in compressed_docs if doc.page_content])

        # Step 3: Generate an answer with retry logic
        answer = get_answer(context, question, chain)
        if answer is None:
            print(f"Retrying for question ID {idx} due to no initial response...")
            time.sleep(50)  # Wait 50 seconds before retrying
            answer = get_answer(context, question, chain)

        # Structure retrieved context in a list of dictionaries as per your format
        retrieved_context = [
            {"doc_id": doc.metadata.get("doc_id", "unknown"), "text": doc.page_content}
            for doc in compressed_docs
        ]
        
        # Append to results
        results.append({
            "query_id": str(idx).zfill(3),
            "question": question,
            "gt_answer": gt_answer,
            "response": answer,
            "retrieved_context": retrieved_context
        })

    # Save the results to a JSON file
    output_path = 'eval_results/basic_rag_hiResParser.json'
    print(f"Saving evaluation results to {output_path}...")
    with open(output_path, 'w') as f:
        json.dump({"results": results}, f, indent=4)

    print(f"Evaluation results saved to {output_path}")

# Run main function
if __name__ == "__main__":
    main()

Loading environment variables...
Environment variables loaded.
Loading dataset from AAPL_10Q_dataset/aapl_qna_data.csv...
Loaded 39 questions.
Initializing PathwayVectorClient on host 127.0.0.1 and port 8011...
PathwayVectorClient initialized.
Creating CohereRerank compressor...
Compressor created.
Initializing OpenAI LLM...
OpenAI LLM initialized.
Creating chat prompt template...
Chat prompt template created.
Total number of rows:  39

Processing question ID 0: How has Apple's total net sales changed over time?
Retrieving relevant documents for query: How has Apple's total net sales changed over time?


  docs = retriever.get_relevant_documents(query)


Retrieved 10 chunks.
Compressing retrieved chunks...
Compressed to 5 chunks.
Generating answer for the question...
Estimated input tokens: 823
Estimated output tokens: 620

Processing question ID 1: What are the major factors contributing to the change in Apple's gross margin in the most recent 10-Q compared to the previous quarters?
Retrieving relevant documents for query: What are the major factors contributing to the change in Apple's gross margin in the most recent 10-Q compared to the previous quarters?
Retrieved 10 chunks.
Compressing retrieved chunks...
Compressed to 5 chunks.
Generating answer for the question...
Estimated input tokens: 110
Estimated output tokens: 464

Processing question ID 2: Has there been any significant change in Apple's operating expenses over the reported quarters? If so, what are the key drivers for this change?
Retrieving relevant documents for query: Has there been any significant change in Apple's operating expenses over the reported quarters? If so

### Print Stats

In [None]:
import os
from dotenv import load_dotenv
from langchain_community.vectorstores import PathwayVectorClient

def load_environment_variables():
    """Load environment variables from a .env file."""
    print("Loading environment variables...")
    load_dotenv()
    print("Environment variables loaded.")

def calculate_average_chunk_size(client, sample_size=10000):
    """
    Estimate the average chunk size based on a sample of documents from the vector store.
    
    Args:
        client (PathwayVectorClient): The client connected to the Pathway Vector Store.
        sample_size (int): Number of documents to sample for estimating chunk size.
    
    Returns:
        int: Estimated average chunk size.
    """
    try:
        # Retrieve a sample of documents
        sample_docs = client.similarity_search(".", k=sample_size)
        
        # Calculate the average document length in the sample
        total_length = sum(len(doc.page_content) for doc in sample_docs if doc.page_content)
        average_chunk_size = total_length // len(sample_docs)
        print(f"Estimated average chunk size from sample: {average_chunk_size} characters.")
        
        return average_chunk_size
    
    except Exception as e:
        print(f"Error retrieving sample documents: {e}")
        return None

def calculate_total_chunks(client, estimated_chunk_size):
    """
    Calculate the total number of chunks in the Pathway Vector Store using the estimated chunk size.
    
    Args:
        client (PathwayVectorClient): The client connected to the Pathway Vector Store.
        estimated_chunk_size (int): Estimated size of each chunk in characters.
    
    Returns:
        int: Estimated total number of chunks in the database.
    """
    try:
        # Try to get total characters from the vector store statistics
        stats = client.get_vectorstore_statistics()
        total_chars = stats.get("total_character_count")  # Hypothetical field
        
        if total_chars:
            print(f"Total characters from statistics: {total_chars}")
            total_chunks = total_chars // estimated_chunk_size + (1 if total_chars % estimated_chunk_size != 0 else 0)
            return total_chunks
        else:
            print("Character count not available in statistics. Proceeding with document iteration.")
    
    except Exception as e:
        print(f"Error retrieving statistics: {e}")
    
    # Fall back to summing document lengths if statistics aren't available
    try:
        all_docs = client.similarity_search("how are you", k=10000)  # Retrieve a large number of docs (adjust as needed)
        
        total_length = sum(len(doc.page_content) for doc in all_docs if doc.page_content)
        print(f"Total characters by iterating documents: {total_length}")
        
        total_chunks = total_length // estimated_chunk_size + (1 if total_length % estimated_chunk_size != 0 else 0)
        return total_chunks
    
    except Exception as e:
        print(f"Error retrieving documents: {e}")
        return 0

# Load environment variables (if needed)
load_environment_variables()

# Initialize PathwayVectorClient
client = PathwayVectorClient(host="127.0.0.1", port=8011)

# Step 1: Estimate average chunk size based on sample documents
estimated_chunk_size = calculate_average_chunk_size(client)
if estimated_chunk_size is None:
    print("Could not determine an estimated chunk size.")

# Step 2: Calculate total chunks in the vector store based on estimated chunk size
total_chunks = calculate_total_chunks(client, estimated_chunk_size)
print(f"Total estimated chunks in the database: {total_chunks}")

Loading environment variables...
Environment variables loaded.
Error retrieving sample documents: HTTPConnectionPool(host='127.0.0.1', port=8011): Read timed out. (read timeout=3)
Could not determine an estimated chunk size.
