In [1]:
import os
import chromadb
import faiss
import numpy as np
from openai import OpenAI
from llama_index.core import Document, VectorStoreIndex, Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from dotenv import load_dotenv

In [2]:
# Read key/value pairs from a local .env file into the process environment
load_dotenv()

# OpenAI client used by query_llm; the API key comes from the environment, never from source
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Prebuilt FAISS index, plus the two arrays that search() indexes with the row
# numbers FAISS returns (ids[i] / metadata[i]).
faiss_index = faiss.read_index("faiss_index.bin")
# NOTE(review): allow_pickle=True unpickles arbitrary Python objects -- only load
# .npy files that were produced by a trusted pipeline.
ids, metadata = (
    np.load(array_file, allow_pickle=True)
    for array_file in ("ids.npy", "metadata.npy")
)

# Embedding model used to embed queries in search()
# NOTE(review): presumably the same model that built faiss_index.bin -- confirm
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

In [3]:
def search(query, top_k=10):
    """Embed `query` and return its nearest neighbours from the FAISS index.

    Relies on the module-level `embed_model`, `faiss_index`, `ids` and
    `metadata` objects.

    Args:
        query: Text to embed and search for.
        top_k: Maximum number of neighbours to request from FAISS.

    Returns:
        A tuple `(distances, result_ids, result_metadata)`:
        a 1-D numpy array of distances, and two lists holding the matching
        entries of `ids` and `metadata`, all in the order FAISS returned them.
        The three sequences always have the same length, which may be smaller
        than `top_k` when the index holds fewer vectors.
    """
    # FAISS expects a 2-D float32 batch, hence the extra list level.
    query_vector = np.asarray([embed_model.get_text_embedding(query)], dtype=np.float32)

    distances, indices = faiss_index.search(query_vector, top_k)

    # FAISS pads missing neighbours with label -1. Indexing ids[-1] would
    # silently return the *last* stored document, so drop those slots (and
    # their distances) before the lookup.
    found = indices[0] >= 0
    hit_rows = indices[0][found]
    hit_distances = distances[0][found]

    result_ids = [ids[row] for row in hit_rows]
    result_metadata = [metadata[row] for row in hit_rows]

    return hit_distances, result_ids, result_metadata

In [4]:
def retrieve_related_documents(element_id, part_number):
    """Look up one title and its child narratives in a single Chroma partition.

    Opens the persistent Chroma store for partition `part_number`, then runs
    two `get` calls on its articles collection: one by id for the title, one
    filtered on `parent_title_id` for the narratives.

    Args:
        element_id: Id of the title document; also matched against the
            `parent_title_id` metadata field of the narratives.
        part_number: Partition number used to build the store path and the
            collection name.

    Returns:
        `(title_document, related_narratives)` -- the raw results of the two
        `collection.get` calls, each requested with documents and metadatas.

    Exceptions raised by chromadb are not caught here.
    """
    # Named chroma_client so the module-level OpenAI `client` is not shadowed.
    chroma_client = chromadb.PersistentClient(path=f"chroma_tmp_part{part_number}")
    articles = chroma_client.get_collection(f"investopedia_articles_part{part_number}")

    fields = ("documents", "metadatas")

    # The title is addressed directly by its id ...
    title_document = articles.get(ids=[element_id], include=list(fields))
    # ... while narratives are found through their parent pointer.
    related_narratives = articles.get(
        where={"parent_title_id": element_id},
        include=list(fields),
    )

    return title_document, related_narratives

In [6]:
def query_llm(query, context, model="gpt-4o-mini", max_tokens=300, temperature=0.5):
    """Ask the chat model to answer `query` using `context` as grounding text.

    Uses the module-level OpenAI `client`.

    Args:
        query: The user's question.
        context: Text inserted into the prompt under "Context:".
        model: Chat model name passed to the API.
        max_tokens: Upper bound on the length of the generated answer.
        temperature: Sampling temperature passed to the API.

    Returns:
        The first choice's message text with surrounding whitespace removed,
        or an empty string when the API returns no text content.
    """
    prompt = (
        "Based on the following context, provide a detailed and expert-level response to the query. "
        "Ensure the response is well-structured, includes specific financial insights, comparisons to traditional financial instruments where relevant, and uses appropriate terminology.\n\n"
        "Context:\n"
        f"{context}\n\n"
        "Query:\n"
        f"{query}\n\n"
        "Response:"
    )
    response = client.chat.completions.create(
        messages=[
            {"role": "system", "content": "You are a finance and investment expert."},
            {"role": "user", "content": prompt},
        ],
        model=model,
        max_tokens=max_tokens,
        temperature=temperature,
    )
    # message.content is nullable in the API response; calling .strip() on
    # None would raise AttributeError, so fall back to an empty string.
    answer = response.choices[0].message.content
    return (answer or "").strip()


In [7]:
def aggregate_context_and_query_llm(query, top_k=10, max_part_number=10, distance_threshold=0.5, max_narratives=5):
    """Run the retrieve-then-generate flow for one query.

    Searches the FAISS index, looks every sufficiently close hit up in each
    Chroma partition, joins the retrieved narratives into a context string and
    asks the LLM to answer the query with that context.

    Args:
        query: The user's question.
        top_k: Number of neighbours requested from `search`.
        max_part_number: Partitions 1..max_part_number are probed for each hit.
        distance_threshold: Hits with a distance above this value are skipped.
        max_narratives: Maximum number of narratives joined into the context,
            taken in retrieval order. `None` keeps all of them.

    Returns:
        `(distances, all_titles, context, llm_response)`. `distances` is the
        unfiltered value returned by `search`; `all_titles` and `context` only
        reflect hits that passed the threshold.
    """
    distances, result_ids, _ = search(query, top_k=top_k)

    all_titles = []
    all_narratives = []

    for distance, element_id in zip(distances, result_ids):
        if distance > distance_threshold:
            continue  # Skip results with distance above the threshold

        # The partition that holds an id is not known up front, so probe them all.
        for part_number in range(1, max_part_number + 1):
            try:
                title_document, related_narratives = retrieve_related_documents(element_id, part_number)
                if title_document["documents"]:
                    all_titles.append(title_document["documents"][0])
                if related_narratives["documents"]:
                    all_narratives.extend(related_narratives["documents"])
            except Exception as e:
                # Best effort: a failing partition is reported and the loop
                # carries on with the next one.
                print(f"Error retrieving documents for element_id {element_id} in part {part_number}: {e}")

    # Keep the prompt short by capping how many narratives go into the context.
    context = "\n".join(all_narratives[:max_narratives])

    llm_response = query_llm(query, context)

    return distances, all_titles, context, llm_response

In [8]:
# Demo: run the full retrieve-then-generate flow for one question
query = "What is YoY growth and how is it calculated?"
distances, titles, context, llm_response = aggregate_context_and_query_llm(
    query,
    top_k=5,
    distance_threshold=0.7,
)

# Short values share a line with their label; long text goes on the line below it
print("Distances:", distances)
print("Titles:", titles)
print("Context for LLM:", context, sep="\n")
print("LLM Response:", llm_response, sep="\n")

Distances: [0.35899398 0.415286   0.4346206  0.4491641  0.457163  ]
Titles: ['\ufeffunderstand growth rate', 'understand econom growth rate', 'understand econom growth', 'econom growth rate definit formula exampl', 'understand real econom growth rate']
Context for LLM:
convers countri grown incom two consecut quarter consid expand
basic level growth rate use express annual chang variabl percentag exampl economi ’ growth rate deriv annual rate chang countri ’ gdp increas decreas rate growth use measur economi ’ recess expans incom within countri declin two consecut quarter consid recess
econom growth fundament goal countri allow improv wellb citizen invest variou area educ healthcar infrastructur also close monitor policymak busi investor make inform decis
posit econom growth rate signifi economi expand measur period often mean countri increas econom activ output growth often lead higher employ rate improv live standard greater opportun busi individu convers neg econom growth rate sugge