## RAG Them All!


### Imports


In [1]:
import os
import pandas as pd
import numpy as np
import pickle
import faiss

from glob import glob
from tqdm import tqdm
from llama_index.core import VectorStoreIndex, get_response_synthesizer, StorageContext
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.vector_stores.faiss import FaissVectorStore
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.ollama import Ollama
from llama_index.core import Settings
from llama_index.core import (
    load_index_from_storage,
    load_indices_from_storage,
    load_graph_from_storage,
)
from llama_index.core import PromptTemplate
from llama_index.core.response_synthesizers import TreeSummarize
from llama_index.core import Settings
from trulens_eval import Tru
from trulens_eval import TruLlama
from trulens_eval import Feedback
from trulens_eval.feedback import GroundTruthAgreement
from trulens_eval import Feedback
from trulens_eval.app import App

### Configurations


In [2]:
os.environ["OPENAI_API_KEY"] = OPENAI_API_TOKEN  # Set OpenAI Token

In [3]:
# Loading Test Data
test_data = pd.read_csv("../data/financial-qa-dataset/financial_qna_with_metadata.csv")
alphabet_10k_question_df = (
    test_data[(test_data["Entity"] == "Alphabet") & (test_data["Year"] == 2023)]
    .copy()
    .reset_index(drop=True)
)

In [4]:
alphabet_10k_question_df = (
    test_data[(test_data["Entity"] == "Alphabet") & (test_data["Year"] == 2023)]
    .copy()
    .reset_index(drop=True)
)
print(alphabet_10k_question_df.shape[0])
alphabet_10k_question_df.head()

30


Unnamed: 0,Questions,Answers,Contexts,Document,Page_no,Year,Sector,Entity,Document_type,Quarter
0,What was the value of Alphabet's total current...,"$171,530 million",Alphabet Inc.\nCONSOLIDATED BALANCE SHEETS\n(i...,goog-10-k-2023.pdf,page_0,2023,Technology,Alphabet,annual report,
1,What was the total value of Alphabet's market...,"$86,868 million",Alphabet Inc.\nCONSOLIDATED BALANCE SHEETS\n(i...,goog-10-k-2023.pdf,page_0,2023,Technology,Alphabet,annual report,
2,What was the value of Alphabet's goodwill in 2...,"$28,960 million",Alphabet Inc.\nCONSOLIDATED BALANCE SHEETS\n(i...,goog-10-k-2023.pdf,page_0,2023,Technology,Alphabet,annual report,
3,What amount of long-term debt did Alphabet hav...,"$13,253 million",Alphabet Inc.\nCONSOLIDATED BALANCE SHEETS\n(i...,goog-10-k-2023.pdf,page_0,2023,Technology,Alphabet,annual report,
4,What was the total value of Alphabet's accrued...,"$37,866 million",Alphabet Inc.\nCONSOLIDATED BALANCE SHEETS\n(i...,goog-10-k-2023.pdf,page_0,2023,Technology,Alphabet,annual report,


### Experimentation Config


In [5]:
from pprint import pprint

exps = [
    # ["Exp#1: GPT-4o + ada002",
    #      OpenAI(model="gpt-4o", temperature=0),
    #      OpenAIEmbedding(model='text-embedding-ada-002')
    # ],
    [
        "Exp#2 GPT-4o-mini + ada002",
        OpenAI(model="gpt-4o-mini"),
        OpenAIEmbedding(model="text-embedding-ada-002"),
    ],
    [
        "Exp#3: Llama 3.1 + ada002",
        Ollama(model="llama3.1:latest", request_timeout=120.0),
        OpenAIEmbedding(model="text-embedding-ada-002"),
    ],
    # ["Exp#4: Llama 3.1 + fin-invest",
    #      Ollama(model="llama3.1:latest", request_timeout=120.0),
    #      HuggingFaceEmbedding(model_name="FinLang/finance-embeddings-investopedia")
    # ],
    [
        "Exp#5: Llama 3.1 + bge-large",
        Ollama(model="llama3.1:latest", request_timeout=220.0),
        HuggingFaceEmbedding(model_name="BAAI/bge-large-en-v1.5"),
    ],
    # ["Exp#6: Gemma 2 + fin-invest",
    #      Ollama(model="gemma2", request_timeout=120.0),
    #      HuggingFaceEmbedding(model_name="FinLang/finance-embeddings-investopedia")
    # ],
    # ["Exp#7: Gemma 2 + bge-large",
    #      Ollama(model="gemma2", request_timeout=120.0),
    #      HuggingFaceEmbedding(model_name="BAAI/bge-large-en-v1.5")
    # ],
    # ["Exp#8: Gemma 2 + ada002",
    #      Ollama(model="gemma2", request_timeout=120.0),
    #      OpenAIEmbedding(model_name="text-embedding-ada-002")
    # ],
    # ["Exp#9: Mistral + ada002",
    #      Ollama(model="mistral", request_timeout=120.0),
    #      OpenAIEmbedding(model_name="text-embedding-ada-002")
    # ],
]
# pprint([x for x in exps])

In [6]:
len(exps)

3

### TruLens Db Init


In [7]:
tru = Tru()
tru.reset_database()

🦑 Tru initialized with db url sqlite:///default.sqlite .
🛑 Secret keys may be written to the database. See the `database_redact_keys` option of `Tru` to prevent this.


## Experiment Runs


In [8]:
qa_prompt_tmpl = """You are a financial expert on 10k annual company reportings. Answer the following question. Answer 2-3 words only:

    Question: {query_str}\n
    Context:\n{context_str}\n
    Answer: 
"""
qa_prompt = PromptTemplate(qa_prompt_tmpl)


for exp_name, llm, emb_model in exps:
    print("=================================================")
    print(f"       Running {exp_name}        ")
    print("=================================================")

    Settings.llm = llm
    Settings.embed_model = emb_model

    try:

        if emb_model.model_name == "text-embedding-ada-002":

            try:
                # load index from disk
                print(">> Loading index from local")
                vector_store = FaissVectorStore.from_persist_dir(
                    "./resources/storage_openai"
                )
                storage_context = StorageContext.from_defaults(
                    vector_store=vector_store, persist_dir="./resources/storage_openai"
                )
                index = load_index_from_storage(storage_context=storage_context)
                print("Index loaded")

            except:
                # dimensions of text-ada-embedding-002
                print(">> Loading index from local failed. Recomputing index...")
                dim = 1536
                faiss_index = faiss.IndexFlatL2(dim)

                vector_store = FaissVectorStore(faiss_index=faiss_index)
                storage_context = StorageContext.from_defaults(
                    vector_store=vector_store
                )
                index = VectorStoreIndex.from_documents(
                    documents, storage_context=storage_context
                )
                index.storage_context.persist("./resources/storage_openai")
                print("Index loaded")

        elif emb_model.model_name == "BAAI/bge-large-en-v1.5":
            try:
                # load index from disk
                print(">> Loading index from local")
                vector_store = FaissVectorStore.from_persist_dir(
                    "./resources/storage_oss"
                )
                storage_context = StorageContext.from_defaults(
                    vector_store=vector_store, persist_dir="./resources/storage_oss"
                )
                index = load_index_from_storage(storage_context=storage_context)
                print("Index loaded")
            except:
                print(">> Loading index from local failed. Recomputing index...")
                dim = 1024
                faiss_index = faiss.IndexFlatL2(dim)
                vector_store = FaissVectorStore(faiss_index=faiss_index)
                storage_context = StorageContext.from_defaults(
                    vector_store=vector_store
                )
                index = VectorStoreIndex.from_documents(
                    documents, storage_context=storage_context
                )
                index.storage_context.persist("./resources/storage_oss")
                print("Index loaded")

        elif emb_model.model_name == "FinLang/finance-embeddings-investopedia":
            try:
                # load index from disk
                print(">> Loading index from local")
                vector_store = FaissVectorStore.from_persist_dir(
                    "./resources/storage_fin_oss"
                )
                storage_context = StorageContext.from_defaults(
                    vector_store=vector_store, persist_dir="./resources/storage_fin_oss"
                )
                index = load_index_from_storage(storage_context=storage_context)
                print("Index loaded")
            except:
                print(">> Loading index from local failed. Recomputing index...")
                dim = 768
                faiss_index = faiss.IndexFlatL2(dim)
                vector_store = FaissVectorStore(faiss_index=faiss_index)
                storage_context = StorageContext.from_defaults(
                    vector_store=vector_store
                )
                index = VectorStoreIndex.from_documents(
                    documents, storage_context=storage_context
                )
                print("Saving index")
                index.storage_context.persist("./resources/storage_fin_oss")
                print("Index loaded")

        else:
            print("> Index not loading. Please check")
            break

        # configure retriever
        retriever = VectorIndexRetriever(
            index=index,
            similarity_top_k=8,
        )

        # configure response synthesizer
        response_synthesizer = get_response_synthesizer(
            summary_template=qa_prompt,
            response_mode="tree_summarize",
        )

        # assemble query engine
        query_engine = RetrieverQueryEngine(
            retriever=retriever,
            response_synthesizer=response_synthesizer,
        )

        # Initialize provider class
        from trulens_eval.feedback.provider import OpenAI as LLMJudge

        provider = LLMJudge(model_engine="gpt-4o")

        # select context to be used in feedback. the location of context is app specific.
        context = App.select_context(query_engine)

        # Groundtruth SME data
        golden_set = test_data.rename(
            columns={"Questions": "query", "Answers": "response"}
        )[["query", "response"]].to_dict(orient="records")

        # On Actual Groundtruth data
        f_groundtruth = Feedback(
            GroundTruthAgreement(golden_set).agreement_measure, name="Ground Truth"
        ).on_input_output()

        # Define a groundedness feedback function
        f_groundedness = (
            Feedback(
                provider.groundedness_measure_with_cot_reasons, name="Groundedness"
            )
            .on(context.collect())  # collect context chunks into a list
            .on_output()
        )

        # Question/answer relevance between overall question and answer.
        f_answer_relevance = Feedback(
            provider.relevance_with_cot_reasons, name="Answer Relevance"
        ).on_input_output()
        # Question/statement relevance between question and each context chunk.
        f_context_relevance = (
            Feedback(
                provider.context_relevance_with_cot_reasons, name="Context Relevance"
            )
            .on_input()
            .on(context)
            .aggregate(np.mean)
        )

        tru_recorder = TruLlama(
            query_engine,
            app_id=exp_name,
            feedbacks=[f_groundtruth, f_groundedness, f_context_relevance],
        )
        # with tru_recorder as recording:
        #     print(query_engine.query("What was the value of Alphabet's total current assets as of December 31, 2023?"))
        # print("\n")
        with tru_recorder as recording:
            rag_answers = []
            for idx, row in tqdm(
                alphabet_10k_question_df.iterrows(),
                total=alphabet_10k_question_df.shape[0],
            ):
                question = row.Questions
                response = query_engine.query(question)
                try:
                    rag_answers.append(
                        [
                            question,
                            [response.response],
                            [[x.text] for x in response.source_nodes],
                        ]
                    )
                except:
                    rag_answers.append([question, ["NA"], []])

    except Exception as e:
        print(f">> Failed for {exp_name}")

       Running Exp#2 GPT-4o-mini + ada002        
>> Loading index from local
Index loaded
✅ In Ground Truth, input prompt will be set to __record__.main_input or `Select.RecordInput` .
✅ In Ground Truth, input response will be set to __record__.main_output or `Select.RecordOutput` .
✅ In Groundedness, input source will be set to __record__.app.query.rets.source_nodes[:].node.text.collect() .
✅ In Groundedness, input statement will be set to __record__.main_output or `Select.RecordOutput` .
✅ In Answer Relevance, input prompt will be set to __record__.main_input or `Select.RecordInput` .
✅ In Answer Relevance, input response will be set to __record__.main_output or `Select.RecordOutput` .
✅ In Context Relevance, input question will be set to __record__.main_input or `Select.RecordInput` .
✅ In Context Relevance, input context will be set to __record__.app.query.rets.source_nodes[:].node.text .


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 30/30 [01:05<00:00,  2.19s/it]


       Running Exp#3: Llama 3.1 + ada002        
>> Loading index from local
Index loaded
✅ In Ground Truth, input prompt will be set to __record__.main_input or `Select.RecordInput` .
✅ In Ground Truth, input response will be set to __record__.main_output or `Select.RecordOutput` .
✅ In Groundedness, input source will be set to __record__.app.query.rets.source_nodes[:].node.text.collect() .
✅ In Groundedness, input statement will be set to __record__.main_output or `Select.RecordOutput` .
✅ In Answer Relevance, input prompt will be set to __record__.main_input or `Select.RecordInput` .
✅ In Answer Relevance, input response will be set to __record__.main_output or `Select.RecordOutput` .
✅ In Context Relevance, input question will be set to __record__.main_input or `Select.RecordInput` .
✅ In Context Relevance, input context will be set to __record__.app.query.rets.source_nodes[:].node.text .


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 30/30 [10:09<00:00, 20.32s/it]


       Running Exp#5: Llama 3.1 + bge-large        
>> Loading index from local
Index loaded
✅ In Ground Truth, input prompt will be set to __record__.main_input or `Select.RecordInput` .
✅ In Ground Truth, input response will be set to __record__.main_output or `Select.RecordOutput` .
✅ In Groundedness, input source will be set to __record__.app.query.rets.source_nodes[:].node.text.collect() .
✅ In Groundedness, input statement will be set to __record__.main_output or `Select.RecordOutput` .
✅ In Answer Relevance, input prompt will be set to __record__.main_input or `Select.RecordInput` .
✅ In Answer Relevance, input response will be set to __record__.main_output or `Select.RecordOutput` .
✅ In Context Relevance, input question will be set to __record__.main_input or `Select.RecordInput` .
✅ In Context Relevance, input context will be set to __record__.app.query.rets.source_nodes[:].node.text .


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 30/30 [15:37<00:00, 31.25s/it]


## Leaderboard


In [11]:
tru.get_leaderboard()

Unnamed: 0_level_0,Context Relevance,Groundedness,Ground Truth,latency,total_cost
app_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Exp#3: Llama 3.1 + ada002,0.344167,0.633333,0.676667,19.466667,0.0
Exp#2 GPT-4o-mini + ada002,0.34375,0.966667,0.9,19.466667,0.000764
Exp#5: Llama 3.1 + bge-large,0.279583,0.596667,0.555172,19.466667,0.0


## Dashboard


In [12]:
tru.run_dashboard()

Starting dashboard ...
Config file already exists. Skipping writing process.
Credentials file already exists. Skipping writing process.
Dashboard already running at path:   Network URL: http://192.168.1.2:60486



<Popen: returncode: None args: ['streamlit', 'run', '--server.headless=True'...>