In [1]:
import os, time, glob
from dotenv import load_dotenv

from langchain_openai import ChatOpenAI
from langchain_core.messages import HumanMessage
from langchain_openai import OpenAIEmbeddings
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import Pinecone as langchain_pinecone
from pinecone import Pinecone, ServerlessSpec

import pandas as pd
import promptquality as pq
from tqdm import tqdm
tqdm.pandas()

from metrics import all_metrics
from qa_chain import get_qa_chain

load_dotenv("../.env")

True

In [2]:
documents = []
for file_path in glob.glob("../data/nvidia_10k_*.pdf"):
    print(file_path)
    loader = PyPDFLoader(file_path)
    documents.extend(loader.load_and_split())

len(documents)

../data/nvidia_10k_2023.pdf
../data/nvidia_10k_2022.pdf
../data/nvidia_10k_2021.pdf
../data/nvidia_10k_2024.pdf


701

In [None]:
def get_questions(text):
    questions = chat_model.invoke(
    [
        HumanMessage(
            content=f"""Your job is to generate only 1 short question from the given text such that it can be answered using the provided text. Use the exact info in the questions as mentioned in the text. There should not be duplicates questions. Return questions starting with - instead of numbers.

Text: {text}
Questions: """
        )
    ]
)
    questions = questions.content.replace("- ", "").split("\n")
    questions = list(filter(None, questions)) 
    return questions

text = documents[1].page_content
print(text)
chat_model = ChatOpenAI(model="gpt-3.5-turbo-0125", temperature=1.0)
get_questions(text)

In [None]:
df = pd.DataFrame({"text": [doc.page_content for doc in documents]})
df = df.sample(n=100, random_state=0)
df["questions"] = df.text.progress_apply(get_questions)
df.head()

In [None]:
df.to_csv("../data/nvidia_questions.csv", index=False)

In [3]:
df = pd.read_csv("../data/nvidia_questions.csv")
df["questions"] = df.questions.apply(eval)
df.head()

Unnamed: 0,text,questions
0,"Automotive market platform, COVID-19 did not h...",[In which regions did COVID-19 lead to an incr...
1,"that liabilities, while possible, are not prob...",[Can the possible loss or range of loss in leg...
2,Table of Contents\nItem 7. Management's Discus...,[What are NVIDIA's two operating segments as m...
3,to Note 12 of the Notes to the Consoli...,[What is the amount of long-term tax liabiliti...
4,(5) La Compañía y sus Afiliadas no son resp...,[Where are the offices of the Company register...


In [4]:
questions = df.explode("questions").questions.sample(n=100, random_state=1).tolist()
len(questions)

100

In [5]:
# instantiate a Pinecone client
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

project_name = "emb-model-eval"
temperature = 0.1

In [6]:
def rag_chain_executor(emb_model_name: str, dimensions: int, llm_model_name: str, emb_k: int, rerank_k: int) -> None:
    # initialise embedding model
    if "text-embedding-3" in emb_model_name:
        embeddings = OpenAIEmbeddings(model=emb_model_name, dimensions=dimensions)
    else:
        embeddings = HuggingFaceEmbeddings(model_name=emb_model_name, encode_kwargs = {'normalize_embeddings': True})
        
    index_name = f"{emb_model_name}-{dimensions}".lower()
    
    # First, check if our index already exists and delete stale index
    if index_name in [index_info['name'] for index_info in pc.list_indexes()]:
        pc.delete_index(index_name)

    # create a new index
    pc.create_index(name=index_name, metric="cosine", dimension=dimensions, 
                    spec=ServerlessSpec(
                    cloud="aws",
                    region="us-west-2"
                ) )
    time.sleep(10)
    
    # index the documents
    _ = langchain_pinecone.from_documents(documents, embeddings, index_name=index_name)
    time.sleep(10)
    
    # load qa chain 
    qa = get_qa_chain(embeddings, index_name, emb_k, rerank_k, llm_model_name, temperature)
    
    # tags to be kept in galileo run
    run_name = f"{index_name}-emb-k-{emb_k}-rerank-k-{rerank_k}"
    index_name_tag = pq.RunTag(key="Index config", value=index_name, tag_type=pq.TagType.RAG)
    encoder_model_name_tag = pq.RunTag(key="Encoder", value=emb_model_name, tag_type=pq.TagType.RAG)
    llm_model_name_tag = pq.RunTag(key="LLM", value=llm_model_name, tag_type=pq.TagType.RAG)
    dimension_tag = pq.RunTag(key="Dimension", value=str(dimensions), tag_type=pq.TagType.RAG)
    emb_k_tag = pq.RunTag(key="Emb k", value=str(emb_k), tag_type=pq.TagType.RAG)
    rerank_k_tag = pq.RunTag(key="Rerank k", value=str(rerank_k), tag_type=pq.TagType.RAG)

    evaluate_handler = pq.GalileoPromptCallback(project_name=project_name, run_name=run_name, scorers=all_metrics, run_tags=[encoder_model_name_tag, llm_model_name_tag, index_name_tag, dimension_tag, emb_k_tag, rerank_k_tag])

    # run chain with questions to generate the answers
    print("Ready to ask!")
    for i, q in enumerate(tqdm(questions)):
        print(f"Question {i}: ", q)
        print(qa.invoke(q, config=dict(callbacks=[evaluate_handler])))
        print("\n\n")

    evaluate_handler.finish()
    
pq.login("console.demo.rungalileo.io")

👋 You have logged into 🔭 Galileo (https://console.demo.rungalileo.io/) as pratik@rungalileo.io.


Config(console_url=Url('https://console.demo.rungalileo.io/'), username='pratik@rungalileo.io', password=SecretStr('**********'), api_key=None, token=SecretStr('**********'), current_user='pratik@rungalileo.io', current_project_id=None, current_project_name=None, current_run_id=None, current_run_name=None, current_run_url=None, current_run_task_type=None, current_template_id=None, current_template_name=None, current_template_version_id=None, current_template_version=None, current_template=None, current_dataset_id=None, current_job_id=None, api_url=Url('https://api.demo.rungalileo.io/'))

In [1]:
pq.sweep(
    rag_chain_executor,
    {
        "emb_model_name": ["text-embedding-3-small"],
        "dimensions": [384],
        "llm_model_name": ["gpt-3.5-turbo-0125"],
        "emb_k": [10],
        "rerank_k": [3]
    },
)