<a href="https://colab.research.google.com/github/robertheubanks/newaiengbootcamp/blob/main/Eubanks_AIEngBootCamp_Midterm_vFINAL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
### Let's install the libraries
!pip install -U -q langchain langchain-openai langchain_core langchain-community langchainhub openai

In [None]:
### Let's also install RAGAS
!pip install -qU ragas

In [None]:
### Let's load FAISS as our vector store and also the pdf loader that we will use to load the NVIDIA 10-k. we are also using Unstructured IOs Unstructured pdf loader
!pip install -qU faiss_cpu pymupdf pandas

In [None]:
### Ask for the OpenAI API key without echoing it, then hand the same value
### to the environment and to the `openai` module.
import os
import openai
from getpass import getpass

os.environ["OPENAI_API_KEY"] = getpass("Please provide your OpenAI Key: ")
openai.api_key = os.environ["OPENAI_API_KEY"]

Please provide your OpenAI Key: ··········


In [None]:
### Load the NVIDIA 10-K with the PyMuPDF-based LangChain loader
from langchain_community.document_loaders import PyMuPDFLoader

# Location of the filing; despite the variable name this is a URL, not a local path
pdf_path = "https://d18rn0p25nwr6d.cloudfront.net/CIK-0001045810/1cbe8fe7-e08a-46e3-8dcc-b429fc06c1a4.pdf"

# Create the loader for that location
loader = PyMuPDFLoader(pdf_path)

# Read the PDF into LangChain documents
documents = loader.load()



In [None]:
### Chunk the loaded documents: chunk size 700 with an overlap of 50.
### Both values are worth experimenting with to see how they affect output quality.
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=50)

# NOTE: `documents` is rebound to its own chunks here, so the un-split documents
# are only recoverable by calling the loader again.
documents = text_splitter.split_documents(documents)

In [None]:
### Confirm the split happened by counting the resulting chunks
len(documents)

624

In [None]:
### Embedding model for the baseline index: OpenAI's text-embedding-3-small
from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

In [None]:
### Build a FAISS vector store from the chunks, embedded with the model configured above
from langchain_community.vectorstores import FAISS

vector_store = FAISS.from_documents(documents, embeddings)

In [None]:
### Expose the vector store as a retriever (default retriever settings)
retriever = vector_store.as_retriever()

In [None]:
### Pull a ready-made retrieval-QA chat prompt from the LangChain Hub
### (used later by the document-stuffing chain, not by the baseline chain)
from langchain import hub

retrieval_qa_prompt = hub.pull("langchain-ai/retrieval-qa-chat")

In [None]:
### Show the template text of the first message in the pulled prompt
print(retrieval_qa_prompt.messages[0].prompt.template)

Answer any use questions based solely on the context below:

<context>
{context}
</context>


In [None]:
### Define our own prompt template for the baseline chain.
### It takes two variables, {context} and {question}, and tells the model to
### answer 'I don't know' when the context is not sufficient.
from langchain.prompts import ChatPromptTemplate

template = """Answer the question based only on the following context. If you cannot answer the question with the context, please respond with 'I don't know':

Context:
{context}

Question:
{question}
"""

prompt = ChatPromptTemplate.from_template(template)

In [None]:
### Build the baseline Q&A RAG chain.
### Invoke it with {"question": "<some user question>"}; it returns a dict with
### "response" (the LLM's message) and "context" (the retrieved documents).
from operator import itemgetter

from langchain_openai import ChatOpenAI
# NOTE(review): StrOutputParser is imported but not used in the cells visible here.
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

# temperature=0 to keep the answers as repeatable as the model allows
primary_qa_llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

retrieval_augmented_qa_chain = (
    # Step 1 - build a dict from the input:
    # "context"  : the value of the "question" key, piped into the retriever
    # "question" : the value of the "question" key, passed along as-is
    {"context": itemgetter("question") | retriever, "question": itemgetter("question")}
    # Step 2 - pass the dict through, re-assigning "context" to the value it already has
    #          (the dict leaves this step with the same "context" and "question" entries)
    | RunnablePassthrough.assign(context=itemgetter("context"))
    # Step 3 - build the output dict:
    # "response" : "context" and "question" format the prompt, which is piped into the LLM
    # "context"  : the retrieved documents from the previous step, kept so they can be
    #              inspected and evaluated later
    | {"response": prompt | primary_qa_llm, "context": itemgetter("context")}
)

In [None]:
### First spot-check question for the baseline chain
question = "Who is the E-VP, Operations - and how old are they?"

result = retrieval_augmented_qa_chain.invoke({"question" : question})

# "response" holds the LLM message; print only its text
print(result["response"].content)

Debora Shoquist is the Executive Vice President, Operations, and she is 69 years old.


In [None]:
### Second spot-check question for the baseline chain
question = "What is the gross carrying amount of Total Amortizable Intangible Assets for Jan 29, 2023?"

result = retrieval_augmented_qa_chain.invoke({"question" : question})

# "response" holds the LLM message; print only its text
print(result["response"].content)

$3,539 million


In [None]:
### Now we will do the RAGAS evaluation.
### We need to generate a synthetic test set, and for that we chunk the filing differently
### (larger chunks, more overlap) from the chunks indexed by the RAG pipeline.
### `documents` was already replaced by its 700-character chunks earlier, which a
### 1500-character splitter would leave effectively unchanged, so the source is loaded again.
### A separate splitter name keeps the pipeline's `text_splitter` intact.
eval_text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1500,
    chunk_overlap = 400
)

eval_documents = eval_text_splitter.split_documents(loader.load())

In [None]:
### Generate a synthetic test set with RAGAS
from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context

generator = TestsetGenerator.with_openai()

# Generate from the chunks prepared for evaluation (`eval_documents`),
# not from the chunks that were indexed for retrieval.
testset = generator.generate_with_langchain_docs(
    eval_documents,
    test_size=10,
    distributions={simple: 0.25, reasoning: 0.25, multi_context: 0.5},
)

embedding nodes:   0%|          | 0/1248 [00:00<?, ?it/s]



Generating:   0%|          | 0/10 [00:00<?, ?it/s]

In [None]:
### Now that we have questions, contexts and ground truths, convert the test set
### to a pandas DataFrame so we can start the evaluation
test_df = testset.to_pandas()


In [None]:
### Inspect the generated test set
test_df

Unnamed: 0,question,contexts,ground_truth,evolution_type,episode_done
0,What are the automotive safety standards that ...,[a wide range of intelligent functions into a ...,The DRIVE brand is designed and implemented ba...,simple,True
1,What potential risks and costs are associated ...,"[payment of lost profits, or injunctive relief...",Claims that our products or processes infringe...,simple,True
2,What reports does the Audit Committee receive ...,[security policies and practices and the inter...,The Audit Committee receives regular informati...,reasoning,True
3,What are the consequences of not protecting ou...,[Actions to adequately protect our IP rights c...,The consequences of not protecting our IP righ...,reasoning,True
4,"""What strategies has NVIDIA used to attract an...",[relating to human capital management.\nTo be ...,NVIDIA has used a strong employer brand and di...,multi_context,True
5,"""What advantages do vertical-specific optimiza...",[eases the deployment of NVIDIA accelerated co...,The advantages of vertical-specific optimizati...,multi_context,True
6,"""What are the potential consequences of licens...",[the USG’s export controls. Given the increasi...,The potential consequences of licensing requir...,multi_context,True
7,"""What are the GPU's strengths in the NVIDIA pl...",[Table of Contents\nAt the foundation of the N...,The GPU's strengths in the NVIDIA platform are...,multi_context,True
8,"What caused the change in Other, net in the An...","[Change in Other, net, compared to fiscal year...","The change in Other, net in the Annual Report ...",multi_context,True
9,What are some costs associated with changes to...,"[greater direct costs, including costs associa...",greater direct costs,simple,True


In [None]:
### Pull the generated questions and their ground-truth answers out as plain Python lists
test_questions = test_df["question"].tolist()
test_groundtruths = test_df["ground_truth"].tolist()

In [None]:
### Run every generated question through the baseline RAG chain and collect, per question,
### the answer text and the text of each retrieved document.
### Comprehension variables stay local, so the notebook-level `question` set by the
### spot-check cells is no longer overwritten by this cell.
baseline_responses = [
    retrieval_augmented_qa_chain.invoke({"question": test_question})
    for test_question in test_questions
]

answers = [item["response"].content for item in baseline_responses]
contexts = [
    [doc.page_content for doc in item["context"]]
    for item in baseline_responses
]

In [None]:
### Package the questions, answers, retrieved contexts and ground truths
### as a Hugging Face Dataset, which is what we pass to RAGAS below
from datasets import Dataset

response_dataset = Dataset.from_dict({
    "question" : test_questions,
    "answer" : answers,
    "contexts" : contexts,
    "ground_truth" : test_groundtruths
})

In [None]:
### Choose the RAGAS metrics; the same list is reused for every pipeline we evaluate
from ragas import evaluate
from ragas.metrics import answer_correctness, answer_relevancy, context_precision, context_recall, faithfulness

metrics = [faithfulness, answer_relevancy, context_recall, context_precision, answer_correctness]

In [None]:
### Evaluate the baseline pipeline's responses with the selected metrics
results = evaluate(response_dataset, metrics)

Evaluating:   0%|          | 0/50 [00:00<?, ?it/s]

In [None]:
### Aggregate baseline scores
results

{'faithfulness': 1.0000, 'answer_relevancy': 0.8362, 'context_recall': 0.9500, 'context_precision': 0.9667, 'answer_correctness': 0.7764}

In [None]:
### Per-question baseline scores
results_df = results.to_pandas()
results_df

Unnamed: 0,question,answer,contexts,ground_truth,faithfulness,answer_relevancy,context_recall,context_precision,answer_correctness
0,What are the automotive safety standards that ...,I don't know.,[a wide range of intelligent functions into a ...,The DRIVE brand is designed and implemented ba...,,0.0,1.0,0.833333,0.930157
1,What potential risks and costs are associated ...,The potential risks and costs associated with ...,"[payment of lost profits, or injunctive relief...",Claims that our products or processes infringe...,1.0,1.0,1.0,1.0,0.51809
2,What reports does the Audit Committee receive ...,The Audit Committee receives regular informati...,[security policies and practices and the inter...,The Audit Committee receives regular informati...,,0.883927,1.0,1.0,0.745943
3,What are the consequences of not protecting ou...,The consequences could include substantial cos...,[Actions to adequately protect our IP rights c...,The consequences of not protecting our IP righ...,1.0,0.958058,1.0,1.0,0.673558
4,"""What strategies has NVIDIA used to attract an...",NVIDIA has used a strong employer brand and di...,[relating to human capital management.\nTo be ...,NVIDIA has used a strong employer brand and di...,1.0,0.966365,1.0,1.0,0.749777
5,"""What advantages do vertical-specific optimiza...",Vertical-specific optimizations offer advantag...,[eases the deployment of NVIDIA accelerated co...,The advantages of vertical-specific optimizati...,1.0,0.946656,1.0,1.0,0.672276
6,"""What are the potential consequences of licens...",The potential consequences could include the c...,[business and results of operations.\nGovernme...,The potential consequences of licensing requir...,1.0,0.883287,1.0,1.0,0.502972
7,"""What are the GPU's strengths in the NVIDIA pl...",The GPU's strengths in the NVIDIA platform are...,[expanded to several other large and important...,The GPU's strengths in the NVIDIA platform are...,1.0,0.922929,0.5,0.916667,0.996298
8,"What caused the change in Other, net in the An...",Changes in value from non-affiliated investments.,"[Change in Other, net, compared to fiscal year...","The change in Other, net in the Annual Report ...",1.0,0.941508,1.0,1.0,0.975409
9,What are some costs associated with changes to...,greater direct costs,"[greater direct costs, including costs associa...",greater direct costs,1.0,0.858916,1.0,0.916667,1.0


In [None]:
### Now that we have a baseline, we can measure whether changes to the pipeline improve on it.
### First change: wrap the baseline retriever in a MultiQueryRetriever, driven by the same LLM
from langchain.retrievers import MultiQueryRetriever

advanced_retriever = MultiQueryRetriever.from_llm(retriever=retriever, llm=primary_qa_llm)

In [None]:
### Build a "stuff documents" chain from the LLM and the prompt pulled from the hub.
### NOTE(review): this uses `retrieval_qa_prompt`, whereas the baseline chain uses the custom
### `prompt`, so the revised pipeline differs from the baseline in prompt as well as retriever.
from langchain.chains.combine_documents import create_stuff_documents_chain

document_chain = create_stuff_documents_chain(primary_qa_llm, retrieval_qa_prompt)

In [None]:
### Combine the multi-query retriever with the document chain.
### This chain is invoked with {"input": ...}; the cells below read "answer" and "context" from its output
from langchain.chains import create_retrieval_chain

retrieval_chain = create_retrieval_chain(advanced_retriever, document_chain)

In [None]:
### Same first spot-check question, now through the revised chain
response = retrieval_chain.invoke({"input": "Who is the E-VP, Operations - and how old are they?"})

In [None]:
### Print the answer to the most recent spot-check question
print(response["answer"])

The Executive Vice President of Operations is Debora Shoquist, and she is 69 years old.


In [None]:
### Same second spot-check question, now through the revised chain
response = retrieval_chain.invoke({"input": "What is the gross carrying amount of Total Amortizable Intangible Assets for Jan 29, 2023?"})

In [None]:
### Print the answer to the most recent spot-check question
print(response["answer"])

The gross carrying amount of Total Amortizable Intangible Assets for Jan 29, 2023, is $3,539 million.


In [None]:
### Run the same test questions through the MultiQueryRetriever chain and collect, per question,
### the answer and the text of each retrieved document.
### Comprehension variables stay local, so the notebook-level `question` and `response`
### from the spot-check cells are not overwritten by this cell.
advanced_responses = [
    retrieval_chain.invoke({"input": test_question})
    for test_question in test_questions
]

answers = [item["answer"] for item in advanced_responses]
contexts = [
    [doc.page_content for doc in item["context"]]
    for item in advanced_responses
]

In [None]:
### Package the revised pipeline's outputs in the same layout as the baseline dataset
response_dataset_advanced_retrieval = Dataset.from_dict({
    "question" : test_questions,
    "answer" : answers,
    "contexts" : contexts,
    "ground_truth" : test_groundtruths
})

In [None]:
### Evaluate the revised pipeline with the same metrics as the baseline
advanced_retrieval_results = evaluate(response_dataset_advanced_retrieval, metrics)

Evaluating:   0%|          | 0/50 [00:00<?, ?it/s]

In [None]:
### Per-question scores for the revised pipeline
advanced_retrieval_results_df = advanced_retrieval_results.to_pandas()
advanced_retrieval_results_df

Unnamed: 0,question,answer,contexts,ground_truth,faithfulness,answer_relevancy,context_recall,context_precision,answer_correctness
0,What are the automotive safety standards that ...,The DRIVE brand is designed and implemented ba...,[a wide range of intelligent functions into a ...,The DRIVE brand is designed and implemented ba...,,0.925869,1.0,0.833333,1.0
1,What potential risks and costs are associated ...,Legal proceedings related to infringement clai...,"[payment of lost profits, or injunctive relief...",Claims that our products or processes infringe...,1.0,0.987443,1.0,1.0,0.698717
2,What reports does the Audit Committee receive ...,The Audit Committee receives regular informati...,[security policies and practices and the inter...,The Audit Committee receives regular informati...,1.0,0.882102,1.0,1.0,0.998912
3,What are the consequences of not protecting ou...,The consequences of not protecting IP rights a...,[Actions to adequately protect our IP rights c...,The consequences of not protecting our IP righ...,1.0,0.993693,1.0,1.0,0.924602
4,"""What strategies has NVIDIA used to attract an...",NVIDIA has employed strong employer branding a...,[relating to human capital management.\nTo be ...,NVIDIA has used a strong employer brand and di...,1.0,0.971732,1.0,1.0,0.539394
5,"""What advantages do vertical-specific optimiza...",Vertical-specific optimizations in NVIDIA's da...,[eases the deployment of NVIDIA accelerated co...,The advantages of vertical-specific optimizati...,1.0,0.89415,1.0,1.0,0.745269
6,"""What are the potential consequences of licens...",The potential consequences of licensing requir...,[business and results of operations.\nGovernme...,The potential consequences of licensing requir...,1.0,0.987574,1.0,1.0,0.959494
7,"""What are the GPU's strengths in the NVIDIA pl...",The GPU's strengths in the NVIDIA platform lie...,[Table of Contents\nAt the foundation of the N...,The GPU's strengths in the NVIDIA platform are...,,0.922929,0.5,0.916667,0.99621
8,"What caused the change in Other, net in the An...","The change in Other, net in the Annual Report ...","[Change in Other, net, compared to fiscal year...","The change in Other, net in the Annual Report ...",1.0,0.943029,1.0,1.0,0.744561
9,What are some costs associated with changes to...,Some costs associated with changes to manufact...,"[greater direct costs, including costs associa...",greater direct costs,1.0,0.991689,1.0,0.916667,0.207383


In [None]:
### Compare the revised pipeline against the baseline: aggregate baseline scores first
results

{'faithfulness': 1.0000, 'answer_relevancy': 0.8362, 'context_recall': 0.9500, 'context_precision': 0.9667, 'answer_correctness': 0.7764}

In [None]:
### Aggregate scores for the revised (MultiQueryRetriever) pipeline
advanced_retrieval_results

{'faithfulness': 1.0000, 'answer_relevancy': 0.9500, 'context_recall': 0.9500, 'context_precision': 0.9667, 'answer_correctness': 0.7815}

In [None]:
import pandas as pd

# One two-column frame per pipeline: the metric name and that pipeline's aggregate score
df_original = pd.DataFrame(list(results.items()), columns=['Metric', 'Baseline'])
df_comparison = pd.DataFrame(
    list(advanced_retrieval_results.items()),
    columns=['Metric', 'MultiQueryRetriever with Document Stuffing'],
)

# Line the two pipelines up by metric, then add the change relative to the baseline
df_merged = df_original.merge(df_comparison, on='Metric')
df_merged['Delta'] = (
    df_merged['MultiQueryRetriever with Document Stuffing'] - df_merged['Baseline']
)

df_merged

Unnamed: 0,Metric,Baseline,MultiQueryRetriever with Document Stuffing,Delta
0,faithfulness,1.0,1.0,0.0
1,answer_relevancy,0.836165,0.950021,0.1138563
2,context_recall,0.95,0.95,0.0
3,context_precision,0.966667,0.966667,2.833511e-12
4,answer_correctness,0.776448,0.781454,0.005006309


In [None]:
### Next experiment: swap the embedding model from text-embedding-3-small to
### text-embedding-3-large and see how the RAG pipeline's scores change
new_embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

In [None]:
### Re-index the same chunks with the larger embedding model.
### NOTE(review): this rebinds `vector_store`; the retrievers created earlier were built
### before this line, but re-running the earlier `vector_store.as_retriever()` cell after
### this one would pick up the new index.
vector_store = FAISS.from_documents(documents, new_embeddings)

In [None]:
### Retriever over the index built with the larger embedding model
new_retriever = vector_store.as_retriever()

In [None]:
### Wrap the new retriever in a MultiQueryRetriever, as was done for the previous experiment
new_advanced_retriever = MultiQueryRetriever.from_llm(retriever=new_retriever, llm=primary_qa_llm)

In [None]:
### Reuse the existing document chain; only the retriever changes in this experiment
new_retrieval_chain = create_retrieval_chain(new_advanced_retriever, document_chain)

In [None]:
### Run the same test questions through the chain backed by the larger embedding model and
### collect, per question, the answer and the text of each retrieved document.
### Comprehension variables stay local, so the notebook-level `question` and `response`
### are not overwritten by this cell.
new_advanced_responses = [
    new_retrieval_chain.invoke({"input": test_question})
    for test_question in test_questions
]

answers = [item["answer"] for item in new_advanced_responses]
contexts = [
    [doc.page_content for doc in item["context"]]
    for item in new_advanced_responses
]

In [None]:
### Package this experiment's outputs in the same layout as the earlier datasets
new_response_dataset_advanced_retrieval = Dataset.from_dict({
    "question" : test_questions,
    "answer" : answers,
    "contexts" : contexts,
    "ground_truth" : test_groundtruths
})

In [None]:
### Evaluate with the same metrics as the two earlier runs
new_advanced_retrieval_results = evaluate(new_response_dataset_advanced_retrieval, metrics)

Evaluating:   0%|          | 0/50 [00:00<?, ?it/s]

In [None]:
### Aggregate scores for the text-embedding-3-large run
new_advanced_retrieval_results

{'faithfulness': 0.9208, 'answer_relevancy': 0.9519, 'context_recall': 0.9500, 'context_precision': 0.9360, 'answer_correctness': 0.7535}

In [None]:
### Compare all three runs side by side.
### Both MultiQueryRetriever runs use OpenAI text-embedding-3 models: "small" for the first
### run and "large" for the second. The column labels name the model actually used in each
### run (no ADA embedding model appears anywhere in this notebook).
df_baseline = pd.DataFrame(list(results.items()), columns=['Metric', 'Baseline'])
df_original = pd.DataFrame(list(advanced_retrieval_results.items()), columns=['Metric', 'MultiQuery + TE3-small'])
df_comparison = pd.DataFrame(list(new_advanced_retrieval_results.items()), columns=['Metric', 'MultiQuery + TE3-large'])

# Join the two MultiQueryRetriever runs first, then attach the baseline, all keyed by metric
df_merged = pd.merge(df_original, df_comparison, on='Metric')
df_merged = pd.merge(df_baseline, df_merged, on="Metric")

# Positive deltas mean the large-embedding run scored higher
df_merged['Delta - TE3-large vs TE3-small'] = df_merged['MultiQuery + TE3-large'] - df_merged['MultiQuery + TE3-small']
df_merged['Delta - TE3-large vs Baseline'] = df_merged['MultiQuery + TE3-large'] - df_merged['Baseline']

df_merged

Unnamed: 0,Metric,Baseline,ADA,Text Embedding 3,Delta - TE3 -> ADA,Delta - TE3 -> Baseline
0,faithfulness,1.0,1.0,0.920833,-0.079167,-0.079167
1,answer_relevancy,0.836165,0.950021,0.951933,0.001912,0.115768
2,context_recall,0.95,0.95,0.95,0.0,0.0
3,context_precision,0.966667,0.966667,0.935952,-0.030714,-0.030714
4,answer_correctness,0.776448,0.781454,0.753478,-0.027977,-0.02297
