In [1]:
!pip install -U -q langchain langchain-openai langchain_core langchain-community langchainhub openai
!pip install -qU ragas
!pip install -qU qdrant-client pymupdf pandas

In [2]:
import os
import openai
from getpass import getpass

openai.api_key = getpass("Please provide your OpenAI Key: ")
os.environ["OPENAI_API_KEY"] = openai.api_key

In [3]:
from langchain_community.document_loaders import PyMuPDFLoader

loader = PyMuPDFLoader(
    "/Users/SKTL/Desktop/Coding/VS-Code/AIE2-midterm/meta_pdf.pdf",
)

documents = loader.load()

In [4]:
documents[0].metadata

{'source': '/Users/SKTL/Desktop/Coding/VS-Code/AIE2-midterm/meta_pdf.pdf',
 'file_path': '/Users/SKTL/Desktop/Coding/VS-Code/AIE2-midterm/meta_pdf.pdf',
 'page': 0,
 'total_pages': 147,
 'format': 'PDF 1.4',
 'title': '0001326801-24-000012',
 'author': 'EDGAR® Online LLC, a subsidiary of OTC Markets Group',
 'subject': 'Form 10-K filed on 2024-02-02 for the period ending 2023-12-31',
 'keywords': '0001326801-24-000012; ; 10-K',
 'creator': 'EDGAR Filing HTML Converter',
 'producer': 'EDGRpdf Service w/ EO.Pdf 22.0.40.0',
 'creationDate': "D:20240202060356-05'00'",
 'modDate': "D:20240202060413-05'00'",
 'trapped': '',
 'encryption': 'Standard V2 R3 128-bit RC4'}

In [5]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 200,
    chunk_overlap = 50
)

documents = text_splitter.split_documents(documents)

In [6]:
from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings(
    model="text-embedding-3-small"
)

In [7]:
from langchain_community.vectorstores import Qdrant

qdrant_vector_store = Qdrant.from_documents(
    documents,
    embeddings,
    location=":memory:",
    collection_name="meta-10-k-filings",
)

In [8]:
retriever = qdrant_vector_store.as_retriever()

In [9]:
retrieved_documents = retriever.invoke("What was the total value of 'Cash and cash equivalents' as of December 31, 2023?")
retrieved_documents = retriever.invoke("Who are Meta's 'Directors' (i.e., members of the Board of Directors)?")

In [10]:
for doc in retrieved_documents:
  print(doc)

page_content='on the Nasdaq Global Select Market under the symbol "META." Our principal executive offices are located at 1 Meta Way, Menlo Park, California 94025, and\nour telephone number is (650) 543-4800.' metadata={'source': '/Users/SKTL/Desktop/Coding/VS-Code/AIE2-midterm/meta_pdf.pdf', 'file_path': '/Users/SKTL/Desktop/Coding/VS-Code/AIE2-midterm/meta_pdf.pdf', 'page': 13, 'total_pages': 147, 'format': 'PDF 1.4', 'title': '0001326801-24-000012', 'author': 'EDGAR® Online LLC, a subsidiary of OTC Markets Group', 'subject': 'Form 10-K filed on 2024-02-02 for the period ending 2023-12-31', 'keywords': '0001326801-24-000012; ; 10-K', 'creator': 'EDGAR Filing HTML Converter', 'producer': 'EDGRpdf Service w/ EO.Pdf 22.0.40.0', 'creationDate': "D:20240202060356-05'00'", 'modDate': "D:20240202060413-05'00'", 'trapped': '', 'encryption': 'Standard V2 R3 128-bit RC4', '_id': 'a5851c4498684b1ebed0ba7d48333205', '_collection_name': 'meta-10-k-filings'}
page_content='Unless expressly indicated

In [30]:
from langchain import hub

retrieval_qa_prompt = hub.pull("langchain-ai/retrieval-qa-chat")

In [11]:
from langchain.prompts import ChatPromptTemplate

template = """Answer the question based only on the following context. If you cannot answer the question with the context, please respond with 'I don't know':

Context:
{context}

Question:
{question}
"""

prompt = ChatPromptTemplate.from_template(template)

In [12]:
from operator import itemgetter

from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

primary_qa_llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

retrieval_augmented_qa_chain = (
    # INVOKE CHAIN WITH: {"question" : "<<SOME USER QUESTION>>"}
    # "question" : populated by getting the value of the "question" key
    # "context"  : populated by getting the value of the "question" key and chaining it into the base_retriever
    {"context": itemgetter("question") | retriever, "question": itemgetter("question")}
    # "context"  : is assigned to a RunnablePassthrough object (will not be called or considered in the next step)
    #              by getting the value of the "context" key from the previous step
    | RunnablePassthrough.assign(context=itemgetter("context"))
    # "response" : the "context" and "question" values are used to format our prompt object and then piped
    #              into the LLM and stored in a key called "response"
    # "context"  : populated by getting the value of the "context" key from the previous step
    | {"response": prompt | primary_qa_llm, "context": itemgetter("context")}
)

In [13]:
questions = ["What was the total value of 'Cash and cash equivalents' as of December 31, 2023?", "Who are Meta's 'Directors' (i.e., members of the Board of Directors)"]

for question in questions:
  answer = retrieval_augmented_qa_chain.invoke({"question" : question})["response"].content
  print(f"Question: {question} / Answer: {answer}")

Question: What was the total value of 'Cash and cash equivalents' as of December 31, 2023? / Answer: $41,862 billion
Question: Who are Meta's 'Directors' (i.e., members of the Board of Directors) / Answer: I don't know.


In [14]:
loader = PyMuPDFLoader(
    "/Users/SKTL/Desktop/Coding/VS-Code/AIE2-midterm/meta_pdf.pdf",
)

eval_documents = loader.load()

text_splitter_eval = RecursiveCharacterTextSplitter(
    chunk_size = 600,
    chunk_overlap = 50
)

eval_documents = text_splitter_eval.split_documents(eval_documents)

In [17]:
from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

generator_llm = ChatOpenAI(model="gpt-3.5-turbo-16k")
critic_llm = ChatOpenAI(model="gpt-3.5-turbo")
embeddings = OpenAIEmbeddings()

generator = TestsetGenerator.from_langchain(
    generator_llm,
    critic_llm,
    embeddings
)

distributions = {
    simple: 0.5,
    multi_context: 0.4,
    reasoning: 0.1
}

testset = generator.generate_with_langchain_docs(eval_documents, 20, distributions, is_async = False)
testset.to_pandas()

embedding nodes:   0%|          | 0/2202 [00:00<?, ?it/s]

Filename and doc_id are the same for all nodes.


Generating:   0%|          | 0/20 [00:00<?, ?it/s]

Unnamed: 0,question,contexts,ground_truth,evolution_type,metadata,episode_done
0,What were the outcomes of the 2022 Restructuri...,"[employees across the FoA and RL segments, and...","As of December 31, 2023, the outcomes of the 2...",simple,[{'source': '/Users/SKTL/Desktop/Coding/VS-Cod...,True
1,How has the trading price of the Class A commo...,[The trading price of our Class A common stock...,The trading price of our Class A common stock ...,simple,[{'source': '/Users/SKTL/Desktop/Coding/VS-Cod...,True
2,What is the authorized number of shares for Cl...,[Table of Contents\nNote 13. Stockholders' Equ...,"5,000 million shares of Class A common stock a...",simple,[{'source': '/Users/SKTL/Desktop/Coding/VS-Cod...,True
3,What is the amount of common stock issued acco...,"[2,614 \n— \n64,444 \n(3,530)\n64,799 \n125,71...",65,simple,[{'source': '/Users/SKTL/Desktop/Coding/VS-Cod...,True
4,"What is the meaning of ""terms and conditions"" ...",[U.S. federal securities laws in accordance wi...,,simple,[{'source': '/Users/SKTL/Desktop/Coding/VS-Cod...,True
5,What circumstances would not be considered a F...,"[period.\nFor purposes of this Policy, a Finan...",An out-of-period adjustment that is immaterial...,simple,[{'source': '/Users/SKTL/Desktop/Coding/VS-Cod...,True
6,From which countries did the majority of reven...,[Accounts receivable are typically unsecured a...,"Western Europe, China, Brazil, Australia, Cana...",simple,[{'source': '/Users/SKTL/Desktop/Coding/VS-Cod...,True
7,What is the significance of the reporting date...,[Table of Contents\n \n \nFair Value Measureme...,The significance of the reporting date in fair...,simple,[{'source': '/Users/SKTL/Desktop/Coding/VS-Cod...,True
8,What factors could lead to limited availabilit...,[•\nusers adopt new technologies where our pro...,,simple,[{'source': '/Users/SKTL/Desktop/Coding/VS-Cod...,True
9,How could Delaware law and provisions in our c...,[Delaware law and provisions in our certificat...,Delaware law and provisions in the certificate...,simple,[{'source': '/Users/SKTL/Desktop/Coding/VS-Cod...,True


In [18]:
test_df = testset.to_pandas()

In [19]:
test_df

Unnamed: 0,question,contexts,ground_truth,evolution_type,metadata,episode_done
0,What were the outcomes of the 2022 Restructuri...,"[employees across the FoA and RL segments, and...","As of December 31, 2023, the outcomes of the 2...",simple,[{'source': '/Users/SKTL/Desktop/Coding/VS-Cod...,True
1,How has the trading price of the Class A commo...,[The trading price of our Class A common stock...,The trading price of our Class A common stock ...,simple,[{'source': '/Users/SKTL/Desktop/Coding/VS-Cod...,True
2,What is the authorized number of shares for Cl...,[Table of Contents\nNote 13. Stockholders' Equ...,"5,000 million shares of Class A common stock a...",simple,[{'source': '/Users/SKTL/Desktop/Coding/VS-Cod...,True
3,What is the amount of common stock issued acco...,"[2,614 \n— \n64,444 \n(3,530)\n64,799 \n125,71...",65,simple,[{'source': '/Users/SKTL/Desktop/Coding/VS-Cod...,True
4,"What is the meaning of ""terms and conditions"" ...",[U.S. federal securities laws in accordance wi...,,simple,[{'source': '/Users/SKTL/Desktop/Coding/VS-Cod...,True
5,What circumstances would not be considered a F...,"[period.\nFor purposes of this Policy, a Finan...",An out-of-period adjustment that is immaterial...,simple,[{'source': '/Users/SKTL/Desktop/Coding/VS-Cod...,True
6,From which countries did the majority of reven...,[Accounts receivable are typically unsecured a...,"Western Europe, China, Brazil, Australia, Cana...",simple,[{'source': '/Users/SKTL/Desktop/Coding/VS-Cod...,True
7,What is the significance of the reporting date...,[Table of Contents\n \n \nFair Value Measureme...,The significance of the reporting date in fair...,simple,[{'source': '/Users/SKTL/Desktop/Coding/VS-Cod...,True
8,What factors could lead to limited availabilit...,[•\nusers adopt new technologies where our pro...,,simple,[{'source': '/Users/SKTL/Desktop/Coding/VS-Cod...,True
9,How could Delaware law and provisions in our c...,[Delaware law and provisions in our certificat...,Delaware law and provisions in the certificate...,simple,[{'source': '/Users/SKTL/Desktop/Coding/VS-Cod...,True


In [20]:
test_questions = test_df["question"].values.tolist()
test_groundtruths = test_df["ground_truth"].values.tolist()

In [21]:
answers = []
contexts = []

for question in test_questions:
  response = retrieval_augmented_qa_chain.invoke({"question" : question})
  answers.append(response["response"].content)
  contexts.append([context.page_content for context in response["context"]])

In [22]:
from datasets import Dataset

response_dataset = Dataset.from_dict({
    "question" : test_questions,
    "answer" : answers,
    "contexts" : contexts,
    "ground_truth" : test_groundtruths
})

In [23]:
response_dataset[0]

{'question': 'What were the outcomes of the 2022 Restructuring in terms of data center projects and employee layoffs?',
 'answer': 'The outcomes of the 2022 Restructuring were the completion of data center initiatives and employee layoffs.',
 'contexts': ['(the 2022 Restructuring). As of December\xa031, 2023, we have completed the data center initiatives and the 2022 employee layoffs, and substantially completed\nthe facilities consolidation initiatives.',
  '2023, we have completed the data center initiatives and the employee layoffs, and substantially completed the facilities consolidation initiatives.',
  'Note\xa03. Restructuring\n2023 Restructuring\nIn March 2023, we announced three rounds of planned layoffs to further reduce our company size by approximately 10,000 employees across the Family',
  'employees across the FoA and RL segments, and a pivot towards a next generation data center design, including cancellation of multiple data center projects'],
 'ground_truth': 'As of De

In [24]:
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    answer_correctness,
    context_recall,
    context_precision,
)

metrics = [
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision,
    answer_correctness,
]

In [25]:
results = evaluate(response_dataset, metrics)

Evaluating:   0%|          | 0/100 [00:00<?, ?it/s]

No statements were generated from the answer.
No statements were generated from the answer.
No statements were generated from the answer.
No statements were generated from the answer.
No statements were generated from the answer.


In [26]:
results

{'faithfulness': 0.7944, 'answer_relevancy': 0.6958, 'context_recall': 0.7150, 'context_precision': 0.7431, 'answer_correctness': 0.5698}

In [27]:
results_df = results.to_pandas()
results_df

Unnamed: 0,question,answer,contexts,ground_truth,faithfulness,answer_relevancy,context_recall,context_precision,answer_correctness
0,What were the outcomes of the 2022 Restructuri...,The outcomes of the 2022 Restructuring were th...,"[(the 2022 Restructuring). As of December 31, ...","As of December 31, 2023, the outcomes of the 2...",0.0,0.917656,1.0,0.916667,0.743913
1,How has the trading price of the Class A commo...,The trading price of the Class A common stock ...,[The trading price of our Class A common stock...,The trading price of our Class A common stock ...,1.0,0.97935,1.0,1.0,0.846975
2,What is the authorized number of shares for Cl...,"Class A: 5,000 million shares\nClass B: 4,141 ...","[Stockholders' equity:\n \n \nCommon stock, $0...","5,000 million shares of Class A common stock a...",0.5,0.934909,1.0,1.0,0.839575
3,What is the amount of common stock issued acco...,I don't know.,"[Stockholders' equity:\n \n \nCommon stock, $0...",65,,0.0,1.0,0.805556,0.193986
4,"What is the meaning of ""terms and conditions"" ...",I don't know.,[deemed to include the restrictions imposed he...,,,0.0,0.0,0.0,0.198202
5,What circumstances would not be considered a F...,A revision of financial statements due to an e...,"[period.\nFor purposes of this Policy, a Finan...",An out-of-period adjustment that is immaterial...,1.0,0.929908,0.5,0.833333,0.582758
6,From which countries did the majority of reven...,"Western Europe, China, Brazil, Australia, Cana...","[States, with a majority of the revenue outsid...","Western Europe, China, Brazil, Australia, Cana...",0.166667,0.974979,1.0,1.0,0.997174
7,What is the significance of the reporting date...,The significance of the reporting date in fair...,[Table of Contents\n \n \nFair Value Measureme...,The significance of the reporting date in fair...,1.0,0.999999,1.0,0.805556,0.619242
8,What factors could lead to limited availabilit...,Government restrictions on access to Facebook ...,[•\nlimitations on our ability to offer a numb...,,1.0,0.827966,0.8,0.0,0.183517
9,How could Delaware law and provisions in our c...,Delaware law and provisions in the certificate...,[Delaware law and provisions in our certificat...,Delaware law and provisions in the certificate...,1.0,0.895024,1.0,1.0,0.607793


In [28]:
from langchain.retrievers import MultiQueryRetriever

advanced_retriever = MultiQueryRetriever.from_llm(retriever=retriever, llm=primary_qa_llm)

In [31]:
from langchain.chains.combine_documents import create_stuff_documents_chain

document_chain = create_stuff_documents_chain(primary_qa_llm, retrieval_qa_prompt)

In [32]:
from langchain.chains import create_retrieval_chain

retrieval_chain = create_retrieval_chain(advanced_retriever, document_chain)

In [35]:
response = retrieval_chain.invoke({"input": "Who are Meta's 'Directors' (i.e., members of the Board of Directors)?"})

In [36]:
print(response["answer"])

The total value of 'Cash and cash equivalents' as of December 31, 2023, was $42,827 million.


In [37]:
response = retrieval_chain.invoke({"input": "Who are Meta's 'Directors' (i.e., members of the Board of Directors)?"})

In [40]:
print(response["answer"])

The members of the Board of Directors at Meta Platforms, Inc. are not explicitly mentioned in the provided context.
