In [4]:
from langchain_community.document_loaders import PyPDFLoader

# Read the PDF into a list of documents and report how many were produced.
file_path = "book.pdf"
loader = PyPDFLoader(file_path=file_path)
docs = loader.load()

num_docs = len(docs)
print(num_docs)

13


In [5]:
# Sanity-check the load: show the first 100 characters of the first document
# and the metadata attached to it.
first_doc = docs[0]
print(first_doc.page_content[:100])
print(first_doc.metadata)

ClashEval : Quantifying the tug-of-war between an
LLM’s internal prior and external evidence
Kevin W
{'source': 'book.pdf', 'page': 0}


In [9]:
from langchain_openai import ChatOpenAI
# Chat model that the question-answering chain below uses to generate answers.
# NOTE(review): presumably authenticates with an OpenAI API key taken from the
# environment — confirm it is set before running this cell.
llm = ChatOpenAI(model="gpt-4o")

In [10]:
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Chunking parameters for the splitter (measured with the splitter's default
# length function, i.e. characters).
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP
)
splits = text_splitter.split_documents(docs)

# Give every chunk a stable id. Without explicit ids the store assigns random
# ones, so re-running this cell in the same kernel would add a second copy of
# every chunk to the default in-memory collection and the retriever would then
# return duplicates. With deterministic ids a re-run overwrites the existing
# entries instead. The running index keeps ids unique even when two chunks
# come from the same page.
# NOTE: if the PDF changes and yields fewer chunks, restart the kernel so stale
# entries from the previous run are dropped.
split_ids = [
    f"{split.metadata.get('source')}:{split.metadata.get('page')}:{index}"
    for index, split in enumerate(splits)
]
vectorstore = Chroma.from_documents(
    documents=splits,
    embedding=OpenAIEmbeddings(),
    ids=split_ids,
)

retriever = vectorstore.as_retriever()

In [12]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

# System message for the question-answering chain. The instruction text is
# followed by a blank line and the "{context}" placeholder, which stays a
# literal template variable here.
system_prompt = "".join(
    [
        "You are an assistant for question-answering tasks. ",
        "Use the following pieces of retrieved context to answer ",
        "the question. If you don't know the answer, say that you ",
        "don't know. Use three sentences maximum and keep the ",
        "answer concise.",
        "\n\n",
        "{context}",
    ]
)

# Two-message chat template: the system instructions (which carry the
# "{context}" placeholder) followed by the user's question as "{input}".
prompt_messages = [("system", system_prompt), ("human", "{input}")]
prompt = ChatPromptTemplate.from_messages(prompt_messages)


# Build the chain in two stages: a document-stuffing QA chain over the LLM and
# prompt, wrapped by a retrieval chain that feeds it from the retriever.
question_answer_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)
rag_chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=question_answer_chain,
)

# Ask a single question; the result is kept in `results` for later cells.
question = "Does GPT-4o have any bias?"
results = rag_chain.invoke({"input": question})

# Bare last expression so the notebook displays the full result.
results

{'input': 'Does GPT-4o have any bias?',
 'context': [Document(page_content='5 Discussion\nTheClashEval benchmark dataset and evaluations provide novel insights into how LLMs arbitrate\nbetween their own internal knowledge and contextual information when the two are in conflict.\nA key finding is that even the most advanced LLMs like GPT-4o exhibit a strong context bias,\noverriding their own correct prior knowledge over 60% of the time when presented with incorrect\ninformation in the retrieved documents. However, this bias is not absolute - the degree to which\nthe retrieved content deviates from truth negatively correlates with the context preference rate.\nInterestingly, each LLM exhibits a different prior distribution over truthfulness across domains, such\nthat the same perturbation level affects each model differently. For instance, for a given magnitude\nof deviation, Claude Opus adheres to incorrect contextual information 30% less often than GPT-4o.\nWhile GPT-4o achieves state

In [14]:
# Show the full text of the first retrieved context document.
top_context_doc = results["context"][0]
print(top_context_doc.page_content)

5 Discussion
TheClashEval benchmark dataset and evaluations provide novel insights into how LLMs arbitrate
between their own internal knowledge and contextual information when the two are in conflict.
A key finding is that even the most advanced LLMs like GPT-4o exhibit a strong context bias,
overriding their own correct prior knowledge over 60% of the time when presented with incorrect
information in the retrieved documents. However, this bias is not absolute - the degree to which
the retrieved content deviates from truth negatively correlates with the context preference rate.
Interestingly, each LLM exhibits a different prior distribution over truthfulness across domains, such
that the same perturbation level affects each model differently. For instance, for a given magnitude
of deviation, Claude Opus adheres to incorrect contextual information 30% less often than GPT-4o.
While GPT-4o achieves state-of-the-art results on general-purpose tasks, it exhibits higher context
