# LangChain: Evaluation

## Outline:

* Example generation
* Manual evaluation (and debugging)
* LLM-assisted evaluation
* LangChain evaluation platform

In [1]:
import os

from dotenv import load_dotenv, find_dotenv
# Load variables from the local .env file; binding the return value to `_`
# keeps the cell from displaying it.
_ = load_dotenv(find_dotenv()) # read local .env file

Note: LLMs do not always produce the same results. When executing the code in your notebook, you may get slightly different answers than those in the video.

In [2]:
# Chat model name passed to every ChatOpenAI instance created in this notebook.
llm_model = "gpt-3.5-turbo"

## Create our QandA application

In [4]:
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import CSVLoader
from langchain.indexes import VectorstoreIndexCreator
from langchain.vectorstores import DocArrayInMemorySearch

In [5]:
# Load the CSV into LangChain Document objects. Each document's metadata
# records its source file and row index (see the `data[10]` output below).
file = 'netflix_small.csv'
loader = CSVLoader(file_path=file)
data = loader.load()

In [7]:
from langchain_openai import OpenAIEmbeddings

In [8]:
# Embedding model handed to the vector-store index creator in the next cell.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

In [10]:
# Build an in-memory DocArray vector store over the loader's documents,
# using the embedding model configured above.
index_creator = VectorstoreIndexCreator(
    vectorstore_cls=DocArrayInMemorySearch,
    embedding=embeddings,
)
index = index_creator.from_loaders([loader])



In [12]:
# Chat model with temperature 0 to minimise run-to-run variation in answers.
llm = ChatOpenAI(model=llm_model, temperature=0.0)

# "stuff" chain: the retrieved documents are joined into a single prompt
# context, separated by the marker below (visible in the debug trace later on).
stuff_chain_kwargs = {"document_separator": "<<<<>>>>>"}

qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=index.vectorstore.as_retriever(),
    chain_type_kwargs=stuff_chain_kwargs,
    verbose=True,
)

  warn_deprecated(


### Coming up with test datapoints

In [13]:
# Inspect one document in order to hand-write a question/answer pair from it.
data[10]

Document(page_content='title: Vendetta: Truth, Lies and The Mafia\ndescription: Sicily boasts a bold "Anti-Mafia" coalition. But what happens when those trying to bring down organized crime are accused of being criminals themselves?', metadata={'source': 'netflix_small.csv', 'row': 10})

In [27]:
# Print just the text of another document, the basis for the second hand-written example.
print(data[8].page_content)

title: The Great British Baking Show
description: A talented batch of amateur bakers face off in a 10-week competition, whipping up their best dishes in the hopes of being named the U.K.'s best.


### Hard-coded examples

In [28]:
# Hand-written evaluation set: (question, reference answer) pairs derived from
# the two documents inspected above, stored as {"query", "answer"} dicts.
_hand_written_pairs = (
    ("What does Sicily boasts of?", "Anti-Mafia coalition"),
    ("How long is the bakers face off?", "10 week"),
)
examples = [
    {"query": question, "answer": reference}
    for question, reference in _hand_written_pairs
]

### LLM-Generated examples

In [29]:
from langchain.evaluation.qa import QAGenerateChain

In [30]:
# LLM-backed chain that writes a question/answer pair for a given document.
generator_llm = ChatOpenAI(model=llm_model)
example_gen_chain = QAGenerateChain.from_llm(generator_llm)

In [32]:
# IPython help query (interactive use only): shows the docstring of QAGenerateChain.
QAGenerateChain?

In [36]:
# IPython help query (interactive use only): shows the docstring of apply_and_parse.
example_gen_chain.apply_and_parse?

In [33]:
# Generate one question/answer pair per document for the first five documents.
generation_inputs = [{"doc": document} for document in data[:5]]
new_examples = example_gen_chain.apply_and_parse(generation_inputs)



In [42]:
# Each generated item nests its question and answer under a 'qa_pairs' key.
new_examples

[{'qa_pairs': {'query': 'According to the document, what is the title of the film being described?',
   'answer': 'The title of the film being described is "Dick Johnson Is Dead."'}},
 {'qa_pairs': {'query': 'What is the title and description of the show "Blood & Water"?',
   'answer': 'The title of the show is "Blood & Water" and the description is: "After crossing paths at a party, a Cape Town teen sets out to prove whether a private-school swimming star is her sister who was abducted at birth."'}},
 {'qa_pairs': {'query': 'According to the document, why is skilled thief Mehdi and his team of robbers pulled into a violent and deadly turf war?',
   'answer': 'Skilled thief Mehdi and his team of robbers are pulled into a violent and deadly turf war in order to protect his family from a powerful drug lord.'}},
 {'qa_pairs': {'query': 'What is the title of the reality series that takes place at the Orleans Justice Center in New Orleans?',
   'answer': 'Jailbirds New Orleans.'}},
 {'qa_pa

In [35]:
# Source document behind the first generated example, for comparison with it.
data[0]

Document(page_content='title: Dick Johnson Is Dead\ndescription: As her father nears the end of his life, filmmaker Kirsten Johnson stages his death in inventive and comical ways to help them both face the inevitable.', metadata={'source': 'netflix_small.csv', 'row': 0})

### Combine examples

In [43]:
# Append the generated examples to the hand-written ones.
# NOTE: this mutates `examples` in place, so re-running the cell appends them again.
examples.extend(new_examples)

In [46]:
# Smoke-test the QA chain on the first hand-written question.
qa.run(examples[0]["query"])

  warn_deprecated(




[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


'Sicily boasts a bold "Anti-Mafia" coalition.'

## Manual Evaluation

In [47]:
import langchain
# Switch on LangChain's debug flag so the next run prints each chain step and its inputs.
langchain.debug = True

In [48]:
# Run the first question with debug on: the trace shows the question and the
# retrieved context passed to the LLM.
qa.run(examples[0]["query"])

[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQA] Entering Chain run with input:
[0m{
  "query": "What does Sicily boasts of?"
}
[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQA > chain:StuffDocumentsChain] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQA > chain:StuffDocumentsChain > chain:LLMChain] Entering Chain run with input:
[0m{
  "question": "What does Sicily boasts of?",
  "context": "title: Vendetta: Truth, Lies and The Mafia\ndescription: Sicily boasts a bold \"Anti-Mafia\" coalition. But what happens when those trying to bring down organized crime are accused of being criminals themselves?<<<<>>>>>title: Europe's Most Dangerous Man: Otto Skorzeny in Spain\ndescription: Declassified documents reveal the post-WWII life of Otto Skorzeny, a close Hitler ally who escaped to Spain and became an adviser to world presidents.<<<<>>>>>title: Jailbirds New Orleans\ndescription: Feuds, flirtations and toilet talk go down amon

'Sicily boasts a bold "Anti-Mafia" coalition.'

In [50]:
# Turn off debug mode so later chain runs stop printing step-by-step traces.
langchain.debug = False

## LLM assisted evaluation

In [58]:
# At this point the generated examples are still wrapped in a 'qa_pairs' dict.
examples[2]['qa_pairs']

{'query': 'According to the document, what is the title of the film being described?',
 'answer': 'The title of the film being described is "Dick Johnson Is Dead."'}

In [60]:
# Give every example the same flat {"query", "answer"} shape: unwrap the nested
# 'qa_pairs' dict where present and keep hand-written examples as they are.
examples = [example.get('qa_pairs', example) for example in examples]

In [61]:
# Run the QA chain over every example. Each prediction is later read for the
# example's 'query' and 'answer' plus the chain's 'result'.
predictions = qa.apply(examples)



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


In [62]:
from langchain.evaluation.qa import QAEvalChain

In [63]:
# Temperature-0 chat model that acts as the grader inside the evaluation chain.
llm = ChatOpenAI(model=llm_model, temperature=0)
eval_chain = QAEvalChain.from_llm(llm=llm)

In [65]:
# Grade each prediction against its reference answer; each item in the result
# is a dict with a 'results' key (e.g. 'CORRECT').
graded_outputs = eval_chain.evaluate(examples, predictions)

In [72]:
# Peek at the grade assigned to the first example.
graded_outputs[0]

{'results': 'CORRECT'}

In [73]:
# Report, for every example, the question, the reference answer, the chain's
# prediction and the grade assigned by the evaluation chain.
# The lists are parallel, so fail loudly if they ever get out of step instead
# of pairing a prediction with the wrong grade.
assert len(examples) == len(predictions) == len(graded_outputs), \
    "examples, predictions and graded_outputs must have the same length"

# Each prediction carries the input 'query' and 'answer' next to the chain's
# 'result', so the example itself does not need to be read again here.
for i, (prediction, graded) in enumerate(zip(predictions, graded_outputs)):
    print(f"Example {i}:")
    print("Question: " + prediction['query'])
    print("Real Answer: " + prediction['answer'])
    print("Predicted Answer: " + prediction['result'])
    print("Predicted Grade: " + graded['results'])
    print()

Example 0:
Question: What does Sicily boasts of?
Real Answer: Anti-Mafia coalition
Predicted Answer: Sicily boasts a bold "Anti-Mafia" coalition.
Predicted Grade: CORRECT

Example 1:
Question: How long is the bakers face off?
Real Answer: 10 week
Predicted Answer: The bakers face off in a 10-week competition on "The Great British Baking Show."
Predicted Grade: CORRECT

Example 2:
Question: According to the document, what is the title of the film being described?
Real Answer: The title of the film being described is "Dick Johnson Is Dead."
Predicted Answer: The title of the film being described is "Intrusion."
Predicted Grade: INCORRECT

Example 3:
Question: What is the title and description of the show "Blood & Water"?
Real Answer: The title of the show is "Blood & Water" and the description is: "After crossing paths at a party, a Cape Town teen sets out to prove whether a private-school swimming star is her sister who was abducted at birth."
Predicted Answer: The title is "Blood & Wat

In [None]:
# Unexecuted cell: repeats the earlier look at the first example's grade.
graded_outputs[0]

## LangChain evaluation platform

The LangChain evaluation platform, LangChain Plus, can be accessed here https://www.langchain.plus/.  
Use the invite code `lang_learners_2023`

Reminder: Download your notebook to your local computer to save your work.