# LangChain: Evaluation

## Outline:

* Example generation
* Manual evaluation (and debuging)
* LLM-assisted evaluation
* LangChain evaluation platform

In [1]:
import os

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv()) # read local .env file

Note: LLM's do not always produce the same results. When executing the code in your notebook, you may get slightly different answers that those in the video.

In [2]:
# account for deprecation of LLM model
import datetime
# Get the current date
current_date = datetime.datetime.now().date()

# Define the date after which the model should be set to "gpt-3.5-turbo"
target_date = datetime.date(2024, 6, 12)

# Set the model variable based on the current date
if current_date > target_date:
    llm_model = "gpt-3.5-turbo"
else:
    llm_model = "gpt-3.5-turbo-0301"

## Create our QandA application

In [3]:
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import CSVLoader
from langchain.indexes import VectorstoreIndexCreator
from langchain.vectorstores import DocArrayInMemorySearch

In [4]:
file = 'My_Movie_Dataset.csv'
loader = CSVLoader(file_path=file)
data = loader.load()

In [5]:
index = VectorstoreIndexCreator(
    vectorstore_cls=DocArrayInMemorySearch
).from_loaders([loader])

In [6]:
llm = ChatOpenAI(temperature = 0.0, model=llm_model)
qa = RetrievalQA.from_chain_type(
    llm=llm, 
    chain_type="stuff", 
    retriever=index.vectorstore.as_retriever(), 
    verbose=True
)

### Coming up with test datapoints

In [7]:
data[10]

Document(page_content='\ufeffmovieId: 11\ntitle: American President, The (1995)\ngenres: Comedy|Drama|Romance', metadata={'source': 'My_Movie_Dataset.csv', 'row': 10})

In [8]:
data[11]

Document(page_content='\ufeffmovieId: 12\ntitle: Dracula: Dead and Loving It (1995)\ngenres: Comedy|Horror', metadata={'source': 'My_Movie_Dataset.csv', 'row': 11})

### Hard-coded examples

In [9]:
examples = [
    {
        "query": "Is Sunset Park a drama movie?",
        "answer": "Yes"
    },
    {
        "query": "Name a movie release in 1998 and is an Adventure film?",
        "answer": "Oliver & Company"
    }
]

### LLM-Generated examples

In [10]:
from langchain.evaluation.qa import QAGenerateChain


In [11]:
example_gen_chain = QAGenerateChain.from_llm(ChatOpenAI(model=llm_model))

In [12]:
# the warning below can be safely ignored

In [13]:
new_examples = example_gen_chain.apply_and_parse(
    [{"doc": t} for t in data[:5]]
)

In [14]:
new_examples[0]

{'query': 'What is the movie ID for Toy Story (1995) according to the given document?',
 'answer': 'The movie ID for Toy Story (1995) is 1.'}

In [15]:
data[0]

Document(page_content='\ufeffmovieId: 1\ntitle: Toy Story (1995)\ngenres: Adventure|Animation|Children|Comedy|Fantasy', metadata={'source': 'My_Movie_Dataset.csv', 'row': 0})

### Combine examples

In [16]:
examples += new_examples

In [17]:
qa.run(examples[0]["query"])



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


'Yes, Sunset Park is a drama movie.'

## Manual Evaluation

In [18]:
import langchain
langchain.debug = True

In [19]:
qa.run(examples[0]["query"])

[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA] Entering Chain run with input:
[0m{
  "query": "Is Sunset Park a drama movie?"
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA > 2:chain:StuffDocumentsChain] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA > 2:chain:StuffDocumentsChain > 3:chain:LLMChain] Entering Chain run with input:
[0m{
  "question": "Is Sunset Park a drama movie?",
  "context": "\ufeffmovieId: 706\ntitle: Sunset Park (1996)\ngenres: Drama\n\n\ufeffmovieId: 352\ntitle: Crooklyn (1994)\ngenres: Comedy|Drama\n\n\ufeffmovieId: 334\ntitle: Vanya on 42nd Street (1994)\ngenres: Drama\n\n\ufeffmovieId: 517\ntitle: Rising Sun (1993)\ngenres: Action|Drama|Mystery"
}
[32;1m[1;3m[llm/start][0m [1m[1:chain:RetrievalQA > 2:chain:StuffDocumentsChain > 3:chain:LLMChain > 4:llm:ChatOpenAI] Entering LLM run with input:
[0m{
  "prompts": [
    "System: Use the following pieces of context to answer the users q

'Yes, Sunset Park is a drama movie.'

In [20]:
# Turn off the debug mode
langchain.debug = False

## LLM assisted evaluation

In [21]:
predictions = qa.apply(examples)



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


In [22]:
from langchain.evaluation.qa import QAEvalChain

In [23]:
llm = ChatOpenAI(temperature=0, model=llm_model)
eval_chain = QAEvalChain.from_llm(llm)

In [24]:
graded_outputs = eval_chain.evaluate(examples, predictions)

In [25]:
for i, eg in enumerate(examples):
    print(f"Example {i}:")
    print("Question: " + predictions[i]['query'])
    print("Real Answer: " + predictions[i]['answer'])
    print("Predicted Answer: " + predictions[i]['result'])
    print("Predicted Grade: " + graded_outputs[i]['text'])
    print()

Example 0:
Question: Is Sunset Park a drama movie?
Real Answer: Yes
Predicted Answer: Yes, Sunset Park is a drama movie.
Predicted Grade: CORRECT

Example 1:
Question: Name a movie release in 1998 and is an Adventure film?
Real Answer: Oliver & Company
Predicted Answer: I'm sorry, I don't have enough information to provide a specific answer. However, here are a few adventure movies released in 1998:
- The Mask of Zorro
- The Parent Trap
- Everest
- The Prince of Egypt
- A Bug's Life
Predicted Grade: INCORRECT

Example 2:
Question: What is the movie ID for Toy Story (1995) according to the given document?
Real Answer: The movie ID for Toy Story (1995) is 1.
Predicted Answer: The movie ID for Toy Story (1995) is 1 according to the given document.
Predicted Grade: CORRECT

Example 3:
Question: What is the movie title and year of release for the movie with ID 2 in the given dataset?
Real Answer: The movie title is Jumanji and it was released in 1995.
Predicted Answer: I'm sorry, but there 

In [26]:
graded_outputs[0]

{'text': 'CORRECT'}

## LangChain evaluation platform

The LangChain evaluation platform, LangChain Plus, can be accessed here https://www.langchain.plus/.  
Use the invite code `lang_learners_2023`

Reminder: Download your notebook to you local computer to save your work.