# LangChain: Evaluation

## Outline:

* Example generation
* Manual evaluation (and debugging)
* LLM-assisted evaluation

In [None]:
import os

from dotenv import load_dotenv, find_dotenv
# Find the local .env file and read it; the result is bound to `_` so it is
# not echoed as cell output.
dotenv_path = find_dotenv()
_ = load_dotenv(dotenv_path)

In [None]:
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import CSVLoader
from langchain.indexes import VectorstoreIndexCreator
from langchain.vectorstores import DocArrayInMemorySearch

In [None]:
# CSV catalog that serves as the document source for this notebook.
file = './data/clothing-catalog.csv'
loader = CSVLoader(file_path=file)
# Load the documents up front; the first five are later fed to example generation.
data = loader.load()

In [None]:
from langchain.embeddings import OpenAIEmbeddings

# Embedding client pointed at an Azure-style endpoint (openai_api_type="azure");
# the base URL is read from the OPENAI_BASE_URL environment variable.
embeddings = OpenAIEmbeddings(
    deployment="text-embedding-ada-002",
    model="text-embedding-ada-002",
    openai_api_base=os.environ["OPENAI_BASE_URL"],
    openai_api_version="2023-06-01-preview",
    openai_api_type="azure",
    # NOTE(review): chunk_size=1 presumably works around a per-request batch
    # limit on the Azure embedding endpoint -- confirm before raising it.
    chunk_size=1,
)

In [None]:
from langchain.indexes import VectorstoreIndexCreator
# Configure the index builder with DocArrayInMemorySearch as the vector store
# and the `embeddings` client, then build the index from the CSV loader.
index_creator = VectorstoreIndexCreator(
    vectorstore_cls=DocArrayInMemorySearch,
    embedding=embeddings,
)
index = index_creator.from_loaders([loader])

In [None]:
from langchain.chat_models import AzureChatOpenAI

# Chat model shared by the QA chain and the grading chain in this notebook.
# Endpoint and key come from the environment; temperature is pinned to 0.0.
llm = AzureChatOpenAI(
    openai_api_base=os.environ["OPENAI_BASE_URL"],
    deployment_name="chat",
    openai_api_version="2023-06-01-preview",
    openai_api_key=os.environ["OPENAI_API_KEY"],
    temperature=0.0,
)

In [None]:
# Retrieval-backed QA chain: documents come from the index's vector store and
# are combined with the "stuff" chain type, joined by a custom separator.
retriever = index.vectorstore.as_retriever()
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    verbose=True,
    chain_type_kwargs={"document_separator": "<<<<>>>>>"},
)

### Coming up with test datapoints

In [None]:
# Hand-written question/answer pairs; each entry has a "query" and the
# expected "answer" it will be graded against.
examples = [
    dict(
        query="Does the Comfort Check Shirt has pockets?",
        answer="Yes",
    ),
    dict(
        query="What logo does the fleece jacket have?",
        answer="classic Mount Katahdin logo",
    ),
]

### LLM-Generated examples

In [None]:
from langchain.evaluation.qa import QAGenerateChain



In [None]:
from langchain.evaluation.qa import QAGenerateChain, QAGenerateResult

In [None]:
# Dedicated chat model instance for generating question/answer examples,
# configured with the same endpoint, deployment and temperature as `llm`.
example_gen_llm = AzureChatOpenAI(
    openai_api_base=os.environ["OPENAI_BASE_URL"],
    deployment_name="chat",
    openai_api_version="2023-06-01-preview",
    openai_api_key=os.environ["OPENAI_API_KEY"],
    temperature=0.0,
)
example_gen_chain = QAGenerateChain.from_llm(example_gen_llm)

In [None]:
# Feed the first five loaded documents to the generation chain, one input
# dict per document under the "doc" key.
doc_inputs = [{"doc": document} for document in data[:5]]
output_text = example_gen_chain.apply(doc_inputs)


In [None]:
# Keep only the "qa_pairs" payload of each generation result.
new_examples = [generated["qa_pairs"] for generated in output_text]

In [None]:
# Append the generated examples to the hand-written ones (in place, so
# re-running this cell appends them again).
examples.extend(new_examples)

In [None]:
# Manual spot check: ask the chain the first example's question.
qa.run(examples[0]["query"])

In [None]:
import langchain
# Enable LangChain's global debug flag before re-running the query.
langchain.debug = True

In [None]:
# Same question as the earlier spot check, now run with the debug flag set.
qa.run(examples[0]["query"])

## LLM-assisted evaluation


In [None]:
# Switch the global debug flag back off for the batch evaluation that follows.
langchain.debug = False

In [None]:
# Run the QA chain over every example; each returned item is read later for
# its 'query', 'answer' and 'result' fields.
predictions = qa.apply(examples)

In [None]:
from langchain.evaluation.qa import QAEvalChain

In [None]:
# Grading chain built on the same `llm` that backs the QA chain.
eval_chain = QAEvalChain.from_llm(llm)

In [None]:
# Grade each prediction against its example; the grade of each item is read
# below through its 'results' key.
graded_outputs = eval_chain.evaluate(examples, predictions)

In [48]:
# Print a per-example report: question, reference answer, the chain's answer
# and the grade assigned by the evaluation chain.
# Predictions and grades are walked in lockstep instead of being indexed via
# an unused loop variable, and f-strings format each field so a non-string
# value is rendered rather than raising TypeError on concatenation.
for i, (prediction, grade) in enumerate(zip(predictions, graded_outputs)):
    print(f"Example {i}:")
    print(f"Question: {prediction['query']}")
    print(f"Real Answer: {prediction['answer']}")
    print(f"Predicted Answer: {prediction['result']}")
    print(f"Predicted Grade: {grade['results']}")
    print()

Example 0:
Question: Does the Comfort Check Shirt has pockets?
Real Answer: Yes
Predicted Answer: Yes, the Comfort Check Shirt has a single patch pocket.
Predicted Grade: CORRECT

Example 1:
Question: What logo does the fleece jacket have?
Real Answer: classic Mount Katahdin logo
Predicted Answer: The Mountain Man Fleece Jacket features our classic Mount Katahdin logo.
Predicted Grade: CORRECT

Example 2:
Question: What is the weight of one pair of Women's Campside Oxfords?
Real Answer: The approximate weight of one pair of Women's Campside Oxfords is 1 lb. 1 oz.
Predicted Answer: The weight of one pair of Women's Campside Oxfords is approximately 1 lb. 1 oz.
Predicted Grade: CORRECT

Example 3:
Question: What are the dimensions of the small and medium Recycled Waterhog dog mats?
Real Answer: The small Recycled Waterhog dog mat has dimensions of 18" x 28" and the medium has dimensions of 22.5" x 34.5".
Predicted Answer: The small Recycled Waterhog dog mat has dimensions of 18" x 28" an