This notebook follows the evaluation lesson of the DeepLearning.AI LangChain course: https://learn.deeplearning.ai/langchain/lesson/6/evaluation

In [None]:
import os
from dotenv import load_dotenv

# Load settings from a local .env file before anything reads the environment.
load_dotenv()

# Value of the 'GHA' environment variable, or None when it is not set.
# NOTE(review): no cell visible in this notebook reads GHA_TOKEN -- confirm it is still needed.
GHA_TOKEN = os.environ.get('GHA')

In [55]:
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import CSVLoader
from langchain.chains import RetrievalQA
from langchain.vectorstores import DocArrayInMemorySearch
from langchain.indexes import VectorstoreIndexCreator

In [56]:
# Smoke test: create a chat model with default settings and send it one prompt;
# the reply is shown as the cell output.
# NOTE(review): the prompt contains a typo ("yout"); it is left untouched because
# the recorded output was produced from this exact string.
llm=ChatOpenAI()
llm.predict("Hi, please summarise yout capabilities in 2 sentences")

'I am an AI language model designed to assist with a wide range of tasks, including generating text, answering questions, providing information, and engaging in conversation. I can help with writing, research, general knowledge, and more.'

In [None]:
# Load the clothes CSV into LangChain documents.
# `csv_loader` is reused to build the vector index and `docs` is the source
# for question/answer generation in later cells.
file = "data/clothes.csv"
csv_loader = CSVLoader(file_path=file)
docs = csv_loader.load()

In [None]:
# Build an in-memory vector index over everything the CSV loader yields.
index = VectorstoreIndexCreator(
    vectorstore_cls=DocArrayInMemorySearch,
).from_loaders([csv_loader])

In [None]:
# Temperature 0.0 is used for the chat model shared by the QA, generation and
# grading chains below.
chat_llm = ChatOpenAI(temperature=0.0)

# Retrieval QA chain over the vector index; retrieved documents are joined
# with the '<<<<>>>>' separator before being passed to the 'stuff' chain.
qa_search = RetrievalQA.from_chain_type(
    llm=chat_llm,
    chain_type="stuff",
    retriever=index.vectorstore.as_retriever(),
    verbose=True,
    chain_type_kwargs={"document_separator": "<<<<>>>>"},
)

In [None]:
from langchain.evaluation.qa import QAGenerateChain

# Generate a question and answer from each of the first five documents,
# taken one document at a time. These pairs act as the 'gold standard'
# reference answers for the evaluation below.
chain = QAGenerateChain.from_llm(chat_llm)
examples = chain.apply_and_parse(
    [{'doc': doc} for doc in docs[:5]]
)

In [None]:
# Display the generated question/answer examples.
examples

In [None]:
import langchain

# Set this flag to True to see what happens at each stage of the chain;
# it is left at False here, so the run below is not traced.
langchain.debug = False

# Ask the retrieval chain the second generated question.
qa_search.run(examples[1]['qa_pairs']['query'])

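Now we ask the same questions, which were generated from individual documents, against the retrieval chain built over the whole dataset, so that its answers can be compared with the generated ones.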

In [None]:
# Answer every generated question with the retrieval chain.
# Chain.run() takes a single input and returns only the output string, so it
# cannot process a list of examples. Chain.apply() calls the chain once per
# input dict and returns one dict per example containing the inputs plus the
# chain's output under 'result', which is the shape the grading and printing
# cells rely on.
predictions = qa_search.apply([example['qa_pairs'] for example in examples])

In [None]:
# Display what the retrieval chain returned for the generated questions.
predictions

Now we grade the chain's answers against the generated reference answers to see how closely they agree.

In [None]:
from langchain.evaluation.qa import QAEvalChain

# Use the LLM as a grader: each prediction is evaluated against the
# question/answer pair it was generated for.
eval_chain = QAEvalChain.from_llm(chat_llm)
result = eval_chain.evaluate(
    [example['qa_pairs'] for example in examples],
    predictions,
)

Finally, we print each test question and its reference answer alongside the chain's actual answer and the grade it received.

In [None]:
# Report, example by example, the generated question and reference answer,
# the retrieval chain's answer, and the grade from the evaluation chain.
# `predictions` and `result` are indexed in step with `examples`.
for idx, example in enumerate(examples):
    qa_pair = example['qa_pairs']
    print(f"Test question: {qa_pair['query']}")
    print(f"Test answer: {qa_pair['answer']}")
    print(f"Real answer: {predictions[idx]['result']}")
    print(f"Grade: {result[idx]['results']}")
    print("-----------")