# LangChain: Evaluation

## Outline:

* Example generation
* Manual evaluation (and debugging)
* LLM-assisted evaluation

In [None]:
import os

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv()) # read local .env file

## Create our QandA application

In [None]:
from langchain.chains import RetrievalQA
from langchain_openai import ChatOpenAI
from langchain.document_loaders import CSVLoader
from langchain.indexes.vectorstore import VectorstoreIndexCreator
from langchain.vectorstores import DocArrayInMemorySearch

In [None]:
useLogging = True # set to True to get logging information (and hopefully track which LLM is called when)
use_Ollama_For_Inference = True # set to True to use Ollama inference models (and pull at least the gemma:2b model)
use_Ollama_For_Embedding = True # set to True to use Ollama embedddings models (and pull at least the nomic-embed-text:latest model)
use_Pinecone = True #Turn on to use a Pinecone database. Sign up at www.pinecone.io for a free plan (including 5 indexes)
use_Chroma = True #Turn on to use a local Chroma database. Supersedes the use_Pinecone flag above (and turns it off)
use_Test_Data = False # set to True to use LimitedCSVLoader class below and only load the 577th item from the CSV file and test that querying with embeddings work well.

import openai
#Defaults to OpenAI if use_Ollama_For_Inference=False and use_Ollama_For_Inference=False
openai.api_base = inferApiBase = embedApiBase =  "https://api.openai.com/v1"
openai.base_url = inferBaseUrl = embedBaseUrl = "https://api.openai.com"
openai.api_key = inferApiKey = embedApiKey = os.environ['OPENAI_API_KEY']
embeddings_model_name = "text-embedding-ada-002"
embeddings_model_name_short = "ada"
embeddings_vector_size = 1536
infer_model_name = "gpt-3.5-turbo"
llm_platform = "openai"
embed_chunk_size = 1000
embed_overlap = 0


if use_Ollama_For_Inference:
    inferApiBase = "http://localhost:11434/v1"
    inferBaseUrl = "http://localhost:1143"
    inferApiKey = "ollama"
    infer_model_name = "gemma:2b" #you can/should customize this to test different Ollama LLMs. Use the NAME field from `ollama list`


if use_Ollama_For_Embedding:
    llm_platform = "ollama"
    embedApiBase = "http://localhost:11434/v1"
    embedBaseUrl = "http://localhost:1143"
    embedApiKey = "ollama"
    #embeddings_model_name = "mxbai-embed-large:latest"
    #embeddings_model_name_short = "mxbai"
    #embeddings_vector_size = 1024
    #embed_chunk_size = 512
    #embed_overlap = 10
    #
    embeddings_model_name = "nomic-embed-text:latest" #you can/should customize this to test different Ollama LLMs. Use the NAME field from `ollama list`
    embeddings_model_name_short = "nomic"
    embeddings_vector_size = 768
    embed_chunk_size = 8192
    embed_overlap = 0

print('Embed API Key:', embedApiKey)
print('Infer API Key:', inferApiKey)
print('Embed API Base:', embedApiBase)
print('Infer API Base:', inferApiBase)
print('Embeddings Model:', embeddings_model_name)
print('Inference Model:', infer_model_name)

index_prefix = "langchain-deeplearningai-" + embeddings_model_name_short + "-"
if use_Test_Data:
    index_prefix+='s-'
rag_index_name = index_prefix + llm_platform
print('RAG Index Name:', rag_index_name)

if use_Chroma:
    print('using Chroma Vector database')
    
    use_Pinecone = False
    storage_path = os.environ.get('CHROMA_STORAGE_PATH')
    if storage_path is None:
        raise ValueError('CHROMA_STORAGE_PATH environment variable is not set')
    

elif use_Pinecone:

    print('using Pinecone Vector database')
    from pinecone import Pinecone
    from langchain_pinecone import PineconeVectorStore
    from tqdm.autonotebook import tqdm

    PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
    PINECONE_ENV = os.environ.get("PINECONE_ENV", "PINECONE_ENV")

    if PINECONE_API_KEY is None:
        raise ValueError("PINECONE_API_KEY environment variable not set.")
        # Name our index on Pineconeopenai.api_key

    # Init pinecone
    pc = Pinecone(
        api_key=PINECONE_API_KEY,
        source_tag="langchain-deeplearningai"
    )
else:
    print('using In Memory Vector database')

from langchain_openai import OpenAIEmbeddings
from langchain_community.embeddings import OllamaEmbeddings


if use_Ollama_For_Embedding:
    embeddings_model = OllamaEmbeddings(model=embeddings_model_name, embed_instruction='', query_instruction='')
    #embeddings_model = OllamaEmbeddings(model=embeddings_model_name)
else:
    embeddings_model = OpenAIEmbeddings(model=embeddings_model_name)

In [None]:
file = 'OutdoorClothingCatalog_1000.csv'
loader = CSVLoader(file_path=file, encoding='utf-8')
data = loader.load()

In [None]:
from langchain_core.documents.base import Document
class LimitedCSVLoader(CSVLoader):
    def load(self):
        # Call the original load method to get all rows
        all_rows = super().load()

        # Restrict to the first 5 rows
        #limited_rows = all_rows[:5]
        # Restrict to line #577
        limited_rows = all_rows[577]
        if isinstance(limited_rows, Document):
            limited_rows = [limited_rows]
        return limited_rows
    
if use_Test_Data:
    loader = LimitedCSVLoader(file_path=file, encoding='utf-8')

In [None]:
from langchain_community.embeddings import OllamaEmbeddings

index = VectorstoreIndexCreator(
    vectorstore_cls=DocArrayInMemorySearch,
    embedding=OllamaEmbeddings(model=embeddings_model_name, embed_instruction='', query_instruction='')
).from_loaders([loader])

db = index.vectorstore

In [None]:
from langchain_chroma import Chroma
db = Chroma(collection_name=rag_index_name, embedding_function=embeddings_model, persist_directory=storage_path)

In [261]:
infer_model_name = 'llama3:8b'
llm = ChatOpenAI(temperature = 0.0, base_url=inferApiBase, model=infer_model_name)
#llm = ChatOpenAI(temperature = 0.0)
qa = RetrievalQA.from_chain_type(
    llm=llm, 
    chain_type="stuff", 
    retriever= db.as_retriever(), 
    verbose=True,
    chain_type_kwargs = {
        "document_separator": "<<<<>>>>>"
    }
)

2024-06-10 16:07:31 - DEBUG - _config.py:load_ssl_context:80 - load_ssl_context verify=True cert=None trust_env=True http2=False
2024-06-10 16:07:31 - DEBUG - _config.py:load_ssl_context_verify:146 - load_verify_locations cafile='h:\\Users\\Raphael\\OneDrive\\Perso\\Technical\\AI\\Training\\GenAI\\LangChain-for-LLM-Application-Development\\venv\\lib\\site-packages\\certifi\\cacert.pem'
2024-06-10 16:07:31 - DEBUG - _config.py:load_ssl_context:80 - load_ssl_context verify=True cert=None trust_env=True http2=False
2024-06-10 16:07:31 - DEBUG - _config.py:load_ssl_context_verify:146 - load_verify_locations cafile='h:\\Users\\Raphael\\OneDrive\\Perso\\Technical\\AI\\Training\\GenAI\\LangChain-for-LLM-Application-Development\\venv\\lib\\site-packages\\certifi\\cacert.pem'


### Coming up with test datapoints

In [None]:
data[10]

In [None]:
data[11]

### Hard-coded examples

In [231]:
examples = [
    {
        "query": "Do the Cozy Comfort Pullover Set have side pockets?",
        "answer": "Yes"
    },
    {
        "query": "What collection is the Ultra-Lofty 850 Stretch Down Hooded Jacket from?",
        "answer": "The DownTek collection"
    }
]

### LLM-Generated examples

In [None]:
from langchain.evaluation.qa import QAGenerateChain


In [None]:
example_gen_chain = QAGenerateChain.from_llm(ChatOpenAI())

In [None]:
if useLogging:
    import logging
    import requests 


    logging.basicConfig(level=logging.DEBUG,
                        format='%(asctime)s - %(levelname)s - %(filename)s:%(funcName)s:%(lineno)d - %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S')
    requests.packages.urllib3.add_stderr_logger()
    OLLAMA_DEBUG=1

In [None]:
infer_model_name = "gemma:2b"
llm = ChatOpenAI(temperature = 0.0, base_url=inferApiBase, model=infer_model_name)

In [None]:
import json
import re
langchain.debug = False

#llm = ChatOpenAI(temperature = 0.0) #use OpenAI

example_gen_chain = QAGenerateChain.from_llm(llm)
new_examples = example_gen_chain.apply(
    ## t.page_content is required here for non-OpenAI LLMs. OpenAI is fine with both 't' and 't.page_content' but non-OpenAI LLMs generate a ValueError (see https://github.com/langchain-ai/langchain/issues/7559 for details)
    [{"doc": t.page_content
      .replace("'", r"\'")
      .replace('"',  '\"')
      #.replace("®", "")
      #.replace('&', 'and')
      #.replace('%', ' percent')
      #.replace('–', '')
      #.replace('!', '')
      #.replace('+', 'plus')
      #.replace(" .", ".")
      #.replace(". ", ".")
      #.replace("°", " degrees")
      } 
      for t in data[:3]]
    #[{"doc": re.sub('[^A-Za-z0-9 -]+', '', t.page_content)} for t in data[:4]]
)

In [None]:
new_examples

In [None]:
transformed_examples = [item['qa_pairs'] for item in new_examples]
transformed_examples

In [None]:
data[0]

### Combine examples

In [232]:
examples += transformed_examples

In [233]:
examples

[{'query': 'Do the Cozy Comfort Pullover Set have side pockets?',
  'answer': 'Yes'},
 {'query': 'What collection is the Ultra-Lofty 850 Stretch Down Hooded Jacket from?',
  'answer': 'The DownTek collection'},
 {'query': "What is the weight of a pair of Women's Campside Oxfords?",
  'answer': '1 lb.1 oz. per pair.'},
 {'query': 'What is the purpose of the recycled waterhog dog mat?',
  'answer': 'The purpose of the recycled waterhog dog mat is to protect floors from spills and splashing with its ultradurable construction made from recycled plastic materials.'},
 {'query': "What is the main feature of the Infant and Toddler Girls' Coastal Chill Swimsuit?",
  'answer': "The main feature of the Infant and Toddler Girls' Coastal Chill Swimsuit is its bright colors, ruffles, and exclusive whimsical prints."}]

In [266]:
qa.invoke(examples[4]["query"])

2024-06-10 16:13:09,257 DEBUG Starting new HTTP connection (1): localhost:11434
2024-06-10 16:13:09 - DEBUG - connectionpool.py:_new_conn:244 - Starting new HTTP connection (1): localhost:11434




[1m> Entering new RetrievalQA chain...[0m


2024-06-10 16:13:13,558 DEBUG http://localhost:11434 "POST /api/embeddings HTTP/1.1" 200 None
2024-06-10 16:13:13 - DEBUG - connectionpool.py:_make_request:549 - http://localhost:11434 "POST /api/embeddings HTTP/1.1" 200 None
2024-06-10 16:13:13 - DEBUG - _base_client.py:_build_request:446 - Request options: {'method': 'post', 'url': '/chat/completions', 'files': None, 'json_data': {'messages': [{'content': "Use the following pieces of context to answer the user's question. \nIf you don't know the answer, just say that you don't know, don't try to make up an answer.\n----------------\n: 2\nname: Infant and Toddler Girls' Coastal Chill Swimsuit, Two-Piece\ndescription: She'll love the bright colors, ruffles and exclusive whimsical prints of this toddler's two-piece swimsuit! Our four-way-stretch and chlorine-resistant fabric keeps its shape and resists snags. The UPF 50+ rated fabric provides the highest rated sun protection possible, blocking 98% of the sun's harmful rays. The crossove


[1m> Finished chain.[0m


"According to the description, one of the main features of the Infant and Toddler Girls' Coastal Chill Swimsuit is its UPF 50+ rated fabric, which provides the highest rated sun protection possible, blocking 98% of the sun's harmful rays."

## Manual Evaluation

In [242]:
import langchain
langchain.debug = True

In [267]:
qa.invoke(examples[4]["query"])

2024-06-10 16:14:28,740 DEBUG Starting new HTTP connection (1): localhost:11434
2024-06-10 16:14:28 - DEBUG - connectionpool.py:_new_conn:244 - Starting new HTTP connection (1): localhost:11434




[1m> Entering new RetrievalQA chain...[0m


2024-06-10 16:14:33,082 DEBUG http://localhost:11434 "POST /api/embeddings HTTP/1.1" 200 None
2024-06-10 16:14:33 - DEBUG - connectionpool.py:_make_request:549 - http://localhost:11434 "POST /api/embeddings HTTP/1.1" 200 None
2024-06-10 16:14:33 - DEBUG - _base_client.py:_build_request:446 - Request options: {'method': 'post', 'url': '/chat/completions', 'files': None, 'json_data': {'messages': [{'content': "Use the following pieces of context to answer the user's question. \nIf you don't know the answer, just say that you don't know, don't try to make up an answer.\n----------------\n: 2\nname: Infant and Toddler Girls' Coastal Chill Swimsuit, Two-Piece\ndescription: She'll love the bright colors, ruffles and exclusive whimsical prints of this toddler's two-piece swimsuit! Our four-way-stretch and chlorine-resistant fabric keeps its shape and resists snags. The UPF 50+ rated fabric provides the highest rated sun protection possible, blocking 98% of the sun's harmful rays. The crossove


[1m> Finished chain.[0m


{'query': "What is the main feature of the Infant and Toddler Girls' Coastal Chill Swimsuit?",
 'result': "According to the description, one of the main features of the Infant and Toddler Girls' Coastal Chill Swimsuit is its UPF 50+ rated fabric, which provides the highest rated sun protection possible, blocking 98% of the sun's harmful rays."}

In [245]:
# Turn off the debug mode
langchain.debug = False

## LLM assisted evaluation

In [None]:
predictions = qa.apply(examples)

In [247]:
from langchain.evaluation.qa import QAEvalChain

In [262]:
#llm = ChatOpenAI(temperature=0)
eval_chain = QAEvalChain.from_llm(llm)

In [263]:
graded_outputs = eval_chain.evaluate(examples, predictions)

2024-06-10 16:08:10 - DEBUG - _base_client.py:_build_request:446 - Request options: {'method': 'post', 'url': '/chat/completions', 'files': None, 'json_data': {'messages': [{'content': "You are a teacher grading a quiz.\nYou are given a question, the student's answer, and the true answer, and are asked to score the student answer as either CORRECT or INCORRECT.\n\nExample Format:\nQUESTION: question here\nSTUDENT ANSWER: student's answer here\nTRUE ANSWER: true answer here\nGRADE: CORRECT or INCORRECT here\n\nGrade the student answers based ONLY on their factual accuracy. Ignore differences in punctuation and phrasing between the student answer and true answer. It is OK if the student answer contains more information than the true answer, as long as it does not contain any conflicting statements. Begin! \n\nQUESTION: Do the Cozy Comfort Pullover Set have side pockets?\nSTUDENT ANSWER: According to the product description, yes, the Cozy Comfort Pullover Set has side pockets in the pull-

In [265]:
predictions[4]

{'query': "What is the main feature of the Infant and Toddler Girls' Coastal Chill Swimsuit?",
 'answer': "The main feature of the Infant and Toddler Girls' Coastal Chill Swimsuit is its bright colors, ruffles, and exclusive whimsical prints.",
 'result': "According to the description, one of the main features of the Infant and Toddler Girls' Coastal Chill Swimsuit is its UPF 50+ rated fabric, which provides the highest rated sun protection possible, blocking 98% of the sun's harmful rays."}

In [264]:
#Graded outputs from Ollama
graded_outputs

[{'results': 'GRADE: CORRECT'},
 {'results': 'QUESTION: What collection is the Ultra-Lofty 850 Stretch Down Hooded Jacket from?\nSTUDENT ANSWER: The Ultra-Lofty 850 Stretch Down Hooded Jacket is from the DownTek collection.\nTRUE ANSWER: The DownTek collection\nGRADE: CORRECT'},
 {'results': "QUESTION: What is the weight of a pair of Women's Campside Oxfords?\nSTUDENT ANSWER: According to the description, the approximate weight of a pair of Women's Campside Oxfords is 1 lb. 1 oz. per pair.\nTRUE ANSWER: 1 lb.1 oz. per pair.\nGRADE: CORRECT"},
 {'results': 'QUESTION: What is the purpose of the recycled waterhog dog mat?\nSTUDENT ANSWER: According to the description, the purpose of the Recycled Waterhog Dog Mat is to "Protect your floors from spills and splashing" by keeping dirt and water off your floors and plastic out of landfills, trails, and oceans. It\'s designed to be an ultradurable mat made from recycled plastic materials that can withstand the messes and mishaps of dogs.\nTRUE 

In [260]:
#Graded outputs from OpenAI
graded_outputs

[{'results': 'CORRECT'},
 {'results': 'CORRECT'},
 {'results': 'CORRECT'},
 {'results': 'CORRECT'},
 {'results': 'INCORRECT'}]

In [255]:
for i, eg in enumerate(examples):
    print(f"Example {i}:")
    print("Question: " + predictions[i]['query'])
    print("Real Answer: " + predictions[i]['answer'])
    print("Predicted Answer: " + predictions[i]['result'])
    print("Predicted Grade: " + graded_outputs[i]['results'])
    print()

Example 0:
Question: Do the Cozy Comfort Pullover Set have side pockets?
Real Answer: Yes
Predicted Answer: According to the product description, yes, the Cozy Comfort Pullover Set has side pockets in the pull-on pants.
Predicted Grade: GRADE: CORRECT

Example 1:
Question: What collection is the Ultra-Lofty 850 Stretch Down Hooded Jacket from?
Real Answer: The DownTek collection
Predicted Answer: The Ultra-Lofty 850 Stretch Down Hooded Jacket is from the DownTek collection.
Predicted Grade: QUESTION: What collection is the Ultra-Lofty 850 Stretch Down Hooded Jacket from?
STUDENT ANSWER: The Ultra-Lofty 850 Stretch Down Hooded Jacket is from the DownTek collection.
TRUE ANSWER: The DownTek collection
GRADE: CORRECT

Example 2:
Question: What is the weight of a pair of Women's Campside Oxfords?
Real Answer: 1 lb.1 oz. per pair.
Predicted Answer: According to the description, the approximate weight of a pair of Women's Campside Oxfords is 1 lb. 1 oz. per pair.
Predicted Grade: QUESTION: W