In [17]:
import nest_asyncio

nest_asyncio.apply()

# Response Evaluator

In [1]:
import logging
import sys

logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

In [2]:
from llama_index import (
    TreeIndex,
    VectorStoreIndex,
    SimpleDirectoryReader,
    LLMPredictor,
    ServiceContext,
    Response,
)
from llama_index.llms import OpenAI
from llama_index.evaluation import ResponseEvaluator
import pandas as pd

pd.set_option("display.max_colwidth", 0)

INFO:numexpr.utils:Note: NumExpr detected 16 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
Note: NumExpr detected 16 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
INFO:numexpr.utils:NumExpr defaulting to 8 threads.
NumExpr defaulting to 8 threads.


In [3]:
# gpt-3 (davinci)
gpt3 = OpenAI(temperature=0, model="text-davinci-003")
service_context_gpt3 = ServiceContext.from_defaults(llm=gpt3)

# gpt-4
gpt4 = OpenAI(temperature=0, model="gpt-4")
service_context_gpt4 = ServiceContext.from_defaults(llm=gpt4)

In [4]:
evaluator = ResponseEvaluator(service_context=service_context_gpt3)
evaluator_gpt4 = ResponseEvaluator(service_context=service_context_gpt4)

In [5]:
documents = SimpleDirectoryReader("./test_wiki_data/").load_data()

In [6]:
# create tree index
tree_index = TreeIndex.from_documents(documents=documents)

INFO:llama_index.indices.common_tree.base:> Building index from nodes: 3 chunks
> Building index from nodes: 3 chunks


In [7]:
# create vector index
vector_index = VectorStoreIndex.from_documents(
    documents, service_context=ServiceContext.from_defaults(chunk_size=512)
)

In [8]:
# define jupyter display function
def display_eval_df(response: Response, eval_result: str) -> None:
    if response.source_nodes == []:
        print("no response!")
        return
    eval_df = pd.DataFrame(
        {
            "Response": str(response),
            "Source": response.source_nodes[0].node.text[:1000] + "...",
            "Evaluation Result": eval_result,
        },
        index=[0],
    )
    eval_df = eval_df.style.set_properties(
        **{
            "inline-size": "600px",
            "overflow-wrap": "break-word",
        },
        subset=["Response", "Source"]
    )
    display(eval_df)

In [9]:
query_engine = tree_index.as_query_engine()
response_tree = query_engine.query(
    "What battles took place in New York City in the American Revolution?"
)
eval_result = evaluator_gpt4.evaluate(response_tree)

INFO:llama_index.indices.tree.select_leaf_retriever:>[Level 0] Selected node: [1]/[1]
>[Level 0] Selected node: [1]/[1]
INFO:llama_index.indices.tree.select_leaf_retriever:>[Level 1] Selected node: [4]/[4]
>[Level 1] Selected node: [4]/[4]


In [10]:
display_eval_df(response_tree, eval_result)

Unnamed: 0,Response,Source,Evaluation Result
0,The Battle of Long Island and the Great Fire of New York.,"settlement was promptly renamed ""New York"" after the Duke of York (the future King James II and VII), who would eventually be deposed in the Glorious Revolution. After the founding, the duke gave part of the colony to proprietors George Carteret and John Berkeley. Fort Orange, 150 miles (240 km) north on the Hudson River, was renamed Albany after James's Scottish title. The transfer was confirmed in 1667 by the Treaty of Breda, which concluded the Second Anglo-Dutch War.On August 24, 1673, during the Third Anglo-Dutch War, Dutch captain Anthony Colve seized the colony of New York from the English at the behest of Cornelis Evertsen the Youngest and rechristened it ""New Orange"" after William III, the Prince of Orange. The Dutch would soon return the island to England under the Treaty of Westminster of November 1674.Several intertribal wars among the Native Americans and some epidemics brought on by contact with the Europeans caused sizeable population losses for the Lenape between the ye...",YES


In [11]:
query_engine = vector_index.as_query_engine()
response_vector = query_engine.query(
    "What battles took place in New York City in the American Revolution?"
)
eval_result = evaluator_gpt4.evaluate(response_vector)

In [12]:
display_eval_df(response_vector, eval_result)

Unnamed: 0,Response,Source,Evaluation Result
0,"The Battle of Long Island, which was the largest battle of the American Revolutionary War, took place in August 1776 within the modern-day borough of Brooklyn. The only attempt at a peaceful solution to the war took place at the Conference House on Staten Island between American delegates, including Benjamin Franklin, and British general Lord Howe on September 11, 1776.","enslaved few or several people. Others were hired out to work at labor. Slavery became integrally tied to New York's economy through the labor of slaves throughout the port, and the banking and shipping industries trading with the American South. During construction in Foley Square in the 1990s, the African Burying Ground was discovered; the cemetery included 10,000 to 20,000 of graves of colonial-era Africans, some enslaved and some free.The 1735 trial and acquittal in Manhattan of John Peter Zenger, who had been accused of seditious libel after criticizing colonial governor William Cosby, helped to establish the freedom of the press in North America. In 1754, Columbia University was founded under charter by King George II as King's College in Lower Manhattan. === American Revolution === The Stamp Act Congress met in New York in October 1765, as the Sons of Liberty organization emerged in the city and skirmished over the next ten years with British troops stationed there. The Battl...",YES


## Benchmark on Generated Question

Now lets generate a few more questions so that we have more to evaluate with and run a small benchmark. In practic

In [24]:
from llama_index.evaluation import DatasetGenerator

question_generator = DatasetGenerator.from_documents(documents)
eval_questions = question_generator.generate_questions_from_nodes(3)

len(eval_questions)

chunk_size_limit is deprecated, please specify chunk_size instead


110

In [40]:
eval_questions[:5]

['What is the population of New York City as of 2020?',
 'Which borough of New York City has the highest population?',
 'What is the economic significance of New York City?',
 'How did New York City get its name?',
 'What is the significance of the Statue of Liberty in New York City?']

In [37]:
import asyncio

def evaluate_query_engine(query_engine, questions):
    c = [query_engine.aquery(q) for q in questions]
    results = asyncio.run(asyncio.gather(*c))
    print("finished query")
    
    total_correct = 0
    for r in results:
        eval_result = 1 if evaluator.evaluate(r) == "YES" else 0
        total_correct + eval_result
        
    return total_correct, len(results)

In [38]:
vector_query_engine = vector_index.as_query_engine()
correct, total = evaluate_query_engine(vector_query_engine, eval_questions[:5])

print(f"score: {correct}/{total}")

INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/completions processing_ms=1098 request_id=8ddbc4f630bc2b5cf79e517dd0393db2 response_code=200
message='OpenAI API response' path=https://api.openai.com/v1/completions processing_ms=1098 request_id=8ddbc4f630bc2b5cf79e517dd0393db2 response_code=200
INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/completions processing_ms=1118 request_id=e47a91e0f7e7bb96338bcf2b67408ef5 response_code=200
message='OpenAI API response' path=https://api.openai.com/v1/completions processing_ms=1118 request_id=e47a91e0f7e7bb96338bcf2b67408ef5 response_code=200
INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/completions processing_ms=4390 request_id=a50bc7c7ea2e01074102ceae656d44e8 response_code=200
message='OpenAI API response' path=https://api.openai.com/v1/completions processing_ms=4390 request_id=a50bc7c7ea2e01074102ceae656d44e8 response_code=200
INFO:openai:message='OpenAI API response

In [39]:
tree_query_engine = tree_index.as_query_engine()
correct, total = evaluate_query_engine(tree_query_engine, eval_questions[:5])

print(f"score: {correct}/{total}")

INFO:llama_index.indices.tree.select_leaf_retriever:>[Level 0] Selected node: [1]/[1]
>[Level 0] Selected node: [1]/[1]
INFO:llama_index.indices.tree.select_leaf_retriever:>[Level 1] Selected node: [1]/[1]
>[Level 1] Selected node: [1]/[1]
INFO:llama_index.indices.tree.select_leaf_retriever:>[Level 0] Selected node: [1]/[1]
>[Level 0] Selected node: [1]/[1]
INFO:llama_index.indices.tree.select_leaf_retriever:>[Level 1] Selected node: [2]/[2]
>[Level 1] Selected node: [2]/[2]
INFO:llama_index.indices.tree.select_leaf_retriever:>[Level 0] Selected node: [3]/[3]
>[Level 0] Selected node: [3]/[3]
INFO:llama_index.indices.tree.select_leaf_retriever:>[Level 1] Selected node: [5]/[5]
>[Level 1] Selected node: [5]/[5]
INFO:llama_index.indices.tree.select_leaf_retriever:>[Level 0] Selected node: [1]/[1]
>[Level 0] Selected node: [1]/[1]
INFO:llama_index.indices.tree.select_leaf_retriever:>[Level 1] Selected node: [4]/[4]
>[Level 1] Selected node: [4]/[4]
INFO:llama_index.indices.tree.select_lea