In [17]:
import logging, sys
# Route log records to stdout at INFO level so they appear in the cell output.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
# NOTE(review): this attaches one more stdout handler to the root logger in
# addition to whatever basicConfig set up, which is presumably why log lines in
# the saved outputs appear twice — confirm whether both handlers are wanted.
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

# Logging is currently DISABLED for the whole notebook by the call below
# (it suppresses every level up to sys.maxsize). Comment it out to see logs.
logging.disable(sys.maxsize)

In [2]:
# NOTE: only necessary for querying with `use_async=True` in notebook
import nest_asyncio
nest_asyncio.apply()

In [3]:
# OpenAI credentials.
# Paste a key into the placeholder below for a quick local run, or leave it empty
# to use the OPENAI_API_KEY that is already exported in the environment.
# An empty placeholder is deliberately NOT written to os.environ: doing so would
# overwrite a valid key from the environment with "".
# Never commit a real key together with the notebook.
import os

_openai_api_key = ""
if _openai_api_key:
    os.environ['OPENAI_API_KEY'] = _openai_api_key

In [3]:
from llama_index import GPTTreeIndex, SimpleDirectoryReader, LLMPredictor, GPTVectorStoreIndex, GPTListIndex, Prompt, ServiceContext
from llama_index.indices.base import BaseGPTIndex
from llama_index.langchain_helpers.text_splitter import TokenTextSplitter
from langchain.chat_models import ChatOpenAI
from langchain.llms import OpenAI
from llama_index.response.schema import Response
import pandas as pd
from typing import Tuple

# Setup data

In [2]:
# Fetch the plain-text "New York City" page from Wikipedia and store it as
# data/nyc_text.txt so the directory reader can pick it up later.
from pathlib import Path

import requests

http_response = requests.get(
    'https://en.wikipedia.org/w/api.php',
    params={
        'action': 'query',
        'format': 'json',
        'titles': 'New York City',
        'prop': 'extracts',
        # 'exintro': True,
        'explaintext': True,
    },
    # Without a timeout an unreachable server would hang the cell indefinitely.
    timeout=30,
)
# Surface HTTP errors here instead of as a confusing KeyError on the lines below.
http_response.raise_for_status()
response = http_response.json()
# The API keys the result by page id; only one title was requested, so take the
# first (and only) entry.
page = next(iter(response['query']['pages'].values()))
nyc_text = page['extract']

data_path = Path('data')
data_path.mkdir(exist_ok=True)

# Write as UTF-8 explicitly: the platform default encoding is not UTF-8
# everywhere, and article text can contain non-ASCII characters.
with open(data_path / 'nyc_text.txt', 'w', encoding='utf-8') as fp:
    fp.write(nyc_text)

In [4]:
# Load the contents of the data/ directory (the text file written above) into
# the list of documents that every index below is built from.
documents = SimpleDirectoryReader('data').load_data()

DEBUG:llama_index.readers.file.base:> [SimpleDirectoryReader] Total files added: 1
> [SimpleDirectoryReader] Total files added: 1


# Setup benchmark

In [5]:
from dataclasses import dataclass
from typing import List

In [6]:
@dataclass
class TestCase:
    """A benchmark question together with the strings used to grade it."""
    # Natural-language question that is sent to the query engine.
    query: str 
    # Strings that must ALL be found (plain, case-sensitive substring match) for
    # a response / its source text to be graded as correct.
    must_contain: List[str]

In [7]:
@dataclass
class TestOutcome:
    """Pairs a test case with the response produced for it and grades the pair.

    Grading is plain, case-sensitive substring matching against
    ``test.must_contain``; an empty ``must_contain`` list grades as correct.
    """
    test: TestCase
    response: Response

    @property
    def is_correct_response(self) -> bool:
        """True when every expected string occurs in the response text.

        A response whose text is ``None`` is graded as empty text rather than
        raising ``TypeError`` from the ``in`` check.
        """
        answer_text = self.response.response or ""
        return all(expected in answer_text for expected in self.test.must_contain)

    @property
    def is_correct_source(self) -> bool:
        """True when every expected string occurs in at least one source node."""
        # NOTE(review): relies on each source node exposing `.source_text` —
        # confirm this attribute exists in the installed llama_index version.
        return all(
            any(expected in node.source_text for node in self.response.source_nodes)
            for expected in self.test.must_contain
        )

In [8]:
class Benchmark:
    """Runs a fixed list of test cases against an index and collects the outcomes."""

    def __init__(self, tests: List[TestCase]) -> None:
        """Keep a reference to the test cases to run (the list is not copied)."""
        self._tests = tests

    def test(self, index: BaseGPTIndex, llm_predictor: LLMPredictor, **kwargs) -> List[TestOutcome]:
        """Query ``index`` once per test case and record each response.

        Args:
            index: Index to query.
            llm_predictor: Predictor wrapped into the service context that the
                query engine is created with.
            **kwargs: Forwarded unchanged to ``index.as_query_engine``.

        Returns:
            One ``TestOutcome`` per test case, in the order the cases were given.
        """
        service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor)
        # The engine depends only on the index, the service context and kwargs,
        # none of which change between test cases, so it is built once here
        # instead of once per query.
        query_engine = index.as_query_engine(
            service_context=service_context,
            **kwargs
        )
        return [
            TestOutcome(test=test, response=query_engine.query(test.query))
            for test in self._tests
        ]

In [9]:
def analyze_outcome(outcomes: List["TestOutcome"]) -> pd.DataFrame:
    """Summarise the substring-based grading of each outcome as a table.

    Args:
        outcomes: Outcomes to report on. Only ``test.query``,
            ``is_correct_response`` and ``is_correct_source`` are read.

    Returns:
        A DataFrame with one row per outcome, in input order, and the columns
        'Test Query', 'Correct Response' and 'Correct Source'. An empty input
        yields an empty frame that still has those columns.
    """
    rows = [
        (outcome.test.query, outcome.is_correct_response, outcome.is_correct_source)
        for outcome in outcomes
    ]
    return pd.DataFrame(rows, columns=['Test Query', 'Correct Response', 'Correct Source'])

In [10]:
# Benchmark questions about the New York City article. Each `must_contain`
# entry is matched as an exact, case-sensitive substring when grading.
test_battle = TestCase(
    query="What battles took place in New York City in the American Revolution?",
    must_contain=["Battle of Long Island"]
)

# NOTE(review): some saved result tables further down show this query as
# "... during the Great ..."; those outputs come from an earlier wording and
# were not regenerated after the query text changed.
test_mayor = TestCase(
    query='Who was elected as the mayor after the Great Depression?',
    must_contain=["Fiorello La Guardia"]
)

# The expected figure must appear in exactly this textual form to count.
test_tourists = TestCase(
    query='How many tourists visited New York City in 2019?',
    must_contain=['66.6 million']
)
# Only one airport name is checked, so an answer passes as soon as it
# mentions it, whatever else it lists.
test_airport = TestCase(
    query='What are the airports in New York City?',
    must_contain=['LaGuardia Airport']
)
# Graded on the year alone.
test_visit = TestCase(
    query='When was the first documented visit into New York Harbor?',
    must_contain=['1524']
)

In [11]:
# Single benchmark instance shared by every index / predictor combination below;
# outcomes are returned in the order the test cases are listed here.
bm = Benchmark([
    test_battle,
    test_mayor,
    test_tourists,
    test_airport,
    test_visit,
])

# LLM-based evaluation (helpers)

In [592]:
# Template for the grading prompt. It shows the question, the retrieved context
# and the answer (each fenced by a dashed rule) and asks for two True/False
# verdicts followed by an explanation. Placeholders: {query_str},
# {context_str}, {answer_str}.
_EVAL_PROMPT_RULE = "---------------------"
EVAL_PROMPT_TMPL = "\n".join([
    "Given the question below. ",
    _EVAL_PROMPT_RULE,
    "{query_str}",
    _EVAL_PROMPT_RULE,
    "Decide if the following retreived context is relevant. ",
    "",
    _EVAL_PROMPT_RULE,
    "{context_str}",
    _EVAL_PROMPT_RULE,
    "Then decide if the answer is correct. ",
    "",
    _EVAL_PROMPT_RULE,
    "{answer_str}",
    _EVAL_PROMPT_RULE,
    "Answer in the following format:",
    "'Context is relevant: <True>",
    "Answer is correct: <True>' and explain why.",
])

# Prompt object built from the template above; this is what is handed to the
# LLM predictor when grading an outcome.
DEFAULT_EVAL_PROMPT = Prompt(EVAL_PROMPT_TMPL)

In [593]:
import re
def extract_eval_result(result_str: str):
    """Return every ``True``/``False`` literal in ``result_str`` as a boolean.

    The scan is case-sensitive and covers the whole text, so literals that
    appear in a free-form explanation are included as well. Results are in
    order of appearance; a text without any literal gives an empty list.
    """
    return [
        found.group(0) == "True"
        for found in re.finditer(r"(True|False)", result_str)
    ]

In [594]:
def analyze_outcome_llm_single(outcome: TestOutcome, llm_predictor: LLMPredictor) -> Tuple[bool, bool, str]:
    """Ask an LLM to grade a single outcome.

    The predictor is shown the question, the text of the FIRST source node only
    and the answer, and is asked for two verdicts followed by an explanation.

    Args:
        outcome: The outcome to grade.
        llm_predictor: Predictor used as the judge.

    Returns:
        ``(is_answer_correct, is_context_relevant, result_str)`` where
        ``result_str`` is the judge's raw reply.

    Raises:
        ValueError: If the reply contains fewer than two True/False literals.
    """
    try:
        source_text = outcome.response.source_nodes[0].source_text
    except Exception:
        # Best effort: grade without context when none can be read (no source
        # nodes, missing attribute, ...). Unlike a bare `except`, this does not
        # swallow KeyboardInterrupt / SystemExit.
        source_text = "Failed to retrieve any context"
    result_str, _ = llm_predictor.predict(
        DEFAULT_EVAL_PROMPT,
        query_str=outcome.test.query,
        context_str=source_text,
        answer_str=outcome.response.response
    )
    verdicts = extract_eval_result(result_str)
    if len(verdicts) < 2:
        raise ValueError(
            f"Expected two True/False verdicts in the evaluation, got {len(verdicts)}: {result_str!r}"
        )
    # The prompt asks for the context verdict first, then the answer verdict.
    # Later True/False literals belong to the free-form explanation and are
    # ignored (unpacking all of them used to fail with ValueError).
    is_context_relevant, is_answer_correct = verdicts[:2]
    return is_answer_correct, is_context_relevant, result_str

def analyze_outcome_llm(outcomes: List[TestOutcome], llm_predictor: LLMPredictor) -> pd.DataFrame:
    """Grade every outcome with an LLM judge and tabulate the verdicts.

    Args:
        outcomes: Outcomes to grade; one judge call is made per outcome.
        llm_predictor: Predictor used as the judge.

    Returns:
        A DataFrame with one row per outcome, in input order, and the columns
        'Test Query', 'Correct Response (LLM)', 'Correct Source (LLM)' and
        'Eval (LLM)' (the judge's raw reply).
    """
    rows = []
    for outcome in outcomes:
        is_correct_response, is_correct_source, result_str = analyze_outcome_llm_single(outcome, llm_predictor)
        rows.append([outcome.test.query, is_correct_response, is_correct_source, result_str])
    return pd.DataFrame(
        rows,
        columns=['Test Query', 'Correct Response (LLM)', 'Correct Source (LLM)', 'Eval (LLM)'],
    )

# Build Indices

In [643]:
# Vector store index over the loaded documents, built with library defaults.
vector_index = GPTVectorStoreIndex.from_documents(
    documents, 
)

In [473]:
# List index over the same documents, built with library defaults.
list_index = GPTListIndex.from_documents(
    documents, 
)

In [468]:
# Tree index over the same documents, built with library defaults.
tree_index = GPTTreeIndex.from_documents(documents)

# Create LLMPredictors

In [12]:
# gpt-4 (chat model). temperature=0 is used for every predictor in this
# notebook so that runs are as repeatable as the API allows.
llm_predictor_gpt4 = LLMPredictor(
    llm=ChatOpenAI(temperature=0, model_name="gpt-4")
)

In [169]:
# gpt-3 (text-davinci-003) — created through the `OpenAI` wrapper rather than
# `ChatOpenAI`, unlike the other two predictors.
llm_predictor_gpt3 = LLMPredictor(llm=OpenAI(temperature=0, model_name="text-davinci-003"))

In [22]:
# chatgpt (gpt-3.5-turbo), created through the `ChatOpenAI` wrapper.
llm_predictor_chatgpt = LLMPredictor(llm=ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo"))

# Benchmarking 

### Tree Index + GPT4

In [583]:
outcomes_tree_gpt4 = bm.test(tree_index, llm_predictor_gpt4)

In [584]:
analyze_outcome(outcomes_tree_gpt4)

Unnamed: 0,Test Query,Correct Response,Correct Source
0,What battles took place in New York City in th...,True,True
1,Who was elected as the mayor after the Great D...,False,False
2,How many tourists visited New York City in 2019?,False,False
3,What are the airports in New York City?,False,False
4,When was the first documented visit into New Y...,False,False


### Tree Index + GPT3

In [549]:
outcomes_tree_gpt3 = bm.test(tree_index, llm_predictor_gpt3)

In [550]:
analyze_outcome(outcomes_tree_gpt3)

Unnamed: 0,Test Query,Correct Response,Correct Source
0,What battles took place in New York City in th...,True,False
1,Who was elected as the mayor after the Great D...,False,False
2,How many tourists visited New York City in 2019?,False,False
3,What are the airports in New York City?,True,False
4,When was the first documented visit into New Y...,True,False


### List Index + GPT4

In [18]:
outcomes_list_gpt4 = bm.test(list_index, llm_predictor_gpt4, response_mode="tree_summarize", use_async=True)

In [19]:
analyze_outcome(outcomes_list_gpt4)

Unnamed: 0,Test Query,Correct Response,Correct Source
0,What battles took place in New York City in th...,False,True
1,Who was elected as the mayor after the Great D...,False,True
2,How many tourists visited New York City in 2019?,True,True
3,What are the airports in New York City?,True,True
4,When was the first documented visit into New Y...,True,True


### List Index + GPT3

In [501]:
outcomes_list_gpt3 = bm.test(list_index, llm_predictor_gpt3, response_mode="tree_summarize", use_async=True)

In [502]:
analyze_outcome(outcomes_list_gpt3)

Unnamed: 0,Test Query,Correct Response,Correct Source
0,What battles took place in New York City in th...,True,True
1,Who was elected as the mayor during the Great ...,True,True
2,How many tourists visited New York City in 2019?,False,True
3,What are the airports in New York City?,True,True
4,When was the first documented visit into New Y...,True,True


### List Index + ChatGPT

In [23]:
outcomes_list_chatgpt = bm.test(list_index, llm_predictor_chatgpt, response_mode="tree_summarize", use_async=True)

In [24]:
analyze_outcome(outcomes_list_chatgpt)

Unnamed: 0,Test Query,Correct Response,Correct Source
0,What battles took place in New York City in th...,False,True
1,Who was elected as the mayor after the Great D...,False,True
2,How many tourists visited New York City in 2019?,False,True
3,What are the airports in New York City?,True,True
4,When was the first documented visit into New Y...,True,True


### Vector Store Index + GPT4 

In [487]:
outcomes_vector_gpt4 = bm.test(vector_index, llm_predictor_gpt4)

In [488]:
analyze_outcome(outcomes_vector_gpt4)

Unnamed: 0,Test Query,Correct Response,Correct Source
0,What battles took place in New York City in th...,True,True
1,Who was elected as the mayor during the Great ...,True,True
2,How many tourists visited New York City in 2019?,False,False
3,What are the airports in New York City?,True,True
4,When was the first documented visit into New Y...,True,True


### Vector Store Index + GPT3

In [644]:
outcomes_vector_gpt3 = bm.test(vector_index, llm_predictor_gpt3)

In [645]:
analyze_outcome(outcomes_vector_gpt3)

Unnamed: 0,Test Query,Correct Response,Correct Source
0,What battles took place in New York City in th...,True,True
1,Who was elected as the mayor after the Great D...,True,False
2,How many tourists visited New York City in 2019?,False,False
3,What are the airports in New York City?,True,False
4,When was the first documented visit into New Y...,True,False


# LLM-based evaluation (results)

In [646]:
analyze_outcome(outcomes_vector_gpt3)

Unnamed: 0,Test Query,Correct Response,Correct Source
0,What battles took place in New York City in th...,True,True
1,Who was elected as the mayor after the Great D...,True,False
2,How many tourists visited New York City in 2019?,False,False
3,What are the airports in New York City?,True,False
4,When was the first documented visit into New Y...,True,False


In [647]:
eval_gpt4 = analyze_outcome_llm(outcomes_vector_gpt3, llm_predictor_gpt4)

In [657]:
eval_gpt4

Unnamed: 0,Test Query,Correct Response (LLM),Correct Source (LLM),Eval (LLM)
0,What battles took place in New York City in th...,True,True,Context is relevant: True\nAnswer is correct: ...
1,Who was elected as the mayor after the Great D...,True,False,Context is relevant: False\nAnswer is correct:...
2,How many tourists visited New York City in 2019?,True,False,Context is relevant: False\nAnswer is correct:...
3,What are the airports in New York City?,True,False,Context is relevant: False\nAnswer is correct:...
4,When was the first documented visit into New Y...,True,False,Context is relevant: False\nAnswer is correct:...


In [651]:
eval_chatgpt = analyze_outcome_llm(outcomes_vector_gpt3, llm_predictor_chatgpt)

In [652]:
eval_chatgpt

Unnamed: 0,Test Query,Correct Response (LLM),Correct Source (LLM),Eval (LLM)
0,What battles took place in New York City in th...,True,True,\n\nContext is relevant: True\nAnswer is corre...
1,Who was elected as the mayor after the Great D...,True,True,\n\nContext is relevant: True\nAnswer is corre...
2,How many tourists visited New York City in 2019?,False,False,\n\nContext is relevant: False\nAnswer is corr...
3,What are the airports in New York City?,True,False,\n\nContext is relevant: False\nAnswer is corr...
4,When was the first documented visit into New Y...,False,True,\n\nContext is relevant: True\nAnswer is corre...


In [649]:
eval_gpt3 = analyze_outcome_llm(outcomes_vector_gpt3, llm_predictor_gpt3)

In [650]:
eval_gpt3

Unnamed: 0,Test Query,Correct Response (LLM),Correct Source (LLM),Eval (LLM)
0,What battles took place in New York City in th...,True,True,\n\nContext is relevant: True\nAnswer is corre...
1,Who was elected as the mayor after the Great D...,True,True,\n\nContext is relevant: True\nAnswer is corre...
2,How many tourists visited New York City in 2019?,False,False,\n\nContext is relevant: False\nAnswer is corr...
3,What are the airports in New York City?,True,True,\n\nContext is relevant: True\nAnswer is corre...
4,When was the first documented visit into New Y...,True,True,\n\nContext is relevant: True\nAnswer is corre...
