In [None]:
# !pip install llama-index-vector-stores-faiss
# !pip install llama-index-embeddings-huggingface
# !pip install llama-index
# !pip install faiss-cpu
# !pip install datasets~=2.20.0
# !pip install transformers[torch]
# !pip install accelerate -U

This notebook was running on a AWS sagemaker ml.g5.4xlarge notebook instance which has one A10 GPU (24G memory)
To run it you need to install all the dependencies and add your own OpenAI API keys

Dataset can be downloaded from here: 

In [1]:
import datasets
import faiss
import os
import random
from collections import defaultdict
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core import Document, load_index_from_storage, VectorStoreIndex, StorageContext
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.faiss import FaissVectorStore
from llama_index.core.ingestion import IngestionPipeline

random.seed(42)

In [47]:
os.environ["OPENAI_API_KEY"] = "Your API Key"

In [3]:
hotpot_qa = datasets.load_from_disk("data/hotpot_qa_filtered")
wiki_docs = datasets.load_from_disk("data/hotpot_qa_wiki_docs")

In [4]:
wiki_docs

Dataset({
    features: ['id', 'url', 'title', 'text'],
    num_rows: 451380
})

In [5]:
hotpot_qa

DatasetDict({
    train: Dataset({
        features: ['id', 'question', 'answer', 'type', 'level', 'supporting_facts', 'context'],
        num_rows: 76715
    })
    validation: Dataset({
        features: ['id', 'question', 'answer', 'type', 'level', 'supporting_facts', 'context'],
        num_rows: 6260
    })
})

In [6]:
# hotpot_qa["train"][0]['supporting_facts']

In [7]:
# hotpot_qa["train"][0]['context']

In [8]:
def get_pos_sentences(record):
    records = defaultdict(list)
    for title, idx in zip(record['supporting_facts']["title"], record['supporting_facts']["sent_id"]):
        # print(title, idx)
        cont_idx = record['context']["title"].index(title)
        # print(record['context']["title"][cont_idx])
        sentence = record['context']["sentences"][cont_idx][idx]
        # print(sentence)
        records[title].append(sentence)
    return records

def get_pos_chunk(record):
    records = {}
    for title, idx in zip(record['supporting_facts']["title"], record['supporting_facts']["sent_id"]):
        cont_idx = record['context']["title"].index(title)
        records[title] = " ".join(record['context']["sentences"][cont_idx])
    return records

### Create training set

In [6]:
from torch.utils.data import DataLoader
from sentence_transformers import InputExample
from sentence_transformers import losses
from sentence_transformers import SentenceTransformer, SentenceTransformerTrainingArguments, SentenceTransformerTrainer
import random
from datasets import Dataset
from sentence_transformers.evaluation import TripletEvaluator

In [10]:
def get_triplet_data(record: dict) -> list[InputExample]:
    input_examples = []
    pos_titles = set(record['supporting_facts']["title"])
    pos_chunks = []
    neg_chunks = []
    for title, sentences in zip(record['context']["title"], record['context']["sentences"]):
        if title in pos_titles:
            pos_chunks.append(" ".join(sentences))
        else:
            neg_chunks.append(" ".join(sentences))

    for p_ch in pos_chunks:
        for n_ch in neg_chunks:
            input_examples.append({"anchor": record["question"], "positive": p_ch, "negative":n_ch})
            # input_examples.append([record["question"], p_ch, n_ch])
    return input_examples

In [20]:
test_examples = []
for record in hotpot_qa["validation"]:
    test_examples.extend(get_triplet_data(record))
    
len(test_examples)

99526

In [21]:
test_dataset = Dataset.from_list(random.sample(test_examples, 10000))

In [22]:
train_examples = []
for record in hotpot_qa["train"]:
    train_examples.extend(get_triplet_data(record))

In [23]:
len(train_examples)

1218618

In [1]:
# set small size for quick iter
TRAIN_SIZE = 100000
EVAL_SIZE = 2000

all_data = random.sample(train_examples, TRAIN_SIZE + EVAL_SIZE)

In [25]:
train_data = all_data[:TRAIN_SIZE]
len(train_data)

100000

In [26]:
eval_data = all_data[TRAIN_SIZE:]
len(eval_data)

2000

In [19]:
train_dataset = Dataset.from_list(train_data)

In [20]:
eval_dataset = Dataset.from_list(eval_data) # change it in final code

In [21]:
model = SentenceTransformer('Alibaba-NLP/gte-large-en-v1.5', trust_remote_code=True, device="cuda")
triplet_loss = losses.TripletLoss(model=model)

README.md:   0%|          | 0.00/71.8k [00:00<?, ?B/s]

In [22]:
# (Optional) Get base model performance
test_evaluator = TripletEvaluator(
    anchors=test_dataset["anchor"],
    positives=test_dataset["positive"],
    negatives=test_dataset["negative"],
    batch_size = 16,
    show_progress_bar = True,
    name="hotpot-val-set",
)
test_evaluator(model)

Batches:   0%|          | 0/625 [00:00<?, ?it/s]

Batches:   0%|          | 0/625 [00:00<?, ?it/s]

Batches:   0%|          | 0/625 [00:00<?, ?it/s]

{'hotpot-val-set_cosine_accuracy': 0.867,
 'hotpot-val-set_dot_accuracy': 0.1331,
 'hotpot-val-set_manhattan_accuracy': 0.8678,
 'hotpot-val-set_euclidean_accuracy': 0.867,
 'hotpot-val-set_max_accuracy': 0.8678}

In [18]:
BATCH_SIZE = 4
# LEARNING_RATE = 2e-4
LEARNING_RATE = 1e-5
WARM_RATIO = 0.1

In [23]:
# 5. (Optional) Specify training arguments
args = SentenceTransformerTrainingArguments(
    # Required parameter:
    output_dir=f"data/ft_models/ft_{TRAIN_SIZE}",
    # Optional training parameters:
    num_train_epochs=1,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE * 2,
    learning_rate=LEARNING_RATE,
    warmup_ratio=WARM_RATIO,
    load_best_model_at_end=True,
    fp16=True,  # Set to False if you get an error that your GPU can't run on FP16
    bf16=False,  # Set to True if you have a GPU that supports BF16
    # batch_sampler=BatchSamplers.NO_DUPLICATES,  # MultipleNegativesRankingLoss benefits from no duplicate samples in a batch
    # Optional tracking/debugging parameters:
    eval_strategy="steps",
    eval_steps=500,
    save_strategy="steps",
    save_steps=500,
    save_total_limit=10,
    logging_steps=500,
    run_name="emb-base-all-triplet-test",  # Will be used in W&B if `wandb` is installed
)

In [24]:
# 6. (Optional) Create an evaluator & evaluate the base model
dev_evaluator = TripletEvaluator(
    anchors=eval_dataset["anchor"],
    positives=eval_dataset["positive"],
    negatives=eval_dataset["negative"],
    name="hotpot-qa-sample-1k",
)
# dev_evaluator(model)

In [None]:
# 7. Create a trainer & train
trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    loss=triplet_loss,
    evaluator=dev_evaluator,
)
trainer.train()

Step,Training Loss,Validation Loss,Hotpot-qa-sample-1k Cosine Accuracy,Hotpot-qa-sample-1k Dot Accuracy,Hotpot-qa-sample-1k Manhattan Accuracy,Hotpot-qa-sample-1k Euclidean Accuracy,Hotpot-qa-sample-1k Max Accuracy
500,1.8325,1.28925,0.9165,0.0855,0.9175,0.9175,0.9175
1000,1.2075,0.870721,0.9425,0.055,0.9435,0.944,0.944
1500,1.0185,0.815812,0.9515,0.0505,0.953,0.952,0.953
2000,0.8405,0.715509,0.9575,0.0415,0.9575,0.9585,0.9585
2500,0.8645,0.73466,0.9465,0.0535,0.9555,0.9465,0.9555
3000,0.8512,0.724865,0.9535,0.0465,0.9565,0.954,0.9565
3500,0.8601,0.806895,0.9505,0.049,0.9515,0.951,0.9515
4000,0.8955,0.763556,0.9545,0.047,0.953,0.956,0.956
4500,0.821,0.730599,0.9515,0.049,0.953,0.9515,0.953
5000,0.8329,0.722934,0.954,0.0445,0.9565,0.954,0.9565


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

In [None]:
# help(model.save)

# the result different between Cosine accuracy and dot accuracy means the embedding was not normalized

# https://docs.llamaindex.ai/en/stable/module_guides/models/embeddings/
# Based on the document llama index use cosine similarity by default which works

In [26]:
trainer.save_model(f"data/ft_models/ft_{TRAIN_SIZE}_v2")

In [23]:
TRAIN_SIZE

100000

In [25]:
# help(SentenceTransformer)
ft_model = SentenceTransformer(f"data/ft_models/ft_{TRAIN_SIZE}_v2", trust_remote_code=True, device="cuda")

In [27]:
# # (Optional) Evaluate the trained model on the test set
test_evaluator = TripletEvaluator(
    anchors=test_dataset["anchor"],
    positives=test_dataset["positive"],
    negatives=test_dataset["negative"],
    batch_size = 16,
    show_progress_bar = True,
    name="hotpot-val-set",
)
test_evaluator(ft_model)

Batches:   0%|          | 0/625 [00:00<?, ?it/s]

Batches:   0%|          | 0/625 [00:00<?, ?it/s]

Batches:   0%|          | 0/625 [00:00<?, ?it/s]

{'hotpot-val-set_cosine_accuracy': 0.9619,
 'hotpot-val-set_dot_accuracy': 0.0386,
 'hotpot-val-set_manhattan_accuracy': 0.9614,
 'hotpot-val-set_euclidean_accuracy': 0.9615,
 'hotpot-val-set_max_accuracy': 0.9619}

## Create a baseline using all pretrained model in Val set

### Create index For Val set

In [None]:
# create index for all the 6K validataion queries is very time consuming, select the first 1000 queries for this demo

In [7]:
all_context_docs = set()

for record in hotpot_qa["validation"].select(range(1000)):
    non_target = set(record["context"]["title"]) - set(record["supporting_facts"]["title"])
    if len(non_target) < 2:
        neg_sample = non_target
    else:
        neg_sample = random.sample(non_target, 2)

    for t in neg_sample:
        all_context_docs.add(t)
    for t in set(record["supporting_facts"]["title"]):
        all_context_docs.add(t)

since Python 3.9 and will be removed in a subsequent version.
  neg_sample = random.sample(non_target, 2)


In [8]:
len(all_context_docs)

3943

In [9]:
corpus = wiki_docs.filter(lambda example: example["title"] in all_context_docs)

Filter:   0%|          | 0/451380 [00:00<?, ? examples/s]

In [10]:
corpus

Dataset({
    features: ['id', 'url', 'title', 'text'],
    num_rows: 3752
})

## Create Llama index

In [11]:
documents = [Document(
    text=doc["text"],
    doc_id=doc["id"],
    metadata={"url":doc["url"], "title": doc["title"]}
) for doc in corpus]

#### We create index for 3.7K documents in total

In [12]:
len(documents)

3752

In [40]:
# # Only use labeled data

# documents = []

# for record in hotpot_qa["validation"]:
#     for title, sentences in zip(record['context']["title"], record['context']["sentences"]):
#         documents.append(Document(text=" ".join(sentences), metadata={"title": title}))

In [13]:
# len(documents)

In [17]:
chuck_splitter = SentenceSplitter(chunk_size=512, chunk_overlap=50)
# nodes = chuck_splitter.get_nodes_from_documents(documents)

embed_model = HuggingFaceEmbedding(model_name=f"data/ft_models/ft_{TRAIN_SIZE}_v2", trust_remote_code=True, embed_batch_size=64,  device='cuda')

In [18]:
# test the model
test_emeds = embed_model.get_text_embedding("Hello World!")
len(test_emeds)

1024

In [35]:
embed_model

HuggingFaceEmbedding(model_name='data/ft_models/ft_100000_v2', embed_batch_size=64, callback_manager=<llama_index.core.callbacks.base.CallbackManager object at 0x7f789920d330>, num_workers=None, max_length=8192, normalize=True, query_instruction=None, text_instruction=None, cache_folder=None)

In [20]:
d = 1024
faiss_index = faiss.IndexFlatL2(d)
vector_store = FaissVectorStore(faiss_index=faiss_index)

In [21]:
# The following configuration assume one A10 GPU host

In [22]:
import time

start = time.time()

pipeline = IngestionPipeline(
    transformations=[
        chuck_splitter,
        embed_model,
    ],
)

# Ingest directly into a vector db
# nodes = pipeline.run(nodes=nodes, num_workers=2, show_progress=True)
nodes = pipeline.run(documents=documents, num_workers=2, show_progress=True)

end = time.time()
print(end - start)



895.2661793231964


#### For all the 3000 documents we got 22.9K document chunks

In [23]:
len(nodes)

22942

In [24]:
# # store intermediate results
# import pickle

# with open('data/doc_nodes.pickle', 'wb') as f:
#     # Pickle the 'data' dictionary using the highest protocol available.
#     pickle.dump(nodes, f)

In [25]:
# takes about 8 mins to index 500 queries so in total this may take about 2 hours for the whole val set

In [26]:
storage_context = StorageContext.from_defaults(vector_store=vector_store)
storage_context.docstore.add_documents(nodes)

In [27]:
index = VectorStoreIndex(
    nodes, storage_context=storage_context, show_progress=True, embed_model=embed_model
)

Generating embeddings: 0it [00:00, ?it/s]

Generating embeddings: 0it [00:00, ?it/s]

Generating embeddings: 0it [00:00, ?it/s]

Generating embeddings: 0it [00:00, ?it/s]

Generating embeddings: 0it [00:00, ?it/s]

Generating embeddings: 0it [00:00, ?it/s]

Generating embeddings: 0it [00:00, ?it/s]

Generating embeddings: 0it [00:00, ?it/s]

Generating embeddings: 0it [00:00, ?it/s]

Generating embeddings: 0it [00:00, ?it/s]

Generating embeddings: 0it [00:00, ?it/s]

Generating embeddings: 0it [00:00, ?it/s]

### Persist index

In [30]:
INDEX_STORE = "data/hotpot_qa_storage"

In [31]:
index.storage_context.persist(persist_dir=INDEX_STORE)

## Load index

In [140]:
# load index from disk
vector_store = FaissVectorStore.from_persist_dir(INDEX_STORE)
storage_context = StorageContext.from_defaults(
    vector_store=vector_store, persist_dir=INDEX_STORE
)
index = load_index_from_storage(storage_context=storage_context, embed_model=embed_model)

## Query Index

Help on method as_retriever in module llama_index.core.indices.vector_store.base:

as_retriever(**kwargs: Any) -> llama_index.core.base.base_retriever.BaseRetriever method of llama_index.core.indices.vector_store.base.VectorStoreIndex instance



In [58]:
retrieve_engine = index.as_retriever()
response = retrieve_engine.retrieve("What did the author do growing up?")

In [59]:
response

[NodeWithScore(node=TextNode(id_='fbc13228-b4d6-4a6c-9b2a-c5f484ac10c5', embedding=None, metadata={'url': 'https://en.wikipedia.org/wiki/C.%20S.%20Lewis', 'title': 'C. S. Lewis'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='5813', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'url': 'https://en.wikipedia.org/wiki/C.%20S.%20Lewis', 'title': 'C. S. Lewis'}, hash='87bcd18afa5a634c294bb2a7c39a01492f368e22b00de2596028d280a9ec48ea'), <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(node_id='9e1f5062-c954-43d3-9fe2-3d9f141496c9', node_type=<ObjectType.TEXT: '1'>, metadata={}, hash='a4cd2ffbe2904fd62b9967b2f92b40654dc40d2c64c6d33a61d400b4e98a8522')}, text='Clive Staples Lewis  (29 November 1898\xa0– 22 November 1963) was a British writer, literary scholar, and Anglican lay theologian. He held academic positions in English literature at both Magdalen College, Oxford (1925–1954), and Magdalene College, Cam

In [113]:
query_engine = index.as_query_engine(similarity_top_k=3)
query_engine.query("What did the author do growing up?")

**********
Trace: query
    |_query -> 0.962059 seconds
      |_synthesize -> 0.933216 seconds
        |_templating -> 1.6e-05 seconds
        |_llm -> 0.927075 seconds
**********


Response(response='The author attended the Glasgow Academy at the age of eight, then continued his education at the Forfar Academy at the age of 10. Later, at 14, he left home for Dumfries Academy, where he became a voracious reader and enjoyed penny dreadfuls and the works of various authors.', source_nodes=[NodeWithScore(node=TextNode(id_='fbc13228-b4d6-4a6c-9b2a-c5f484ac10c5', embedding=None, metadata={'url': 'https://en.wikipedia.org/wiki/C.%20S.%20Lewis', 'title': 'C. S. Lewis'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='5813', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'url': 'https://en.wikipedia.org/wiki/C.%20S.%20Lewis', 'title': 'C. S. Lewis'}, hash='87bcd18afa5a634c294bb2a7c39a01492f368e22b00de2596028d280a9ec48ea'), <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(node_id='9e1f5062-c954-43d3-9fe2-3d9f141496c9', node_type=<ObjectType.TEXT: '1'>, metadata={}, hash='a4cd2ffbe2904fd62

In [133]:
# help(Settings)

In [197]:
from llama_index.core.callbacks import CallbackManager, LlamaDebugHandler
from llama_index.core import Settings
from llama_index.core.tools import QueryEngineTool, ToolMetadata
from llama_index.core.query_engine import SubQuestionQueryEngine
from llama_index.core.base.base_query_engine import BaseQueryEngine
import nest_asyncio
from llama_index.llms.openai import OpenAI
import json
import os
import string
from openai import AsyncOpenAI

nest_asyncio.apply()
client = AsyncOpenAI()

In [143]:
# help(QueryEngineTool)

In [144]:
# Using the LlamaDebugHandler to print the trace of the sub questions
# captured by the SUB_QUESTION callback event type
query_engine = index.as_query_engine(similarity_top_k=3)
llama_debug = LlamaDebugHandler(print_trace_on_end=True)
callback_manager = CallbackManager([llama_debug])

Settings.callback_manager = callback_manager

In [145]:
# help(CallbackManager)

In [146]:
# setup base query engine as tool
query_engine_tools = [
    QueryEngineTool(
        query_engine=query_engine,
        metadata=ToolMetadata(
            name="wiki_search_engine",
            description="Wikipedia query engin",
        ),
    ),
]
# llm = OpenAI(temperature=0.1, model="gpt-4o")

sub_query_engine = SubQuestionQueryEngine.from_defaults(
    query_engine_tools=query_engine_tools,
    use_async=True,
    llm=llm,
)

In [147]:
help(SubQuestionQueryEngine.from_defaults)

Help on method from_defaults in module llama_index.core.query_engine.sub_question_query_engine:

from_defaults(query_engine_tools: Sequence[llama_index.core.tools.query_engine.QueryEngineTool], llm: Optional[llama_index.core.llms.llm.LLM] = None, question_gen: Optional[llama_index.core.question_gen.types.BaseQuestionGenerator] = None, response_synthesizer: Optional[llama_index.core.response_synthesizers.base.BaseSynthesizer] = None, service_context: Optional[llama_index.core.service_context.ServiceContext] = None, verbose: bool = True, use_async: bool = True) -> 'SubQuestionQueryEngine' method of abc.ABCMeta instance



In [93]:
# help(query_engine)

In [88]:
query_engine.query(
    "Were Scott Derrickson and Ed Wood of the same nationality?"
)

**********
Trace: query
    |_query -> 0.50256 seconds
      |_synthesize -> 0.474832 seconds
        |_templating -> 1.5e-05 seconds
        |_llm -> 0.470094 seconds
**********


Response(response='Yes.', source_nodes=[NodeWithScore(node=TextNode(id_='7bd3e25a-5261-4ebd-a7c1-1d85e04c01d8', embedding=None, metadata={'url': 'https://en.wikipedia.org/wiki/Scott%20Derrickson', 'title': 'Scott Derrickson'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='2816539', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'url': 'https://en.wikipedia.org/wiki/Scott%20Derrickson', 'title': 'Scott Derrickson'}, hash='031edde4c8df5319ef6dc8f2e3a4bb0f38b84a0cca7fbaa670fa156d5390ee81'), <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(node_id='8b1c9c7f-339f-4258-9de6-4f77868fbf48', node_type=<ObjectType.TEXT: '1'>, metadata={}, hash='733ed73c5e27905b06f56b5f8ce7bc576f22a061cb03c6e77b992f148af447a9')}, text='Scott Derrickson (born July 16, 1966) is an American filmmaker. He is best known for his work in the horror genre, directing films such as The Exorcism of Emily Rose (2005), Sinister (2012) and 

In [89]:
sub_query_engine.query(
    "What government position was held by the woman who portrayed Corliss Archer in the film Kiss and Tell?"
)

Generated 2 sub questions.
[1;3;38;2;237;90;200m[wiki_search_engine] Q: Who portrayed Corliss Archer in the film Kiss and Tell?
[0m[1;3;38;2;90;149;237m[wiki_search_engine] Q: What government position was held by the woman who portrayed Corliss Archer in the film Kiss and Tell?
[0m[1;3;38;2;90;149;237m[wiki_search_engine] A: Chief of Protocol of the United States
[0m[1;3;38;2;237;90;200m[wiki_search_engine] A: Shirley Temple
[0m**********
Trace: query
    |_query -> 1.903579 seconds
      |_llm -> 1.051759 seconds
      |_sub_question -> 0.541399 seconds
        |_query -> 0.541021 seconds
          |_synthesize -> 0.51277 seconds
            |_templating -> 1.7e-05 seconds
            |_llm -> 0.507151 seconds
      |_sub_question -> 0.464494 seconds
        |_query -> 0.464119 seconds
          |_synthesize -> 0.436556 seconds
            |_templating -> 1.5e-05 seconds
            |_llm -> 0.431738 seconds
      |_synthesize -> 0.308534 seconds
        |_templating -> 1.6e-0

Response(response='Chief of Protocol of the United States', source_nodes=[NodeWithScore(node=TextNode(id_='57133d82-e021-470f-b574-42331caeab39', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text='Sub question: Who portrayed Corliss Archer in the film Kiss and Tell?\nResponse: Shirley Temple', mimetype='text/plain', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n'), score=None), NodeWithScore(node=TextNode(id_='5594265b-ca62-46f0-9aff-93ccaae37d11', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text='Sub question: What government position was held by the woman who portrayed Corliss Archer in the film Kiss and Tell?\nResponse: Chief of Protocol of the United States', mimetype='text/plain', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{c

### Quick QA eval

We need a way to eval whether the answer is correct. To do that we do two things: 1. If the model generated answer contain the ground truth answer then mark it correct. 2. If the model generated answer doesn't contain the exact ground truth answer then use a LLM to decide wether the answer is correct or not.

In [148]:
# disable logs
Settings.callback_manager = None

In [149]:
# sub_query_engine.query(
#     "What government position was held by the woman who portrayed Corliss Archer in the film Kiss and Tell?"
# )

In [154]:
test_qa = hotpot_qa["validation"].select(range(1000))

In [151]:
test_qa[0].keys()

dict_keys(['id', 'question', 'answer', 'type', 'level', 'supporting_facts', 'context'])

In [152]:
def get_answer(example: dict, query_engine: BaseQueryEngine=sub_query_engine) -> dict:
    resp = query_engine.query(example["question"])
    return {"ft_gen_ans": resp.response}

In [153]:
get_answer(test_qa[0])

Generated 2 sub questions.
[1;3;38;2;237;90;200m[wiki_search_engine] Q: What is the nationality of Scott Derrickson?
[0m[1;3;38;2;90;149;237m[wiki_search_engine] Q: What is the nationality of Ed Wood?
[0m[1;3;38;2;237;90;200m[wiki_search_engine] A: American
[0m[1;3;38;2;90;149;237m[wiki_search_engine] A: The nationality of Ed Wood is American.
[0m

{'ft_gen_ans': 'Yes, Scott Derrickson and Ed Wood were of the same nationality.'}

In [155]:
# huggingface emb model doesn't support concurrent execution, need to figure out a way to setup different model worker so we can use num_proc > 1
test_qa = test_qa.map(get_answer, num_proc=1)



Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generated 2 sub questions.
[1;3;38;2;237;90;200m[wiki_search_engine] Q: What is the nationality of Scott Derrickson?
[0m[1;3;38;2;90;149;237m[wiki_search_engine] Q: What is the nationality of Ed Wood?
[0m[1;3;38;2;237;90;200m[wiki_search_engine] A: American
[0m[1;3;38;2;90;149;237m[wiki_search_engine] A: The nationality of Ed Wood is American.
[0mGenerated 2 sub questions.
[1;3;38;2;237;90;200m[wiki_search_engine] Q: Who portrayed Corliss Archer in the film Kiss and Tell?
[0m[1;3;38;2;90;149;237m[wiki_search_engine] Q: What government position was held by the woman who portrayed Corliss Archer in the film Kiss and Tell?
[0m[1;3;38;2;237;90;200m[wiki_search_engine] A: Shirley Temple portrayed Corliss Archer in the film Kiss and Tell.
[0m[1;3;38;2;90;149;237m[wiki_search_engine] A: The woman who portrayed Corliss Archer in the film Kiss and Tell held the position of United States Ambassador to Czechoslovakia.
[0mGenerated 2 sub questions.
[1;3;38;2;237;90;200m[wiki_searc

In [158]:
test_qa[0].keys()

dict_keys(['id', 'question', 'answer', 'type', 'level', 'supporting_facts', 'context', 'ft_gen_ans'])

In [188]:
CORRECTNESS_INSTRUCTIONS = """
You are proficient in English and skilled at reading comprehension tasks. Your task is to evaluate a model-generated answer by comparing it to a provided ground truth answer. For each statement in the generated answer, classify it according to one of the following categories:

* Correct: The generated answer can be verified using the ground truth.
* Incorrect: The generated answer is either conflict or cannot be validated against the ground truth.
* NoAnswer: The generated answer indicates that the model does not know the answer or don't have the right context to answer the question.

Summarize your findings in the following JSON format:

{{
    "EvalResult": "Correct" / "Incorrect" / "NoAnswer" based on your analysis
}}
"""

USER_MESSAGE = """
Question:
{question}

Generated Answer:
{generated_answer}

Ground Truth:
{ground_truth}

Evaluation Result:
"""


def load_json_str(gen: str) -> dict:
    start = gen.index("{")
    end = gen.rindex("}") + 1

    # Extract the JSON string
    json_string = gen[start:end]
    return json.loads(json_string)


# gpt-4-1106-preview
async def call_chat_api(system_prompt, user_message, model="gpt-4o-mini", return_json=True) -> dict:
    """
    call the openai chat api with the given prompt.

    Args:
        system_prompt: str, system prompt
        user_message: str, user message
        model: str, the model name

    Return:

        response: dict, the response message from openai
    """
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_message},
    ]
    #     print(f"start a call with system prompt {system_prompt}")
    #     print(f"start a call with user message {user_message}")

    if return_json:
        response = await client.chat.completions.create(
            model=model, messages=messages, response_format={"type": "json_object"}
        )
    else:
        response = await client.chat.completions.create(
            model=model, messages=messages, response_format={"type": "text"}
        )
    return load_json_str(response.choices[0].message.content)


async def is_correct(record: dict) -> str:
    user_message = USER_MESSAGE.format(question=record["question"], generated_answer=record["ft_gen_ans"], ground_truth=record["answer"])
    result = await call_chat_api(
        system_prompt=CORRECTNESS_INSTRUCTIONS,
        user_message=user_message)
    # print(result)
    return result["EvalResult"]

In [199]:
def remove_punctuation(input_string: str):
    return input_string.translate(str.maketrans('', '', string.punctuation))


async def eval_record(record: dict) -> bool:
    # try to do a fuzzy match
    answer = remove_punctuation(record["ft_gen_ans"]).lower()
    gt = remove_punctuation(record["answer"]).lower()
    if answer in gt or gt in answer:
        return "Correct"
    else:
        return await is_correct(record)

In [200]:
import asyncio

semaphore = asyncio.Semaphore(2)


async def process_query(record, func):
    max_retry = 3
    async with semaphore:
        retry_count = 0
        while retry_count < max_retry:
            try:
                output = await func(record)
                break
            except Exception as e:
                retry_count += 1
                print(f"Retrying {retry_count} times due to exception: {e}")
                await asyncio.sleep(10)
                output = {}
        return output


async def execute(test_qa: Dataset, func):
    tasks = []
    for record in test_qa:
        task = asyncio.create_task(process_query(record, eval_record))
        tasks.append(task)
    return await asyncio.gather(*tasks, return_exceptions=True)

In [208]:
results = await execute(test_qa, eval_record)

In [209]:
from collections import Counter

In [211]:
eval_result = Counter(results)

In [212]:
eval_result

Counter({'Correct': 540, 'Incorrect': 430, 'NoAnswer': 30})

In [217]:
test_qa = test_qa.add_column("eval_result", results)

In [224]:
incorrect_qa = test_qa.filter(lambda example: example["eval_result"] == "Incorrect")
correct_qa = test_qa.filter(lambda example: example["eval_result"] == "Correct")

Filter:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [229]:
Counter(hotpot_qa["validation"]['level'])

Counter({'hard': 6260})

In [228]:
hotpot_qa["validation"]

Dataset({
    features: ['id', 'question', 'answer', 'type', 'level', 'supporting_facts', 'context'],
    num_rows: 6260
})

## As a comparison here are some latest SOTA results on HPQA

https://contextual.ai/introducing-rag2/
