# Parameter Optimization

In [1]:
# -p makes the cell re-runnable: plain `mkdir data` fails when data/ already
# exists, and because of `&&` the download would then be skipped entirely.
!mkdir -p data && wget --user-agent "Mozilla" "https://arxiv.org/pdf/2307.09288.pdf" -O "data/llama2.pdf"

--2023-11-04 00:16:34--  https://arxiv.org/pdf/2307.09288.pdf
Resolving arxiv.org (arxiv.org)... 128.84.21.199
Connecting to arxiv.org (arxiv.org)|128.84.21.199|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 13661300 (13M) [application/pdf]
Saving to: ‘data/llama2.pdf’


2023-11-04 00:17:10 (376 KB/s) - ‘data/llama2.pdf’ saved [13661300/13661300]



In [1]:
import nest_asyncio
# Patch asyncio so event loops can be nested (per the nest_asyncio docs).
# Presumably required because the notebook kernel already runs a loop and
# later cells call synchronous wrappers around async evaluation code — confirm.
nest_asyncio.apply()

In [2]:
from pathlib import Path
from llama_hub.file.pdf.base import PDFReader
from llama_hub.file.unstructured.base import UnstructuredReader
from llama_hub.file.pymu_pdf.base import PyMuPDFReader

In [3]:
# Read the Llama 2 paper (saved as ./data/llama2.pdf by the download cell)
# into a collection of documents; `docs0` is iterated in the next cell.
loader = PDFReader()
docs0 = loader.load_data(file=Path("./data/llama2.pdf"))

In [4]:
from llama_index import Document

# Concatenate the content of every loaded document, separated by blank lines,
# and wrap the result in a single Document so chunking is not tied to the
# boundaries produced by the PDF reader.
doc_text = "\n\n".join(part.get_content() for part in docs0)
docs = [Document(text=doc_text)]

In [5]:
from llama_index.node_parser import SimpleNodeParser
from llama_index.schema import IndexNode

## Define Eval Dataset

In [6]:
from llama_index.evaluation import (
    DatasetGenerator,
    QueryResponseDataset,
)
from llama_index import ServiceContext
from llama_index.llms import OpenAI

In [7]:
# NOTE: this cell is only needed when the eval dataset has not been saved yet.

# LLM configuration handed to the dataset generator.
eval_service_context = ServiceContext.from_defaults(llm=OpenAI(model="gpt-4"))

# Default chunking, used solely to obtain nodes to generate questions from.
node_parser = SimpleNodeParser.from_defaults(chunk_size=1024)
eval_nodes = node_parser.get_nodes_from_documents(docs)

# Only the first 20 nodes are used; per the original author's note,
# the remaining nodes are references.
dataset_generator = DatasetGenerator(
    eval_nodes[:20],
    num_questions_per_chunk=3,
    show_progress=True,
    service_context=eval_service_context,
)

In [10]:
# num=60 matches the generator setup above: 20 nodes x 3 questions per chunk.
# The top-level `await` relies on the notebook's running event loop.
eval_dataset = await dataset_generator.agenerate_dataset_from_nodes(num=60)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:12<00:00,  1.54it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:16<00:00,  5.56s/it]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:04<00:00,  1.40s/it]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:06<00:00,  2.16s/it]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:07<00:00,  2.66s/it]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:15<00:00,  5.24s/it]
100%|█████████████████████████████████████████

In [11]:
# Persist the generated dataset so later sessions can load it from disk
# instead of regenerating it.
eval_dataset.save_json("data/llama2_eval_qr_dataset.json")

In [8]:
# Optional: reload the dataset from the JSON file written by `save_json`
# (use this instead of the generation cells when the file already exists).
eval_dataset = QueryResponseDataset.from_json("data/llama2_eval_qr_dataset.json")

In [9]:
from llama_index.evaluation import BatchEvalRunner

In [10]:
# Questions to ask, plus the reference answer of each (query, response) pair.
eval_qs = eval_dataset.questions
ref_response_strs = [pair[1] for pair in eval_dataset.qr_pairs]

## Define Function to Optimize

Here we define the function to optimize, given the parameters.

The function specifically does the following: 1) builds an index from the documents, 2) queries the index, and 3) runs some basic evaluation.

In [11]:
from llama_index import VectorStoreIndex, load_index_from_storage, StorageContext
from llama_index.param_tuner.base import ParamTuner, TunedResult, RunResult
from llama_index.evaluation.eval_utils import get_responses
from llama_index.evaluation import SemanticSimilarityEvaluator
import os
import numpy as np
from pathlib import Path

In [12]:
def objective_function(params_dict):
    """Build (or load) a vector index for one parameter combination and score it.

    Args:
        params_dict: Mapping that must contain:
            - "chunk_size": chunk size given to `SimpleNodeParser.from_defaults`;
              also used as the on-disk cache key for the index.
            - "top_k": passed as `similarity_top_k` to the query engine.
            - "docs": documents parsed into nodes when the index is built.
            - "eval_qs": questions sent to the query engine.
            - "ref_response_strs": reference answers, passed as `reference`
              to the evaluator.

    Returns:
        RunResult whose `score` is the mean "semantic_similarity" score over
        the evaluated responses and whose `params` is `params_dict`.

    Raises:
        KeyError: If any of the keys listed above is missing.
    """
    chunk_size = params_dict["chunk_size"]
    docs = params_dict["docs"]
    top_k = params_dict["top_k"]
    eval_qs = params_dict["eval_qs"]
    ref_response_strs = params_dict["ref_response_strs"]

    # The persisted index is keyed on chunk_size only, so it is shared by all
    # top_k values. NOTE: it is NOT invalidated when `docs` changes; delete
    # ./storage_<chunk_size> manually in that case.
    index_out_path = f"./storage_{chunk_size}"

    # Trust the cache only if the directory actually contains files. Checking
    # bare existence is not enough: a run that fails while parsing/embedding
    # would otherwise leave an empty directory behind and make every later
    # run try (and fail) to load an index from it.
    has_cached_index = os.path.isdir(index_out_path) and bool(
        os.listdir(index_out_path)
    )

    if not has_cached_index:
        # parse docs
        node_parser = SimpleNodeParser.from_defaults(chunk_size=chunk_size)
        base_nodes = node_parser.get_nodes_from_documents(docs)

        # build index
        index = VectorStoreIndex(base_nodes)

        # Create the directory only once the index exists, then save to disk.
        Path(index_out_path).mkdir(parents=True, exist_ok=True)
        index.storage_context.persist(index_out_path)
    else:
        # rebuild storage context and load the previously persisted index
        storage_context = StorageContext.from_defaults(persist_dir=index_out_path)
        index = load_index_from_storage(storage_context)

    # query engine
    query_engine = index.as_query_engine(similarity_top_k=top_k)

    # get predicted responses
    pred_response_objs = get_responses(eval_qs, query_engine, show_progress=True)

    # run evaluator
    # NOTE: further evaluators can be added to the dict given to BatchEvalRunner
    eval_service_context = ServiceContext.from_defaults(
        llm=OpenAI(model="gpt-3.5-turbo")
    )
    evaluator_s = SemanticSimilarityEvaluator(service_context=eval_service_context)
    eval_batch_runner = BatchEvalRunner(
        {"semantic_similarity": evaluator_s}, workers=2, show_progress=True
    )
    eval_results = eval_batch_runner.evaluate_responses(
        eval_qs, responses=pred_response_objs, reference=ref_response_strs
    )

    # get semantic similarity metric; cast so the result carries a plain
    # Python float rather than a numpy scalar
    mean_score = float(
        np.array([r.score for r in eval_results["semantic_similarity"]]).mean()
    )

    return RunResult(score=mean_score, params=params_dict)

In [14]:
# chunk_size=256

# index_out_path = f"./storage_{chunk_size}"
# if not os.path.exists(index_out_path):
#     Path(index_out_path).mkdir(parents=True, exist_ok=True)
#     # parse docs
#     node_parser = SimpleNodeParser.from_defaults(chunk_size=chunk_size)
#     base_nodes = node_parser.get_nodes_from_documents(docs)

#     # build index
#     index = VectorStoreIndex(base_nodes)
#     # save index to disk
#     # index.set_index_id(f"vector_index_{chunk_size}")
#     index.storage_context.persist(index_out_path)
# else:
#     # rebuild storage context
#     storage_context = StorageContext.from_defaults(persist_dir=index_out_path)
#     # load index
#     index = load_index_from_storage(
#         storage_context, 
#         # index_id=f"vector_index_{chunk_size}"
#     )

In [15]:
# # TMP TEST
# param_combination = {'top_k': 1, 'chunk_size': 256}
# tmp_fixed_param_dict = {
#     "docs": docs,
#     # "eval_batch_runner": eval_batch_runner,
#     "eval_qs": eval_qs[:4],
#     "ref_response_strs": ref_response_strs[:4]
# }

# objective_function({**param_combination, **tmp_fixed_param_dict})

## Run ParamTuner (default)

In [31]:
from llama_index.param_tuner.base import ParamTuner

In [32]:
# Full search grid, currently disabled (presumably to keep the demo run cheap):
# param_dict = {
#     "chunk_size": [256, 512, 1024, 2048],
#     "top_k": [1, 2, 5]
# }
# Active grid: a single combination, so tuning performs exactly one run.
param_dict = {
    "chunk_size": [256],
    "top_k": [1]
}
# Values handed unchanged to every run of objective_function.
# Only the first 10 questions / reference answers are evaluated.
fixed_param_dict = {
    "docs": docs,
    # "eval_batch_runner": eval_batch_runner,
    "eval_qs": eval_qs[:10],
    "ref_response_strs": ref_response_strs[:10]
}

In [33]:
# Default tuner: `param_fn` is called with the tuned parameters from
# `param_dict` together with the entries of `fixed_param_dict`
# (objective_function reads both kinds of keys from one dict).
param_tuner = ParamTuner(
    param_fn=objective_function,
    param_dict=param_dict, 
    fixed_param_dict=fixed_param_dict,
)

In [34]:
# Run the search; the output below prints the parameter combinations tried.
results = param_tuner.tune()

[{'top_k': 1, 'chunk_size': 256}]
{'top_k': 1, 'chunk_size': 256}


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:04<00:00,  2.26it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:01<00:00,  5.80it/s]


In [37]:
# Display the tuner's `best_run_result` (the RunResult returned by objective_function).
results.best_run_result



## Run ParamTuner (Ray Tune)

In [16]:
from llama_index.param_tuner.base import RayTuneParamTuner

In [17]:
# Same single-combination grid and fixed inputs as the default-tuner section,
# redefined here so this section does not depend on that section's cells.
param_dict = {
    "chunk_size": [256],
    "top_k": [1]
}
fixed_param_dict = {
    "docs": docs,
    "eval_qs": eval_qs[:10],
    "ref_response_strs": ref_response_strs[:10]
}

In [20]:
# Ray Tune-backed tuner using the same objective and grids.
# `run_config_dict` is presumably forwarded to Ray's run configuration
# (experiment name and results directory) — confirm against the tuner docs.
# NOTE(review): "/tmp/custom/ray_tune" is a hard-coded absolute path.
# NOTE(review): objective_function caches indexes under the relative path
# ./storage_<chunk_size>; confirm which working directory Ray trials run in.
param_tuner = RayTuneParamTuner(
    param_fn=objective_function,
    param_dict=param_dict, 
    fixed_param_dict=fixed_param_dict,
    run_config_dict={"storage_path": "/tmp/custom/ray_tune", "name": "my_exp"}
)

In [None]:
# Run the Ray Tune search; rebinds `results` from the default-tuner section.
results = param_tuner.tune()

In [25]:
# The best run's params contain both the fixed inputs and the tuned values.
results.best_run_result.params.keys()

dict_keys(['docs', 'eval_qs', 'ref_response_strs', 'chunk_size', 'top_k'])

In [26]:
# Index of the best run; 0 here because only one combination was searched.
results.best_idx

0