# RAG-based LLM Applications

- https://github.com/ray-project/llm-applications
- https://endpoints.anyscale.com/

## Set up

In [1]:
import os
import openai
from pathlib import Path
from pprint import pprint
import ray
from tqdm import tqdm

In [2]:
import sys; sys.path.append("..")
import warnings; warnings.filterwarnings("ignore")
from dotenv import load_dotenv; load_dotenv()

True

In [3]:
# Directories
EFS_DIR = Path("/efs/shared_storage/goku")  # shared storage root
ROOT_DIR = Path(os.getcwd()).parent  # parent of the notebook's working directory
EXPERIMENTS_DIR = ROOT_DIR / "experiments_small"  # where experiment outputs are written
print(f"EFS_DIR: {EFS_DIR}")
print(f"ROOT_DIR: {ROOT_DIR}")
print(f"EXPERIMENTS_DIR: {EXPERIMENTS_DIR}")

EFS_DIR: /efs/shared_storage/goku
ROOT_DIR: /home/ray/ray-assistant
EXPERIMENTS_DIR: /home/ray/ray-assistant/experiments_small


In [4]:
# Credentials
# Forward the API endpoints/keys and the DB connection string from this
# process's environment to the Ray runtime environment.
ray.init(runtime_env={"env_vars": {
    name: os.environ[name]
    for name in (
        "OPENAI_API_BASE",
        "OPENAI_API_KEY",
        "ANYSCALE_API_BASE",
        "ANYSCALE_API_KEY",
        "DB_CONNECTION_STRING",
    )
}})

2023-09-04 12:13:36,712	INFO worker.py:1431 -- Connecting to existing Ray cluster at address: 10.0.5.135:6379...
2023-09-04 12:13:36,725	INFO worker.py:1612 -- Connected to Ray cluster. View the dashboard at [1m[32mhttps://session-yn5cwtau135l5cajlbkzrdyqqp.i.anyscaleuserdata-staging.com [39m[22m
2023-09-04 12:13:36,728	INFO packaging.py:346 -- Pushing file package 'gcs://_ray_pkg_53a8a13ea982be5f3f90661c309df5a6.zip' (0.24MiB) to Ray cluster...
2023-09-04 12:13:36,730	INFO packaging.py:359 -- Successfully pushed file package 'gcs://_ray_pkg_53a8a13ea982be5f3f90661c309df5a6.zip'.


0,1
Python version:,3.8.13
Ray version:,2.6.3
Dashboard:,http://session-yn5cwtau135l5cajlbkzrdyqqp.i.anyscaleuserdata-staging.com


## Data

### Load data

Our data is already ready at `/efs/shared_storage/goku/docs.ray.io/en/master/` (on Staging, `us-east-1`) but if you wanted to load it yourself, run this bash command (change `/desired/output/directory`, but make sure it's on the shared storage,
so that it's accessible to the workers):
```bash
export DOCS_PATH=/desired/output/directory
wget -e robots=off --recursive --no-clobber --page-requisites \
  --html-extension --convert-links --restrict-file-names=windows \
  --domains docs.ray.io --no-parent --accept=html \
  -P $DOCS_PATH https://docs.ray.io/en/master/
```

In [6]:
# Ray dataset
# One record per downloaded HTML page; only the path is stored here, the file
# itself is read later by the function applied to each record.
docs_path = EFS_DIR / "docs.ray.io/en/master/"
ds = ray.data.from_items(
    [
        {"path": path}
        for path in docs_path.rglob("*.html")
        if not path.is_dir()
    ]
)
print(f"{ds.count()} documents")

3282 documents


### Chunk data

In [7]:
from bs4 import BeautifulSoup, NavigableString, Tag

In [8]:
def extract_text_from_section(section):
    """Return the text that belongs directly to ``section``.

    Direct children are visited in order: bare strings are kept when they are
    not blank, nested ``<section>`` elements are skipped, and every other
    element contributes its stripped ``get_text()``. The pieces are joined
    with newlines.
    """
    pieces = []
    for child in section.children:
        if isinstance(child, NavigableString):
            stripped = child.strip()
            if stripped:
                pieces.append(stripped)
            continue
        if child.name == 'section':
            # Nested sections are left out of the parent's text.
            continue
        pieces.append(child.get_text().strip())
    return '\n'.join(pieces)

In [9]:
def path_to_uri(path, scheme="https://", domain="docs.ray.io"):
    """Turn the local path of a downloaded page into its public URI.

    The part of ``path`` that follows ``domain`` is kept and re-rooted under
    ``scheme + domain``.
    """
    *_, suffix = str(path).split(domain)
    return "".join((scheme, domain, suffix))

In [10]:
def extract_sections(record):
    """Split one HTML page into its addressable ``<section>`` elements.

    Args:
        record: mapping with a "path" entry pointing at a local HTML file.

    Returns:
        A list of ``{"source": "<page uri>#<section id>", "text": ...}``
        dicts, one per ``<section>`` that has an ``id``. Sections without an
        id are skipped.
    """
    with open(record["path"], "r", encoding="utf-8") as html_file:
        soup = BeautifulSoup(html_file, "html.parser")
    # The page URI is the same for every section, so compute it once.
    uri = path_to_uri(path=record["path"])
    section_list = []
    for section in soup.find_all("section"):
        section_id = section.get("id")
        if not section_id:
            # No anchor to link to, so there is no point extracting its text.
            continue
        section_list.append({
            "source": f"{uri}#{section_id}",
            "text": extract_text_from_section(section),
        })
    return section_list

In [11]:
html_file_path = Path(EFS_DIR, "docs.ray.io/en/master/rllib/rllib-env.html")
extract_sections({"path": html_file_path})[0]

{'source': 'https://docs.ray.io/en/master/rllib/rllib-env.html#environments',
 'text': '\nEnvironments#\nRLlib works with several different types of environments, including Farama-Foundation Gymnasium, user-defined, multi-agent, and also batched environments.\nTip\nNot all environments work with all algorithms. Check out the algorithm overview for more information.\n'}

In [12]:
# Extract sections
sections_ds = ds.flat_map(extract_sections)
sections = sections_ds.take_all()
print (len(sections))

2023-09-04 12:33:41,071	INFO streaming_executor.py:92 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[FlatMap(extract_sections)]
2023-09-04 12:33:41,072	INFO streaming_executor.py:93 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
2023-09-04 12:33:41,074	INFO streaming_executor.py:95 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


Running 0:   0%|          | 0/200 [00:00<?, ?it/s]

5727


In [13]:
from langchain.document_loaders import ReadTheDocsLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [14]:
chunk_size = 300
chunk_overlap = 50
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
    length_function=len,
)

In [15]:
# Chunks
chunks = text_splitter.create_documents(
    texts=[section["text"] for section in sections], 
    metadatas=[{"source": section["source"]} for section in sections]
)

In [16]:
print (f"{len(chunks)} chunks\n")
pprint (chunks[0].page_content)
print (f"\nmetadata:\n{chunks[0].metadata}")

32276 chunks

('Reference#\n'
 'Monitor and debug your Ray applications and clusters using the API and CLI '
 'documented in these references.\n'
 'The guides include:\n'
 'State API\n'
 'State CLI\n'
 'System Metrics')

metadata:
{'source': 'https://docs.ray.io/en/master/ray-observability/reference/index.html#reference'}


In [17]:
# Ray dataset
chunks_ds = ray.data.from_items([{"text": chunk.page_content, "source": chunk.metadata["source"]} for chunk in chunks])
chunks_ds.show(1)

2023-09-04 12:34:39,845	INFO dataset.py:2180 -- Tip: Use `take_batch()` instead of `take() / show()` to return records in pandas or numpy batch format.


{'text': 'Reference#\nMonitor and debug your Ray applications and clusters using the API and CLI documented in these references.\nThe guides include:\nState API\nState CLI\nSystem Metrics', 'source': 'https://docs.ray.io/en/master/ray-observability/reference/index.html#reference'}


### Embed data

In [18]:
from langchain.embeddings import OpenAIEmbeddings
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
import numpy as np
from ray.data import ActorPoolStrategy

In [19]:
class EmbedChunks:
    """Callable for ``map_batches`` that adds an embedding to every text chunk."""

    def __init__(self, model_name):
        """Instantiate the embedding model identified by ``model_name``.

        ``"text-embedding-ada-002"`` is served through the OpenAI API (needs
        ``OPENAI_API_BASE`` / ``OPENAI_API_KEY`` in the environment); any other
        name is loaded as a Hugging Face model on the GPU.
        """
        if model_name == "text-embedding-ada-002":
            # Device and encode batch-size settings are local-model options;
            # they are not passed to the hosted OpenAI embedding endpoint.
            self.embedding_model = OpenAIEmbeddings(
                model=model_name,
                openai_api_base=os.environ["OPENAI_API_BASE"],
                openai_api_key=os.environ["OPENAI_API_KEY"])
        else:
            self.embedding_model = HuggingFaceEmbeddings(
                model_name=model_name,
                model_kwargs={"device": "cuda"},
                encode_kwargs={"device": "cuda", "batch_size": 100})

    def __call__(self, batch):
        """Embed ``batch["text"]`` and return the batch with an "embeddings" column."""
        embeddings = self.embedding_model.embed_documents(batch["text"])
        return {"text": batch["text"], "source": batch["source"], "embeddings": embeddings}

In [20]:
# Embed chunks
embedding_model_name = "thenlper/gte-base"
embedded_chunks = chunks_ds.map_batches(
    EmbedChunks,
    fn_constructor_kwargs={"model_name": embedding_model_name},
    batch_size=100, 
    num_gpus=1,
    compute=ActorPoolStrategy(size=2))

In [22]:
# Sample
sample = embedded_chunks.take(1)
print ("embedding size:", len(sample[0]["embeddings"]))
pprint(sample[0]["text"])

2023-09-04 12:36:36,645	INFO streaming_executor.py:92 -- Executing DAG InputDataBuffer[Input] -> ActorPoolMapOperator[MapBatches(EmbedChunks)]
2023-09-04 12:36:36,646	INFO streaming_executor.py:93 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
2023-09-04 12:36:36,647	INFO streaming_executor.py:95 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`
2023-09-04 12:36:36,663	INFO actor_pool_map_operator.py:117 -- MapBatches(EmbedChunks): Waiting for 2 pool actors to start...


Running 0:   0%|          | 0/200 [00:00<?, ?it/s]



embedding size: 768
('It is equivalent to PENDING_CREATION,\n'
 'but means the actor was dead more than once.\n'
 'DEAD: The actor is permanatly dead.')


### Index data

In [23]:
import psycopg
from pgvector.psycopg import register_vector

In [None]:
%%bash
# Set up pgvector
bash ../setup-pgvector.sh

In [162]:
%%bash
# Drop existing table if it exists
psql "$DB_CONNECTION_STRING" -c "DROP TABLE IF EXISTS document;"
sudo -u postgres psql -f ../migrations/vector-768.sql  # "thenlper/gte-base" dimension is 768

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
DROP TABLE
CREATE TABLE


If we have already created an index (and saved it), we can reload it:

In [163]:
%%bash
# Load index
# Quote the variable so the path survives word splitting and globbing.
export SQL_DUMP_FP="/efs/shared_storage/goku/sql_dumps/gte-base_300_50.sql"
echo "$SQL_DUMP_FP"
psql "$DB_CONNECTION_STRING" -f "$SQL_DUMP_FP"  # load

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
/efs/shared_storage/goku/sql_dumps/gte-base_300_50.sql
SET
SET
SET
SET
SET
 set_config 
------------
 
(1 row)

SET
SET
SET
SET
ALTER TABLE
ALTER TABLE
DROP SEQUENCE
DROP TABLE
DROP EXTENSION
CREATE EXTENSION
COMMENT
SET
SET
CREATE TABLE
ALTER TABLE
CREATE SEQUENCE
ALTER TABLE
ALTER SEQUENCE
ALTER TABLE
COPY 32276
 setval 
--------
  32276
(1 row)

ALTER TABLE


In [164]:
%%bash
psql "$DB_CONNECTION_STRING" -c "SELECT count(*) FROM document;"

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
 count 
-------
 32276
(1 row)



Otherwise, we can index the data and save it:

In [159]:
class StoreResults:
    """Callable for ``map_batches`` that writes embedded chunks to the ``document`` table."""

    def __call__(self, batch):
        """Insert every (text, source, embedding) row of ``batch``.

        A connection is opened from ``DB_CONNECTION_STRING`` for each batch.
        Returns an empty dict, so this stage produces no output rows.
        """
        with psycopg.connect(os.environ["DB_CONNECTION_STRING"]) as conn:
            register_vector(conn)
            with conn.cursor() as cur:
                # Send the whole batch in one call instead of one execute per row.
                cur.executemany(
                    "INSERT INTO document (text, source, embedding) VALUES (%s, %s, %s)",
                    list(zip(batch["text"], batch["source"], batch["embeddings"])),
                )
        return {}

In [160]:
# Index data
embedded_chunks.map_batches(
    StoreResults,
    batch_size=128,
    num_cpus=1,
    compute=ActorPoolStrategy(size=28),
).count()

2023-09-04 15:34:51,436	INFO streaming_executor.py:92 -- Executing DAG InputDataBuffer[Input] -> ActorPoolMapOperator[MapBatches(EmbedChunks)] -> ActorPoolMapOperator[MapBatches(StoreResults)]
2023-09-04 15:34:51,439	INFO streaming_executor.py:93 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
2023-09-04 15:34:51,440	INFO streaming_executor.py:95 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`
2023-09-04 15:34:51,465	INFO actor_pool_map_operator.py:117 -- MapBatches(EmbedChunks): Waiting for 2 pool actors to start...
2023-09-04 15:35:10,444	INFO actor_pool_map_operator.py:117 -- MapBatches(StoreResults): Waiting for 28 pool actors to start...


Running 0:   0%|          | 0/200 [00:00<?, ?it/s]



0

In [161]:
%%bash
# Save index
# Quote every expansion so the path survives word splitting and globbing.
export SQL_DUMP_FP="/efs/shared_storage/goku/sql_dumps/gte-base_300_50.sql"
mkdir -p "$(dirname "$SQL_DUMP_FP")" && touch "$SQL_DUMP_FP"
sudo -u postgres pg_dump -c > "$SQL_DUMP_FP"  # save

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


## Retrieval

In [165]:
import json
import numpy as np

In [166]:
# Embed query
embedding_model = HuggingFaceEmbeddings(model_name=embedding_model_name)
query = "What is the default batch size for map_batches?"
embedding = np.array(embedding_model.embed_query(query))
len(embedding)

768

In [179]:
# Get context
num_chunks = 5
with psycopg.connect(os.environ["DB_CONNECTION_STRING"]) as conn:
    register_vector(conn)
    with conn.cursor() as cur:
        # Select the needed columns by name: this does not depend on the
        # table's column order and avoids fetching the embedding vectors.
        cur.execute(
            "SELECT text, source FROM document ORDER BY embedding <-> %s LIMIT %s",
            (embedding, num_chunks),
        )
        rows = cur.fetchall()
context = [{"text": row[0]} for row in rows]
sources = [row[1] for row in rows]
for source, item in zip(sources, context):
    print(source)
    print(item["text"])
    print()

https://docs.ray.io/en/master/data/api/doc/ray.data.Dataset.map_batches.html#ray-data-dataset-map-batches
entire blocks as batches (blocks may contain different numbers of rows).
The actual size of the batch provided to fn may be smaller than
batch_size if batch_size doesn’t evenly divide the block(s) sent
to a given map task. Default batch_size is 4096 with “default”.

https://docs.ray.io/en/master/data/transforming-data.html#configuring-batch-size
batch_size.
Note
The default batch size depends on your resource type. If you’re using CPUs,
the default batch size is 4096. If you’re using GPUs, you must specify an explicit
batch size.

https://docs.ray.io/en/master/data/batch_inference.html#configuring-batch-size
# Specify that each input batch should be of size 2.
ds.map_batches(assert_batch, batch_size=2)
Caution
The default batch_size of 4096 may be too large for datasets with large rows
(for example, tables with many columns or a collection of large images).

https://docs.ray.io/en/

## Generation

In [172]:
import time

In [173]:
def generate_response(
    llm, temperature=0.0,
    system_content="", assistant_content="", user_content="",
    max_retries=3, retry_interval=60):
    """Generate response from an LLM.

    Args:
        llm: model name passed to the chat completion endpoint.
        temperature: sampling temperature.
        system_content, assistant_content, user_content: contents of the
            system, assistant and user messages, sent in that order.
        max_retries: maximum number of attempts.
        retry_interval: seconds to wait between failed attempts.

    Returns:
        The content of the last choice in the response, or "" when every
        attempt failed.
    """
    messages = [
        {"role": "system", "content": system_content},
        {"role": "assistant", "content": assistant_content},
        {"role": "user", "content": user_content},
    ]
    retry_count = 0
    while retry_count < max_retries:
        try:
            response = openai.ChatCompletion.create(
                model=llm,
                temperature=temperature,
                messages=messages,
            )
            return response["choices"][-1]["message"]["content"]
        except Exception as e:
            print(e)
            retry_count += 1
            # Only wait when another attempt will follow; sleeping after the
            # final failure just delays returning the empty result.
            if retry_count < max_retries:
                time.sleep(retry_interval)  # default is per-minute rate limits
    return ""

In [174]:
# Credentials
openai.api_base = os.environ["ANYSCALE_API_BASE"]
openai.api_key = os.environ["ANYSCALE_API_KEY"]

In [175]:
# Generate response
generate_response(
    llm="meta-llama/Llama-2-70b-chat-hf",
    temperature=0.0,
    system_content="Answer the query using the context provided.",
    user_content=f"query: {query}, context: {context}"
)

'The default batch size for map_batches is 4096. However, this may not always be the actual size of the batch provided to the function, as the batch size may need to be adjusted to fit the block size of the data being processed. The default batch size can be overridden by specifying a different value for the batch_size argument when calling map_batches. Note that the default batch size may vary depending on the resource type being used, with a default of 4096 for CPUs and a requirement for an explicit batch size specification when using GPUs.'

Let's combine the context retrieval and response generation together into a convenient query agent that we can use to easily generate our responses.

In [180]:
class QueryAgent:
    """Retrieve the chunks closest to a query and ask an LLM to answer with them."""

    def __init__(self, embedding_model_name="thenlper/gte-base",
                 llm="meta-llama/Llama-2-70b-chat-hf",
                 temperature=0.0, max_context_length=4096,
                 system_content="", assistant_content=""):
        """Set up the embedding model and the generation settings.

        Args:
            embedding_model_name: ``"text-embedding-ada-002"`` uses the OpenAI
                API; any other name is loaded as a Hugging Face model on the GPU.
            llm: model name passed to ``generate_response``.
            temperature: sampling temperature.
            max_context_length: total budget from which the system and
                assistant prompt lengths are subtracted.
            system_content, assistant_content: fixed prompt parts sent with
                every query.
        """
        # Embedding model
        if embedding_model_name == "text-embedding-ada-002":
            # Device and encode batch-size settings are local-model options;
            # they are not passed to the hosted OpenAI embedding endpoint.
            self.embedding_model = OpenAIEmbeddings(
                model=embedding_model_name,
                openai_api_base=os.environ["OPENAI_API_BASE"],
                openai_api_key=os.environ["OPENAI_API_KEY"])
        else:
            self.embedding_model = HuggingFaceEmbeddings(
                model_name=embedding_model_name,
                model_kwargs={"device": "cuda"},
                encode_kwargs={"device": "cuda", "batch_size": 100})

        # LLM
        self.llm = llm
        self.temperature = temperature
        # NOTE(review): this budget is counted in characters (len), while the
        # max_context_length values used in this notebook look like token
        # limits — confirm the intended unit.
        self.context_length = max_context_length - len(system_content + assistant_content)
        self.system_content = system_content
        self.assistant_content = assistant_content

    def __call__(self, query, num_chunks=5):
        """Answer ``query`` using the ``num_chunks`` nearest chunks as context.

        Returns:
            A dict with the "question", the "sources" of the retrieved chunks
            and the generated "answer".
        """
        # Get context
        embedding = np.array(self.embedding_model.embed_query(query))
        with psycopg.connect(os.environ["DB_CONNECTION_STRING"]) as conn:
            register_vector(conn)
            with conn.cursor() as cur:
                # Select the needed columns by name: this does not depend on
                # the table's column order and skips the embedding vectors.
                cur.execute(
                    "SELECT text, source FROM document ORDER BY embedding <-> %s LIMIT %s",
                    (embedding, num_chunks),
                )
                rows = cur.fetchall()
        context = [{"text": row[0]} for row in rows]
        sources = [row[1] for row in rows]

        # Generate response
        user_content = f"query: {query}, context: {context}"
        answer = generate_response(
            llm=self.llm,
            temperature=self.temperature,
            system_content=self.system_content,
            assistant_content=self.assistant_content,
            user_content=user_content[: self.context_length],
        )

        # Result
        result = {
            "question": query,
            "sources": sources,
            "answer": answer,
        }
        return result

In [181]:
query = "What is the default batch size for map_batches?"
system_content = "Answer the query using the context provided."
agent = QueryAgent(
    embedding_model_name="thenlper/gte-base",
    llm="meta-llama/Llama-2-7b-chat-hf",
    max_context_length=4096,
    system_content=system_content,
)
result = agent(query=query)
print(json.dumps(result, indent=2))

{
  "question": "What is the default batch size for map_batches?",
  "sources": [
    "https://docs.ray.io/en/master/data/api/doc/ray.data.Dataset.map_batches.html#ray-data-dataset-map-batches",
    "https://docs.ray.io/en/master/data/transforming-data.html#configuring-batch-size",
    "https://docs.ray.io/en/master/data/batch_inference.html#configuring-batch-size",
    "https://docs.ray.io/en/master/data/batch_inference.html#configuring-batch-size",
    "https://docs.ray.io/en/master/tune/getting-started.html#setting-up-a-tuner-for-a-training-run-with-tune"
  ],
  "answer": "Based on the provided context, the default batch size for `map_batches` is 4096. However, it's important to note that the default batch size may vary depending on the resource type being used. If using CPUs, the default batch size is 4096, while if using GPUs, an explicit batch size must be specified. Additionally, it's recommended to use a smaller batch size for datasets with large rows, such as tables with many 

## Datasets

In [182]:
openai.api_base = os.environ["OPENAI_API_BASE"]
openai.api_key = os.environ["OPENAI_API_KEY"]

### Synthetic

In [183]:
num_questions = 3
system_content = f"""
Create {num_questions} questions using only the context provided.
End each question with a '?' character and then in a newline write the answer to that question using only the context provided.
Separate each question/answer pair by a newline.
"""

In [184]:
# Generate questions
synthetic_data = []
for chunk in chunks[:3]:  # small samples
    response = generate_response(
        llm="gpt-4",
        temperature=0.0,
        system_content=system_content,
        user_content=f"context: {chunk.page_content}"
    )
    # Each pair is expected as "<question>\n<answer>", pairs separated by a blank line.
    for entry in response.split("\n\n"):
        question, sep, answer = entry.strip().partition("\n")
        question, answer = question.strip(), answer.strip()
        if not (question and answer):
            # Empty response (e.g. all retries failed) or a malformed pair:
            # skip it instead of raising on tuple unpacking.
            continue
        synthetic_data.append({"question": question, "source": chunk.metadata["source"], "answer": answer})

In [185]:
synthetic_data[:3]

[{'question': 'What can you use to monitor and debug your Ray applications and clusters?',
  'source': 'https://docs.ray.io/en/master/ray-observability/reference/index.html#reference',
  'answer': 'You can use the API and CLI documented in the references to monitor and debug your Ray applications and clusters.'},
 {'question': 'What are the guides included in the references?',
  'source': 'https://docs.ray.io/en/master/ray-observability/reference/index.html#reference',
  'answer': 'The guides included in the references are State API, State CLI, and System Metrics.'},
 {'question': 'What are the three guides mentioned in the context?',
  'source': 'https://docs.ray.io/en/master/ray-observability/reference/index.html#reference',
  'answer': 'The three guides mentioned in the context are State API, State CLI, and System Metrics.'}]

### Manual

Now we'll manually create our reference (ground-truth) dataset. We have a list of user queries and the ideal source to answer the query [`datasets/eval-dataset-v1.jsonl`](https://github.com/ray-project/llm-applications/blob/main/datasets/eval-dataset-v1.jsonl). We will use our LLM app above to generate a reference answer for each query/source pair using `gpt-4`.

In [253]:
import re
import urllib.parse
from bs4 import BeautifulSoup
from IPython.display import clear_output, display, JSON

In [254]:
# If running tests / small samples, set num_samples to <10
# None = all samples
num_samples = 10

In [255]:
# The dataset contains non-ASCII characters, so read it as UTF-8 explicitly
# (like the other file reads in this notebook) and ignore blank lines.
with open(Path(ROOT_DIR, "datasets/eval-dataset-v1.jsonl"), "r", encoding="utf-8") as f:
    data = [json.loads(line) for line in f if line.strip()]

In [256]:
data[:5]

[{'question': 'I’m struggling a bit with Ray Data type conversions when I do map_batches. Any advice?',
  'source': 'https://docs.ray.io/en/master/data/transforming-data.html#configuring-batch-format'},
 {'question': 'How does autoscaling work in a Ray Serve application?',
  'source': 'https://docs.ray.io/en/master/serve/scaling-and-resource-allocation.html#autoscaling'},
 {'question': 'how do I get the address of a ray node',
  'source': 'https://docs.ray.io/en/master/ray-core/miscellaneous.html#node-information'},
 {'question': 'Does Ray support NCCL?',
  'source': 'https://docs.ray.io/en/master/ray-more-libs/ray-collective.html'},
 {'question': 'Is Ray integrated with DeepSpeed?',
  'source': 'https://docs.ray.io/en/master/ray-air/examples/gptj_deepspeed_fine_tuning.html#fine-tuning-the-model-with-ray-air-a-name-train-a'}]

In [257]:
def fetch_text(uri):
    """Return the text of a locally stored docs page, or of one anchor in it.

    Args:
        uri: page URL, optionally followed by ``#<element id>``. The part
            after ``https://`` is resolved relative to ``EFS_DIR``.

    Returns:
        The text of the element whose id matches the anchor; the text of the
        whole page when there is no anchor or no element with that id.
    """
    # partition() splits on the first '#' only, so a URI with several '#'
    # characters no longer fails on tuple unpacking.
    url, _, anchor = uri.partition("#")
    file_path = Path(EFS_DIR, url.split("https://")[-1])
    with open(file_path, "r", encoding="utf-8") as file:
        soup = BeautifulSoup(file.read(), "html.parser")
    if anchor:
        target_element = soup.find(id=anchor)
        if target_element:
            return target_element.get_text()
    # No anchor, or anchor not found: fall back to the page text using the
    # already-parsed document instead of reading and parsing the file again.
    return soup.get_text()

In [258]:
# Sample
uri = "https://docs.ray.io/en/master/data/transforming-data.html#configuring-batch-format"
fetch_text(uri=uri)

'\nConfiguring batch format#\nRay Data represents batches as dicts of NumPy ndarrays or pandas DataFrames. By\ndefault, Ray Data represents batches as dicts of NumPy ndarrays.\nTo configure the batch type, specify batch_format in\nmap_batches(). You can return either format from your function.\n\n\n\nNumPy\nfrom typing import Dict\nimport numpy as np\nimport ray\n\ndef increase_brightness(batch: Dict[str, np.ndarray]) -> Dict[str, np.ndarray]:\n    batch["image"] = np.clip(batch["image"] + 4, 0, 255)\n    return batch\n\nds = (\n    ray.data.read_images("s3://anonymous@ray-example-data/image-datasets/simple")\n    .map_batches(increase_brightness, batch_format="numpy")\n)\n\n\n\n\n\npandas\nimport pandas as pd\nimport ray\n\ndef drop_nas(batch: pd.DataFrame) -> pd.DataFrame:\n    return batch.dropna()\n\nds = (\n    ray.data.read_csv("s3://anonymous@air-example-data/iris.csv")\n    .map_batches(drop_nas, batch_format="pandas")\n)\n\n\n\n\n'

In [259]:
# Content for inference
# System prompt asking the model for an answer plus a self-assigned 1-5 score
# and its reasoning, returned as a JSON object with "answer", "score" and
# "reasoning" keys (the keys extract_from_response looks for).
# NOTE(review): the text opens with a stray double quote before "Answer", and
# `{score}` / `{reasoning}` are sent literally because this is a plain string
# and no .format() call on it is visible in this notebook — confirm whether
# that is intended.
system_content = """
    "Answer the query using the context provided.
    Then, you must {score} your response between 1 and 5.
    You must return your response in a line with only the score.
    Do not add any more details.
    On a separate line provide your {reasoning} for the score as well.
    Return your response following the exact format outlined below.
    Do not add or remove anything.
    And all of this must be in a valid JSON format.
    
    {"answer": answer,
     "score": score,
     "reasoning": reasoning}
    """
# No assistant message is used for reference generation.
assistant_content = ""

In [260]:
def extract_from_response(response):
    """Pull the answer, score and reasoning out of an LLM response.

    The response is expected to contain a JSON object with "answer", "score"
    and "reasoning" keys. It is parsed as JSON first; if that fails, a regex
    fallback is used so that slightly malformed output can still be read.

    Returns:
        ``(answer, score, reasoning)`` with ``score`` as a float, or
        ``("", "", "")`` when the three values cannot all be found.
    """
    # Preferred path: parse the outermost {...} span as JSON. This handles
    # escaped quotes, escape sequences and decimal or quoted scores correctly.
    try:
        payload = json.loads(response[response.index("{"): response.rindex("}") + 1])
    except (AttributeError, TypeError, ValueError):
        payload = None
    if isinstance(payload, dict):
        answer, reasoning = payload.get("answer"), payload.get("reasoning")
        if isinstance(answer, str) and isinstance(reasoning, str):
            try:
                return answer, float(payload.get("score")), reasoning
            except (TypeError, ValueError):
                pass  # unusable score; fall through to the regex fallback

    # Fallback: regular expressions. String values may contain escaped
    # characters (e.g. \"), and the score may be quoted or have decimals.
    string_value = r'"((?:[^"\\]|\\.)*)"'
    answer_match = re.search(r'"answer"\s*:\s*' + string_value, response, re.DOTALL)
    score_match = re.search(r'"score"\s*:\s*"?([0-9]+(?:\.[0-9]+)?)', response)
    reasoning_match = re.search(r'"reasoning"\s*:\s*' + string_value, response, re.DOTALL)

    # Convert
    if answer_match and score_match and reasoning_match:
        return answer_match.group(1), float(score_match.group(1)), reasoning_match.group(1)

    return "", "", ""

In [279]:
def get_references(data, llm, temperature, max_context_length, system_content, assistant_content, num_samples=None):
    """Generate a reference answer, score and reasoning for each question/source pair.

    For the first ``num_samples`` rows of ``data`` (all rows when ``None``),
    the text behind the row's source is fetched, the LLM is queried with it,
    and the parsed result is collected. Each result is also displayed as it is
    produced, replacing the previous one.

    Returns:
        A list of dicts with "question", "source", "answer", "score" and
        "reasoning" keys.
    """
    # Length left for the user message once the fixed prompts are accounted for.
    budget = max_context_length - len(system_content + assistant_content)
    references = []
    for example in tqdm(data[:num_samples]):
        # Get context
        question = example["question"]
        page_text = fetch_text(uri=example["source"])

        # Generate response
        prompt = f"The query is {question} and the additional context is {page_text}"[:budget]
        raw_response = generate_response(
            llm=llm,
            temperature=temperature,
            system_content=system_content,
            assistant_content=assistant_content,
            user_content=prompt)

        # Extract from response
        answer, score, reasoning = extract_from_response(response=raw_response)

        # Store and show the result
        record = {
            "question": question,
            "source": example["source"],
            "answer": answer,
            "score": score,
            "reasoning": reasoning,
        }
        references.append(record)
        clear_output(wait=True)
        display(JSON(json.dumps(record, indent=2)))
    return references

Let's generate reference responses with `gpt-4` first:

In [280]:
# GPT-4
openai.api_base = os.environ["OPENAI_API_BASE"]
openai.api_key = os.environ["OPENAI_API_KEY"]
results = get_references(
    data=data, llm="gpt-4", temperature=0.0, max_context_length=8192, 
    system_content=system_content, assistant_content=assistant_content,
    num_samples=num_samples)
print (np.mean([float(result["score"]) for result in results if result["score"]]))

<IPython.core.display.JSON object>

100%|██████████| 10/10 [01:47<00:00, 10.76s/it]

4.8





In [281]:
# Save to file
# EXPERIMENTS_DIR is already an absolute path under ROOT_DIR, so it is used directly.
references_fp = EXPERIMENTS_DIR / "references" / "gpt-4.json"
references_fp.parent.mkdir(parents=True, exist_ok=True)
with references_fp.open("w") as fp:
    json.dump(results, fp, indent=4)

Let's generate reference responses with `Llama-2-70b` as well:

In [282]:
# Llama-2-70b
openai.api_base = os.environ["ANYSCALE_API_BASE"]
openai.api_key = os.environ["ANYSCALE_API_KEY"]
results = get_references(
    data=data, llm="meta-llama/Llama-2-70b-chat-hf", temperature=0.0, max_context_length=4096, 
    system_content=system_content, assistant_content=assistant_content,
    num_samples=num_samples)
print (np.mean([float(result["score"]) for result in results if result["score"]]))

<IPython.core.display.JSON object>

100%|██████████| 10/10 [01:11<00:00,  7.15s/it]

4.888888888888889





In [283]:
# Save to file
# EXPERIMENTS_DIR is already an absolute path under ROOT_DIR, so it is used directly.
references_fp = EXPERIMENTS_DIR / "references" / "llama-2-70b.json"
references_fp.parent.mkdir(parents=True, exist_ok=True)
with references_fp.open("w") as fp:
    json.dump(results, fp, indent=4)

## Evaluator

Now that we've seen the answers, scores and reasoning for our references dataset from both `gpt-4` and `Llama-2-70b`, we can use these responses to decide on a quality evaluator for our future experiments. This evaluator will be used to score answers for different experiment configurations, so we need to be able to trust its scores, reasoning, etc. After inspecting Llama2 evaluating Llama2's answers, it is definitely not a good evaluator. For most answers the reasoning is not good, and the score is pretty random with lots of 4s. Therefore, our evaluator will be `gpt-4`.

In [297]:
EVALUATOR = "gpt-4"

## Experiments

We're going to start experimenting with the various components in our LLM application such as our evaluator, context, sections, chunking size, number of chunks in our context, embedding models, OSS/closed LLMs and more!

### Utilities

Before we get started with our experiments, we're going to define some utility functions that we'll use to easily generate and evaluate responses using the different experiment configurations. We'll also define some functions to help determine our response quality score, retrieval recall score, etc.

In [298]:
import subprocess

In [299]:
# Paths
# String forms of the evaluation dataset and the gpt-4 reference answers.
DATA_PATH = str(ROOT_DIR / "datasets" / "eval-dataset-v1.jsonl")
REFERENCE_LOC = str(EXPERIMENTS_DIR / "references" / "gpt-4.json")

In [300]:
# Mappings
# Embedding model name -> vector dimension. set_index uses this value to pick
# the matching ../migrations/vector-<dim>.sql file.
EMBEDDING_DIMENSIONS = {
    "thenlper/gte-base": 768,
    "BAAI/bge-large-en": 1024,
    "text-embedding-ada-002": 1536
}
# LLM name -> maximum context length.
# NOTE(review): these look like token limits, while the context budgets in
# this notebook are computed with len() on strings (characters) — confirm
# the intended unit.
MAX_CONTEXT_LENGTHS = {
    "gpt-4": 8192,
    "gpt-3.5-turbo": 4096,
    "gpt-3.5-turbo-16k": 16384,
    "meta-llama/Llama-2-7b-chat-hf": 4096,
    "meta-llama/Llama-2-13b-chat-hf": 4096,
    "meta-llama/Llama-2-70b-chat-hf": 4096,
}

In [301]:
def execute_bash(command):
    """Run ``command`` through the shell and return the completed process.

    stdout and stderr are captured as text on the returned object; a non-zero
    exit status does not raise.
    """
    return subprocess.run(command, shell=True, capture_output=True, text=True)

In [302]:
def set_index(sections, embedding_model_name, chunk_size, chunk_overlap):
    """Rebuild the `document` vector table for one indexing configuration.

    The table is dropped and recreated from the migration matching the
    embedding dimension. If a SQL dump for this (model, chunk_size,
    chunk_overlap) combination already exists under EFS_DIR/sql_dumps it is
    loaded; otherwise the sections are chunked, embedded, stored, and a new
    dump is written for later reuse.

    Args:
        sections: sequence of dicts with "text" and "source" keys.
        embedding_model_name: key of EMBEDDING_DIMENSIONS; also passed to EmbedChunks.
        chunk_size: maximum chunk length, measured with `len` (characters).
        chunk_overlap: overlap between consecutive chunks, same unit.

    Raises:
        KeyError: if DB_CONNECTION_STRING is unset or the model name is unknown.
    """
    db_connection = os.environ["DB_CONNECTION_STRING"]

    # Drop current Vector DB and prepare for new one.
    # IF EXISTS keeps the very first run (no table yet) from erroring.
    execute_bash(f'psql "{db_connection}" -c "DROP TABLE IF EXISTS document;"')
    execute_bash(f'sudo -u postgres psql -f ../migrations/vector-{EMBEDDING_DIMENSIONS[embedding_model_name]}.sql')
    SQL_DUMP_FP = Path(EFS_DIR, "sql_dumps", f"{embedding_model_name.split('/')[-1]}_{chunk_size}_{chunk_overlap}.sql")

    # Vector DB
    if SQL_DUMP_FP.exists():  # Load from SQL dump
        execute_bash(f'psql "{db_connection}" -f {SQL_DUMP_FP}')
    else:  # Create new index
        # Create chunks dataset
        text_splitter = RecursiveCharacterTextSplitter(
            separators=["\n\n", "\n", " ", ""],
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
        )
        chunks = text_splitter.create_documents(
            texts=[section["text"] for section in sections],
            metadatas=[{"source": section["source"]} for section in sections]
        )
        chunks_ds = ray.data.from_items([{"text": chunk.page_content, "source": chunk.metadata["source"]} for chunk in chunks])

        # Embed chunks
        embedded_chunks = chunks_ds.map_batches(
            EmbedChunks,
            fn_constructor_kwargs={"model_name": embedding_model_name},
            batch_size=100,
            num_gpus=1,
            compute=ActorPoolStrategy(size=2))

        # Index data (.count() forces the lazy pipeline to actually run)
        embedded_chunks.map_batches(
            StoreResults,
            batch_size=128,
            num_cpus=1,
            compute=ActorPoolStrategy(size=28),
        ).count()

        # Save to SQL dump. The shell redirect needs the directory to exist,
        # otherwise the dump is silently never written.
        SQL_DUMP_FP.parent.mkdir(parents=True, exist_ok=True)
        dump = execute_bash(f"sudo -u postgres pg_dump -c > {SQL_DUMP_FP}")
        if dump.returncode != 0 and SQL_DUMP_FP.exists():
            # The redirect creates the file even when pg_dump fails; remove it
            # so a truncated dump is not loaded as a valid cache next time.
            SQL_DUMP_FP.unlink()

In [303]:
def set_credentials(llm):
    """Point the global `openai` client at the provider serving `llm`.

    Model names starting with "gpt" use the OPENAI_* environment variables;
    every other model uses the ANYSCALE_* ones.

    Raises:
        KeyError: if the required environment variable is not set.
    """
    if llm.startswith("gpt"):
        base_var, key_var = "OPENAI_API_BASE", "OPENAI_API_KEY"
    else:
        base_var, key_var = "ANYSCALE_API_BASE", "ANYSCALE_API_KEY"
    openai.api_base = os.environ[base_var]
    openai.api_key = os.environ[key_var]

In [304]:
# Generate responses
def generate_responses(
    experiment_name,
    data_path,
    sections,
    chunk_size,
    chunk_overlap,
    num_chunks,
    embedding_model_name,
    llm,
    temperature,
    max_context_length,
    system_content,
    assistant_content="",
    num_samples=None,
):
    """Answer the evaluation questions with one configuration and save the results.

    Sets credentials for `llm`, rebuilds the vector index, runs a QueryAgent on
    the first `num_samples` questions from the JSON Lines file at `data_path`
    (all of them when `num_samples` is None), and writes
    {"config": ..., "results": ...} to
    EXPERIMENTS_DIR/responses/<experiment_name>.json.
    """
    # Credentials first, then the index this configuration needs
    set_credentials(llm=llm)
    set_index(
        sections=sections,
        embedding_model_name=embedding_model_name,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )

    # Query agent
    agent = QueryAgent(
        embedding_model_name=embedding_model_name,
        llm=llm,
        temperature=temperature,
        max_context_length=max_context_length,
        system_content=system_content,
        assistant_content=assistant_content,
    )

    # Every line is parsed before truncating to num_samples
    with open(Path(data_path), "r") as f:
        questions = [json.loads(line)["question"] for line in f][:num_samples]

    # One agent call per question; show only the most recent answer
    results = []
    for query in tqdm(questions):
        answer = agent(query=query, num_chunks=num_chunks)
        results.append(answer)
        clear_output(wait=True)
        display(JSON(json.dumps(answer, indent=2)))

    # Save to file
    responses_fp = Path(ROOT_DIR, EXPERIMENTS_DIR, "responses", f"{experiment_name}.json")
    responses_fp.parent.mkdir(parents=True, exist_ok=True)
    payload = {
        "config": dict(
            experiment_name=experiment_name,
            data_path=data_path,
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            num_chunks=num_chunks,
            embedding_model_name=embedding_model_name,
            llm=llm,
            temperature=temperature,
            max_context_length=max_context_length,
            system_content=system_content,
            assistant_content=assistant_content,
        ),
        "results": results,
    }
    with open(responses_fp, "w") as fp:
        json.dump(payload, fp, indent=4)

In [305]:
def get_retrieval_score(references, generated):
    """Fraction of questions whose reference page is among the retrieved sources.

    Sources are compared after dropping everything from the first "#" onward,
    so a hit on any section of the reference page counts. A reference with an
    empty source counts as a match.

    Args:
        references: sequence of dicts with a "source" string.
        generated: sequence of dicts with a "sources" list of strings, aligned
            by position with `references`.

    Returns:
        Mean of the per-question 0/1 matches (numpy float).
    """
    matches = np.zeros(len(references))
    for i, reference in enumerate(references):
        reference_source = reference["source"].split("#")[0]
        # any() stops at the first matching source; sections don't have to
        # match exactly, only the page part before "#".
        if not reference_source or any(
            reference_source == source.split("#")[0]
            for source in generated[i]["sources"]
        ):
            matches[i] = 1
    return np.mean(matches)

In [306]:
def evaluate_responses(
    experiment_name, reference_loc, response_loc,
    evaluator, temperature, max_context_length,
    system_content, assistant_content="",
    num_samples=None):
    """Score generated answers against reference answers with an evaluator LLM.

    Args:
        experiment_name: used in the output file name.
        reference_loc: JSON file holding a list of items with "question",
            "answer" and "source".
        response_loc: JSON file holding {"results": [...]} where each item has
            "question", "answer" and "sources".
        evaluator: name of the LLM used for scoring.
        temperature: sampling temperature passed to the evaluator.
        max_context_length: budget used to truncate the user content.
        system_content, assistant_content: prompt parts passed to the evaluator.
        num_samples: evaluate only the first N items (None = all).

    Writes {"config", "retrieval_score", "quality_score", "results"} to
    EXPERIMENTS_DIR/evaluations/<experiment_name>_<evaluator name>.json.

    Raises:
        AssertionError: if the two files differ in length or question order.
        ValueError: if the first line of an evaluator reply is not a number.
    """
    # Set credentials
    set_credentials(llm=evaluator)

    # Load answers
    with open(Path(reference_loc), "r") as f:
        references = [item for item in json.load(f)][:num_samples]
    with open(Path(response_loc), "r") as f:
        generated = [item for item in json.load(f)["results"]][:num_samples]
    assert len(references) == len(generated)

    # Quality score
    results = []
    # NOTE(review): this budget is applied with len() / slicing on strings,
    # i.e. in characters, not tokens; confirm that is intended.
    context_length = max_context_length - len(system_content + assistant_content)
    for ref, gen in tqdm(zip(references, generated), total=len(references)):
        assert ref["question"] == gen["question"]
        user_content = str(
            {
                "question": gen["question"],
                "generated_answer": gen["answer"],
                "reference_answer": ref["answer"],
            }
        )[:context_length]

        # Generate response
        response = generate_response(
            llm=evaluator,
            temperature=temperature,
            system_content=system_content,
            assistant_content=assistant_content,
            user_content=user_content,
        )

        # Extract from response: first line is the score, the rest is the
        # reasoning. partition() tolerates a reply with no reasoning line,
        # where split()-unpacking would raise.
        score, _, reasoning = response.partition("\n")

        # Store result
        result = {
            "question": gen["question"],
            "generated_answer": gen["answer"],
            "reference_answer": ref["answer"],
            "score": float(score),
            "reasoning": reasoning.lstrip("\n"),
            "sources": gen["sources"],
        }
        results.append(result)
        clear_output(wait=True)
        display(JSON(json.dumps(result, indent=2)))

    # Save to file
    evaluator_name = evaluator.split("/")[-1].lower()
    evaluation_fp = Path(ROOT_DIR, EXPERIMENTS_DIR, "evaluations", f"{experiment_name}_{evaluator_name}.json")
    evaluation_fp.parent.mkdir(parents=True, exist_ok=True)
    config = {
        "experiment_name": experiment_name,
        "reference_loc": reference_loc,
        "response_loc": response_loc,
        "evaluator": evaluator,
        "temperature": temperature,
        "max_context_length": max_context_length,
        "system_content": system_content,
        "assistant_content": assistant_content,
    }
    evaluation = {
        "config": config,
        "retrieval_score": get_retrieval_score(references, generated),
        # Items with a falsy score or an empty reference answer are excluded from the mean.
        "quality_score": np.mean([item["score"] for item in results if (item["score"] and item["reference_answer"])]),
        "results": results,
    }
    with open(evaluation_fp, "w") as fp:
        json.dump(evaluation, fp, indent=4)

In [307]:
def run_experiment(
    experiment_name, data_path, sections,
    chunk_size, chunk_overlap, num_chunks,
    embedding_model_name, llm,
    reference_loc, evaluator,
    num_samples=None):
    """Generate responses and evaluate them.

    Runs generate_responses with temperature 0.0 and the context length looked
    up in MAX_CONTEXT_LENGTHS for `llm`, then scores the saved responses with
    `evaluator` via evaluate_responses.

    Args:
        experiment_name: base name of the response and evaluation files.
        data_path: JSON Lines file with the questions.
        sections: documents to index (passed to generate_responses).
        chunk_size, chunk_overlap, num_chunks: indexing / retrieval settings.
        embedding_model_name: embedding model used for the index.
        llm: model that generates the answers (key of MAX_CONTEXT_LENGTHS).
        reference_loc: JSON file with the reference answers.
        evaluator: model that scores the answers (key of MAX_CONTEXT_LENGTHS).
        num_samples: limit on the number of questions (None = all).
    """

    # Generate responses
    generate_responses(
        experiment_name=experiment_name,
        data_path=data_path,
        sections=sections,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        num_chunks=num_chunks,
        embedding_model_name=embedding_model_name,
        llm=llm,
        temperature=0.0,
        max_context_length=MAX_CONTEXT_LENGTHS[llm],
        system_content="Answer the query using the context provided.",
        num_samples=num_samples)

    # Evaluate responses
    # This is a plain string, not an f-string: the {generated_answer}, {query}
    # and {reference_answer} markers are passed on to the evaluator as written.
    evaluation_system_content = """
        Your job is to rate the quality of our generated answer {generated_answer}
        given a query {query} and a reference answer {reference_answer}.
        Your score has to be between 1 and 5.
        You must return your response in a line with only the score.
        Do not return answers in any other format.
        On a separate line provide your reasoning for the score as well.
        """
    # Honor the `evaluator` argument rather than the module-level EVALUATOR.
    evaluate_responses(
        experiment_name=experiment_name,
        reference_loc=reference_loc,
        response_loc=str(Path(ROOT_DIR, EXPERIMENTS_DIR, "responses", f"{experiment_name}.json")),
        evaluator=evaluator,
        temperature=0.0,
        max_context_length=MAX_CONTEXT_LENGTHS[evaluator],
        system_content=evaluation_system_content,
        num_samples=num_samples)

In [308]:
def print_experiment(experiment_name, evaluator=EVALUATOR):
    """Print the saved retrieval and quality scores of one experiment.

    Args:
        experiment_name: name the experiment was run under.
        evaluator: evaluator model whose evaluation file should be read.

    Raises:
        FileNotFoundError: if the evaluation file does not exist.
    """
    # Build the file name the same way evaluate_responses writes it
    # (last path component of the model name, lower-cased).
    evaluator_name = evaluator.split("/")[-1].lower()
    eval_fp = Path(ROOT_DIR, EXPERIMENTS_DIR, "evaluations", f"{experiment_name}_{evaluator_name}.json")
    with open(eval_fp, "r") as fp:
        d = json.load(fp)
    print (experiment_name)
    print ("  retrieval score:", d["retrieval_score"])
    print ("  quality score:", d["quality_score"])
    print ()

In [309]:
# LLM used by the context / chunking / embedding experiments below.
# Note: the later `for llm in llms` loops rebind this same name.
llm = "meta-llama/Llama-2-70b-chat-hf"

### Context

We're first going to test if the additional context we provide is helpful at all. This is to validate that the RAG system is indeed worth the effort.

In [310]:
# Without context
# Baseline run: num_chunks=0 is passed through to the query agent.
num_chunks = 0
experiment_name = "without-context"
run_experiment(
    experiment_name=experiment_name,
    data_path=DATA_PATH,
    sections=sections,
    chunk_size=100,
    chunk_overlap=50,
    num_chunks=num_chunks,
    embedding_model_name="thenlper/gte-base",
    llm=llm,
    reference_loc=REFERENCE_LOC,
    evaluator=EVALUATOR,
    num_samples=num_samples)

<IPython.core.display.JSON object>

100%|██████████| 10/10 [01:05<00:00,  6.56s/it]


In [311]:
# Show the scores saved for the experiment that was just run.
print_experiment(experiment_name=experiment_name)

without-context
  retrieval score: 0.0
  quality score: 2.45



In [312]:
# With context
# Same model and embedding as the baseline, but requesting 5 chunks of size 300.
num_chunks = 5
experiment_name = "with-context"
run_experiment(
    experiment_name=experiment_name, 
    data_path=DATA_PATH,
    sections=sections,
    chunk_size=300, 
    chunk_overlap=50, 
    num_chunks=num_chunks,
    embedding_model_name="thenlper/gte-base",
    llm=llm,
    reference_loc=REFERENCE_LOC,
    evaluator=EVALUATOR,
    num_samples=num_samples)

<IPython.core.display.JSON object>

100%|██████████| 10/10 [00:57<00:00,  5.80s/it]


In [313]:
# Show the scores saved for the experiment that was just run.
print_experiment(experiment_name=experiment_name)

with-context
  retrieval score: 1.0
  quality score: 4.1



As we can see, **using context (RAG)** does indeed help in the quality of our answers!

### Chunk size

In [314]:
# Chunk sizes to compare (the splitter measures length with `len`, i.e. characters).
chunk_sizes = [100, 300, 500, 700]

In [315]:
# Vary only the chunk size; overlap (50), number of chunks (5), embedding
# model and LLM stay fixed.
for chunk_size in chunk_sizes:
    experiment_name = f"chunk-size-{chunk_size}"
    run_experiment(
        experiment_name=experiment_name, 
        data_path=DATA_PATH,
        sections=sections,
        chunk_size=chunk_size, 
        chunk_overlap=50, 
        num_chunks=5,
        embedding_model_name="thenlper/gte-base",
        llm=llm,
        reference_loc=REFERENCE_LOC,
        evaluator=EVALUATOR,
        num_samples=num_samples)

<IPython.core.display.JSON object>

100%|██████████| 10/10 [00:56<00:00,  5.65s/it]


In [316]:
# Print the saved scores for each chunk-size experiment.
for chunk_size in chunk_sizes:
    experiment_name = f"chunk-size-{chunk_size}"
    print_experiment(experiment_name=experiment_name)

chunk-size-100
  retrieval score: 0.7
  quality score: 3.2

chunk-size-300
  retrieval score: 1.0
  quality score: 4.05

chunk-size-500
  retrieval score: 1.0
  quality score: 4.5

chunk-size-700
  retrieval score: 0.8
  quality score: 4.2



It seems that a larger chunk size does help but it tapers off around the 600 character mark (too much context might be too noisy).

**Note**: If we were to use larger chunk sizes (ours is based on characters), keep in mind that [most](https://huggingface.co/spaces/mteb/leaderboard) open source embedding models have a maximum sequence length of 512 sub-word tokens. This means that if our chunk contains more than 512 sub-word tokens, the embedding wouldn't account for it anyway (unless we finetune our embedding model to have longer sequence lengths).

In [317]:
# Chunking settings used by the remaining experiments
# (chunk-size-500 had the highest quality score in the run above).
CHUNK_SIZE = 500
CHUNK_OVERLAP = 50

### Number of chunks

**Note**: Keep in mind that the `chunk_size` you choose multiplied by the `num_chunks` below must fit inside the LLM's context length. We're experimenting with the chunk size and number of chunks as if they were independent variables but they are heavily related, especially since all of our LLMs have a finite maximum context length. So ideally, we would tune for a combination of `chunk_size` * `num_chunks`.

In [318]:
# Numbers of retrieved chunks to compare.
num_chunks_list = [1, 3, 5, 7]

In [319]:
# Vary only the number of retrieved chunks; chunking settings, embedding model
# and LLM stay fixed.
for num_chunks in num_chunks_list:
    experiment_name = f"num-chunks-{num_chunks}"
    run_experiment(
        experiment_name=experiment_name, 
        data_path=DATA_PATH,
        sections=sections,
        chunk_size=CHUNK_SIZE, 
        chunk_overlap=CHUNK_OVERLAP, 
        num_chunks=num_chunks,
        embedding_model_name="thenlper/gte-base",
        llm=llm,
        reference_loc=REFERENCE_LOC,
        evaluator=EVALUATOR,
        num_samples=num_samples)

<IPython.core.display.JSON object>

100%|██████████| 10/10 [00:51<00:00,  5.13s/it]


In [320]:
# Print the saved scores for each num-chunks experiment.
for num_chunks in num_chunks_list:
    experiment_name=f"num-chunks-{num_chunks}"
    print_experiment(experiment_name=experiment_name)

num-chunks-1
  retrieval score: 0.3
  quality score: 3.3

num-chunks-3
  retrieval score: 0.8
  quality score: 4.1

num-chunks-5
  retrieval score: 1.0
  quality score: 4.6

num-chunks-7
  retrieval score: 1.0
  quality score: 4.4



Increasing our number of chunks improves our retrieval and quality scores. We stopped testing at 7 chunks since our `chunk_size` is 500 characters and `Llama-2-70b`'s maximum context length is 4096 tokens (we also have to account for the system, assistant and user content to our LLM). This is a major reason to invest in extending context size via RoPE scaling (rotary position embeddings), etc. But it also seems that the benefit of increasing the number of chunks is starting to taper off.

In [321]:
# Number of retrieved chunks used by the remaining experiments.
# NOTE(review): in the run above num-chunks-5 scored a higher quality (4.6) than
# num-chunks-7 (4.4) at equal retrieval score; confirm that 7 is intended.
NUM_CHUNKS = 7

### Embedding models

So far, we've used [`thenlper/gte-base`](https://huggingface.co/thenlper/gte-base) as our embedding model because it's a relatively small (0.22 GB) and performant option. But now, let's explore other popular options such as the current leader on the [MTEB leaderboard](https://huggingface.co/spaces/mteb/leaderboard), [`BAAI/bge-large-en`](https://huggingface.co/BAAI/bge-large-en) (1.34 GB), and OpenAI's [`text-embedding-ada-002`](https://openai.com/blog/new-and-improved-embedding-model).

In [322]:
# Embedding models to compare; each name must be a key of EMBEDDING_DIMENSIONS.
embedding_model_names = ["thenlper/gte-base", "BAAI/bge-large-en", "text-embedding-ada-002"]

In [323]:
# Vary only the embedding model; the experiment is named after the part of the
# model name following the last "/".
for embedding_model_name in embedding_model_names:
    experiment_name = f"{embedding_model_name.split('/')[-1]}"
    run_experiment(
        experiment_name=experiment_name, 
        data_path=DATA_PATH,
        sections=sections,
        chunk_size=CHUNK_SIZE, 
        chunk_overlap=CHUNK_OVERLAP, 
        num_chunks=NUM_CHUNKS,
        embedding_model_name=embedding_model_name,
        llm=llm,
        reference_loc=REFERENCE_LOC,
        evaluator=EVALUATOR,
        num_samples=num_samples)

<IPython.core.display.JSON object>

100%|██████████| 10/10 [01:04<00:00,  6.41s/it]


In [324]:
# Print the saved scores for each embedding-model experiment.
for embedding_model_name in embedding_model_names:
    experiment_name = f"{embedding_model_name.split('/')[-1]}"
    print_experiment(experiment_name=experiment_name)

gte-base
  retrieval score: 1.0
  quality score: 4.3

bge-large-en
  retrieval score: 0.5
  quality score: 3.0

text-embedding-ada-002
  retrieval score: 0.8
  quality score: 4.5



This is an interesting outcome because the #1 (`BAAI/bge-large-en`) on the current leaderboard isn't necessarily the best for our specific task. Using the smaller `thenlper/gte-base` produced the best retrieval score and a quality score close to the best (4.3 vs. 4.5 for `text-embedding-ada-002`) in our experiments.

In [325]:
# Embedding model used by the remaining experiments.
EMBEDDING_MODEL_NAME = "thenlper/gte-base"

### OSS vs. closed LLMs

In [326]:
# LLMs to compare. Names starting with "gpt" use the OpenAI credentials in
# set_credentials; all others use the Anyscale ones.
llms = ["gpt-3.5-turbo",
        "gpt-4",
        "meta-llama/Llama-2-7b-chat-hf", 
        "meta-llama/Llama-2-13b-chat-hf", 
        "meta-llama/Llama-2-70b-chat-hf"]

In [327]:
# Vary only the answering LLM; the experiment is named after the lower-cased
# part of the model name following the last "/". This loop rebinds the
# notebook-level `llm` variable.
for llm in llms:
    experiment_name = f"{llm.split('/')[-1].lower()}"
    run_experiment(
        experiment_name=experiment_name, 
        data_path=DATA_PATH,
        sections=sections,
        chunk_size=CHUNK_SIZE, 
        chunk_overlap=CHUNK_OVERLAP, 
        num_chunks=NUM_CHUNKS,
        embedding_model_name=EMBEDDING_MODEL_NAME,
        llm=llm,
        reference_loc=REFERENCE_LOC,
        evaluator=EVALUATOR,
        num_samples=num_samples)

<IPython.core.display.JSON object>

100%|██████████| 10/10 [00:55<00:00,  5.54s/it]


In [328]:
# Print the saved scores for each LLM experiment.
for llm in llms:
    experiment_name = f"{llm.split('/')[-1].lower()}"
    print_experiment(experiment_name=experiment_name)

gpt-3.5-turbo
  retrieval score: 1.0
  quality score: 4.7

gpt-4
  retrieval score: 1.0
  quality score: 4.9

llama-2-7b-chat-hf
  retrieval score: 1.0
  quality score: 3.75

llama-2-13b-chat-hf
  retrieval score: 1.0
  quality score: 4.2

llama-2-70b-chat-hf
  retrieval score: 1.0
  quality score: 4.4



**Note**: Some of our LLMs have much larger context lengths, ex. `gpt-4` is 8192 and `gpt-3.5-turbo-16k` is 16384. We could increase the number of chunks that we use for these since we saw that increasing `num_chunks` continued to improve the retrieval and quality scores. However, we will keep this value fixed for now since the performance started to taper off anyway and so we can compare these performances under the exact same configurations.

In [329]:
# LLM selected after the comparison above.
LLM = "meta-llama/Llama-2-70b-chat-hf"

## Cost analysis

**Note**: Our `Llama-2` models are priced at $1/M tokens with [Anyscale Endpoints](https://endpoints.anyscale.com/).

In [330]:
# Pricing details
# Keys are the lower-cased model names that cost_analysis derives from each LLM.
# "prompt" / "sampled" are the prices applied to input and generated text.
# The Llama-2 entries (1e-6) match $1/M tokens; presumably the gpt entries are
# USD per token as well; verify against current provider pricing.
pricing = {
    "gpt-3.5-turbo": {
        "prompt": 2e-6,
        "sampled": 2e-6
    },
    "gpt-4": {
        "prompt": 3e-5,
        "sampled": 6e-5
    },
    "llama-2-7b-chat-hf": {
        "prompt": 1e-6,
        "sampled": 1e-6
    },
    "llama-2-13b-chat-hf": {
        "prompt": 1e-6,
        "sampled": 1e-6
    },
    "llama-2-70b-chat-hf": {
        "prompt": 1e-6,
        "sampled": 1e-6
    }
}

In [331]:
def cost_analysis(llm):
    """Print an estimated prompt / completion cost for one LLM experiment.

    Reads the evaluation file of the experiment named after `llm` (lower-cased
    part after the last "/") and estimates, per result:
      - prompt size = len(question) + CHUNK_SIZE * NUM_CHUNKS
      - sampled size = len(generated_answer)
    Sizes are multiplied by the matching entries of `pricing`.

    NOTE(review): sizes are measured with len() on strings, i.e. in characters,
    while the pricing table looks per-token; the result is therefore likely an
    overestimate. Confirm whether that is intended.

    Raises:
        FileNotFoundError: if the evaluation file does not exist.
        KeyError: if the model has no entry in `pricing`.
        ZeroDivisionError: if the evaluation file contains no results.
    """
    experiment_name = f"{llm.split('/')[-1].lower()}"
    # Build the file name the same way evaluate_responses writes it.
    evaluator_name = EVALUATOR.split("/")[-1].lower()
    eval_fp = Path(ROOT_DIR, EXPERIMENTS_DIR, "evaluations", f"{experiment_name}_{evaluator_name}.json")
    with open(eval_fp, "r") as fp:
        d = json.load(fp)
    num_samples = len(d["results"])
    prompt_size, sampled_size = 0, 0
    for result in d["results"]:
        prompt_size += len(result["question"]) + (CHUNK_SIZE * NUM_CHUNKS)
        sampled_size += len(result["generated_answer"])
    total_cost = pricing[experiment_name]["prompt"] * prompt_size + pricing[experiment_name]["sampled"] * sampled_size
    avg_cost = total_cost / num_samples

    print (llm)
    print (f"  avg prompt size: {int(prompt_size/num_samples)}")
    print (f"  avg sampled size: {int(sampled_size/num_samples)}")
    print (f"  total cost: ${total_cost:.2f}")
    print (f"  avg cost: ${avg_cost:.2f}")
    print ()

In [332]:
# Print the cost estimate for every compared LLM (rebinds the notebook-level `llm`).
for llm in llms:
    cost_analysis(llm=llm)

gpt-3.5-turbo
  avg prompt size: 3550
  avg sampled size: 1032
  total cost: $0.09
  avg cost: $0.01

gpt-4
  avg prompt size: 3550
  avg sampled size: 782
  total cost: $1.53
  avg cost: $0.15

meta-llama/Llama-2-7b-chat-hf
  avg prompt size: 3550
  avg sampled size: 2162
  total cost: $0.06
  avg cost: $0.01

meta-llama/Llama-2-13b-chat-hf
  avg prompt size: 3550
  avg sampled size: 1561
  total cost: $0.05
  avg cost: $0.01

meta-llama/Llama-2-70b-chat-hf
  avg prompt size: 3550
  avg sampled size: 1224
  total cost: $0.05
  avg cost: $0.00



## Next steps

In progress:
- hybrid routing

LlamaIndex:
- Generate synthetic datasets (query, source, answer)
- add context to embeddings
- better chunking logic
- fine-tune embedding model
- fine-tune base LLM (gpt-3.5 and OSS)

Later:
- additional data sources
- longer context lengths (RoPE)
- keyword search with semantic (embedding) search
- reranking with LLM after results from (faster) embedding search