# RAG-based LLM Applications

- https://github.com/ray-project/llm-applications
- https://endpoints.anyscale.com/

## Set up

In [1]:
import os
import openai
from pathlib import Path
from pprint import pprint
import ray
from tqdm import tqdm

In [2]:
import sys; sys.path.append("..")
import warnings; warnings.filterwarnings("ignore")
from dotenv import load_dotenv; load_dotenv()

True

In [3]:
# Directories
EFS_DIR = Path("/efs/shared_storage/pcmoritz")
ROOT_DIR = Path(os.getcwd()).parent
EXPERIMENTS_DIR = Path(ROOT_DIR, "experiments_small")
print (f"EFS_DIR: {EFS_DIR}")
print (f"ROOT_DIR: {ROOT_DIR}")
print (f"EXPERIMENTS_DIR: {EXPERIMENTS_DIR}")

EFS_DIR: /efs/shared_storage/pcmoritz
ROOT_DIR: /home/ray/default/llm-applications
EXPERIMENTS_DIR: /home/ray/default/llm-applications/experiments_small


In [4]:
# Credentials
# Forward the driver's API endpoints/keys and the DB connection string to the
# Ray workers via the runtime environment: worker-side code in this notebook
# (e.g. StoreResults, EmbedChunks) reads these values from os.environ.
# Each os.environ[...] lookup raises KeyError if the variable is not set
# (they are expected to come from the .env file loaded above).
ray.init(runtime_env={"env_vars": {
    "OPENAI_API_BASE": os.environ["OPENAI_API_BASE"],
    "OPENAI_API_KEY": os.environ["OPENAI_API_KEY"], 
    "ANYSCALE_API_BASE": os.environ["ANYSCALE_API_BASE"],
    "ANYSCALE_API_KEY": os.environ["ANYSCALE_API_KEY"],
    "DB_CONNECTION_STRING": os.environ["DB_CONNECTION_STRING"],
}})

2023-09-01 14:57:35,898	INFO worker.py:1431 -- Connecting to existing Ray cluster at address: 10.0.59.50:6379...
2023-09-01 14:57:35,909	INFO worker.py:1612 -- Connected to Ray cluster. View the dashboard at [1m[32mhttps://session-iq4d2ux1mdavtyqs5xdnlk2vcv.i.anyscaleuserdata-staging.com [39m[22m
2023-09-01 14:57:35,912	INFO packaging.py:346 -- Pushing file package 'gcs://_ray_pkg_1f4f27ecdbdbf14c4c3622920d5e2960.zip' (0.11MiB) to Ray cluster...
2023-09-01 14:57:35,913	INFO packaging.py:359 -- Successfully pushed file package 'gcs://_ray_pkg_1f4f27ecdbdbf14c4c3622920d5e2960.zip'.


0,1
Python version:,3.9.15
Ray version:,3.0.0.dev0
Dashboard:,http://session-iq4d2ux1mdavtyqs5xdnlk2vcv.i.anyscaleuserdata-staging.com


## Data

### Load data

Our data is already ready at `/efs/shared_storage/goku/docs.ray.io/en/master/` (on Staging, `us-east-1`) but if you wanted to load it yourself, run this bash command (change `/desired/output/directory`, but make sure it's on the shared storage,
so that it's accessible to the workers):
```bash
export DOCS_PATH=/desired/output/directory
wget -e robots=off --recursive --no-clobber --page-requisites \
  --html-extension --convert-links --restrict-file-names=windows \
  --domains docs.ray.io --no-parent --accept=html \
  -P $DOCS_PATH https://docs.ray.io/en/master/
```

In [8]:
# Ray dataset
docs_path = Path(EFS_DIR, "docs.ray.io/en/master/")
ds = ray.data.from_items([{"path": path} for path in docs_path.rglob("*.html") if not path.is_dir()])
print(f"{ds.count()} documents")

3294 documents


### Chunk data

In [9]:
from langchain.document_loaders import ReadTheDocsLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [10]:
loader = ReadTheDocsLoader(f"{EFS_DIR}/docs.ray.io/en/master/", encoding="utf-8", errors="ignore")
docs = loader.load()

In [11]:
for doc in docs:  # clean
    doc.metadata["source"] = doc.metadata["source"].replace(str(EFS_DIR)+"/", "https://")

In [12]:
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    chunk_size=300,
    chunk_overlap=50,
    length_function=len,
)

In [13]:
# Chunks
chunks = text_splitter.create_documents(
    texts=[doc.page_content for doc in docs], 
    metadatas=[doc.metadata for doc in docs]
)

In [14]:
print (f"{len(chunks)} chunks\n")
pprint (chunks[0].page_content)  # a few tokens
print (f"\nmetadata:\n{chunks[0].metadata}")

49220 chunks

('Environments#\n'
 'RLlib works with several different types of environments, including '
 'Farama-Foundation Gymnasium, user-defined, multi-agent, and also batched '
 'environments.\n'
 'Tip\n'
 'Not all environments work with all algorithms. Check out the algorithm '
 'overview for more information.\n'
 'Configuring Environments#')

metadata:
{'source': 'https://docs.ray.io/en/master/rllib-env.html'}


In [15]:
# Ray dataset
chunks_ds = ray.data.from_items([{"text": chunk.page_content, "source": chunk.metadata["source"]} for chunk in chunks])
chunks_ds.show(1)

2023-08-31 07:37:53,928	INFO dataset.py:2180 -- Tip: Use `take_batch()` instead of `take() / show()` to return records in pandas or numpy batch format.


{'text': 'Environments#\nRLlib works with several different types of environments, including Farama-Foundation Gymnasium, user-defined, multi-agent, and also batched environments.\nTip\nNot all environments work with all algorithms. Check out the algorithm overview for more information.\nConfiguring Environments#', 'source': 'https://docs.ray.io/en/master/rllib-env.html'}


### Embed data

In [7]:
from langchain.embeddings import OpenAIEmbeddings
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
import numpy as np
from ray.data import ActorPoolStrategy

In [17]:
class EmbedChunks:
    """Callable class for `Dataset.map_batches` that embeds the "text" column.

    Args:
        model_name: Embedding model to use. "text-embedding-ada-002" selects
            the OpenAI embeddings client (endpoint and key are read from the
            OPENAI_API_BASE / OPENAI_API_KEY environment variables); any other
            name is loaded as a Hugging Face model on the GPU.

    Raises:
        KeyError: if the OpenAI model is requested and the OpenAI environment
            variables are not set.
    """

    def __init__(self, model_name):
        if model_name == "text-embedding-ada-002":
            # The device / batch-size options only make sense for a locally
            # hosted model. OpenAIEmbeddings treats `model_kwargs` as extra
            # parameters for the remote API call and has no `encode_kwargs`
            # option, so they must not be passed here.
            self.embedding_model = OpenAIEmbeddings(
                model=model_name,
                openai_api_base=os.environ["OPENAI_API_BASE"],
                openai_api_key=os.environ["OPENAI_API_KEY"])
        else:
            self.embedding_model = HuggingFaceEmbeddings(
                model_name=model_name,
                model_kwargs={"device": "cuda"},
                encode_kwargs={"device": "cuda", "batch_size": 100})

    def __call__(self, batch):
        """Embed one batch.

        Args:
            batch: mapping with "text" and "source" columns.

        Returns:
            dict with the input "text" and "source" columns passed through and
            an "embeddings" column holding one embedding per text.
        """
        embeddings = self.embedding_model.embed_documents(batch["text"])
        return {"text": batch["text"], "source": batch["source"], "embeddings": embeddings}

In [18]:
# Embed chunks
embedding_model_name = "thenlper/gte-base"
embedded_chunks = chunks_ds.map_batches(
    EmbedChunks,
    fn_constructor_kwargs={"model_name": embedding_model_name},
    batch_size=100, 
    num_gpus=1,
    compute=ActorPoolStrategy(size=2))

In [None]:
# Sample
sample = embedded_chunks.take(5)
print ("embedding size:", len(sample[0]["embeddings"]))
pprint(sample[0]["text"])

### Index data

In [10]:
import psycopg
from pgvector.psycopg import register_vector

In [20]:
class StoreResults:
    """Callable for `Dataset.map_batches` that writes embedded chunks to the `document` table."""

    def __call__(self, batch):
        """Insert every (text, source, embedding) triple of `batch` as one row.

        A new connection (from the DB_CONNECTION_STRING environment variable)
        is opened for each batch. Returns an empty dict, so the mapped dataset
        carries no rows forward.
        """
        insert_statement = "INSERT INTO document (text, source, embedding) VALUES (%s, %s, %s)"
        with psycopg.connect(os.environ["DB_CONNECTION_STRING"]) as connection:
            register_vector(connection)
            with connection.cursor() as cursor:
                records = zip(batch["text"], batch["source"], batch["embeddings"])
                for record in records:
                    cursor.execute(insert_statement, tuple(record))
        return {}

In [None]:
%%bash
# Set up pgvector
bash ../setup-pgvector.sh

In [21]:
%%bash
# Drop current vector DB (if any) and setup for new one
psql "$DB_CONNECTION_STRING" -c "DROP TABLE document;"
export SQL_DUMP_FP="/efs/shared_storage/goku/sql_dumps/gte-base_300_50.sql"
sudo -u postgres psql -f ../migrations/vector-768.sql  # "thenlper/gte-base" dimension is 768

ERROR:  table "document" does not exist


CREATE TABLE


If we have already created an index (and saved it), we can reload it:

In [22]:
%%bash
# Load index
export SQL_DUMP_FP="/efs/shared_storage/goku/sql_dumps/gte-base_300_50.sql"
psql "$DB_CONNECTION_STRING" -f $SQL_DUMP_FP  # load
psql "$DB_CONNECTION_STRING" -c "SELECT count(*) FROM document;"  # num rows

SET
SET
SET
SET
SET
 set_config 
------------
 
(1 row)

SET
SET
SET
SET
ALTER TABLE
ALTER TABLE
DROP SEQUENCE
DROP TABLE
DROP EXTENSION
CREATE EXTENSION
COMMENT
SET
SET
CREATE TABLE
ALTER TABLE
CREATE SEQUENCE
ALTER TABLE
ALTER SEQUENCE
ALTER TABLE
COPY 49220
 setval 
--------
  49220
(1 row)

ALTER TABLE
 count 
-------
 49220
(1 row)



otherwise, we can index the data and save it:

In [31]:
# Index data
# The trailing .count() is what makes the pipeline run: the execution log for
# this cell shows the EmbedChunks and StoreResults stages being executed.
# StoreResults returns an empty dict per batch, so the printed count is 0;
# the number of rows actually written is checked with SQL in the next cell.
embedded_chunks.map_batches(
    StoreResults,
    batch_size=128,
    num_cpus=1,
    compute=ActorPoolStrategy(size=28),
).count()

2023-08-29 13:45:01,236	INFO streaming_executor.py:92 -- Executing DAG InputDataBuffer[Input] -> ActorPoolMapOperator[MapBatches(EmbedChunks)] -> ActorPoolMapOperator[MapBatches(StoreResults)]
2023-08-29 13:45:01,237	INFO streaming_executor.py:93 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
2023-08-29 13:45:01,238	INFO streaming_executor.py:95 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`
2023-08-29 13:45:01,258	INFO actor_pool_map_operator.py:117 -- MapBatches(EmbedChunks): Waiting for 2 pool actors to start...
2023-08-29 13:45:22,186	INFO actor_pool_map_operator.py:117 -- MapBatches(StoreResults): Waiting for 28 pool actors to start...


Running 0:   0%|          | 0/200 [00:00<?, ?it/s]



0

In [35]:
%%bash
# Save index
export SQL_DUMP_FP="/efs/shared_storage/goku/sql_dumps/gte-base_300_50.sql"
sudo -u postgres pg_dump -c > $SQL_DUMP_FP  # save
psql "$DB_CONNECTION_STRING" -c "SELECT count(*) FROM document;"  # num rows

 count 
-------
 49220
(1 row)



## Retrieval

In [5]:
import json
import numpy as np

In [11]:
embedding_model = HuggingFaceEmbeddings(model_name=embedding_model_name)
conn = psycopg.connect(os.environ["DB_CONNECTION_STRING"])
register_vector(conn)

In [25]:
# Embed query
query = "What is the default batch size for map_batches?"
embedding = np.array(embedding_model.embed_query(query))
len(embedding)

768

In [26]:
# Fetch the 5 rows whose stored embedding is closest to the query embedding;
# `<->` is pgvector's Euclidean (L2) distance operator.
with conn.cursor() as cur:
    cur.execute("SELECT * FROM document ORDER BY embedding <-> %s LIMIT 5", (embedding,))
    rows = cur.fetchall()
    # Relies on the column order returned by SELECT *: row[1] is used as the
    # chunk text and row[2] as its source URL.
    context = [{"text": row[1], "source": row[2]} for row in rows]

In [27]:
for item in context:
    print (item["source"])
    print (item["text"])
    print ()

https://docs.ray.io/en/master/_modules/ray/data/dataset.html
The actual size of the batch provided to ``fn`` may be smaller than
                ``batch_size`` if ``batch_size`` doesn't evenly divide the block(s) sent
                to a given map task. Default batch_size is 4096 with "default".

https://docs.ray.io/en/master/data/api/doc/ray.data.Dataset.map_batches.html
The actual size of the batch provided to fn may be smaller than
batch_size if batch_size doesn’t evenly divide the block(s) sent
to a given map task. Default batch_size is 4096 with “default”.
compute – Either “tasks” (default) to use Ray Tasks or an

https://docs.ray.io/en/master/ray-air/api/doc/ray.data.preprocessors.BatchMapper.html
None wil use the entire underlying blocks as batches (blocks may
contain different number of rows) and the actual size of the batch provided
to fn may be smaller than batch_size if batch_size doesn’t
evenly divide the block(s) sent to a given map task. Defaults to 4096,

https://docs.r

## Generation

In [12]:
import time

In [13]:
def generate_response(
    llm, temperature=0.0, 
    system_content="", assistant_content="", user_content="", 
    max_retries=3, retry_interval=60):
    """Generate a chat completion from an LLM, retrying on failure.

    Args:
        llm: model name passed to `openai.ChatCompletion.create`.
        temperature: sampling temperature.
        system_content: content of the system message.
        assistant_content: content of the assistant message (sent even when empty).
        user_content: content of the user message.
        max_retries: maximum number of attempts; 0 makes no request at all.
        retry_interval: seconds to wait between attempts (default matches
            per-minute rate limits).

    Returns:
        The message content of the last choice in the response, or "" if every
        attempt failed.
    """
    messages = [
        {"role": "system", "content": system_content},
        {"role": "assistant", "content": assistant_content},
        {"role": "user", "content": user_content},
    ]
    for attempt in range(max_retries):
        try:
            response = openai.ChatCompletion.create(
                model=llm,
                temperature=temperature,
                messages=messages,
            )
            return response["choices"][-1]["message"]["content"]
        except Exception as e:
            # Deliberately best-effort: report the failure and retry instead of
            # aborting a long batch run.
            print(e)
            # Only wait when another attempt follows; sleeping after the final
            # failure would just delay returning the empty result.
            if attempt < max_retries - 1:
                time.sleep(retry_interval)  # default is per-minute rate limits
    return ""

In [29]:
# Credentials
openai.api_base = os.environ["ANYSCALE_API_BASE"]
openai.api_key = os.environ["ANYSCALE_API_KEY"]

In [40]:
# Generate response
generate_response(
    llm="meta-llama/Llama-2-70b-chat-hf",
    temperature=0.0,
    system_content="Answer the query using the context provided.",
    user_content=f"query: {query}, context: {context}"
)

'The default batch size for map_batches is 4096. This is mentioned in multiple sources, including the Ray documentation for Dataset.map_batches and BatchMapper.\n\nHere are the relevant quotes from the provided context:\n\n* "Default batch_size is 4096 with \'default\'."\n* "Default batch_size is 4096 with “default”."\n* "Defaults to 4096,"\n* "The default batch size depends on your resource type. If you’re using CPUs, the default batch size is 4096."\n\nSo, the default batch size for map_batches is 4096, regardless of whether you\'re using CPUs or GPUs. However, if you\'re using GPUs, you must specify an explicit batch size.'

Let's combine the context retrieval and response generation into a convenient query agent that we can use to easily generate our responses.

In [14]:
class QueryAgent:
    """Answer a query by retrieving the nearest chunks from the vector DB and prompting an LLM.

    Args:
        embedding_model_name: model used to embed the query.
            "text-embedding-ada-002" selects the OpenAI embeddings client
            (configured from OPENAI_API_BASE / OPENAI_API_KEY); any other name
            is loaded as a Hugging Face model on the GPU.
        llm: chat model name passed to `generate_response`.
        temperature: sampling temperature passed to `generate_response`.
        max_context_length: budget from which the lengths of `system_content`
            and `assistant_content` are subtracted to get the maximum length of
            the user message. The budget is applied with `len()` / slicing,
            i.e. it is counted in characters, not tokens.
        system_content: system message for the LLM.
        assistant_content: assistant message for the LLM.

    Raises:
        KeyError: if DB_CONNECTION_STRING (or, for the OpenAI embedding model,
            the OpenAI variables) is missing from the environment.
    """

    def __init__(self, embedding_model_name="thenlper/gte-base",
                 llm="meta-llama/Llama-2-70b-chat-hf", 
                 temperature=0.0, max_context_length=4096,
                 system_content="", assistant_content=""):
        
        # Embedding model
        if embedding_model_name == "text-embedding-ada-002":
            # The device / batch-size options only apply to a locally hosted
            # model. OpenAIEmbeddings treats `model_kwargs` as extra parameters
            # for the remote API call and has no `encode_kwargs` option.
            self.embedding_model = OpenAIEmbeddings(
                model=embedding_model_name,
                openai_api_base=os.environ["OPENAI_API_BASE"],
                openai_api_key=os.environ["OPENAI_API_KEY"])
        else:
            self.embedding_model = HuggingFaceEmbeddings(
                model_name=embedding_model_name,
                model_kwargs={"device": "cuda"},
                encode_kwargs={"device": "cuda", "batch_size": 100})
        
        # LLM
        self.llm = llm
        self.temperature = temperature
        self.context_length = max_context_length - len(system_content + assistant_content)
        self.system_content = system_content
        self.assistant_content = assistant_content

        # VectorDB connection
        self.conn = psycopg.connect(os.environ["DB_CONNECTION_STRING"])
        register_vector(self.conn)

    def __call__(self, query, num_chunks=5):
        """Retrieve `num_chunks` chunks for `query` and generate an answer.

        Returns:
            dict with "question" (the query), "sources" (source of every
            retrieved chunk, nearest first) and "answer" (the LLM output, ""
            if generation failed).
        """
        # Get context
        embedding = np.array(self.embedding_model.embed_query(query))
        with self.conn.cursor() as cur:
            # Name the columns explicitly rather than indexing into SELECT *,
            # so the result does not depend on the table's column order and
            # the stored embedding vectors are not fetched back.
            # `<->` is pgvector's Euclidean (L2) distance operator.
            cur.execute(
                "SELECT text, source FROM document ORDER BY embedding <-> %s LIMIT %s",
                (embedding, num_chunks))
            rows = cur.fetchall()
        context = [{"text": text} for text, _ in rows]
        sources = [source for _, source in rows]

        # Generate response
        user_content = f"query: {query}, context: {context}"
        answer = generate_response(
            llm=self.llm,
            temperature=self.temperature,
            system_content=self.system_content,
            assistant_content=self.assistant_content,
            # Truncate the prompt to the remaining (character) budget
            user_content=user_content[: self.context_length],
        )

        # Result
        result = {
            "question": query,
            "sources": sources,
            "answer": answer,
        }
        return result

In [41]:
query = "What is the default batch size for map_batches?"
system_content = "Answer the query using the context provided."
agent = QueryAgent(
    embedding_model_name="thenlper/gte-base",
    llm="meta-llama/Llama-2-7b-chat-hf",
    max_context_length=4096,
    system_content=system_content,
)
result = agent(query=query)
print(json.dumps(result, indent=2))

{
  "question": "What is the default batch size for map_batches?",
  "sources": [
    "https://docs.ray.io/en/master/_modules/ray/data/dataset.html",
    "https://docs.ray.io/en/master/data/api/doc/ray.data.Dataset.map_batches.html",
    "https://docs.ray.io/en/master/ray-air/api/doc/ray.data.preprocessors.BatchMapper.html",
    "https://docs.ray.io/en/master/data/transforming-data.html",
    "https://docs.ray.io/en/master/ray-air/examples/torch_incremental_learning.html"
  ],
  "answer": "Based on the provided context, the default batch size for `map_batches` is 4096."
}


## Datasets

We'll start by creating our reference (ground-truth) dataset. We have a list of user queries and the ideal source to answer each query in [`datasets/eval-dataset-v1.jsonl`](https://github.com/ray-project/llm-applications/blob/main/datasets/eval-dataset-v1.jsonl). We will use our LLM app above to generate a reference answer for each query/source pair using `gpt-4`.

In [20]:
import re
import urllib.parse
from bs4 import BeautifulSoup
from IPython.display import clear_output, display, JSON

In [25]:
# If running tests / small samples, set num_samples to <10
# None = all samples
num_samples = 10

In [33]:
with open(Path(ROOT_DIR, "datasets/eval-dataset-v1.jsonl"), "r") as f:
    data = [json.loads(item) for item in list(f)]

In [34]:
# Clean up
for row in data:
    row["source"] = row["source"].replace("https://docs.ray.io/en/latest/", "https://docs.ray.io/en/master/")

In [35]:
data[:5]

[{'question': 'I’m struggling a bit with Ray Data type conversions when I do map_batches. Any advice?',
  'source': 'https://docs.ray.io/en/master/data/transforming-data.html#configuring-batch-format'},
 {'question': 'How does autoscaling work in a Ray Serve application?',
  'source': 'https://docs.ray.io/en/master/serve/scaling-and-resource-allocation.html#autoscaling'},
 {'question': 'how do I get the address of a ray node',
  'source': 'https://docs.ray.io/en/master/ray-core/miscellaneous.html#node-information'},
 {'question': 'Does Ray support NCCL?',
  'source': 'https://docs.ray.io/en/master/ray-more-libs/ray-collective.html'},
 {'question': 'could you give me an example of using this library for data-parallel training of CNNs on Ray?',
  'source': 'https://docs.ray.io/en/master/ray-air/computer-vision.html#training-vision-models'}]

In [36]:
def path_to_uri(path, scheme="https://", domain="docs.ray.io"):
    """Rebuild a URI from `path`: scheme + domain + whatever follows the last `domain` in `path`.

    If `domain` does not occur in `path`, the whole path is appended.
    """
    _, _, remainder = path.rpartition(domain)
    return scheme + domain + remainder

In [37]:
def fetch_text(uri):
    """Return the text of a locally mirrored docs page, or of one section of it.

    The URI's host and path are mapped to a file under EFS_DIR. If the URI has
    a fragment and the page contains an element with that id, only that
    element's text is returned; otherwise the text of the whole page is
    returned.

    Args:
        uri: page URL, optionally with a `#fragment`.

    Returns:
        The extracted text, or "" (after printing a message) if the file does
        not exist.
    """
    # Parse the URL to get the file path and anchor
    url_parts = urllib.parse.urlparse(uri)
    file_path = str(EFS_DIR) + "/" + url_parts.netloc + url_parts.path
    anchor = url_parts.fragment

    try:
        # Read the HTML file
        with open(file_path, "r", encoding="utf-8") as file:
            html_content = file.read()
    except FileNotFoundError:
        print (f"File not found: {uri}")
        return ""

    # Parse the HTML content
    soup = BeautifulSoup(html_content, "html.parser")

    # Find the element with the specified anchor (if any)
    target_element = soup.find(id=anchor) if anchor else None
    if target_element is not None:
        # Extract the text within the anchor element
        return target_element.get_text()

    # No anchor given, or the anchor is not present on the page: use the whole
    # page. The page is already parsed, so there is no need to re-resolve it.
    # (Re-entering with the local file path would prefix EFS_DIR a second time
    # and fail to find the file.)
    return soup.get_text()

In [265]:
uri = "https://docs.ray.io/en/master/data/transforming-data.html#configuring-batch-format"
fetch_text(uri)

'\nConfiguring batch format#\nRay Data represents batches as dicts of NumPy ndarrays or pandas DataFrames. By\ndefault, Ray Data represents batches as dicts of NumPy ndarrays.\nTo configure the batch type, specify batch_format in\nmap_batches(). You can return either format from your function.\n\n\n\nNumPy\nfrom typing import Dict\nimport numpy as np\nimport ray\n\ndef increase_brightness(batch: Dict[str, np.ndarray]) -> Dict[str, np.ndarray]:\n    batch["image"] = np.clip(batch["image"] + 4, 0, 255)\n    return batch\n\nds = (\n    ray.data.read_images("s3://[email\xa0protected]/image-datasets/simple")\n    .map_batches(increase_brightness, batch_format="numpy")\n)\n\n\n\n\n\npandas\nimport pandas as pd\nimport ray\n\ndef drop_nas(batch: pd.DataFrame) -> pd.DataFrame:\n    return batch.dropna()\n\nds = (\n    ray.data.read_csv("s3://[email\xa0protected]/iris.csv")\n    .map_batches(drop_nas, batch_format="pandas")\n)\n\n\n\n\n'

In [38]:
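# Content for inference
# NOTE(review): the prompt below is a plain (non-f) string, so `{score}` and
# `{reasoning}` are sent to the model literally, and the quote that opens the
# first prompt line is never closed -- confirm both are intended.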
system_content = """
    "Answer the query using the context provided.
    Then, you must {score} your response between 1 and 5.
    You must return your response in a line with only the score.
    Do not add any more details.
    On a separate line provide your {reasoning} for the score as well.
    Return your response following the exact format outlined below.
    Do not add or remove anything.
    And all of this must be in a valid JSON format.
    
    {"answer": answer,
     "score": score,
     "reasoning": reasoning}
    """
assistant_content = ""

In [39]:
def extract_from_response(response):
    """Pull the answer, score and reasoning out of an LLM response.

    The outermost `{...}` span of the response is first decoded as JSON, which
    handles escaped quotes inside the strings and non-integer scores. If that
    span is missing, is not valid JSON, or lacks usable fields, the function
    falls back to regular expressions that pick the three fields out of
    loosely formatted text.

    Args:
        response: raw text returned by the LLM.

    Returns:
        `(answer, score, reasoning)` with `score` as a float, or
        `("", "", "")` if any of the three fields cannot be extracted.
    """
    # Preferred path: decode the JSON object embedded in the response
    start, end = response.find("{"), response.rfind("}")
    if 0 <= start < end:
        try:
            payload = json.loads(response[start:end + 1])
            answer, reasoning = payload["answer"], payload["reasoning"]
            score = float(payload["score"])
            if isinstance(answer, str) and isinstance(reasoning, str):
                return answer, score, reasoning
        except (KeyError, TypeError, ValueError):
            pass  # not a complete/valid JSON object; use the patterns below

    # Fallback: regular expressions over the raw text
    answer_pattern = r'"answer"\s*:\s*"([^"]*)"'
    score_pattern = r'"score"\s*:\s*([0-9]+)'
    reasoning_pattern = r'"reasoning"\s*:\s*"([^"]*)"'

    answer_match = re.search(answer_pattern, response)
    score_match = re.search(score_pattern, response)
    reasoning_match = re.search(reasoning_pattern, response)

    # Convert
    if answer_match and score_match and reasoning_match:
        answer = answer_match.group(1)
        score = float(score_match.group(1))
        reasoning = reasoning_match.group(1)
        return answer, score, reasoning

    return "", "", ""

In [40]:
def get_references(data, llm, temperature, max_context_length, system_content, assistant_content, num_samples=None):
    """Generate a reference answer (with self-assigned score and reasoning) per dataset row.

    For each of the first `num_samples` rows of `data` (all rows when None),
    the text of the row's "source" is fetched and given to `llm` as context
    for the row's "question". Each result is displayed as it is produced.

    Returns:
        list of dicts with keys "question", "source", "answer", "score",
        "reasoning".
    """
    references = []
    for sample in tqdm(data[:num_samples]):
        # Get context
        question = sample["question"]
        source_text = fetch_text(sample["source"])

        # Generate response; the prompt is cut to what is left of the budget
        # after the system and assistant messages
        budget = max_context_length - len(system_content + assistant_content)
        prompt = f"The query is {question} and the additional context is {source_text}"
        raw_response = generate_response(
            llm=llm,
            temperature=temperature,
            system_content=system_content, 
            assistant_content=assistant_content, 
            user_content=prompt[:budget])

        # Extract from response
        answer, score, reasoning = extract_from_response(response=raw_response)

        # Store result
        reference = {
            "question": question,
            "source": sample["source"],
            "answer": answer,
            "score": score,
            "reasoning": reasoning,
        }
        references.append(reference)
        clear_output(wait=True)
        display(JSON(json.dumps(reference, indent=2)))
    return references

### gpt-4

In [305]:
openai.api_base = os.environ["OPENAI_API_BASE"]
openai.api_key = os.environ["OPENAI_API_KEY"]

In [307]:
results = get_references(
    data=data, llm="gpt-4", temperature=0.0, max_context_length=8192, 
    system_content=system_content, assistant_content=assistant_content,
    num_samples=num_samples)

<IPython.core.display.JSON object>


100%|██████████| 10/10 [01:51<00:00, 11.18s/it][A


In [271]:
references_fp = Path(ROOT_DIR, EXPERIMENTS_DIR, "references", "gpt-4.json")
references_fp.parent.mkdir(parents=True, exist_ok=True)

In [272]:
# Save to file
with open(references_fp, "w") as fp:
    json.dump(results, fp, indent=4)

In [273]:
# Read from file
with open(references_fp, "r") as fp:
    results = json.load(fp)

In [308]:
# Average score gpt-4 gave itself
print (np.mean([float(result["score"]) for result in results if result["score"]]))

4.7


### Llama-2-70b

Let's generate reference responses with `Llama-2-70b` as well:

In [339]:
openai.api_base = os.environ["ANYSCALE_API_BASE"]
openai.api_key = os.environ["ANYSCALE_API_KEY"]

In [311]:
results = get_references(
    data=data, llm="meta-llama/Llama-2-70b-chat-hf", temperature=0.0, max_context_length=4096, 
    system_content=system_content, assistant_content=assistant_content,
    num_samples=num_samples)

<IPython.core.display.JSON object>


100%|██████████| 10/10 [16:00<00:00, 96.09s/it][A


In [316]:
references_fp = Path(ROOT_DIR, EXPERIMENTS_DIR, "references", "llama-2-70b.json")

In [317]:
# Save to file
with open(references_fp, "w") as fp:
    json.dump(results, fp, indent=4)

In [318]:
# Read from file
with open(references_fp, "r") as fp:
    results = json.load(fp)

In [319]:
# Average score llama-2-70b gave itself
print (np.mean([float(result["score"]) for result in results if result["score"]]))

4.888888888888889


## Evaluator

Now that we've seen the answers, scores and reasoning for our references dataset from both `gpt-4` and `Llama-2-70b`, we can use these responses to decide on a quality evaluator for our future experiments. This evaluator will be used to score answers for different experiment configurations, so we need to be able to trust its scores, reasoning, etc. After inspecting Llama2 evaluating Llama2's answers, it is definitely not a good evaluator. For most answers the reasoning is not good, and the score is pretty random with lots of 4s. Therefore, our evaluator will be `gpt-4`.

In [41]:
EVALUATOR = "gpt-4"

## Experiments

We're going to start experimenting with the various components in our LLM application such as our evaluator, context, sections, chunking size, number of chunks in our context, embedding models, OSS/closed LLMs and more!

### Utilities

Before we get started with our experiments, we're going to define some utility functions that we'll use to easily generate and evaluate responses using the different experiment configurations. We'll also define some functions to help determine our response quality score, retrieval recall score, etc.

In [42]:
import subprocess

In [15]:
# Paths
DATA_PATH = str(Path(ROOT_DIR, "datasets", "eval-dataset-v1.jsonl"))
REFERENCE_LOC = str(Path(ROOT_DIR, EXPERIMENTS_DIR, "references", "gpt-4.json"))

In [16]:
# Mappings
EMBEDDING_DIMENSIONS = {
    "thenlper/gte-base": 768,
    "BAAI/bge-large-en": 1024,
    "text-embedding-ada-002": 1536
}
MAX_CONTEXT_LENGTHS = {
    "gpt-4": 8192,
    "gpt-3.5-turbo": 4096,
    "gpt-3.5-turbo-16k": 16384,
    "meta-llama/Llama-2-7b-chat-hf": 4096,
    "meta-llama/Llama-2-13b-chat-hf": 4096,
    "meta-llama/Llama-2-70b-chat-hf": 4096,
}

In [45]:
def execute_bash(command):
    """Run `command` through the shell and return the completed process.

    stdout and stderr are captured as text on the returned object; a non-zero
    exit status does not raise.
    """
    return subprocess.run(command, shell=True, capture_output=True, text=True)

In [46]:
def create_index(embedding_model_name, chunk_size, chunk_overlap):
    """Recreate the `document` table and fill it for one experiment configuration.

    The table is dropped and recreated from the migration matching the
    embedding dimension of `embedding_model_name`. If a SQL dump for this
    (model, chunk_size, chunk_overlap) combination exists under
    EFS_DIR/sql_dumps it is loaded; otherwise the notebook-level `docs` are
    re-chunked, embedded and stored, and a new dump is written.

    Args:
        embedding_model_name: key of EMBEDDING_DIMENSIONS (KeyError otherwise).
        chunk_size: chunk size for the text splitter.
        chunk_overlap: chunk overlap for the text splitter.

    NOTE(review): the results of the `execute_bash` calls are discarded, so a
    failing psql / pg_dump command goes unnoticed here.
    """
    # Drop current Vector DB and prepare for new one
    # Terminate sessions stuck "idle in transaction" before dropping the table
    execute_bash(f'''psql "{os.environ["DB_CONNECTION_STRING"]}" -c "SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE state = 'idle in transaction';"''')
    execute_bash(f'psql "{os.environ["DB_CONNECTION_STRING"]}" -c "DROP TABLE document;"')
    # The migration file is selected by the model's embedding dimension
    execute_bash(f'sudo -u postgres psql -f ../migrations/vector-{EMBEDDING_DIMENSIONS[embedding_model_name]}.sql')
    # Dump file name: <model name without org prefix>_<chunk_size>_<chunk_overlap>.sql
    SQL_DUMP_FP = Path(EFS_DIR, "sql_dumps", f"{embedding_model_name.split('/')[-1]}_{chunk_size}_{chunk_overlap}.sql")
    
    # Load vector DB
    if SQL_DUMP_FP.exists():  # Load from SQL dump
        execute_bash(f'psql "{os.environ["DB_CONNECTION_STRING"]}" -f {SQL_DUMP_FP}')
    else:  # Create new index
        # Create chunks dataset
        text_splitter = RecursiveCharacterTextSplitter(
            separators=["\n\n", "\n", " ", ""],
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
        )
        chunks = text_splitter.create_documents(
            texts=[doc.page_content for doc in docs], 
            metadatas=[doc.metadata for doc in docs]
        )
        chunks_ds = ray.data.from_items([{"text": chunk.page_content, "source": chunk.metadata["source"]} for chunk in chunks])

        # Embed chunks
        embedded_chunks = chunks_ds.map_batches(
            EmbedChunks,
            fn_constructor_kwargs={"model_name": embedding_model_name},
            batch_size=100, 
            num_gpus=1,
            compute=ActorPoolStrategy(size=2))
        
        # Index data
        # .count() consumes the dataset, which runs the embed + store stages
        embedded_chunks.map_batches(
            StoreResults,
            batch_size=128,
            num_cpus=1,
            compute=ActorPoolStrategy(size=28),
        ).count()
        
        # Save to SQL dump
        execute_bash(f"sudo -u postgres pg_dump -c > {SQL_DUMP_FP}")

In [17]:
def set_credentials(llm):
    """Point the `openai` client at the provider that serves `llm`.

    Model names starting with "gpt" use the OpenAI endpoint and key; every
    other model uses the Anyscale endpoint and key. Values are read from the
    environment.
    """
    if llm.startswith("gpt"):
        base_var, key_var = "OPENAI_API_BASE", "OPENAI_API_KEY"
    else:
        base_var, key_var = "ANYSCALE_API_BASE", "ANYSCALE_API_KEY"
    openai.api_base = os.environ[base_var]
    openai.api_key = os.environ[key_var]

In [18]:
# Generate responses
def generate_responses(
    experiment_name, data_path, 
    chunk_size, chunk_overlap, num_chunks,
    embedding_model_name, 
    llm, temperature, max_context_length, 
    system_content, assistant_content="",
    num_samples=None):
    """Answer the dataset's questions with a QueryAgent and save the run to disk.

    Questions are read from the JSON-lines file at `data_path` (first
    `num_samples` of them, all when None). The answers and the experiment
    configuration are written to
    ROOT_DIR/EXPERIMENTS_DIR/responses/<experiment_name>.json.
    `chunk_size` and `chunk_overlap` are only recorded in the saved config.
    Returns nothing.
    """
    # Set credentials
    set_credentials(llm=llm)

    # Query agent
    agent = QueryAgent(
        embedding_model_name=embedding_model_name,
        llm=llm,
        temperature=temperature,
        max_context_length=max_context_length,
        system_content=system_content,
        assistant_content=assistant_content,
    )

    # Every line is parsed before the sample limit is applied
    with open(Path(data_path), "r") as dataset_file:
        all_questions = [json.loads(line)["question"] for line in dataset_file]
    questions = all_questions[:num_samples]

    # Generate responses
    results = []
    for question in tqdm(questions):
        response = agent(query=question, num_chunks=num_chunks)
        results.append(response)
        clear_output(wait=True)
        display(JSON(json.dumps(response, indent=2)))

    # Save to file
    output_path = Path(ROOT_DIR, EXPERIMENTS_DIR, "responses", f"{experiment_name}.json")
    output_path.parent.mkdir(parents=True, exist_ok=True)
    payload = {
        "config": {
            "experiment_name": experiment_name,
            "data_path": data_path,
            "chunk_size": chunk_size,
            "chunk_overlap": chunk_overlap,
            "num_chunks": num_chunks,
            "embedding_model_name": embedding_model_name,
            "llm": llm,
            "temperature": temperature,
            "max_context_length": max_context_length,
            "system_content": system_content,
            "assistant_content": assistant_content,
        },
        "results": results,
    }
    with open(output_path, "w") as output_file:
        json.dump(payload, output_file, indent=4)

In [43]:
def get_retrieval_score(references, generated):
    """Fraction of questions whose reference page appears among the retrieved sources.

    Args:
        references: list of dicts with a "source" URL (may carry a #fragment).
        generated: list of dicts, aligned with `references`, each with a
            "sources" list of retrieved URLs.

    Returns:
        Mean of the per-question hits (1 or 0). A reference with an empty
        source counts as a hit. With no references at all the mean of an empty
        array is taken, which is nan.
    """
    matches = np.zeros(len(references))
    for i, reference in enumerate(references):
        # sections don't have to perfectly match: compare pages without the #fragment
        reference_source = reference["source"].split("#")[0]
        if not reference_source:
            matches[i] = 1
            continue
        # any() stops at the first matching source instead of scanning the rest
        if any(reference_source == source.split("#")[0] for source in generated[i]["sources"]):
            matches[i] = 1
    retrieval_score = np.mean(matches)
    return retrieval_score

In [38]:
def evaluate_responses(
    experiment_name, reference_loc, response_loc,
    evaluator, temperature, max_context_length,
    system_content, assistant_content="",
    num_samples=None):
    """Score generated answers against reference answers with an evaluator LLM.

    Args:
        experiment_name: used to name the evaluation file that is written.
        reference_loc: path to a JSON file holding a list of reference records;
            each record is read for "question", "answer" and "source".
        response_loc: path to a JSON file whose "results" list holds the
            generated records; each is read for "question", "answer", "sources".
        evaluator: name of the LLM used as judge.
        temperature: sampling temperature passed to the evaluator.
        max_context_length: budget from which the lengths of `system_content`
            and `assistant_content` are subtracted (measured with `len`, i.e.
            in characters) to truncate the user content.
        system_content: system prompt for the evaluator.
        assistant_content: optional assistant prompt for the evaluator.
        num_samples: evaluate only the first `num_samples` records (None = all).

    Raises:
        AssertionError: if references and responses differ in length or order.
        ValueError: if the first line of an evaluator reply is not a number.

    Side effects:
        Writes config, retrieval score, quality score and per-sample results to
        Path(ROOT_DIR, EXPERIMENTS_DIR, "evaluations",
        f"{experiment_name}_{evaluator_name}.json").
    """
    # Set credentials
    set_credentials(llm=evaluator)

    # Load answers
    with open(Path(reference_loc), "r") as f:
        references = list(json.load(f))[:num_samples]
    with open(Path(response_loc), "r") as f:
        generated = list(json.load(f)["results"])[:num_samples]
    assert len(references) == len(generated)

    # Quality score
    results = []
    context_length = max_context_length - len(system_content + assistant_content)
    for ref, gen in tqdm(zip(references, generated), total=len(references)):
        assert ref["question"] == gen["question"]
        user_content = str(
            {
                "question": gen["question"],
                "generated_answer": gen["answer"],
                "reference_answer": ref["answer"],
            }
        )[:context_length]

        # Generate response
        response = generate_response(
            llm=evaluator,
            temperature=temperature,
            system_content=system_content,
            assistant_content=assistant_content,
            user_content=user_content,
        )

        # Extract from response: the first non-blank line is the score and the
        # remainder is the reasoning. partition() tolerates a reply that has no
        # reasoning line, and lstrip() tolerates leading blank lines; both would
        # otherwise fail before the score is even parsed.
        score, _, reasoning = response.lstrip().partition("\n")

        # Store result
        result = {
            "question": gen["question"],
            "generated_answer": gen["answer"],
            "reference_answer": ref["answer"],
            "score": float(score),
            "reasoning": reasoning.lstrip("\n"),
            "sources": gen["sources"],
        }
        results.append(result)
        clear_output(wait=True)
        display(JSON(json.dumps(result, indent=2)))

    # Save to file
    evaluator_name = evaluator.split("/")[-1].lower()
    evaluation_fp = Path(ROOT_DIR, EXPERIMENTS_DIR, "evaluations", f"{experiment_name}_{evaluator_name}.json")
    evaluation_fp.parent.mkdir(parents=True, exist_ok=True)
    config = {
        "experiment_name": experiment_name,
        "reference_loc": reference_loc,
        "response_loc": response_loc,
        "evaluator": evaluator,
        "temperature": temperature,
        "max_context_length": max_context_length,
        "system_content": system_content,
        "assistant_content": assistant_content,
    }
    evaluation = {
        "config": config,
        "retrieval_score": get_retrieval_score(references, generated),
        # Samples with a falsy score or an empty reference answer are left out of the mean.
        "quality_score": np.mean([item["score"] for item in results if (item["score"] and item["reference_answer"])]),
        "results": results,
    }
    with open(evaluation_fp, "w") as fp:
        json.dump(evaluation, fp, indent=4)

In [51]:
def run_experiment(
    experiment_name, data_path,
    chunk_size, chunk_overlap, num_chunks,
    embedding_model_name, llm,
    reference_loc, evaluator,
    num_samples=None):
    """Generate responses and evaluate them.

    Runs three steps in order: builds the index with `create_index`, generates
    answers with `generate_responses`, then scores them with
    `evaluate_responses`.

    Args:
        experiment_name: names the response and evaluation files.
        data_path: path to the questions file passed to `generate_responses`.
        chunk_size, chunk_overlap: chunking settings used to build the index.
        num_chunks: number of chunks passed to `generate_responses`.
        embedding_model_name: embedding model for the index and query agent.
        llm: model that generates the answers; must be a key of MAX_CONTEXT_LENGTHS.
        reference_loc: path to the reference answers.
        evaluator: model that scores the answers; must be a key of MAX_CONTEXT_LENGTHS.
        num_samples: limit on the number of questions (None = all).
    """

    # Build index
    create_index(
        embedding_model_name=embedding_model_name,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    
    # Generate responses
    generate_responses(
        experiment_name=experiment_name, 
        data_path=data_path, 
        chunk_size=chunk_size, 
        chunk_overlap=chunk_overlap, 
        num_chunks=num_chunks,
        embedding_model_name=embedding_model_name, 
        llm=llm, 
        temperature=0.0, 
        max_context_length=MAX_CONTEXT_LENGTHS[llm], 
        system_content="Answer the query using the context provided.",
        num_samples=num_samples)

    # Evaluate responses
    evaluation_system_content = """
        Your job is to rate the quality of our generated answer {generated_answer}
        given a query {query} and a reference answer {reference_answer}.
        Your score has to be between 1 and 5.
        You must return your response in a line with only the score.
        Do not return answers in any other format.
        On a separate line provide your reasoning for the score as well.
        """
    # Use the `evaluator` argument rather than the global EVALUATOR so that a
    # caller-supplied evaluator is actually honoured.
    evaluate_responses(
        experiment_name=experiment_name,
        reference_loc=reference_loc, 
        response_loc=str(Path(ROOT_DIR, EXPERIMENTS_DIR, "responses", f"{experiment_name}.json")),
        evaluator=evaluator, 
        temperature=0.0, 
        max_context_length=MAX_CONTEXT_LENGTHS[evaluator],
        system_content=evaluation_system_content,
        num_samples=num_samples)

In [52]:
def print_experiment(experiment_name, evaluator=EVALUATOR):
    """Print the stored retrieval and quality scores of an experiment.

    Args:
        experiment_name: name the experiment was evaluated under.
        evaluator: evaluator model name used for that evaluation.

    Reads Path(ROOT_DIR, EXPERIMENTS_DIR, "evaluations",
    f"{experiment_name}_{evaluator_name}.json").
    """
    # Normalize the evaluator name the same way evaluate_responses does when it
    # writes the file, so names such as "org/Model-Name" resolve correctly.
    evaluator_name = evaluator.split("/")[-1].lower()
    eval_fp = Path(ROOT_DIR, EXPERIMENTS_DIR, "evaluations", f"{experiment_name}_{evaluator_name}.json")
    with open(eval_fp, "r") as fp:
        evaluation = json.load(fp)
    print (experiment_name)
    print ("  retrieval score:", evaluation["retrieval_score"])
    print ("  quality score:", evaluation["quality_score"])
    print ()

### Context

We're first going to test if the additional context we provide is helpful at all. This is to validate that the RAG system is indeed worth the effort.

In [None]:
# Without context
experiment_name = "without-context"
num_chunks = 0
run_experiment(
    experiment_name=experiment_name,
    data_path=DATA_PATH,
    reference_loc=REFERENCE_LOC,
    embedding_model_name="thenlper/gte-base",
    chunk_size=100,
    chunk_overlap=50,
    num_chunks=num_chunks,
    llm="gpt-3.5-turbo",
    evaluator=EVALUATOR,
    num_samples=num_samples,
)

In [356]:
print_experiment(experiment_name=experiment_name)

without-context
  retrieval score: 0.0
  quality score: 3.3



In [357]:
# With context
experiment_name = "with-context"
num_chunks = 5
run_experiment(
    experiment_name=experiment_name,
    data_path=DATA_PATH,
    reference_loc=REFERENCE_LOC,
    embedding_model_name="thenlper/gte-base",
    chunk_size=300,
    chunk_overlap=50,
    num_chunks=num_chunks,
    llm="gpt-3.5-turbo",
    evaluator=EVALUATOR,
    num_samples=num_samples,
)

<IPython.core.display.JSON object>

100%|██████████| 10/10 [00:59<00:00,  5.98s/it]


In [358]:
print_experiment(experiment_name=experiment_name)

with-context
  retrieval score: 0.6
  quality score: 3.85



As we can see, **using context (RAG)** does indeed help in the quality of our answers!

### Chunk size

In [366]:
chunk_sizes = [100, 300, 500, 700]

In [367]:
# Sweep the chunk size while every other setting stays fixed.
for chunk_size in chunk_sizes:
    experiment_name = f"chunk-size-{chunk_size}"
    run_experiment(
        experiment_name=experiment_name,
        data_path=DATA_PATH,
        reference_loc=REFERENCE_LOC,
        embedding_model_name="thenlper/gte-base",
        chunk_size=chunk_size,
        chunk_overlap=50,
        num_chunks=5,
        llm="gpt-3.5-turbo",
        evaluator=EVALUATOR,
        num_samples=num_samples,
    )

<IPython.core.display.JSON object>

100%|██████████| 10/10 [01:14<00:00,  7.47s/it]


In [368]:
# Print the stored scores of each chunk-size experiment, in sweep order.
for chunk_size in chunk_sizes:
    experiment_name = f"chunk-size-{chunk_size}"
    print_experiment(experiment_name=experiment_name)

chunk-size-100
  retrieval score: 0.6
  quality score: 3.8

chunk-size-300
  retrieval score: 0.6
  quality score: 3.9

chunk-size-500
  retrieval score: 0.8
  quality score: 4.45

chunk-size-700
  retrieval score: 0.7
  quality score: 4.0



It seems that a larger chunk size does help, but it tapers off around the 600-character mark (too much context might be too noisy).

**Note**: If we were to use larger chunk sizes (ours is based on characters), keep in mind that [most](https://huggingface.co/spaces/mteb/leaderboard) open source embedding models have a maximum sequence length of 512 sub-word tokens. This means that if our chunk contains more than 512 sub-word tokens, the embedding wouldn't account for it anyway (unless we finetune our embedding model to have longer sequence lengths).

In [54]:
# Best-scoring chunk size in the recorded sweep above (measured in characters).
CHUNK_SIZE = 500
# Same overlap that was used for every run of the chunk-size sweep.
CHUNK_OVERLAP = 50

### Number of chunks

**Note**: Keep in mind that the `chunk_size` you chose multiplied by the `num_chunks` below must fit inside the LLM's context length. We're experimenting with the chunk size and number of chunks as if they were independent variables, but they are heavily related, especially since all of our LLMs have a finite maximum context length. So ideally, we would tune for a combination of `chunk_size` * `num_chunks`.

In [370]:
num_chunks_list = [1, 3, 5, 7]

In [371]:
# Sweep the number of retrieved chunks with the chunking settings chosen above.
for num_chunks in num_chunks_list:
    experiment_name = f"num-chunks-{num_chunks}"
    run_experiment(
        experiment_name=experiment_name,
        data_path=DATA_PATH,
        reference_loc=REFERENCE_LOC,
        embedding_model_name="thenlper/gte-base",
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
        num_chunks=num_chunks,
        llm="gpt-3.5-turbo",
        evaluator=EVALUATOR,
        num_samples=num_samples,
    )

<IPython.core.display.JSON object>

100%|██████████| 10/10 [00:56<00:00,  5.65s/it]


In [372]:
# Print the stored scores of each num-chunks experiment, in sweep order.
for num_chunks in num_chunks_list:
    experiment_name=f"num-chunks-{num_chunks}"
    print_experiment(experiment_name=experiment_name)

num-chunks-1
  retrieval score: 0.3
  quality score: 4.25

num-chunks-3
  retrieval score: 0.6
  quality score: 4.25

num-chunks-5
  retrieval score: 0.8
  quality score: 4.25

num-chunks-7
  retrieval score: 0.8
  quality score: 4.65



Increasing our number of chunks improves our retrieval and quality scores. We had to stop testing at 6 chunks since our `chunk_size` is 600 tokens and `Llama-2-70b`'s maximum context length is 4096 tokens (we also have to account for the system, assistant and user content to our LLM). This is a major reason to invest in extending context size via RoPE scaling (rotary position embeddings), etc. But it also seems that the benefit of increasing the number of chunks is starting to taper off.

In [55]:
# Largest value tried in the sweep above; it had the highest recorded quality score.
NUM_CHUNKS = 7

### Embedding models

So far, we've used [`thenlper/gte-base`](https://huggingface.co/thenlper/gte-base) as our embedding model because it's a relatively small (0.22 GB) and performant option. But now, let's explore other popular options such as the current leader on the [MTEB leaderboard](https://huggingface.co/spaces/mteb/leaderboard), [`BAAI/bge-large-en`](https://huggingface.co/BAAI/bge-large-en) (1.34 GB), and OpenAI's [`text-embedding-ada-002`](https://openai.com/blog/new-and-improved-embedding-model).

In [374]:
embedding_model_names = ["thenlper/gte-base", "BAAI/bge-large-en", "text-embedding-ada-002"]

In [375]:
# Compare embedding models; the experiment is named after the model's short name.
for embedding_model_name in embedding_model_names:
    experiment_name = embedding_model_name.split("/")[-1]
    run_experiment(
        experiment_name=experiment_name,
        data_path=DATA_PATH,
        reference_loc=REFERENCE_LOC,
        embedding_model_name=embedding_model_name,
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
        num_chunks=NUM_CHUNKS,
        llm="gpt-3.5-turbo",
        evaluator=EVALUATOR,
        num_samples=num_samples,
    )

<IPython.core.display.JSON object>

100%|██████████| 10/10 [01:07<00:00,  6.72s/it]


In [376]:
# Print the stored scores of each embedding-model experiment.
for embedding_model_name in embedding_model_names:
    experiment_name = embedding_model_name.split("/")[-1]
    print_experiment(experiment_name=experiment_name)

gte-base
  retrieval score: 0.8
  quality score: 4.4

bge-large-en
  retrieval score: 0.6
  quality score: 4.1

text-embedding-ada-002
  retrieval score: 0.7
  quality score: 3.75



This is an interesting outcome because the #1 (`BAAI/bge-large-en`) on the current leaderboard isn't necessarily the best for our specific task. Using the smaller `thenlper/gte-base` produced the best retrieval and quality scores in our experiments.

In [56]:
# Model with the best retrieval and quality scores in the recorded comparison above.
EMBEDDING_MODEL_NAME = "thenlper/gte-base"

### OSS vs. closed LLMs

In [57]:
# Closed models first, then the open-source Llama-2 chat models by size.
llms = [
    "gpt-3.5-turbo",
    "gpt-4",
    "meta-llama/Llama-2-7b-chat-hf",
    "meta-llama/Llama-2-13b-chat-hf",
    "meta-llama/Llama-2-70b-chat-hf",
]

In [60]:
# Compare answer-generating LLMs; the experiment is named after the lower-cased short model name.
for llm in llms:
    experiment_name = llm.split("/")[-1].lower()
    run_experiment(
        experiment_name=experiment_name,
        data_path=DATA_PATH,
        reference_loc=REFERENCE_LOC,
        embedding_model_name=EMBEDDING_MODEL_NAME,
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
        num_chunks=NUM_CHUNKS,
        llm=llm,
        evaluator=EVALUATOR,
        num_samples=num_samples,
    )

<IPython.core.display.JSON object>

100%|██████████| 10/10 [02:24<00:00, 14.43s/it]


In [64]:
# Print the stored scores of each LLM experiment.
for llm in llms:
    experiment_name = llm.split("/")[-1].lower()
    print_experiment(experiment_name=experiment_name)

gpt-3.5-turbo
  retrieval score: 0.8
  quality score: 4.5

gpt-4
  retrieval score: 0.8
  quality score: 4.5

llama-2-7b-chat-hf
  retrieval score: 0.8
  quality score: 3.8

llama-2-13b-chat-hf
  retrieval score: 0.8
  quality score: 3.35

llama-2-70b-chat-hf
  retrieval score: 0.8
  quality score: 4.15



**Note**: Some of our LLMs have much larger context lengths, ex. `gpt-4` is 8192 and `gpt-3.5-turbo-16k` is 16384. We could increase the number of chunks that we use for these since we saw that increasing `num_chunks` continued to improve the retrieval and quality scores. However, we will keep this value fixed for now since the performance started to taper off anyway and so we can compare these performances under the exact same configurations.

In [65]:
# Highest quality score among the Llama-2 models in the recorded comparison above.
LLM = "meta-llama/Llama-2-70b-chat-hf"

## Cost analysis

**Note**: Our `Llama-2` models are priced at $1/M tokens with [Anyscale Endpoints](https://endpoints.anyscale.com/).

In [None]:
# Pricing details: price per unit of prompt text and of sampled (generated) text,
# keyed by the lower-cased short model name used for experiment names.
pricing = {
    "gpt-3.5-turbo": {"prompt": 2e-6, "sampled": 2e-6},
    "gpt-4": {"prompt": 3e-5, "sampled": 6e-5},
    "llama-2-7b-chat-hf": {"prompt": 1e-6, "sampled": 1e-6},
    "llama-2-13b-chat-hf": {"prompt": 1e-6, "sampled": 1e-6},
    "llama-2-70b-chat-hf": {"prompt": 1e-6, "sampled": 1e-6},
}

In [None]:
def cost_analysis(llm):
    """Print an estimated cost summary for one LLM's stored evaluation.

    Args:
        llm: model name; its lower-cased last path segment must be a key of
            `pricing` and the name of an experiment evaluated with EVALUATOR.

    Sizes are measured with `len`, i.e. in characters: the prompt size per
    sample is the question length plus CHUNK_SIZE * NUM_CHUNKS, and the sampled
    size is the length of the generated answer.
    """
    experiment_name = f"{llm.split('/')[-1].lower()}"
    # Normalize the evaluator name the same way evaluate_responses does when it
    # writes the evaluation file.
    evaluator_name = EVALUATOR.split("/")[-1].lower()
    eval_fp = Path(ROOT_DIR, EXPERIMENTS_DIR, "evaluations", f"{experiment_name}_{evaluator_name}.json")
    with open(eval_fp, "r") as fp:
        evaluation = json.load(fp)
    results = evaluation["results"]
    num_samples = len(results)
    prompt_size = sum(len(result["question"]) + (CHUNK_SIZE * NUM_CHUNKS) for result in results)
    sampled_size = sum(len(result["generated_answer"]) for result in results)
    total_cost = pricing[experiment_name]["prompt"] * prompt_size + pricing[experiment_name]["sampled"] * sampled_size
    avg_cost = total_cost / num_samples
    
    print (llm)
    print (f"  avg prompt size: {int(prompt_size/num_samples)}")
    print (f"  avg sampled size: {int(sampled_size/num_samples)}")
    print (f"  total cost: ${total_cost:.2f}")
    print (f"  avg cost: ${avg_cost:.2f}")
    print ()

In [None]:
# Print the estimated cost summary for every LLM that was compared above.
for llm in llms:
    cost_analysis(llm=llm)

## Routing queries

We can close the gap in performance between open source and proprietary models by routing queries to the right model according to the hardness of the question. In this section, we do this by training a classifier on the score of the models given by the evaluation. We have collected a large dataset `routing-questions.jsonl` with queries to train the classifier. Let's first generate the responses for these questions:

In [None]:
# Use a dedicated name so the DATA_PATH used by the experiment cells above is
# not overwritten (re-running them afterwards would otherwise read this file).
ROUTING_DATA_PATH = str(Path(ROOT_DIR, "datasets", "routing-questions.jsonl"))
NUM_SAMPLES = 10 # Change this to None to run on all samples

# Experiment-name suffix -> model that generates the answers
routing_experiments = {
    "gpt-4": "gpt-4",
    "llama-2-70b": "meta-llama/Llama-2-70b-chat-hf"
}

# Reuse the settings chosen in the experiments above instead of repeating
# their literal values (500 / 50 / 7 / "thenlper/gte-base").
for experiment_name, llm in routing_experiments.items():
    generate_responses(
        experiment_name="routing-" + experiment_name, 
        data_path=ROUTING_DATA_PATH, 
        chunk_size=CHUNK_SIZE, 
        chunk_overlap=CHUNK_OVERLAP, 
        num_chunks=NUM_CHUNKS,
        embedding_model_name=EMBEDDING_MODEL_NAME, 
        llm=llm, 
        temperature=0.0, 
        max_context_length=MAX_CONTEXT_LENGTHS[llm], 
        system_content="Answer the query using the context provided.",
        num_samples=NUM_SAMPLES)

Next, we will run the evaluator on these responses:

In [45]:
import tempfile

# Locations of the responses generated in the previous step
routing_llama_2_70b_responses = str(Path(ROOT_DIR, EXPERIMENTS_DIR, "responses", "routing-llama-2-70b.json"))
routing_gpt_4_responses = str(Path(ROOT_DIR, EXPERIMENTS_DIR, "responses", "routing-gpt-4.json"))

# Adapt format for reference answers: the gpt-4 results act as references.
# evaluate_responses reads a "source" field from every reference; an empty
# value makes get_retrieval_score count the sample as matched.
with open(routing_gpt_4_responses) as f:
    records = json.load(f)["results"]
for record in records:
    record["source"] = ""

# Evaluate responses
evaluation_system_content = """
    Your job is to rate the quality of our generated answer {generated_answer}
    given a query {query} and a reference answer {reference_answer}.
    Your score has to be between 1 and 5.
    You must return your response in a line with only the score.
    Do not return answers in any other format.
    On a separate line provide your reasoning for the score as well.
    """

# The context manager closes (and thereby deletes) the temporary reference
# file once the evaluation has read it, instead of leaving it open.
with tempfile.NamedTemporaryFile(mode="w") as routing_gpt_4_references:
    json.dump(records, routing_gpt_4_references)
    routing_gpt_4_references.flush()  # make the content visible to the reader below
    evaluate_responses(
        experiment_name="routing",
        reference_loc=routing_gpt_4_references.name, 
        response_loc=routing_llama_2_70b_responses,
        evaluator="gpt-4", 
        temperature=0.0, 
        max_context_length=MAX_CONTEXT_LENGTHS["gpt-4"],
        system_content=evaluation_system_content,
        num_samples=NUM_SAMPLES)

<IPython.core.display.JSON object>

100%|██████████| 10/10 [01:04<00:00,  6.40s/it]


Now we use the data we collected to train a routing classifier.

In [46]:
from sklearn.model_selection import train_test_split

# Load the per-question results written by the routing evaluation above.
with open(Path(ROOT_DIR, EXPERIMENTS_DIR, "evaluations", "routing_gpt-4.json")) as f:
    records = json.load(f)["results"]

# Features are the raw questions; the label is True when the evaluated answer scored at least 4.
X = [record["question"] for record in records]
y = [record["score"] >= 4 for record in records]
# Fix the seed so the split, and the metrics reported below, are reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [51]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English

nlp = spacy.load('en_core_web_md')

def spacy_tokenizer(sentence):
    """Tokenize a sentence into lower-cased lemmas without stop words.

    Args:
        sentence: text to tokenize with the loaded spaCy pipeline `nlp`.

    Returns:
        List of lemma strings, lower-cased and stripped of surrounding
        whitespace, excluding English stop words and empty strings.
    """
    lemmas = (token.lemma_.lower().strip() for token in nlp(sentence))
    # Drop lemmas that are empty after strip() so that "" never becomes a
    # vocabulary entry, and use the imported STOP_WORDS directly.
    return [lemma for lemma in lemmas if lemma and lemma not in STOP_WORDS]

In [52]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

# Bag-of-words features: unigram counts over the tokens from spacy_tokenizer
bow_vector = CountVectorizer(tokenizer=spacy_tokenizer, ngram_range=(1, 1))
classifier = LogisticRegression()

# Create pipeline using Bag of Words
pipe = Pipeline(steps=[
    ("vectorizer", bow_vector),
    ("classifier", classifier),
])

# model generation
pipe.fit(X_train, y_train)

Let's check the performance on the test set:

In [55]:
from sklearn import metrics

# Predict on the held-out questions. The labels were built as `score >= 4`,
# so a True prediction means the answer is expected to score at least 4.
predicted = pipe.predict(X_test)

print("Total number of samples: ", len(predicted))
# sum() over the predictions counts the True ones.
print("Number of samples for which the OSS model can be used: ", sum(predicted))
# Compare the predictions with the held-out labels.
print("Logistic Regression Accuracy:",metrics.accuracy_score(y_test, predicted))
print("Logistic Regression Precision:",metrics.precision_score(y_test, predicted))
print("Logistic Regression Recall:",metrics.recall_score(y_test, predicted))

Total number of samples:  3
Number of samples for which the OSS model can be used:  3
Logistic Regression Accuracy: 0.6666666666666666
Logistic Regression Precision: 0.6666666666666666
Logistic Regression Recall: 1.0


## Next steps

In progress:
- connect with serving scripts
- hybrid routing

LlamaIndex:
- Generate synthetic datasets (query, source, answer)
- add context to embeddings
- better chunking logic
- fine-tune embedding model
- fine-tune base LLM (gpt-3.5 and OSS)

Later:
- additional data sources
- longer context lengths (RoPE)
- keyword search with semantic (embedding) search
- reranking with LLM after results from (faster) embedding search