# Establishing a baseline for retrieval

Experiment with a small dataset first to establish baseline retrieval metrics, then move towards fine-tuning the index configuration.

In [4]:
import warnings
from eval import Eval
import os

# Suppress every Python warning for the rest of the kernel session so the
# cell outputs below stay readable.
warnings.filterwarnings("ignore")

# Inputs shared by the evaluation runs in this notebook.
raw_chunk_path = "process_data/data/2008-mazda3-chunks.json" # Ex: ["chunk1", "chunk2", ...]
labeled_chunks_path = "process_data/data/mazda-labeled.json" # Ex: [{"query": "Some query?", "relevant_item_ids": ["86"]}]

# Baseline configuration: small sentence-transformers model, "flat" index
# algorithm, float32 vectors, top-4 retrieval, no threshold search.
# NOTE(review): judging by the logged "Indexing data..." output, constructing
# Eval already embeds and indexes the raw chunks — confirm in eval.py.
e = Eval(
    model_provider="hf",
    model_str="sentence-transformers/all-MiniLM-L6-v2",
    embedding_dim=384,
    raw_data_path=raw_chunk_path,
    labeled_data_path=labeled_chunks_path,
    input_data_type="json",
    vector_data_type="float32",
    algorithm="flat",
    ret_k=4,
    find_threshold=False,
)

14:17:52 sentence_transformers.SentenceTransformer INFO   Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2
14:17:53 sentence_transformers.SentenceTransformer INFO   Use pytorch device_name: mps


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

14:17:55 root INFO   Indexing data...
14:17:55 root INFO   Data indexed. self.total_indexing_time=128.7949981689453s


In [5]:
import nest_asyncio

# Jupyter already runs an asyncio event loop; nest_asyncio patches asyncio so
# that a nested loop can be started from inside it.
# NOTE(review): presumably calc_metrics drives asyncio internally — confirm in eval.py.
nest_asyncio.apply()

# Run the baseline evaluation. The log output reports an overall F1 at k for
# retrieval, keyed by the run's test_id.
e.calc_metrics()

14:18:03 sentence_transformers.SentenceTransformer INFO   Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2
14:18:04 sentence_transformers.SentenceTransformer INFO   Use pytorch device_name: mps


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

14:18:05 root INFO   Calculating retrieval metrics for test_id: b7a9e96e-37e5-4f03-8480-7ebb3e8c3d8b
14:18:05 root INFO   Overall f1 at 4 for retrieval: 0.5936507936507937


## Retrieve metrics from eval run

In [6]:
from redis import Redis

# Connect to the local Redis instance (database 0).
# NOTE(review): assumes this is the same Redis the Eval run wrote to — confirm.
client = Redis.from_url("redis://localhost:6379/0")

# Read the JSON document stored under "eval:<test_id>" for the baseline run
# and display its retrieval metrics as the cell output.
res = client.json().get(f"eval:{e.settings.test_id}")
res["metrics"]["retrieval"]

{'precision_at_k': 0.5,
 'recall_at_k': 0.8472222222222222,
 'f1_at_k': 0.5936507936507937}

# Try a different embedding model and `ret_k`

In [47]:
# Second run: same data, "flat" algorithm and float32 vectors as the baseline,
# but with the intfloat/e5-large-v2 model (embedding_dim=1024) and ret_k=6
# instead of 4.
e2 = Eval(
    model_provider="hf",
    model_str="intfloat/e5-large-v2",
    embedding_dim=1024,
    raw_data_path=raw_chunk_path,
    labeled_data_path=labeled_chunks_path,
    input_data_type="json",
    vector_data_type="float32",
    algorithm="flat",
    ret_k=6,
    find_threshold=False,
)

10:39:47 sentence_transformers.SentenceTransformer INFO   Load pretrained SentenceTransformer: intfloat/e5-large-v2


10:39:49 sentence_transformers.SentenceTransformer INFO   Use pytorch device_name: mps


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

10:40:04 root INFO   Indexing data...
10:40:04 root INFO   Data indexed. self.total_indexing_time=89.56700134277344s


In [48]:
# Run the evaluation for the e5-large-v2 configuration; the log output reports
# the overall retrieval F1 at k=6.
e2.calc_metrics()

10:40:08 sentence_transformers.SentenceTransformer INFO   Load pretrained SentenceTransformer: intfloat/e5-large-v2
10:40:09 sentence_transformers.SentenceTransformer INFO   Use pytorch device_name: mps


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

10:40:10 root INFO   Calculating retrieval metrics for test_id: 2e12986d-0b4a-47b1-ac77-03ac750025f2
10:40:10 root INFO   Overall f1 at 6 for retrieval: 0.4332671957671958


In [49]:
# Read the stored results for the e5-large-v2 run (key "eval:<test_id>") and
# display its retrieval metrics for comparison with the baseline.
res = client.json().get(f"eval:{e2.settings.test_id}")
res["metrics"]["retrieval"]

{'precision_at_k': 0.3194444444444445,
 'recall_at_k': 0.8125,
 'f1_at_k': 0.4332671957671958}

# Solution approaches

### Create propositions from chunks

In [22]:
import ollama # could be whatever model you want to use


def call_ollama(prompt, model='llama3.2'):
    """Send a single-turn user prompt to an Ollama chat model and return the reply text.

    Args:
        prompt: Text sent as the only ``user`` message of the conversation.
        model: Name of the Ollama model to query. Defaults to ``'llama3.2'``,
            the model this notebook was originally run with, so existing
            single-argument calls behave as before.

    Returns:
        The ``content`` of the ``message`` in the chat response.
    """
    response = ollama.chat(
        model=model,
        messages=[{'role': 'user', 'content': prompt}],
    )
    return response['message']['content']


In [54]:
async def create_dense_props(chunk):
    """Create a dense, proposition-style representation of raw text content.

    Builds an instruction prompt around ``chunk`` and sends it to the LLM via
    ``call_ollama``.

    Args:
        chunk: Raw text to be cleaned and split into propositions.

    Returns:
        Whatever ``call_ollama`` returns for the prompt (the model's reply text).
    """
    # Imported locally because the notebook-level `import asyncio` lives in a
    # later cell than this definition.
    import asyncio

    # The prompt here should be HEAVILY customized for your specific use case
    prompt = f"""
    You are a helpful PDF extractor tool that takes an input and returns an output without preamble.

    Clean and summarize the raw content into clear and simple propositions,
    to make them more suitable for retrieval from a vector index.

    Consider the following rules:
    1. Split divergent topics into separate propositions.
    2. For any named entity that is accompanied by additional descriptive information,
    separate this information into its own distinct proposition.
    3. Decontextualize the proposition by adding necessary modifier to nouns or
    entire sentences and replacing pronouns (e.g., "it", "he", "she", "they", "this", "that")
    with the full name of the entities they refer to.
    4. Return only the resulting string do not include any other text in output.

    raw_content: {chunk}
    """

    # call_ollama is a blocking call. Invoking it directly inside the coroutine
    # would stall the event loop and make asyncio.gather() process chunks one
    # after another. Running it in the default executor lets several requests
    # be in flight at once.
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(None, call_ollama, prompt)

In [55]:
# Sample input for a smoke test of the proposition prompt: front-matter text
# from the Mazda3 owner's manual (page headers, welcome note, notices).
chunk="Mazda3_8Y64-EA-08A_Edition1 Page1 Tuesday, November 27 2007 9:0 AM\n\nForm No.8Y64-EA-08A\n\nBlack plate (1,1)\n\nMazda3_8Y64-EA-08A_Edition1 Page2 Tuesday, November 27 2007 9:0 AM\n\nForm No.8Y64-EA-08A\n\nBlack plate (2,1)\n\nMazda3_8Y64-EA-08A_Edition1 Page3 Tuesday, November 27 2007 9:0 AM\n\nBlack plate (3,1)\n\nA Word to Mazda Owners\n\nThank you for choosing a Mazda. We at Mazda design and build vehicles with complete customer satisfaction in mind.\n\nTo help ensure enjoyable and trouble-free operation of your Mazda, read this manual carefully and follow its recommendations.\n\nAn Authorized Mazda Dealer knows your vehicle best. So when maintenance or service is necessary, that's the place to go.\n\nOur nationwide network of Mazda professionals is dedicated to providing you with the best possible service.\n\nWe assure you that all of us at Mazda have an ongoing interest in your motoring pleasure and in your full satisfaction with your Mazda product.\n\nMazda Motor Corporation HIROSHIMA, JAPAN\n\nImportant Notes About This Manual Keep this manual in the glove box as a handy reference for the safe and enjoyable use of your Mazda. Should you resell the vehicle, leave this manual with it for the next owner.\n\nAll specifications and descriptions are accurate at the time of printing. Because improvement is a constant goal at Mazda, we reserve the right to make changes in specifications at any time without notice and without obligation.\n\nEvent Data Recorder This vehicle is equipped with an event data recorder. In the event of a crash, this device records data related to vehicle dynamics and safety systems for a short period of time. 
These data can help provide a better understanding of the circumstances in which crashes and injuries occur and lead to the designing of safer vehicles.\n\nAir Conditioning and the Environment Your Mazda's genuine air conditioner is filled with HFC134a (R134a), a refrigerant that has been found not to damage the earth's ozone layer. If the air conditioner does not operate properly, consult an Authorized Mazda Dealer.\n\nPerchlorate Certain components of this vehicle such as [air bag modules, seat belt pretensioners, lithium batteries, ...] may contain Perchlorate Material\u2013 Special handling may apply for service or vehicle end of life disposal. See www.dtsc.ca.gov/hazardouswaste/perchlorate.\n\nPlease be aware that this manual applies to all models, equipment and options. As a result, you may find some explanations for equipment not installed on your vehicle."
# Top-level await (supported by the notebook kernel): run the prompt on the
# sample chunk and display the model's reply as the cell output.
res = await create_dense_props(chunk)
res

10:49:48 httpx INFO   HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"


"Mazda3_8Y64-EA-08A_Edition1 Page1 Tuesday, November 27 2007 9:0 AM\nForm No.8Y64-EA-08A\nBlack plate (1,1)\n\nMazda3_8Y64-EA-08A_Edition1 Page2 Tuesday, November 27 2007 9:0 AM\nForm No.8Y64-EA-08A\nBlack plate (2,1)\n\nMazda3_8Y64-EA-08A_Edition1 Page3 Tuesday, November 27 2007 9:0 AM\nBlack plate (3,1)\nMazda Motor Corporation HIROSHIMA JAPAN\n\nA Word to Mazda Owners\nWe design vehicles with complete customer satisfaction in mind.\nRead this manual carefully and follow its recommendations.\n\nAn Authorized Mazda Dealer knows your vehicle best.\nOur nationwide network of Mazda professionals is dedicated to providing the best possible service.\nWe have an ongoing interest in your motoring pleasure and full satisfaction with your Mazda product.\n\nImportant Notes About This Manual\nKeep this manual in the glove box as a reference for safe use.\nLeave this manual with the vehicle if you resell it.\n\nSpecifications and descriptions are accurate at time of printing.\nMazda may make chan

In [56]:
import json
# Load the raw manual chunks (a JSON list of strings).
# JSON text is UTF-8, so decode it explicitly rather than relying on the
# platform default encoding, which can mis-decode non-ASCII characters.
with open("process_data/data/2008-mazda3-chunks.json", "r", encoding="utf-8") as f:
    chunks = json.load(f)

In [60]:
import asyncio
tasks = [create_dense_props(chunk) for chunk in chunks[:5]]
props = await asyncio.gather(*tasks)

10:54:13 httpx INFO   HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
10:54:17 httpx INFO   HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
10:54:19 httpx INFO   HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
10:54:22 httpx INFO   HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
10:54:23 httpx INFO   HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"


In [61]:
# Display the propositions generated for the first five chunks.
props

['Form No. 8Y64-EA-08A\nBlack plate (1,1)\n\nForm No. 8Y64-EA-08A\nBlack plate (2,1)\n\nForm No. 8Y64-EA-08A\nBlack plate (3,1)\n\nWe thank you for choosing a Mazda.\n\nWe design and build vehicles with complete customer satisfaction in mind.\nTo ensure enjoyable operation, read this manual carefully and follow its recommendations.\nAn Authorized Mazda Dealer knows your vehicle best.\nOur nationwide network of Mazda professionals is dedicated to providing the best service.\nMazda Motor Corporation - HIROSHIMA, JAPAN\n\nKeep this manual in the glove box for safe use.\nSpecifications may change without notice due to constant improvement goals at Mazda.\nThis vehicle has an event data recorder that records data related to vehicle dynamics and safety systems.\n\nThe air conditioner uses HFC134a (R134a), a refrigerant with minimal ozone layer damage.\nComponents such as air bag modules or seat belt pretensioners may contain Perchlorate Material – special handling may apply.',
 'Interior par

#### Next steps: load the new chunks into the index and label them

# Query re-writing

Implement a query-rewriting layer to improve potential matches between user queries and the indexed chunks.

In [65]:
# Load the labeled queries (a JSON list of {"query", "relevant_item_ids"} objects).
# JSON text is UTF-8, so decode it explicitly rather than relying on the
# platform default encoding, which can mis-decode non-ASCII characters.
with open("process_data/data/mazda-labeled.json", "r", encoding="utf-8") as f:
    labeled_data = json.load(f)

In [74]:
def rewrite_query(query):
    """Ask the LLM to rewrite a user question for vector retrieval.

    The question is embedded in a fixed instruction prompt and sent through
    ``call_ollama``; the model's reply is returned unchanged.
    """
    return call_ollama(f"""
    You are query rewriting helper. Provided a user question re-write it such that is not vague and optimized for vector retrieval against a car manual pdf.

    Return only the resulting rewritten query with no preamble.

    query to rewrite: {query}
    """)

In [75]:
# Rewrite every labeled query with the LLM while keeping its original
# relevance labels, so the rewritten set can be scored against the same chunks.
rewritten_labeled = [
    {
        "query": rewrite_query(item["query"]),
        "relevant_item_ids": item["relevant_item_ids"],
    }
    for item in labeled_data
]

11:58:31 httpx INFO   HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
11:58:31 httpx INFO   HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
11:58:31 httpx INFO   HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
11:58:32 httpx INFO   HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
11:58:32 httpx INFO   HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
11:58:32 httpx INFO   HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
11:58:33 httpx INFO   HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
11:58:33 httpx INFO   HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
11:58:33 httpx INFO   HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
11:58:34 httpx INFO   HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
11:58:34 httpx INFO   HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"

In [76]:
# Persist the rewritten queries so an evaluation run can load them from disk.
with open("process_data/data/mazda-labeled-rewritten.json", "w") as out_file:
    out_file.write(json.dumps(rewritten_labeled))

In [77]:
raw_chunk_path = "process_data/data/2008-mazda3-chunks.json" # Ex: ["chunk1", "chunk2", ...]
# Keep the rewritten labels under their own name instead of rebinding
# `labeled_chunks_path`. Cells that read `labeled_chunks_path` (e.g. the `e2`
# run) would otherwise silently evaluate against the rewritten queries when
# re-run after this cell.
rewritten_labeled_path = "process_data/data/mazda-labeled-rewritten.json" # Ex: [{"query": "Some query?", "relevant_item_ids": ["86"]}]

# Same configuration as the baseline run (all-MiniLM-L6-v2, "flat", ret_k=4);
# only the labeled queries differ, so any metric change is attributable to the
# query rewriting.
e3 = Eval(
    model_provider="hf",
    model_str="sentence-transformers/all-MiniLM-L6-v2",
    embedding_dim=384,
    raw_data_path=raw_chunk_path,
    labeled_data_path=rewritten_labeled_path,
    input_data_type="json",
    vector_data_type="float32",
    algorithm="flat",
    ret_k=4,
    find_threshold=False,
)

e3.calc_metrics()

11:59:00 sentence_transformers.SentenceTransformer INFO   Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2
11:59:01 sentence_transformers.SentenceTransformer INFO   Use pytorch device_name: mps


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

11:59:03 root INFO   Indexing data...
11:59:03 root INFO   Data indexed. self.total_indexing_time=111.71800231933594s
11:59:03 sentence_transformers.SentenceTransformer INFO   Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2
11:59:04 sentence_transformers.SentenceTransformer INFO   Use pytorch device_name: mps


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

11:59:05 root INFO   Calculating retrieval metrics for test_id: f042d180-85ea-4bdd-a047-bdb758a44430
11:59:05 root INFO   Overall f1 at 4 for retrieval: 0.5450396825396825


In [78]:
# Read the stored results for the rewritten-query run (key "eval:<test_id>")
# and display its retrieval metrics for comparison with the baseline.
res = client.json().get(f"eval:{e3.settings.test_id}")
res["metrics"]["retrieval"]

{'precision_at_k': 0.4583333333333333,
 'recall_at_k': 0.7847222222222223,
 'f1_at_k': 0.5450396825396825}

# Fine-tune the retrieval config with Bayesian optimization

In [36]:
from functools import partial
import numpy as np
import optuna
import yaml
from eval import Eval
from models import StudyConfig
from optimize import load_config, calc_baseline, objective
import nest_asyncio

# Jupyter already runs an asyncio event loop; nest_asyncio patches asyncio so
# that a nested loop can be started from inside it.
nest_asyncio.apply()

config_file = "ex_study_config.yaml"
study_config = load_config(config_file)

# calculate baselines in order to normalize study
baseline_sm, baseline_lg = calc_baseline(study_config)

# Seed the sampler so the sequence of suggested configurations can be
# reproduced across kernel restarts. Optuna does not guarantee reproducibility
# when trials run in parallel (n_jobs > 1).
SAMPLER_SEED = 42

study = optuna.create_study(
    study_name="test",
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=SAMPLER_SEED),
    pruner=optuna.pruners.MedianPruner(),
)

# Bind the fixed arguments so Optuna only has to supply the `trial`.
obj = partial(
    objective,
    study_config=study_config,
    baseline_sm=baseline_sm,
    baseline_lg=baseline_lg,
)
study.optimize(obj, n_trials=study_config.n_trials, n_jobs=study_config.n_jobs)
print("Completed Bayesian optimization...")

best_trial = study.best_trial
print(f"Best Configuration: {best_trial.number}: {best_trial.params}:")
print(f"Best Score: {best_trial.values}")

10:32:42 sentence_transformers.SentenceTransformer INFO   Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2
10:32:44 sentence_transformers.SentenceTransformer INFO   Use pytorch device_name: mps


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

10:32:45 root INFO   Indexing data...
10:32:45 root INFO   Data indexed. self.total_indexing_time=71.54100036621094s
10:32:45 sentence_transformers.SentenceTransformer INFO   Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2
10:32:47 sentence_transformers.SentenceTransformer INFO   Use pytorch device_name: mps


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

10:32:47 root INFO   Calculating retrieval metrics for test_id: 11c40f7b-cc46-4620-b537-7726161025d8
10:32:47 root INFO   Overall f1 at 6 for retrieval: 0.5448412698412698
10:32:47 sentence_transformers.SentenceTransformer INFO   Load pretrained SentenceTransformer: intfloat/e5-large-v2
10:32:49 sentence_transformers.SentenceTransformer INFO   Use pytorch device_name: mps


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

10:33:04 root INFO   Indexing data...
10:33:04 root INFO   Data indexed. self.total_indexing_time=91.80500030517578s
10:33:04 sentence_transformers.SentenceTransformer INFO   Load pretrained SentenceTransformer: intfloat/e5-large-v2
10:33:06 sentence_transformers.SentenceTransformer INFO   Use pytorch device_name: mps


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

10:33:07 root INFO   Calculating retrieval metrics for test_id: 62563ead-b8fd-48ea-ae1a-df932cc55529
10:33:07 root INFO   Overall f1 at 6 for retrieval: 0.4332671957671958


[I 2024-10-30 10:33:07,693] A new study created in memory with name: test




 Running for: 
 model_str: intfloat/e5-large-v2 
 ef_runtime: 50 
 ef_construction: 186 
 m: 37 


10:33:07 sentence_transformers.SentenceTransformer INFO   Load pretrained SentenceTransformer: intfloat/e5-large-v2
10:33:09 sentence_transformers.SentenceTransformer INFO   Use pytorch device_name: mps


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

10:33:24 root INFO   Indexing data...
10:33:24 root INFO   Data indexed. self.total_indexing_time=239.02000427246094s
10:33:24 sentence_transformers.SentenceTransformer INFO   Load pretrained SentenceTransformer: intfloat/e5-large-v2
10:33:26 sentence_transformers.SentenceTransformer INFO   Use pytorch device_name: mps


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

10:33:27 root INFO   Calculating retrieval metrics for test_id: 348f9897-fb58-496c-9bc3-cb06124e0fbd
10:33:27 root INFO   Overall f1 at 9 for retrieval: 0.34572649572649566


[I 2024-10-30 10:33:27,116] Trial 0 finished with value: -1.6528976003051348 and parameters: {'model_info': {'provider': 'hf', 'model': 'intfloat/e5-large-v2', 'dim': 1024}, 'algorithm': 'hnsw', 'var_dtype': 'float16', 'ret_k': 9, 'ef_runtime': 50, 'ef_construction': 186, 'm': 37}. Best is trial 0 with value: -1.6528976003051348.


Metrics: [0.34572649572649566, -1.0, -0.9986240960316305]


 Running for: 
 model_str: sentence-transformers/all-MiniLM-L6-v2 
 algorithm: flat
10:33:27 sentence_transformers.SentenceTransformer INFO   Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2
10:33:27 sentence_transformers.SentenceTransformer INFO   Use pytorch device_name: mps


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

10:33:28 root INFO   Indexing data...
10:33:28 root INFO   Data indexed. self.total_indexing_time=102.87000274658203s
10:33:29 sentence_transformers.SentenceTransformer INFO   Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2
10:33:30 sentence_transformers.SentenceTransformer INFO   Use pytorch device_name: mps


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

10:33:30 root INFO   Calculating retrieval metrics for test_id: 02817277-935b-4362-942b-49b09386a8a2
10:33:30 root INFO   Overall f1 at 8 for retrieval: 0.4428090428090428


[I 2024-10-30 10:33:30,850] Trial 1 finished with value: -0.5578344774724163 and parameters: {'model_info': {'provider': 'hf', 'model': 'sentence-transformers/all-MiniLM-L6-v2', 'dim': 384}, 'algorithm': 'flat', 'var_dtype': 'float16', 'ret_k': 8}. Best is trial 1 with value: -0.5578344774724163.


Metrics: [0.4428090428090428, -1.0, -0.0006435202814590403]
Completed Bayesian optimization...
Best Configuration: 1: {'model_info': {'provider': 'hf', 'model': 'sentence-transformers/all-MiniLM-L6-v2', 'dim': 384}, 'algorithm': 'flat', 'var_dtype': 'float16', 'ret_k': 8}:
Best Score: [-0.5578344774724163]


In [37]:
# Tabulate every trial of the study (objective value, timings, sampled
# parameters, state) as a DataFrame for side-by-side comparison.
study.trials_dataframe()

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_algorithm,params_ef_construction,params_ef_runtime,params_m,params_model_info,params_ret_k,params_var_dtype,state
0,0,-1.652898,2024-10-30 10:33:07.693616,2024-10-30 10:33:27.115870,0 days 00:00:19.422254,hnsw,186.0,50.0,37.0,"{'provider': 'hf', 'model': 'intfloat/e5-large...",9,float16,COMPLETE
1,1,-0.557834,2024-10-30 10:33:27.116229,2024-10-30 10:33:30.850093,0 days 00:00:03.733864,flat,,,,"{'provider': 'hf', 'model': 'sentence-transfor...",8,float16,COMPLETE
