### Notebook to process benchmark results

Run this notebook after all benchmarks have finished and their output has been stored in the `results` directory. It exports the results in the format used by the single-node benchmark plots on [qdrant.tech/benchmarks](https://qdrant.tech/benchmarks).

In [None]:
from pathlib import Path
import re
import json
import pandas as pd
from datetime import datetime, timezone

In [None]:
# Benchmark output is read from the `results` directory that sits next to the
# parent of the current working directory.
DATA_DIR = Path().resolve().parent / "results"

# Sanity check: show the directory and the name of one result file.
# Indexing fails loudly here if no JSON files are present.
result_files = list(DATA_DIR.glob("*.json"))
DATA_DIR, result_files[0].name

In [None]:
# Result file names are expected to look like
#   <engine>-m-<m>-ef-<ef>-<dataset>-<operation>[-<search_index>-]<date>.json
# The fragments below are concatenated verbatim into a single pattern.
_PATH_PARTS = (
    r"(?P<engine_name>",                      # opens the combined engine + m + ef group
    r"(?P<engine>[a-z\-]+)",                  # lowercase letters and hyphens
    r"\-m\-(?P<m>[0-9]+)",
    r"\-ef\-(?P<ef>[0-9]+)",
    r")",                                     # closes engine_name
    r"\-(?P<dataset>[a-zA-Z0-9\-]+)",
    r"\-(?P<operation>(search)|(upload))",
    r"(\-(?P<search_index>[0-9]{1,2})\-)?",   # optional one- or two-digit index
    r"\-?(?P<date>.*)",                       # everything up to the extension
    r"\.json",
)
PATH_REGEX = re.compile("".join(_PATH_PARTS))

In [None]:
# Parsed benchmark runs, bucketed by the operation encoded in the file name.
upload_results, search_results = [], []

for path in DATA_DIR.glob("*.json"):
    match = PATH_REGEX.match(path.name)
    if match is None:
        # File name does not follow the benchmark naming scheme; ignore it.
        continue

    stats = json.loads(path.read_text())

    # `dataset` and `engine` are lifted out of the params dict so that what
    # remains in `params` is only the run configuration.
    params = stats["params"]
    entry = {
        "dataset": params.pop("dataset"),
        "engine": params.pop("engine"),
        **{field: match[field] for field in ("m", "ef", "date")},
        "params": params,
        "results": stats["results"],
    }

    operation = match["operation"]
    if operation == "upload":
        upload_results.append(entry)
    elif operation == "search":
        # Only search runs carry an index in their file name.
        entry["search_index"] = match["search_index"]
        search_results.append(entry)
    else:
        raise Exception("Unknown operation")

len(upload_results), len(search_results)

In [None]:
# Peek at one parsed entry of each kind. Slicing keeps the output short
# (the full upload list is not dumped) and does not raise when a list is
# empty, e.g. when no result file matched the naming pattern.
upload_results[:1], search_results[:1]

In [None]:
# Keep only the most recent upload run per (engine, m, ef, dataset):
# parse the timestamp from the file name, sort newest first, then take the
# first row of each group.
upload_df = (
    pd.DataFrame(upload_results)
    .assign(date=lambda frame: pd.to_datetime(frame["date"], format="%Y-%m-%d-%H-%M-%S"))
    .sort_values("date", ascending=False)
    .groupby(["engine", "m", "ef", "dataset"])
    .first()
)

# Preview only: the five uploads with the smallest total time.
temp_df = upload_df.assign(
    total_time=lambda frame: frame["results"].apply(lambda res: res["total_time"])
)
temp_df.sort_values("total_time", ascending=True).head(n=5)

In [None]:
# Keep only the most recent search run per
# (engine, m, ef, dataset, search_index): parse the timestamp from the file
# name, sort newest first, then take the first row of each group.
search_df = (
    pd.DataFrame(search_results)
    .assign(date=lambda frame: pd.to_datetime(frame["date"], format="%Y-%m-%d-%H-%M-%S"))
    .sort_values("date", ascending=False)
    .groupby(["engine", "m", "ef", "dataset", "search_index"])
    .first()
)

# Preview only: the ten search runs with the highest requests per second.
temp_df = search_df.assign(
    rps=lambda frame: frame["results"].apply(lambda res: res["rps"])
)
temp_df.sort_values("rps", ascending=False).head(n=10)

In [None]:
# Attach the matching upload run to every search run. A left join keeps
# search rows even when no upload with the same configuration exists; the
# overlapping columns (date, params, results) get `_search` / `_upload`
# suffixes.
joined_df = pd.merge(
    search_df.reset_index(),
    upload_df.reset_index(),
    how="left",
    on=["engine", "m", "ef", "dataset"],
    suffixes=("_search", "_upload"),
)
print(len(joined_df))
joined_df

In [None]:
# Flatten every joined search/upload row into one JSON record and write the
# records to `results.json` plus a timestamped copy.
json_results = []

for _, row in joined_df.iterrows():
    # The left join leaves NaN in the upload columns when a search run has no
    # matching upload run, so only dict values are treated as present.
    params_upload = row['params_upload'] if isinstance(row['params_upload'], dict) else {}
    params_search = row['params_search'] if isinstance(row['params_search'], dict) else {}
    results_upload = row['results_upload'] if isinstance(row['results_upload'], dict) else None
    results_search = row['results_search']

    # Upload params first, then the search-side sections; later sections
    # override earlier keys of the same name.
    engine_params = dict(params_upload)
    for section in ('config', 'params', 'search_params', 'vectorIndexConfig'):
        engine_params.update(params_search.get(section, {}))

    # These two are reported as top-level fields below. They are absent when
    # there is no upload row, so tolerate a missing key.
    engine_params.pop('experiment', None)
    engine_params.pop('parallel', None)

    engine_name = row['engine']

    # Collapse every `qdrant-*` variant into a single engine name.
    if engine_name.startswith("qdrant-"):
        engine_name = "qdrant"

    json_object = {
        "engine_name": engine_name,
        "setup_name": f"{row['params_search']['experiment']}",
        "dataset_name": row['dataset'],
        "search_idx": row['search_index'],
        # Upload metrics are null when no upload run matched this search run.
        "upload_time": results_upload['upload_time'] if results_upload is not None else None,
        "total_upload_time": results_upload['total_time'] if results_upload is not None else None,
        "p95_time": results_search['p95_time'],
        "rps": results_search['rps'],
        "parallel": row['params_search']['parallel'],
        "p99_time": results_search['p99_time'],
        "mean_time": results_search['mean_time'],
        "mean_precisions": results_search['mean_precisions'],
        "engine_params": engine_params,
    }
    json_results.append(json_object)

# `%m` is the month; the previous `%M` put the minute into the date part.
# The name avoids shadowing the builtin `format` for the rest of the session.
timestamp_format = '%Y-%m-%dT%H:%M:%S'
# Take the current time in UTC rather than relabelling local time as UTC.
now = datetime.now(timezone.utc).strftime(timestamp_format)

# Serialize once and write both the latest file and the timestamped copy.
serialized_results = json.dumps(json_results, indent=2)
Path("results.json").write_text(serialized_results)
Path(f"results-{now}.json").write_text(serialized_results)

print(json_results[-1], len(json_results))

results_df = pd.DataFrame(json_results).sort_values("p99_time", ascending=True)
# Uncomment to also export the table as CSV:
# results_df.to_csv('results.csv')
results_df