feat: Add ability to skip searches if results exist

KShivendu · KShivendu · commit ca8f161b75f9 · 2023-10-13T14:57:57.000+05:30
diff --git a/engine/base_client/client.py b/engine/base_client/client.py
@@ -1,5 +1,6 @@
 import json
 from datetime import datetime
+from pathlib import Path
 from typing import List
 
 from benchmark import ROOT_DIR
@@ -54,7 +55,11 @@ def save_upload_results(
             out.write(json.dumps(upload_stats, indent=2))
 
     def run_experiment(
-        self, dataset: Dataset, skip_upload: bool = False, skip_search: bool = False
+        self,
+        dataset: Dataset,
+        skip_upload: bool = False,
+        skip_search: bool = False,
+        skip_if_exists: bool = False,
     ):
         execution_params = self.configurator.execution_params(
             distance=dataset.config.distance, vector_size=dataset.config.vector_size
@@ -82,6 +87,22 @@ def run_experiment(
         if not skip_search:
             print("Experiment stage: Search")
             for search_id, searcher in enumerate(self.searchers):
+
+                if skip_if_exists:
+                    # Path.glob() returns a lazy generator, which supports
+                    # neither len() nor indexing, so materialize it first.
+                    existing_results = list(
+                        RESULTS_DIR.glob(
+                            f"{self.name}-{dataset.config.name}-search-{search_id}-*.json"
+                        )
+                    )
+                    # Any previously saved result file for this search counts.
+                    if existing_results:
+                        print(
+                            f"Skipping search {search_id} as it already exists in",
+                            existing_results[0],
+                        )
+                        continue
+
                 search_params = {**searcher.search_params}
                 search_stats = searcher.search_all(
                     dataset.config.distance, reader.read_queries()
diff --git a/run.py b/run.py
@@ -20,6 +20,7 @@ def run(
     host: str = "localhost",
     skip_upload: bool = False,
     skip_search: bool = False,
+    skip_if_exists: bool = True,
     exit_on_error: bool = True,
     timeout: float = 86400.0,
 ):
@@ -49,7 +50,9 @@ def run(
             dataset.download()
             try:
                 with stopit.ThreadingTimeout(timeout) as tt:
-                    client.run_experiment(dataset, skip_upload, skip_search)
+                    client.run_experiment(
+                        dataset, skip_upload, skip_search, skip_if_exists
+                    )
 
                 # If the timeout is reached, the server might be still in the
                 # middle of some background processing, like creating the index.