From 2d41de6853fdb8905d5d3f9a04bd6655ce686b27 Mon Sep 17 00:00:00 2001
From: Jongsok Choi
Date: Mon, 6 Oct 2025 02:23:01 +0000
Subject: [PATCH 1/5] Improve autotuning log messages.

Elaborate more on what's happening during autotuning so the user can
understand it better. Set the expectation at the beginning that autotuning
will take a while.

For the progress bar, instead of using the rich backend of tqdm, we can use
rich itself, which allows removing the warning message
"TqdmExperimentalWarning: rich is experimental/alpha" as well as customizing
the progress bar better (colors, and removing some of the clutter, such as
the expected time, which is not accurate, and the rounds/s speed).

If the cache hits and autotuning can be skipped, let the user know and
explain how the cache can be removed.

Overall, the autotuner log messages are cleaner and elaborate more on what's
happening.
---
 helion/autotuner/base_cache.py             | 10 +++++
 helion/autotuner/base_search.py            | 19 ++++----
 helion/autotuner/benchmarking.py           | 16 ++++---
 helion/autotuner/differential_evolution.py |  3 +-
 helion/autotuner/local_cache.py            |  4 ++
 helion/autotuner/pattern_search.py         | 20 +++++----
 helion/autotuner/progress_bar.py           | 51 ++++++++++++++++++++++
 requirements.txt                           |  1 -
 8 files changed, 98 insertions(+), 26 deletions(-)
 create mode 100644 helion/autotuner/progress_bar.py

diff --git a/helion/autotuner/base_cache.py b/helion/autotuner/base_cache.py
index 361e83791..574001938 100644
--- a/helion/autotuner/base_cache.py
+++ b/helion/autotuner/base_cache.py
@@ -153,6 +153,10 @@ def get(self) -> Config | None:
     def put(self, config: Config) -> None:
         raise NotImplementedError
 
+    def _get_cache_info_message(self) -> str:
+        """Return a message describing where the cache is and how to clear it."""
+        return ""
+
     def autotune(self) -> Config:
         if os.environ.get("HELION_SKIP_CACHE", "") not in {"", "0", "false", "False"}:
             return self.autotuner.autotune()
@@ -160,11 +164,17 @@ def autotune(self) -> Config:
         if (config := self.get()) is not None:
             counters["autotune"]["cache_hit"] += 1
             log.debug("cache hit: %s", str(config))
+            cache_info = self._get_cache_info_message()
+            self.autotuner.log(
+                f"Found cached config for {self.kernel.kernel.name}, skipping autotuning.\n{cache_info}"
+            )
             return config
 
         counters["autotune"]["cache_miss"] += 1
         log.debug("cache miss")
 
+        self.autotuner.log("Starting autotuning process, this may take a while...")
+
         config = self.autotuner.autotune()
         self.put(config)
 
diff --git a/helion/autotuner/base_search.py b/helion/autotuner/base_search.py
index dd5ea72ea..252771e9c 100644
--- a/helion/autotuner/base_search.py
+++ b/helion/autotuner/base_search.py
@@ -29,7 +29,6 @@
 import torch.multiprocessing as mp
 from torch.utils._pytree import tree_flatten
 from torch.utils._pytree import tree_map
-from tqdm.rich import tqdm
 from triton.testing import do_bench
 
 from .. import exc
@@ -40,6 +39,7 @@
 from .logger import LambdaLogger
 from .logger import classify_triton_exception
 from .logger import format_triton_compile_failure
+from .progress_bar import iter_with_progress
 
 log = logging.getLogger(__name__)
 
@@ -321,15 +321,14 @@ def parallel_benchmark(
         else:
             is_workings = [True] * len(configs)
         results = []
-        iterator = zip(configs, fns, is_workings, strict=True)
-        if self.settings.autotune_progress_bar:
-            iterator = tqdm(
-                iterator,
-                total=len(configs),
-                desc=desc,
-                unit="config",
-                disable=not self.settings.autotune_progress_bar,
-            )
+
+        # Render a progress bar only when the user requested it.
+ iterator = iter_with_progress( + zip(configs, fns, is_workings, strict=True), + total=len(configs), + description=desc, + enabled=self.settings.autotune_progress_bar, + ) for config, fn, is_working in iterator: if is_working: # benchmark one-by-one to avoid noisy results diff --git a/helion/autotuner/benchmarking.py b/helion/autotuner/benchmarking.py index e8d6313b0..63c9a8aa9 100644 --- a/helion/autotuner/benchmarking.py +++ b/helion/autotuner/benchmarking.py @@ -3,10 +3,8 @@ import functools import statistics from typing import Callable - -from tqdm.rich import tqdm from triton import runtime - +from .progress_bar import iter_with_progress def interleaved_bench( fns: list[Callable[[], object]], *, repeat: int, desc: str | None = None @@ -38,9 +36,15 @@ def interleaved_bench( ] di.synchronize() - iterator = range(repeat) - if desc is not None: - iterator = tqdm(iterator, desc=desc, total=repeat, unit="round") + + # When a description is supplied we show a progress bar so the user can + # track the repeated benchmarking loop. + iterator = iter_with_progress( + range(repeat), + total=repeat, + description=desc, + enabled=desc is not None, + ) for i in iterator: for j in range(len(fns)): clear_cache() diff --git a/helion/autotuner/differential_evolution.py b/helion/autotuner/differential_evolution.py index b1655b301..41a6a3fc1 100644 --- a/helion/autotuner/differential_evolution.py +++ b/helion/autotuner/differential_evolution.py @@ -95,7 +95,8 @@ def _autotune(self) -> Config: ) self.initial_two_generations() for i in range(2, self.max_generations): + self.log(f"Generation {i} starting") replaced = self.evolve_population() - self.log(f"Generation {i}: replaced={replaced}", self.statistics) + self.log(f"Generation {i} complete: replaced={replaced}", self.statistics) self.rebenchmark_population() return self.best.config diff --git a/helion/autotuner/local_cache.py b/helion/autotuner/local_cache.py index 99afbfec6..3c67aba8a 100644 --- a/helion/autotuner/local_cache.py +++ b/helion/autotuner/local_cache.py @@ -94,6 +94,10 @@ def put(self, config: Config) -> None: path = self._get_local_cache_path() config.save(path) + def _get_cache_info_message(self) -> str: + cache_dir = self._get_local_cache_path().parent + return f"Cache directory: {cache_dir}. To clear cache, delete this directory or set HELION_SKIP_CACHE=1." 
+ class StrictLocalAutotuneCache(LocalAutotuneCache): """ diff --git a/helion/autotuner/pattern_search.py b/helion/autotuner/pattern_search.py index de23f6218..a1b322aed 100644 --- a/helion/autotuner/pattern_search.py +++ b/helion/autotuner/pattern_search.py @@ -46,7 +46,7 @@ def __init__( def _autotune(self) -> Config: self.log( - f"Starting PatternSearch with initial_population={self.initial_population}, copies={self.copies}" + f"Starting PatternSearch with initial_population={self.initial_population}, copies={self.copies}, max_generations={self.max_generations}" ) visited = set() self.population = [] @@ -59,7 +59,7 @@ def _autotune(self) -> Config: self.population.append(member) self.parallel_benchmark_population(self.population, desc="Initial population") # again with higher accuracy - self.rebenchmark_population(self.population, desc="Initial rebench") + self.rebenchmark_population(self.population, desc="Verifying initial results") self.population.sort(key=performance) starting_points = [] for member in self.population[: self.copies]: @@ -88,21 +88,25 @@ def _autotune(self) -> Config: new_population[id(member)] = member if num_active == 0: break + + # Log generation header before compiling/benchmarking + self.log( + f"Generation {generation} starting: {num_neighbors} neighbors, {num_active} active search path(s)" + ) + self.population = [*new_population.values()] # compile any unbenchmarked members in parallel unbenchmarked = [m for m in self.population if len(m.perfs) == 0] if unbenchmarked: self.parallel_benchmark_population( - unbenchmarked, desc=f"Gen {generation} neighbors" + unbenchmarked, desc=f"Generation {generation}: Exploring neighbors" ) # higher-accuracy rebenchmark self.rebenchmark_population( - self.population, desc=f"Gen {generation} rebench" - ) - self.log( - f"Generation {generation}, {num_neighbors} neighbors, {num_active} active:", - self.statistics, + self.population, desc=f"Generation {generation}: Verifying top configs" ) + # Log final statistics for this generation + self.log(f"Generation {generation} complete:", self.statistics) return self.best.config def _pattern_search_from( diff --git a/helion/autotuner/progress_bar.py b/helion/autotuner/progress_bar.py new file mode 100644 index 000000000..b1b3370c9 --- /dev/null +++ b/helion/autotuner/progress_bar.py @@ -0,0 +1,51 @@ +"""Progress-bar utilities used by the autotuner. + +We rely on `rich` to render colored, full-width progress bars that +show the description, percentage complete, and how many items have been +processed. +""" + +from __future__ import annotations + +from collections.abc import Iterable, Iterator +from typing import TypeVar + +from rich.progress import BarColumn, MofNCompleteColumn, Progress, TextColumn + +T = TypeVar("T") + + +def iter_with_progress( + iterable: Iterable[T], *, total: int, description: str | None, enabled: bool +) -> Iterator[T]: + """Yield items from *iterable*, optionally showing a progress bar. + + Parameters + ---------- + iterable: + Any iterable whose items should be yielded. + total: + Total number of items expected from the iterable. + description: + Text displayed on the left side of the bar. Defaults to ``"Progress"``. + enabled: + When ``False`` the iterable is returned unchanged so there is zero + overhead; when ``True`` a Rich progress bar is rendered. 
+ """ + + if not enabled: + yield from iterable + return + + with Progress( + TextColumn("[progress.description]{task.description}"), + TextColumn("[progress.percentage]{task.percentage:>3.0f}%"), + BarColumn(bar_width=None, complete_style="yellow", finished_style="green"), + MofNCompleteColumn(), + ) as progress: + task = progress.add_task(description or "Progress", total=total) + for item in iterable: + # Yield before updating so the consumer sees the item immediately. + yield item + progress.update(task, advance=1) + progress.refresh() diff --git a/requirements.txt b/requirements.txt index 894173feb..965858186 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,5 +4,4 @@ pre-commit filecheck expecttest numpy -tqdm rich From e48add66cbc1d7c6e73586405f7f48ffb01e8148 Mon Sep 17 00:00:00 2001 From: Jongsok Choi Date: Mon, 6 Oct 2025 02:37:30 +0000 Subject: [PATCH 2/5] Fix linting error for docs. --- docs/conf.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index c22f18285..13f2f8506 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -129,10 +129,11 @@ def connect(self, event: str, callback: Callable[..., None]) -> None: } theme_variables = pytorch_sphinx_theme2.get_theme_variables() -templates_path = [ - "_templates", - os.path.join(os.path.dirname(pytorch_sphinx_theme2.__file__), "templates"), -] +templates_path = ["_templates"] +if pytorch_sphinx_theme2.__file__ is not None: + templates_path.append( + os.path.join(os.path.dirname(pytorch_sphinx_theme2.__file__), "templates") + ) html_context = { "theme_variables": theme_variables, From 4585a6fb41f57e89efd26f4a425afe052602782e Mon Sep 17 00:00:00 2001 From: Jongsok Choi Date: Mon, 6 Oct 2025 02:53:45 +0000 Subject: [PATCH 3/5] Make the cache hit message clearer. --- helion/autotuner/local_cache.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/helion/autotuner/local_cache.py b/helion/autotuner/local_cache.py index 3c67aba8a..22543dd71 100644 --- a/helion/autotuner/local_cache.py +++ b/helion/autotuner/local_cache.py @@ -96,7 +96,7 @@ def put(self, config: Config) -> None: def _get_cache_info_message(self) -> str: cache_dir = self._get_local_cache_path().parent - return f"Cache directory: {cache_dir}. To clear cache, delete this directory or set HELION_SKIP_CACHE=1." + return f"Cache directory: {cache_dir}. To run autotuning again, delete the cache directory or set HELION_SKIP_CACHE=1." class StrictLocalAutotuneCache(LocalAutotuneCache): From e4112a04c868650e7d0e5529e22c9424190048f5 Mon Sep 17 00:00:00 2001 From: Jongsok Choi Date: Mon, 6 Oct 2025 03:09:50 +0000 Subject: [PATCH 4/5] Fix linting errors. 
--- helion/autotuner/benchmarking.py | 3 +++ helion/autotuner/progress_bar.py | 11 +++++++++-- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/helion/autotuner/benchmarking.py b/helion/autotuner/benchmarking.py index 63c9a8aa9..b9cacbc1d 100644 --- a/helion/autotuner/benchmarking.py +++ b/helion/autotuner/benchmarking.py @@ -3,9 +3,12 @@ import functools import statistics from typing import Callable + from triton import runtime + from .progress_bar import iter_with_progress + def interleaved_bench( fns: list[Callable[[], object]], *, repeat: int, desc: str | None = None ) -> list[float]: diff --git a/helion/autotuner/progress_bar.py b/helion/autotuner/progress_bar.py index b1b3370c9..e3caa35a3 100644 --- a/helion/autotuner/progress_bar.py +++ b/helion/autotuner/progress_bar.py @@ -7,10 +7,17 @@ from __future__ import annotations -from collections.abc import Iterable, Iterator +from typing import TYPE_CHECKING from typing import TypeVar -from rich.progress import BarColumn, MofNCompleteColumn, Progress, TextColumn +from rich.progress import BarColumn +from rich.progress import MofNCompleteColumn +from rich.progress import Progress +from rich.progress import TextColumn + +if TYPE_CHECKING: + from collections.abc import Iterable + from collections.abc import Iterator T = TypeVar("T") From 40bd3fbc712a7f01848581e2a99cc3d7d2c7dc21 Mon Sep 17 00:00:00 2001 From: Jongsok Choi Date: Tue, 7 Oct 2025 03:11:28 +0000 Subject: [PATCH 5/5] Add processing rate to progress bar and simplify its code. --- helion/autotuner/progress_bar.py | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/helion/autotuner/progress_bar.py b/helion/autotuner/progress_bar.py index e3caa35a3..f69896837 100644 --- a/helion/autotuner/progress_bar.py +++ b/helion/autotuner/progress_bar.py @@ -13,17 +13,31 @@ from rich.progress import BarColumn from rich.progress import MofNCompleteColumn from rich.progress import Progress +from rich.progress import ProgressColumn from rich.progress import TextColumn +from rich.text import Text if TYPE_CHECKING: from collections.abc import Iterable from collections.abc import Iterator + from rich.progress import Task + T = TypeVar("T") +class SpeedColumn(ProgressColumn): + """Render the processing speed in configs per second.""" + + def render(self, task: Task) -> Text: + return Text( + f"{task.speed:.1f} configs/s" if task.speed is not None else "- configs/s", + style="magenta", + ) + + def iter_with_progress( - iterable: Iterable[T], *, total: int, description: str | None, enabled: bool + iterable: Iterable[T], *, total: int, description: str | None = None, enabled: bool ) -> Iterator[T]: """Yield items from *iterable*, optionally showing a progress bar. @@ -39,20 +53,18 @@ def iter_with_progress( When ``False`` the iterable is returned unchanged so there is zero overhead; when ``True`` a Rich progress bar is rendered. """ - if not enabled: yield from iterable return + if description is None: + description = "Progress" + with Progress( TextColumn("[progress.description]{task.description}"), TextColumn("[progress.percentage]{task.percentage:>3.0f}%"), BarColumn(bar_width=None, complete_style="yellow", finished_style="green"), MofNCompleteColumn(), + SpeedColumn(), ) as progress: - task = progress.add_task(description or "Progress", total=total) - for item in iterable: - # Yield before updating so the consumer sees the item immediately. 
- yield item - progress.update(task, advance=1) - progress.refresh() + yield from progress.track(iterable, total=total, description=description)