From 2d41de6853fdb8905d5d3f9a04bd6655ce686b27 Mon Sep 17 00:00:00 2001
From: Jongsok Choi
Date: Mon, 6 Oct 2025 02:23:01 +0000
Subject: [PATCH 1/5] Improve autotuning log messages.

Elaborate more on what's happening during autotuning so the user can
understand it better. Set the expectation at the beginning that autotuning
will take a while.

For the progress bar, instead of using the rich backend of tqdm, we can use
rich itself, which allows removing the warning message
"TqdmExperimentalWarning: rich is experimental/alpha" as well as customizing
the progress bar better (colors, and removing some of the clutter, such as
the expected time, which is not accurate, and the rounds/s speed).

If the cache hits and autotuning can be skipped, let the user know and
explain how the cache can be removed.

Overall, the autotuner log messages are cleaner and elaborate more on what's
happening.
---
 helion/autotuner/base_cache.py             | 10 +++++
 helion/autotuner/base_search.py            | 19 ++++----
 helion/autotuner/benchmarking.py           | 16 ++++---
 helion/autotuner/differential_evolution.py |  3 +-
 helion/autotuner/local_cache.py            |  4 ++
 helion/autotuner/pattern_search.py         | 20 +++++----
 helion/autotuner/progress_bar.py           | 51 ++++++++++++++++++++++
 requirements.txt                           |  1 -
 8 files changed, 98 insertions(+), 26 deletions(-)
 create mode 100644 helion/autotuner/progress_bar.py

diff --git a/helion/autotuner/base_cache.py b/helion/autotuner/base_cache.py
index 361e83791..574001938 100644
--- a/helion/autotuner/base_cache.py
+++ b/helion/autotuner/base_cache.py
@@ -153,6 +153,10 @@ def get(self) -> Config | None:
     def put(self, config: Config) -> None:
         raise NotImplementedError
 
+    def _get_cache_info_message(self) -> str:
+        """Return a message describing where the cache is and how to clear it."""
+        return ""
+
     def autotune(self) -> Config:
         if os.environ.get("HELION_SKIP_CACHE", "") not in {"", "0", "false", "False"}:
             return self.autotuner.autotune()
@@ -160,11 +164,17 @@ def autotune(self) -> Config:
         if (config := self.get()) is not None:
             counters["autotune"]["cache_hit"] += 1
             log.debug("cache hit: %s", str(config))
+            cache_info = self._get_cache_info_message()
+            self.autotuner.log(
+                f"Found cached config for {self.kernel.kernel.name}, skipping autotuning.\n{cache_info}"
+            )
             return config
 
         counters["autotune"]["cache_miss"] += 1
         log.debug("cache miss")
 
+        self.autotuner.log("Starting autotuning process, this may take a while...")
+
         config = self.autotuner.autotune()
         self.put(config)
 
diff --git a/helion/autotuner/base_search.py b/helion/autotuner/base_search.py
index dd5ea72ea..252771e9c 100644
--- a/helion/autotuner/base_search.py
+++ b/helion/autotuner/base_search.py
@@ -29,7 +29,6 @@
 import torch.multiprocessing as mp
 from torch.utils._pytree import tree_flatten
 from torch.utils._pytree import tree_map
-from tqdm.rich import tqdm
 from triton.testing import do_bench
 
 from .. import exc
@@ -40,6 +39,7 @@
 from .logger import LambdaLogger
 from .logger import classify_triton_exception
 from .logger import format_triton_compile_failure
+from .progress_bar import iter_with_progress
 
 log = logging.getLogger(__name__)
 
@@ -321,15 +321,14 @@ def parallel_benchmark(
         else:
             is_workings = [True] * len(configs)
         results = []
-        iterator = zip(configs, fns, is_workings, strict=True)
-        if self.settings.autotune_progress_bar:
-            iterator = tqdm(
-                iterator,
-                total=len(configs),
-                desc=desc,
-                unit="config",
-                disable=not self.settings.autotune_progress_bar,
-            )
+
+        # Render a progress bar only when the user requested it.
+ iterator = iter_with_progress( + zip(configs, fns, is_workings, strict=True), + total=len(configs), + description=desc, + enabled=self.settings.autotune_progress_bar, + ) for config, fn, is_working in iterator: if is_working: # benchmark one-by-one to avoid noisy results diff --git a/helion/autotuner/benchmarking.py b/helion/autotuner/benchmarking.py index e8d6313b0..63c9a8aa9 100644 --- a/helion/autotuner/benchmarking.py +++ b/helion/autotuner/benchmarking.py @@ -3,10 +3,8 @@ import functools import statistics from typing import Callable - -from tqdm.rich import tqdm from triton import runtime - +from .progress_bar import iter_with_progress def interleaved_bench( fns: list[Callable[[], object]], *, repeat: int, desc: str | None = None @@ -38,9 +36,15 @@ def interleaved_bench( ] di.synchronize() - iterator = range(repeat) - if desc is not None: - iterator = tqdm(iterator, desc=desc, total=repeat, unit="round") + + # When a description is supplied we show a progress bar so the user can + # track the repeated benchmarking loop. + iterator = iter_with_progress( + range(repeat), + total=repeat, + description=desc, + enabled=desc is not None, + ) for i in iterator: for j in range(len(fns)): clear_cache() diff --git a/helion/autotuner/differential_evolution.py b/helion/autotuner/differential_evolution.py index b1655b301..41a6a3fc1 100644 --- a/helion/autotuner/differential_evolution.py +++ b/helion/autotuner/differential_evolution.py @@ -95,7 +95,8 @@ def _autotune(self) -> Config: ) self.initial_two_generations() for i in range(2, self.max_generations): + self.log(f"Generation {i} starting") replaced = self.evolve_population() - self.log(f"Generation {i}: replaced={replaced}", self.statistics) + self.log(f"Generation {i} complete: replaced={replaced}", self.statistics) self.rebenchmark_population() return self.best.config diff --git a/helion/autotuner/local_cache.py b/helion/autotuner/local_cache.py index 99afbfec6..3c67aba8a 100644 --- a/helion/autotuner/local_cache.py +++ b/helion/autotuner/local_cache.py @@ -94,6 +94,10 @@ def put(self, config: Config) -> None: path = self._get_local_cache_path() config.save(path) + def _get_cache_info_message(self) -> str: + cache_dir = self._get_local_cache_path().parent + return f"Cache directory: {cache_dir}. To clear cache, delete this directory or set HELION_SKIP_CACHE=1." 
+ class StrictLocalAutotuneCache(LocalAutotuneCache): """ diff --git a/helion/autotuner/pattern_search.py b/helion/autotuner/pattern_search.py index de23f6218..a1b322aed 100644 --- a/helion/autotuner/pattern_search.py +++ b/helion/autotuner/pattern_search.py @@ -46,7 +46,7 @@ def __init__( def _autotune(self) -> Config: self.log( - f"Starting PatternSearch with initial_population={self.initial_population}, copies={self.copies}" + f"Starting PatternSearch with initial_population={self.initial_population}, copies={self.copies}, max_generations={self.max_generations}" ) visited = set() self.population = [] @@ -59,7 +59,7 @@ def _autotune(self) -> Config: self.population.append(member) self.parallel_benchmark_population(self.population, desc="Initial population") # again with higher accuracy - self.rebenchmark_population(self.population, desc="Initial rebench") + self.rebenchmark_population(self.population, desc="Verifying initial results") self.population.sort(key=performance) starting_points = [] for member in self.population[: self.copies]: @@ -88,21 +88,25 @@ def _autotune(self) -> Config: new_population[id(member)] = member if num_active == 0: break + + # Log generation header before compiling/benchmarking + self.log( + f"Generation {generation} starting: {num_neighbors} neighbors, {num_active} active search path(s)" + ) + self.population = [*new_population.values()] # compile any unbenchmarked members in parallel unbenchmarked = [m for m in self.population if len(m.perfs) == 0] if unbenchmarked: self.parallel_benchmark_population( - unbenchmarked, desc=f"Gen {generation} neighbors" + unbenchmarked, desc=f"Generation {generation}: Exploring neighbors" ) # higher-accuracy rebenchmark self.rebenchmark_population( - self.population, desc=f"Gen {generation} rebench" - ) - self.log( - f"Generation {generation}, {num_neighbors} neighbors, {num_active} active:", - self.statistics, + self.population, desc=f"Generation {generation}: Verifying top configs" ) + # Log final statistics for this generation + self.log(f"Generation {generation} complete:", self.statistics) return self.best.config def _pattern_search_from( diff --git a/helion/autotuner/progress_bar.py b/helion/autotuner/progress_bar.py new file mode 100644 index 000000000..b1b3370c9 --- /dev/null +++ b/helion/autotuner/progress_bar.py @@ -0,0 +1,51 @@ +"""Progress-bar utilities used by the autotuner. + +We rely on `rich` to render colored, full-width progress bars that +show the description, percentage complete, and how many items have been +processed. +""" + +from __future__ import annotations + +from collections.abc import Iterable, Iterator +from typing import TypeVar + +from rich.progress import BarColumn, MofNCompleteColumn, Progress, TextColumn + +T = TypeVar("T") + + +def iter_with_progress( + iterable: Iterable[T], *, total: int, description: str | None, enabled: bool +) -> Iterator[T]: + """Yield items from *iterable*, optionally showing a progress bar. + + Parameters + ---------- + iterable: + Any iterable whose items should be yielded. + total: + Total number of items expected from the iterable. + description: + Text displayed on the left side of the bar. Defaults to ``"Progress"``. + enabled: + When ``False`` the iterable is returned unchanged so there is zero + overhead; when ``True`` a Rich progress bar is rendered. 
+ """ + + if not enabled: + yield from iterable + return + + with Progress( + TextColumn("[progress.description]{task.description}"), + TextColumn("[progress.percentage]{task.percentage:>3.0f}%"), + BarColumn(bar_width=None, complete_style="yellow", finished_style="green"), + MofNCompleteColumn(), + ) as progress: + task = progress.add_task(description or "Progress", total=total) + for item in iterable: + # Yield before updating so the consumer sees the item immediately. + yield item + progress.update(task, advance=1) + progress.refresh() diff --git a/requirements.txt b/requirements.txt index 894173feb..965858186 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,5 +4,4 @@ pre-commit filecheck expecttest numpy -tqdm rich From e48add66cbc1d7c6e73586405f7f48ffb01e8148 Mon Sep 17 00:00:00 2001 From: Jongsok Choi Date: Mon, 6 Oct 2025 02:37:30 +0000 Subject: [PATCH 2/5] Fix linting error for docs. --- docs/conf.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index c22f18285..13f2f8506 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -129,10 +129,11 @@ def connect(self, event: str, callback: Callable[..., None]) -> None: } theme_variables = pytorch_sphinx_theme2.get_theme_variables() -templates_path = [ - "_templates", - os.path.join(os.path.dirname(pytorch_sphinx_theme2.__file__), "templates"), -] +templates_path = ["_templates"] +if pytorch_sphinx_theme2.__file__ is not None: + templates_path.append( + os.path.join(os.path.dirname(pytorch_sphinx_theme2.__file__), "templates") + ) html_context = { "theme_variables": theme_variables, From 4585a6fb41f57e89efd26f4a425afe052602782e Mon Sep 17 00:00:00 2001 From: Jongsok Choi Date: Mon, 6 Oct 2025 02:53:45 +0000 Subject: [PATCH 3/5] Make the cache hit message clearer. --- helion/autotuner/local_cache.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/helion/autotuner/local_cache.py b/helion/autotuner/local_cache.py index 3c67aba8a..22543dd71 100644 --- a/helion/autotuner/local_cache.py +++ b/helion/autotuner/local_cache.py @@ -96,7 +96,7 @@ def put(self, config: Config) -> None: def _get_cache_info_message(self) -> str: cache_dir = self._get_local_cache_path().parent - return f"Cache directory: {cache_dir}. To clear cache, delete this directory or set HELION_SKIP_CACHE=1." + return f"Cache directory: {cache_dir}. To run autotuning again, delete the cache directory or set HELION_SKIP_CACHE=1." class StrictLocalAutotuneCache(LocalAutotuneCache): From e4112a04c868650e7d0e5529e22c9424190048f5 Mon Sep 17 00:00:00 2001 From: Jongsok Choi Date: Mon, 6 Oct 2025 03:09:50 +0000 Subject: [PATCH 4/5] Fix linting errors. 
--- helion/autotuner/benchmarking.py | 3 +++ helion/autotuner/progress_bar.py | 11 +++++++++-- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/helion/autotuner/benchmarking.py b/helion/autotuner/benchmarking.py index 63c9a8aa9..b9cacbc1d 100644 --- a/helion/autotuner/benchmarking.py +++ b/helion/autotuner/benchmarking.py @@ -3,9 +3,12 @@ import functools import statistics from typing import Callable + from triton import runtime + from .progress_bar import iter_with_progress + def interleaved_bench( fns: list[Callable[[], object]], *, repeat: int, desc: str | None = None ) -> list[float]: diff --git a/helion/autotuner/progress_bar.py b/helion/autotuner/progress_bar.py index b1b3370c9..e3caa35a3 100644 --- a/helion/autotuner/progress_bar.py +++ b/helion/autotuner/progress_bar.py @@ -7,10 +7,17 @@ from __future__ import annotations -from collections.abc import Iterable, Iterator +from typing import TYPE_CHECKING from typing import TypeVar -from rich.progress import BarColumn, MofNCompleteColumn, Progress, TextColumn +from rich.progress import BarColumn +from rich.progress import MofNCompleteColumn +from rich.progress import Progress +from rich.progress import TextColumn + +if TYPE_CHECKING: + from collections.abc import Iterable + from collections.abc import Iterator T = TypeVar("T") From 40bd3fbc712a7f01848581e2a99cc3d7d2c7dc21 Mon Sep 17 00:00:00 2001 From: Jongsok Choi Date: Tue, 7 Oct 2025 03:11:28 +0000 Subject: [PATCH 5/5] Add processing rate to progress bar and simplify its code. --- helion/autotuner/progress_bar.py | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/helion/autotuner/progress_bar.py b/helion/autotuner/progress_bar.py index e3caa35a3..f69896837 100644 --- a/helion/autotuner/progress_bar.py +++ b/helion/autotuner/progress_bar.py @@ -13,17 +13,31 @@ from rich.progress import BarColumn from rich.progress import MofNCompleteColumn from rich.progress import Progress +from rich.progress import ProgressColumn from rich.progress import TextColumn +from rich.text import Text if TYPE_CHECKING: from collections.abc import Iterable from collections.abc import Iterator + from rich.progress import Task + T = TypeVar("T") +class SpeedColumn(ProgressColumn): + """Render the processing speed in configs per second.""" + + def render(self, task: Task) -> Text: + return Text( + f"{task.speed:.1f} configs/s" if task.speed is not None else "- configs/s", + style="magenta", + ) + + def iter_with_progress( - iterable: Iterable[T], *, total: int, description: str | None, enabled: bool + iterable: Iterable[T], *, total: int, description: str | None = None, enabled: bool ) -> Iterator[T]: """Yield items from *iterable*, optionally showing a progress bar. @@ -39,20 +53,18 @@ def iter_with_progress( When ``False`` the iterable is returned unchanged so there is zero overhead; when ``True`` a Rich progress bar is rendered. """ - if not enabled: yield from iterable return + if description is None: + description = "Progress" + with Progress( TextColumn("[progress.description]{task.description}"), TextColumn("[progress.percentage]{task.percentage:>3.0f}%"), BarColumn(bar_width=None, complete_style="yellow", finished_style="green"), MofNCompleteColumn(), + SpeedColumn(), ) as progress: - task = progress.add_task(description or "Progress", total=total) - for item in iterable: - # Yield before updating so the consumer sees the item immediately. 
- yield item - progress.update(task, advance=1) - progress.refresh() + yield from progress.track(iterable, total=total, description=description)