Skip to content

Commit

Permalink
Lazily import parallelized format modules
Browse files Browse the repository at this point in the history
`black.reformat_many` depends on a lot of slow-to-import modules. When
formatting simply a single file, the time paid to import those modules
is totally wasted. So I moved `black.reformat_many` and its helpers
to `black.concurrency` which is now *only* imported if there's more
than one file to reformat. This way, running Black over a single file
is snappier.

Here are the numbers before and after this patch running `python -m
black --version`:

- interpreted: 411 ms +- 9 ms -> 342 ms +- 7 ms: 1.20x faster
- compiled: 365 ms +- 15 ms -> 304 ms +- 7 ms: 1.20x faster

Co-authored-by: Fabio Zadrozny <fabiofz@gmail.com>
  • Loading branch information
ichard26 and fabioz committed Aug 27, 2022
1 parent c47b91f commit e269f44
Show file tree
Hide file tree
Showing 5 changed files with 165 additions and 145 deletions.
2 changes: 2 additions & 0 deletions CHANGES.md
Expand Up @@ -87,6 +87,8 @@

<!-- Changes that improve Black's performance. -->

- Reduce Black's startup time when formatting a single file by 15-30% (#3211)

## 22.6.0

### Style
Expand Down
4 changes: 2 additions & 2 deletions docs/contributing/reference/reference_functions.rst
Expand Up @@ -52,7 +52,7 @@ Formatting

.. autofunction:: black.reformat_one

.. autofunction:: black.schedule_formatting
.. autofunction:: black.concurrency.schedule_formatting

File operations
---------------
Expand Down Expand Up @@ -173,7 +173,7 @@ Utilities

.. autofunction:: black.linegen.should_split_line

.. autofunction:: black.shutdown
.. autofunction:: black.concurrency.shutdown

.. autofunction:: black.strings.sub_twice

Expand Down
153 changes: 14 additions & 139 deletions src/black/__init__.py
@@ -1,10 +1,8 @@
import asyncio
import io
import json
import os
import platform
import re
import signal
import sys
import tokenize
import traceback
Expand All @@ -13,10 +11,8 @@
from datetime import datetime
from enum import Enum
from json.decoder import JSONDecodeError
from multiprocessing import Manager, freeze_support
from pathlib import Path
from typing import (
TYPE_CHECKING,
Any,
Dict,
Generator,
Expand All @@ -32,15 +28,19 @@
Union,
)

if sys.version_info >= (3, 8):
from typing import Final
else:
from typing_extensions import Final

import click
from click.core import ParameterSource
from mypy_extensions import mypyc_attr
from pathspec.patterns.gitwildmatch import GitWildMatchPatternError

from _black_version import version as __version__
from black.cache import Cache, filter_cached, get_cache_info, read_cache, write_cache
from black.cache import Cache, get_cache_info, read_cache, write_cache
from black.comments import normalize_fmt_off
from black.concurrency import cancel, maybe_install_uvloop, shutdown
from black.const import (
DEFAULT_EXCLUDES,
DEFAULT_INCLUDES,
Expand Down Expand Up @@ -91,10 +91,8 @@
from blib2to3.pgen2 import token
from blib2to3.pytree import Leaf, Node

if TYPE_CHECKING:
from concurrent.futures import Executor

COMPILED = Path(__file__).suffix in (".pyd", ".so")
DEFAULT_WORKERS: Final = os.cpu_count()

# types
FileContent = str
Expand Down Expand Up @@ -125,8 +123,6 @@ def from_configuration(
# Legacy name, left for integrations.
FileMode = Mode

DEFAULT_WORKERS = os.cpu_count()


def read_pyproject_toml(
ctx: click.Context, param: click.Parameter, value: Optional[str]
Expand Down Expand Up @@ -592,6 +588,8 @@ def main( # noqa: C901
report=report,
)
else:
from black.concurrency import reformat_many

reformat_many(
sources=sources,
fast=fast,
Expand Down Expand Up @@ -776,132 +774,6 @@ def reformat_one(
report.failed(src, str(exc))


# diff-shades depends on being able to monkeypatch this function to operate. I know
# it's not ideal, but this shouldn't cause any issues ... hopefully. ~ichard26
@mypyc_attr(patchable=True)
def reformat_many(
    sources: Set[Path],
    fast: bool,
    write_back: WriteBack,
    mode: Mode,
    report: "Report",
    workers: Optional[int],
) -> None:
    """Reformat multiple files in parallel and record outcomes in `report`.

    A ProcessPoolExecutor is used when the platform supports it; otherwise a
    single-worker ThreadPoolExecutor is used as a graceful fallback. `workers`
    overrides the default worker count (``DEFAULT_WORKERS``) when given.
    """
    # Imported lazily: concurrent.futures is slow to import and only needed on
    # the multi-file path.
    from concurrent.futures import Executor, ProcessPoolExecutor, ThreadPoolExecutor

    executor: Executor
    worker_count = workers if workers is not None else DEFAULT_WORKERS
    if sys.platform == "win32":
        # Work around https://bugs.python.org/issue26903
        assert worker_count is not None
        worker_count = min(worker_count, 60)
    try:
        executor = ProcessPoolExecutor(max_workers=worker_count)
    except (ImportError, NotImplementedError, OSError):
        # we arrive here if the underlying system does not support multi-processing
        # like in AWS Lambda or Termux, in which case we gracefully fallback to
        # a ThreadPoolExecutor with just a single worker (more workers would not do us
        # any good due to the Global Interpreter Lock)
        executor = ThreadPoolExecutor(max_workers=1)

    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        loop.run_until_complete(
            schedule_formatting(
                sources=sources,
                fast=fast,
                write_back=write_back,
                mode=mode,
                report=report,
                loop=loop,
                executor=executor,
            )
        )
    finally:
        try:
            shutdown(loop)
        finally:
            asyncio.set_event_loop(None)
        # `executor` is always bound by this point (one of the two assignment
        # branches above ran before the event loop was created), so the old
        # `if executor is not None` guard was dead code — shut down directly.
        executor.shutdown()


async def schedule_formatting(
    sources: Set[Path],
    fast: bool,
    write_back: WriteBack,
    mode: Mode,
    report: "Report",
    loop: asyncio.AbstractEventLoop,
    executor: "Executor",
) -> None:
    """Run formatting of `sources` in parallel using the provided `executor`.
    (Use ProcessPoolExecutors for actual parallelism.)
    `write_back`, `fast`, and `mode` options are passed to
    :func:`format_file_in_place`.
    """
    # The cache is only consulted (and later written) outside of diff modes:
    # diff output never modifies the files on disk.
    cache: Cache = {}
    if write_back not in (WriteBack.DIFF, WriteBack.COLOR_DIFF):
        cache = read_cache(mode)
        sources, cached = filter_cached(cache, sources)
        for src in sorted(cached):
            report.done(src, Changed.CACHED)
    if not sources:
        # Everything was served from the cache — nothing to schedule.
        return

    cancelled = []
    sources_to_cache = []
    lock = None
    if write_back in (WriteBack.DIFF, WriteBack.COLOR_DIFF):
        # For diff output, we need locks to ensure we don't interleave output
        # from different processes.
        manager = Manager()
        lock = manager.Lock()
    # One future per file; the mapping lets us recover the source path when a
    # future completes.
    tasks = {
        asyncio.ensure_future(
            loop.run_in_executor(
                executor, format_file_in_place, src, fast, mode, write_back, lock
            )
        ): src
        for src in sorted(sources)
    }
    pending = tasks.keys()
    try:
        # Cancel in-flight work promptly on Ctrl-C / termination.
        loop.add_signal_handler(signal.SIGINT, cancel, pending)
        loop.add_signal_handler(signal.SIGTERM, cancel, pending)
    except NotImplementedError:
        # There are no good alternatives for these on Windows.
        pass
    while pending:
        done, _ = await asyncio.wait(pending, return_when=asyncio.FIRST_COMPLETED)
        for task in done:
            src = tasks.pop(task)
            if task.cancelled():
                cancelled.append(task)
            elif task.exception():
                report.failed(src, str(task.exception()))
            else:
                changed = Changed.YES if task.result() else Changed.NO
                # If the file was written back or was successfully checked as
                # well-formatted, store this information in the cache.
                if write_back is WriteBack.YES or (
                    write_back is WriteBack.CHECK and changed is Changed.NO
                ):
                    sources_to_cache.append(src)
                report.done(src, changed)
    if cancelled:
        # NOTE(review): the `loop=` argument to asyncio.gather was removed in
        # Python 3.10, so the else-branch presumably targets interpreters older
        # than 3.7 — confirm against the project's minimum supported version.
        if sys.version_info >= (3, 7):
            await asyncio.gather(*cancelled, return_exceptions=True)
        else:
            await asyncio.gather(*cancelled, loop=loop, return_exceptions=True)
    if sources_to_cache:
        write_cache(cache, sources_to_cache, mode)


def format_file_in_place(
src: Path,
fast: bool,
Expand Down Expand Up @@ -1506,8 +1378,11 @@ def patch_click() -> None:


def patched_main() -> None:
    """Entry point for frozen (e.g. PyInstaller) builds; wraps :func:`main`
    with platform-specific fixes before delegating to it.
    """
    # NOTE(review): the original span interleaved pre-refactor lines
    # (`maybe_install_uvloop()` and an unconditional `freeze_support()` call)
    # with their replacement; only the coherent post-refactor version is kept.
    # On frozen Windows builds, multiprocessing needs freeze_support() called
    # before any worker processes can be spawned. The import is local so the
    # single-file fast path doesn't pay for multiprocessing at startup.
    if sys.platform == "win32" and getattr(sys, "frozen", False):
        from multiprocessing import freeze_support

        freeze_support()

    patch_click()
    main()

Expand Down

0 comments on commit e269f44

Please sign in to comment.