From c715e7679902c7744a208567706b93586c8829b1 Mon Sep 17 00:00:00 2001
From: Xu Han
Date: Thu, 25 Apr 2024 23:27:52 +0000
Subject: [PATCH] [inductor] optimize isa dry compile time. (#124602)

Fixes #100378

The original issue is that the ISA dry compile at startup costs almost 1 second.
This PR adds the compiler version info, the ISA build options, and the PyTorch
version info to the hash of the test binary path, so a run with the same
compiler, the same ISA, and the same PyTorch build can skip the dry compile.

Local test:

First time:
[screenshot]
We need to compile all C++ modules, which costs 16.5s.

Second time:
[screenshot]
We skip the dry compile because the ISA fingerprint is unchanged; startup now
costs only 0.36s.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/124602
Approved by: https://github.com/jgong5, https://github.com/ezyang
---
 torch/_inductor/codecache.py | 47 ++++++++++++++++++++++++++++++++++--
 1 file changed, 45 insertions(+), 2 deletions(-)

diff --git a/torch/_inductor/codecache.py b/torch/_inductor/codecache.py
index 72ec8eb0c57f..2eb677b4d623 100644
--- a/torch/_inductor/codecache.py
+++ b/torch/_inductor/codecache.py
@@ -109,6 +109,8 @@ def use_global_cache() -> bool:
 
 LOCK_TIMEOUT = 600
 
+_IS_WINDOWS = sys.platform == "win32"
+
 # timing metrics for time spent in the compilation
 _cumulative_compile_time = 0.0
 _t0: Optional[float] = None
@@ -1049,6 +1051,40 @@ def is_clang() -> bool:
     return bool(re.search(r"(clang|clang\+\+)", cpp_compiler()))
 
 
+def get_compiler_version_info(compiler: str) -> str:
+    SUBPROCESS_DECODE_ARGS = ("oem",) if _IS_WINDOWS else ()
+    env = os.environ.copy()
+    env["LC_ALL"] = "C"  # Don't localize output
+    try:
+        version_string = subprocess.check_output(
+            [compiler, "-v"], stderr=subprocess.STDOUT, env=env
+        ).decode(*SUBPROCESS_DECODE_ARGS)
+    except Exception:
+        try:
+            version_string = subprocess.check_output(
+                [compiler, "--version"], stderr=subprocess.STDOUT, env=env
+            ).decode(*SUBPROCESS_DECODE_ARGS)
+        except Exception:
+            return ""
+    # Collapse the multi-line output into a single-line string.
+    version_string = version_string.replace("\r", "_")
+    version_string = version_string.replace("\n", "_")
+    return version_string
+
+
+def _get_isa_dry_compile_fingerprint(isa_flags: str) -> str:
+    # The ISA dry compile costs about 1 second at every startup.
+    # Please check the issue: https://github.com/pytorch/pytorch/issues/100378
+    # The dry compile only checks whether the compiler can build for the ISA.
+    # We record the compiler version, the ISA options, and the PyTorch version
+    # in the hash of the output binary path, so an existing binary can be
+    # reused and the dry compile skipped.
+    compiler_info = get_compiler_version_info(cpp_compiler())
+    torch_version = torch.__version__
+    fingerprint = f"{compiler_info}={isa_flags}={torch_version}"
+    return fingerprint
+
+
 class VecISA:
     _bit_width: int
     _macro: str
@@ -1114,7 +1150,11 @@ def __bool__(self) -> bool:
         if config.is_fbcode():
             return True
 
-        key, input_path = write(VecISA._avx_code, "cpp")
+        key, input_path = write(
+            VecISA._avx_code,
+            "cpp",
+            extra=_get_isa_dry_compile_fingerprint(self._arch_flags),
+        )
         from filelock import FileLock
 
         lock_dir = get_lock_dir()
@@ -1127,8 +1167,11 @@ def __bool__(self) -> bool:
             )
         )
         try:
+            # Compile only if the output binary does not already exist.
+            if not os.path.isfile(output_path):
+                compile_file(input_path, output_path, build_cmd)
+
             # Check build result
-            compile_file(input_path, output_path, build_cmd)
             subprocess.check_call(
                 [
                     sys.executable,