codecache.py

from __future__ import annotations

import base64
import copyreg
import dataclasses
import functools
import hashlib
import importlib
import io
import json
import logging
import multiprocessing
import os
import pickle
import pkgutil
import platform
import re
import shlex
import shutil
import signal
import struct
import subprocess
import sys
import sysconfig
import tempfile
import textwrap
import threading
import warnings
from bisect import bisect_right
from concurrent.futures import Future, ProcessPoolExecutor, ThreadPoolExecutor
from copy import copy
from ctypes import c_void_p, cdll, CDLL
from functools import partial
from pathlib import Path
from threading import Thread
from time import sleep, time, time_ns
from types import ModuleType
from typing import (
    Any,
    Callable,
    cast,
    Dict,
    Generator,
    List,
    Optional,
    Sequence,
    Set,
    Tuple,
    TYPE_CHECKING,
    Union,
)

import torch
from torch._dynamo.device_interface import get_registered_device_interfaces
from torch._dynamo.utils import counters, dynamo_timed
from torch._inductor import config, exc, metrics
from torch._inductor.codegen.cuda import cuda_env
from torch._inductor.runtime.compile_tasks import (
    _module_to_triton_kernel,
    _reload_python_module,
    _reload_python_module_in_subproc,
    _set_triton_ptxas_path,
    _worker_compile_triton,
)
from torch._inductor.runtime.hints import HalideMeta
from torch._inductor.runtime.runtime_utils import cache_dir
from torch._inductor.utils import clear_on_fresh_inductor_cache, is_linux
from torch._logging import trace_structured
from torch._subclasses.fake_tensor import (
    extract_tensor_metadata,
    FakeTensor,
    TensorMetadata,
)
from torch.fx.experimental.symbolic_shapes import has_hint, hint_int, ShapeEnv

if TYPE_CHECKING:
    from torch._inductor.graph import GraphLowering
    from torch._inductor.ir import ChoiceCaller
    from torch.hub import _Faketqdm, tqdm

_HERE = os.path.abspath(__file__)
_TORCH_PATH = os.path.dirname(os.path.dirname(_HERE))
_LINKER_SCRIPT = os.path.join(_TORCH_PATH, "_inductor/script.ld")

if config.is_fbcode():
    from triton.fb import build_paths
    from triton.fb.build import _run_build_command

    from torch._inductor.fb.utils import (
        log_global_cache_errors,
        log_global_cache_stats,
        log_global_cache_vals,
        use_global_cache,
    )
else:

    def log_global_cache_errors(*args, **kwargs):
        pass

    def log_global_cache_stats(*args, **kwargs):
        pass

    def log_global_cache_vals(*args, **kwargs):
        pass

    def use_global_cache() -> bool:
        return False


output_code_log = torch._logging.getArtifactLogger(__name__, "output_code")
kernel_code_log = torch._logging.getArtifactLogger(__name__, "kernel_code")

LOCK_TIMEOUT = 600

_IS_WINDOWS = sys.platform == "win32"

# timing metrics for time spent in the compilation
_cumulative_compile_time = 0.0
_t0: Optional[float] = None


def _compile_start() -> None:
    global _t0
    if _t0 is None:
        _t0 = time()


def _compile_end() -> None:
    global _cumulative_compile_time, _t0
    if _t0 is not None:
        t1 = time()
        _cumulative_compile_time += t1 - _t0
        _t0 = None
        # print("CUMULATIVE COMPILE TIME", _cumulative_compile_time)


log = logging.getLogger(__name__)


def cpp_wrapper_cache_dir(name: str) -> str:
    cu_str = (
        "cpu"
        if torch.version.cuda is None
        else f'cu{torch.version.cuda.replace(".", "")}'
    )
    python_version = f"py{sys.version_info.major}{sys.version_info.minor}"
    build_folder = f"{python_version}_{cu_str}"

    cpp_wrapper_dir = os.path.join(cache_dir(), build_folder)
    cpp_wrapper_build_directory = os.path.join(cpp_wrapper_dir, name)
    os.makedirs(cpp_wrapper_build_directory, exist_ok=True)
    return cpp_wrapper_build_directory
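
# Illustrative usage sketch (not part of the original file); the exact paths
# depend on the local Python/CUDA versions and on cache_dir():
#
#     build_dir = cpp_wrapper_cache_dir("model")
#     # e.g. "<cache_dir()>/py310_cu121/model" on Python 3.10 + CUDA 12.1,
#     # or   "<cache_dir()>/py310_cpu/model" on a CPU-only build.
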
def get_cpp_wrapper_cubin_path_name():
    return "cubin_path" if torch.version.hip is None else "hsaco_path"


class CacheBase:
    @staticmethod
    @functools.lru_cache(None)
    def get_system() -> Dict[str, Any]:
        try:
            from triton.compiler.compiler import triton_key

            # Use triton_key instead of triton.__version__ as the version
            # is not updated with each code change
            triton_version = triton_key()
        except ModuleNotFoundError:
            triton_version = None

        try:
            system: Dict[str, Any] = {
                "device": {
                    "name": torch.cuda.get_device_properties(
                        torch.cuda.current_device()
                    ).name,
                },
                "version": {
                    "cuda": torch.version.cuda,
                    "triton": triton_version,
                },
            }
        except (AssertionError, RuntimeError):
            # If cuda is not installed, none of the above config is relevant.
            system = {}

        system["hash"] = hashlib.sha256(
            json.dumps(system, sort_keys=True).encode("utf-8")
        ).hexdigest()

        return system
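
    # Illustrative sketch (not part of the original file) of the dict returned
    # above on a CUDA machine; the concrete values are machine-specific:
    #
    #     {
    #         "device": {"name": "NVIDIA A100-SXM4-40GB"},
    #         "version": {"cuda": "12.1", "triton": "<triton_key()>"},
    #         "hash": "<sha256 of the JSON-serialized dict above>",
    #     }
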
    @staticmethod
    @clear_on_fresh_inductor_cache
    @functools.lru_cache(None)
    def get_local_cache_path() -> Path:
        return Path(os.path.join(cache_dir(), "cache", CacheBase.get_system()["hash"]))

    @staticmethod
    @functools.lru_cache(None)
    def get_global_cache_path() -> Optional[Path]:
        return (
            Path(os.path.join(config.global_cache_dir, CacheBase.get_system()["hash"]))
            if config.global_cache_dir is not None
            else None
        )

    def __init__(self) -> None:
        self.system = CacheBase.get_system()

    def get_local_cache(self) -> Dict[str, Any]:
        local_cache_path = self.get_local_cache_path()
        if not local_cache_path.is_file():
            return {}
        with open(local_cache_path) as local_cache_fp:
            local_cache = json.load(local_cache_fp)
        return local_cache["cache"]

    def update_local_cache(self, local_cache: Dict[str, Any]) -> None:
        local_cache_path = self.get_local_cache_path()
        write_atomic(
            str(local_cache_path),
            json.dumps({"system": self.system, "cache": local_cache}, indent=4),
            make_dirs=True,
        )


class LocalCache(CacheBase):
    def lookup(self, *keys: str) -> Optional[Dict[str, Any]]:
        cache = self.get_local_cache()

        sub_cache = cache
        for key in keys:
            if key in cache:
                sub_cache = cache[key]
            else:
                return None

        return sub_cache

    def set_value(self, *keys: str, value: Any) -> None:
        cache = self.get_local_cache()

        sub_cache = cache
        for key in keys[0:-1]:
            sub_cache.setdefault(key, {})
            sub_cache = sub_cache[key]
        sub_cache[keys[-1]] = value

        self.update_local_cache(cache)
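
# Illustrative usage sketch (not part of the original file); the key and value
# here are hypothetical:
#
#     cache = LocalCache()
#     cache.set_value("autotune_choice", value={"best": "kernel_0"})
#     cache.lookup("autotune_choice")  # -> {"best": "kernel_0"}
#     cache.lookup("missing_key")      # -> None
#
# Entries are persisted as JSON under CacheBase.get_local_cache_path().
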
class PersistentCache(CacheBase):
    @functools.lru_cache(None)
    def get_global_cache(self):
        global_cache_path = self.get_global_cache_path()
        if global_cache_path is None or not global_cache_path.is_file():
            return {}
        with open(global_cache_path) as global_cache_fp:
            global_cache = json.load(global_cache_fp)
        return global_cache["cache"]

    def lookup(
        self,
        choices: List[ChoiceCaller],
        op: str,
        inputs: str,
        benchmark: Optional[Callable[[Any], Dict[ChoiceCaller, float]]],
    ) -> Dict[ChoiceCaller, float]:
        """
        Check to see if we have benchmarked the given choice callers. For each
        choice caller:
            1. Check global_cache[op][inputs][choice][precision], return benchmark if cached.
            2. Check local_cache[op][inputs][choice][precision], return benchmark if cached.
            3. If benchmark is not None:
                a. `max_autotune_gemm=True`: benchmark the choice, update
                   local_cache[op][inputs][choice], and return the benchmark.
                b. `max_autotune_gemm=False`: don't benchmark the choice, return nothing.
        """
        precision = torch.get_float32_matmul_precision()

        log_stats = partial(log_global_cache_stats, self.system, op, inputs, precision)
        log_vals = partial(log_global_cache_vals, self.system, op, inputs, precision)
        log_errors = partial(
            log_global_cache_errors, self.system, op, inputs, precision
        )
        timings = {}

        def check_cache(cache, callback=None) -> bool:
            """Check if `cache` contains data for all the choices"""
            hit = True
            for choice in choices:
                choice_hash = choice.hash_key()
                if choice_hash in cache.get(op, {}).get(inputs, {}).get(precision, {}):
                    # cache hit
                    timings[choice] = cache[op][inputs][precision][choice_hash]
                else:
                    # cache miss
                    hit = False
                    break
            if callback:
                callback(cached=hit)
            return hit

        if config.max_autotune or config.max_autotune_gemm:
            local_cache = self.get_local_cache() if config.autotune_local_cache else {}
            # check local cache first since it is data specific to the current machine
            if (
                not check_cache(local_cache)
                and not (
                    use_global_cache()
                    and check_cache(self.get_global_cache(), callback=log_stats)
                )
                and benchmark is not None
            ):
                try:
                    # re-benchmark everything to try to get consistent numbers from the same machine
                    timings = benchmark(choices)
                    assert all(choice in timings for choice in choices)
                    local_cache.setdefault(op, {})
                    local_cache[op].setdefault(inputs, {}).setdefault(precision, {})
                    for choice, timing in timings.items():
                        local_cache[op][inputs][precision][choice.hash_key()] = timing
                except RuntimeError as e:
                    # catch and log autotuning failures
                    log_errors(e)
                    raise e

                self.update_local_cache(local_cache)

                timings_to_log = {
                    choice.hash_key(): timings[choice] for choice in choices
                }
                log_vals(timings_to_log)
        elif use_global_cache():
            # only check global cache, not local one
            check_cache(self.get_global_cache(), callback=log_stats)
            # may have a partial cache hit, where not everything is benchmarked

        return timings
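
# Illustrative sketch (not part of the original file) of the nested autotune
# cache layout that lookup() reads and writes; the op/input strings are
# hypothetical, and the precision key comes from torch.get_float32_matmul_precision():
#
#     {
#         "mm": {
#             "(1024, 512)x(512, 256)": {
#                 "highest": {"<choice.hash_key()>": 0.034},  # timing from benchmark()
#             },
#         },
#     }
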
def get_lock_dir() -> str:
    lock_dir = os.path.join(cache_dir(), "locks")
    if not os.path.exists(lock_dir):
        os.makedirs(lock_dir, exist_ok=True)
    return lock_dir


def sha256_hash(data: bytes) -> str:
    # [:51] to strip off the "Q====" suffix common to every hash value.
    return base64.b32encode(hashlib.sha256(data).digest())[:51].decode("utf-8").lower()


def code_hash(code: Union[str, bytes], extra: str = ""):
    hashing_str = code if isinstance(code, bytes) else code.encode("utf-8")
    if extra != "":
        hashing_str = hashing_str + b"||" + extra.encode("utf-8")
    return "c" + sha256_hash(hashing_str)
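
# Illustrative sketch (not part of the original file): sha256_hash() keeps the
# first 51 characters of the lowercase base32 SHA-256 digest, and code_hash()
# prefixes it with "c" so these keys are distinguishable from other key kinds
# produced in this module:
#
#     key = code_hash("x = 1")           # "c" + 51 base32 characters
#     key2 = code_hash("x = 1", "cuda")  # hashes b"x = 1||cuda", so key2 != key
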
def get_path(
    basename: str, extension: str, specified_dir: str = ""
) -> Tuple[str, str, str]:
    if specified_dir:
        if os.path.isabs(specified_dir):
            subdir = specified_dir
        else:
            subdir = os.path.join(cache_dir(), specified_dir)
    else:
        subdir = os.path.join(cache_dir(), basename[1:3])
    path = os.path.join(subdir, f"{basename}.{extension}")
    return basename, subdir, path
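
# Illustrative sketch (not part of the original file): without a specified_dir,
# files are sharded into subdirectories named after characters 1-2 of the hash
# (skipping the leading type prefix such as "c" or "f"):
#
#     get_path("cab123...", "py")
#     # -> ("cab123...", "<cache_dir()>/ab", "<cache_dir()>/ab/cab123....py")
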
def get_hash(content: Union[str, bytes], extra: str = "", hash_type: str = "code"):
    if hash_type == "code":
        return code_hash(content, extra)
    if hash_type in ["cubin", "hsaco"]:
        return code_hash(repr(content))
    raise AssertionError(f"Unknown hash type {hash_type}")


def write(
    content: Union[str, bytes],
    extension: str,
    extra: str = "",
    hash_type: str = "code",
    specified_dir: str = "",
) -> Tuple[str, str]:
    # use stripped content to compute the hash so we don't end up with different
    # hashes just because the content begins/ends with a different number of
    # spaces.
    key: str = get_hash(content.strip(), extra, hash_type)
    basename, subdir, path = get_path(key, extension, specified_dir)
    if not os.path.exists(path):
        write_atomic(path, content, make_dirs=True)
    return basename, path


def write_text(text: str) -> str:
    """
    Write the `text` to a file and return the path computed based on the hash.
    """
    return write(text, "txt")[1]
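
# Illustrative usage sketch (not part of the original file): write() deduplicates
# by content hash, so writing the same source twice yields the same key and path,
# and the file is only written once:
#
#     key, path = write("x = 1\n", "py")
#     key2, path2 = write("x = 1\n", "py")
#     assert (key, path) == (key2, path2)
#     log_path = write_text("some autotuning log")  # "<cache_dir()>/<shard>/c<hash>.txt"
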
def write_atomic(
    path: str, content: Union[str, bytes], make_dirs: bool = False
) -> None:
    # Write into temporary file first to avoid conflicts between threads
    # Avoid using a named temporary file, as those have restricted permissions
    assert isinstance(
        content, (str, bytes)
    ), "Only strings and byte arrays can be saved in the cache"
    path = Path(path)
    if make_dirs:
        path.parent.mkdir(parents=True, exist_ok=True)
    tmp_path = path.parent / f".{os.getpid()}.{threading.get_ident()}.tmp"
    write_mode = "w" if isinstance(content, str) else "wb"
    with tmp_path.open(write_mode) as f:
        f.write(content)
    tmp_path.rename(path)
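
# Illustrative usage sketch (not part of the original file): the
# temp-file-then-rename pattern means readers never observe a partially
# written cache file; the path below is hypothetical:
#
#     write_atomic("/tmp/inductor_demo/entry.json", '{"k": 1}', make_dirs=True)
#     # writes /tmp/inductor_demo/.<pid>.<tid>.tmp first, then renames it into place.
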
@dataclasses.dataclass
class TensorMetadataAndValues:
    """
    TensorMetadata plus the elements as a list of raw values.
    Used for hashing inlined constants.
    """

    tensor_metadata: TensorMetadata
    values: List[Any]


def _ident(x: Any) -> Any:
    return x


def _reduce_fake_tensor(t):
    """
    See FxGraphCachePickler. Custom reducer to pickle FakeTensors.
    """
    metadata = extract_tensor_metadata(t)
    return (_ident, (metadata,))


def _reduce_tensor(t):
    """
    See FxGraphCachePickler. Custom reducer to pickle Tensors.
    If we see tensors, we know they're constants stored as attributes on
    the GraphModule. Include the values in the key calculation. Small
    tensors will be inlined, so we can't serve the same cache entry for
    different values anyway. Large constants are treated as parameters,
    so we could conceivably reuse a cache entry. To do that, however,
    PyCodeCache would need more complexity to create a new module from its
    cache, but with the right constants attached as attributes.
    """
    if t.is_mkldnn:
        # TODO: These tensors don't currently pickle, so we can't cache a
        # compiled graph containing them. Just fail now. If mkldnn tensors
        # get pickling support, we can remove this.
        raise BypassFxGraphCache

    # Very large tensors could be expensive to copy to cpu and hash. Let's
    # at least report if we find slowness.
    start = time()
    values = t.tolist()
    elapsed = time() - start
    if elapsed > 1.0:
        warnings.warn(
            f"FX graph cache handling of a large constant took {elapsed:.1}s. Please file an issue."
        )

    metadata = extract_tensor_metadata(t)
    return (_ident, (TensorMetadataAndValues(metadata, values),))


def _reduce_symint(s):
    """
    See FxGraphCachePickler. Custom reducer to pickle SymInts.
    """
    # For hashing purposes, we only care about the name of the symbol and
    # not the backed value. We evaluate guards stored with a cached graph
    # to ensure a cached entity with SymInt args is safe to reuse.
    return (_ident, (str(s),))


def _reduce_unsupported(s):
    """
    See FxGraphCachePickler. Custom reducer to handle any objects that we don't
    support and therefore raise to bypass caching.
    """
    raise BypassFxGraphCache


class FxGraphCachePickler(pickle.Pickler):
    """
    Custom pickler to customize the pickling of some objects (Tensors), only for the
    purpose of computing a hash for keying into the FxGraphCache. Tensors contain
    objects that don't pickle and/or vary between runs, and we want to capture the
    data that allow us to compute a stable, but safe hash.
    """

    dispatch_table = copyreg.dispatch_table.copy()
    dispatch_table[FakeTensor] = _reduce_fake_tensor
    dispatch_table[torch.Tensor] = _reduce_tensor
    dispatch_table[torch.SymInt] = _reduce_symint
    dispatch_table[
        torch.fx.experimental._backward_state.BackwardState
    ] = _reduce_unsupported

    @classmethod
    def dumps(cls, obj) -> bytes:
        """
        Pickle an object using the FxGraphCachePickler.
        """
        with io.BytesIO() as stream:
            pickler = cls(stream)
            try:
                pickler.dump(obj)
            except (TypeError, AttributeError) as e:
                # Some config options are callables, e.g., post_grad_custom_pre_pass,
                # and may not pickle.
                log.warning("Can't pickle", exc_info=True)
                raise BypassFxGraphCache from e
            return stream.getvalue()

    @classmethod
    def get_hash(cls, obj: Any) -> str:
        """
        Serialize an object using the FxGraphCachePickler and return a hash
        of the pickled object.
        """
        serialized_data = cls.dumps(obj)
        return sha256_hash(serialized_data)

    @classmethod
    def debug_str(cls, inp: Any) -> str:
        """
        Get a printable string describing in more detail all the attributes
        comprising an object. Useful for debugging when one graph hashes
        to a different value than another.
        """

        def get_str(obj) -> str:
            if isinstance(obj, torch.Tensor):
                return str(extract_tensor_metadata(obj))
            elif isinstance(obj, bytes):
                return "<bytes>"
            else:
                return str(obj)

        lines = []
        for attr, obj in vars(inp).items():
            if isinstance(obj, list):
                for ii in range(len(obj)):
                    h = cls.get_hash(obj[ii])
                    lines.append(f"[{h}] {attr}[{ii}]: {get_str(obj[ii])}")
            elif isinstance(obj, dict):
                for k, v in obj.items():
                    h = cls.get_hash(v)
                    lines.append(f"[{h}] {attr}[{k}]: {get_str(v)}")
            else:
                h = cls.get_hash(obj)
                lines.append(f"[{h}] {attr}: {get_str(obj)}")
        return "\n".join(lines)
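
# Illustrative sketch (not part of the original file): because FakeTensors are
# reduced to metadata only, two fake tensors with the same shape/dtype/device
# hash identically even though they are distinct objects:
#
#     from torch._subclasses.fake_tensor import FakeTensorMode
#     with FakeTensorMode():
#         a = torch.empty(4, 4)
#         b = torch.empty(4, 4)
#     FxGraphCachePickler.get_hash(a) == FxGraphCachePickler.get_hash(b)  # True
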
def get_code_hash(roots):
    contents: Dict[str, bytes] = {torch.__version__: b""}
    for lib in pkgutil.iter_modules(roots):
        spec = lib.module_finder.find_spec(lib.name, None)
        assert spec is not None
        module = spec.origin
        assert module is not None
        with open(module, "rb") as f:
            contents[module] = f.read()

    return hashlib.sha256(pickle.dumps(contents)).digest()


@functools.lru_cache(None)
def torch_key():
    """
    Compute a key that contains relevant information about torch source files
    """
    if not config.is_fbcode():
        inductor_root = os.path.dirname(__file__)
        return get_code_hash([inductor_root])

    from libfb.py import parutil

    return parutil.get_file_contents("torch/src_hash.txt").rstrip()


def get_inductor_root():
    return os.path.dirname(__file__)


@dataclasses.dataclass
class OrderedSetHolder:
    """
    See FxGraphHashDetails. Holds a sorted list to support stable hashing
    of set kwargs.
    """

    items: List[Any]


class BypassFxGraphCache(Exception):
    """
    Exception to indicate that the FxGraphCache should be bypassed.
    """

    pass


class FxGraphHashDetails:
    """
    Object to capture all the details for a compiled FX graph relevant to computing
    a safe and stable cache key.
    """

    # Excluded kwarg params that are not stable between runs
    EXCLUDED_KWARGS = ["graph_id"]

    def __init__(
        self,
        gm: torch.fx.GraphModule,
        example_inputs: List[torch.Tensor],
        fx_kwargs: Dict[str, Any],
    ):
        self.gm = gm
        self.example_inputs = example_inputs

        # Order kwargs so hashing is stable to changes in kwarg order.
        self.fx_kwargs = {}
        for k in sorted(fx_kwargs):
            if k not in self.EXCLUDED_KWARGS:
                if type(fx_kwargs[k]) is set:
                    # Special case to handle set params. Python sets can't be
                    # ordered, so sort the elements and store them in a proxy.
                    self.fx_kwargs[k] = OrderedSetHolder(sorted(fx_kwargs[k]))
                else:
                    self.fx_kwargs[k] = fx_kwargs[k]

        # 'Deterministic algorithms' can affect codegen via lowering to cuda kernels.
        self.deterministic_algorithms_settings = (
            torch.are_deterministic_algorithms_enabled(),
            torch.is_deterministic_algorithms_warn_only_enabled(),
            torch.utils.deterministic.fill_uninitialized_memory,  # type: ignore[attr-defined]
        )

        # Global settings affecting matmul codegen.
        self.cuda_matmul_settings = (
            torch.backends.cuda.matmul.allow_tf32,
            torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction,
            torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction,
        )

        # Also hash on various system info (including the triton compiler version).
        self.torch_version = torch_key()
        self.system_info = CacheBase.get_system()
        self.inductor_config = config.save_config_portable()

    def debug_str(self) -> str:
        """
        Get a printable string describing in more detail all the attributes
        comprising this object. Useful for debugging when one graph hashes
        to a different value than another.
        """
        return FxGraphCachePickler.debug_str(self)


def compiled_fx_graph_hash(
    gm: torch.fx.GraphModule,
    example_inputs: List[torch.Tensor],
    fx_kwargs: Dict[str, Any],
) -> str:
    """
    Generate a unique hash of the FX graph for caching.
    """
    details = FxGraphHashDetails(gm, example_inputs, fx_kwargs)
    # The prefix distinguishes among the other kinds of objects we
    # cache in this module.
    key = "f" + FxGraphCachePickler.get_hash(details)
    log.debug(
        "FX graph cache hash details for key %s:\n%s",
        key,
        details.debug_str(),
    )
    return key
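
# Illustrative usage sketch (not part of the original file); gm and example_inputs
# are whatever the caller would normally hand to Inductor:
#
#     key = compiled_fx_graph_hash(gm, example_inputs, fx_kwargs={})
#     # key looks like "f<51 base32 chars>"; the same combination of graph,
#     # inputs, inductor config, and torch/triton versions maps to the same key.
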
class FxGraphCache:
    """
    Supports caching and reusing compiled Fx graphs.

    The overall strategy is as follows:
    - This cache stores entries on disk. When saving an entry, we can't
      serialize callables (that could be C++, Triton, etc.), so we serialize
      their own disk cache location. We then recreate the compiled artifact
      after fetching from disk.
    - For indexing the cache, we gather the fields relevant to identifying an
      FxGraph (the graph module, graph inputs, system settings etc.) into an
      FxGraphHashDetails object, pickle it, and compute a hash for the key.
      See FxGraphCachePickler.
    - Among the metadata we store, we also include a guards expression that's
      appropriate for validating any symbols for Tensor arguments that have
      symbolic bounds. On cache lookup then, we evaluate those guards in the
      current context to validate that a cached entry can be served.
    - A given graph could have multiple compiled versions, corresponding to
      different sets of guards. Therefore, we store cache entries in the form:
          <temp dir>/<fx graph hash>/<serialized metadata>
    - On lookup, we compute the key from the graph details, iterate over all
      leaf files in the corresponding subdirectory, deserialize the entry, and
      evaluate its guards expression. If the evaluation succeeds, we have a
      cache hit. If it fails, we compile the graph and store a new entry.
    - Finally, on a cache hit, we need to make sure any guards that would
      have been created during compilation are added to the current context.
    """

    # TODO(masnesral): Investigate whether it's beneficial to store compiled graphs
    # in an in-memory cache after loading from disk.

    @staticmethod
    def _get_tmp_dir() -> str:
        """
        Get the toplevel temporary directory for storing compiled graphs.
        """
        return os.path.join(cache_dir(), "fxgraph")

    @staticmethod
    def _get_tmp_dir_for_key(key: str) -> str:
        """
        Return the disk location for a given cache key.
        """
        return os.path.join(FxGraphCache._get_tmp_dir(), key[1:3], key)

    @staticmethod
    def _filter_backed_symints(inputs: List[Any]) -> List[torch.SymInt]:
        """
        Get the backed SymInt objects from the input list. Note that we can never
        have guards that depend on unbacked symint.
        """
        return [s for s in inputs if isinstance(s, torch.SymInt) and has_hint(s)]

    @staticmethod
    def _get_shape_env() -> Optional[ShapeEnv]:
        """
        Helper to get the shape env from the tracing context.
        """
        ctx = torch._guards.TracingContext.try_get()
        if not ctx:
            return None
        return ctx.fake_mode.shape_env

    @staticmethod
    def _lookup_graph(
        key: str,
        example_inputs: List[torch.Tensor],
        local,
        remote_cache,
    ) -> Optional[CompiledFxGraph]:
        """
        Lookup a compiled graph in the cache by key. On a hit, return the
        deserialized CompiledFxGraph object. On a miss, return None.
        """
        shape_env = FxGraphCache._get_shape_env()
        assert shape_env is not None

        symints = FxGraphCache._filter_backed_symints(example_inputs)
        hints = [hint_int(s) for s in symints]

        def iterate_over_candidates() -> Generator[CompiledFxGraph, None, None]:
            if local:
                subdir = FxGraphCache._get_tmp_dir_for_key(key)
                if os.path.exists(subdir):
                    for path in sorted(os.listdir(subdir)):
                        try:
                            with open(os.path.join(subdir, path), "rb") as f:
                                yield pickle.load(f)
                        except Exception:
                            log.warning(
                                "fx graph cache unable to load compiled graph",
                                exc_info=True,
                            )

            if remote_cache:
                try:
                    if (data := remote_cache.get(key)) is not None:
                        yield pickle.loads(data)
                except Exception:
                    log.warning(
                        "fx graph cache unable to load compiled graph", exc_info=True
                    )

        # Iterate over any entries in the subdir for this key and evaluate
        # their guards to determine whether there's a hit.
        graph = None

        for candidate in iterate_over_candidates():
            if not candidate.guards_expr:
                # No guards to evaluate, so this is a hit.
                graph = candidate
                break

            # Evaluate the guard expression in the current context.
            # If there's not a cache hit, we don't want the evaluation to
            # affect the current env, e.g., cause the creation of new guards,
            # so we evaluate with the hints instead of the symbols.
            hit = bool(
                shape_env.evaluate_guards_expression(candidate.guards_expr, hints)
            )
            log.debug(
                "fx graph cache key %s evaluating guards [%s] with values %s => hit=%s",
                key,
                candidate.guards_expr,
                hints,
                hit,
            )
            if hit:
                graph = candidate
                break

        if graph is None:
            return None

        # See _save_graph(); we don't store the callable in the cache entry so
        # recreate it here from the PyCodeCache disk cache.
        artifact_path = get_path(graph.cache_key, "py")[2]
        if not os.path.exists(artifact_path):
            counters["inductor"]["fxgraph_lookup_write_file"] += 1
            Path(os.path.dirname(artifact_path)).mkdir(parents=True, exist_ok=True)
            code = graph.source_code
            cpp_pp = cpp_prefix_path()
            if os.path.basename(cpp_pp) in code:
                if cpp_pp in code:
                    # Great, the name is correct
                    pass
                else:
                    # Old dir name is included, replace it
                    pattern = rf'#include\s*"[^"]+{os.path.basename(cpp_pp)}"'
                    code = re.sub(pattern, f'#include "{cpp_pp}"', code)
            write_atomic(artifact_path, code, make_dirs=True)

        try:
            graph.current_callable = PyCodeCache.load_by_key_path(
                graph.cache_key,
                artifact_path,
                graph.cache_linemap,
                graph.constants,
            ).call
        except OSError:
            # Not expected, but in case the PyCodeCache entry is removed from
            # underneath us, treat it as a cache miss and recompile.
            log.error("Failed to load cached artifact: %s", artifact_path)
            return None

        # Now re-evaluate with the symints to add any guards to the current env.
        if graph.guards_expr:
            check = bool(
                shape_env.evaluate_guards_expression(graph.guards_expr, symints)
            )
            assert check is True
            log.debug(
                "fx graph cache key %s post-load guards: %s", key, shape_env.guards
            )

        # Increment the cached metrics by the amounts recorded when the FX
        # graph was compiled for this cache entry. Pretending these counters
        # were incremented normally is useful for testing with the cache enabled.
        metrics.CachedMetricsHelper.apply_deltas(graph.metrics_deltas)

        return graph
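
    # Illustrative sketch (not part of the original file) of the guard check
    # above: an entry compiled with a dynamic size s0 might carry
    #
    #     guards_expr = "s0 <= 4096"
    #
    # Candidates are first screened with the integer hints (e.g. hints = [1024]
    # evaluates to True), and only the winning entry is re-evaluated with the
    # live SymInts so its guards are installed in the current ShapeEnv.
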
    @staticmethod
    def _save_graph(
        key: str,
        compiled_graph: CompiledFxGraph,
        example_inputs: List[torch.Tensor],
        time_taken_ns,
        local,
        remote_cache,
    ):
        """
        Store a serialized CompiledFxGraph on disk.
        """
        disk_compiled_graph = copy(compiled_graph)
        # We can't really serialize callables that may be C++/Triton/etc.,
        # so we serialize their PyCodeCache disk cache location instead.
        # TODO: This could be better if we're ever able to serialize compiled
        # models to disk.
        disk_compiled_graph.current_callable = None

        # Before serializing, compute the guard expression that will be used to
        # ensure that a CompiledFxGraph is valid when loaded from the cache. It's
        # sufficient to consider only the SymInt args to the fx graph since the
        # Tensor shapes are already captured in the hash for the cache key. Any
        # Tensor arg with a symbolic shape will have a SymInt arg for the graph.
        shape_env = FxGraphCache._get_shape_env()
        assert shape_env is not None
        symints = FxGraphCache._filter_backed_symints(example_inputs)
        disk_compiled_graph.guards_expr = shape_env.produce_guards_expression(symints)

        try:
            content = pickle.dumps(disk_compiled_graph)
        except Exception:
            log.warning(
                "fx graph cache unable to serialize compiled graph", exc_info=True
            )
            counters["inductor"]["fxgraph_cache_pickle_error"] += 1
            return

        try:
            if local:
                subdir = FxGraphCache._get_tmp_dir_for_key(key)
                if not os.path.exists(subdir):
                    os.makedirs(subdir, exist_ok=True)

                # Use a hash of the serialized CompiledFxGraph to get a unique file
                # name. The specific name doesn't matter since a lookup involves
                # iterating over all entries in the parent subdir.
                path = os.path.join(subdir, sha256_hash(content))
                write_atomic(path, content, make_dirs=True)

            if remote_cache:
                cache_data = (
                    {
                        "data": content,
                        "time_taken_ms": time_taken_ns
                        // 1000000,  # Convert from NS to MS
                    }
                    if config.is_fbcode()
                    else content
                )
                remote_cache.put(key, cache_data)
        except Exception:
            log.warning("fx graph unable to write to cache", exc_info=True)
            counters["inductor"]["fxgraph_cache_write_error"] += 1
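
    # Illustrative sketch (not part of the original file) of the resulting local
    # on-disk layout for a hypothetical key "fab123...":
    #
    #     <cache_dir()>/fxgraph/ab/fab123.../<sha256 of pickled entry #1>
    #     <cache_dir()>/fxgraph/ab/fab123.../<sha256 of pickled entry #2>
    #
    # Multiple files under one key correspond to different guard sets for the
    # same graph hash; _lookup_graph() iterates over all of them.
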
    @staticmethod
    def _check_can_cache(gm: torch.fx.GraphModule):
        """
        Check some conditions that would preclude caching and raise BypassFxGraphCache
        to bypass in case caching is not possible.
        """
        # Freezing can embed constants that wouldn't be static across runs.
        if config.freezing or config.aot_inductor.use_runtime_constant_folding:
            raise BypassFxGraphCache

        # The treatment of guards in the caching implementation requires that
        # we have a shape env.
        if FxGraphCache._get_shape_env() is None:
            log.debug("fx graph cache no shape env")
            raise BypassFxGraphCache

        # HigherOrderOperators should be handled on a case-by-case basis.
        # Currently, we just skip caching if we have any.
        # We also skip if there are any torchbind objects.
        for node in gm.graph.nodes:
            if isinstance(node.target, torch._ops.HigherOrderOperator):
                raise BypassFxGraphCache
            if node.op == "getattr" and isinstance(
                getattr(gm, node.target), torch._C.ScriptObject
            ):
                raise BypassFxGraphCache

    @staticmethod
    def load(
        compile_fx_fn: Callable[..., Any],
        gm: torch.fx.GraphModule,
        example_inputs: List[torch.Tensor],
        fx_kwargs: Dict[str, Any],
        local: bool,
        remote: bool,
    ):
        """
        Load a compiled graph from the cache. If a cached entry does not exist,
        compile the graph and save it to the cache.
        """
        assert local or remote, "at least one of them needs to be enabled"
        compiled_graph = None
        try:
            FxGraphCache._check_can_cache(gm)
            key = compiled_fx_graph_hash(gm, example_inputs, fx_kwargs)

            remote_cache = None
            if remote:
                cache_id = "fx-graph-v1"
                try:
                    import triton

                    if config.is_fbcode():