"""
@generated by mypy-protobuf. Do not edit manually!
isort:skip_file
Copyright 2017 The OpenXLA Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================
"""
import builtins
import collections.abc
import sys
import typing
import google.protobuf.any_pb2
import google.protobuf.descriptor
import google.protobuf.internal.containers
import google.protobuf.internal.enum_type_wrapper
import google.protobuf.message
import tensorflow.compiler.xla.service.hlo_pb2
import tensorflow.compiler.xla.xla_data_pb2
if sys.version_info >= (3, 10):
import typing as typing_extensions
else:
import typing_extensions
DESCRIPTOR: google.protobuf.descriptor.FileDescriptor
@typing.final
class CompilationEnvironmentsProto(google.protobuf.message.Message):
"""Proto version of `xla::CompilationEnvironments`."""
DESCRIPTOR: google.protobuf.descriptor.Descriptor
ENVIRONMENTS_FIELD_NUMBER: builtins.int
@property
def environments(self) -> google.protobuf.internal.containers.RepeatedCompositeFieldContainer[google.protobuf.any_pb2.Any]: ...
def __init__(
self,
*,
environments: collections.abc.Iterable[google.protobuf.any_pb2.Any] | None = ...,
) -> None: ...
def ClearField(self, field_name: typing.Literal["environments", b"environments"]) -> None: ...
global___CompilationEnvironmentsProto = CompilationEnvironmentsProto
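# Usage sketch (illustrative, not part of the generated stub): constructing a
# CompilationEnvironmentsProto from `Any`-packed environment messages. This
# assumes the generated runtime module is importable as
# `tensorflow.compiler.xla.xla_pb2`; the empty `Any` below is a placeholder
# for a real packed environment proto.
#
#   from google.protobuf import any_pb2
#   from tensorflow.compiler.xla import xla_pb2
#
#   env = any_pb2.Any()  # pack a concrete environment proto in real code
#   envs = xla_pb2.CompilationEnvironmentsProto(environments=[env])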
@typing.final
class DebugOptions(google.protobuf.message.Message):
"""Debugging options for XLA. These options may change at any time - there are
no guarantees about backward or forward compatibility for these fields.
"""
DESCRIPTOR: google.protobuf.descriptor.Descriptor
class _ShapeChecks:
ValueType = typing.NewType("ValueType", builtins.int)
V: typing_extensions.TypeAlias = ValueType
class _ShapeChecksEnumTypeWrapper(google.protobuf.internal.enum_type_wrapper._EnumTypeWrapper[DebugOptions._ShapeChecks.ValueType], builtins.type):
DESCRIPTOR: google.protobuf.descriptor.EnumDescriptor
IGNORE: DebugOptions._ShapeChecks.ValueType # 0
"""Do not insert any shape checks for dynamically shaped operations; output
buffers might contain garbage data if shapes don't match.
"""
RUNTIME: DebugOptions._ShapeChecks.ValueType # 1
"""Check shapes at runtime, will insert an extra synchronization if shapes
cannot be proven correct at compile time.
"""
COMPILE_TIME: DebugOptions._ShapeChecks.ValueType # 2
"""Will refuse to compile any program where shape correctness can not be
established at compile time.
"""
class ShapeChecks(_ShapeChecks, metaclass=_ShapeChecksEnumTypeWrapper): ...
IGNORE: DebugOptions.ShapeChecks.ValueType # 0
"""Do not insert any shape checks for dynamically shaped operations; output
buffers might contain garbage data if shapes don't match.
"""
RUNTIME: DebugOptions.ShapeChecks.ValueType # 1
"""Check shapes at runtime, will insert an extra synchronization if shapes
cannot be proven correct at compile time.
"""
COMPILE_TIME: DebugOptions.ShapeChecks.ValueType # 2
"""Will refuse to compile any program where shape correctness can not be
established at compile time.
"""
class _StepMarkerLocation:
ValueType = typing.NewType("ValueType", builtins.int)
V: typing_extensions.TypeAlias = ValueType
class _StepMarkerLocationEnumTypeWrapper(google.protobuf.internal.enum_type_wrapper._EnumTypeWrapper[DebugOptions._StepMarkerLocation.ValueType], builtins.type):
DESCRIPTOR: google.protobuf.descriptor.EnumDescriptor
STEP_MARK_AT_ENTRY: DebugOptions._StepMarkerLocation.ValueType # 0
"""Generate a step marker at the program entry. This handles the case where
each step is done by one or multiple program execution(s). Only the first
program will be tagged for generating a step marker at the program entry.
This is the default.
"""
STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP: DebugOptions._StepMarkerLocation.ValueType # 1
"""Generate a step marker at each iteration of the top level while loop,
which is assumed to be a training loop.
"""
STEP_MARK_AT_SECOND_LEVEL_WHILE_LOOP: DebugOptions._StepMarkerLocation.ValueType # 3
"""Generate a step marker at each iteration of the second level while loops,
which is assumed to be a training or eval loop.
"""
STEP_MARK_NONE: DebugOptions._StepMarkerLocation.ValueType # 2
"""No step marker generated."""
class StepMarkerLocation(_StepMarkerLocation, metaclass=_StepMarkerLocationEnumTypeWrapper): ...
STEP_MARK_AT_ENTRY: DebugOptions.StepMarkerLocation.ValueType # 0
"""Generate a step marker at the program entry. This handles the case where
each step is done by one or multiple program execution(s). Only the first
program will be tagged for generating a step marker at the program entry.
This is the default.
"""
STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP: DebugOptions.StepMarkerLocation.ValueType # 1
"""Generate a step marker at each iteration of the top level while loop,
which is assumed to be a training loop.
"""
STEP_MARK_AT_SECOND_LEVEL_WHILE_LOOP: DebugOptions.StepMarkerLocation.ValueType # 3
"""Generate a step marker at each iteration of the second level while loops,
which is assumed to be a training or eval loop.
"""
STEP_MARK_NONE: DebugOptions.StepMarkerLocation.ValueType # 2
"""No step marker generated."""
class _CommandBufferCmdType:
ValueType = typing.NewType("ValueType", builtins.int)
V: typing_extensions.TypeAlias = ValueType
class _CommandBufferCmdTypeEnumTypeWrapper(google.protobuf.internal.enum_type_wrapper._EnumTypeWrapper[DebugOptions._CommandBufferCmdType.ValueType], builtins.type):
DESCRIPTOR: google.protobuf.descriptor.EnumDescriptor
INVALID: DebugOptions._CommandBufferCmdType.ValueType # 0
FUSION: DebugOptions._CommandBufferCmdType.ValueType # 1
CUBLAS: DebugOptions._CommandBufferCmdType.ValueType # 2
CUDNN: DebugOptions._CommandBufferCmdType.ValueType # 3
COLLECTIVES: DebugOptions._CommandBufferCmdType.ValueType # 4
CONDITIONALS: DebugOptions._CommandBufferCmdType.ValueType # 5
CUSTOM_CALL: DebugOptions._CommandBufferCmdType.ValueType # 6
class CommandBufferCmdType(_CommandBufferCmdType, metaclass=_CommandBufferCmdTypeEnumTypeWrapper):
"""Commands are categorized into 5 types:
FUSION represents regular fusion kernels.
CUBLAS, CUDNN, and COLLECTIVES represent library calls.
CONDITIONALS represents control flow.
"""
INVALID: DebugOptions.CommandBufferCmdType.ValueType # 0
FUSION: DebugOptions.CommandBufferCmdType.ValueType # 1
CUBLAS: DebugOptions.CommandBufferCmdType.ValueType # 2
CUDNN: DebugOptions.CommandBufferCmdType.ValueType # 3
COLLECTIVES: DebugOptions.CommandBufferCmdType.ValueType # 4
CONDITIONALS: DebugOptions.CommandBufferCmdType.ValueType # 5
CUSTOM_CALL: DebugOptions.CommandBufferCmdType.ValueType # 6
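# Usage sketch (illustrative): these values populate the repeated
# `xla_gpu_enable_command_buffer` field declared below, selecting which
# command types may be recorded into command buffers. Assumes the generated
# runtime module `xla_pb2`.
#
#   opts = xla_pb2.DebugOptions(
#       xla_gpu_enable_command_buffer=[
#           xla_pb2.DebugOptions.FUSION,
#           xla_pb2.DebugOptions.CUBLAS,
#       ],
#   )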
class _PartitioningAlgorithm:
ValueType = typing.NewType("ValueType", builtins.int)
V: typing_extensions.TypeAlias = ValueType
class _PartitioningAlgorithmEnumTypeWrapper(google.protobuf.internal.enum_type_wrapper._EnumTypeWrapper[DebugOptions._PartitioningAlgorithm.ValueType], builtins.type):
DESCRIPTOR: google.protobuf.descriptor.EnumDescriptor
PARTITIONING_ALGORITHM_NOOP: DebugOptions._PartitioningAlgorithm.ValueType # 0
PARTITIONING_ALGORITHM_EXP0: DebugOptions._PartitioningAlgorithm.ValueType # 1
PARTITIONING_ALGORITHM_EXP1: DebugOptions._PartitioningAlgorithm.ValueType # 2
PARTITIONING_ALGORITHM_EXP2: DebugOptions._PartitioningAlgorithm.ValueType # 3
class PartitioningAlgorithm(_PartitioningAlgorithm, metaclass=_PartitioningAlgorithmEnumTypeWrapper): ...
PARTITIONING_ALGORITHM_NOOP: DebugOptions.PartitioningAlgorithm.ValueType # 0
PARTITIONING_ALGORITHM_EXP0: DebugOptions.PartitioningAlgorithm.ValueType # 1
PARTITIONING_ALGORITHM_EXP1: DebugOptions.PartitioningAlgorithm.ValueType # 2
PARTITIONING_ALGORITHM_EXP2: DebugOptions.PartitioningAlgorithm.ValueType # 3
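# Usage sketch (illustrative): choosing the partitioning algorithm for the
# PartitionAssignment pass via the `xla_partitioning_algorithm` field declared
# below, assuming the generated runtime module `xla_pb2`.
#
#   opts = xla_pb2.DebugOptions(
#       xla_partitioning_algorithm=xla_pb2.DebugOptions.PARTITIONING_ALGORITHM_NOOP,
#   )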
@typing.final
class XlaBackendExtraOptionsEntry(google.protobuf.message.Message):
DESCRIPTOR: google.protobuf.descriptor.Descriptor
KEY_FIELD_NUMBER: builtins.int
VALUE_FIELD_NUMBER: builtins.int
key: builtins.str
value: builtins.str
def __init__(
self,
*,
key: builtins.str | None = ...,
value: builtins.str | None = ...,
) -> None: ...
def ClearField(self, field_name: typing.Literal["key", b"key", "value", b"value"]) -> None: ...
XLA_HLO_GRAPH_ADDRESSES_FIELD_NUMBER: builtins.int
XLA_HLO_PROFILE_FIELD_NUMBER: builtins.int
XLA_DISABLE_HLO_PASSES_FIELD_NUMBER: builtins.int
XLA_ENABLE_HLO_PASSES_ONLY_FIELD_NUMBER: builtins.int
XLA_DISABLE_ALL_HLO_PASSES_FIELD_NUMBER: builtins.int
XLA_BACKEND_OPTIMIZATION_LEVEL_FIELD_NUMBER: builtins.int
XLA_EMBED_IR_IN_EXECUTABLE_FIELD_NUMBER: builtins.int
XLA_ELIMINATE_HLO_IMPLICIT_BROADCAST_FIELD_NUMBER: builtins.int
XLA_CPU_MULTI_THREAD_EIGEN_FIELD_NUMBER: builtins.int
XLA_GPU_CUDA_DATA_DIR_FIELD_NUMBER: builtins.int
XLA_GPU_FTZ_FIELD_NUMBER: builtins.int
XLA_LLVM_ENABLE_ALIAS_SCOPE_METADATA_FIELD_NUMBER: builtins.int
XLA_LLVM_ENABLE_NOALIAS_METADATA_FIELD_NUMBER: builtins.int
XLA_LLVM_ENABLE_INVARIANT_LOAD_METADATA_FIELD_NUMBER: builtins.int
XLA_LLVM_DISABLE_EXPENSIVE_PASSES_FIELD_NUMBER: builtins.int
XLA_TEST_ALL_OUTPUT_LAYOUTS_FIELD_NUMBER: builtins.int
XLA_TEST_ALL_INPUT_LAYOUTS_FIELD_NUMBER: builtins.int
XLA_HLO_GRAPH_SHARDING_COLOR_FIELD_NUMBER: builtins.int
XLA_CPU_USE_MKL_DNN_FIELD_NUMBER: builtins.int
XLA_CPU_USE_XLA_RUNTIME_FIELD_NUMBER: builtins.int
XLA_CPU_ENABLE_FAST_MATH_FIELD_NUMBER: builtins.int
XLA_CPU_FAST_MATH_HONOR_NANS_FIELD_NUMBER: builtins.int
XLA_CPU_FAST_MATH_HONOR_INFS_FIELD_NUMBER: builtins.int
XLA_CPU_FAST_MATH_HONOR_DIVISION_FIELD_NUMBER: builtins.int
XLA_CPU_FAST_MATH_HONOR_FUNCTIONS_FIELD_NUMBER: builtins.int
XLA_CPU_ENABLE_FAST_MIN_MAX_FIELD_NUMBER: builtins.int
XLA_GPU_ENABLE_FAST_MIN_MAX_FIELD_NUMBER: builtins.int
XLA_CPU_SPARSE_CUDA_THREADS_FIELD_NUMBER: builtins.int
XLA_ALLOW_EXCESS_PRECISION_FIELD_NUMBER: builtins.int
XLA_GPU_CRASH_ON_VERIFICATION_FAILURES_FIELD_NUMBER: builtins.int
XLA_GPU_AUTOTUNE_LEVEL_FIELD_NUMBER: builtins.int
XLA_FORCE_HOST_PLATFORM_DEVICE_COUNT_FIELD_NUMBER: builtins.int
XLA_GPU_DISABLE_GPUASM_OPTIMIZATIONS_FIELD_NUMBER: builtins.int
XLA_GPU_SHAPE_CHECKS_FIELD_NUMBER: builtins.int
XLA_HLO_EVALUATOR_USE_FAST_PATH_FIELD_NUMBER: builtins.int
XLA_ALLOW_SCALAR_INDEX_DYNAMIC_OPS_FIELD_NUMBER: builtins.int
XLA_STEP_MARKER_LOCATION_FIELD_NUMBER: builtins.int
XLA_DUMP_TO_FIELD_NUMBER: builtins.int
XLA_DUMP_HLO_MODULE_RE_FIELD_NUMBER: builtins.int
XLA_DUMP_HLO_PASS_RE_FIELD_NUMBER: builtins.int
XLA_DUMP_HLO_AS_TEXT_FIELD_NUMBER: builtins.int
XLA_DUMP_HLO_AS_PROTO_FIELD_NUMBER: builtins.int
XLA_DUMP_HLO_AS_DOT_FIELD_NUMBER: builtins.int
XLA_DUMP_HLO_AS_URL_FIELD_NUMBER: builtins.int
XLA_DUMP_HLO_AS_HTML_FIELD_NUMBER: builtins.int
XLA_DUMP_FUSION_VISUALIZATION_FIELD_NUMBER: builtins.int
XLA_DUMP_HLO_SNAPSHOTS_FIELD_NUMBER: builtins.int
XLA_DUMP_INCLUDE_TIMESTAMP_FIELD_NUMBER: builtins.int
XLA_DUMP_MAX_HLO_MODULES_FIELD_NUMBER: builtins.int
XLA_DUMP_MODULE_METADATA_FIELD_NUMBER: builtins.int
XLA_DUMP_COMPRESS_PROTOS_FIELD_NUMBER: builtins.int
XLA_DUMP_HLO_AS_LONG_TEXT_FIELD_NUMBER: builtins.int
XLA_GPU_FORCE_CONV_NCHW_FIELD_NUMBER: builtins.int
XLA_GPU_FORCE_CONV_NHWC_FIELD_NUMBER: builtins.int
XLA_GPU_PTX_FILE_FIELD_NUMBER: builtins.int
XLA_GPU_DUMP_LLVMIR_FIELD_NUMBER: builtins.int
XLA_DUMP_ENABLE_MLIR_PRETTY_FORM_FIELD_NUMBER: builtins.int
XLA_GPU_ALGORITHM_DENYLIST_PATH_FIELD_NUMBER: builtins.int
XLA_TPU_DETECT_NAN_FIELD_NUMBER: builtins.int
XLA_TPU_DETECT_INF_FIELD_NUMBER: builtins.int
XLA_CPU_ENABLE_XPROF_TRACEME_FIELD_NUMBER: builtins.int
XLA_GPU_UNSAFE_FALLBACK_TO_DRIVER_ON_PTXAS_NOT_FOUND_FIELD_NUMBER: builtins.int
XLA_GPU_ASM_EXTRA_FLAGS_FIELD_NUMBER: builtins.int
XLA_MULTIHEAP_SIZE_CONSTRAINT_PER_HEAP_FIELD_NUMBER: builtins.int
XLA_DETAILED_LOGGING_FIELD_NUMBER: builtins.int
XLA_ENABLE_DUMPING_FIELD_NUMBER: builtins.int
XLA_GPU_FORCE_COMPILATION_PARALLELISM_FIELD_NUMBER: builtins.int
XLA_GPU_ENABLE_LLVM_MODULE_COMPILATION_PARALLELISM_FIELD_NUMBER: builtins.int
XLA_GPU_DETERMINISTIC_OPS_FIELD_NUMBER: builtins.int
XLA_GPU_LLVM_IR_FILE_FIELD_NUMBER: builtins.int
XLA_GPU_ENABLE_ASYNC_COLLECTIVES_FIELD_NUMBER: builtins.int
XLA_GPU_ENABLE_ASYNC_ALL_REDUCE_FIELD_NUMBER: builtins.int
XLA_GPU_ENABLE_ASYNC_COLLECTIVE_PERMUTE_FIELD_NUMBER: builtins.int
XLA_GPU_ENABLE_ASYNC_ALL_GATHER_FIELD_NUMBER: builtins.int
XLA_GPU_ENABLE_ASYNC_REDUCE_SCATTER_FIELD_NUMBER: builtins.int
XLA_GPU_ENABLE_ASYNC_ALL_TO_ALL_FIELD_NUMBER: builtins.int
XLA_GPU_ALL_REDUCE_COMBINE_THRESHOLD_BYTES_FIELD_NUMBER: builtins.int
XLA_GPU_ALL_GATHER_COMBINE_THRESHOLD_BYTES_FIELD_NUMBER: builtins.int
XLA_GPU_REDUCE_SCATTER_COMBINE_THRESHOLD_BYTES_FIELD_NUMBER: builtins.int
XLA_GPU_ENABLE_ALL_GATHER_COMBINE_BY_DIM_FIELD_NUMBER: builtins.int
XLA_GPU_ENABLE_REDUCE_SCATTER_COMBINE_BY_DIM_FIELD_NUMBER: builtins.int
XLA_GPU_ALL_REDUCE_CONTIGUOUS_FIELD_NUMBER: builtins.int
XLA_GPU_ENABLE_REASSOCIATION_FOR_CONVERTED_AR_FIELD_NUMBER: builtins.int
XLA_GPU_ALL_REDUCE_BLUECONNECT_NUM_DEVICES_PER_HOST_FIELD_NUMBER: builtins.int
XLA_GPU_ENABLE_WHILE_LOOP_REDUCE_SCATTER_CODE_MOTION_FIELD_NUMBER: builtins.int
XLA_GPU_COLLECTIVE_INFLATION_FACTOR_FIELD_NUMBER: builtins.int
XLA_GPU_ENABLE_CUDNN_FRONTEND_FIELD_NUMBER: builtins.int
XLA_GPU_ENABLE_CUDNN_FMHA_FIELD_NUMBER: builtins.int
XLA_GPU_FUSED_ATTENTION_USE_CUDNN_RNG_FIELD_NUMBER: builtins.int
XLA_GPU_ENABLE_CUDNN_LAYER_NORM_FIELD_NUMBER: builtins.int
XLA_DUMP_DISABLE_METADATA_FIELD_NUMBER: builtins.int
XLA_DUMP_HLO_PIPELINE_RE_FIELD_NUMBER: builtins.int
XLA_GPU_STRICT_CONV_ALGORITHM_PICKER_FIELD_NUMBER: builtins.int
XLA_GPU_ENABLE_XLA_RUNTIME_EXECUTABLE_FIELD_NUMBER: builtins.int
XLA_GPU_ENABLE_CUSTOM_FUSIONS_FIELD_NUMBER: builtins.int
XLA_GPU_ENABLE_CUSTOM_FUSIONS_RE_FIELD_NUMBER: builtins.int
XLA_GPU_ENABLE_ADDRESS_COMPUTATION_FUSION_FIELD_NUMBER: builtins.int
XLA_GPU_NCCL_TERMINATION_TIMEOUT_SECONDS_FIELD_NUMBER: builtins.int
XLA_GPU_ENABLE_SHARED_CONSTANTS_FIELD_NUMBER: builtins.int
XLA_GPU_ENABLE_CUBLASLT_FIELD_NUMBER: builtins.int
XLA_GPU_ENABLE_COMMAND_BUFFER_FIELD_NUMBER: builtins.int
XLA_GPU_GRAPH_NUM_RUNS_TO_INSTANTIATE_FIELD_NUMBER: builtins.int
XLA_GPU_GRAPH_MIN_GRAPH_SIZE_FIELD_NUMBER: builtins.int
XLA_GPU_GRAPH_ENABLE_CONCURRENT_REGION_FIELD_NUMBER: builtins.int
XLA_GPU_GRAPH_EVICTION_TIMEOUT_SECONDS_FIELD_NUMBER: builtins.int
XLA_GPU_REDZONE_SCRATCH_MAX_MEGABYTES_FIELD_NUMBER: builtins.int
XLA_GPU_REDZONE_PADDING_BYTES_FIELD_NUMBER: builtins.int
XLA_GPU_SIMPLIFY_ALL_FP_CONVERSIONS_FIELD_NUMBER: builtins.int
XLA_GPU_NORMALIZE_LAYOUTS_FIELD_NUMBER: builtins.int
XLA_CPU_USE_ACL_FIELD_NUMBER: builtins.int
XLA_CPU_STRICT_DOT_CONV_MATH_FIELD_NUMBER: builtins.int
XLA_GPU_USE_RUNTIME_FUSION_FIELD_NUMBER: builtins.int
XLA_DUMP_LATENCY_HIDING_SCHEDULE_FIELD_NUMBER: builtins.int
XLA_CPU_ENABLE_MLIR_TILING_AND_FUSION_FIELD_NUMBER: builtins.int
XLA_CPU_ENABLE_CUSTOM_MATMUL_TILING_FIELD_NUMBER: builtins.int
XLA_CPU_MATMUL_TILING_M_DIM_FIELD_NUMBER: builtins.int
XLA_CPU_MATMUL_TILING_N_DIM_FIELD_NUMBER: builtins.int
XLA_CPU_MATMUL_TILING_K_DIM_FIELD_NUMBER: builtins.int
XLA_CPU_ENABLE_MLIR_FUSION_OUTLINING_FIELD_NUMBER: builtins.int
XLA_CPU_ENABLE_EXPERIMENTAL_DEALLOCATION_FIELD_NUMBER: builtins.int
XLA_GPU_ENABLE_LATENCY_HIDING_SCHEDULER_FIELD_NUMBER: builtins.int
XLA_GPU_ENABLE_HIGHEST_PRIORITY_ASYNC_STREAM_FIELD_NUMBER: builtins.int
XLA_GPU_ENABLE_ANALYTICAL_LATENCY_ESTIMATOR_FIELD_NUMBER: builtins.int
XLA_GPU_LHS_ENABLE_GPU_ASYNC_TRACKER_FIELD_NUMBER: builtins.int
XLA_GPU_PGLE_PROFILE_FILE_OR_DIRECTORY_PATH_FIELD_NUMBER: builtins.int
XLA_GPU_MEMORY_LIMIT_SLOP_FACTOR_FIELD_NUMBER: builtins.int
XLA_GPU_ENABLE_PIPELINED_COLLECTIVES_FIELD_NUMBER: builtins.int
XLA_GPU_ENABLE_PIPELINED_ALL_REDUCE_FIELD_NUMBER: builtins.int
XLA_GPU_ENABLE_PIPELINED_ALL_GATHER_FIELD_NUMBER: builtins.int
XLA_GPU_ENABLE_PIPELINED_REDUCE_SCATTER_FIELD_NUMBER: builtins.int
XLA_GPU_ENABLE_PIPELINED_P2P_FIELD_NUMBER: builtins.int
XLA_GPU_COLLECTIVE_PERMUTE_DECOMPOSER_THRESHOLD_FIELD_NUMBER: builtins.int
XLA_PARTITIONING_ALGORITHM_FIELD_NUMBER: builtins.int
XLA_GPU_ENABLE_TRITON_GEMM_FIELD_NUMBER: builtins.int
XLA_GPU_ENABLE_CUDNN_INT8X32_CONVOLUTION_REORDERING_FIELD_NUMBER: builtins.int
XLA_GPU_TRITON_GEMM_ANY_FIELD_NUMBER: builtins.int
XLA_GPU_EXHAUSTIVE_TILING_SEARCH_FIELD_NUMBER: builtins.int
XLA_GPU_ENABLE_TRITON_SOFTMAX_FUSION_FIELD_NUMBER: builtins.int
XLA_GPU_ENABLE_PRIORITY_FUSION_FIELD_NUMBER: builtins.int
XLA_GPU_DUMP_AUTOTUNE_RESULTS_TO_FIELD_NUMBER: builtins.int
XLA_GPU_LOAD_AUTOTUNE_RESULTS_FROM_FIELD_NUMBER: builtins.int
XLA_GPU_TARGET_CONFIG_FILENAME_FIELD_NUMBER: builtins.int
XLA_GPU_AUTO_SPMD_PARTITIONING_MEMORY_BUDGET_GB_FIELD_NUMBER: builtins.int
XLA_GPU_AUTO_SPMD_PARTITIONING_MEMORY_BUDGET_RATIO_FIELD_NUMBER: builtins.int
XLA_GPU_TRITON_GEMM_DISABLE_REDUCED_PRECISION_REDUCTION_FIELD_NUMBER: builtins.int
XLA_GPU_TRITON_FUSION_LEVEL_FIELD_NUMBER: builtins.int
XLA_GPU_DUMP_AUTOTUNED_TRITON_FUSIONS_FIELD_NUMBER: builtins.int
XLA_GPU_COPY_INSERTION_USE_REGION_ANALYSIS_FIELD_NUMBER: builtins.int
XLA_GPU_COLLECT_COST_MODEL_STATS_FIELD_NUMBER: builtins.int
XLA_GPU_ENABLE_SPLIT_K_AUTOTUNING_FIELD_NUMBER: builtins.int
XLA_GPU_ENABLE_REDUCTION_EPILOGUE_FUSION_FIELD_NUMBER: builtins.int
XLA_GPU_ENABLE_NCCL_CLIQUE_OPTIMIZATION_FIELD_NUMBER: builtins.int
XLA_GPU_MOCK_CUSTOM_CALLS_FIELD_NUMBER: builtins.int
XLA_GPU_CUBLAS_FALLBACK_FIELD_NUMBER: builtins.int
XLA_GPU_ENABLE_WHILE_LOOP_DOUBLE_BUFFERING_FIELD_NUMBER: builtins.int
XLA_GPU_ENSURE_MINOR_DOT_CONTRACTION_DIMS_FIELD_NUMBER: builtins.int
XLA_GPU_FILTER_KERNELS_SPILLING_REGISTERS_ON_AUTOTUNING_FIELD_NUMBER: builtins.int
XLA_DEBUG_BUFFER_ASSIGNMENT_SHOW_MAX_FIELD_NUMBER: builtins.int
XLA_GPU_LLVM_VERIFICATION_LEVEL_FIELD_NUMBER: builtins.int
XLA_GPU_ENABLE_CUB_RADIX_SORT_FIELD_NUMBER: builtins.int
XLA_GPU_THRESHOLD_FOR_WINDOWED_EINSUM_MIB_FIELD_NUMBER: builtins.int
XLA_GPU_ENABLE_TRITON_HOPPER_FIELD_NUMBER: builtins.int
XLA_GPU_ENABLE_NCCL_USER_BUFFERS_FIELD_NUMBER: builtins.int
XLA_GPU_ENABLE_LIBNVPTXCOMPILER_FIELD_NUMBER: builtins.int
XLA_GPU_ENABLE_DOT_STRENGTH_REDUCTION_FIELD_NUMBER: builtins.int
XLA_BACKEND_EXTRA_OPTIONS_FIELD_NUMBER: builtins.int
xla_hlo_graph_addresses: builtins.bool
"""Show addresses of HLO ops in graph dump."""
xla_hlo_profile: builtins.bool
"""Instrument the computation to collect per-HLO cycle counts."""
xla_disable_all_hlo_passes: builtins.bool
"""Disables all HLO passes. Notes that some passes are necessary for
correctness and the invariants that must be satisfied by "fully optimized"
HLO are different for different devices and may change over time. The only
"guarantee", such as it is, is that if you compile XLA and dump the
optimized HLO for some graph, you should be able to run it again on the
same device with the same build of XLA.
"""
xla_backend_optimization_level: builtins.int
"""Numerical optimization level for the XLA compiler backend; the specific
interpretation of this value is left to the backends.
"""
xla_embed_ir_in_executable: builtins.bool
"""Embed the compiler IR as a string in the executable."""
xla_eliminate_hlo_implicit_broadcast: builtins.bool
"""Eliminate implicit broadcasts when lowering user computations to HLO
instructions; use explicit broadcast instead.
"""
xla_cpu_multi_thread_eigen: builtins.bool
"""When generating calls to Eigen in the CPU backend, use multi-threaded Eigen
mode.
"""
xla_gpu_cuda_data_dir: builtins.str
"""Path to directory with cuda/ptx tools and libraries."""
xla_gpu_ftz: builtins.bool
"""Enable flush-to-zero semantics in the GPU backend."""
xla_llvm_enable_alias_scope_metadata: builtins.bool
"""If true, in LLVM-based backends, emit !alias.scope metadata in
generated IR.
"""
xla_llvm_enable_noalias_metadata: builtins.bool
"""If true, in LLVM-based backends, emit !noalias metadata in the
generated IR.
"""
xla_llvm_enable_invariant_load_metadata: builtins.bool
"""If true, in LLVM-based backends, emit !invariant.load metadata in
the generated IR.
"""
xla_llvm_disable_expensive_passes: builtins.bool
"""If true, a set of expensive LLVM optimization passes will not be run."""
xla_test_all_output_layouts: builtins.bool
"""This is used by ClientLibraryTestBase::ComputeAndCompare*. If true, the
computation will run n! times with all permunations of layouts for the
output shape in rank n. For example, with a 3D shape, all permutations of
the set {0, 1, 2} are tried.
"""
xla_test_all_input_layouts: builtins.bool
"""This is used by ClientLibraryTestBase::ComputeAndCompare*. If true, the
computation will run for all permunations of layouts of all input
arguments. For example, with 2 input arguments in 2D and 4D shapes, the
computation will run 2! * 4! times.
"""
xla_hlo_graph_sharding_color: builtins.bool
"""Assign colors based on sharding information when generating the Graphviz
HLO graph.
"""
xla_cpu_use_mkl_dnn: builtins.bool
"""Generate calls to MKL-DNN in the CPU backend."""
xla_cpu_use_xla_runtime: builtins.bool
"""Enable XLA Runtime in the CPU backend."""
xla_cpu_enable_fast_math: builtins.bool
"""When true, "unsafe" mathematical optimizations are enabled. These
transformations include but are not limited to:
- Reducing the precision of operations (e.g. using an approximate sin
function, or transforming x/y into x * (1/y)).
- Assuming that operations never produce or consume NaN or +/- Inf (this
behavior can be adjusted using xla_cpu_fast_math_allow_{nans|infs}).
- Assuming that +0 and -0 are indistinguishable.
"""
xla_cpu_fast_math_honor_nans: builtins.bool
"""When xla_cpu_enable_fast_math is true then this controls whether we allow
operations to produce NaNs. Ignored when xla_cpu_enable_fast_math is
false.
"""
xla_cpu_fast_math_honor_infs: builtins.bool
"""When xla_cpu_enable_fast_math is true then this controls whether we allow
operations to produce infinites. Ignored when xla_cpu_enable_fast_math is
false.
"""
xla_cpu_fast_math_honor_division: builtins.bool
"""When xla_cpu_enable_fast_math is true then this controls whether we forbid
to use the reciprocal of an argument instead of division. Ignored when
xla_cpu_enable_fast_math is false.
"""
xla_cpu_fast_math_honor_functions: builtins.bool
"""When xla_cpu_enable_fast_math is true then this controls whether we forbid
to approximate calculations for functions. Ignored when
xla_cpu_enable_fast_math is false.
"""
xla_cpu_enable_fast_min_max: builtins.bool
"""When false we lower the Minimum and Maximum hlos in the CPU backend such
that Min(NotNaN, NaN) = Min(NaN, NotNaN) = NaN. In other words, if flag
this is false we always propagate NaNs through Min and Max.
Note, this does not correspond to the exact same behavior as the gpu flag
below!
"""
xla_gpu_enable_fast_min_max: builtins.bool
"""When true we lower the Minimum and Maximum hlos in the GPU backend such
that Min(NotNaN, NaN) = Min(NaN, NotNaN) = NotNaN. In other words, if flag
this is true we don't propagate NaNs through Min and Max.
Note, this does not correspond to the exact same behavior as the cpu flag
above!
"""
xla_cpu_sparse_cuda_threads: builtins.int
"""Defines the number of CUDA threads that can be used to accelerate
a sparse computation compiled for the XLA Runtime and CPU backend.
By default (value 0), no acceleration is used. Otherwise, this
many threads may be used to accelerate sparse operations, typically
useful when accelerating structured sparsity.
"""
xla_allow_excess_precision: builtins.bool
"""Allows xla to increase the output precision of floating point operations."""
xla_gpu_crash_on_verification_failures: builtins.bool
"""Crashes the program when any kind of verification fails, instead of just
logging the failures. One example is cross checking of convolution results
among different algorithms.
"""
xla_gpu_autotune_level: builtins.int
"""0: Disable gemm and convolution autotuning.
1: Enable autotuning, but disable correctness checking.
2: Also set output buffers to random numbers during autotuning.
3: Also reset output buffers to random numbers after autotuning each
algorithm.
4+: Also check for correct outputs and for out-of-bounds reads/writes.
Default: 4.
"""
xla_force_host_platform_device_count: builtins.int
"""Force the host platform to pretend that there are these many host
"devices". All these devices are backed by the same threadpool. Defaults
to 1.
Setting this to anything other than 1 can increase overhead from context
switching but we let the user override this behavior to help run tests on
the host that run models in parallel across multiple devices.
"""
xla_gpu_disable_gpuasm_optimizations: builtins.bool
"""If set to true XLA:GPU invokes `ptxas` with -O0 (default is -O3)."""
xla_gpu_shape_checks: global___DebugOptions.ShapeChecks.ValueType
xla_hlo_evaluator_use_fast_path: builtins.bool
"""Enable fast math with eigen in the HLO evaluator."""
xla_allow_scalar_index_dynamic_ops: builtins.bool
"""Temporary option to allow support for both the R1 and the scalar index
versions of DynamicSlice and DynamicUpdateSlice. Only used for testing.
"""
xla_step_marker_location: global___DebugOptions.StepMarkerLocation.ValueType
"""Option to emit a target-specific marker to indicate the start of a training
step. The location of the marker (if any) is determined by the option
value.
"""
xla_dump_to: builtins.str
"""
BEGIN flags controlling dumping HLO modules for debugging.
When dumping is enabled, HLO modules dumped at the very beginning and end
of compilation, and optionally also during the pass pipeline.
In general, if you set one of these flags, we will try to infer reasonable
defaults for the others. For example:
* Setting --xla_dump_to=/tmp/foo without specifying a format
with --xla_dump_hlo_as_* will turn on --xla_dump_hlo_as_text.
* Setting --xla_dump_hlo_as_text without specifying --xla_dump_to will
dump to stdout.
Directory to dump into.
"""
xla_dump_hlo_module_re: builtins.str
"""If specified, will only dump modules which match this regexp."""
xla_dump_hlo_pass_re: builtins.str
"""If this flag is specified, will also dump HLO before and after passes that
match this regular expression. Set to .* to dump before/after all passes.
"""
xla_dump_hlo_as_text: builtins.bool
"""Specifies the format that HLO is dumped in. Multiple of these may be
specified.
"""
xla_dump_hlo_as_proto: builtins.bool
xla_dump_hlo_as_dot: builtins.bool
xla_dump_hlo_as_url: builtins.bool
xla_dump_hlo_as_html: builtins.bool
"""Dump HLO graphs as an HTML (DOT -> SVG inlined in HTML)"""
xla_dump_fusion_visualization: builtins.bool
"""Dump the visualization of the fusion progress."""
xla_dump_hlo_snapshots: builtins.bool
"""If true, every time an HLO module is run, we will dump an HloSnapshot
(essentially, a serialized module plus its inputs) to the --xla_dump_to
directory.
"""
xla_dump_include_timestamp: builtins.bool
"""Include a timestamp in the dumped filenames."""
xla_dump_max_hlo_modules: builtins.int
"""Max number of hlo module dumps in a directory. Set to < 0 for unbounded."""
xla_dump_module_metadata: builtins.bool
"""Dump HloModuleMetadata as a text proto for each HLO module."""
xla_dump_compress_protos: builtins.bool
"""GZip-compress protos dumped via --xla_dump_hlo_as_proto."""
xla_dump_hlo_as_long_text: builtins.bool
"""Dump HLO in long text format. Ignored unless xla_dump_hlo_as_text is true."""
xla_gpu_force_conv_nchw: builtins.bool
"""
END flags controlling dumping HLO modules.
Overrides for XLA GPU's convolution layout heuristic.
"""
xla_gpu_force_conv_nhwc: builtins.bool
xla_gpu_dump_llvmir: builtins.bool
"""Whether to dump llvm ir when compiling to ptx."""
xla_dump_enable_mlir_pretty_form: builtins.bool
"""Whether to dump mlir using pretty print form."""
xla_gpu_algorithm_denylist_path: builtins.str
"""Denylist for cuDNN convolutions."""
xla_tpu_detect_nan: builtins.bool
"""Debug options that trigger execution errors when NaN or Inf are detected."""
xla_tpu_detect_inf: builtins.bool
xla_cpu_enable_xprof_traceme: builtins.bool
"""True if TraceMe annotations are enabled for XLA:CPU."""
xla_gpu_unsafe_fallback_to_driver_on_ptxas_not_found: builtins.bool
"""It is usually preferable to not fallback to the driver; it can consume more
memory, or have bugs.
"""
xla_gpu_asm_extra_flags: builtins.str
"""Extra parameters to pass the GPU assembler."""
xla_multiheap_size_constraint_per_heap: builtins.int
"""Per-heap size constraint. New heaps will be created if per-heap max size is
reached.
"""
xla_detailed_logging: builtins.bool
"""Enable detailed logging into vlog. If this is disabled, no
compilation summary will be printed in the end of computation.
"""
xla_enable_dumping: builtins.bool
"""Enable HLO dumping. If this is disabled, no HLO modules will be dumped."""
xla_gpu_force_compilation_parallelism: builtins.int
"""Overrides normal multi-threaded compilation setting to use this many
threads. Setting to 0 (the default value) means no enforcement.
"""
xla_gpu_enable_llvm_module_compilation_parallelism: builtins.bool
xla_gpu_deterministic_ops: builtins.bool
"""Guarantees run-to-run determinism. At present, the HLO ops Scatter and
SelectAndScatter do not have deterministic XLA:GPU implementations.
Compilation errors out if these ops are encountered.
"""
xla_gpu_enable_async_collectives: builtins.bool
"""Convert synchronous collective ops into asynchronous."""
xla_gpu_enable_async_all_reduce: builtins.bool
xla_gpu_enable_async_collective_permute: builtins.bool
xla_gpu_enable_async_all_gather: builtins.bool
xla_gpu_enable_async_reduce_scatter: builtins.bool
xla_gpu_enable_async_all_to_all: builtins.bool
xla_gpu_all_reduce_combine_threshold_bytes: builtins.int
"""Size threshold (in bytes) for the GPU collective combiners."""
xla_gpu_all_gather_combine_threshold_bytes: builtins.int
xla_gpu_reduce_scatter_combine_threshold_bytes: builtins.int
xla_gpu_enable_all_gather_combine_by_dim: builtins.bool
"""Combine all-gather/scatter-reduce ops with the same dimension or
irrespective of their dimension.
"""
xla_gpu_enable_reduce_scatter_combine_by_dim: builtins.bool
xla_gpu_all_reduce_contiguous: builtins.bool
"""Combine GPU all-reduces into a single operation over a contiguous buffer."""
xla_gpu_enable_reassociation_for_converted_ar: builtins.bool
"""Enable allreduce reassociation on allreduces that are converted to a wider
type. The resulting allreduce will be promoted to a wider-typed allreduce.
"""
xla_gpu_all_reduce_blueconnect_num_devices_per_host: builtins.int
"""Number of devices per host for first stage of BlueConnect decomposition
pass. The pass will attempt to decompose all-reduces ops into a
ReduceScatter-AllReduce-AllGather sequence, with the initial ReduceScatter
being performed over all of the devices in the same host. Set to < 1 to
disable all-reduce decomposition.
"""
xla_gpu_enable_while_loop_reduce_scatter_code_motion: builtins.bool
"""Enable hoisting of reduce-scatter out of while loops."""
xla_gpu_collective_inflation_factor: builtins.int
"""Inflate collective cost by running each collective multiple times."""
xla_gpu_enable_cudnn_frontend: builtins.bool
"""Whether to use the cuDNN frontend API for convolutions when possible."""
xla_gpu_enable_cudnn_fmha: builtins.bool
xla_gpu_fused_attention_use_cudnn_rng: builtins.bool
xla_gpu_enable_cudnn_layer_norm: builtins.bool
"""Rewrite layer norm patterns into cuDNN library calls."""
xla_dump_disable_metadata: builtins.bool
"""Disable dumping metadata in HLO dumps."""
xla_dump_hlo_pipeline_re: builtins.str
"""If this flag is specified, will only dump HLO before and after passes in
the pass pipeline that matches this regular expression. Default empty value
enables dumping in all pipelines.
"""
xla_gpu_strict_conv_algorithm_picker: builtins.bool
"""If true, abort immediately when conv algorithm picker fails, rather than
logging a warning and proceeding with fallback.
"""
xla_gpu_enable_xla_runtime_executable: builtins.bool
"""If true, use XLA runtime for XLA:GPU backend."""
xla_gpu_enable_custom_fusions: builtins.bool
"""If true, XLA will try to pattern match subgraphs of HLO operations into
custom fusions registered in the current process (pre-compiled hand written
kernels, e.g. various GEMM fusions writtent in CUTLASS).
"""
xla_gpu_enable_custom_fusions_re: builtins.str
"""A regular expression enabling only a subset of custom fusions. Enabled only
if `xla_gpu_enable_custom_fusion` set to true.
"""
xla_gpu_enable_address_computation_fusion: builtins.bool
"""If true, use XLA runtime for XLA:GPU backend."""
xla_gpu_nccl_termination_timeout_seconds: builtins.int
"""Timeout in seconds before terminating jobs that are stuck in a NCCL
Rendezvous. Negative value disables the timeout and will not terminate.
"""
xla_gpu_enable_shared_constants: builtins.bool
"""Enables shared constants for XLA/GPU. This allows large constants to be
shared among multiple GPU executables.
"""
xla_gpu_enable_cublaslt: builtins.bool
"""Whether to use cuBLASLt for GEMMs on GPUs."""
xla_gpu_graph_num_runs_to_instantiate: builtins.int
"""Only instantiates a GPU graph after the captured function execution count
reaches the threshold. This constant is a heuristic to avoid creating a
large number of CUDA graph instances in memory.
"""
xla_gpu_graph_min_graph_size: builtins.int
"""This number determines how many moved instructions like fusion kernels are
required for a region to be captured as a function to be launched as a GPU
graph.
"""
xla_gpu_graph_enable_concurrent_region: builtins.bool
"""Identify concurrent regions in GPU graphs and execute them concurrently."""
xla_gpu_graph_eviction_timeout_seconds: builtins.int
"""Timeout in seconds to evict instantiated Gpu graphs from device. When XLA
instantiates new Gpu graphs, it evicts graphs that were not recently
executed to free space on device.
"""
xla_gpu_redzone_scratch_max_megabytes: builtins.int
"""Size threshold (in megabytes) for the GPU redzone scratch allocator."""
xla_gpu_redzone_padding_bytes: builtins.int
"""Amount of padding the redzone allocator will put on one side of each buffer
it allocates. (So the buffer's total size will be increased by 2x this
value.)
Higher values make it more likely that we'll catch an out-of-bounds read or
write. Smaller values consume less memory during autotuning. Note that a
fused cudnn conv has up to 6 total buffers (4 inputs, 1 output, and 1
scratch), so this can be multiplied by quite a lot.
"""
xla_gpu_simplify_all_fp_conversions: builtins.bool
"""Allows all floating-point conversions to be simplified, including those
that affect the numerics. The `FloatNormalization` pass inserts many
`f32 -> bf16 -> f32` conversion pairs. These are not removed by the
`AlgebraicSimplifier`, as that will only simplify conversions that are
no-ops, e.g. `bf16 -> f32 -> bf16`. Removing these improves accuracy.
"""
xla_gpu_normalize_layouts: builtins.bool
"""An experimental option to force all layouts present in the
after-optimizations HLO to be descending, e.g.
ShapeUtil::MakeShapeWithDescendingLayout is an identity on all
instructions.
"""
xla_cpu_use_acl: builtins.bool
"""Generate calls to Arm Compute Library in the CPU backend."""
xla_cpu_strict_dot_conv_math: builtins.bool
"""By default, XLA:CPU will run fp16 dot/conv as fp32, as this is generally
(much) faster on our hardware. Set this flag to disable this behavior.
"""
xla_gpu_use_runtime_fusion: builtins.bool
"""An option to enable using cuDNN runtime compiled fusion kernels which is
available and recommended for Ampere+ GPUs.
"""
xla_dump_latency_hiding_schedule: builtins.bool
xla_cpu_enable_mlir_tiling_and_fusion: builtins.bool
"""By default, MLIR lowering will use Linalg elementwise fusion. If this flag
is enabled, the pipeline will use tiling, fusion, peeling, vectorization
instead.
"""
xla_cpu_enable_custom_matmul_tiling: builtins.bool
"""XLA:CPU-Next tiling parameters for matmul."""
xla_cpu_matmul_tiling_m_dim: builtins.int
xla_cpu_matmul_tiling_n_dim: builtins.int
xla_cpu_matmul_tiling_k_dim: builtins.int
xla_cpu_enable_mlir_fusion_outlining: builtins.bool
xla_cpu_enable_experimental_deallocation: builtins.bool
"""If set, use the experimental deallocation pass from mlir-hlo."""
xla_gpu_enable_latency_hiding_scheduler: builtins.bool
xla_gpu_enable_highest_priority_async_stream: builtins.bool
xla_gpu_enable_analytical_latency_estimator: builtins.bool
xla_gpu_lhs_enable_gpu_async_tracker: builtins.bool
xla_gpu_pgle_profile_file_or_directory_path: builtins.str
xla_gpu_memory_limit_slop_factor: builtins.int
xla_gpu_enable_pipelined_collectives: builtins.bool
xla_gpu_enable_pipelined_all_reduce: builtins.bool
xla_gpu_enable_pipelined_all_gather: builtins.bool
xla_gpu_enable_pipelined_reduce_scatter: builtins.bool
xla_gpu_enable_pipelined_p2p: builtins.bool
xla_gpu_collective_permute_decomposer_threshold: builtins.int
"""The minimum data size in bytes to trigger collective-permute-decomposer
transformation.
"""
xla_partitioning_algorithm: global___DebugOptions.PartitioningAlgorithm.ValueType
"""The partitioning algorithm to be used in the PartitionAssignment pass."""
xla_gpu_enable_triton_gemm: builtins.bool
xla_gpu_enable_cudnn_int8x32_convolution_reordering: builtins.bool
xla_gpu_triton_gemm_any: builtins.bool
"""Creates triton fusion for all supported gemms.
To make sure only triton gemm is chosen by the autotuner run with
`xla_gpu_cublas_fallback` set to false.
"""
xla_gpu_exhaustive_tiling_search: builtins.bool
xla_gpu_enable_triton_softmax_fusion: builtins.bool
xla_gpu_enable_priority_fusion: builtins.bool
xla_gpu_dump_autotune_results_to: builtins.str
"""File to write autotune results to. It will be a binary file unless the name
ends with .txt or .textproto. Warning: The results are written at every
compilation, possibly multiple times per process. This only works on CUDA.
"""
xla_gpu_load_autotune_results_from: builtins.str
"""File to load autotune results from. It will be considered a binary file
unless the name ends with .txt or .textproto. At most one loading will
happen during the lifetime of one process, even if the first one is
unsuccessful or different file paths are passed here. This only works on
CUDA.
"""
xla_gpu_target_config_filename: builtins.str
"""Description of the target platform in GpuTargetConfigProto format; if
provided, deviceless compilation is assumed, and the current device is
ignored.
"""
xla_gpu_auto_spmd_partitioning_memory_budget_gb: builtins.int
"""Memory budget in GB per device for AutoSharding."""
xla_gpu_auto_spmd_partitioning_memory_budget_ratio: builtins.float
"""See the definition of the
xla_gpu_auto_spmd_partitioning_memory_budget_ratio flag for the meaning of
this field.
"""
xla_gpu_triton_gemm_disable_reduced_precision_reduction: builtins.bool
xla_gpu_triton_fusion_level: builtins.int
xla_gpu_dump_autotuned_triton_fusions: builtins.bool
xla_gpu_copy_insertion_use_region_analysis: builtins.bool
xla_gpu_collect_cost_model_stats: builtins.bool
"""If true, each fusion instruction will have a cost model runtime estimate in
backend config after compilation.
"""
xla_gpu_enable_split_k_autotuning: builtins.bool
xla_gpu_enable_reduction_epilogue_fusion: builtins.bool
"""Whether reduction epilogue fusion is enabled in fusion passes."""
xla_gpu_enable_nccl_clique_optimization: builtins.bool
"""Allow early return when acquiring NCCL cliques."""
xla_gpu_mock_custom_calls: builtins.bool
"""Replace custom calls with noop operations."""
xla_gpu_cublas_fallback: builtins.bool
"""Allow Triton GEMM autotuning to fall back to cuBLAS when that is
faster.
"""
xla_gpu_enable_while_loop_double_buffering: builtins.bool
"""Enable double buffering for loops."""
xla_gpu_ensure_minor_dot_contraction_dims: builtins.bool
"""Change the layout of the second triton dot operand to be column major.
Only works for (bf16 x bf16) -> bf16.
"""
xla_gpu_filter_kernels_spilling_registers_on_autotuning: builtins.bool
"""Filter out kernels that spill registers during autotuning."""
xla_debug_buffer_assignment_show_max: builtins.int
"""Maximum number of buffers to print when debugging buffer assignment."""
xla_gpu_llvm_verification_level: builtins.int
xla_gpu_enable_cub_radix_sort: builtins.bool
"""Enable radix sort using CUB."""
xla_gpu_threshold_for_windowed_einsum_mib: builtins.int
"""Threshold to enable windowed einsum (collective matmul) in MB."""
xla_gpu_enable_triton_hopper: builtins.bool
"""Enables currently disabled features within Triton for Hopper."""
xla_gpu_enable_nccl_user_buffers: builtins.bool
"""Enable NCCL user buffers."""
xla_gpu_enable_libnvptxcompiler: builtins.bool
"""If enabled, uses the libnvptxcompiler library to compile PTX to cuBIN."""
xla_gpu_enable_dot_strength_reduction: builtins.bool
@property
def xla_disable_hlo_passes(self) -> google.protobuf.internal.containers.RepeatedScalarFieldContainer[builtins.str]:
"""List of HLO passes to disable/enable. These names must exactly match the
pass names as specified by the HloPassInterface::name() method.
At least one of xla_disable_hlo_passes and xla_enable_hlo_passes_only must
be empty.
"""
@property
def xla_enable_hlo_passes_only(self) -> google.protobuf.internal.containers.RepeatedScalarFieldContainer[builtins.str]: ...
@property
def xla_gpu_ptx_file(self) -> google.protobuf.internal.containers.RepeatedScalarFieldContainer[builtins.str]:
"""Paths to files with ptx code."""
@property
def xla_gpu_llvm_ir_file(self) -> google.protobuf.internal.containers.RepeatedScalarFieldContainer[builtins.str]:
"""Paths to files with LLVM code."""
@property
def xla_gpu_enable_command_buffer(self) -> google.protobuf.internal.containers.RepeatedScalarFieldContainer[global___DebugOptions.CommandBufferCmdType.ValueType]:
"""Determine the types of commands that are recorded into command buffers."""
@property
def xla_backend_extra_options(self) -> google.protobuf.internal.containers.ScalarMap[builtins.str, builtins.str]:
"""Next id: 271
Extra options to pass to the compilation backend (e.g. LLVM); specific
interpretation of these values is left to the backend.
"""
def __init__(
self,
*,
xla_hlo_graph_addresses: builtins.bool | None = ...,
xla_hlo_profile: builtins.bool | None = ...,
xla_disable_hlo_passes: collections.abc.Iterable[builtins.str] | None = ...,
xla_enable_hlo_passes_only: collections.abc.Iterable[builtins.str] | None = ...,
xla_disable_all_hlo_passes: builtins.bool | None = ...,
xla_backend_optimization_level: builtins.int | None = ...,
xla_embed_ir_in_executable: builtins.bool | None = ...,
xla_eliminate_hlo_implicit_broadcast: builtins.bool | None = ...,
xla_cpu_multi_thread_eigen: builtins.bool | None = ...,
xla_gpu_cuda_data_dir: builtins.str | None = ...,
xla_gpu_ftz: builtins.bool | None = ...,
xla_llvm_enable_alias_scope_metadata: builtins.bool | None = ...,
xla_llvm_enable_noalias_metadata: builtins.bool | None = ...,
xla_llvm_enable_invariant_load_metadata: builtins.bool | None = ...,
xla_llvm_disable_expensive_passes: builtins.bool | None = ...,
xla_test_all_output_layouts: builtins.bool | None = ...,
xla_test_all_input_layouts: builtins.bool | None = ...,
xla_hlo_graph_sharding_color: builtins.bool | None = ...,
xla_cpu_use_mkl_dnn: builtins.bool | None = ...,
xla_cpu_use_xla_runtime: builtins.bool | None = ...,
xla_cpu_enable_fast_math: builtins.bool | None = ...,
xla_cpu_fast_math_honor_nans: builtins.bool | None = ...,
xla_cpu_fast_math_honor_infs: builtins.bool | None = ...,
xla_cpu_fast_math_honor_division: builtins.bool | None = ...,
xla_cpu_fast_math_honor_functions: builtins.bool | None = ...,
xla_cpu_enable_fast_min_max: builtins.bool | None = ...,
xla_gpu_enable_fast_min_max: builtins.bool | None = ...,
xla_cpu_sparse_cuda_threads: builtins.int | None = ...,
xla_allow_excess_precision: builtins.bool | None = ...,
xla_gpu_crash_on_verification_failures: builtins.bool | None = ...,
xla_gpu_autotune_level: builtins.int | None = ...,
xla_force_host_platform_device_count: builtins.int | None = ...,
xla_gpu_disable_gpuasm_optimizations: builtins.bool | None = ...,
xla_gpu_shape_checks: global___DebugOptions.ShapeChecks.ValueType | None = ...,
xla_hlo_evaluator_use_fast_path: builtins.bool | None = ...,
xla_allow_scalar_index_dynamic_ops: builtins.bool | None = ...,
xla_step_marker_location: global___DebugOptions.StepMarkerLocation.ValueType | None = ...,
xla_dump_to: builtins.str | None = ...,
xla_dump_hlo_module_re: builtins.str | None = ...,
xla_dump_hlo_pass_re: builtins.str | None = ...,
xla_dump_hlo_as_text: builtins.bool | None = ...,
xla_dump_hlo_as_proto: builtins.bool | None = ...,
xla_dump_hlo_as_dot: builtins.bool | None = ...,
xla_dump_hlo_as_url: builtins.bool | None = ...,
xla_dump_hlo_as_html: builtins.bool | None = ...,
xla_dump_fusion_visualization: builtins.bool | None = ...,
xla_dump_hlo_snapshots: builtins.bool | None = ...,
xla_dump_include_timestamp: builtins.bool | None = ...,
xla_dump_max_hlo_modules: builtins.int | None = ...,
xla_dump_module_metadata: builtins.bool | None = ...,
xla_dump_compress_protos: builtins.bool | None = ...,
xla_dump_hlo_as_long_text: builtins.bool | None = ...,
xla_gpu_force_conv_nchw: builtins.bool | None = ...,
xla_gpu_force_conv_nhwc: builtins.bool | None = ...,
xla_gpu_ptx_file: collections.abc.Iterable[builtins.str] | None = ...,
xla_gpu_dump_llvmir: builtins.bool | None = ...,
xla_dump_enable_mlir_pretty_form: builtins.bool | None = ...,
xla_gpu_algorithm_denylist_path: builtins.str | None = ...,
xla_tpu_detect_nan: builtins.bool | None = ...,
xla_tpu_detect_inf: builtins.bool | None = ...,
xla_cpu_enable_xprof_traceme: builtins.bool | None = ...,
xla_gpu_unsafe_fallback_to_driver_on_ptxas_not_found: builtins.bool | None = ...,
xla_gpu_asm_extra_flags: builtins.str | None = ...,
xla_multiheap_size_constraint_per_heap: builtins.int | None = ...,
xla_detailed_logging: builtins.bool | None = ...,
xla_enable_dumping: builtins.bool | None = ...,
xla_gpu_force_compilation_parallelism: builtins.int | None = ...,
xla_gpu_enable_llvm_module_compilation_parallelism: builtins.bool | None = ...,
xla_gpu_deterministic_ops: builtins.bool | None = ...,
xla_gpu_llvm_ir_file: collections.abc.Iterable[builtins.str] | None = ...,
xla_gpu_enable_async_collectives: builtins.bool | None = ...,
xla_gpu_enable_async_all_reduce: builtins.bool | None = ...,
xla_gpu_enable_async_collective_permute: builtins.bool | None = ...,
xla_gpu_enable_async_all_gather: builtins.bool | None = ...,
xla_gpu_enable_async_reduce_scatter: builtins.bool | None = ...,
xla_gpu_enable_async_all_to_all: builtins.bool | None = ...,
xla_gpu_all_reduce_combine_threshold_bytes: builtins.int | None = ...,
xla_gpu_all_gather_combine_threshold_bytes: builtins.int | None = ...,
xla_gpu_reduce_scatter_combine_threshold_bytes: builtins.int | None = ...,
xla_gpu_enable_all_gather_combine_by_dim: builtins.bool | None = ...,
xla_gpu_enable_reduce_scatter_combine_by_dim: builtins.bool | None = ...,
xla_gpu_all_reduce_contiguous: builtins.bool | None = ...,
xla_gpu_enable_reassociation_for_converted_ar: builtins.bool | None = ...,
xla_gpu_all_reduce_blueconnect_num_devices_per_host: builtins.int | None = ...,
xla_gpu_enable_while_loop_reduce_scatter_code_motion: builtins.bool | None = ...,
xla_gpu_collective_inflation_factor: builtins.int | None = ...,
xla_gpu_enable_cudnn_frontend: builtins.bool | None = ...,
xla_gpu_enable_cudnn_fmha: builtins.bool | None = ...,
xla_gpu_fused_attention_use_cudnn_rng: builtins.bool | None = ...,
xla_gpu_enable_cudnn_layer_norm: builtins.bool | None = ...,
xla_dump_disable_metadata: builtins.bool | None = ...,
xla_dump_hlo_pipeline_re: builtins.str | None = ...,
xla_gpu_strict_conv_algorithm_picker: builtins.bool | None = ...,
xla_gpu_enable_xla_runtime_executable: builtins.bool | None = ...,
xla_gpu_enable_custom_fusions: builtins.bool | None = ...,
xla_gpu_enable_custom_fusions_re: builtins.str | None = ...,
xla_gpu_enable_address_computation_fusion: builtins.bool | None = ...,
xla_gpu_nccl_termination_timeout_seconds: builtins.int | None = ...,
xla_gpu_enable_shared_constants: builtins.bool | None = ...,
xla_gpu_enable_cublaslt: builtins.bool | None = ...,
xla_gpu_enable_command_buffer: collections.abc.Iterable[global___DebugOptions.CommandBufferCmdType.ValueType] | None = ...,
xla_gpu_graph_num_runs_to_instantiate: builtins.int | None = ...,
xla_gpu_graph_min_graph_size: builtins.int | None = ...,
xla_gpu_graph_enable_concurrent_region: builtins.bool | None = ...,
xla_gpu_graph_eviction_timeout_seconds: builtins.int | None = ...,
xla_gpu_redzone_scratch_max_megabytes: builtins.int | None = ...,
xla_gpu_redzone_padding_bytes: builtins.int | None = ...,
xla_gpu_simplify_all_fp_conversions: builtins.bool | None = ...,
xla_gpu_normalize_layouts: builtins.bool | None = ...,
xla_cpu_use_acl: builtins.bool | None = ...,
xla_cpu_strict_dot_conv_math: builtins.bool | None = ...,
xla_gpu_use_runtime_fusion: builtins.bool | None = ...,
xla_dump_latency_hiding_schedule: builtins.bool | None = ...,
xla_cpu_enable_mlir_tiling_and_fusion: builtins.bool | None = ...,
xla_cpu_enable_custom_matmul_tiling: builtins.bool | None = ...,
xla_cpu_matmul_tiling_m_dim: builtins.int | None = ...,
xla_cpu_matmul_tiling_n_dim: builtins.int | None = ...,