From d14d487c45eb5d711971374dfa89a88ff7e2442c Mon Sep 17 00:00:00 2001 From: lanluo-nvidia Date: Mon, 8 Sep 2025 10:51:31 -0700 Subject: [PATCH 1/2] replace allow_complex_guards_as_runtime_assertswithprefer_deferred_runtime_asserts_over_guards --- .../torch_export_flux_dev.ipynb | 2 +- .../torch_export_flux_dev.py | 2 +- .../weight_streaming_example.ipynb | 2 +- .../weight_streaming_example.py | 2 +- .../dynamo/runtime/_MutableTorchTensorRTModule.html | 10 +++++----- .../dynamo/torch_export_flux_dev.rst.txt | 2 +- .../dynamo/weight_streaming_example.rst.txt | 2 +- docs/py_api/torch_tensorrt.html | 4 ++-- .../dynamo/torch_export_flux_dev.html | 2 +- .../dynamo/weight_streaming_example.html | 2 +- examples/apps/flux_demo.py | 2 +- examples/dynamo/torch_export_flux_dev.py | 2 +- examples/dynamo/weight_streaming_example.py | 2 +- .../dynamo/runtime/_MutableTorchTensorRTModule.py | 10 +++++----- tests/py/dynamo/models/test_engine_cache.py | 2 +- tools/llm/test_llama_components.py | 4 ++-- tools/llm/utils.py | 2 +- tools/perf/utils.py | 2 +- 18 files changed, 28 insertions(+), 28 deletions(-) diff --git a/docs/_downloads/0c66936e5fb86b43de3fa45f5f060b9d/torch_export_flux_dev.ipynb b/docs/_downloads/0c66936e5fb86b43de3fa45f5f060b9d/torch_export_flux_dev.ipynb index b35d07b032..d916f18eec 100644 --- a/docs/_downloads/0c66936e5fb86b43de3fa45f5f060b9d/torch_export_flux_dev.ipynb +++ b/docs/_downloads/0c66936e5fb86b43de3fa45f5f060b9d/torch_export_flux_dev.ipynb @@ -69,7 +69,7 @@ }, "outputs": [], "source": [ - "batch_size = 2\nBATCH = torch.export.Dim(\"batch\", min=1, max=2)\nSEQ_LEN = torch.export.Dim(\"seq_len\", min=1, max=512)\n# This particular min, max values for img_id input are recommended by torch dynamo during the export of the model.\n# To see this recommendation, you can try exporting using min=1, max=4096\nIMG_ID = torch.export.Dim(\"img_id\", min=3586, max=4096)\ndynamic_shapes = {\n \"hidden_states\": {0: BATCH},\n \"encoder_hidden_states\": {0: BATCH, 1: SEQ_LEN},\n \"pooled_projections\": {0: BATCH},\n \"timestep\": {0: BATCH},\n \"txt_ids\": {0: SEQ_LEN},\n \"img_ids\": {0: IMG_ID},\n \"guidance\": {0: BATCH},\n \"joint_attention_kwargs\": {},\n \"return_dict\": None,\n}\n# The guidance factor is of type torch.float32\ndummy_inputs = {\n \"hidden_states\": torch.randn((batch_size, 4096, 64), dtype=torch.float16).to(\n DEVICE\n ),\n \"encoder_hidden_states\": torch.randn(\n (batch_size, 512, 4096), dtype=torch.float16\n ).to(DEVICE),\n \"pooled_projections\": torch.randn((batch_size, 768), dtype=torch.float16).to(\n DEVICE\n ),\n \"timestep\": torch.tensor([1.0, 1.0], dtype=torch.float16).to(DEVICE),\n \"txt_ids\": torch.randn((512, 3), dtype=torch.float16).to(DEVICE),\n \"img_ids\": torch.randn((4096, 3), dtype=torch.float16).to(DEVICE),\n \"guidance\": torch.tensor([1.0, 1.0], dtype=torch.float32).to(DEVICE),\n \"joint_attention_kwargs\": {},\n \"return_dict\": False,\n}\n# This will create an exported program which is going to be compiled with Torch-TensorRT\nep = _export(\n backbone,\n args=(),\n kwargs=dummy_inputs,\n dynamic_shapes=dynamic_shapes,\n strict=False,\n allow_complex_guards_as_runtime_asserts=True,\n)" + "batch_size = 2\nBATCH = torch.export.Dim(\"batch\", min=1, max=2)\nSEQ_LEN = torch.export.Dim(\"seq_len\", min=1, max=512)\n# This particular min, max values for img_id input are recommended by torch dynamo during the export of the model.\n# To see this recommendation, you can try exporting using min=1, max=4096\nIMG_ID = torch.export.Dim(\"img_id\", min=3586, 
max=4096)\ndynamic_shapes = {\n \"hidden_states\": {0: BATCH},\n \"encoder_hidden_states\": {0: BATCH, 1: SEQ_LEN},\n \"pooled_projections\": {0: BATCH},\n \"timestep\": {0: BATCH},\n \"txt_ids\": {0: SEQ_LEN},\n \"img_ids\": {0: IMG_ID},\n \"guidance\": {0: BATCH},\n \"joint_attention_kwargs\": {},\n \"return_dict\": None,\n}\n# The guidance factor is of type torch.float32\ndummy_inputs = {\n \"hidden_states\": torch.randn((batch_size, 4096, 64), dtype=torch.float16).to(\n DEVICE\n ),\n \"encoder_hidden_states\": torch.randn(\n (batch_size, 512, 4096), dtype=torch.float16\n ).to(DEVICE),\n \"pooled_projections\": torch.randn((batch_size, 768), dtype=torch.float16).to(\n DEVICE\n ),\n \"timestep\": torch.tensor([1.0, 1.0], dtype=torch.float16).to(DEVICE),\n \"txt_ids\": torch.randn((512, 3), dtype=torch.float16).to(DEVICE),\n \"img_ids\": torch.randn((4096, 3), dtype=torch.float16).to(DEVICE),\n \"guidance\": torch.tensor([1.0, 1.0], dtype=torch.float32).to(DEVICE),\n \"joint_attention_kwargs\": {},\n \"return_dict\": False,\n}\n# This will create an exported program which is going to be compiled with Torch-TensorRT\nep = _export(\n backbone,\n args=(),\n kwargs=dummy_inputs,\n dynamic_shapes=dynamic_shapes,\n strict=False,\n prefer_deferred_runtime_asserts_over_guards=True,\n)" ] }, { diff --git a/docs/_downloads/11bd814a14cab34bab72bf8a16425e4a/torch_export_flux_dev.py b/docs/_downloads/11bd814a14cab34bab72bf8a16425e4a/torch_export_flux_dev.py index 8f471668f1..e46ad9ba46 100644 --- a/docs/_downloads/11bd814a14cab34bab72bf8a16425e4a/torch_export_flux_dev.py +++ b/docs/_downloads/11bd814a14cab34bab72bf8a16425e4a/torch_export_flux_dev.py @@ -92,7 +92,7 @@ kwargs=dummy_inputs, dynamic_shapes=dynamic_shapes, strict=False, - allow_complex_guards_as_runtime_asserts=True, + prefer_deferred_runtime_asserts_over_guards=True, ) # %% diff --git a/docs/_downloads/3e4586a9107efae8f87a361bd207b6e0/weight_streaming_example.ipynb b/docs/_downloads/3e4586a9107efae8f87a361bd207b6e0/weight_streaming_example.ipynb index 192555d68e..512d266542 100644 --- a/docs/_downloads/3e4586a9107efae8f87a361bd207b6e0/weight_streaming_example.ipynb +++ b/docs/_downloads/3e4586a9107efae8f87a361bd207b6e0/weight_streaming_example.ipynb @@ -22,7 +22,7 @@ }, "outputs": [], "source": [ - "import copy\nimport timeit\n\nimport numpy as np\nimport torch\nimport torch_tensorrt\nfrom transformers import AutoModelForCausalLM\n\n\ndef export_llm(model, inputs, min_seq_len=1, max_seq_len=16):\n \"\"\"\n Exports the LLM model into an ExportedProgram with dynamic shapes.\n In the case of guard failures due to some PyTorch kernel implements, we also\n try to re-export the graph by expressing them as runtime assert nodes\n \"\"\"\n with torch.no_grad():\n # max=1024 has contraint violation error. 
https://github.com/pytorch/pytorch/issues/125604\n seq_len = torch.export.Dim(\"seq_len\", min=min_seq_len, max=max_seq_len)\n position_ids = torch.arange(inputs.shape[1]).unsqueeze(0).to(inputs.device)\n try:\n print(\"Trying to export the model using torch.export.export()..\")\n # strict=False only enables aotautograd tracing and excludes dynamo.\n ep = torch.export.export(\n model,\n args=(inputs,),\n kwargs={\"position_ids\": position_ids},\n dynamic_shapes=({1: seq_len}, {1: seq_len}),\n strict=False,\n )\n except:\n print(\n \"Trying torch.export._trace._export to trace the graph since torch.export.export() failed\"\n )\n # This API is used to express the constraint violation guards as asserts in the graph.\n ep = torch.export._trace._export(\n model,\n args=(inputs,),\n kwargs={\"position_ids\": position_ids},\n dynamic_shapes=({1: seq_len}, {1: seq_len}),\n strict=False,\n allow_complex_guards_as_runtime_asserts=True,\n )\n\n return ep\n\n\ndef time_generate(model, inputs, output_seq_length, iterations=10):\n \"\"\"\n Measure the time for generating a sentence over certain number of iterations\n \"\"\"\n # We only support single input (B x seq_len) for LLMs now\n input_seq = inputs[0]\n with torch.no_grad():\n timings = []\n for _ in range(iterations):\n start_time = timeit.default_timer()\n inputs_copy = copy.copy(input_seq)\n # Greedy decoding of the model. This generates up to max_tokens.\n while inputs_copy.shape[1] <= output_seq_length:\n outputs = model(inputs_copy)\n logits = outputs.logits\n next_token_logits = logits[:, -1, :]\n next_tokens = torch.argmax(next_token_logits, dim=-1)\n inputs_copy = torch.cat([inputs_copy, next_tokens[:, None]], dim=-1)\n torch.cuda.synchronize()\n end_time = timeit.default_timer()\n timings.append(end_time - start_time)\n\n times = np.array(timings)\n time_mean_ms = np.mean(times) * 1000\n\n return time_mean_ms\n\n\n# Load the LLaMA-2 model\nDEVICE = torch.device(\"cuda:0\")\nllama_path = \"meta-llama/Llama-2-7b-chat-hf\"\nwith torch.no_grad():\n model = AutoModelForCausalLM.from_pretrained(\n llama_path, use_cache=False, attn_implementation=\"eager\"\n ).eval()\n\n# Set input and output sequence lengths\nisl = 128\nosl = 256\n\n# Create random input tensors\ninput_tensors = [torch.randint(0, 5, (1, isl), dtype=torch.int64).cuda()]\n# Convert the model to half precision (FP16)\nmodel = model.half()\n# Exports the LLM model into an ExportedProgram with dynamic shapes.\nllama2_ep = export_llm(model, input_tensors[0], max_seq_len=osl)" + "import copy\nimport timeit\n\nimport numpy as np\nimport torch\nimport torch_tensorrt\nfrom transformers import AutoModelForCausalLM\n\n\ndef export_llm(model, inputs, min_seq_len=1, max_seq_len=16):\n \"\"\"\n Exports the LLM model into an ExportedProgram with dynamic shapes.\n In the case of guard failures due to some PyTorch kernel implements, we also\n try to re-export the graph by expressing them as runtime assert nodes\n \"\"\"\n with torch.no_grad():\n # max=1024 has contraint violation error. 
https://github.com/pytorch/pytorch/issues/125604\n seq_len = torch.export.Dim(\"seq_len\", min=min_seq_len, max=max_seq_len)\n position_ids = torch.arange(inputs.shape[1]).unsqueeze(0).to(inputs.device)\n try:\n print(\"Trying to export the model using torch.export.export()..\")\n # strict=False only enables aotautograd tracing and excludes dynamo.\n ep = torch.export.export(\n model,\n args=(inputs,),\n kwargs={\"position_ids\": position_ids},\n dynamic_shapes=({1: seq_len}, {1: seq_len}),\n strict=False,\n )\n except:\n print(\n \"Trying torch.export._trace._export to trace the graph since torch.export.export() failed\"\n )\n # This API is used to express the constraint violation guards as asserts in the graph.\n ep = torch.export._trace._export(\n model,\n args=(inputs,),\n kwargs={\"position_ids\": position_ids},\n dynamic_shapes=({1: seq_len}, {1: seq_len}),\n strict=False,\n prefer_deferred_runtime_asserts_over_guards=True,\n )\n\n return ep\n\n\ndef time_generate(model, inputs, output_seq_length, iterations=10):\n \"\"\"\n Measure the time for generating a sentence over certain number of iterations\n \"\"\"\n # We only support single input (B x seq_len) for LLMs now\n input_seq = inputs[0]\n with torch.no_grad():\n timings = []\n for _ in range(iterations):\n start_time = timeit.default_timer()\n inputs_copy = copy.copy(input_seq)\n # Greedy decoding of the model. This generates up to max_tokens.\n while inputs_copy.shape[1] <= output_seq_length:\n outputs = model(inputs_copy)\n logits = outputs.logits\n next_token_logits = logits[:, -1, :]\n next_tokens = torch.argmax(next_token_logits, dim=-1)\n inputs_copy = torch.cat([inputs_copy, next_tokens[:, None]], dim=-1)\n torch.cuda.synchronize()\n end_time = timeit.default_timer()\n timings.append(end_time - start_time)\n\n times = np.array(timings)\n time_mean_ms = np.mean(times) * 1000\n\n return time_mean_ms\n\n\n# Load the LLaMA-2 model\nDEVICE = torch.device(\"cuda:0\")\nllama_path = \"meta-llama/Llama-2-7b-chat-hf\"\nwith torch.no_grad():\n model = AutoModelForCausalLM.from_pretrained(\n llama_path, use_cache=False, attn_implementation=\"eager\"\n ).eval()\n\n# Set input and output sequence lengths\nisl = 128\nosl = 256\n\n# Create random input tensors\ninput_tensors = [torch.randint(0, 5, (1, isl), dtype=torch.int64).cuda()]\n# Convert the model to half precision (FP16)\nmodel = model.half()\n# Exports the LLM model into an ExportedProgram with dynamic shapes.\nllama2_ep = export_llm(model, input_tensors[0], max_seq_len=osl)" ] }, { diff --git a/docs/_downloads/b26ba3d33b5fc57e738fb2f26cabe4e8/weight_streaming_example.py b/docs/_downloads/b26ba3d33b5fc57e738fb2f26cabe4e8/weight_streaming_example.py index 601292ba95..c477ba6df8 100644 --- a/docs/_downloads/b26ba3d33b5fc57e738fb2f26cabe4e8/weight_streaming_example.py +++ b/docs/_downloads/b26ba3d33b5fc57e738fb2f26cabe4e8/weight_streaming_example.py @@ -65,7 +65,7 @@ def export_llm(model, inputs, min_seq_len=1, max_seq_len=16): kwargs={"position_ids": position_ids}, dynamic_shapes=({1: seq_len}, {1: seq_len}), strict=False, - allow_complex_guards_as_runtime_asserts=True, + prefer_deferred_runtime_asserts_over_guards=True, ) return ep diff --git a/docs/_modules/torch_tensorrt/dynamo/runtime/_MutableTorchTensorRTModule.html b/docs/_modules/torch_tensorrt/dynamo/runtime/_MutableTorchTensorRTModule.html index c1023a8a76..f2907b1d60 100644 --- a/docs/_modules/torch_tensorrt/dynamo/runtime/_MutableTorchTensorRTModule.html +++ 
b/docs/_modules/torch_tensorrt/dynamo/runtime/_MutableTorchTensorRTModule.html
@@ -556,7 +556,7 @@ Source code for torch_tensorrt.dynamo.runtime._MutableTorchTensorRTModule
         use_python_runtime: bool = _defaults.USE_PYTHON_RUNTIME,
         immutable_weights: bool = False,
         strict: bool = True,
-        allow_complex_guards_as_runtime_asserts: bool = False,
+        prefer_deferred_runtime_asserts_over_guards: bool = False,
         weight_streaming_budget: Optional[int] = None,
         enabled_precisions: Optional[Set[Union[torch.dtype, dtype]]] = None,
         **kwargs: Any,
@@ -622,8 +622,8 @@ Source code for torch_tensorrt.dynamo.runtime._MutableTorchTensorRTModule
         self.kwarg_inputs: dict[str, Any] = {}
         self.additional_settings = kwargs
         self.strict = strict
-        self.allow_complex_guards_as_runtime_asserts = (
-            allow_complex_guards_as_runtime_asserts
+        self.prefer_deferred_runtime_asserts_over_guards = (
+            prefer_deferred_runtime_asserts_over_guards
         )
         self.use_python_runtime = use_python_runtime
         self.trt_device = to_torch_tensorrt_device(device)
@@ -800,14 +800,14 @@ Source code for torch_tensorrt.dynamo.runtime._MutableTorchTensorRTModule
     def get_exported_program(self) -> torch.export.ExportedProgram:
         def export_fn() -> torch.export.ExportedProgram:
-            if self.allow_complex_guards_as_runtime_asserts:
+            if self.prefer_deferred_runtime_asserts_over_guards:
                 return _export(
                     self.original_model,
                     self.arg_inputs,
                     kwargs=self.kwarg_inputs,
                     dynamic_shapes=self._get_total_dynamic_shapes(),
                     strict=self.strict,
-                    allow_complex_guards_as_runtime_asserts=self.allow_complex_guards_as_runtime_asserts,
+                    prefer_deferred_runtime_asserts_over_guards=self.prefer_deferred_runtime_asserts_over_guards,
                 )
             else:
                 return torch.export.export(
diff --git a/docs/_sources/tutorials/_rendered_examples/dynamo/torch_export_flux_dev.rst.txt b/docs/_sources/tutorials/_rendered_examples/dynamo/torch_export_flux_dev.rst.txt
index 3336f3bb48..84b4dbf25d 100644
--- a/docs/_sources/tutorials/_rendered_examples/dynamo/torch_export_flux_dev.rst.txt
+++ b/docs/_sources/tutorials/_rendered_examples/dynamo/torch_export_flux_dev.rst.txt
@@ -136,7 +136,7 @@ due to `0/1 specialization
Functions

-class torch_tensorrt.MutableTorchTensorRTModule(pytorch_model: Module, *, device: Optional[Union[Device, device, str]] = None, use_python_runtime: bool = False, immutable_weights: bool = False, strict: bool = True, allow_complex_guards_as_runtime_asserts: bool = False, weight_streaming_budget: Optional[int] = None, enabled_precisions: Optional[Set[Union[dtype, dtype]]] = None, **kwargs: Any)[source]
+class torch_tensorrt.MutableTorchTensorRTModule(pytorch_model: Module, *, device: Optional[Union[Device, device, str]] = None, use_python_runtime: bool = False, immutable_weights: bool = False, strict: bool = True, prefer_deferred_runtime_asserts_over_guards: bool = False, weight_streaming_budget: Optional[int] = None, enabled_precisions: Optional[Set[Union[dtype, dtype]]] = None, **kwargs: Any)[source]

Initialize a MutableTorchTensorRTModule to seamlessly manipulate it like a regular PyTorch module. All TensorRT compilation and refitting processes are handled automatically as you work with the module. Any changes to its attributes or loading a different state_dict will trigger refitting or recompilation,
@@ -654,7 +654,7 @@

Classes
-__init__(pytorch_model: Module, *, device: Optional[Union[Device, device, str]] = None, use_python_runtime: bool = False, immutable_weights: bool = False, strict: bool = True, allow_complex_guards_as_runtime_asserts: bool = False, weight_streaming_budget: Optional[int] = None, enabled_precisions: Optional[Set[Union[dtype, dtype]]] = None, **kwargs: Any) → None[source]
+__init__(pytorch_model: Module, *, device: Optional[Union[Device, device, str]] = None, use_python_runtime: bool = False, immutable_weights: bool = False, strict: bool = True, prefer_deferred_runtime_asserts_over_guards: bool = False, weight_streaming_budget: Optional[int] = None, enabled_precisions: Optional[Set[Union[dtype, dtype]]] = None, **kwargs: Any) → None[source]
Parameters

pytorch_model (torch.nn.module) – Source module that needs to be accelerated
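Note for reviewers: a minimal sketch of how the renamed flag surfaces through this public wrapper, assuming the post-rename API from this patch; MyModel and the input shape are illustrative only, not part of the change.

    import torch
    import torch_tensorrt

    class MyModel(torch.nn.Module):  # hypothetical toy module
        def forward(self, x):
            return torch.relu(x)

    # Complex dynamic-shape guards are deferred to runtime assert nodes
    # instead of failing the export, mirroring the renamed torch.export flag.
    trt_module = torch_tensorrt.MutableTorchTensorRTModule(
        MyModel().eval().cuda(),
        strict=False,
        prefer_deferred_runtime_asserts_over_guards=True,
    )
    out = trt_module(torch.randn(2, 8).cuda())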

diff --git a/docs/tutorials/_rendered_examples/dynamo/torch_export_flux_dev.html b/docs/tutorials/_rendered_examples/dynamo/torch_export_flux_dev.html
index fa0c6a8e5d..e81bfe898e 100644
--- a/docs/tutorials/_rendered_examples/dynamo/torch_export_flux_dev.html
+++ b/docs/tutorials/_rendered_examples/dynamo/torch_export_flux_dev.html
@@ -581,7 +581,7 @@

Export the backbone using torch.export
     kwargs=dummy_inputs,
     dynamic_shapes=dynamic_shapes,
     strict=False,
-    allow_complex_guards_as_runtime_asserts=True,
+    prefer_deferred_runtime_asserts_over_guards=True,
 )
diff --git a/docs/tutorials/_rendered_examples/dynamo/weight_streaming_example.html b/docs/tutorials/_rendered_examples/dynamo/weight_streaming_example.html
index 5b4b2ed275..422e845131 100644
--- a/docs/tutorials/_rendered_examples/dynamo/weight_streaming_example.html
+++ b/docs/tutorials/_rendered_examples/dynamo/weight_streaming_example.html
@@ -556,7 +556,7 @@

Imports and Model Definition
                 kwargs={"position_ids": position_ids},
                 dynamic_shapes=({1: seq_len}, {1: seq_len}),
                 strict=False,
-                allow_complex_guards_as_runtime_asserts=True,
+                prefer_deferred_runtime_asserts_over_guards=True,
             )

     return ep
diff --git a/examples/apps/flux_demo.py b/examples/apps/flux_demo.py
index c061bb5d81..2a4e1f9d5f 100644
--- a/examples/apps/flux_demo.py
+++ b/examples/apps/flux_demo.py
@@ -121,7 +121,7 @@ def forward_loop(mod):
     settings = {
         "strict": False,
-        "allow_complex_guards_as_runtime_asserts": True,
+        "prefer_deferred_runtime_asserts_over_guards": True,
         "enabled_precisions": enabled_precisions,
         "truncate_double": True,
         "min_block_size": 1,
diff --git a/examples/dynamo/torch_export_flux_dev.py b/examples/dynamo/torch_export_flux_dev.py
index 8f471668f1..e46ad9ba46 100644
--- a/examples/dynamo/torch_export_flux_dev.py
+++ b/examples/dynamo/torch_export_flux_dev.py
@@ -92,7 +92,7 @@
     kwargs=dummy_inputs,
     dynamic_shapes=dynamic_shapes,
     strict=False,
-    allow_complex_guards_as_runtime_asserts=True,
+    prefer_deferred_runtime_asserts_over_guards=True,
 )

 # %%
diff --git a/examples/dynamo/weight_streaming_example.py b/examples/dynamo/weight_streaming_example.py
index 601292ba95..c477ba6df8 100644
--- a/examples/dynamo/weight_streaming_example.py
+++ b/examples/dynamo/weight_streaming_example.py
@@ -65,7 +65,7 @@ def export_llm(model, inputs, min_seq_len=1, max_seq_len=16):
                 kwargs={"position_ids": position_ids},
                 dynamic_shapes=({1: seq_len}, {1: seq_len}),
                 strict=False,
-                allow_complex_guards_as_runtime_asserts=True,
+                prefer_deferred_runtime_asserts_over_guards=True,
             )

     return ep
diff --git a/py/torch_tensorrt/dynamo/runtime/_MutableTorchTensorRTModule.py b/py/torch_tensorrt/dynamo/runtime/_MutableTorchTensorRTModule.py
index 258449ad7b..1cffec77c2 100644
--- a/py/torch_tensorrt/dynamo/runtime/_MutableTorchTensorRTModule.py
+++ b/py/torch_tensorrt/dynamo/runtime/_MutableTorchTensorRTModule.py
@@ -68,7 +68,7 @@ def __init__(
         use_python_runtime: bool = _defaults.USE_PYTHON_RUNTIME,
         immutable_weights: bool = False,
         strict: bool = True,
-        allow_complex_guards_as_runtime_asserts: bool = False,
+        prefer_deferred_runtime_asserts_over_guards: bool = False,
         weight_streaming_budget: Optional[int] = None,
         enabled_precisions: Optional[Set[Union[torch.dtype, dtype]]] = None,
         **kwargs: Any,
@@ -134,8 +134,8 @@ def __init__(
         self.kwarg_inputs: dict[str, Any] = {}
         self.additional_settings = kwargs
         self.strict = strict
-        self.allow_complex_guards_as_runtime_asserts = (
-            allow_complex_guards_as_runtime_asserts
+        self.prefer_deferred_runtime_asserts_over_guards = (
+            prefer_deferred_runtime_asserts_over_guards
         )
         self.use_python_runtime = use_python_runtime
         self.trt_device = to_torch_tensorrt_device(device)
@@ -312,14 +312,14 @@ def refit_gm(self) -> None:
     def get_exported_program(self) -> torch.export.ExportedProgram:
         def export_fn() -> torch.export.ExportedProgram:
-            if self.allow_complex_guards_as_runtime_asserts:
+            if self.prefer_deferred_runtime_asserts_over_guards:
                 return _export(
                     self.original_model,
                     self.arg_inputs,
                     kwargs=self.kwarg_inputs,
                     dynamic_shapes=self._get_total_dynamic_shapes(),
                     strict=self.strict,
-                    allow_complex_guards_as_runtime_asserts=self.allow_complex_guards_as_runtime_asserts,
+                    prefer_deferred_runtime_asserts_over_guards=self.prefer_deferred_runtime_asserts_over_guards,
                 )
             else:
                 return torch.export.export(
diff --git a/tests/py/dynamo/models/test_engine_cache.py b/tests/py/dynamo/models/test_engine_cache.py
index 61b5d74679..5e310900aa 100644
---
a/tests/py/dynamo/models/test_engine_cache.py +++ b/tests/py/dynamo/models/test_engine_cache.py @@ -856,7 +856,7 @@ def export_llm(model, inputs, min_seq_len=1, max_seq_len=16): (inputs,), dynamic_shapes=({1: seq_len},), strict=False, - allow_complex_guards_as_runtime_asserts=True, + prefer_deferred_runtime_asserts_over_guards=True, ) return ep diff --git a/tools/llm/test_llama_components.py b/tools/llm/test_llama_components.py index ef7e59cd72..9adb51d324 100644 --- a/tools/llm/test_llama_components.py +++ b/tools/llm/test_llama_components.py @@ -79,7 +79,7 @@ def test_llama_attention(args): args=(hidden_states, position_embeddings, None), dynamic_shapes=dynamic_shapes, strict=False, - allow_complex_guards_as_runtime_asserts=True, + prefer_deferred_runtime_asserts_over_guards=True, ) with torch_tensorrt.logging.debug() if args.debug else nullcontext(): @@ -463,7 +463,7 @@ def test_llama_model(args): kwargs=kwarg_inputs, dynamic_shapes=dynamic_shapes, strict=False, - allow_complex_guards_as_runtime_asserts=True, + prefer_deferred_runtime_asserts_over_guards=True, ) with torch_tensorrt.logging.debug() if args.debug else nullcontext(): diff --git a/tools/llm/utils.py b/tools/llm/utils.py index 2c3434b0ed..842d2a597a 100644 --- a/tools/llm/utils.py +++ b/tools/llm/utils.py @@ -41,7 +41,7 @@ def export_llm(model, inputs, min_seq_len=1, max_seq_len=16): kwargs={"position_ids": position_ids}, dynamic_shapes=({1: seq_len}, {1: seq_len}), strict=False, - allow_complex_guards_as_runtime_asserts=True, + prefer_deferred_runtime_asserts_over_guards=True, ) return ep diff --git a/tools/perf/utils.py b/tools/perf/utils.py index b0bed6ff0e..13d7deac43 100644 --- a/tools/perf/utils.py +++ b/tools/perf/utils.py @@ -228,7 +228,7 @@ def export_llm(model, inputs, min_seq_len=1, max_seq_len=16): (inputs,), dynamic_shapes=({1: seq_len},), strict=False, - allow_complex_guards_as_runtime_asserts=True, + prefer_deferred_runtime_asserts_over_guards=True, ) return ep From 4fd849ace90ff4fb00a099894deb47ca2a0d79ad Mon Sep 17 00:00:00 2001 From: lanluo-nvidia Date: Tue, 9 Sep 2025 11:04:39 -0700 Subject: [PATCH 2/2] revert changes under docs --- .../torch_export_flux_dev.ipynb | 2 +- .../torch_export_flux_dev.py | 2 +- .../weight_streaming_example.ipynb | 2 +- .../weight_streaming_example.py | 2 +- .../dynamo/runtime/_MutableTorchTensorRTModule.html | 10 +++++----- .../dynamo/torch_export_flux_dev.rst.txt | 2 +- .../dynamo/weight_streaming_example.rst.txt | 2 +- docs/py_api/torch_tensorrt.html | 4 ++-- .../dynamo/torch_export_flux_dev.html | 2 +- .../dynamo/weight_streaming_example.html | 2 +- 10 files changed, 15 insertions(+), 15 deletions(-) diff --git a/docs/_downloads/0c66936e5fb86b43de3fa45f5f060b9d/torch_export_flux_dev.ipynb b/docs/_downloads/0c66936e5fb86b43de3fa45f5f060b9d/torch_export_flux_dev.ipynb index d916f18eec..b35d07b032 100644 --- a/docs/_downloads/0c66936e5fb86b43de3fa45f5f060b9d/torch_export_flux_dev.ipynb +++ b/docs/_downloads/0c66936e5fb86b43de3fa45f5f060b9d/torch_export_flux_dev.ipynb @@ -69,7 +69,7 @@ }, "outputs": [], "source": [ - "batch_size = 2\nBATCH = torch.export.Dim(\"batch\", min=1, max=2)\nSEQ_LEN = torch.export.Dim(\"seq_len\", min=1, max=512)\n# This particular min, max values for img_id input are recommended by torch dynamo during the export of the model.\n# To see this recommendation, you can try exporting using min=1, max=4096\nIMG_ID = torch.export.Dim(\"img_id\", min=3586, max=4096)\ndynamic_shapes = {\n \"hidden_states\": {0: BATCH},\n \"encoder_hidden_states\": {0: BATCH, 1: 
SEQ_LEN},\n \"pooled_projections\": {0: BATCH},\n \"timestep\": {0: BATCH},\n \"txt_ids\": {0: SEQ_LEN},\n \"img_ids\": {0: IMG_ID},\n \"guidance\": {0: BATCH},\n \"joint_attention_kwargs\": {},\n \"return_dict\": None,\n}\n# The guidance factor is of type torch.float32\ndummy_inputs = {\n \"hidden_states\": torch.randn((batch_size, 4096, 64), dtype=torch.float16).to(\n DEVICE\n ),\n \"encoder_hidden_states\": torch.randn(\n (batch_size, 512, 4096), dtype=torch.float16\n ).to(DEVICE),\n \"pooled_projections\": torch.randn((batch_size, 768), dtype=torch.float16).to(\n DEVICE\n ),\n \"timestep\": torch.tensor([1.0, 1.0], dtype=torch.float16).to(DEVICE),\n \"txt_ids\": torch.randn((512, 3), dtype=torch.float16).to(DEVICE),\n \"img_ids\": torch.randn((4096, 3), dtype=torch.float16).to(DEVICE),\n \"guidance\": torch.tensor([1.0, 1.0], dtype=torch.float32).to(DEVICE),\n \"joint_attention_kwargs\": {},\n \"return_dict\": False,\n}\n# This will create an exported program which is going to be compiled with Torch-TensorRT\nep = _export(\n backbone,\n args=(),\n kwargs=dummy_inputs,\n dynamic_shapes=dynamic_shapes,\n strict=False,\n prefer_deferred_runtime_asserts_over_guards=True,\n)" + "batch_size = 2\nBATCH = torch.export.Dim(\"batch\", min=1, max=2)\nSEQ_LEN = torch.export.Dim(\"seq_len\", min=1, max=512)\n# This particular min, max values for img_id input are recommended by torch dynamo during the export of the model.\n# To see this recommendation, you can try exporting using min=1, max=4096\nIMG_ID = torch.export.Dim(\"img_id\", min=3586, max=4096)\ndynamic_shapes = {\n \"hidden_states\": {0: BATCH},\n \"encoder_hidden_states\": {0: BATCH, 1: SEQ_LEN},\n \"pooled_projections\": {0: BATCH},\n \"timestep\": {0: BATCH},\n \"txt_ids\": {0: SEQ_LEN},\n \"img_ids\": {0: IMG_ID},\n \"guidance\": {0: BATCH},\n \"joint_attention_kwargs\": {},\n \"return_dict\": None,\n}\n# The guidance factor is of type torch.float32\ndummy_inputs = {\n \"hidden_states\": torch.randn((batch_size, 4096, 64), dtype=torch.float16).to(\n DEVICE\n ),\n \"encoder_hidden_states\": torch.randn(\n (batch_size, 512, 4096), dtype=torch.float16\n ).to(DEVICE),\n \"pooled_projections\": torch.randn((batch_size, 768), dtype=torch.float16).to(\n DEVICE\n ),\n \"timestep\": torch.tensor([1.0, 1.0], dtype=torch.float16).to(DEVICE),\n \"txt_ids\": torch.randn((512, 3), dtype=torch.float16).to(DEVICE),\n \"img_ids\": torch.randn((4096, 3), dtype=torch.float16).to(DEVICE),\n \"guidance\": torch.tensor([1.0, 1.0], dtype=torch.float32).to(DEVICE),\n \"joint_attention_kwargs\": {},\n \"return_dict\": False,\n}\n# This will create an exported program which is going to be compiled with Torch-TensorRT\nep = _export(\n backbone,\n args=(),\n kwargs=dummy_inputs,\n dynamic_shapes=dynamic_shapes,\n strict=False,\n allow_complex_guards_as_runtime_asserts=True,\n)" ] }, { diff --git a/docs/_downloads/11bd814a14cab34bab72bf8a16425e4a/torch_export_flux_dev.py b/docs/_downloads/11bd814a14cab34bab72bf8a16425e4a/torch_export_flux_dev.py index e46ad9ba46..8f471668f1 100644 --- a/docs/_downloads/11bd814a14cab34bab72bf8a16425e4a/torch_export_flux_dev.py +++ b/docs/_downloads/11bd814a14cab34bab72bf8a16425e4a/torch_export_flux_dev.py @@ -92,7 +92,7 @@ kwargs=dummy_inputs, dynamic_shapes=dynamic_shapes, strict=False, - prefer_deferred_runtime_asserts_over_guards=True, + allow_complex_guards_as_runtime_asserts=True, ) # %% diff --git a/docs/_downloads/3e4586a9107efae8f87a361bd207b6e0/weight_streaming_example.ipynb 
b/docs/_downloads/3e4586a9107efae8f87a361bd207b6e0/weight_streaming_example.ipynb index 512d266542..192555d68e 100644 --- a/docs/_downloads/3e4586a9107efae8f87a361bd207b6e0/weight_streaming_example.ipynb +++ b/docs/_downloads/3e4586a9107efae8f87a361bd207b6e0/weight_streaming_example.ipynb @@ -22,7 +22,7 @@ }, "outputs": [], "source": [ - "import copy\nimport timeit\n\nimport numpy as np\nimport torch\nimport torch_tensorrt\nfrom transformers import AutoModelForCausalLM\n\n\ndef export_llm(model, inputs, min_seq_len=1, max_seq_len=16):\n \"\"\"\n Exports the LLM model into an ExportedProgram with dynamic shapes.\n In the case of guard failures due to some PyTorch kernel implements, we also\n try to re-export the graph by expressing them as runtime assert nodes\n \"\"\"\n with torch.no_grad():\n # max=1024 has contraint violation error. https://github.com/pytorch/pytorch/issues/125604\n seq_len = torch.export.Dim(\"seq_len\", min=min_seq_len, max=max_seq_len)\n position_ids = torch.arange(inputs.shape[1]).unsqueeze(0).to(inputs.device)\n try:\n print(\"Trying to export the model using torch.export.export()..\")\n # strict=False only enables aotautograd tracing and excludes dynamo.\n ep = torch.export.export(\n model,\n args=(inputs,),\n kwargs={\"position_ids\": position_ids},\n dynamic_shapes=({1: seq_len}, {1: seq_len}),\n strict=False,\n )\n except:\n print(\n \"Trying torch.export._trace._export to trace the graph since torch.export.export() failed\"\n )\n # This API is used to express the constraint violation guards as asserts in the graph.\n ep = torch.export._trace._export(\n model,\n args=(inputs,),\n kwargs={\"position_ids\": position_ids},\n dynamic_shapes=({1: seq_len}, {1: seq_len}),\n strict=False,\n prefer_deferred_runtime_asserts_over_guards=True,\n )\n\n return ep\n\n\ndef time_generate(model, inputs, output_seq_length, iterations=10):\n \"\"\"\n Measure the time for generating a sentence over certain number of iterations\n \"\"\"\n # We only support single input (B x seq_len) for LLMs now\n input_seq = inputs[0]\n with torch.no_grad():\n timings = []\n for _ in range(iterations):\n start_time = timeit.default_timer()\n inputs_copy = copy.copy(input_seq)\n # Greedy decoding of the model. 
This generates up to max_tokens.\n while inputs_copy.shape[1] <= output_seq_length:\n outputs = model(inputs_copy)\n logits = outputs.logits\n next_token_logits = logits[:, -1, :]\n next_tokens = torch.argmax(next_token_logits, dim=-1)\n inputs_copy = torch.cat([inputs_copy, next_tokens[:, None]], dim=-1)\n torch.cuda.synchronize()\n end_time = timeit.default_timer()\n timings.append(end_time - start_time)\n\n times = np.array(timings)\n time_mean_ms = np.mean(times) * 1000\n\n return time_mean_ms\n\n\n# Load the LLaMA-2 model\nDEVICE = torch.device(\"cuda:0\")\nllama_path = \"meta-llama/Llama-2-7b-chat-hf\"\nwith torch.no_grad():\n model = AutoModelForCausalLM.from_pretrained(\n llama_path, use_cache=False, attn_implementation=\"eager\"\n ).eval()\n\n# Set input and output sequence lengths\nisl = 128\nosl = 256\n\n# Create random input tensors\ninput_tensors = [torch.randint(0, 5, (1, isl), dtype=torch.int64).cuda()]\n# Convert the model to half precision (FP16)\nmodel = model.half()\n# Exports the LLM model into an ExportedProgram with dynamic shapes.\nllama2_ep = export_llm(model, input_tensors[0], max_seq_len=osl)" + "import copy\nimport timeit\n\nimport numpy as np\nimport torch\nimport torch_tensorrt\nfrom transformers import AutoModelForCausalLM\n\n\ndef export_llm(model, inputs, min_seq_len=1, max_seq_len=16):\n \"\"\"\n Exports the LLM model into an ExportedProgram with dynamic shapes.\n In the case of guard failures due to some PyTorch kernel implements, we also\n try to re-export the graph by expressing them as runtime assert nodes\n \"\"\"\n with torch.no_grad():\n # max=1024 has contraint violation error. https://github.com/pytorch/pytorch/issues/125604\n seq_len = torch.export.Dim(\"seq_len\", min=min_seq_len, max=max_seq_len)\n position_ids = torch.arange(inputs.shape[1]).unsqueeze(0).to(inputs.device)\n try:\n print(\"Trying to export the model using torch.export.export()..\")\n # strict=False only enables aotautograd tracing and excludes dynamo.\n ep = torch.export.export(\n model,\n args=(inputs,),\n kwargs={\"position_ids\": position_ids},\n dynamic_shapes=({1: seq_len}, {1: seq_len}),\n strict=False,\n )\n except:\n print(\n \"Trying torch.export._trace._export to trace the graph since torch.export.export() failed\"\n )\n # This API is used to express the constraint violation guards as asserts in the graph.\n ep = torch.export._trace._export(\n model,\n args=(inputs,),\n kwargs={\"position_ids\": position_ids},\n dynamic_shapes=({1: seq_len}, {1: seq_len}),\n strict=False,\n allow_complex_guards_as_runtime_asserts=True,\n )\n\n return ep\n\n\ndef time_generate(model, inputs, output_seq_length, iterations=10):\n \"\"\"\n Measure the time for generating a sentence over certain number of iterations\n \"\"\"\n # We only support single input (B x seq_len) for LLMs now\n input_seq = inputs[0]\n with torch.no_grad():\n timings = []\n for _ in range(iterations):\n start_time = timeit.default_timer()\n inputs_copy = copy.copy(input_seq)\n # Greedy decoding of the model. 
This generates up to max_tokens.\n while inputs_copy.shape[1] <= output_seq_length:\n outputs = model(inputs_copy)\n logits = outputs.logits\n next_token_logits = logits[:, -1, :]\n next_tokens = torch.argmax(next_token_logits, dim=-1)\n inputs_copy = torch.cat([inputs_copy, next_tokens[:, None]], dim=-1)\n torch.cuda.synchronize()\n end_time = timeit.default_timer()\n timings.append(end_time - start_time)\n\n times = np.array(timings)\n time_mean_ms = np.mean(times) * 1000\n\n return time_mean_ms\n\n\n# Load the LLaMA-2 model\nDEVICE = torch.device(\"cuda:0\")\nllama_path = \"meta-llama/Llama-2-7b-chat-hf\"\nwith torch.no_grad():\n model = AutoModelForCausalLM.from_pretrained(\n llama_path, use_cache=False, attn_implementation=\"eager\"\n ).eval()\n\n# Set input and output sequence lengths\nisl = 128\nosl = 256\n\n# Create random input tensors\ninput_tensors = [torch.randint(0, 5, (1, isl), dtype=torch.int64).cuda()]\n# Convert the model to half precision (FP16)\nmodel = model.half()\n# Exports the LLM model into an ExportedProgram with dynamic shapes.\nllama2_ep = export_llm(model, input_tensors[0], max_seq_len=osl)" ] }, { diff --git a/docs/_downloads/b26ba3d33b5fc57e738fb2f26cabe4e8/weight_streaming_example.py b/docs/_downloads/b26ba3d33b5fc57e738fb2f26cabe4e8/weight_streaming_example.py index c477ba6df8..601292ba95 100644 --- a/docs/_downloads/b26ba3d33b5fc57e738fb2f26cabe4e8/weight_streaming_example.py +++ b/docs/_downloads/b26ba3d33b5fc57e738fb2f26cabe4e8/weight_streaming_example.py @@ -65,7 +65,7 @@ def export_llm(model, inputs, min_seq_len=1, max_seq_len=16): kwargs={"position_ids": position_ids}, dynamic_shapes=({1: seq_len}, {1: seq_len}), strict=False, - prefer_deferred_runtime_asserts_over_guards=True, + allow_complex_guards_as_runtime_asserts=True, ) return ep diff --git a/docs/_modules/torch_tensorrt/dynamo/runtime/_MutableTorchTensorRTModule.html b/docs/_modules/torch_tensorrt/dynamo/runtime/_MutableTorchTensorRTModule.html index f2907b1d60..c1023a8a76 100644 --- a/docs/_modules/torch_tensorrt/dynamo/runtime/_MutableTorchTensorRTModule.html +++ b/docs/_modules/torch_tensorrt/dynamo/runtime/_MutableTorchTensorRTModule.html @@ -556,7 +556,7 @@

@@ -556,7 +556,7 @@ Source code for torch_tensorrt.dynamo.runtime._MutableTorchTensorRTModule
         use_python_runtime: bool = _defaults.USE_PYTHON_RUNTIME,
         immutable_weights: bool = False,
         strict: bool = True,
-        prefer_deferred_runtime_asserts_over_guards: bool = False,
+        allow_complex_guards_as_runtime_asserts: bool = False,
         weight_streaming_budget: Optional[int] = None,
         enabled_precisions: Optional[Set[Union[torch.dtype, dtype]]] = None,
         **kwargs: Any,
@@ -622,8 +622,8 @@ Source code for torch_tensorrt.dynamo.runtime._MutableTorchTensorRTModule
         self.kwarg_inputs: dict[str, Any] = {}
         self.additional_settings = kwargs
         self.strict = strict
-        self.prefer_deferred_runtime_asserts_over_guards = (
-            prefer_deferred_runtime_asserts_over_guards
+        self.allow_complex_guards_as_runtime_asserts = (
+            allow_complex_guards_as_runtime_asserts
         )
         self.use_python_runtime = use_python_runtime
         self.trt_device = to_torch_tensorrt_device(device)
@@ -800,14 +800,14 @@ Source code for torch_tensorrt.dynamo.runtime._MutableTorchTensorRTModule
     def get_exported_program(self) -> torch.export.ExportedProgram:
         def export_fn() -> torch.export.ExportedProgram:
-            if self.prefer_deferred_runtime_asserts_over_guards:
+            if self.allow_complex_guards_as_runtime_asserts:
                 return _export(
                     self.original_model,
                     self.arg_inputs,
                     kwargs=self.kwarg_inputs,
                     dynamic_shapes=self._get_total_dynamic_shapes(),
                     strict=self.strict,
-                    prefer_deferred_runtime_asserts_over_guards=self.prefer_deferred_runtime_asserts_over_guards,
+                    allow_complex_guards_as_runtime_asserts=self.allow_complex_guards_as_runtime_asserts,
                 )
             else:
                 return torch.export.export(
diff --git a/docs/_sources/tutorials/_rendered_examples/dynamo/torch_export_flux_dev.rst.txt b/docs/_sources/tutorials/_rendered_examples/dynamo/torch_export_flux_dev.rst.txt
index 84b4dbf25d..3336f3bb48 100644
--- a/docs/_sources/tutorials/_rendered_examples/dynamo/torch_export_flux_dev.rst.txt
+++ b/docs/_sources/tutorials/_rendered_examples/dynamo/torch_export_flux_dev.rst.txt
@@ -136,7 +136,7 @@ due to `0/1 specialization
Functions

-class torch_tensorrt.MutableTorchTensorRTModule(pytorch_model: Module, *, device: Optional[Union[Device, device, str]] = None, use_python_runtime: bool = False, immutable_weights: bool = False, strict: bool = True, prefer_deferred_runtime_asserts_over_guards: bool = False, weight_streaming_budget: Optional[int] = None, enabled_precisions: Optional[Set[Union[dtype, dtype]]] = None, **kwargs: Any)[source]
+class torch_tensorrt.MutableTorchTensorRTModule(pytorch_model: Module, *, device: Optional[Union[Device, device, str]] = None, use_python_runtime: bool = False, immutable_weights: bool = False, strict: bool = True, allow_complex_guards_as_runtime_asserts: bool = False, weight_streaming_budget: Optional[int] = None, enabled_precisions: Optional[Set[Union[dtype, dtype]]] = None, **kwargs: Any)[source]

Initialize a MutableTorchTensorRTModule to seamlessly manipulate it like a regular PyTorch module. All TensorRT compilation and refitting processes are handled automatically as you work with the module. Any changes to its attributes or loading a different state_dict will trigger refitting or recompilation,
@@ -654,7 +654,7 @@

Classes
-__init__(pytorch_model: Module, *, device: Optional[Union[Device, device, str]] = None, use_python_runtime: bool = False, immutable_weights: bool = False, strict: bool = True, prefer_deferred_runtime_asserts_over_guards: bool = False, weight_streaming_budget: Optional[int] = None, enabled_precisions: Optional[Set[Union[dtype, dtype]]] = None, **kwargs: Any) → None[source]
+__init__(pytorch_model: Module, *, device: Optional[Union[Device, device, str]] = None, use_python_runtime: bool = False, immutable_weights: bool = False, strict: bool = True, allow_complex_guards_as_runtime_asserts: bool = False, weight_streaming_budget: Optional[int] = None, enabled_precisions: Optional[Set[Union[dtype, dtype]]] = None, **kwargs: Any) → None[source]
Parameters

pytorch_model (torch.nn.module) – Source module that needs to be accelerated
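Because this second patch only reverts the generated docs while the source keeps the new spelling, downstream code that must run against both older and newer PyTorch builds can select the keyword at runtime. A hedged sketch (the availability check is an assumption on my part; _export is a private PyTorch API, and model, inputs, and dynamic_shapes stand in for objects prepared as in the examples above):

    import inspect
    from torch.export._trace import _export

    # Pick whichever spelling this PyTorch build accepts; the parameter was
    # renamed upstream, so only one of the two names exists per version.
    _params = inspect.signature(_export).parameters
    _flag = (
        "prefer_deferred_runtime_asserts_over_guards"
        if "prefer_deferred_runtime_asserts_over_guards" in _params
        else "allow_complex_guards_as_runtime_asserts"
    )
    ep = _export(
        model,
        args=(inputs,),
        dynamic_shapes=dynamic_shapes,
        strict=False,
        **{_flag: True},
    )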

diff --git a/docs/tutorials/_rendered_examples/dynamo/torch_export_flux_dev.html b/docs/tutorials/_rendered_examples/dynamo/torch_export_flux_dev.html
index e81bfe898e..fa0c6a8e5d 100644
--- a/docs/tutorials/_rendered_examples/dynamo/torch_export_flux_dev.html
+++ b/docs/tutorials/_rendered_examples/dynamo/torch_export_flux_dev.html
@@ -581,7 +581,7 @@

Export the backbone using torch.export
     kwargs=dummy_inputs,
     dynamic_shapes=dynamic_shapes,
     strict=False,
-    prefer_deferred_runtime_asserts_over_guards=True,
+    allow_complex_guards_as_runtime_asserts=True,
 )
diff --git a/docs/tutorials/_rendered_examples/dynamo/weight_streaming_example.html b/docs/tutorials/_rendered_examples/dynamo/weight_streaming_example.html
index 422e845131..5b4b2ed275 100644
--- a/docs/tutorials/_rendered_examples/dynamo/weight_streaming_example.html
+++ b/docs/tutorials/_rendered_examples/dynamo/weight_streaming_example.html
@@ -556,7 +556,7 @@

Imports and Model Definition
                 kwargs={"position_ids": position_ids},
                 dynamic_shapes=({1: seq_len}, {1: seq_len}),
                 strict=False,
-                prefer_deferred_runtime_asserts_over_guards=True,
+                allow_complex_guards_as_runtime_asserts=True,
             )

     return ep
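For context, the export pattern these examples rely on, condensed into one place: try the public exporter first and fall back to the private _export with deferred runtime asserts only when guard constraints fail. A sketch assuming the post-rename spelling from PATCH 1/2 (the function name export_with_fallback is illustrative):

    import torch
    from torch.export import Dim, export
    from torch.export._trace import _export

    def export_with_fallback(model, inputs, min_seq_len=1, max_seq_len=16):
        seq_len = Dim("seq_len", min=min_seq_len, max=max_seq_len)
        position_ids = torch.arange(inputs.shape[1]).unsqueeze(0).to(inputs.device)
        export_kwargs = dict(
            args=(inputs,),
            kwargs={"position_ids": position_ids},
            dynamic_shapes=({1: seq_len}, {1: seq_len}),
            strict=False,
        )
        try:
            # Public API: guard failures raise a constraint violation error.
            return export(model, **export_kwargs)
        except Exception:
            # Express the failing shape guards as runtime assert nodes instead.
            return _export(
                model,
                prefer_deferred_runtime_asserts_over_guards=True,
                **export_kwargs,
            )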