[aotinductor] Forward fix a performance regression
Summary: Forward fix a performance regression caused by #110510. Once a model has been run, all the kernel pointers are initialized, so removing the if-nullptr check causes those loadKernel calls to be executed again unnecessarily when the forward function is rerun. Another way to do this is to codegen the loadKernel calls in the initializer, which I may do in a later PR.
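The guard being restored here is a lazy-initialization pattern: load each kernel the first time it is needed, then reuse the cached handle on every later forward call. A minimal Python sketch of the idea, with a hypothetical `KernelCache` and a stand-in for the expensive `loadKernel` call (names are illustrative, not from the PR):

```python
class KernelCache:
    """Lazily load each kernel once and reuse the handle on later runs."""

    def __init__(self):
        self._kernels = {}   # kernel name -> loaded handle
        self.load_count = 0  # tracks how often the expensive load runs

    def _load_kernel(self, cubin_path, mangled_name):
        # Stand-in for the expensive loadKernel() call in the C++ wrapper.
        self.load_count += 1
        return (cubin_path, mangled_name)

    def get(self, name, cubin_path, mangled_name):
        # The if-nullptr check from the fix: only load on first use.
        if name not in self._kernels:
            self._kernels[name] = self._load_kernel(cubin_path, mangled_name)
        return self._kernels[name]


cache = KernelCache()
for _ in range(3):  # three "forward" runs of the same model
    cache.get("triton_poi_fused_add_0", "kernel.cubin", "_Z6kernel")
print(cache.load_count)  # the kernel was loaded only once
```

Without the guard, every rerun of forward would pay the load cost again, which is the regression this commit fixes.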

ghstack-source-id: d2d5531df77c4e69c38e0e13c21278ca6943f0f0
Pull Request resolved: #110800
desertfire committed Oct 7, 2023
1 parent d84bcb9 commit c5567da
8 changes: 6 additions & 2 deletions torch/_inductor/codegen/wrapper.py
@@ -2021,13 +2021,17 @@ def generate_load_kernel_once(
         self, name: str, mangled_name: str, cubin_path: str, shared_mem: int
     ):
         if V.graph.aot_mode:
+            self.writeline(f"if (kernels.{name} == nullptr) {{")
             self.writeline(
-                f"""kernels.{name} = loadKernel("{cubin_path}", "{mangled_name}", {shared_mem}, this->cubin_dir_);"""
+                f""" kernels.{name} = loadKernel("{cubin_path}", "{mangled_name}", {shared_mem}, this->cubin_dir_);"""
             )
+            self.writeline("}")
         else:
+            self.writeline(f"if ({name} == nullptr) {{")
             self.writeline(
-                f"""{name} = loadKernel("{cubin_path}", "{mangled_name}", {shared_mem});"""
+                f""" {name} = loadKernel("{cubin_path}", "{mangled_name}", {shared_mem});"""
             )
+            self.writeline("}")

     def generate_args_decl(self, call_args):
         dynamic_symbols = V.graph.sizevars.free_symbols()
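To see what the patched codegen emits, here is a simplified, stand-alone sketch of `generate_load_kernel_once` that appends lines to a list instead of calling `self.writeline` (the `aot_mode` flag and argument values are illustrative):

```python
def generate_load_kernel_once(lines, name, mangled_name, cubin_path,
                              shared_mem, aot_mode):
    # Simplified version of the patched codegen method: wrap the
    # loadKernel call in an if-nullptr guard so reruns skip the load.
    if aot_mode:
        lines.append(f"if (kernels.{name} == nullptr) {{")
        lines.append(
            f""" kernels.{name} = loadKernel("{cubin_path}", "{mangled_name}", {shared_mem}, this->cubin_dir_);"""
        )
        lines.append("}")
    else:
        lines.append(f"if ({name} == nullptr) {{")
        lines.append(
            f""" {name} = loadKernel("{cubin_path}", "{mangled_name}", {shared_mem});"""
        )
        lines.append("}")


out = []
generate_load_kernel_once(out, "triton_poi_0", "_Zmangled",
                          "model.cubin", 1024, aot_mode=True)
print("\n".join(out))
```

The generated C++ now tests the cached pointer before loading, so only the first forward pass pays the `loadKernel` cost.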
