- 
                Notifications
    You must be signed in to change notification settings 
- Fork 25.7k
Closed
Description
🐛 Describe the bug
Compared with the TorchInductor CPU Performance Dashboard on ww02.2, there is a performance regression for the Torchbench model lennard_jones on ww02.3, as shown below:
| ww02.3 | ww02.2 | |||||||||
|---|---|---|---|---|---|---|---|---|---|---|
| batch_size | speedup | inductor | eager | batch_size | speedup | inductor | eager | speedup ratio | eager ratio | inductor ratio | 
| 1000 | 0.9068 | 0.0003193 | 0.000289541 | 1000 | 0.8883 | 0.0002327 | 0.000206707 | 1.02 | 0.71 | 0.73 | 
WW02.3 SW info:
| SW | Nightly commit | Master/Main commit | 
|---|---|---|
| Pytorch | fac4361 | 73e5379 | 
| Torchbench | / | 354378b | 
| torchaudio | ecc2781 | 4a037b0 | 
| torchtext | 112d757 | c7cc5fc | 
| torchvision | ac06efe | 35f68a0 | 
| torchdata | 049fb62 | c0934b9 | 
WW02.2 SW info:
| SW | Nightly commit | Master/Main commit | 
|---|---|---|
| Pytorch | fac4361 | 73e5379 | 
| Torchbench | / | ff361c6 | 
| torchaudio | 1c98d76 | 0be8423 | 
| torchtext | 6cbfd3e | 7c7b640 | 
| torchvision | b7637f6 | 0dceac0 | 
| torchdata | 0d9aa37 | 0a0ae5d | 
graph.py
graph.py of this model on ww02.3
GRAPH_INDEX:0
class <lambda>(torch.nn.Module):
    def forward(self, arg0_1: f32[16, 1], arg1_1: f32[16], arg2_1: f32[1982689, 1], arg3_1: f32[16, 16], arg4_1: f32[16], arg5_1: f32[1982689, 1], arg6_1: f32[16, 16], arg7_1: f32[16], arg8_1: f32[1982689, 1], arg9_1: f32[16, 16], arg10_1: f32[16], arg11_1: f32[1982689, 1], arg12_1: f32[1, 16], arg13_1: f32[1], arg14_1: f32[1982689, 1], arg15_1: f32[1000, 1]):
        # File: /workspace/pytorch/torch/nn/modules/container.py:217, code: input = module(input)
        _mkl_linear: f32[1000, 16] = torch.ops.mkl._mkl_linear.default(arg15_1, arg2_1, arg0_1, arg1_1, 1000);  arg15_1 = arg2_1 = arg0_1 = arg1_1 = None
        tanh: f32[1000, 16] = torch.ops.aten.tanh.default(_mkl_linear);  _mkl_linear = None
        _mkl_linear_1: f32[1000, 16] = torch.ops.mkl._mkl_linear.default(tanh, arg5_1, arg3_1, arg4_1, 1000);  tanh = arg5_1 = arg3_1 = arg4_1 = None
        tanh_1: f32[1000, 16] = torch.ops.aten.tanh.default(_mkl_linear_1);  _mkl_linear_1 = None
        _mkl_linear_2: f32[1000, 16] = torch.ops.mkl._mkl_linear.default(tanh_1, arg8_1, arg6_1, arg7_1, 1000);  tanh_1 = arg8_1 = arg6_1 = arg7_1 = None
        tanh_2: f32[1000, 16] = torch.ops.aten.tanh.default(_mkl_linear_2);  _mkl_linear_2 = None
        _mkl_linear_3: f32[1000, 16] = torch.ops.mkl._mkl_linear.default(tanh_2, arg11_1, arg9_1, arg10_1, 1000);  tanh_2 = arg11_1 = arg9_1 = arg10_1 = None
        tanh_3: f32[1000, 16] = torch.ops.aten.tanh.default(_mkl_linear_3);  _mkl_linear_3 = None
        _mkl_linear_4: f32[1000, 1] = torch.ops.mkl._mkl_linear.default(tanh_3, arg14_1, arg12_1, arg13_1, 1000);  tanh_3 = arg14_1 = arg12_1 = arg13_1 = None
        return (_mkl_linear_4,)
        
graph.py of this model on ww02.2
GRAPH_INDEX:0
class <lambda>(torch.nn.Module):
    def forward(self, arg0_1: f32[16, 1], arg1_1: f32[16], arg2_1: f32[1982689, 1], arg3_1: f32[16, 16], arg4_1: f32[16], arg5_1: f32[1982689, 1], arg6_1: f32[16, 16], arg7_1: f32[16], arg8_1: f32[1982689, 1], arg9_1: f32[16, 16], arg10_1: f32[16], arg11_1: f32[1982689, 1], arg12_1: f32[1, 16], arg13_1: f32[1], arg14_1: f32[1982689, 1], arg15_1: f32[1000, 1]):
        # File: /workspace/pytorch/torch/nn/modules/container.py:217, code: input = module(input)
        _mkl_linear: f32[1000, 16] = torch.ops.mkl._mkl_linear.default(arg15_1, arg2_1, arg0_1, arg1_1, 1000);  arg15_1 = arg2_1 = arg0_1 = arg1_1 = None
        tanh: f32[1000, 16] = torch.ops.aten.tanh.default(_mkl_linear);  _mkl_linear = None
        _mkl_linear_1: f32[1000, 16] = torch.ops.mkl._mkl_linear.default(tanh, arg5_1, arg3_1, arg4_1, 1000);  tanh = arg5_1 = arg3_1 = arg4_1 = None
        tanh_1: f32[1000, 16] = torch.ops.aten.tanh.default(_mkl_linear_1);  _mkl_linear_1 = None
        _mkl_linear_2: f32[1000, 16] = torch.ops.mkl._mkl_linear.default(tanh_1, arg8_1, arg6_1, arg7_1, 1000);  tanh_1 = arg8_1 = arg6_1 = arg7_1 = None
        tanh_2: f32[1000, 16] = torch.ops.aten.tanh.default(_mkl_linear_2);  _mkl_linear_2 = None
        _mkl_linear_3: f32[1000, 16] = torch.ops.mkl._mkl_linear.default(tanh_2, arg11_1, arg9_1, arg10_1, 1000);  tanh_2 = arg11_1 = arg9_1 = arg10_1 = None
        tanh_3: f32[1000, 16] = torch.ops.aten.tanh.default(_mkl_linear_3);  _mkl_linear_3 = None
        _mkl_linear_4: f32[1000, 1] = torch.ops.mkl._mkl_linear.default(tanh_3, arg14_1, arg12_1, arg13_1, 1000);  tanh_3 = arg14_1 = arg12_1 = arg13_1 = None
        return (_mkl_linear_4,)
        
Minified repro
python -m torch.backends.xeon.run_cpu --node_id 0 benchmarks/dynamo/torchbench.py --performance --float32 -dcpu --output=inductor_log/ww022.csv -n50 --inductor  --no-skip --dashboard --only lennard_jones  --cold_start_latency
Metadata
Metadata
Assignees
Labels
No labels
Type
Projects
Status
Done