I've tracked down at least one commit that contributed to the performance regression in the conv kernel: a2db2f7. For example, it reduced ResNet50 throughput from 13.2 FPS to 8.3 FPS on my laptop.
; Function Attrs: noinline
; Specialized wrapper around @libjit_convDKKC8_f. It keeps a 15-parameter
; signature (4 x float*, 4 x i64*, 3 x i64, 4 x i32) but uses only the first
; four float* arguments (%0-%3); the remaining eleven incoming arguments are
; ignored and replaced by constants in the call below.
define internal void @libjit_convDKKC8_f_44_specialized(float*, float*, float*, float*, i64*, i64*, i64*, i64*, i64, i64, i64, i32, i32, i32, i32) #8 {
entry:
; The four i64* arguments point at element 0 of the globals @26 (passed
; twice), @27 and @12. The trailing scalars are i64 3, 1, 1 and i32 0, 2, 5, 4.
; NOTE(review): these scalars appear to line up, in order, with the
; filterSize / stride / pad / pixelScanFirst / numDepthRegs / sizeGroupY /
; depthStrips values listed after this function -- confirm against the
; callee's actual parameter list.
call void @libjit_convDKKC8_f(float* %0, float* %1, float* %2, float* %3, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @26, i32 0, i32 0), i64* getelementptr inbounds ([4 x i64], [4 x i64]* @26, i32 0, i32 0), i64* getelementptr inbounds ([5 x i64], [5 x i64]* @27, i32 0, i32 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* @12, i32 0, i32 0), i64 3, i64 1, i64 1, i32 0, i32 2, i32 5, i32 4)
ret void
}
filterSize: 3
stride: 1
pad: 1
pixelScanFirst: 0
numDepthRegs: 2
sizeGroupY: 5
depthStrips: 4