diff --git a/gpu4pyscf/df/int3c2e_bdiv.py b/gpu4pyscf/df/int3c2e_bdiv.py
index f6d6d8457..d87159255 100644
--- a/gpu4pyscf/df/int3c2e_bdiv.py
+++ b/gpu4pyscf/df/int3c2e_bdiv.py
@@ -494,6 +494,8 @@ def orbital_pair_cart2sph(self, compressed_eri3c, inplace=True):
         p0 = p1 = 0
         for (i, j), bas_ij_idx in zip(ij_tasks, self.shl_pair_idx):
             nshl_pair = bas_ij_idx.size
+            if nshl_pair == 0:
+                continue
             ci = c2s[i]
             cj = c2s[j]
             nfi, di = ci.shape
diff --git a/gpu4pyscf/df/tests/test_df_int3c2e.py b/gpu4pyscf/df/tests/test_df_int3c2e.py
index eec40ecb7..64d1a4072 100644
--- a/gpu4pyscf/df/tests/test_df_int3c2e.py
+++ b/gpu4pyscf/df/tests/test_df_int3c2e.py
@@ -192,3 +192,19 @@ def test_int3c2e_sparse1():
     dat = int3c2e_bdiv.aux_e2(mol, mol)
     ref = incore.aux_e2(mol, mol)
     assert abs(dat.get() - ref).max() < 1e-9
+
+    int3c2e_opt = int3c2e_bdiv.Int3c2eOpt(mol, mol).build()
+    ao_pair_mapping = int3c2e_opt.create_ao_pair_mapping()
+    nao = mol.nao
+    i, j = divmod(ao_pair_mapping, nao)
+    coeff = cp.asarray(int3c2e_opt.coeff)
+    aux_coeff = cp.asarray(int3c2e_opt.coeff)
+    for eri3c_batch in int3c2e_opt.int3c2e_bdiv_generator():
+        eri3c_batch = int3c2e_opt.orbital_pair_cart2sph(eri3c_batch, inplace=True)
+        dat = cp.zeros((nao*nao, nao))
+        dat[i*nao+j] = dat[j*nao+i] = eri3c_batch
+        dat = dat.reshape(nao,nao,nao)
+        dat = contract('pqr,rk->pqk', dat, aux_coeff)
+        dat = contract('pqk,qj->pjk', dat, coeff)
+        dat = contract('pjk,pi->ijk', dat, coeff)
+        assert abs(dat.get() - ref).max() < 1e-9
diff --git a/gpu4pyscf/lib/cutensor.py b/gpu4pyscf/lib/cutensor.py
index 084718b6e..24bef8353 100644
--- a/gpu4pyscf/lib/cutensor.py
+++ b/gpu4pyscf/lib/cutensor.py
@@ -56,6 +56,20 @@ def _auto_create_mode(array, mode):
 #            cutensor_dtype, alignment_req=alignment_req)
 #    return _tensor_descriptors[key]
 
+def _contract_einsum(pattern, a, b, alpha, beta, out=None, einsum=cupy.einsum):
+    if out is None:
+        out = einsum(pattern, a, b)
+        out *= alpha
+    elif beta == 0.:
+        out[:] = einsum(pattern, a, b)
+        out *= alpha
+    else:
+        out *= beta
+        tmp = einsum(pattern, a, b)
+        tmp *= alpha
+        out += tmp
+    return cupy.asarray(out, order='C')
+
 def contraction(
     pattern, a, b, alpha, beta,
     out=None,
@@ -67,6 +81,9 @@ def contraction(
     compute_desc=0,
     ws_pref=WORKSPACE_RECOMMENDED
 ):
+    if a.size == 0 or b.size == 0:
+        # cutensor does not support the 0-sized operands
+        return _contract_einsum(pattern, a, b, alpha, beta, out)
 
     pattern = pattern.replace(" ", "")
     str_a, rest = pattern.split(',')
@@ -138,22 +155,11 @@ def contraction(
     warnings.warn(f'using {contract_engine} as the tensor contraction engine.')
     def contract(pattern, a, b, alpha=1.0, beta=0.0, out=None):
         try:
-            if out is None:
-                out = einsum(pattern, a, b)
-                out *= alpha
-            elif beta == 0.:
-                out[:] = einsum(pattern, a, b)
-                out *= alpha
-            else:
-                out *= beta
-                tmp = einsum(pattern, a, b)
-                tmp *= alpha
-                out += tmp
+            return _contract_einsum(pattern, a, b, alpha, beta, out, einsum)
         except cupy.cuda.memory.OutOfMemoryError:
             print('Out of memory error caused by cupy.einsum. '
                   'It is recommended to install cutensor to resolve this.')
             raise
-        return cupy.asarray(out, order='C')
 else:
     def contract(pattern, a, b, alpha=1.0, beta=0.0, out=None):
         '''