From 9c1854e1bd8de2eb1393993927c7c899fdcaadd4 Mon Sep 17 00:00:00 2001
From: Ilhan Polat <ilhanpolat@gmail.com>
Date: Sat, 20 Nov 2021 00:28:38 +0100
Subject: [PATCH 01/22] ENH:linalg: Cythonized core expm functionality

---
 .gitignore                         |   2 +
 scipy/linalg/_matfuncs_expm.pyx.in | 722 +++++++++++++++++++++++++++++
 scipy/linalg/setup.py              |   3 +
 3 files changed, 727 insertions(+)
 create mode 100644 scipy/linalg/_matfuncs_expm.pyx.in
diff --git a/.gitignore b/.gitignore
index ab34160de68f..b22d18d96e92 100644
--- a/.gitignore
+++ b/.gitignore
@@ -198,6 +198,8 @@ scipy/linalg/cython_lapack.pyx
 scipy/linalg/src/id_dist/src/*_subr_*.f
 scipy/linalg/_matfuncs_sqrtm_triu.c
 scipy/linalg/_matfuncs_sqrtm_triu.cpp
+scipy/linalg/_matfuncs_expm.c
+scipy/linalg/_matfuncs_expm.pyx
 scipy/ndimage/src/_ni_label.c
 scipy/ndimage/src/_cytest.c
 scipy/optimize/_bglu_dense.c
diff --git a/scipy/linalg/_matfuncs_expm.pyx.in b/scipy/linalg/_matfuncs_expm.pyx.in
new file mode 100644
index 000000000000..5c3384701710
--- /dev/null
+++ b/scipy/linalg/_matfuncs_expm.pyx.in
@@ -0,0 +1,722 @@
+# cython: language_level=3
+cimport cython
+cimport numpy as cnp
+import numpy as np
+import random
+import math
+from libc.stdlib cimport malloc, free
+from libc.math cimport fabs, ceil, log2, pow
+from scipy.linalg.cython_lapack cimport (sgetrf, sgetrs, dgetrf, dgetrs,
+                                         cgetrf, cgetrs, zgetrf, zgetrs)
+from scipy.linalg.cython_blas cimport (sgemm, saxpy, sscal, scopy, sgemv,
+                                       dgemm, daxpy, dscal, dcopy, dgemv,
+                                       cgemm, caxpy, cscal, ccopy, cgemv,
+                                       zgemm, zaxpy, zscal, zcopy, dgemv,
+                                       csscal, zdscal, idamax)
+
+__all__ = ['pick_pade_structure', 'pade_UV_calc']
+
+ctypedef fused lapack_t:
+    float
+    double
+    (float complex)
+    (double complex)
+
+ctypedef fused lapack_cz_t:
+    (float complex)
+    (double complex)
+
+ctypedef fused lapack_sd_t:
+    float
+    double
+
+{{py:
+# Tempita variables
+typenames = ['float', 'double', 'float complex', 'double complex']
+precision = ['float', 'double', 'float', 'double']
+prefixes = ['s', 'd', 'c', 'z']
+}}
+# ====================== swap_c_and_f_layout : s, d, c, z ====================
+@cython.cdivision(True)
+@cython.wraparound(False)
+@cython.boundscheck(False)
+@cython.initializedcheck(False)
+cdef void swap_c_and_f_layout(lapack_t *a, lapack_t *b, int r, int c, int n) nogil:
+    """Recursive matrix transposition for square arrays"""
+    cdef int i, j, ith_row, r2, c2
+    cdef lapack_t *bb=b
+    cdef lapack_t *aa=a
+    if r < 16:
+        for j in range(c):
+            ith_row = 0
+            for i in range(r):
+            # Basically b[i*n+j] = a[j*n+i] without index math
+                bb[ith_row] = aa[i]
+                ith_row += n
+            aa += n
+            bb += 1
+    else:
+        # If tall
+        if (r > c):
+            r2 = r//2
+            swap_c_and_f_layout(a, b, r2, c, n)
+            swap_c_and_f_layout(a + r2, b+(r2)*n, r-r2, c, n)
+        else:  # Nope
+            c2 = c//2
+            swap_c_and_f_layout(a, b, r, c2, n);
+            swap_c_and_f_layout(a+(c2)*n, b+c2, r, c-c2, n)
+# ============================================================================
+
+# ========================= norm1 : s, d, c, z ===============================
+# The generic abs() function is made nogil friendly in Cython 3.x (unreleased)
+@cython.wraparound(False)
+@cython.boundscheck(False)
+@cython.initializedcheck(False)
+cdef double norm1(lapack_t[:, ::1] A):
+    """
+    Fast 1-norm computation for C-contiguous square arrays.
+    Regardless of the dtype we work in double precision to prevent overflows
+    """
+    cdef Py_ssize_t n = A.shape[0]
+    cdef Py_ssize_t i, j
+    cdef int ind = 1, intn = <int>n
+    cdef double temp = 0.
+    cdef double complex temp_cz = 0.
+    cdef double temp_sd = 0.
+    cdef double *work = <double*> malloc(n*sizeof(double))
+    if not work:
+        raise MemoryError('Internal function "norm1" failed to allocate memory.')
+    try:
+        if lapack_t in lapack_cz_t:
+            for j in range(n):
+                temp_cz = A[0, j]
+                work[j] = abs(temp_cz)
+            for i in range(1, n):
+                for j in range(n):
+                    temp_cz = A[i, j]
+                    work[j] += abs(temp_cz)
+        else:
+            for j in range(n):
+                temp_sd = A[0, j]
+                work[j] = abs(temp_sd)
+            for i in range(1, n):
+                for j in range(n):
+                    temp_sd = A[i, j]
+                    work[j] += abs(temp_sd)
+
+        ind = idamax(&intn, &work[0], &ind)
+        temp = work[ind-1]
+        return temp
+    finally:
+        free(work)
+# ============================================================================
+
+# ========================= kth power norm : d ===============================
+@cython.wraparound(False)
+@cython.boundscheck(False)
+@cython.initializedcheck(False)
+cdef double kth_power_norm_d(double* A, double* v1, double* v2, Py_ssize_t n, int spins):
+    cdef int k, int_one = 1, intn = <int>n
+    cdef double one =1., zero = 0.
+    for k in range(spins):
+        dgemv(<char*>'C', &intn, &intn, &one, &A[0], &intn, &v1[0], &int_one, &zero, &v2[0], &int_one)
+        dcopy(&intn, &v2[0], &int_one, &v1[0], &int_one)
+    k = idamax(&intn, &v1[0], &int_one)
+    return v1[k-1]
+# ============================================================================
+
+# ====================== pick_pade_structure : s, d, c, z ====================
+{{for tname, pfx, dchar, dt in zip(typenames, prefixes, ['f', 'd', 'F', 'D'],
+                                   ['float32', 'float64', 'complex64', 'complex128'],
+                                  )}}
+@cython.cdivision(True)
+@cython.wraparound(False)
+@cython.boundscheck(False)
+@cython.initializedcheck(False)
+cdef pick_pade_structure_{{ pfx }}({{ tname }}[:, ::1] A):
+    cdef Py_ssize_t n = A.shape[0], i, j, k
+    cdef int lm = 0, s = 0, intn = <int>n, n2= intn*intn, int_one = 1
+    cdef cnp.ndarray[cnp.{{ dt }}_t, ndim=3] Am = np.empty((5, n, n), dtype='{{ dchar }}')
+    cdef {{ tname }}[:, :, ::1] Amv = Am
+    cdef:
+        {{ tname }} one = 1.0, zero = 0.0
+        double normA
+        double d4, d6, d8, d10
+        double eta0, eta1, eta2, eta3, eta4
+        double u = (2.)**({{if pfx in ['s', 'c']}}-24.{{else}}-53{{endif}})
+        double two_pow_s
+        double temp
+    cdef double *absA = <double*>malloc(n*n*sizeof(double))
+    cdef double *work = <double*>malloc(n*sizeof(double))
+    cdef double *work2 = <double*>malloc(n*sizeof(double))
+    if not work or not work2 or not absA:
+        raise MemoryError('Internal function "pick_pade_structure" failed to allocate memory.')
+    cdef double [5] theta
+    cdef double [5] coeff
+    cdef char* cN = 'N'
+
+    theta[0] = 1.495585217958292e-002
+    theta[1] = 2.539398330063230e-001
+    theta[2] = 9.504178996162932e-001
+    theta[3] = 2.097847961257068e+000
+    theta[4] = 4.250000000000000e+000
+    coeff[0] = u*100800.
+    coeff[1] = u*10059033600.
+    coeff[2] = u*4487938430976000.
+    coeff[3] = u*5914384781877411840000.
+    coeff[4] = u*113250775606021113483283660800000000.
+    try:
+        for j in range(n):
+            work[j] = 1.
+
+        {{ pfx }}copy(&n2, &A[0, 0], &int_one, &Amv[0, 0, 0], &int_one)
+        for i in range(n):
+            for j in range(n):
+                absA[i*n + j] = abs(A[i, j])
+
+        # First spin = normest(|A|, 1), increase m when spun more
+        normA = kth_power_norm_d(absA, work, work2, n, 1)
+        {{ pfx }}gemm(cN, cN, &intn, &intn, &intn, &one, &Amv[0, 0, 0], &intn, &Amv[0, 0, 0], &intn, &zero, &Amv[1, 0, 0], &intn)
+        {{ pfx }}gemm(cN, cN, &intn, &intn, &intn, &one, &Amv[1, 0, 0], &intn, &Amv[1, 0, 0], &intn, &zero, &Amv[2, 0, 0], &intn)
+        {{ pfx }}gemm(cN, cN, &intn, &intn, &intn, &one, &Amv[2, 0, 0], &intn, &Amv[1, 0, 0], &intn, &zero, &Amv[3, 0, 0], &intn)
+        d4 = norm1(Amv[2]) ** (1./4.)
+        d6 = norm1(Amv[3]) ** (1./6.)
+        eta0 = max(d4, d6)
+        eta1 = eta0
+
+        # m = 3
+        temp = kth_power_norm_d(absA, work, work2, n, 6)
+        lm = max(<int>ceil(log2(temp/normA/coeff[0])/6), 0)
+        if eta0 < theta[0] and lm == 0:
+            return Am, 3, s
+
+        # m = 5
+        temp = kth_power_norm_d(absA, work, work2, n, 4)
+        lm = max(<int>ceil(log2(temp/normA/coeff[1])/10), 0)
+        if eta1 < theta[1] and lm == 0:
+            return Am, 5, s
+
+        # m = 7
+        if n < 360:
+            {{ pfx }}gemm(cN, cN, &intn, &intn, &intn, &one, &Amv[2, 0, 0], &intn, &Amv[2, 0, 0], &intn, &zero, &Amv[4, 0, 0], &intn)
+            d8 = norm1(Amv[4, :, :]) ** (1./8.)
+        else:
+            d8  = _norm1est(Am[0], m=8) ** (1./8.)
+
+        eta2 = max(d6, d8)
+        temp = kth_power_norm_d(absA, work, work2, n, 4)
+        lm = max(<int>ceil(log2(temp/normA/coeff[2])/14), 0)
+        if eta2 < theta[2] and lm == 0:
+            return Am, 7, s
+
+        # m = 9
+        temp = kth_power_norm_d(absA, work, work2, n, 4)
+        lm = max(<int>ceil(log2(temp/normA/coeff[3])/18), 0)
+        if eta2 < theta[3] and lm == 0:
+            return Am, 9, s
+
+        # m = 13
+        # Scale-square
+        if n < 360:
+            {{ pfx }}gemm(cN, cN, &intn, &intn, &intn, &one, &Amv[3, 0, 0], &intn, &Amv[2, 0, 0], &intn, &zero, &Amv[4, 0, 0], &intn)
+            d10 = norm1(Amv[4, :, :]) ** (1./10.)
+        else:
+            d10  = _norm1est(Am[0], m=10) ** (1./10.)
+
+        eta3 = max(d8, d10)
+        eta4 = min(eta2, eta3)
+        s = max(<int>ceil(log2(eta4/theta[4])), 0)
+        if s != 0:
+            two_pow_s = 2.** (-s)
+            dscal(&n2, &two_pow_s, absA, &int_one)
+            # kth_power_norm has spun 19 times already
+            two_pow_s = 2.** ((-s)*19.)
+            for i in range(n):
+                work[i] *= two_pow_s
+            normA *= 2.**(-s)
+        temp = kth_power_norm_d(absA, work, work2, n, 8)
+        s += max(<int>ceil(log2(temp/normA/coeff[4])/26), 0)
+        return Am, 13, s
+    finally:
+        free(work)
+        free(work2)
+        free(absA)
+
+
+{{endfor}}
+# ============================================================================
+
+# ====================== pade_m_UV_calc : s, d, c, z =========================
+# Note: MSVC does not accept "Memview[i, j, k] += 1.+0.j" as a valid operation
+# and results with error "C2088: '+=': illegal for struct".
+# OCD is unbearable but we do explicit addition for that reason.
+{{for tname, pfx, prec, sc in zip(typenames, prefixes, precision, ['s', 'd', 'cs', 'zd'])}}
+@cython.wraparound(False)
+@cython.boundscheck(False)
+@cython.initializedcheck(False)
+cdef void pade_357_UV_calc_{{ pfx }}({{ tname }}[:, :, ::]Am, int n, int m) nogil:
+    cdef {{ tname }} b[7]
+    cdef int *ipiv = <int*>malloc(n*sizeof(int))
+    cdef int i, info, n2 = n*n, int_one = 1
+    cdef {{ prec }} two=2.0
+    cdef {{ tname }} one = 1.0, zero = 0.0, neg_one = -1.0
+    if not ipiv:
+        raise MemoryError('Internal function "pade_357_UV_calc_c" failed to allocate memory.')
+    # b[m] is always 1. hence skipped
+    if m == 3:
+        b[0] = 120.
+        b[1] = 60.
+        b[2] = 12.
+    elif m == 5:
+        b[0] = 30240.
+        b[1] = 15120.
+        b[2] = 3360.
+        b[3] = 420.
+        b[4] = 30.
+    elif m == 7:
+        b[0] = 17297280.
+        b[1] = 8648640.
+        b[2] = 1995840.
+        b[3] = 277200.
+        b[4] = 25200.
+        b[5] = 1512.
+        b[6] = 56.
+    else:
+        raise ValueError(f'Internal function "pade_357_UV_calc_c" received an invalid value {m}')
+
+    # Utilize the unused powers of Am as scratch memory
+    if m == 3:
+        # U = Am[0] @ Am[1] + 60.*Am[0]
+        {{ pfx }}copy(&n2, &Am[0, 0, 0], &int_one, &Am[3, 0, 0], &int_one)
+        {{ pfx }}gemm(<char*>'N', <char*>'N', &n, &n, &n, &one, &Am[0, 0, 0], &n, &Am[1, 0, 0], &n, &b[1], &Am[3, 0, 0], &n)
+        # V = 12.*Am[1] + 120*I_n
+        {{ pfx }}scal(&n2, &b[2], &Am[1, 0, 0], &int_one)
+        for i in range(n):
+            Am[1, i, i] = Am[1, i, i] + b[0]
+
+    elif m == 5:
+        # U = Am[0] @ (b[5]*Am[2] + b[3]*Am[1] + b[1]*I_n)
+        {{ pfx }}copy(&n2, &Am[1, 0, 0], &int_one, &Am[4, 0, 0], &int_one)
+        {{ pfx }}scal(&n2, &b[3], &Am[4, 0, 0], &int_one)
+        {{ pfx }}axpy(&n2, &b[5], &Am[2, 0, 0], &int_one, &Am[4, 0, 0], &int_one)
+        for i in range(n):
+            Am[4, i, i] = Am[4, i, i] + b[1]
+        {{ pfx }}gemm(<char*>'N', <char*>'N', &n, &n, &n, &one, &Am[0, 0, 0], &n, &Am[4, 0, 0], &n, &zero, &Am[3, 0, 0], &n)
+        # V = b[4]*Am[2] + b[2]*Am[1] + b[0]*I_n
+        {{ pfx }}scal(&n2, &b[2], &Am[1, 0, 0], &int_one)
+        {{ pfx }}axpy(&n2, &b[4], &Am[2, 0, 0], &int_one, &Am[1, 0, 0], &int_one)
+        for i in range(n):
+            Am[1, i, i] = Am[1, i, i] + b[0]
+
+    else:
+        # U = Am[0] @ (b[7]*Am[3] + b[5]*Am[2] + b[3]*Am[1] + b[1]*I_n)
+        {{ pfx }}copy(&n2, &Am[1, 0, 0], &int_one, &Am[4, 0, 0], &int_one)
+        {{ pfx }}scal(&n2, &b[3], &Am[4, 0, 0], &int_one)
+        {{ pfx }}axpy(&n2, &b[5], &Am[2, 0, 0], &int_one, &Am[4, 0, 0], &int_one)
+        {{ pfx }}axpy(&n2, &b[7], &Am[3, 0, 0], &int_one, &Am[4, 0, 0], &int_one)
+        for i in range(n):
+            Am[4, i, i] = Am[4, i, i] + b[1]
+        # We ran out of space for dgemm; first compute V and then reuse space for U
+        # V = b[6]*Am[3] + b[4]*Am[2] + b[2]*Am[1] + b[0]*I_n
+        {{ pfx }}scal(&n2, &b[2], &Am[1, 0, 0], &int_one)
+        {{ pfx }}axpy(&n2, &b[4], &Am[2, 0, 0], &int_one, &Am[1, 0, 0], &int_one)
+        {{ pfx }}axpy(&n2, &b[6], &Am[3, 0, 0], &int_one, &Am[1, 0, 0], &int_one)
+        for i in range(n):
+            Am[1, i, i] = Am[1, i, i] + b[0]
+        # Now we can scratch A[2] or A[3]
+        {{ pfx }}gemm(<char*>'N', <char*>'N', &n, &n, &n, &one, &Am[0, 0, 0], &n, &Am[4, 0, 0], &n, &zero, &Am[3, 0, 0], &n)
+
+    # inv(V-U) (V+U) = inv(V-U) (V-U+2V) = I + 2 inv(V-U) U
+    {{ pfx }}axpy(&n2, &neg_one, &Am[3, 0, 0], &int_one, &Am[1, 0, 0], &int_one)
+
+    # Convert array layout for solving AX = B into Am[2]
+    swap_c_and_f_layout(&Am[3, 0, 0], &Am[2, 0, 0], n, n, n)
+
+    {{ pfx }}getrf( &n, &n, &Am[1, 0, 0], &n, &ipiv[0], &info )
+    {{ pfx }}getrs(<char*>'C', &n, &n, &Am[1, 0, 0], &n, &ipiv[0], &Am[2, 0, 0], &n, &info )
+    {{ sc }}scal(&n2, &two, &Am[2, 0, 0], &int_one)
+    for i in range(n):
+        Am[2, i, i] = Am[2, i, i] + 1.
+
+    # Put it back in Am in C order
+    swap_c_and_f_layout(&Am[2, 0, 0], &Am[0, 0, 0], n, n, n)
+    free(ipiv)
+
+
+{{endfor}}
+{{for tname, pfx, sc in zip(typenames, prefixes,  ['s', 'd', 'cs', 'zd'])}}
+@cython.wraparound(False)
+@cython.boundscheck(False)
+@cython.initializedcheck(False)
+cdef void pade_9_UV_calc_{{ pfx }}({{ tname }}[:, :, ::]Am, int n) nogil:
+    cdef {{ tname }} b[9]
+    cdef {{ tname }} *work = <{{ tname }}*>malloc(n*n*sizeof({{ tname }}))
+    cdef int *ipiv = <int*>malloc(n*sizeof(int))
+    cdef int i, info, n2 = n*n, int_one = 1
+    cdef {{ tname }} one = 1.0, zero = 0.0, neg_one = -1.0
+    cdef {{if pfx in ['s', 'c']}}float{{else}}double{{endif}} two = 2.0
+
+    if not (work and ipiv):
+        raise MemoryError('Internal function "pade_9_UV_calc" failed to allocate memory.')
+    try:
+        # b[9] = 1. hence skipped
+        b[0] = 17643225600.
+        b[1] = 8821612800.
+        b[2] = 2075673600.
+        b[3] = 302702400.
+        b[4] = 30270240.
+        b[5] = 2162160.
+        b[6] = 110880.
+        b[7] = 3960.
+        b[8] = 90.
+
+        # Utilize the unused powers of Am as scratch memory
+        # U = Am[0] @ (b[9]*Am[4] + b[7]*Am[3] + b[5]*Am[2] + b[3]*Am[1] + b[1]*I_n)
+        {{ pfx }}copy(&n2, &Am[1, 0, 0], &int_one, &work[0], &int_one)
+        {{ pfx }}scal(&n2, &b[3], &work[0], &int_one)
+        {{ pfx }}axpy(&n2, &b[5], &Am[2, 0, 0], &int_one, &work[0], &int_one)
+        {{ pfx }}axpy(&n2, &b[7], &Am[3, 0, 0], &int_one, &work[0], &int_one)
+        {{ pfx }}axpy(&n2, &one, &Am[4, 0, 0], &int_one, &work[0], &int_one)
+        for i in range(n):
+            work[i*(n+1)] += b[1]
+        # V = b[8]*Am[4] + b[6]*Am[3] + b[4]*Am[2] + b[2]*Am[1] + b[0]*I_n
+        {{ pfx }}scal(&n2, &b[2], &Am[1, 0, 0], &int_one)
+        {{ pfx }}axpy(&n2, &b[4], &Am[2, 0, 0], &int_one, &Am[1, 0, 0], &int_one)
+        {{ pfx }}axpy(&n2, &b[6], &Am[3, 0, 0], &int_one, &Am[1, 0, 0], &int_one)
+        {{ pfx }}axpy(&n2, &b[8], &Am[4, 0, 0], &int_one, &Am[1, 0, 0], &int_one)
+        for i in range(n):
+            Am[1, i, i] = Am[1, i, i] + b[0]
+        # Now we can scratch A[2] or A[3]
+        {{ pfx }}gemm(<char*>'N', <char*>'N', &n, &n, &n, &one, &Am[0, 0, 0], &n, &work[0], &n, &zero, &Am[3, 0, 0], &n)
+
+        # inv(V-U) (V+U) = inv(V-U) (V-U+2V) = I + 2 inv(V-U) U
+        {{ pfx }}axpy(&n2, &neg_one, &Am[3, 0, 0], &int_one, &Am[1, 0, 0], &int_one)
+
+        # Convert array layout for solving AX = B into Am[2]
+        swap_c_and_f_layout(&Am[3, 0, 0], &Am[2, 0, 0], n, n, n)
+
+        {{ pfx }}getrf( &n, &n, &Am[1, 0, 0], &n, &ipiv[0], &info )
+        {{ pfx }}getrs(<char*>'C', &n, &n, &Am[1, 0, 0], &n, &ipiv[0], &Am[2, 0, 0], &n, &info)
+        {{ sc }}scal(&n2, &two, &Am[2, 0, 0], &int_one)
+        for i in range(n):
+            Am[2, i, i] = Am[2, i, i] + 1.
+
+        # Put it back in Am in C order
+        swap_c_and_f_layout(&Am[2, 0, 0], &Am[0, 0, 0], n, n, n)
+    finally:
+        free(work)
+        free(ipiv)
+
+
+{{endfor}}
+{{for tname, pfx, sc in zip(typenames, prefixes,  ['s', 'd', 'cs', 'zd'])}}
+@cython.wraparound(False)
+@cython.boundscheck(False)
+@cython.initializedcheck(False)
+cdef void pade_13_UV_calc_{{ pfx }}({{ tname }}[:, :, ::1]Am, int n) nogil:
+    cdef {{ tname }} *work2 = <{{ tname }}*>malloc(n*n*sizeof({{ tname }}))
+    cdef {{ tname }} *work3 = <{{ tname }}*>malloc(n*n*sizeof({{ tname }}))
+    cdef {{ tname }} *work4 = <{{ tname }}*>malloc(n*n*sizeof({{ tname }}))
+    cdef int *ipiv = <int*>malloc(n*sizeof(int))
+    cdef Py_ssize_t i, j, k
+    cdef int info, int1 = 1, n2 = n*n
+    cdef {{ tname }} one = 1.0, zero = 0.0, neg_one = -1.
+    cdef {{if pfx in ['s', 'c']}}float{{else}}double{{endif}} two = 2.0
+    cdef {{ tname }} b[14]
+    b[0] = 64764752532480000.
+    b[1] = 32382376266240000.
+    b[2] = 7771770303897600.
+    b[3] = 1187353796428800.
+    b[4] = 129060195264000.
+    b[5] = 10559470521600.
+    b[6] = 670442572800.
+    b[7] = 33522128640.
+    b[8] = 1323241920.
+    b[9] = 40840800.
+    b[10] = 960960.
+    b[11] = 16380.
+    b[12] = 182.
+    b[13] = 1.
+
+    if not (work2 and work3 and work4 and ipiv):
+        raise MemoryError('Internal function "pade_13_UV_calc" failed to allocate memory.')
+    try:
+        # U = Am[0] @ (Am[3] @ (b[13]*Am[3] + b[11]*Am[2] + b[9]*Am[1]) +
+        #              b[7]*Am[3] + b[5]*Am[2] + b[3]*Am[1] + b[1]*I_n)
+        # V = Am[3] @ (b[12]*Am[3] + b[10]*Am[2] + b[8]*Am[1]) +
+        #              b[6]*Am[3] + b[4]*Am[2] + b[2]*Am[1] + b[0]*I_n
+
+        {{ pfx }}copy(&n2, &Am[1, 0, 0], &int1, &work2[0], &int1)
+        {{ pfx }}scal(&n2, &b[9], &work2[0], &int1)
+        {{ pfx }}axpy(&n2, &b[11], &Am[2, 0, 0], &int1, &work2[0], &int1)
+        {{ pfx }}axpy(&n2, &b[13], &Am[3, 0, 0], &int1, &work2[0], &int1)
+
+        {{ pfx }}copy(&n2, &Am[1, 0, 0], &int1, &work3[0], &int1)
+        {{ pfx }}scal(&n2, &b[2], &work3[0], &int1)
+        {{ pfx }}axpy(&n2, &b[4], &Am[2, 0, 0], &int1, &work3[0], &int1)
+        {{ pfx }}axpy(&n2, &b[6], &Am[3, 0, 0], &int1, &work3[0], &int1)
+
+        {{ pfx }}copy(&n2, &Am[1, 0, 0], &int1, &work4[0], &int1)
+        {{ pfx }}scal(&n2, &b[8], &work4[0], &int1)
+        {{ pfx }}axpy(&n2, &b[10], &Am[2, 0, 0], &int1, &work4[0], &int1)
+        {{ pfx }}axpy(&n2, &b[12], &Am[3, 0, 0], &int1, &work4[0], &int1)
+
+        # Overwrite Am[1] as it is not used further
+        {{ pfx }}scal(&n2, &b[3], &Am[1, 0, 0], &int1)
+        {{ pfx }}axpy(&n2, &b[5], &Am[2, 0, 0], &int1, &Am[1, 0, 0], &int1)
+        {{ pfx }}axpy(&n2, &b[7], &Am[3, 0, 0], &int1, &Am[1, 0, 0], &int1)
+
+        for i in range(n):
+            Am[1, i, i] = Am[1, i, i] + b[1]
+            work3[i*(n+1)] += b[0]
+
+        # U = D @ (A @ B + C)
+        {{ pfx }}gemm(<char*>'N', <char*>'N', &n, &n, &n, &one, &work2[0], &n, &Am[3, 0, 0], &n, &one, &Am[1, 0, 0], &n)
+        {{ pfx }}gemm(<char*>'N', <char*>'N', &n, &n, &n, &one, &Am[1, 0, 0], &n, &Am[0, 0, 0], &n, &zero, &work2[0], &n)
+        # V = A @ B + C
+        {{ pfx }}gemm(<char*>'N', <char*>'N', &n, &n, &n, &one, &work4[0], &n, &Am[3, 0, 0], &n, &one, &work3[0], &n)
+        # inv(V-U) (V+U) = inv(V-U) (V-U+2V) = I + 2 inv(V-U) U
+        {{ pfx }}axpy(&n2, &neg_one, &work2[0], &int1, &work3[0], &int1)
+
+        # Convert array layout for solving AX = B into work4
+        swap_c_and_f_layout(work2, work4, n, n, n)
+        free(work2)
+
+        {{ pfx }}getrf( &n, &n, &work3[0], &n, &ipiv[0], &info )
+        {{ pfx }}getrs(<char*>'C', &n, &n, &work3[0], &n, &ipiv[0], &work4[0], &n, &info )
+        {{ sc }}scal(&n2, &two, &work4[0], &int1)
+        for i in range(n):
+            work4[i*(n+1)] += 1.
+        # Put it back in Am in C order
+        swap_c_and_f_layout(work4, &Am[0, 0, 0], n, n, n)
+    finally:
+        free(ipiv)
+        free(work3)
+        free(work4)
+
+
+{{endfor}}
+# ============================================================================
+
+# ====================== norm1est ============================================
+def _norm1est(A, m=1, t=2, max_iter=5):
+    """Compute a lower bound for the 1-norm of 2D matrix A or its powers
+
+    Parameters
+    ----------
+    A : ndarray
+        Input square array of shape (N, N).
+    m : int, optional
+        If it is different than one, then m-th power of the matrix norm is
+        computed.
+    t : int, optional
+        The number of columns of the internal matrix used in the iterations.
+    max_iter : int, optional
+        The number of total iterations to be performed. Problems that require
+        more than 5 iterations are rarely reported in practice.
+
+    Returns
+    -------
+    c : float
+        The resulting 1-norm condition number estimate of A.
+
+    Notes
+    -----
+    Implements a SciPy adaptation of Algorithm 2.4 of [1], and the original
+    Fortran code given in [2].
+
+    The algorithm involves randomized elements and hence if needed, the seed
+    of the Python built-in random module can be set for reproducible results.
+
+    References
+    ----------
+    .. [1] Nicholas J. Higham and Françoise Tisseur (2000),
+           "A Block Algorithm for Matrix 1-Norm Estimation,
+           with an Application to 1-Norm Pseudospectra."
+           SIAM J. Matrix Anal. Appl. Vol. 21, No. 4, pp. 1185-1201.
+
+    .. [2] Sheung Hun Cheng, Nicholas J. Higham (2001), "Implementation for
+           LAPACK of a Block Algorithm for Matrix 1-Norm Estimation",
+           NA Report 393
+
+    """
+    # Ref [1] skips parallel col test for complex inputs
+    real_A = np.isrealobj(A)
+    n = A.shape[0]
+    est_old = 0
+    ind_hist = []
+    S = np.zeros([n, 2*t], dtype=np.int8 if real_A else A.dtype)
+    Y = np.empty([n, t], dtype=A.dtype)
+    Y[:, 0] = A.sum(axis=1) / n
+
+    # Higham and Tisseur assigns random 1, -1 for initialization but they also
+    # mention that it is arbitrary. Hence instead we use e_j to already start
+    # the while loop. Also we don't use a temporary X but keep indices instead
+    if t > 1:
+        cols = random.sample(population=range(n), k=t-1)
+        Y[:, 1:t] = A[:, cols]
+        ind_hist += cols
+
+    for k in range(max_iter):
+        if m >= 1:
+            for _ in range(m-1):
+                Y = A @ Y
+
+        Y_sums = (np.abs(Y)).sum(axis=0)
+        best_j = np.argmax(Y_sums)
+        est = Y_sums[best_j]
+        if est <= est_old:  # (1)
+            est = est_old
+            break
+        # else:
+            # w = Y[:, best_j]
+        est_old = est
+        S[:, :t] = S[:, t:]
+        _custom_signum(Y, S[:, t:], not real_A)
+        if t > 1 and real_A:
+            # (2)
+            if ((S[:, t:].T @ S[:, :t]).max(axis=1) == n).all() and k > 0:
+                break
+            else:
+                # replaces S in-place
+                _replace_parallel_cols(S)
+
+        # (3)
+        Z = A.conj().T @ S
+        if m >= 1:
+            for _ in range(m-1):
+                Z = A.conj().T @ Z
+
+        Z_sums = (np.abs(Z)).sum(axis=1)
+        if np.argmax(Z_sums) == best_j:  # (4)
+            break
+        h_sorter = np.argsort(Z_sums)
+        if all([x in ind_hist for x in h_sorter[:t]]):  # (5)
+            break
+        else:
+            pick = random.choice(range(n))
+            for _ in range(t):
+                while pick in ind_hist:
+                    pick = random.choice(range(n))
+                ind_hist += [pick]
+
+            Y = A[:, ind_hist[-t:]]
+
+    # v = np.zeros_like(X[:, 0])  # just some equal size array
+    # v[best_j] = 1
+
+    return est  # , v, w
+
+
+def _replace_parallel_cols(S):
+    """Helper function for 1-norm condition number estimator.
+
+    This function is akin to ?LARPC function given in Cheng, Higham (see Ref
+    section of expm, [2]).
+
+    Parameters
+    ----------
+    S : ndarray
+        Working array of the current iteration
+    S_old : ndarray
+        Working array of the previous iteration. Pass None if only S exists.
+    rng : Generator
+        The random generator used inside the body of condition estimator
+
+    Returns
+    -------
+    None
+
+    """
+    n, t = S.shape
+    max_spin = math.ceil(n / t)
+
+    for col in range(t):
+        curr_col = t + col
+        n_it = 0
+        while round(np.abs(S[:, col] @ S[:, :curr_col]).max()) == n:
+            S[:, col] = random.choices([1, -1], k=n)
+            n_it += 1
+            if n_it > max_spin:
+                break
+
+    return
+
+
+def _custom_signum(M, sgnM, is_complex=False):
+    """Helper function for 1-norm condition number estimator.
+
+    This function is defined to comply with ref. [1] of condest. For reals,
+    nonnegative inputs are mapped to 1 and to -1 otherwise. For complex inputs
+    sgn(x) = x / ||x|| and sgn(0 + 0j) := 1.
+
+    Parameters
+    ----------
+    M : ndarray
+        Input array
+
+    sgnM : ndarray
+        Output array that will be overwritten
+
+    Returns
+    -------
+    sgnM : ndarray
+        The resulting sign(M) array.
+
+    """
+    if is_complex:
+        sgnM.fill(1.)
+        mask = M != 0.
+        sgnM[mask] = M[mask] / np.abs(M[mask])
+    else:
+        np.signbit(M, out=sgnM)
+
+
+@cython.initializedcheck(False)
+def pade_UV_calc(lapack_t[:, :, ::1]Am, int n, int m):
+    """Helper functions for expm to solve the final polynomial evaluation"""
+    if lapack_t is float:
+        if m in [3, 5, 7]:
+            pade_357_UV_calc_s(Am, n, m)
+        elif m == 9:
+            pade_9_UV_calc_s(Am, n)
+        else: 
+            pade_13_UV_calc_s(Am, n)
+    elif lapack_t is double:
+        if m in [3, 5, 7]:
+            pade_357_UV_calc_d(Am, n, m)
+        elif m == 9:
+            pade_9_UV_calc_d(Am, n)
+        else: 
+            pade_13_UV_calc_d(Am, n)
+    elif lapack_t is floatcomplex:
+        if m in [3, 5, 7]:
+            pade_357_UV_calc_c(Am, n, m)
+        elif m == 9:
+            pade_9_UV_calc_c(Am, n)
+        else: 
+            pade_13_UV_calc_c(Am, n)
+    elif lapack_t is doublecomplex:
+        if m in [3, 5, 7]:
+            pade_357_UV_calc_z(Am, n, m)
+        elif m == 9:
+            pade_9_UV_calc_z(Am, n)
+        else: 
+            pade_13_UV_calc_z(Am, n)
+    else:
+        raise ValueError('Internal function "pade_UV_calc" received an unsupported dtype')
+
+
+@cython.initializedcheck(False)
+def pick_pade_structure(lapack_t[:, ::1]A):
+    """Helper functions for expm to choose Pade approximation order"""
+    if lapack_t is float:
+        return pick_pade_structure_s(A)
+    elif lapack_t is double:
+        return pick_pade_structure_d(A)
+    elif lapack_t is floatcomplex:
+        return pick_pade_structure_c(A)
+    elif lapack_t is doublecomplex:
+        return pick_pade_structure_z(A)
+    else:
+        raise ValueError('Internal function "pick_pade_structure" received an unsupported dtype.')
diff --git a/scipy/linalg/setup.py b/scipy/linalg/setup.py
index 2f0ef0a48c17..f1ee9b40df64 100644
--- a/scipy/linalg/setup.py
+++ b/scipy/linalg/setup.py
@@ -152,6 +152,9 @@ def configuration(parent_package='', top_path=None):
     config.add_extension('_cythonized_array_utils',
                          sources=['_cythonized_array_utils.c'])
 
+    config.add_extension('_matfuncs_expm',
+                         sources=['_matfuncs_expm.c'])
+
     # Add any license files
     config.add_data_files('src/id_dist/doc/doc.tex')
     config.add_data_files('src/lapack_deprecations/LICENSE')

From de83d35c56fa01ad6255e8029f4a5ac50e3c90bb Mon Sep 17 00:00:00 2001
From: Ilhan Polat <ilhanpolat@gmail.com>
Date: Sat, 20 Nov 2021 15:11:06 +0100
Subject: [PATCH 02/22] MAINT:linalg: Confirm malloc() objects freed

---
 scipy/linalg/_matfuncs_expm.pyx.in | 150 +++++++++++++++--------------
 1 file changed, 76 insertions(+), 74 deletions(-)

diff --git a/scipy/linalg/_matfuncs_expm.pyx.in b/scipy/linalg/_matfuncs_expm.pyx.in
index 5c3384701710..5356d8e55472 100644
--- a/scipy/linalg/_matfuncs_expm.pyx.in
+++ b/scipy/linalg/_matfuncs_expm.pyx.in
@@ -262,85 +262,87 @@ cdef void pade_357_UV_calc_{{ pfx }}({{ tname }}[:, :, ::]Am, int n, int m) nogi
     cdef {{ tname }} one = 1.0, zero = 0.0, neg_one = -1.0
     if not ipiv:
         raise MemoryError('Internal function "pade_357_UV_calc_c" failed to allocate memory.')
-    # b[m] is always 1. hence skipped
-    if m == 3:
-        b[0] = 120.
-        b[1] = 60.
-        b[2] = 12.
-    elif m == 5:
-        b[0] = 30240.
-        b[1] = 15120.
-        b[2] = 3360.
-        b[3] = 420.
-        b[4] = 30.
-    elif m == 7:
-        b[0] = 17297280.
-        b[1] = 8648640.
-        b[2] = 1995840.
-        b[3] = 277200.
-        b[4] = 25200.
-        b[5] = 1512.
-        b[6] = 56.
-    else:
-        raise ValueError(f'Internal function "pade_357_UV_calc_c" received an invalid value {m}')
-
-    # Utilize the unused powers of Am as scratch memory
-    if m == 3:
-        # U = Am[0] @ Am[1] + 60.*Am[0]
-        {{ pfx }}copy(&n2, &Am[0, 0, 0], &int_one, &Am[3, 0, 0], &int_one)
-        {{ pfx }}gemm(<char*>'N', <char*>'N', &n, &n, &n, &one, &Am[0, 0, 0], &n, &Am[1, 0, 0], &n, &b[1], &Am[3, 0, 0], &n)
-        # V = 12.*Am[1] + 120*I_n
-        {{ pfx }}scal(&n2, &b[2], &Am[1, 0, 0], &int_one)
-        for i in range(n):
-            Am[1, i, i] = Am[1, i, i] + b[0]
+    try:
+        # b[m] is always 1. hence skipped
+        if m == 3:
+            b[0] = 120.
+            b[1] = 60.
+            b[2] = 12.
+        elif m == 5:
+            b[0] = 30240.
+            b[1] = 15120.
+            b[2] = 3360.
+            b[3] = 420.
+            b[4] = 30.
+        elif m == 7:
+            b[0] = 17297280.
+            b[1] = 8648640.
+            b[2] = 1995840.
+            b[3] = 277200.
+            b[4] = 25200.
+            b[5] = 1512.
+            b[6] = 56.
+        else:
+            raise ValueError(f'Internal function "pade_357_UV_calc_c" received an invalid value {m}')
 
-    elif m == 5:
-        # U = Am[0] @ (b[5]*Am[2] + b[3]*Am[1] + b[1]*I_n)
-        {{ pfx }}copy(&n2, &Am[1, 0, 0], &int_one, &Am[4, 0, 0], &int_one)
-        {{ pfx }}scal(&n2, &b[3], &Am[4, 0, 0], &int_one)
-        {{ pfx }}axpy(&n2, &b[5], &Am[2, 0, 0], &int_one, &Am[4, 0, 0], &int_one)
-        for i in range(n):
-            Am[4, i, i] = Am[4, i, i] + b[1]
-        {{ pfx }}gemm(<char*>'N', <char*>'N', &n, &n, &n, &one, &Am[0, 0, 0], &n, &Am[4, 0, 0], &n, &zero, &Am[3, 0, 0], &n)
-        # V = b[4]*Am[2] + b[2]*Am[1] + b[0]*I_n
-        {{ pfx }}scal(&n2, &b[2], &Am[1, 0, 0], &int_one)
-        {{ pfx }}axpy(&n2, &b[4], &Am[2, 0, 0], &int_one, &Am[1, 0, 0], &int_one)
-        for i in range(n):
-            Am[1, i, i] = Am[1, i, i] + b[0]
+        # Utilize the unused powers of Am as scratch memory
+        if m == 3:
+            # U = Am[0] @ Am[1] + 60.*Am[0]
+            {{ pfx }}copy(&n2, &Am[0, 0, 0], &int_one, &Am[3, 0, 0], &int_one)
+            {{ pfx }}gemm(<char*>'N', <char*>'N', &n, &n, &n, &one, &Am[0, 0, 0], &n, &Am[1, 0, 0], &n, &b[1], &Am[3, 0, 0], &n)
+            # V = 12.*Am[1] + 120*I_n
+            {{ pfx }}scal(&n2, &b[2], &Am[1, 0, 0], &int_one)
+            for i in range(n):
+                Am[1, i, i] = Am[1, i, i] + b[0]
 
-    else:
-        # U = Am[0] @ (b[7]*Am[3] + b[5]*Am[2] + b[3]*Am[1] + b[1]*I_n)
-        {{ pfx }}copy(&n2, &Am[1, 0, 0], &int_one, &Am[4, 0, 0], &int_one)
-        {{ pfx }}scal(&n2, &b[3], &Am[4, 0, 0], &int_one)
-        {{ pfx }}axpy(&n2, &b[5], &Am[2, 0, 0], &int_one, &Am[4, 0, 0], &int_one)
-        {{ pfx }}axpy(&n2, &b[7], &Am[3, 0, 0], &int_one, &Am[4, 0, 0], &int_one)
-        for i in range(n):
-            Am[4, i, i] = Am[4, i, i] + b[1]
-        # We ran out of space for dgemm; first compute V and then reuse space for U
-        # V = b[6]*Am[3] + b[4]*Am[2] + b[2]*Am[1] + b[0]*I_n
-        {{ pfx }}scal(&n2, &b[2], &Am[1, 0, 0], &int_one)
-        {{ pfx }}axpy(&n2, &b[4], &Am[2, 0, 0], &int_one, &Am[1, 0, 0], &int_one)
-        {{ pfx }}axpy(&n2, &b[6], &Am[3, 0, 0], &int_one, &Am[1, 0, 0], &int_one)
-        for i in range(n):
-            Am[1, i, i] = Am[1, i, i] + b[0]
-        # Now we can scratch A[2] or A[3]
-        {{ pfx }}gemm(<char*>'N', <char*>'N', &n, &n, &n, &one, &Am[0, 0, 0], &n, &Am[4, 0, 0], &n, &zero, &Am[3, 0, 0], &n)
+        elif m == 5:
+            # U = Am[0] @ (b[5]*Am[2] + b[3]*Am[1] + b[1]*I_n)
+            {{ pfx }}copy(&n2, &Am[1, 0, 0], &int_one, &Am[4, 0, 0], &int_one)
+            {{ pfx }}scal(&n2, &b[3], &Am[4, 0, 0], &int_one)
+            {{ pfx }}axpy(&n2, &b[5], &Am[2, 0, 0], &int_one, &Am[4, 0, 0], &int_one)
+            for i in range(n):
+                Am[4, i, i] = Am[4, i, i] + b[1]
+            {{ pfx }}gemm(<char*>'N', <char*>'N', &n, &n, &n, &one, &Am[0, 0, 0], &n, &Am[4, 0, 0], &n, &zero, &Am[3, 0, 0], &n)
+            # V = b[4]*Am[2] + b[2]*Am[1] + b[0]*I_n
+            {{ pfx }}scal(&n2, &b[2], &Am[1, 0, 0], &int_one)
+            {{ pfx }}axpy(&n2, &b[4], &Am[2, 0, 0], &int_one, &Am[1, 0, 0], &int_one)
+            for i in range(n):
+                Am[1, i, i] = Am[1, i, i] + b[0]
+
+        else:
+            # U = Am[0] @ (b[7]*Am[3] + b[5]*Am[2] + b[3]*Am[1] + b[1]*I_n)
+            {{ pfx }}copy(&n2, &Am[1, 0, 0], &int_one, &Am[4, 0, 0], &int_one)
+            {{ pfx }}scal(&n2, &b[3], &Am[4, 0, 0], &int_one)
+            {{ pfx }}axpy(&n2, &b[5], &Am[2, 0, 0], &int_one, &Am[4, 0, 0], &int_one)
+            {{ pfx }}axpy(&n2, &b[7], &Am[3, 0, 0], &int_one, &Am[4, 0, 0], &int_one)
+            for i in range(n):
+                Am[4, i, i] = Am[4, i, i] + b[1]
+            # We ran out of space for dgemm; first compute V and then reuse space for U
+            # V = b[6]*Am[3] + b[4]*Am[2] + b[2]*Am[1] + b[0]*I_n
+            {{ pfx }}scal(&n2, &b[2], &Am[1, 0, 0], &int_one)
+            {{ pfx }}axpy(&n2, &b[4], &Am[2, 0, 0], &int_one, &Am[1, 0, 0], &int_one)
+            {{ pfx }}axpy(&n2, &b[6], &Am[3, 0, 0], &int_one, &Am[1, 0, 0], &int_one)
+            for i in range(n):
+                Am[1, i, i] = Am[1, i, i] + b[0]
+            # Now we can scratch A[2] or A[3]
+            {{ pfx }}gemm(<char*>'N', <char*>'N', &n, &n, &n, &one, &Am[0, 0, 0], &n, &Am[4, 0, 0], &n, &zero, &Am[3, 0, 0], &n)
 
-    # inv(V-U) (V+U) = inv(V-U) (V-U+2V) = I + 2 inv(V-U) U
-    {{ pfx }}axpy(&n2, &neg_one, &Am[3, 0, 0], &int_one, &Am[1, 0, 0], &int_one)
+        # inv(V-U) (V+U) = inv(V-U) (V-U+2V) = I + 2 inv(V-U) U
+        {{ pfx }}axpy(&n2, &neg_one, &Am[3, 0, 0], &int_one, &Am[1, 0, 0], &int_one)
 
-    # Convert array layout for solving AX = B into Am[2]
-    swap_c_and_f_layout(&Am[3, 0, 0], &Am[2, 0, 0], n, n, n)
+        # Convert array layout for solving AX = B into Am[2]
+        swap_c_and_f_layout(&Am[3, 0, 0], &Am[2, 0, 0], n, n, n)
 
-    {{ pfx }}getrf( &n, &n, &Am[1, 0, 0], &n, &ipiv[0], &info )
-    {{ pfx }}getrs(<char*>'C', &n, &n, &Am[1, 0, 0], &n, &ipiv[0], &Am[2, 0, 0], &n, &info )
-    {{ sc }}scal(&n2, &two, &Am[2, 0, 0], &int_one)
-    for i in range(n):
-        Am[2, i, i] = Am[2, i, i] + 1.
+        {{ pfx }}getrf( &n, &n, &Am[1, 0, 0], &n, &ipiv[0], &info )
+        {{ pfx }}getrs(<char*>'C', &n, &n, &Am[1, 0, 0], &n, &ipiv[0], &Am[2, 0, 0], &n, &info )
+        {{ sc }}scal(&n2, &two, &Am[2, 0, 0], &int_one)
+        for i in range(n):
+            Am[2, i, i] = Am[2, i, i] + 1.
 
-    # Put it back in Am in C order
-    swap_c_and_f_layout(&Am[2, 0, 0], &Am[0, 0, 0], n, n, n)
-    free(ipiv)
+        # Put it back in Am in C order
+        swap_c_and_f_layout(&Am[2, 0, 0], &Am[0, 0, 0], n, n, n)
+    finally:
+        free(ipiv)
 
 
 {{endfor}}
@@ -480,7 +482,6 @@ cdef void pade_13_UV_calc_{{ pfx }}({{ tname }}[:, :, ::1]Am, int n) nogil:
 
         # Convert array layout for solving AX = B into work4
         swap_c_and_f_layout(work2, work4, n, n, n)
-        free(work2)
 
         {{ pfx }}getrf( &n, &n, &work3[0], &n, &ipiv[0], &info )
         {{ pfx }}getrs(<char*>'C', &n, &n, &work3[0], &n, &ipiv[0], &work4[0], &n, &info )
@@ -491,6 +492,7 @@ cdef void pade_13_UV_calc_{{ pfx }}({{ tname }}[:, :, ::1]Am, int n) nogil:
         swap_c_and_f_layout(work4, &Am[0, 0, 0], n, n, n)
     finally:
         free(ipiv)
+        free(work2)
         free(work3)
         free(work4)
 

From 5bb5fe9f1c398319f1a3f373bfe5ba2122d23b0b Mon Sep 17 00:00:00 2001
From: Ilhan Polat <ilhanpolat@gmail.com>
Date: Sat, 20 Nov 2021 21:33:21 +0100
Subject: [PATCH 03/22] MAINT:linalg: allocate expm scratch array once at the
 start

---
 scipy/linalg/_matfuncs_expm.pyx.in | 21 ++++++++++-----------
 1 file changed, 10 insertions(+), 11 deletions(-)

diff --git a/scipy/linalg/_matfuncs_expm.pyx.in b/scipy/linalg/_matfuncs_expm.pyx.in
index 5356d8e55472..2e2db8e96551 100644
--- a/scipy/linalg/_matfuncs_expm.pyx.in
+++ b/scipy/linalg/_matfuncs_expm.pyx.in
@@ -133,10 +133,9 @@ cdef double kth_power_norm_d(double* A, double* v1, double* v2, Py_ssize_t n, in
 @cython.wraparound(False)
 @cython.boundscheck(False)
 @cython.initializedcheck(False)
-cdef pick_pade_structure_{{ pfx }}({{ tname }}[:, ::1] A):
-    cdef Py_ssize_t n = A.shape[0], i, j, k
+cdef pick_pade_structure_{{ pfx }}({{ tname }}[:, :, ::1] Am):
+    cdef Py_ssize_t n = Am.shape[1], i, j, k
     cdef int lm = 0, s = 0, intn = <int>n, n2= intn*intn, int_one = 1
-    cdef cnp.ndarray[cnp.{{ dt }}_t, ndim=3] Am = np.empty((5, n, n), dtype='{{ dchar }}')
     cdef {{ tname }}[:, :, ::1] Amv = Am
     cdef:
         {{ tname }} one = 1.0, zero = 0.0
@@ -169,10 +168,10 @@ cdef pick_pade_structure_{{ pfx }}({{ tname }}[:, ::1] A):
         for j in range(n):
             work[j] = 1.
 
-        {{ pfx }}copy(&n2, &A[0, 0], &int_one, &Amv[0, 0, 0], &int_one)
+        # {{ pfx }}copy(&n2, &A[0, 0], &int_one, &Amv[0, 0, 0], &int_one)
         for i in range(n):
             for j in range(n):
-                absA[i*n + j] = abs(A[i, j])
+                absA[i*n + j] = abs(Am[0, i, j])
 
         # First spin = normest(|A|, 1), increase m when spun more
         normA = kth_power_norm_d(absA, work, work2, n, 1)
@@ -188,13 +187,13 @@ cdef pick_pade_structure_{{ pfx }}({{ tname }}[:, ::1] A):
         temp = kth_power_norm_d(absA, work, work2, n, 6)
         lm = max(<int>ceil(log2(temp/normA/coeff[0])/6), 0)
         if eta0 < theta[0] and lm == 0:
-            return Am, 3, s
+            return 3, s
 
         # m = 5
         temp = kth_power_norm_d(absA, work, work2, n, 4)
         lm = max(<int>ceil(log2(temp/normA/coeff[1])/10), 0)
         if eta1 < theta[1] and lm == 0:
-            return Am, 5, s
+            return 5, s
 
         # m = 7
         if n < 360:
@@ -207,13 +206,13 @@ cdef pick_pade_structure_{{ pfx }}({{ tname }}[:, ::1] A):
         temp = kth_power_norm_d(absA, work, work2, n, 4)
         lm = max(<int>ceil(log2(temp/normA/coeff[2])/14), 0)
         if eta2 < theta[2] and lm == 0:
-            return Am, 7, s
+            return 7, s
 
         # m = 9
         temp = kth_power_norm_d(absA, work, work2, n, 4)
         lm = max(<int>ceil(log2(temp/normA/coeff[3])/18), 0)
         if eta2 < theta[3] and lm == 0:
-            return Am, 9, s
+            return 9, s
 
         # m = 13
         # Scale-square
@@ -236,7 +235,7 @@ cdef pick_pade_structure_{{ pfx }}({{ tname }}[:, ::1] A):
             normA *= 2.**(-s)
         temp = kth_power_norm_d(absA, work, work2, n, 8)
         s += max(<int>ceil(log2(temp/normA/coeff[4])/26), 0)
-        return Am, 13, s
+        return 13, s
     finally:
         free(work)
         free(work2)
@@ -710,7 +709,7 @@ def pade_UV_calc(lapack_t[:, :, ::1]Am, int n, int m):
 
 
 @cython.initializedcheck(False)
-def pick_pade_structure(lapack_t[:, ::1]A):
+def pick_pade_structure(lapack_t[:, :, ::1]A):
     """Helper functions for expm to choose Pade approximation order"""
     if lapack_t is float:
         return pick_pade_structure_s(A)

From 876a1b57d11c9aba803869542e658ff4685ec263 Mon Sep 17 00:00:00 2001
From: Ilhan Polat <ilhanpolat@gmail.com>
Date: Sun, 21 Nov 2021 03:41:24 +0100
Subject: [PATCH 04/22] FIX:linalg: correct expm spurious array conjugation

---
 scipy/linalg/_matfuncs_expm.pyx.in | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/scipy/linalg/_matfuncs_expm.pyx.in b/scipy/linalg/_matfuncs_expm.pyx.in
index 2e2db8e96551..d4fded2d2fe0 100644
--- a/scipy/linalg/_matfuncs_expm.pyx.in
+++ b/scipy/linalg/_matfuncs_expm.pyx.in
@@ -333,7 +333,7 @@ cdef void pade_357_UV_calc_{{ pfx }}({{ tname }}[:, :, ::]Am, int n, int m) nogi
         swap_c_and_f_layout(&Am[3, 0, 0], &Am[2, 0, 0], n, n, n)
 
         {{ pfx }}getrf( &n, &n, &Am[1, 0, 0], &n, &ipiv[0], &info )
-        {{ pfx }}getrs(<char*>'C', &n, &n, &Am[1, 0, 0], &n, &ipiv[0], &Am[2, 0, 0], &n, &info )
+        {{ pfx }}getrs(<char*>'T', &n, &n, &Am[1, 0, 0], &n, &ipiv[0], &Am[2, 0, 0], &n, &info )
         {{ sc }}scal(&n2, &two, &Am[2, 0, 0], &int_one)
         for i in range(n):
             Am[2, i, i] = Am[2, i, i] + 1.
@@ -397,7 +397,7 @@ cdef void pade_9_UV_calc_{{ pfx }}({{ tname }}[:, :, ::]Am, int n) nogil:
         swap_c_and_f_layout(&Am[3, 0, 0], &Am[2, 0, 0], n, n, n)
 
         {{ pfx }}getrf( &n, &n, &Am[1, 0, 0], &n, &ipiv[0], &info )
-        {{ pfx }}getrs(<char*>'C', &n, &n, &Am[1, 0, 0], &n, &ipiv[0], &Am[2, 0, 0], &n, &info)
+        {{ pfx }}getrs(<char*>'T', &n, &n, &Am[1, 0, 0], &n, &ipiv[0], &Am[2, 0, 0], &n, &info)
         {{ sc }}scal(&n2, &two, &Am[2, 0, 0], &int_one)
         for i in range(n):
             Am[2, i, i] = Am[2, i, i] + 1.
@@ -483,7 +483,7 @@ cdef void pade_13_UV_calc_{{ pfx }}({{ tname }}[:, :, ::1]Am, int n) nogil:
         swap_c_and_f_layout(work2, work4, n, n, n)
 
         {{ pfx }}getrf( &n, &n, &work3[0], &n, &ipiv[0], &info )
-        {{ pfx }}getrs(<char*>'C', &n, &n, &work3[0], &n, &ipiv[0], &work4[0], &n, &info )
+        {{ pfx }}getrs(<char*>'T', &n, &n, &work3[0], &n, &ipiv[0], &work4[0], &n, &info )
         {{ sc }}scal(&n2, &two, &work4[0], &int1)
         for i in range(n):
             work4[i*(n+1)] += 1.

From a0cd10b96e49f0123b367687f826cac267ad128d Mon Sep 17 00:00:00 2001
From: Ilhan Polat <ilhanpolat@gmail.com>
Date: Sun, 21 Nov 2021 23:42:00 +0100
Subject: [PATCH 05/22] ENH:linalg: expm public API function

---
 scipy/linalg/_matfuncs.py | 180 +++++++++++++++++++++++++++++++++++---
 1 file changed, 166 insertions(+), 14 deletions(-)

diff --git a/scipy/linalg/_matfuncs.py b/scipy/linalg/_matfuncs.py
index 27c0a96b450c..93d038a8649f 100644
--- a/scipy/linalg/_matfuncs.py
+++ b/scipy/linalg/_matfuncs.py
@@ -9,7 +9,10 @@
 
 from numpy import (Inf, dot, diag, prod, logical_not, ravel,
         transpose, conjugate, absolute, amax, sign, isfinite, single)
+from numpy.lib.scimath import sqrt as csqrt
 import numpy as np
+from itertools import product
+from scipy.linalg import LinAlgError, bandwidth
 
 # Local imports
 from ._misc import norm
@@ -19,7 +22,7 @@
 from ._decomp_schur import schur, rsf2csf
 from ._expm_frechet import expm_frechet, expm_cond
 from ._matfuncs_sqrtm import sqrtm
-
+from ._matfuncs_expm import pick_pade_structure, pade_UV_calc
 eps = np.finfo(float).eps
 feps = np.finfo(single).eps
 
@@ -208,25 +211,42 @@ def logm(A, disp=True):
 
 
 def expm(A):
-    """
-    Compute the matrix exponential using Pade approximation.
+    """Compute the matrix exponential of an array.
 
     Parameters
     ----------
-    A : (N, N) array_like or sparse matrix
-        Matrix to be exponentiated.
+    A : ndarray
+        Input with last two dimensions are square ``(..., n, n)``.
 
     Returns
     -------
-    expm : (N, N) ndarray
-        Matrix exponential of `A`.
+    eA : ndarray
+        The resulting matrix exponential with the same shape of A
+
+    Notes
+    -----
+    Implements the algorithm given in [1], which is essentially a careful Pade
+    approximation and thus higher powers (up to eight) of the input are
+    computed. This has to be taken into account as it is not difficult to have
+    exponential growth leading to floating point overflows. For input with size
+    ``n``, the memory usage is in the worst case in the order of ``8*(n**2)``.
+    If the input data is not of single and double precision of real and complex
+    dtypes, it is copied to a new array. Same applies for noncontiguous inputs.
+
+    For cases ``n >= 400``, the exact 1-norm computation cost, breaks even with
+    1-norm estimation and from that point on the estimation scheme given in
+    [2] is used.
 
     References
     ----------
-    .. [1] Awad H. Al-Mohy and Nicholas J. Higham (2009)
-           "A New Scaling and Squaring Algorithm for the Matrix Exponential."
-           SIAM Journal on Matrix Analysis and Applications.
-           31 (3). pp. 970-989. ISSN 1095-7162
+    .. [1] Awad H. Al-Mohy and Nicholas J. Higham, (2009), "A New Scaling
+           and Squaring Algorithm for the Matrix Exponential", SIAM J. Matrix
+           Anal. Appl. 31(3):970-989, :doi:`10.1137/09074721X`
+
+    .. [2] Nicholas J. Higham and Francoise Tisseur (2000), "A Block Algorithm
+           for Matrix 1-Norm Estimation, with an Application to 1-Norm
+           Pseudospectra." SIAM J. Matrix Anal. Appl. 21(4):1185-1201,
+           :doi:`10.1137/S0895479899356080`
 
     Examples
     --------
@@ -250,9 +270,141 @@ def expm(A):
            [ 1.06860742+0.48905626j, -1.71075555+0.91406299j]])
 
     """
-    # Input checking and conversion is provided by sparse.linalg.expm().
-    import scipy.sparse.linalg
-    return scipy.sparse.linalg.expm(A)
+    a = np.asarray(A)
+    if a.size == 1 and a.ndim < 2:
+        return np.array([[np.exp(a.ravel()[0])]])
+
+    if a.ndim < 2:
+        raise LinAlgError('The input array must be at least two-dimensional')
+    if a.shape[-1] != a.shape[-2]:
+        raise LinAlgError('Last 2 dimensions of the array must be square')
+    n = a.shape[-1]
+    # Empty array
+    if min(*a.shape) == 0:
+        return np.empty_like(a)
+
+    # Scalar case
+    if a.shape[-2:] == (1, 1):
+        return np.exp(a)
+
+    if not np.issubdtype(a.dtype, np.inexact):
+        a = a.astype(float)
+    elif a.dtype == np.float16:
+        a = a.astype(np.float32)
+
+    # Explicit formula for 2x2 case, formula (2.2) in [1]
+    # without Kahan's method numerical instabilities can occur.
+    if a.shape[-2:] == (2, 2):
+        if a.ndim == 2:
+
+            p, r, s, t = a[0, 0], a[0, 1], a[1, 0], a[1, 1]
+            mu = csqrt((p-t)**2 + 4*r*s)/2
+            coshMu, sinchMu = np.cosh(mu), np.sinh(mu)/mu if mu != 0. else 1.
+            eA = np.empty([2, 2], dtype=mu.dtype)
+            eA[0, 0] = coshMu + 0.5*(p-t)*sinchMu
+            eA[0, 1] = r*sinchMu
+            eA[1, 0] = s*sinchMu
+            eA[1, 1] = coshMu - 0.5*(p-t)*sinchMu
+            eA *= np.exp((p + t)/2.)
+            if np.isrealobj(a):
+                return eA.real
+            return eA
+
+        else:
+            a1, a2, a3, a4 = (a[..., 0, 0],
+                              a[..., 0, 1],
+                              a[..., 1, 0],
+                              a[..., 1, 1])
+            mu = (a[..., 0, 0]-a[..., 1, 1])**2 + 4*a[..., 0, 1]*a[..., 1, 0]
+            mu = csqrt(mu)/2.
+            eApD2 = np.exp((a[..., 0, 0]+a[..., 1, 1])/2.)
+            AmD2 = (a1 - a4)/2.
+            coshMu = np.cosh(mu),
+            sinchMu = _sinch(mu)
+            eA = np.empty((a.shape), dtype=mu.dtype)
+            eA[..., 0, 0] = eApD2 * (coshMu + AmD2*sinchMu)
+            eA[..., 0, 1] = eApD2 * a2 * sinchMu
+            eA[..., 1, 0] = eApD2 * a3 * sinchMu
+            eA[..., 1, 1] = eApD2 * (coshMu - AmD2*sinchMu)
+            if np.isrealobj(a):
+                return eA.real
+            return eA
+
+    # Generic case with unspecified stacked dimensions.
+    # array to hold the result
+    n = a.shape[-1]
+    eA = np.empty(a.shape, dtype=a.dtype)
+    # working memory to hold intermediate arrays
+    Am = np.empty((5, n, n), dtype=a.dtype)
+
+    for ind in product(*[range(x) for x in a.shape[:-2]]):
+        aw = a[ind]
+
+        lu = bandwidth(aw)
+        if not any(lu):  # a is diagonal?
+            eA[ind] = np.diag(np.exp(np.diag(aw)))
+            continue
+
+        # # Generic/triangular case; copy the slice into scratch and send
+        Am[0, :, :] = aw
+        m, s = pick_pade_structure(Am)
+
+        if s != 0:  # scaling needed
+            Am[:4] *= [[[2**(-s)]], [[4**(-s)]], [[16**(-s)]], [[64**(-s)]]]
+
+        pade_UV_calc(Am, n, m)
+        eAw = Am[0]
+
+        if s != 0:  # squaring needed
+
+            if lu[1] == 0:  # lower triangular
+                diag_aw = np.diag(aw)
+                # einsum returns a writable view
+                np.einsum('ii->i', eAw)[:] = np.exp(diag_aw * 2**(-s))
+                for i in range(s-1, -1, -1):
+                    eAw = eAw @ eAw
+                    # diagonal
+                    np.einsum('ii->i', eAw)[:] = np.exp(diag_aw * 2**(-i))
+                    # superdiagonal
+                    sd = np.diag(aw, k=-1) * 2**(-i)
+                    l1_plus_l2 = (diag_aw[:-1] + diag_aw[1:]) * 2**(-i-1)
+                    l1_minus_l2 = (diag_aw[:-1] - diag_aw[1:]) * 2**(-i-1)
+                    esd = sd*np.exp(l1_plus_l2)*_sinch(l1_minus_l2)
+                    np.einsum('ii->i', eAw[1:, :-1])[:] = esd
+
+            elif lu[0] == 0:  # upper triangular
+                diag_aw = np.diag(aw)
+                # einsum returns a writable view
+                np.einsum('ii->i', eAw)[:] = np.exp(diag_aw * 2**(-s))
+                for i in range(s-1, -1, -1):
+                    eAw = eAw @ eAw
+                    # diagonal
+                    np.einsum('ii->i', eAw)[:] = np.exp(diag_aw * 2**(-i))
+                    # superdiagonal
+                    sd = np.diag(aw, k=1) * 2**(-i)
+                    l1_plus_l2 = (diag_aw[:-1] + diag_aw[1:]) * 2**(-i-1)
+                    l1_minus_l2 = (diag_aw[:-1] - diag_aw[1:]) * 2**(-i-1)
+                    esd = sd*np.exp(l1_plus_l2)*_sinch(l1_minus_l2)
+                    np.einsum('ii->i', eAw[:-1, 1:])[:] = esd
+
+            else:  # generic
+                for _ in range(s):
+                    eAw = eAw @ eAw
+
+        # Zero out the entries from np.empty in case of triangular input
+        if (lu[0] == 0) or (lu[1] == 0):
+            eA[ind] = np.triu(eAw) if lu[0] == 0 else np.tril(eAw)
+        else:
+            eA[ind] = eAw
+
+    return eA
+
+
+def _sinch(x):
+    m = x == 0.
+    x[m] = 1.
+    x[~m] = np.sinh(x[~m])/x[~m]
+    return x
 
 
 def cosm(A):

From d40fa4f80b8578a9e9b754e9012de5e21974cb57 Mon Sep 17 00:00:00 2001
From: Ilhan Polat <ilhanpolat@gmail.com>
Date: Sun, 21 Nov 2021 23:56:20 +0100
Subject: [PATCH 06/22] TST:linalg: Tweaks to tests for the new expm()

---
 scipy/linalg/tests/test_matfuncs.py | 33 ++++++-----------------------
 1 file changed, 7 insertions(+), 26 deletions(-)

diff --git a/scipy/linalg/tests/test_matfuncs.py b/scipy/linalg/tests/test_matfuncs.py
index 9ff68beae506..97e52886ed51 100644
--- a/scipy/linalg/tests/test_matfuncs.py
+++ b/scipy/linalg/tests/test_matfuncs.py
@@ -9,10 +9,8 @@
 
 import numpy as np
 from numpy import array, identity, dot, sqrt
-from numpy.testing import (
-        assert_array_equal, assert_array_less, assert_equal,
-        assert_array_almost_equal,
-        assert_allclose, assert_, assert_warns)
+from numpy.testing import (assert_array_almost_equal, assert_allclose, assert_,
+                           assert_array_less, assert_array_equal, assert_warns)
 import pytest
 
 import scipy.linalg
@@ -108,7 +106,7 @@ def test_al_mohy_higham_2012_experiment_1_logm(self):
         A = _get_al_mohy_higham_2012_experiment_1()
         A_logm, info = logm(A, disp=False)
         A_round_trip = expm(A_logm)
-        assert_allclose(A_round_trip, A, rtol=1e-5, atol=1e-14)
+        assert_allclose(A_round_trip, A, rtol=5e-5, atol=1e-14)
 
     def test_al_mohy_higham_2012_experiment_1_funm_log(self):
         # The raw funm with np.log does not complete the round trip.
@@ -604,25 +602,8 @@ def test_zero(self):
         assert_array_almost_equal(expm(a),[[1,0],[0,1]])
 
     def test_single_elt(self):
-        # See gh-5853
-        from scipy.sparse import csc_matrix
-
-        vOne = -2.02683397006j
-        vTwo = -2.12817566856j
-
-        mOne = csc_matrix([[vOne]], dtype='complex')
-        mTwo = csc_matrix([[vTwo]], dtype='complex')
-
-        outOne = expm(mOne)
-        outTwo = expm(mTwo)
-
-        assert_equal(type(outOne), type(mOne))
-        assert_equal(type(outTwo), type(mTwo))
-
-        assert_allclose(outOne[0, 0], complex(-0.44039415155949196,
-                                              -0.8978045395698304))
-        assert_allclose(outTwo[0, 0], complex(-0.52896401032626006,
-                                              -0.84864425749518878))
+        elt = expm(1)
+        assert_allclose(elt, np.array([[np.e]]))
 
     def test_empty_matrix_input(self):
         # handle gh-11082
@@ -713,8 +694,8 @@ def test_fuzz(self):
             expected_expm = scipy.linalg.expm(A)
             expected_frechet = scipy.linalg.expm(M)[:n, n:]
             observed_expm, observed_frechet = expm_frechet(A, E)
-            assert_allclose(expected_expm, observed_expm)
-            assert_allclose(expected_frechet, observed_frechet)
+            assert_allclose(expected_expm, observed_expm, atol=5e-8)
+            assert_allclose(expected_frechet, observed_frechet, atol=1e-7)
 
     def test_problematic_matrix(self):
         # this test case uncovered a bug which has since been fixed

From 4ef8e3d82925fc3bbd50ca040a9eec7ef5d413a2 Mon Sep 17 00:00:00 2001
From: Ilhan Polat <ilhanpolat@gmail.com>
Date: Mon, 22 Nov 2021 12:07:00 +0100
Subject: [PATCH 07/22] TST:sparse:linalg: Use sparse.linalg own expm in tests

---
 .../sparse/linalg/tests/test_expm_multiply.py | 27 ++++++++++---------
 1 file changed, 14 insertions(+), 13 deletions(-)

diff --git a/scipy/sparse/linalg/tests/test_expm_multiply.py b/scipy/sparse/linalg/tests/test_expm_multiply.py
index 0e00ad291ab6..250668c87e68 100644
--- a/scipy/sparse/linalg/tests/test_expm_multiply.py
+++ b/scipy/sparse/linalg/tests/test_expm_multiply.py
@@ -6,6 +6,7 @@
                            suppress_warnings)
 from scipy.sparse import SparseEfficiencyWarning
 import scipy.linalg
+from scipy.sparse.linalg import expm as sp_expm
 from scipy.sparse.linalg._expm_multiply import (_theta, _compute_p_max,
         _onenormest_matrix_power, expm_multiply, _expm_multiply_simple,
         _expm_multiply_interval)
@@ -63,7 +64,7 @@ def test_expm_multiply(self):
             A = scipy.linalg.inv(np.random.randn(n, n))
             B = np.random.randn(n, k)
             observed = expm_multiply(A, B)
-            expected = np.dot(scipy.linalg.expm(A), B)
+            expected = np.dot(sp_expm(A), B)
             assert_allclose(observed, expected)
 
     def test_matrix_vector_multiply(self):
@@ -74,7 +75,7 @@ def test_matrix_vector_multiply(self):
             A = scipy.linalg.inv(np.random.randn(n, n))
             v = np.random.randn(n)
             observed = expm_multiply(A, v)
-            expected = np.dot(scipy.linalg.expm(A), v)
+            expected = np.dot(sp_expm(A), v)
             assert_allclose(observed, expected)
 
     def test_scaled_expm_multiply(self):
@@ -88,7 +89,7 @@ def test_scaled_expm_multiply(self):
                     A = scipy.linalg.inv(np.random.randn(n, n))
                     B = np.random.randn(n, k)
                     observed = _expm_multiply_simple(A, B, t=t)
-                    expected = np.dot(scipy.linalg.expm(t*A), B)
+                    expected = np.dot(sp_expm(t*A), B)
                     assert_allclose(observed, expected)
 
     def test_scaled_expm_multiply_single_timepoint(self):
@@ -99,7 +100,7 @@ def test_scaled_expm_multiply_single_timepoint(self):
         A = np.random.randn(n, n)
         B = np.random.randn(n, k)
         observed = _expm_multiply_simple(A, B, t=t)
-        expected = scipy.linalg.expm(t*A).dot(B)
+        expected = sp_expm(t*A).dot(B)
         assert_allclose(observed, expected)
 
     def test_sparse_expm_multiply(self):
@@ -115,8 +116,9 @@ def test_sparse_expm_multiply(self):
                 sup.filter(SparseEfficiencyWarning,
                            "splu requires CSC matrix format")
                 sup.filter(SparseEfficiencyWarning,
-                           "spsolve is more efficient when sparse b is in the CSC matrix format")
-                expected = scipy.linalg.expm(A).dot(B)
+                           "spsolve is more efficient when sparse b is in the"
+                           " CSC matrix format")
+                expected = sp_expm(A).dot(B)
             assert_allclose(observed, expected)
 
     def test_complex(self):
@@ -155,8 +157,7 @@ def test_sparse_expm_multiply_interval(self):
                     sup.filter(SparseEfficiencyWarning,
                                "spsolve is more efficient when sparse b is in the CSC matrix format")
                     for solution, t in zip(X, samples):
-                        assert_allclose(solution,
-                                scipy.linalg.expm(t*A).dot(target))
+                        assert_allclose(solution, sp_expm(t*A).dot(target))
 
     def test_expm_multiply_interval_vector(self):
         np.random.seed(1234)
@@ -172,7 +173,7 @@ def test_expm_multiply_interval_vector(self):
                 samples = np.linspace(start=start, stop=stop,
                         num=num, endpoint=endpoint)
                 for solution, t in zip(X, samples):
-                    assert_allclose(solution, scipy.linalg.expm(t*A).dot(v))
+                    assert_allclose(solution, sp_expm(t*A).dot(v))
 
     def test_expm_multiply_interval_matrix(self):
         np.random.seed(1234)
@@ -189,7 +190,7 @@ def test_expm_multiply_interval_matrix(self):
                     samples = np.linspace(start=start, stop=stop,
                             num=num, endpoint=endpoint)
                     for solution, t in zip(X, samples):
-                        assert_allclose(solution, scipy.linalg.expm(t*A).dot(B))
+                        assert_allclose(solution, sp_expm(t*A).dot(B))
 
     def test_sparse_expm_multiply_interval_dtypes(self):
         # Test A & B int
@@ -197,13 +198,13 @@ def test_sparse_expm_multiply_interval_dtypes(self):
         B = np.ones(5, dtype=int)
         Aexpm = scipy.sparse.diags(np.exp(np.arange(5)),format='csr')
         assert_allclose(expm_multiply(A,B,0,1)[-1], Aexpm.dot(B))
-    
+
         # Test A complex, B int
         A = scipy.sparse.diags(-1j*np.arange(5),format='csr', dtype=complex)
         B = np.ones(5, dtype=int)
         Aexpm = scipy.sparse.diags(np.exp(-1j*np.arange(5)),format='csr')
         assert_allclose(expm_multiply(A,B,0,1)[-1], Aexpm.dot(B))
-    
+
         # Test A int, B complex
         A = scipy.sparse.diags(np.arange(5),format='csr', dtype=int)
         B = np.full(5, 1j, dtype=complex)
@@ -243,7 +244,7 @@ def _help_test_specific_expm_interval_status(self, target_status):
                 samples = np.linspace(start=start, stop=stop,
                         num=num, endpoint=endpoint)
                 for solution, t in zip(X, samples):
-                    assert_allclose(solution, scipy.linalg.expm(t*A).dot(B))
+                    assert_allclose(solution, sp_expm(t*A).dot(B))
                 nsuccesses += 1
         if not nsuccesses:
             msg = 'failed to find a status-' + str(target_status) + ' interval'

From e91e88ca9aa275aae2259db7a24692a4b9c1a80c Mon Sep 17 00:00:00 2001
From: Ilhan Polat <ilhanpolat@gmail.com>
Date: Tue, 23 Nov 2021 02:33:26 +0100
Subject: [PATCH 08/22] MAINT:linalg: expm() code branching cleanup

---
 scipy/linalg/_matfuncs.py | 88 ++++++++++++++-------------------------
 1 file changed, 31 insertions(+), 57 deletions(-)

diff --git a/scipy/linalg/_matfuncs.py b/scipy/linalg/_matfuncs.py
index 93d038a8649f..750b5f413b04 100644
--- a/scipy/linalg/_matfuncs.py
+++ b/scipy/linalg/_matfuncs.py
@@ -272,7 +272,7 @@ def expm(A):
     """
     a = np.asarray(A)
     if a.size == 1 and a.ndim < 2:
-        return np.array([[np.exp(a.ravel()[0])]])
+        return np.array([[np.exp(a.item())]])
 
     if a.ndim < 2:
         raise LinAlgError('The input array must be at least two-dimensional')
@@ -295,48 +295,31 @@ def expm(A):
     # Explicit formula for 2x2 case, formula (2.2) in [1]
     # without Kahan's method numerical instabilities can occur.
     if a.shape[-2:] == (2, 2):
-        if a.ndim == 2:
-
-            p, r, s, t = a[0, 0], a[0, 1], a[1, 0], a[1, 1]
-            mu = csqrt((p-t)**2 + 4*r*s)/2
-            coshMu, sinchMu = np.cosh(mu), np.sinh(mu)/mu if mu != 0. else 1.
-            eA = np.empty([2, 2], dtype=mu.dtype)
-            eA[0, 0] = coshMu + 0.5*(p-t)*sinchMu
-            eA[0, 1] = r*sinchMu
-            eA[1, 0] = s*sinchMu
-            eA[1, 1] = coshMu - 0.5*(p-t)*sinchMu
-            eA *= np.exp((p + t)/2.)
-            if np.isrealobj(a):
-                return eA.real
-            return eA
-
-        else:
-            a1, a2, a3, a4 = (a[..., 0, 0],
-                              a[..., 0, 1],
-                              a[..., 1, 0],
-                              a[..., 1, 1])
-            mu = (a[..., 0, 0]-a[..., 1, 1])**2 + 4*a[..., 0, 1]*a[..., 1, 0]
-            mu = csqrt(mu)/2.
-            eApD2 = np.exp((a[..., 0, 0]+a[..., 1, 1])/2.)
-            AmD2 = (a1 - a4)/2.
-            coshMu = np.cosh(mu),
-            sinchMu = _sinch(mu)
-            eA = np.empty((a.shape), dtype=mu.dtype)
-            eA[..., 0, 0] = eApD2 * (coshMu + AmD2*sinchMu)
-            eA[..., 0, 1] = eApD2 * a2 * sinchMu
-            eA[..., 1, 0] = eApD2 * a3 * sinchMu
-            eA[..., 1, 1] = eApD2 * (coshMu - AmD2*sinchMu)
-            if np.isrealobj(a):
-                return eA.real
-            return eA
-
-    # Generic case with unspecified stacked dimensions.
-    # array to hold the result
+        a1, a2, a3, a4 = (a[..., 0, 0],
+                          a[..., 0, 1],
+                          a[..., 1, 0],
+                          a[..., 1, 1])
+        mu = csqrt((a1-a4)**2 + 4*a2*a3)/2.
+        eApD2 = np.exp((a1+a4)/2.)
+        AmD2 = (a1 - a4)/2.
+        coshMu = np.cosh(mu)
+        sinchMu = _sinch(mu)
+        eA = np.empty((a.shape), dtype=mu.dtype)
+        eA[..., 0, 0] = eApD2 * (coshMu + AmD2*sinchMu)
+        eA[..., 0, 1] = eApD2 * a2 * sinchMu
+        eA[..., 1, 0] = eApD2 * a3 * sinchMu
+        eA[..., 1, 1] = eApD2 * (coshMu - AmD2*sinchMu)
+        if np.isrealobj(a):
+            return eA.real
+        return eA
+
+    # larger problem with unspecified stacked dimensions.
     n = a.shape[-1]
     eA = np.empty(a.shape, dtype=a.dtype)
     # working memory to hold intermediate arrays
     Am = np.empty((5, n, n), dtype=a.dtype)
 
+    # Main loop to go through the slices of an ndarray and passing to expm
     for ind in product(*[range(x) for x in a.shape[:-2]]):
         aw = a[ind]
 
@@ -345,7 +328,8 @@ def expm(A):
             eA[ind] = np.diag(np.exp(np.diag(aw)))
             continue
 
-        # # Generic/triangular case; copy the slice into scratch and send
+        # Generic/triangular case; copy the slice into scratch and send.
+        # Am will be mutated by pick_pade_structure
         Am[0, :, :] = aw
         m, s = pick_pade_structure(Am)
 
@@ -357,35 +341,25 @@ def expm(A):
 
         if s != 0:  # squaring needed
 
-            if lu[1] == 0:  # lower triangular
+            if (lu[1] == 0) or (lu[0] == 0):  # lower/upper triangular
                 diag_aw = np.diag(aw)
                 # einsum returns a writable view
                 np.einsum('ii->i', eAw)[:] = np.exp(diag_aw * 2**(-s))
                 for i in range(s-1, -1, -1):
                     eAw = eAw @ eAw
-                    # diagonal
-                    np.einsum('ii->i', eAw)[:] = np.exp(diag_aw * 2**(-i))
-                    # superdiagonal
-                    sd = np.diag(aw, k=-1) * 2**(-i)
-                    l1_plus_l2 = (diag_aw[:-1] + diag_aw[1:]) * 2**(-i-1)
-                    l1_minus_l2 = (diag_aw[:-1] - diag_aw[1:]) * 2**(-i-1)
-                    esd = sd*np.exp(l1_plus_l2)*_sinch(l1_minus_l2)
-                    np.einsum('ii->i', eAw[1:, :-1])[:] = esd
 
-            elif lu[0] == 0:  # upper triangular
-                diag_aw = np.diag(aw)
-                # einsum returns a writable view
-                np.einsum('ii->i', eAw)[:] = np.exp(diag_aw * 2**(-s))
-                for i in range(s-1, -1, -1):
-                    eAw = eAw @ eAw
                     # diagonal
                     np.einsum('ii->i', eAw)[:] = np.exp(diag_aw * 2**(-i))
-                    # superdiagonal
-                    sd = np.diag(aw, k=1) * 2**(-i)
+                    # super/sub diagonal
+                    sd = np.diag(aw, k=-1 if lu[1] == 0 else 1) * 2**(-i)
+
                     l1_plus_l2 = (diag_aw[:-1] + diag_aw[1:]) * 2**(-i-1)
                     l1_minus_l2 = (diag_aw[:-1] - diag_aw[1:]) * 2**(-i-1)
                     esd = sd*np.exp(l1_plus_l2)*_sinch(l1_minus_l2)
-                    np.einsum('ii->i', eAw[:-1, 1:])[:] = esd
+                    if lu[1] == 0:  # lower
+                        np.einsum('ii->i', eAw[1:, :-1])[:] = esd
+                    else:  # upper
+                        np.einsum('ii->i', eAw[:-1, 1:])[:] = esd
 
             else:  # generic
                 for _ in range(s):

From 74cdf624f83cede973b31500863c84077abfdb67 Mon Sep 17 00:00:00 2001
From: Ilhan Polat <ilhanpolat@gmail.com>
Date: Tue, 23 Nov 2021 11:51:42 +0100
Subject: [PATCH 09/22] MAINT:linalg:Adjust array slicing for 2x2 arrays

---
 scipy/linalg/_matfuncs.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/scipy/linalg/_matfuncs.py b/scipy/linalg/_matfuncs.py
index 750b5f413b04..96ca25957753 100644
--- a/scipy/linalg/_matfuncs.py
+++ b/scipy/linalg/_matfuncs.py
@@ -295,20 +295,20 @@ def expm(A):
     # Explicit formula for 2x2 case, formula (2.2) in [1]
     # without Kahan's method numerical instabilities can occur.
     if a.shape[-2:] == (2, 2):
-        a1, a2, a3, a4 = (a[..., 0, 0],
-                          a[..., 0, 1],
-                          a[..., 1, 0],
-                          a[..., 1, 1])
+        a1, a2, a3, a4 = (a[..., [0], [0]],
+                          a[..., [0], [1]],
+                          a[..., [1], [0]],
+                          a[..., [1], [1]])
         mu = csqrt((a1-a4)**2 + 4*a2*a3)/2.
         eApD2 = np.exp((a1+a4)/2.)
         AmD2 = (a1 - a4)/2.
         coshMu = np.cosh(mu)
         sinchMu = _sinch(mu)
         eA = np.empty((a.shape), dtype=mu.dtype)
-        eA[..., 0, 0] = eApD2 * (coshMu + AmD2*sinchMu)
-        eA[..., 0, 1] = eApD2 * a2 * sinchMu
-        eA[..., 1, 0] = eApD2 * a3 * sinchMu
-        eA[..., 1, 1] = eApD2 * (coshMu - AmD2*sinchMu)
+        eA[..., [0], [0]] = eApD2 * (coshMu + AmD2*sinchMu)
+        eA[..., [0], [1]] = eApD2 * a2 * sinchMu
+        eA[..., [1], [0]] = eApD2 * a3 * sinchMu
+        eA[..., [1], [1]] = eApD2 * (coshMu - AmD2*sinchMu)
         if np.isrealobj(a):
             return eA.real
         return eA

From 1e4793c8d4e6b9639e6018bf7a2d958ea7d12d13 Mon Sep 17 00:00:00 2001
From: Ilhan Polat <ilhanpolat@gmail.com>
Date: Tue, 23 Nov 2021 03:31:54 +0100
Subject: [PATCH 10/22] MAINT:linalg: modify _sinch function for scalars

---
 scipy/linalg/_matfuncs.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/scipy/linalg/_matfuncs.py b/scipy/linalg/_matfuncs.py
index 96ca25957753..feeb1e5538ce 100644
--- a/scipy/linalg/_matfuncs.py
+++ b/scipy/linalg/_matfuncs.py
@@ -375,6 +375,8 @@ def expm(A):
 
 
 def _sinch(x):
+    if np.isscalar(x):
+        return 1. if x == 0. else np.sinh(x)/x
     m = x == 0.
     x[m] = 1.
     x[~m] = np.sinh(x[~m])/x[~m]

From 1bf2177acc502d9b9405abe542d504ea2948bc05 Mon Sep 17 00:00:00 2001
From: Ilhan Polat <ilhanpolat@gmail.com>
Date: Wed, 24 Nov 2021 02:26:13 +0100
Subject: [PATCH 11/22] MAINT:linalg: fix _norm1est call argument type

---
 scipy/linalg/_matfuncs.py          |  55 +++++++------
 scipy/linalg/_matfuncs_expm.pyx.in | 120 ++++++++++-------------------
 2 files changed, 73 insertions(+), 102 deletions(-)

diff --git a/scipy/linalg/_matfuncs.py b/scipy/linalg/_matfuncs.py
index feeb1e5538ce..06fb448fc6b7 100644
--- a/scipy/linalg/_matfuncs.py
+++ b/scipy/linalg/_matfuncs.py
@@ -1,20 +1,15 @@
 #
 # Author: Travis Oliphant, March 2002
 #
+from itertools import product
 
-__all__ = ['expm','cosm','sinm','tanm','coshm','sinhm',
-           'tanhm','logm','funm','signm','sqrtm',
-           'expm_frechet', 'expm_cond', 'fractional_matrix_power',
-           'khatri_rao']
-
-from numpy import (Inf, dot, diag, prod, logical_not, ravel,
-        transpose, conjugate, absolute, amax, sign, isfinite, single)
-from numpy.lib.scimath import sqrt as csqrt
 import numpy as np
-from itertools import product
-from scipy.linalg import LinAlgError, bandwidth
+from numpy import (Inf, dot, diag, prod, logical_not, ravel, transpose,
+                   conjugate, absolute, amax, sign, isfinite)
+from numpy.lib.scimath import sqrt as csqrt
 
 # Local imports
+from scipy.linalg import LinAlgError, bandwidth
 from ._misc import norm
 from ._basic import solve, inv
 from ._special_matrices import triu
@@ -23,8 +18,14 @@
 from ._expm_frechet import expm_frechet, expm_cond
 from ._matfuncs_sqrtm import sqrtm
 from ._matfuncs_expm import pick_pade_structure, pade_UV_calc
-eps = np.finfo(float).eps
-feps = np.finfo(single).eps
+
+__all__ = ['expm','cosm','sinm','tanm','coshm','sinhm',
+           'tanhm','logm','funm','signm','sqrtm',
+           'expm_frechet', 'expm_cond', 'fractional_matrix_power',
+           'khatri_rao']
+
+eps = np.finfo('d').eps
+feps = np.finfo('f').eps
 
 _array_precision = {'i': 1, 'l': 1, 'f': 0, 'd': 1, 'F': 0, 'D': 1}
 
@@ -221,21 +222,21 @@ def expm(A):
     Returns
     -------
     eA : ndarray
-        The resulting matrix exponential with the same shape of A
+        The resulting matrix exponential with the same shape of ``A``
 
     Notes
     -----
-    Implements the algorithm given in [1], which is essentially a careful Pade
-    approximation and thus higher powers (up to eight) of the input are
-    computed. This has to be taken into account as it is not difficult to have
-    exponential growth leading to floating point overflows. For input with size
-    ``n``, the memory usage is in the worst case in the order of ``8*(n**2)``.
-    If the input data is not of single and double precision of real and complex
-    dtypes, it is copied to a new array. Same applies for noncontiguous inputs.
+    Implements the algorithm given in [1], which is essentially a Pade
+    approximation with a variable order that is decided based on the array
+    data.
+
+    For input with size ``n``, the memory usage is in the worst case in the
+    order of ``8*(n**2)``. If the input data is not of single and double
+    precision of real and complex dtypes, it is copied to a new array.
 
     For cases ``n >= 400``, the exact 1-norm computation cost, breaks even with
     1-norm estimation and from that point on the estimation scheme given in
-    [2] is used.
+    [2] is used to decide on the approximation order.
 
     References
     ----------
@@ -254,9 +255,15 @@ def expm(A):
 
     Matrix version of the formula exp(0) = 1:
 
-    >>> expm(np.zeros((2,2)))
-    array([[ 1.,  0.],
-           [ 0.,  1.]])
+    >>> expm(np.zeros((3, 2, 2)))
+    array([[[1., 0.],
+            [0., 1.]],
+
+           [[1., 0.],
+            [0., 1.]],
+
+           [[1., 0.],
+            [0., 1.]]])
 
     Euler's identity (exp(i*theta) = cos(theta) + i*sin(theta))
     applied to a matrix:
diff --git a/scipy/linalg/_matfuncs_expm.pyx.in b/scipy/linalg/_matfuncs_expm.pyx.in
index d4fded2d2fe0..047d5c4a9ecc 100644
--- a/scipy/linalg/_matfuncs_expm.pyx.in
+++ b/scipy/linalg/_matfuncs_expm.pyx.in
@@ -1,9 +1,9 @@
 # cython: language_level=3
 cimport cython
+import math
+import random
 cimport numpy as cnp
 import numpy as np
-import random
-import math
 from libc.stdlib cimport malloc, free
 from libc.math cimport fabs, ceil, log2, pow
 from scipy.linalg.cython_lapack cimport (sgetrf, sgetrs, dgetrf, dgetrs,
@@ -200,7 +200,7 @@ cdef pick_pade_structure_{{ pfx }}({{ tname }}[:, :, ::1] Am):
             {{ pfx }}gemm(cN, cN, &intn, &intn, &intn, &one, &Amv[2, 0, 0], &intn, &Amv[2, 0, 0], &intn, &zero, &Amv[4, 0, 0], &intn)
             d8 = norm1(Amv[4, :, :]) ** (1./8.)
         else:
-            d8  = _norm1est(Am[0], m=8) ** (1./8.)
+            d8  = _norm1est(np.asarray(Am[0]), m=8) ** (1./8.)
 
         eta2 = max(d6, d8)
         temp = kth_power_norm_d(absA, work, work2, n, 4)
@@ -220,7 +220,7 @@ cdef pick_pade_structure_{{ pfx }}({{ tname }}[:, :, ::1] Am):
             {{ pfx }}gemm(cN, cN, &intn, &intn, &intn, &one, &Amv[3, 0, 0], &intn, &Amv[2, 0, 0], &intn, &zero, &Amv[4, 0, 0], &intn)
             d10 = norm1(Amv[4, :, :]) ** (1./10.)
         else:
-            d10  = _norm1est(Am[0], m=10) ** (1./10.)
+            d10  = _norm1est(np.asarray(Am[0]), m=10) ** (1./10.)
 
         eta3 = max(d8, d10)
         eta4 = min(eta2, eta3)
@@ -500,8 +500,23 @@ cdef void pade_13_UV_calc_{{ pfx }}({{ tname }}[:, :, ::1]Am, int n) nogil:
 # ============================================================================
 
 # ====================== norm1est ============================================
+
 def _norm1est(A, m=1, t=2, max_iter=5):
-    """Compute a lower bound for the 1-norm of 2D matrix A or its powers
+    """Compute a lower bound for the 1-norm of 2D matrix A or its powers.
+
+    Computing the 1-norm of 8th or 10th power of a very large array is a very
+    wasteful computation if we explicitly compute the actual power. The
+    estimation exploits (in a nutshell) the following:
+
+        (A @ A @ ... A) @ <thin array> = (A @ (A @ (... @ (A @ <thin array>)))
+
+    And in fact all the rest is practically Ward's power method with ``t``
+    starting vectors, hence, thin array and smarter selection of those vectors.
+
+    Thus at some point ``expm`` which uses this function to scale-square, will
+    switch to estimating when ``np.abs(A).sum(axis=0).max()`` becomes slower
+    than the estimate (``linalg.norm`` is even slower). Currently the switch
+    is chosen to be ``n=400``.
 
     Parameters
     ----------
@@ -527,21 +542,21 @@ def _norm1est(A, m=1, t=2, max_iter=5):
     Fortran code given in [2].
 
     The algorithm involves randomized elements and hence if needed, the seed
-    of the Python built-in random module can be set for reproducible results.
+    of the Python built-in "random" module can be set for reproducible results.
 
     References
     ----------
-    .. [1] Nicholas J. Higham and Françoise Tisseur (2000),
-           "A Block Algorithm for Matrix 1-Norm Estimation,
-           with an Application to 1-Norm Pseudospectra."
-           SIAM J. Matrix Anal. Appl. Vol. 21, No. 4, pp. 1185-1201.
+    .. [1] Nicholas J. Higham and Francoise Tisseur (2000), "A Block Algorithm
+           for Matrix 1-Norm Estimation, with an Application to 1-Norm
+           Pseudospectra." SIAM J. Matrix Anal. Appl. 21(4):1185-1201,
+           :doi:`10.1137/S0895479899356080`
 
     .. [2] Sheung Hun Cheng, Nicholas J. Higham (2001), "Implementation for
            LAPACK of a Block Algorithm for Matrix 1-Norm Estimation",
            NA Report 393
 
     """
-    # Ref [1] skips parallel col test for complex inputs
+    # We skip parallel col test for complex inputs
     real_A = np.isrealobj(A)
     n = A.shape[0]
     est_old = 0
@@ -573,14 +588,27 @@ def _norm1est(A, m=1, t=2, max_iter=5):
             # w = Y[:, best_j]
         est_old = est
         S[:, :t] = S[:, t:]
-        _custom_signum(Y, S[:, t:], not real_A)
+        if real_A:
+            S[:, t:] = np.signbit(Y)
+        else:
+            S[:, t:].fill(1)
+            mask = Y != 0.
+            S[:, t:][mask] = Y[mask] / np.abs(Y[mask])
+
         if t > 1 and real_A:
             # (2)
             if ((S[:, t:].T @ S[:, :t]).max(axis=1) == n).all() and k > 0:
                 break
             else:
-                # replaces S in-place
-                _replace_parallel_cols(S)
+                max_spin = math.ceil(n / t)
+                for col in range(t):
+                    curr_col = t + col
+                    n_it = 0
+                    while round(np.abs(S[:, col] @ S[:, :curr_col]).max()) == n:
+                        S[:, col] = random.choices([1, -1], k=n)
+                        n_it += 1
+                        if n_it > max_spin:
+                            break
 
         # (3)
         Z = A.conj().T @ S
@@ -609,70 +637,6 @@ def _norm1est(A, m=1, t=2, max_iter=5):
     return est  # , v, w
 
 
-def _replace_parallel_cols(S):
-    """Helper function for 1-norm condition number estimator.
-
-    This function is akin to ?LARPC function given in Cheng, Higham (see Ref
-    section of expm, [2]).
-
-    Parameters
-    ----------
-    S : ndarray
-        Working array of the current iteration
-    S_old : ndarray
-        Working array of the previous iteration. Pass None if only S exists.
-    rng : Generator
-        The random generator used inside the body of condition estimator
-
-    Returns
-    -------
-    None
-
-    """
-    n, t = S.shape
-    max_spin = math.ceil(n / t)
-
-    for col in range(t):
-        curr_col = t + col
-        n_it = 0
-        while round(np.abs(S[:, col] @ S[:, :curr_col]).max()) == n:
-            S[:, col] = random.choices([1, -1], k=n)
-            n_it += 1
-            if n_it > max_spin:
-                break
-
-    return
-
-
-def _custom_signum(M, sgnM, is_complex=False):
-    """Helper function for 1-norm condition number estimator.
-
-    This function is defined to comply with ref. [1] of condest. For reals,
-    nonnegative inputs are mapped to 1 and to -1 otherwise. For complex inputs
-    sgn(x) = x / ||x|| and sgn(0 + 0j) := 1.
-
-    Parameters
-    ----------
-    M : ndarray
-        Input array
-
-    sgnM : ndarray
-        Output array that will be overwritten
-
-    Returns
-    -------
-    sgnM : ndarray
-        The resulting sign(M) array.
-
-    """
-    if is_complex:
-        sgnM.fill(1.)
-        mask = M != 0.
-        sgnM[mask] = M[mask] / np.abs(M[mask])
-    else:
-        np.signbit(M, out=sgnM)
-
-
 @cython.initializedcheck(False)
 def pade_UV_calc(lapack_t[:, :, ::1]Am, int n, int m):
     """Helper functions for expm to solve the final polynomial evaluation"""

From 89bad715c1d0f64ccf422c427d64297d2514d58a Mon Sep 17 00:00:00 2001
From: Ilhan Polat <ilhanpolat@gmail.com>
Date: Tue, 23 Nov 2021 03:27:35 +0100
Subject: [PATCH 12/22] TST:linalg: add 2x2 array tests to expm()

---
 scipy/linalg/tests/test_matfuncs.py | 32 +++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)

diff --git a/scipy/linalg/tests/test_matfuncs.py b/scipy/linalg/tests/test_matfuncs.py
index 97e52886ed51..fbbbd915958a 100644
--- a/scipy/linalg/tests/test_matfuncs.py
+++ b/scipy/linalg/tests/test_matfuncs.py
@@ -611,6 +611,38 @@ def test_empty_matrix_input(self):
         result = expm(A)
         assert result.size == 0
 
+    def test_2x2_input(self):
+        E = np.e
+        a = array([[1, 4], [1, 1]])
+        aa = (E**4 + 1)/(2*E)
+        bb = (E**4 - 1)/E
+        assert_allclose(expm(a), array([[aa, bb], [bb/4, aa]]))
+        assert expm(a.astype(np.complex64)).dtype.char == 'F'
+        assert expm(a.astype(np.float32)).dtype.char == 'f'
+
+    def test_nx2x2_input(self):
+        E = np.e
+        # These are integer matrices with integer eigenvalues
+        a = np.array([[[1, 4], [1, 1]],
+                      [[1, 3], [1, -1]],
+                      [[1, 3], [4, 5]],
+                      [[1, 3], [5, 3]],
+                      [[4, 5], [-3, -4]]], order='F')
+        # Exact results are computed symbolically
+        a_res = np.array([
+                          [[(E**4+1)/(2*E), (E**4-1)/E],
+                           [(E**4-1)/4/E, (E**4+1)/(2*E)]],
+                          [[1/(4*E**2)+(3*E**2)/4, (3*E**2)/4-3/(4*E**2)],
+                           [E**2/4-1/(4*E**2), 3/(4*E**2)+E**2/4]],
+                          [[3/(4*E)+E**7/4, -3/(8*E)+(3*E**7)/8],
+                           [-1/(2*E)+E**7/2, 1/(4*E)+(3*E**7)/4]],
+                          [[5/(8*E**2)+(3*E**6)/8, -3/(8*E**2)+(3*E**6)/8],
+                           [-5/(8*E**2)+(5*E**6)/8, 3/(8*E**2)+(5*E**6)/8]],
+                          [[-3/(2*E)+(5*E)/2, -5/(2*E)+(5*E)/2],
+                           [3/(2*E)-(3*E)/2, 5/(2*E)-(3*E)/2]]
+                         ])
+        assert_allclose(expm(a), a_res)
+
 
 class TestExpmFrechet:
 

From ba356d9ca6cc1f74c0c9871d3346f22b51add383 Mon Sep 17 00:00:00 2001
From: Ilhan Polat <ilhanpolat@gmail.com>
Date: Sat, 27 Nov 2021 13:05:45 +0100
Subject: [PATCH 13/22] DOC:add BLANKLINE to 3d example

---
 scipy/linalg/_matfuncs.py | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/scipy/linalg/_matfuncs.py b/scipy/linalg/_matfuncs.py
index 06fb448fc6b7..4480e3f77fbf 100644
--- a/scipy/linalg/_matfuncs.py
+++ b/scipy/linalg/_matfuncs.py
@@ -19,10 +19,9 @@
 from ._matfuncs_sqrtm import sqrtm
 from ._matfuncs_expm import pick_pade_structure, pade_UV_calc
 
-__all__ = ['expm','cosm','sinm','tanm','coshm','sinhm',
-           'tanhm','logm','funm','signm','sqrtm',
-           'expm_frechet', 'expm_cond', 'fractional_matrix_power',
-           'khatri_rao']
+__all__ = ['expm', 'cosm', 'sinm', 'tanm', 'coshm', 'sinhm', 'tanhm', 'logm',
+           'funm', 'signm', 'sqrtm', 'fractional_matrix_power','expm_frechet',
+           'expm_cond', 'khatri_rao']
 
 eps = np.finfo('d').eps
 feps = np.finfo('f').eps
@@ -258,10 +257,10 @@ def expm(A):
     >>> expm(np.zeros((3, 2, 2)))
     array([[[1., 0.],
             [0., 1.]],
-
+    <BLANKLINE>
            [[1., 0.],
             [0., 1.]],
-
+    <BLANKLINE>
            [[1., 0.],
             [0., 1.]]])
 

From 1b0678190415280bfd689c179525c92a0638c471 Mon Sep 17 00:00:00 2001
From: Ilhan Polat <ilhanpolat@gmail.com>
Date: Sat, 27 Nov 2021 13:06:38 +0100
Subject: [PATCH 14/22] MAINT:linalg: fixed the transposed gemm calls in expm

---
 scipy/linalg/_matfuncs_expm.pyx.in | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/scipy/linalg/_matfuncs_expm.pyx.in b/scipy/linalg/_matfuncs_expm.pyx.in
index 047d5c4a9ecc..e8a2f0cb7f32 100644
--- a/scipy/linalg/_matfuncs_expm.pyx.in
+++ b/scipy/linalg/_matfuncs_expm.pyx.in
@@ -249,6 +249,10 @@ cdef pick_pade_structure_{{ pfx }}({{ tname }}[:, :, ::1] Am):
 # Note: MSVC does not accept "Memview[i, j, k] += 1.+0.j" as a valid operation
 # and results with error "C2088: '+=': illegal for struct".
 # OCD is unbearable but we do explicit addition for that reason.
+
+# Note: gemm calls for A @ B is done via dgemm(.., B, .., A, ..) because arrays
+# are in C-contiguous memory layout. This also the reason why getrs has 'T' as
+# the argument for TRANSA.
 {{for tname, pfx, prec, sc in zip(typenames, prefixes, precision, ['s', 'd', 'cs', 'zd'])}}
 @cython.wraparound(False)
 @cython.boundscheck(False)
@@ -288,7 +292,7 @@ cdef void pade_357_UV_calc_{{ pfx }}({{ tname }}[:, :, ::]Am, int n, int m) nogi
         if m == 3:
             # U = Am[0] @ Am[1] + 60.*Am[0]
             {{ pfx }}copy(&n2, &Am[0, 0, 0], &int_one, &Am[3, 0, 0], &int_one)
-            {{ pfx }}gemm(<char*>'N', <char*>'N', &n, &n, &n, &one, &Am[0, 0, 0], &n, &Am[1, 0, 0], &n, &b[1], &Am[3, 0, 0], &n)
+            {{ pfx }}gemm(<char*>'N', <char*>'N', &n, &n, &n, &one, &Am[1, 0, 0], &n, &Am[0, 0, 0], &n, &b[1], &Am[3, 0, 0], &n)
             # V = 12.*Am[1] + 120*I_n
             {{ pfx }}scal(&n2, &b[2], &Am[1, 0, 0], &int_one)
             for i in range(n):
@@ -301,7 +305,7 @@ cdef void pade_357_UV_calc_{{ pfx }}({{ tname }}[:, :, ::]Am, int n, int m) nogi
             {{ pfx }}axpy(&n2, &b[5], &Am[2, 0, 0], &int_one, &Am[4, 0, 0], &int_one)
             for i in range(n):
                 Am[4, i, i] = Am[4, i, i] + b[1]
-            {{ pfx }}gemm(<char*>'N', <char*>'N', &n, &n, &n, &one, &Am[0, 0, 0], &n, &Am[4, 0, 0], &n, &zero, &Am[3, 0, 0], &n)
+            {{ pfx }}gemm(<char*>'N', <char*>'N', &n, &n, &n, &one, &Am[4, 0, 0], &n, &Am[0, 0, 0], &n, &zero, &Am[3, 0, 0], &n)
             # V = b[4]*Am[2] + b[2]*Am[1] + b[0]*I_n
             {{ pfx }}scal(&n2, &b[2], &Am[1, 0, 0], &int_one)
             {{ pfx }}axpy(&n2, &b[4], &Am[2, 0, 0], &int_one, &Am[1, 0, 0], &int_one)
@@ -324,7 +328,7 @@ cdef void pade_357_UV_calc_{{ pfx }}({{ tname }}[:, :, ::]Am, int n, int m) nogi
             for i in range(n):
                 Am[1, i, i] = Am[1, i, i] + b[0]
             # Now we can scratch A[2] or A[3]
-            {{ pfx }}gemm(<char*>'N', <char*>'N', &n, &n, &n, &one, &Am[0, 0, 0], &n, &Am[4, 0, 0], &n, &zero, &Am[3, 0, 0], &n)
+            {{ pfx }}gemm(<char*>'N', <char*>'N', &n, &n, &n, &one, &Am[4, 0, 0], &n, &Am[0, 0, 0], &n, &zero, &Am[3, 0, 0], &n)
 
         # inv(V-U) (V+U) = inv(V-U) (V-U+2V) = I + 2 inv(V-U) U
         {{ pfx }}axpy(&n2, &neg_one, &Am[3, 0, 0], &int_one, &Am[1, 0, 0], &int_one)
@@ -388,7 +392,7 @@ cdef void pade_9_UV_calc_{{ pfx }}({{ tname }}[:, :, ::]Am, int n) nogil:
         for i in range(n):
             Am[1, i, i] = Am[1, i, i] + b[0]
         # Now we can scratch A[2] or A[3]
-        {{ pfx }}gemm(<char*>'N', <char*>'N', &n, &n, &n, &one, &Am[0, 0, 0], &n, &work[0], &n, &zero, &Am[3, 0, 0], &n)
+        {{ pfx }}gemm(<char*>'N', <char*>'N', &n, &n, &n, &one, &work[0], &n, &Am[0, 0, 0], &n, &zero, &Am[3, 0, 0], &n)
 
         # inv(V-U) (V+U) = inv(V-U) (V-U+2V) = I + 2 inv(V-U) U
         {{ pfx }}axpy(&n2, &neg_one, &Am[3, 0, 0], &int_one, &Am[1, 0, 0], &int_one)

From 196f522e33899d96fda12f30428cb6985058ec7d Mon Sep 17 00:00:00 2001
From: Ilhan Polat <ilhanpolat@gmail.com>
Date: Wed, 1 Dec 2021 19:00:17 +0100
Subject: [PATCH 15/22] MAINT:linalg: replaced sinch function for expm

---
 scipy/linalg/_matfuncs.py | 40 +++++++++++++++++++++------------------
 1 file changed, 22 insertions(+), 18 deletions(-)

diff --git a/scipy/linalg/_matfuncs.py b/scipy/linalg/_matfuncs.py
index 4480e3f77fbf..8195f3f8882f 100644
--- a/scipy/linalg/_matfuncs.py
+++ b/scipy/linalg/_matfuncs.py
@@ -305,11 +305,14 @@ def expm(A):
                           a[..., [0], [1]],
                           a[..., [1], [0]],
                           a[..., [1], [1]])
-        mu = csqrt((a1-a4)**2 + 4*a2*a3)/2.
+        mu = csqrt((a1-a4)**2 + 4*a2*a3)/2.  # csqrt slow but handles neg.vals
+
         eApD2 = np.exp((a1+a4)/2.)
         AmD2 = (a1 - a4)/2.
         coshMu = np.cosh(mu)
-        sinchMu = _sinch(mu)
+        sinchMu = np.ones_like(coshMu)
+        mask = mu != 0
+        sinchMu[mask] = np.sinh(mu[mask]) / mu[mask]
         eA = np.empty((a.shape), dtype=mu.dtype)
         eA[..., [0], [0]] = eApD2 * (coshMu + AmD2*sinchMu)
         eA[..., [0], [1]] = eApD2 * a2 * sinchMu
@@ -348,24 +351,24 @@ def expm(A):
         if s != 0:  # squaring needed
 
             if (lu[1] == 0) or (lu[0] == 0):  # lower/upper triangular
+                # This branch implements Code Fragment 2.1 of [1]
+
                 diag_aw = np.diag(aw)
                 # einsum returns a writable view
                 np.einsum('ii->i', eAw)[:] = np.exp(diag_aw * 2**(-s))
+                # super/sub diagonal
+                sd = np.diag(aw, k=-1 if lu[1] == 0 else 1)
+
                 for i in range(s-1, -1, -1):
                     eAw = eAw @ eAw
 
                     # diagonal
-                    np.einsum('ii->i', eAw)[:] = np.exp(diag_aw * 2**(-i))
-                    # super/sub diagonal
-                    sd = np.diag(aw, k=-1 if lu[1] == 0 else 1) * 2**(-i)
-
-                    l1_plus_l2 = (diag_aw[:-1] + diag_aw[1:]) * 2**(-i-1)
-                    l1_minus_l2 = (diag_aw[:-1] - diag_aw[1:]) * 2**(-i-1)
-                    esd = sd*np.exp(l1_plus_l2)*_sinch(l1_minus_l2)
+                    np.einsum('ii->i', eAw)[:] = np.exp(diag_aw * 2.**(-i))
+                    exp_sd = _exp_sinch(diag_aw * (2.**(-i))) * (sd * 2**(-i))
                     if lu[1] == 0:  # lower
-                        np.einsum('ii->i', eAw[1:, :-1])[:] = esd
+                        np.einsum('ii->i', eAw[1:, :-1])[:] = exp_sd
                     else:  # upper
-                        np.einsum('ii->i', eAw[:-1, 1:])[:] = esd
+                        np.einsum('ii->i', eAw[:-1, 1:])[:] = exp_sd
 
             else:  # generic
                 for _ in range(s):
@@ -380,13 +383,14 @@ def expm(A):
     return eA
 
 
-def _sinch(x):
-    if np.isscalar(x):
-        return 1. if x == 0. else np.sinh(x)/x
-    m = x == 0.
-    x[m] = 1.
-    x[~m] = np.sinh(x[~m])/x[~m]
-    return x
+def _exp_sinch(x):
+    # Higham's formula (10.42), might overflow, see GH-11839
+    lexp_diff = np.diff(np.exp(x))
+    l_diff = np.diff(x)
+    mask_z = l_diff == 0.
+    lexp_diff[~mask_z] /= l_diff[~mask_z]
+    lexp_diff[mask_z] = np.exp(x[:-1][mask_z])
+    return lexp_diff
 
 
 def cosm(A):

From f26a9d00f66266ac119af410b7888c8653cd5894 Mon Sep 17 00:00:00 2001
From: Ilhan Polat <ilhanpolat@gmail.com>
Date: Thu, 2 Dec 2021 09:44:00 +0100
Subject: [PATCH 16/22] PEP8:linalg: correct spacing _matfuncs.py

---
 scipy/linalg/_matfuncs.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scipy/linalg/_matfuncs.py b/scipy/linalg/_matfuncs.py
index 8195f3f8882f..0f650719b43e 100644
--- a/scipy/linalg/_matfuncs.py
+++ b/scipy/linalg/_matfuncs.py
@@ -20,7 +20,7 @@
 from ._matfuncs_expm import pick_pade_structure, pade_UV_calc
 
 __all__ = ['expm', 'cosm', 'sinm', 'tanm', 'coshm', 'sinhm', 'tanhm', 'logm',
-           'funm', 'signm', 'sqrtm', 'fractional_matrix_power','expm_frechet',
+           'funm', 'signm', 'sqrtm', 'fractional_matrix_power', 'expm_frechet',
            'expm_cond', 'khatri_rao']
 
 eps = np.finfo('d').eps

From 3d4f5dc843e04e047e2c6f8a7110f9dd945f2a00 Mon Sep 17 00:00:00 2001
From: Ilhan Polat <ilhanpolat@gmail.com>
Date: Thu, 2 Dec 2021 09:45:51 +0100
Subject: [PATCH 17/22] MAINT:linalg: Fix access in pade structure

---
 scipy/linalg/_matfuncs_expm.pyx.in | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/scipy/linalg/_matfuncs_expm.pyx.in b/scipy/linalg/_matfuncs_expm.pyx.in
index e8a2f0cb7f32..8c218a1b30a1 100644
--- a/scipy/linalg/_matfuncs_expm.pyx.in
+++ b/scipy/linalg/_matfuncs_expm.pyx.in
@@ -196,7 +196,7 @@ cdef pick_pade_structure_{{ pfx }}({{ tname }}[:, :, ::1] Am):
             return 5, s
 
         # m = 7
-        if n < 360:
+        if n < 400:
             {{ pfx }}gemm(cN, cN, &intn, &intn, &intn, &one, &Amv[2, 0, 0], &intn, &Amv[2, 0, 0], &intn, &zero, &Amv[4, 0, 0], &intn)
             d8 = norm1(Amv[4, :, :]) ** (1./8.)
         else:
@@ -216,7 +216,7 @@ cdef pick_pade_structure_{{ pfx }}({{ tname }}[:, :, ::1] Am):
 
         # m = 13
         # Scale-square
-        if n < 360:
+        if n < 400:
             {{ pfx }}gemm(cN, cN, &intn, &intn, &intn, &one, &Amv[3, 0, 0], &intn, &Amv[2, 0, 0], &intn, &zero, &Amv[4, 0, 0], &intn)
             d10 = norm1(Amv[4, :, :]) ** (1./10.)
         else:
@@ -264,7 +264,7 @@ cdef void pade_357_UV_calc_{{ pfx }}({{ tname }}[:, :, ::]Am, int n, int m) nogi
     cdef {{ prec }} two=2.0
     cdef {{ tname }} one = 1.0, zero = 0.0, neg_one = -1.0
     if not ipiv:
-        raise MemoryError('Internal function "pade_357_UV_calc_c" failed to allocate memory.')
+        raise MemoryError('Internal function "pade_357_UV_calc" failed to allocate memory.')
     try:
         # b[m] is always 1. hence skipped
         if m == 3:
@@ -286,7 +286,7 @@ cdef void pade_357_UV_calc_{{ pfx }}({{ tname }}[:, :, ::]Am, int n, int m) nogi
             b[5] = 1512.
             b[6] = 56.
         else:
-            raise ValueError(f'Internal function "pade_357_UV_calc_c" received an invalid value {m}')
+            raise ValueError(f'Internal function "pade_357_UV_calc" received an invalid value {m}')
 
         # Utilize the unused powers of Am as scratch memory
         if m == 3:
@@ -302,7 +302,7 @@ cdef void pade_357_UV_calc_{{ pfx }}({{ tname }}[:, :, ::]Am, int n, int m) nogi
             # U = Am[0] @ (b[5]*Am[2] + b[3]*Am[1] + b[1]*I_n)
             {{ pfx }}copy(&n2, &Am[1, 0, 0], &int_one, &Am[4, 0, 0], &int_one)
             {{ pfx }}scal(&n2, &b[3], &Am[4, 0, 0], &int_one)
-            {{ pfx }}axpy(&n2, &b[5], &Am[2, 0, 0], &int_one, &Am[4, 0, 0], &int_one)
+            {{ pfx }}axpy(&n2, &one, &Am[2, 0, 0], &int_one, &Am[4, 0, 0], &int_one)
             for i in range(n):
                 Am[4, i, i] = Am[4, i, i] + b[1]
             {{ pfx }}gemm(<char*>'N', <char*>'N', &n, &n, &n, &one, &Am[4, 0, 0], &n, &Am[0, 0, 0], &n, &zero, &Am[3, 0, 0], &n)
@@ -317,7 +317,7 @@ cdef void pade_357_UV_calc_{{ pfx }}({{ tname }}[:, :, ::]Am, int n, int m) nogi
             {{ pfx }}copy(&n2, &Am[1, 0, 0], &int_one, &Am[4, 0, 0], &int_one)
             {{ pfx }}scal(&n2, &b[3], &Am[4, 0, 0], &int_one)
             {{ pfx }}axpy(&n2, &b[5], &Am[2, 0, 0], &int_one, &Am[4, 0, 0], &int_one)
-            {{ pfx }}axpy(&n2, &b[7], &Am[3, 0, 0], &int_one, &Am[4, 0, 0], &int_one)
+            {{ pfx }}axpy(&n2, &one, &Am[3, 0, 0], &int_one, &Am[4, 0, 0], &int_one)
             for i in range(n):
                 Am[4, i, i] = Am[4, i, i] + b[1]
             # We ran out of space for dgemm; first compute V and then reuse space for U

From 48c5966b3960b63479a51d40b1b7d9c96c86d2e9 Mon Sep 17 00:00:00 2001
From: Ilhan Polat <ilhanpolat@gmail.com>
Date: Thu, 2 Dec 2021 09:46:23 +0100
Subject: [PATCH 18/22] ENH:linalg: Typing stubs for expm cython funcs

---
 scipy/linalg/_matfuncs_expm.pyi | 6 ++++++
 1 file changed, 6 insertions(+)
 create mode 100644 scipy/linalg/_matfuncs_expm.pyi

diff --git a/scipy/linalg/_matfuncs_expm.pyi b/scipy/linalg/_matfuncs_expm.pyi
new file mode 100644
index 000000000000..880c69558ec8
--- /dev/null
+++ b/scipy/linalg/_matfuncs_expm.pyi
@@ -0,0 +1,6 @@
+from numpy.typing import NDArray
+from typing import Any, Tuple
+
+def pick_pade_structure(a: NDArray[Any]) -> Tuple[int, int]: ...
+
+def pade_UV_calc(Am: NDArray[Any], n: int, m: int) -> None: ...

From 60f7e99d1655554efacd9d1c4999d8e8568e5bbf Mon Sep 17 00:00:00 2001
From: ilayn <ilhanpolat@gmail.com>
Date: Sat, 12 Mar 2022 15:00:13 +0100
Subject: [PATCH 19/22] BLD:linalg: Add _matfuncs_expm.pyx to meson.build

---
 scipy/linalg/meson.build | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/scipy/linalg/meson.build b/scipy/linalg/meson.build
index b432b7349f5b..e61a596818fd 100644
--- a/scipy/linalg/meson.build
+++ b/scipy/linalg/meson.build
@@ -244,6 +244,22 @@ py3.extension_module('_decomp_update',
   subdir: 'scipy/linalg'
 )
 
+# _matfuncs_expm:
+_matfuncs_expm_pyx = custom_target('_matfuncs_expm',
+  output: '_matfuncs_expm.pyx',
+  input: '_matfuncs_expm.pyx.in',
+  command: [py3, tempita, '@INPUT@', '-o', '@OUTDIR@']
+)
+
+py3.extension_module('_matfuncs_expm',
+  linalg_init_cython_gen.process(_matfuncs_expm_pyx),
+  c_args: cython_c_args,
+  include_directories: inc_np,
+  dependencies: py3_dep,
+  install: true,
+  subdir: 'scipy/linalg'
+)
+
 _cythonized_array_utils = py3.extension_module('_cythonized_array_utils',
   linalg_init_cython_gen.process('_cythonized_array_utils.pyx'),
   c_args: [cython_c_args, c_undefined_ok],

From 93d9b56ecae681982d200c115819a48dd3ed3499 Mon Sep 17 00:00:00 2001
From: ilayn <ilhanpolat@gmail.com>
Date: Sun, 13 Mar 2022 15:31:12 +0100
Subject: [PATCH 20/22] BENCH:linalg: Adjust tests for new dense expm

---
 benchmarks/benchmarks/linalg.py             | 2 --
 benchmarks/benchmarks/sparse_linalg_expm.py | 3 ++-
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/benchmarks/benchmarks/linalg.py b/benchmarks/benchmarks/linalg.py
index f7f674ce1497..2e51d93939d4 100644
--- a/benchmarks/benchmarks/linalg.py
+++ b/benchmarks/benchmarks/linalg.py
@@ -233,8 +233,6 @@ def time_invpascal(self, size):
     def time_toeplitz(self, size):
         sl.toeplitz(self.x)
 
-    def time_tri(self, size):
-        sl.tri(size)
 
 
 class GetFuncs(Benchmark):
diff --git a/benchmarks/benchmarks/sparse_linalg_expm.py b/benchmarks/benchmarks/sparse_linalg_expm.py
index 1dc25090f975..fac933f2c192 100644
--- a/benchmarks/benchmarks/sparse_linalg_expm.py
+++ b/benchmarks/benchmarks/sparse_linalg_expm.py
@@ -6,6 +6,7 @@
 
 with safe_import():
     import scipy.linalg
+    from scipy.sparse.linalg import expm as sp_expm
     from scipy.sparse.linalg import expm_multiply
 
 
@@ -67,6 +68,6 @@ def setup(self, n, format):
 
     def time_expm(self, n, format):
         if format == 'sparse':
-            scipy.linalg.expm(self.A_sparse)
+            sp_expm(self.A_sparse)
         elif format == 'dense':
             scipy.linalg.expm(self.A_dense)

From a6f95fffdb409302b67b1ceb02abdc75ee083c16 Mon Sep 17 00:00:00 2001
From: ilayn <ilhanpolat@gmail.com>
Date: Sun, 13 Mar 2022 20:45:54 +0100
Subject: [PATCH 21/22] MAINT:PEP8:linalg: Remove extra blank line

[ci skip] [skip ci] [skip azp]
---
 benchmarks/benchmarks/linalg.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/benchmarks/benchmarks/linalg.py b/benchmarks/benchmarks/linalg.py
index 2e51d93939d4..f870e8049038 100644
--- a/benchmarks/benchmarks/linalg.py
+++ b/benchmarks/benchmarks/linalg.py
@@ -234,7 +234,6 @@ def time_toeplitz(self, size):
         sl.toeplitz(self.x)
 
 
-
 class GetFuncs(Benchmark):
     def setup(self):
         self.x = np.eye(1)

From 41b9362fcad54fd6d0c74e4b0843497339027a1d Mon Sep 17 00:00:00 2001
From: ilayn <ilhanpolat@gmail.com>
Date: Sun, 13 Mar 2022 23:49:25 +0100
Subject: [PATCH 22/22] MAINT:linalg: Add cnp.import_array() to expm pyx file

---
 scipy/linalg/_matfuncs_expm.pyx.in | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/scipy/linalg/_matfuncs_expm.pyx.in b/scipy/linalg/_matfuncs_expm.pyx.in
index 8c218a1b30a1..05f712ac93b7 100644
--- a/scipy/linalg/_matfuncs_expm.pyx.in
+++ b/scipy/linalg/_matfuncs_expm.pyx.in
@@ -16,6 +16,9 @@ from scipy.linalg.cython_blas cimport (sgemm, saxpy, sscal, scopy, sgemv,
 
 __all__ = ['pick_pade_structure', 'pade_UV_calc']
 
+# See GH-14813
+cnp.import_array()
+
 ctypedef fused lapack_t:
     float
     double