From 9c1854e1bd8de2eb1393993927c7c899fdcaadd4 Mon Sep 17 00:00:00 2001 From: Ilhan Polat Date: Sat, 20 Nov 2021 00:28:38 +0100 Subject: [PATCH 01/22] ENH:linalg: Cythonized core expm functionality --- .gitignore | 2 + scipy/linalg/_matfuncs_expm.pyx.in | 722 +++++++++++++++++++++++++++++ scipy/linalg/setup.py | 3 + 3 files changed, 727 insertions(+) create mode 100644 scipy/linalg/_matfuncs_expm.pyx.in diff --git a/.gitignore b/.gitignore index ab34160de68f..b22d18d96e92 100644 --- a/.gitignore +++ b/.gitignore @@ -198,6 +198,8 @@ scipy/linalg/cython_lapack.pyx scipy/linalg/src/id_dist/src/*_subr_*.f scipy/linalg/_matfuncs_sqrtm_triu.c scipy/linalg/_matfuncs_sqrtm_triu.cpp +scipy/linalg/_matfuncs_expm.c +scipy/linalg/_matfuncs_expm.pyx scipy/ndimage/src/_ni_label.c scipy/ndimage/src/_cytest.c scipy/optimize/_bglu_dense.c diff --git a/scipy/linalg/_matfuncs_expm.pyx.in b/scipy/linalg/_matfuncs_expm.pyx.in new file mode 100644 index 000000000000..5c3384701710 --- /dev/null +++ b/scipy/linalg/_matfuncs_expm.pyx.in @@ -0,0 +1,722 @@ +# cython: language_level=3 +cimport cython +cimport numpy as cnp +import numpy as np +import random +import math +from libc.stdlib cimport malloc, free +from libc.math cimport fabs, ceil, log2, pow +from scipy.linalg.cython_lapack cimport (sgetrf, sgetrs, dgetrf, dgetrs, + cgetrf, cgetrs, zgetrf, zgetrs) +from scipy.linalg.cython_blas cimport (sgemm, saxpy, sscal, scopy, sgemv, + dgemm, daxpy, dscal, dcopy, dgemv, + cgemm, caxpy, cscal, ccopy, cgemv, + zgemm, zaxpy, zscal, zcopy, dgemv, + csscal, zdscal, idamax) + +__all__ = ['pick_pade_structure', 'pade_UV_calc'] + +ctypedef fused lapack_t: + float + double + (float complex) + (double complex) + +ctypedef fused lapack_cz_t: + (float complex) + (double complex) + +ctypedef fused lapack_sd_t: + float + double + +{{py: +# Tempita variables +typenames = ['float', 'double', 'float complex', 'double complex'] +precision = ['float', 'double', 'float', 'double'] +prefixes = ['s', 'd', 'c', 'z'] +}} +# ====================== swap_c_and_f_layout : s, d, c, z ==================== +@cython.cdivision(True) +@cython.wraparound(False) +@cython.boundscheck(False) +@cython.initializedcheck(False) +cdef void swap_c_and_f_layout(lapack_t *a, lapack_t *b, int r, int c, int n) nogil: + """Recursive matrix transposition for square arrays""" + cdef int i, j, ith_row, r2, c2 + cdef lapack_t *bb=b + cdef lapack_t *aa=a + if r < 16: + for j in range(c): + ith_row = 0 + for i in range(r): + # Basically b[i*n+j] = a[j*n+i] without index math + bb[ith_row] = aa[i] + ith_row += n + aa += n + bb += 1 + else: + # If tall + if (r > c): + r2 = r//2 + swap_c_and_f_layout(a, b, r2, c, n) + swap_c_and_f_layout(a + r2, b+(r2)*n, r-r2, c, n) + else: # Nope + c2 = c//2 + swap_c_and_f_layout(a, b, r, c2, n); + swap_c_and_f_layout(a+(c2)*n, b+c2, r, c-c2, n) +# ============================================================================ + +# ========================= norm1 : s, d, c, z =============================== +# The generic abs() function is made nogil friendly in Cython 3.x (unreleased) +@cython.wraparound(False) +@cython.boundscheck(False) +@cython.initializedcheck(False) +cdef double norm1(lapack_t[:, ::1] A): + """ + Fast 1-norm computation for C-contiguous square arrays. + Regardless of the dtype we work in double precision to prevent overflows + """ + cdef Py_ssize_t n = A.shape[0] + cdef Py_ssize_t i, j + cdef int ind = 1, intn = n + cdef double temp = 0. + cdef double complex temp_cz = 0. + cdef double temp_sd = 0. + cdef double *work = malloc(n*sizeof(double)) + if not work: + raise MemoryError('Internal function "norm1" failed to allocate memory.') + try: + if lapack_t in lapack_cz_t: + for j in range(n): + temp_cz = A[0, j] + work[j] = abs(temp_cz) + for i in range(1, n): + for j in range(n): + temp_cz = A[i, j] + work[j] += abs(temp_cz) + else: + for j in range(n): + temp_sd = A[0, j] + work[j] = abs(temp_sd) + for i in range(1, n): + for j in range(n): + temp_sd = A[i, j] + work[j] += abs(temp_sd) + + ind = idamax(&intn, &work[0], &ind) + temp = work[ind-1] + return temp + finally: + free(work) +# ============================================================================ + +# ========================= kth power norm : d =============================== +@cython.wraparound(False) +@cython.boundscheck(False) +@cython.initializedcheck(False) +cdef double kth_power_norm_d(double* A, double* v1, double* v2, Py_ssize_t n, int spins): + cdef int k, int_one = 1, intn = n + cdef double one =1., zero = 0. + for k in range(spins): + dgemv('C', &intn, &intn, &one, &A[0], &intn, &v1[0], &int_one, &zero, &v2[0], &int_one) + dcopy(&intn, &v2[0], &int_one, &v1[0], &int_one) + k = idamax(&intn, &v1[0], &int_one) + return v1[k-1] +# ============================================================================ + +# ====================== pick_pade_structure : s, d, c, z ==================== +{{for tname, pfx, dchar, dt in zip(typenames, prefixes, ['f', 'd', 'F', 'D'], + ['float32', 'float64', 'complex64', 'complex128'], + )}} +@cython.cdivision(True) +@cython.wraparound(False) +@cython.boundscheck(False) +@cython.initializedcheck(False) +cdef pick_pade_structure_{{ pfx }}({{ tname }}[:, ::1] A): + cdef Py_ssize_t n = A.shape[0], i, j, k + cdef int lm = 0, s = 0, intn = n, n2= intn*intn, int_one = 1 + cdef cnp.ndarray[cnp.{{ dt }}_t, ndim=3] Am = np.empty((5, n, n), dtype='{{ dchar }}') + cdef {{ tname }}[:, :, ::1] Amv = Am + cdef: + {{ tname }} one = 1.0, zero = 0.0 + double normA + double d4, d6, d8, d10 + double eta0, eta1, eta2, eta3, eta4 + double u = (2.)**({{if pfx in ['s', 'c']}}-24.{{else}}-53{{endif}}) + double two_pow_s + double temp + cdef double *absA = malloc(n*n*sizeof(double)) + cdef double *work = malloc(n*sizeof(double)) + cdef double *work2 = malloc(n*sizeof(double)) + if not work or not work2 or not absA: + raise MemoryError('Internal function "pick_pade_structure" failed to allocate memory.') + cdef double [5] theta + cdef double [5] coeff + cdef char* cN = 'N' + + theta[0] = 1.495585217958292e-002 + theta[1] = 2.539398330063230e-001 + theta[2] = 9.504178996162932e-001 + theta[3] = 2.097847961257068e+000 + theta[4] = 4.250000000000000e+000 + coeff[0] = u*100800. + coeff[1] = u*10059033600. + coeff[2] = u*4487938430976000. + coeff[3] = u*5914384781877411840000. + coeff[4] = u*113250775606021113483283660800000000. + try: + for j in range(n): + work[j] = 1. + + {{ pfx }}copy(&n2, &A[0, 0], &int_one, &Amv[0, 0, 0], &int_one) + for i in range(n): + for j in range(n): + absA[i*n + j] = abs(A[i, j]) + + # First spin = normest(|A|, 1), increase m when spun more + normA = kth_power_norm_d(absA, work, work2, n, 1) + {{ pfx }}gemm(cN, cN, &intn, &intn, &intn, &one, &Amv[0, 0, 0], &intn, &Amv[0, 0, 0], &intn, &zero, &Amv[1, 0, 0], &intn) + {{ pfx }}gemm(cN, cN, &intn, &intn, &intn, &one, &Amv[1, 0, 0], &intn, &Amv[1, 0, 0], &intn, &zero, &Amv[2, 0, 0], &intn) + {{ pfx }}gemm(cN, cN, &intn, &intn, &intn, &one, &Amv[2, 0, 0], &intn, &Amv[1, 0, 0], &intn, &zero, &Amv[3, 0, 0], &intn) + d4 = norm1(Amv[2]) ** (1./4.) + d6 = norm1(Amv[3]) ** (1./6.) + eta0 = max(d4, d6) + eta1 = eta0 + + # m = 3 + temp = kth_power_norm_d(absA, work, work2, n, 6) + lm = max(ceil(log2(temp/normA/coeff[0])/6), 0) + if eta0 < theta[0] and lm == 0: + return Am, 3, s + + # m = 5 + temp = kth_power_norm_d(absA, work, work2, n, 4) + lm = max(ceil(log2(temp/normA/coeff[1])/10), 0) + if eta1 < theta[1] and lm == 0: + return Am, 5, s + + # m = 7 + if n < 360: + {{ pfx }}gemm(cN, cN, &intn, &intn, &intn, &one, &Amv[2, 0, 0], &intn, &Amv[2, 0, 0], &intn, &zero, &Amv[4, 0, 0], &intn) + d8 = norm1(Amv[4, :, :]) ** (1./8.) + else: + d8 = _norm1est(Am[0], m=8) ** (1./8.) + + eta2 = max(d6, d8) + temp = kth_power_norm_d(absA, work, work2, n, 4) + lm = max(ceil(log2(temp/normA/coeff[2])/14), 0) + if eta2 < theta[2] and lm == 0: + return Am, 7, s + + # m = 9 + temp = kth_power_norm_d(absA, work, work2, n, 4) + lm = max(ceil(log2(temp/normA/coeff[3])/18), 0) + if eta2 < theta[3] and lm == 0: + return Am, 9, s + + # m = 13 + # Scale-square + if n < 360: + {{ pfx }}gemm(cN, cN, &intn, &intn, &intn, &one, &Amv[3, 0, 0], &intn, &Amv[2, 0, 0], &intn, &zero, &Amv[4, 0, 0], &intn) + d10 = norm1(Amv[4, :, :]) ** (1./10.) + else: + d10 = _norm1est(Am[0], m=10) ** (1./10.) + + eta3 = max(d8, d10) + eta4 = min(eta2, eta3) + s = max(ceil(log2(eta4/theta[4])), 0) + if s != 0: + two_pow_s = 2.** (-s) + dscal(&n2, &two_pow_s, absA, &int_one) + # kth_power_norm has spun 19 times already + two_pow_s = 2.** ((-s)*19.) + for i in range(n): + work[i] *= two_pow_s + normA *= 2.**(-s) + temp = kth_power_norm_d(absA, work, work2, n, 8) + s += max(ceil(log2(temp/normA/coeff[4])/26), 0) + return Am, 13, s + finally: + free(work) + free(work2) + free(absA) + + +{{endfor}} +# ============================================================================ + +# ====================== pade_m_UV_calc : s, d, c, z ========================= +# Note: MSVC does not accept "Memview[i, j, k] += 1.+0.j" as a valid operation +# and results with error "C2088: '+=': illegal for struct". +# OCD is unbearable but we do explicit addition for that reason. +{{for tname, pfx, prec, sc in zip(typenames, prefixes, precision, ['s', 'd', 'cs', 'zd'])}} +@cython.wraparound(False) +@cython.boundscheck(False) +@cython.initializedcheck(False) +cdef void pade_357_UV_calc_{{ pfx }}({{ tname }}[:, :, ::]Am, int n, int m) nogil: + cdef {{ tname }} b[7] + cdef int *ipiv = malloc(n*sizeof(int)) + cdef int i, info, n2 = n*n, int_one = 1 + cdef {{ prec }} two=2.0 + cdef {{ tname }} one = 1.0, zero = 0.0, neg_one = -1.0 + if not ipiv: + raise MemoryError('Internal function "pade_357_UV_calc_c" failed to allocate memory.') + # b[m] is always 1. hence skipped + if m == 3: + b[0] = 120. + b[1] = 60. + b[2] = 12. + elif m == 5: + b[0] = 30240. + b[1] = 15120. + b[2] = 3360. + b[3] = 420. + b[4] = 30. + elif m == 7: + b[0] = 17297280. + b[1] = 8648640. + b[2] = 1995840. + b[3] = 277200. + b[4] = 25200. + b[5] = 1512. + b[6] = 56. + else: + raise ValueError(f'Internal function "pade_357_UV_calc_c" received an invalid value {m}') + + # Utilize the unused powers of Am as scratch memory + if m == 3: + # U = Am[0] @ Am[1] + 60.*Am[0] + {{ pfx }}copy(&n2, &Am[0, 0, 0], &int_one, &Am[3, 0, 0], &int_one) + {{ pfx }}gemm('N', 'N', &n, &n, &n, &one, &Am[0, 0, 0], &n, &Am[1, 0, 0], &n, &b[1], &Am[3, 0, 0], &n) + # V = 12.*Am[1] + 120*I_n + {{ pfx }}scal(&n2, &b[2], &Am[1, 0, 0], &int_one) + for i in range(n): + Am[1, i, i] = Am[1, i, i] + b[0] + + elif m == 5: + # U = Am[0] @ (b[5]*Am[2] + b[3]*Am[1] + b[1]*I_n) + {{ pfx }}copy(&n2, &Am[1, 0, 0], &int_one, &Am[4, 0, 0], &int_one) + {{ pfx }}scal(&n2, &b[3], &Am[4, 0, 0], &int_one) + {{ pfx }}axpy(&n2, &b[5], &Am[2, 0, 0], &int_one, &Am[4, 0, 0], &int_one) + for i in range(n): + Am[4, i, i] = Am[4, i, i] + b[1] + {{ pfx }}gemm('N', 'N', &n, &n, &n, &one, &Am[0, 0, 0], &n, &Am[4, 0, 0], &n, &zero, &Am[3, 0, 0], &n) + # V = b[4]*Am[2] + b[2]*Am[1] + b[0]*I_n + {{ pfx }}scal(&n2, &b[2], &Am[1, 0, 0], &int_one) + {{ pfx }}axpy(&n2, &b[4], &Am[2, 0, 0], &int_one, &Am[1, 0, 0], &int_one) + for i in range(n): + Am[1, i, i] = Am[1, i, i] + b[0] + + else: + # U = Am[0] @ (b[7]*Am[3] + b[5]*Am[2] + b[3]*Am[1] + b[1]*I_n) + {{ pfx }}copy(&n2, &Am[1, 0, 0], &int_one, &Am[4, 0, 0], &int_one) + {{ pfx }}scal(&n2, &b[3], &Am[4, 0, 0], &int_one) + {{ pfx }}axpy(&n2, &b[5], &Am[2, 0, 0], &int_one, &Am[4, 0, 0], &int_one) + {{ pfx }}axpy(&n2, &b[7], &Am[3, 0, 0], &int_one, &Am[4, 0, 0], &int_one) + for i in range(n): + Am[4, i, i] = Am[4, i, i] + b[1] + # We ran out of space for dgemm; first compute V and then reuse space for U + # V = b[6]*Am[3] + b[4]*Am[2] + b[2]*Am[1] + b[0]*I_n + {{ pfx }}scal(&n2, &b[2], &Am[1, 0, 0], &int_one) + {{ pfx }}axpy(&n2, &b[4], &Am[2, 0, 0], &int_one, &Am[1, 0, 0], &int_one) + {{ pfx }}axpy(&n2, &b[6], &Am[3, 0, 0], &int_one, &Am[1, 0, 0], &int_one) + for i in range(n): + Am[1, i, i] = Am[1, i, i] + b[0] + # Now we can scratch A[2] or A[3] + {{ pfx }}gemm('N', 'N', &n, &n, &n, &one, &Am[0, 0, 0], &n, &Am[4, 0, 0], &n, &zero, &Am[3, 0, 0], &n) + + # inv(V-U) (V+U) = inv(V-U) (V-U+2V) = I + 2 inv(V-U) U + {{ pfx }}axpy(&n2, &neg_one, &Am[3, 0, 0], &int_one, &Am[1, 0, 0], &int_one) + + # Convert array layout for solving AX = B into Am[2] + swap_c_and_f_layout(&Am[3, 0, 0], &Am[2, 0, 0], n, n, n) + + {{ pfx }}getrf( &n, &n, &Am[1, 0, 0], &n, &ipiv[0], &info ) + {{ pfx }}getrs('C', &n, &n, &Am[1, 0, 0], &n, &ipiv[0], &Am[2, 0, 0], &n, &info ) + {{ sc }}scal(&n2, &two, &Am[2, 0, 0], &int_one) + for i in range(n): + Am[2, i, i] = Am[2, i, i] + 1. + + # Put it back in Am in C order + swap_c_and_f_layout(&Am[2, 0, 0], &Am[0, 0, 0], n, n, n) + free(ipiv) + + +{{endfor}} +{{for tname, pfx, sc in zip(typenames, prefixes, ['s', 'd', 'cs', 'zd'])}} +@cython.wraparound(False) +@cython.boundscheck(False) +@cython.initializedcheck(False) +cdef void pade_9_UV_calc_{{ pfx }}({{ tname }}[:, :, ::]Am, int n) nogil: + cdef {{ tname }} b[9] + cdef {{ tname }} *work = <{{ tname }}*>malloc(n*n*sizeof({{ tname }})) + cdef int *ipiv = malloc(n*sizeof(int)) + cdef int i, info, n2 = n*n, int_one = 1 + cdef {{ tname }} one = 1.0, zero = 0.0, neg_one = -1.0 + cdef {{if pfx in ['s', 'c']}}float{{else}}double{{endif}} two = 2.0 + + if not (work and ipiv): + raise MemoryError('Internal function "pade_9_UV_calc" failed to allocate memory.') + try: + # b[9] = 1. hence skipped + b[0] = 17643225600. + b[1] = 8821612800. + b[2] = 2075673600. + b[3] = 302702400. + b[4] = 30270240. + b[5] = 2162160. + b[6] = 110880. + b[7] = 3960. + b[8] = 90. + + # Utilize the unused powers of Am as scratch memory + # U = Am[0] @ (b[9]*Am[4] + b[7]*Am[3] + b[5]*Am[2] + b[3]*Am[1] + b[1]*I_n) + {{ pfx }}copy(&n2, &Am[1, 0, 0], &int_one, &work[0], &int_one) + {{ pfx }}scal(&n2, &b[3], &work[0], &int_one) + {{ pfx }}axpy(&n2, &b[5], &Am[2, 0, 0], &int_one, &work[0], &int_one) + {{ pfx }}axpy(&n2, &b[7], &Am[3, 0, 0], &int_one, &work[0], &int_one) + {{ pfx }}axpy(&n2, &one, &Am[4, 0, 0], &int_one, &work[0], &int_one) + for i in range(n): + work[i*(n+1)] += b[1] + # V = b[8]*Am[4] + b[6]*Am[3] + b[4]*Am[2] + b[2]*Am[1] + b[0]*I_n + {{ pfx }}scal(&n2, &b[2], &Am[1, 0, 0], &int_one) + {{ pfx }}axpy(&n2, &b[4], &Am[2, 0, 0], &int_one, &Am[1, 0, 0], &int_one) + {{ pfx }}axpy(&n2, &b[6], &Am[3, 0, 0], &int_one, &Am[1, 0, 0], &int_one) + {{ pfx }}axpy(&n2, &b[8], &Am[4, 0, 0], &int_one, &Am[1, 0, 0], &int_one) + for i in range(n): + Am[1, i, i] = Am[1, i, i] + b[0] + # Now we can scratch A[2] or A[3] + {{ pfx }}gemm('N', 'N', &n, &n, &n, &one, &Am[0, 0, 0], &n, &work[0], &n, &zero, &Am[3, 0, 0], &n) + + # inv(V-U) (V+U) = inv(V-U) (V-U+2V) = I + 2 inv(V-U) U + {{ pfx }}axpy(&n2, &neg_one, &Am[3, 0, 0], &int_one, &Am[1, 0, 0], &int_one) + + # Convert array layout for solving AX = B into Am[2] + swap_c_and_f_layout(&Am[3, 0, 0], &Am[2, 0, 0], n, n, n) + + {{ pfx }}getrf( &n, &n, &Am[1, 0, 0], &n, &ipiv[0], &info ) + {{ pfx }}getrs('C', &n, &n, &Am[1, 0, 0], &n, &ipiv[0], &Am[2, 0, 0], &n, &info) + {{ sc }}scal(&n2, &two, &Am[2, 0, 0], &int_one) + for i in range(n): + Am[2, i, i] = Am[2, i, i] + 1. + + # Put it back in Am in C order + swap_c_and_f_layout(&Am[2, 0, 0], &Am[0, 0, 0], n, n, n) + finally: + free(work) + free(ipiv) + + +{{endfor}} +{{for tname, pfx, sc in zip(typenames, prefixes, ['s', 'd', 'cs', 'zd'])}} +@cython.wraparound(False) +@cython.boundscheck(False) +@cython.initializedcheck(False) +cdef void pade_13_UV_calc_{{ pfx }}({{ tname }}[:, :, ::1]Am, int n) nogil: + cdef {{ tname }} *work2 = <{{ tname }}*>malloc(n*n*sizeof({{ tname }})) + cdef {{ tname }} *work3 = <{{ tname }}*>malloc(n*n*sizeof({{ tname }})) + cdef {{ tname }} *work4 = <{{ tname }}*>malloc(n*n*sizeof({{ tname }})) + cdef int *ipiv = malloc(n*sizeof(int)) + cdef Py_ssize_t i, j, k + cdef int info, int1 = 1, n2 = n*n + cdef {{ tname }} one = 1.0, zero = 0.0, neg_one = -1. + cdef {{if pfx in ['s', 'c']}}float{{else}}double{{endif}} two = 2.0 + cdef {{ tname }} b[14] + b[0] = 64764752532480000. + b[1] = 32382376266240000. + b[2] = 7771770303897600. + b[3] = 1187353796428800. + b[4] = 129060195264000. + b[5] = 10559470521600. + b[6] = 670442572800. + b[7] = 33522128640. + b[8] = 1323241920. + b[9] = 40840800. + b[10] = 960960. + b[11] = 16380. + b[12] = 182. + b[13] = 1. + + if not (work2 and work3 and work4 and ipiv): + raise MemoryError('Internal function "pade_13_UV_calc" failed to allocate memory.') + try: + # U = Am[0] @ (Am[3] @ (b[13]*Am[3] + b[11]*Am[2] + b[9]*Am[1]) + + # b[7]*Am[3] + b[5]*Am[2] + b[3]*Am[1] + b[1]*I_n) + # V = Am[3] @ (b[12]*Am[3] + b[10]*Am[2] + b[8]*Am[1]) + + # b[6]*Am[3] + b[4]*Am[2] + b[2]*Am[1] + b[0]*I_n + + {{ pfx }}copy(&n2, &Am[1, 0, 0], &int1, &work2[0], &int1) + {{ pfx }}scal(&n2, &b[9], &work2[0], &int1) + {{ pfx }}axpy(&n2, &b[11], &Am[2, 0, 0], &int1, &work2[0], &int1) + {{ pfx }}axpy(&n2, &b[13], &Am[3, 0, 0], &int1, &work2[0], &int1) + + {{ pfx }}copy(&n2, &Am[1, 0, 0], &int1, &work3[0], &int1) + {{ pfx }}scal(&n2, &b[2], &work3[0], &int1) + {{ pfx }}axpy(&n2, &b[4], &Am[2, 0, 0], &int1, &work3[0], &int1) + {{ pfx }}axpy(&n2, &b[6], &Am[3, 0, 0], &int1, &work3[0], &int1) + + {{ pfx }}copy(&n2, &Am[1, 0, 0], &int1, &work4[0], &int1) + {{ pfx }}scal(&n2, &b[8], &work4[0], &int1) + {{ pfx }}axpy(&n2, &b[10], &Am[2, 0, 0], &int1, &work4[0], &int1) + {{ pfx }}axpy(&n2, &b[12], &Am[3, 0, 0], &int1, &work4[0], &int1) + + # Overwrite Am[1] as it is not used further + {{ pfx }}scal(&n2, &b[3], &Am[1, 0, 0], &int1) + {{ pfx }}axpy(&n2, &b[5], &Am[2, 0, 0], &int1, &Am[1, 0, 0], &int1) + {{ pfx }}axpy(&n2, &b[7], &Am[3, 0, 0], &int1, &Am[1, 0, 0], &int1) + + for i in range(n): + Am[1, i, i] = Am[1, i, i] + b[1] + work3[i*(n+1)] += b[0] + + # U = D @ (A @ B + C) + {{ pfx }}gemm('N', 'N', &n, &n, &n, &one, &work2[0], &n, &Am[3, 0, 0], &n, &one, &Am[1, 0, 0], &n) + {{ pfx }}gemm('N', 'N', &n, &n, &n, &one, &Am[1, 0, 0], &n, &Am[0, 0, 0], &n, &zero, &work2[0], &n) + # V = A @ B + C + {{ pfx }}gemm('N', 'N', &n, &n, &n, &one, &work4[0], &n, &Am[3, 0, 0], &n, &one, &work3[0], &n) + # inv(V-U) (V+U) = inv(V-U) (V-U+2V) = I + 2 inv(V-U) U + {{ pfx }}axpy(&n2, &neg_one, &work2[0], &int1, &work3[0], &int1) + + # Convert array layout for solving AX = B into work4 + swap_c_and_f_layout(work2, work4, n, n, n) + free(work2) + + {{ pfx }}getrf( &n, &n, &work3[0], &n, &ipiv[0], &info ) + {{ pfx }}getrs('C', &n, &n, &work3[0], &n, &ipiv[0], &work4[0], &n, &info ) + {{ sc }}scal(&n2, &two, &work4[0], &int1) + for i in range(n): + work4[i*(n+1)] += 1. + # Put it back in Am in C order + swap_c_and_f_layout(work4, &Am[0, 0, 0], n, n, n) + finally: + free(ipiv) + free(work3) + free(work4) + + +{{endfor}} +# ============================================================================ + +# ====================== norm1est ============================================ +def _norm1est(A, m=1, t=2, max_iter=5): + """Compute a lower bound for the 1-norm of 2D matrix A or its powers + + Parameters + ---------- + A : ndarray + Input square array of shape (N, N). + m : int, optional + If it is different than one, then m-th power of the matrix norm is + computed. + t : int, optional + The number of columns of the internal matrix used in the iterations. + max_iter : int, optional + The number of total iterations to be performed. Problems that require + more than 5 iterations are rarely reported in practice. + + Returns + ------- + c : float + The resulting 1-norm condition number estimate of A. + + Notes + ----- + Implements a SciPy adaptation of Algorithm 2.4 of [1], and the original + Fortran code given in [2]. + + The algorithm involves randomized elements and hence if needed, the seed + of the Python built-in random module can be set for reproducible results. + + References + ---------- + .. [1] Nicholas J. Higham and Françoise Tisseur (2000), + "A Block Algorithm for Matrix 1-Norm Estimation, + with an Application to 1-Norm Pseudospectra." + SIAM J. Matrix Anal. Appl. Vol. 21, No. 4, pp. 1185-1201. + + .. [2] Sheung Hun Cheng, Nicholas J. Higham (2001), "Implementation for + LAPACK of a Block Algorithm for Matrix 1-Norm Estimation", + NA Report 393 + + """ + # Ref [1] skips parallel col test for complex inputs + real_A = np.isrealobj(A) + n = A.shape[0] + est_old = 0 + ind_hist = [] + S = np.zeros([n, 2*t], dtype=np.int8 if real_A else A.dtype) + Y = np.empty([n, t], dtype=A.dtype) + Y[:, 0] = A.sum(axis=1) / n + + # Higham and Tisseur assigns random 1, -1 for initialization but they also + # mention that it is arbitrary. Hence instead we use e_j to already start + # the while loop. Also we don't use a temporary X but keep indices instead + if t > 1: + cols = random.sample(population=range(n), k=t-1) + Y[:, 1:t] = A[:, cols] + ind_hist += cols + + for k in range(max_iter): + if m >= 1: + for _ in range(m-1): + Y = A @ Y + + Y_sums = (np.abs(Y)).sum(axis=0) + best_j = np.argmax(Y_sums) + est = Y_sums[best_j] + if est <= est_old: # (1) + est = est_old + break + # else: + # w = Y[:, best_j] + est_old = est + S[:, :t] = S[:, t:] + _custom_signum(Y, S[:, t:], not real_A) + if t > 1 and real_A: + # (2) + if ((S[:, t:].T @ S[:, :t]).max(axis=1) == n).all() and k > 0: + break + else: + # replaces S in-place + _replace_parallel_cols(S) + + # (3) + Z = A.conj().T @ S + if m >= 1: + for _ in range(m-1): + Z = A.conj().T @ Z + + Z_sums = (np.abs(Z)).sum(axis=1) + if np.argmax(Z_sums) == best_j: # (4) + break + h_sorter = np.argsort(Z_sums) + if all([x in ind_hist for x in h_sorter[:t]]): # (5) + break + else: + pick = random.choice(range(n)) + for _ in range(t): + while pick in ind_hist: + pick = random.choice(range(n)) + ind_hist += [pick] + + Y = A[:, ind_hist[-t:]] + + # v = np.zeros_like(X[:, 0]) # just some equal size array + # v[best_j] = 1 + + return est # , v, w + + +def _replace_parallel_cols(S): + """Helper function for 1-norm condition number estimator. + + This function is akin to ?LARPC function given in Cheng, Higham (see Ref + section of expm, [2]). + + Parameters + ---------- + S : ndarray + Working array of the current iteration + S_old : ndarray + Working array of the previous iteration. Pass None if only S exists. + rng : Generator + The random generator used inside the body of condition estimator + + Returns + ------- + None + + """ + n, t = S.shape + max_spin = math.ceil(n / t) + + for col in range(t): + curr_col = t + col + n_it = 0 + while round(np.abs(S[:, col] @ S[:, :curr_col]).max()) == n: + S[:, col] = random.choices([1, -1], k=n) + n_it += 1 + if n_it > max_spin: + break + + return + + +def _custom_signum(M, sgnM, is_complex=False): + """Helper function for 1-norm condition number estimator. + + This function is defined to comply with ref. [1] of condest. For reals, + nonnegative inputs are mapped to 1 and to -1 otherwise. For complex inputs + sgn(x) = x / ||x|| and sgn(0 + 0j) := 1. + + Parameters + ---------- + M : ndarray + Input array + + sgnM : ndarray + Output array that will be overwritten + + Returns + ------- + sgnM : ndarray + The resulting sign(M) array. + + """ + if is_complex: + sgnM.fill(1.) + mask = M != 0. + sgnM[mask] = M[mask] / np.abs(M[mask]) + else: + np.signbit(M, out=sgnM) + + +@cython.initializedcheck(False) +def pade_UV_calc(lapack_t[:, :, ::1]Am, int n, int m): + """Helper functions for expm to solve the final polynomial evaluation""" + if lapack_t is float: + if m in [3, 5, 7]: + pade_357_UV_calc_s(Am, n, m) + elif m == 9: + pade_9_UV_calc_s(Am, n) + else: + pade_13_UV_calc_s(Am, n) + elif lapack_t is double: + if m in [3, 5, 7]: + pade_357_UV_calc_d(Am, n, m) + elif m == 9: + pade_9_UV_calc_d(Am, n) + else: + pade_13_UV_calc_d(Am, n) + elif lapack_t is floatcomplex: + if m in [3, 5, 7]: + pade_357_UV_calc_c(Am, n, m) + elif m == 9: + pade_9_UV_calc_c(Am, n) + else: + pade_13_UV_calc_c(Am, n) + elif lapack_t is doublecomplex: + if m in [3, 5, 7]: + pade_357_UV_calc_z(Am, n, m) + elif m == 9: + pade_9_UV_calc_z(Am, n) + else: + pade_13_UV_calc_z(Am, n) + else: + raise ValueError('Internal function "pade_UV_calc" received an unsupported dtype') + + +@cython.initializedcheck(False) +def pick_pade_structure(lapack_t[:, ::1]A): + """Helper functions for expm to choose Pade approximation order""" + if lapack_t is float: + return pick_pade_structure_s(A) + elif lapack_t is double: + return pick_pade_structure_d(A) + elif lapack_t is floatcomplex: + return pick_pade_structure_c(A) + elif lapack_t is doublecomplex: + return pick_pade_structure_z(A) + else: + raise ValueError('Internal function "pick_pade_structure" received an unsupported dtype.') diff --git a/scipy/linalg/setup.py b/scipy/linalg/setup.py index 2f0ef0a48c17..f1ee9b40df64 100644 --- a/scipy/linalg/setup.py +++ b/scipy/linalg/setup.py @@ -152,6 +152,9 @@ def configuration(parent_package='', top_path=None): config.add_extension('_cythonized_array_utils', sources=['_cythonized_array_utils.c']) + config.add_extension('_matfuncs_expm', + sources=['_matfuncs_expm.c']) + # Add any license files config.add_data_files('src/id_dist/doc/doc.tex') config.add_data_files('src/lapack_deprecations/LICENSE') From de83d35c56fa01ad6255e8029f4a5ac50e3c90bb Mon Sep 17 00:00:00 2001 From: Ilhan Polat Date: Sat, 20 Nov 2021 15:11:06 +0100 Subject: [PATCH 02/22] MAINT:linalg: Confirm malloc() objects freed --- scipy/linalg/_matfuncs_expm.pyx.in | 150 +++++++++++++++-------------- 1 file changed, 76 insertions(+), 74 deletions(-) diff --git a/scipy/linalg/_matfuncs_expm.pyx.in b/scipy/linalg/_matfuncs_expm.pyx.in index 5c3384701710..5356d8e55472 100644 --- a/scipy/linalg/_matfuncs_expm.pyx.in +++ b/scipy/linalg/_matfuncs_expm.pyx.in @@ -262,85 +262,87 @@ cdef void pade_357_UV_calc_{{ pfx }}({{ tname }}[:, :, ::]Am, int n, int m) nogi cdef {{ tname }} one = 1.0, zero = 0.0, neg_one = -1.0 if not ipiv: raise MemoryError('Internal function "pade_357_UV_calc_c" failed to allocate memory.') - # b[m] is always 1. hence skipped - if m == 3: - b[0] = 120. - b[1] = 60. - b[2] = 12. - elif m == 5: - b[0] = 30240. - b[1] = 15120. - b[2] = 3360. - b[3] = 420. - b[4] = 30. - elif m == 7: - b[0] = 17297280. - b[1] = 8648640. - b[2] = 1995840. - b[3] = 277200. - b[4] = 25200. - b[5] = 1512. - b[6] = 56. - else: - raise ValueError(f'Internal function "pade_357_UV_calc_c" received an invalid value {m}') - - # Utilize the unused powers of Am as scratch memory - if m == 3: - # U = Am[0] @ Am[1] + 60.*Am[0] - {{ pfx }}copy(&n2, &Am[0, 0, 0], &int_one, &Am[3, 0, 0], &int_one) - {{ pfx }}gemm('N', 'N', &n, &n, &n, &one, &Am[0, 0, 0], &n, &Am[1, 0, 0], &n, &b[1], &Am[3, 0, 0], &n) - # V = 12.*Am[1] + 120*I_n - {{ pfx }}scal(&n2, &b[2], &Am[1, 0, 0], &int_one) - for i in range(n): - Am[1, i, i] = Am[1, i, i] + b[0] + try: + # b[m] is always 1. hence skipped + if m == 3: + b[0] = 120. + b[1] = 60. + b[2] = 12. + elif m == 5: + b[0] = 30240. + b[1] = 15120. + b[2] = 3360. + b[3] = 420. + b[4] = 30. + elif m == 7: + b[0] = 17297280. + b[1] = 8648640. + b[2] = 1995840. + b[3] = 277200. + b[4] = 25200. + b[5] = 1512. + b[6] = 56. + else: + raise ValueError(f'Internal function "pade_357_UV_calc_c" received an invalid value {m}') - elif m == 5: - # U = Am[0] @ (b[5]*Am[2] + b[3]*Am[1] + b[1]*I_n) - {{ pfx }}copy(&n2, &Am[1, 0, 0], &int_one, &Am[4, 0, 0], &int_one) - {{ pfx }}scal(&n2, &b[3], &Am[4, 0, 0], &int_one) - {{ pfx }}axpy(&n2, &b[5], &Am[2, 0, 0], &int_one, &Am[4, 0, 0], &int_one) - for i in range(n): - Am[4, i, i] = Am[4, i, i] + b[1] - {{ pfx }}gemm('N', 'N', &n, &n, &n, &one, &Am[0, 0, 0], &n, &Am[4, 0, 0], &n, &zero, &Am[3, 0, 0], &n) - # V = b[4]*Am[2] + b[2]*Am[1] + b[0]*I_n - {{ pfx }}scal(&n2, &b[2], &Am[1, 0, 0], &int_one) - {{ pfx }}axpy(&n2, &b[4], &Am[2, 0, 0], &int_one, &Am[1, 0, 0], &int_one) - for i in range(n): - Am[1, i, i] = Am[1, i, i] + b[0] + # Utilize the unused powers of Am as scratch memory + if m == 3: + # U = Am[0] @ Am[1] + 60.*Am[0] + {{ pfx }}copy(&n2, &Am[0, 0, 0], &int_one, &Am[3, 0, 0], &int_one) + {{ pfx }}gemm('N', 'N', &n, &n, &n, &one, &Am[0, 0, 0], &n, &Am[1, 0, 0], &n, &b[1], &Am[3, 0, 0], &n) + # V = 12.*Am[1] + 120*I_n + {{ pfx }}scal(&n2, &b[2], &Am[1, 0, 0], &int_one) + for i in range(n): + Am[1, i, i] = Am[1, i, i] + b[0] - else: - # U = Am[0] @ (b[7]*Am[3] + b[5]*Am[2] + b[3]*Am[1] + b[1]*I_n) - {{ pfx }}copy(&n2, &Am[1, 0, 0], &int_one, &Am[4, 0, 0], &int_one) - {{ pfx }}scal(&n2, &b[3], &Am[4, 0, 0], &int_one) - {{ pfx }}axpy(&n2, &b[5], &Am[2, 0, 0], &int_one, &Am[4, 0, 0], &int_one) - {{ pfx }}axpy(&n2, &b[7], &Am[3, 0, 0], &int_one, &Am[4, 0, 0], &int_one) - for i in range(n): - Am[4, i, i] = Am[4, i, i] + b[1] - # We ran out of space for dgemm; first compute V and then reuse space for U - # V = b[6]*Am[3] + b[4]*Am[2] + b[2]*Am[1] + b[0]*I_n - {{ pfx }}scal(&n2, &b[2], &Am[1, 0, 0], &int_one) - {{ pfx }}axpy(&n2, &b[4], &Am[2, 0, 0], &int_one, &Am[1, 0, 0], &int_one) - {{ pfx }}axpy(&n2, &b[6], &Am[3, 0, 0], &int_one, &Am[1, 0, 0], &int_one) - for i in range(n): - Am[1, i, i] = Am[1, i, i] + b[0] - # Now we can scratch A[2] or A[3] - {{ pfx }}gemm('N', 'N', &n, &n, &n, &one, &Am[0, 0, 0], &n, &Am[4, 0, 0], &n, &zero, &Am[3, 0, 0], &n) + elif m == 5: + # U = Am[0] @ (b[5]*Am[2] + b[3]*Am[1] + b[1]*I_n) + {{ pfx }}copy(&n2, &Am[1, 0, 0], &int_one, &Am[4, 0, 0], &int_one) + {{ pfx }}scal(&n2, &b[3], &Am[4, 0, 0], &int_one) + {{ pfx }}axpy(&n2, &b[5], &Am[2, 0, 0], &int_one, &Am[4, 0, 0], &int_one) + for i in range(n): + Am[4, i, i] = Am[4, i, i] + b[1] + {{ pfx }}gemm('N', 'N', &n, &n, &n, &one, &Am[0, 0, 0], &n, &Am[4, 0, 0], &n, &zero, &Am[3, 0, 0], &n) + # V = b[4]*Am[2] + b[2]*Am[1] + b[0]*I_n + {{ pfx }}scal(&n2, &b[2], &Am[1, 0, 0], &int_one) + {{ pfx }}axpy(&n2, &b[4], &Am[2, 0, 0], &int_one, &Am[1, 0, 0], &int_one) + for i in range(n): + Am[1, i, i] = Am[1, i, i] + b[0] + + else: + # U = Am[0] @ (b[7]*Am[3] + b[5]*Am[2] + b[3]*Am[1] + b[1]*I_n) + {{ pfx }}copy(&n2, &Am[1, 0, 0], &int_one, &Am[4, 0, 0], &int_one) + {{ pfx }}scal(&n2, &b[3], &Am[4, 0, 0], &int_one) + {{ pfx }}axpy(&n2, &b[5], &Am[2, 0, 0], &int_one, &Am[4, 0, 0], &int_one) + {{ pfx }}axpy(&n2, &b[7], &Am[3, 0, 0], &int_one, &Am[4, 0, 0], &int_one) + for i in range(n): + Am[4, i, i] = Am[4, i, i] + b[1] + # We ran out of space for dgemm; first compute V and then reuse space for U + # V = b[6]*Am[3] + b[4]*Am[2] + b[2]*Am[1] + b[0]*I_n + {{ pfx }}scal(&n2, &b[2], &Am[1, 0, 0], &int_one) + {{ pfx }}axpy(&n2, &b[4], &Am[2, 0, 0], &int_one, &Am[1, 0, 0], &int_one) + {{ pfx }}axpy(&n2, &b[6], &Am[3, 0, 0], &int_one, &Am[1, 0, 0], &int_one) + for i in range(n): + Am[1, i, i] = Am[1, i, i] + b[0] + # Now we can scratch A[2] or A[3] + {{ pfx }}gemm('N', 'N', &n, &n, &n, &one, &Am[0, 0, 0], &n, &Am[4, 0, 0], &n, &zero, &Am[3, 0, 0], &n) - # inv(V-U) (V+U) = inv(V-U) (V-U+2V) = I + 2 inv(V-U) U - {{ pfx }}axpy(&n2, &neg_one, &Am[3, 0, 0], &int_one, &Am[1, 0, 0], &int_one) + # inv(V-U) (V+U) = inv(V-U) (V-U+2V) = I + 2 inv(V-U) U + {{ pfx }}axpy(&n2, &neg_one, &Am[3, 0, 0], &int_one, &Am[1, 0, 0], &int_one) - # Convert array layout for solving AX = B into Am[2] - swap_c_and_f_layout(&Am[3, 0, 0], &Am[2, 0, 0], n, n, n) + # Convert array layout for solving AX = B into Am[2] + swap_c_and_f_layout(&Am[3, 0, 0], &Am[2, 0, 0], n, n, n) - {{ pfx }}getrf( &n, &n, &Am[1, 0, 0], &n, &ipiv[0], &info ) - {{ pfx }}getrs('C', &n, &n, &Am[1, 0, 0], &n, &ipiv[0], &Am[2, 0, 0], &n, &info ) - {{ sc }}scal(&n2, &two, &Am[2, 0, 0], &int_one) - for i in range(n): - Am[2, i, i] = Am[2, i, i] + 1. + {{ pfx }}getrf( &n, &n, &Am[1, 0, 0], &n, &ipiv[0], &info ) + {{ pfx }}getrs('C', &n, &n, &Am[1, 0, 0], &n, &ipiv[0], &Am[2, 0, 0], &n, &info ) + {{ sc }}scal(&n2, &two, &Am[2, 0, 0], &int_one) + for i in range(n): + Am[2, i, i] = Am[2, i, i] + 1. - # Put it back in Am in C order - swap_c_and_f_layout(&Am[2, 0, 0], &Am[0, 0, 0], n, n, n) - free(ipiv) + # Put it back in Am in C order + swap_c_and_f_layout(&Am[2, 0, 0], &Am[0, 0, 0], n, n, n) + finally: + free(ipiv) {{endfor}} @@ -480,7 +482,6 @@ cdef void pade_13_UV_calc_{{ pfx }}({{ tname }}[:, :, ::1]Am, int n) nogil: # Convert array layout for solving AX = B into work4 swap_c_and_f_layout(work2, work4, n, n, n) - free(work2) {{ pfx }}getrf( &n, &n, &work3[0], &n, &ipiv[0], &info ) {{ pfx }}getrs('C', &n, &n, &work3[0], &n, &ipiv[0], &work4[0], &n, &info ) @@ -491,6 +492,7 @@ cdef void pade_13_UV_calc_{{ pfx }}({{ tname }}[:, :, ::1]Am, int n) nogil: swap_c_and_f_layout(work4, &Am[0, 0, 0], n, n, n) finally: free(ipiv) + free(work2) free(work3) free(work4) From 5bb5fe9f1c398319f1a3f373bfe5ba2122d23b0b Mon Sep 17 00:00:00 2001 From: Ilhan Polat Date: Sat, 20 Nov 2021 21:33:21 +0100 Subject: [PATCH 03/22] MAINT:linalg: allocate expm scratch array once at the start --- scipy/linalg/_matfuncs_expm.pyx.in | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/scipy/linalg/_matfuncs_expm.pyx.in b/scipy/linalg/_matfuncs_expm.pyx.in index 5356d8e55472..2e2db8e96551 100644 --- a/scipy/linalg/_matfuncs_expm.pyx.in +++ b/scipy/linalg/_matfuncs_expm.pyx.in @@ -133,10 +133,9 @@ cdef double kth_power_norm_d(double* A, double* v1, double* v2, Py_ssize_t n, in @cython.wraparound(False) @cython.boundscheck(False) @cython.initializedcheck(False) -cdef pick_pade_structure_{{ pfx }}({{ tname }}[:, ::1] A): - cdef Py_ssize_t n = A.shape[0], i, j, k +cdef pick_pade_structure_{{ pfx }}({{ tname }}[:, :, ::1] Am): + cdef Py_ssize_t n = Am.shape[1], i, j, k cdef int lm = 0, s = 0, intn = n, n2= intn*intn, int_one = 1 - cdef cnp.ndarray[cnp.{{ dt }}_t, ndim=3] Am = np.empty((5, n, n), dtype='{{ dchar }}') cdef {{ tname }}[:, :, ::1] Amv = Am cdef: {{ tname }} one = 1.0, zero = 0.0 @@ -169,10 +168,10 @@ cdef pick_pade_structure_{{ pfx }}({{ tname }}[:, ::1] A): for j in range(n): work[j] = 1. - {{ pfx }}copy(&n2, &A[0, 0], &int_one, &Amv[0, 0, 0], &int_one) + # {{ pfx }}copy(&n2, &A[0, 0], &int_one, &Amv[0, 0, 0], &int_one) for i in range(n): for j in range(n): - absA[i*n + j] = abs(A[i, j]) + absA[i*n + j] = abs(Am[0, i, j]) # First spin = normest(|A|, 1), increase m when spun more normA = kth_power_norm_d(absA, work, work2, n, 1) @@ -188,13 +187,13 @@ cdef pick_pade_structure_{{ pfx }}({{ tname }}[:, ::1] A): temp = kth_power_norm_d(absA, work, work2, n, 6) lm = max(ceil(log2(temp/normA/coeff[0])/6), 0) if eta0 < theta[0] and lm == 0: - return Am, 3, s + return 3, s # m = 5 temp = kth_power_norm_d(absA, work, work2, n, 4) lm = max(ceil(log2(temp/normA/coeff[1])/10), 0) if eta1 < theta[1] and lm == 0: - return Am, 5, s + return 5, s # m = 7 if n < 360: @@ -207,13 +206,13 @@ cdef pick_pade_structure_{{ pfx }}({{ tname }}[:, ::1] A): temp = kth_power_norm_d(absA, work, work2, n, 4) lm = max(ceil(log2(temp/normA/coeff[2])/14), 0) if eta2 < theta[2] and lm == 0: - return Am, 7, s + return 7, s # m = 9 temp = kth_power_norm_d(absA, work, work2, n, 4) lm = max(ceil(log2(temp/normA/coeff[3])/18), 0) if eta2 < theta[3] and lm == 0: - return Am, 9, s + return 9, s # m = 13 # Scale-square @@ -236,7 +235,7 @@ cdef pick_pade_structure_{{ pfx }}({{ tname }}[:, ::1] A): normA *= 2.**(-s) temp = kth_power_norm_d(absA, work, work2, n, 8) s += max(ceil(log2(temp/normA/coeff[4])/26), 0) - return Am, 13, s + return 13, s finally: free(work) free(work2) @@ -710,7 +709,7 @@ def pade_UV_calc(lapack_t[:, :, ::1]Am, int n, int m): @cython.initializedcheck(False) -def pick_pade_structure(lapack_t[:, ::1]A): +def pick_pade_structure(lapack_t[:, :, ::1]A): """Helper functions for expm to choose Pade approximation order""" if lapack_t is float: return pick_pade_structure_s(A) From 876a1b57d11c9aba803869542e658ff4685ec263 Mon Sep 17 00:00:00 2001 From: Ilhan Polat Date: Sun, 21 Nov 2021 03:41:24 +0100 Subject: [PATCH 04/22] FIX:linalg: correct expm spurious array conjugation --- scipy/linalg/_matfuncs_expm.pyx.in | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scipy/linalg/_matfuncs_expm.pyx.in b/scipy/linalg/_matfuncs_expm.pyx.in index 2e2db8e96551..d4fded2d2fe0 100644 --- a/scipy/linalg/_matfuncs_expm.pyx.in +++ b/scipy/linalg/_matfuncs_expm.pyx.in @@ -333,7 +333,7 @@ cdef void pade_357_UV_calc_{{ pfx }}({{ tname }}[:, :, ::]Am, int n, int m) nogi swap_c_and_f_layout(&Am[3, 0, 0], &Am[2, 0, 0], n, n, n) {{ pfx }}getrf( &n, &n, &Am[1, 0, 0], &n, &ipiv[0], &info ) - {{ pfx }}getrs('C', &n, &n, &Am[1, 0, 0], &n, &ipiv[0], &Am[2, 0, 0], &n, &info ) + {{ pfx }}getrs('T', &n, &n, &Am[1, 0, 0], &n, &ipiv[0], &Am[2, 0, 0], &n, &info ) {{ sc }}scal(&n2, &two, &Am[2, 0, 0], &int_one) for i in range(n): Am[2, i, i] = Am[2, i, i] + 1. @@ -397,7 +397,7 @@ cdef void pade_9_UV_calc_{{ pfx }}({{ tname }}[:, :, ::]Am, int n) nogil: swap_c_and_f_layout(&Am[3, 0, 0], &Am[2, 0, 0], n, n, n) {{ pfx }}getrf( &n, &n, &Am[1, 0, 0], &n, &ipiv[0], &info ) - {{ pfx }}getrs('C', &n, &n, &Am[1, 0, 0], &n, &ipiv[0], &Am[2, 0, 0], &n, &info) + {{ pfx }}getrs('T', &n, &n, &Am[1, 0, 0], &n, &ipiv[0], &Am[2, 0, 0], &n, &info) {{ sc }}scal(&n2, &two, &Am[2, 0, 0], &int_one) for i in range(n): Am[2, i, i] = Am[2, i, i] + 1. @@ -483,7 +483,7 @@ cdef void pade_13_UV_calc_{{ pfx }}({{ tname }}[:, :, ::1]Am, int n) nogil: swap_c_and_f_layout(work2, work4, n, n, n) {{ pfx }}getrf( &n, &n, &work3[0], &n, &ipiv[0], &info ) - {{ pfx }}getrs('C', &n, &n, &work3[0], &n, &ipiv[0], &work4[0], &n, &info ) + {{ pfx }}getrs('T', &n, &n, &work3[0], &n, &ipiv[0], &work4[0], &n, &info ) {{ sc }}scal(&n2, &two, &work4[0], &int1) for i in range(n): work4[i*(n+1)] += 1. From a0cd10b96e49f0123b367687f826cac267ad128d Mon Sep 17 00:00:00 2001 From: Ilhan Polat Date: Sun, 21 Nov 2021 23:42:00 +0100 Subject: [PATCH 05/22] ENH:linalg: expm public API function --- scipy/linalg/_matfuncs.py | 180 +++++++++++++++++++++++++++++++++++--- 1 file changed, 166 insertions(+), 14 deletions(-) diff --git a/scipy/linalg/_matfuncs.py b/scipy/linalg/_matfuncs.py index 27c0a96b450c..93d038a8649f 100644 --- a/scipy/linalg/_matfuncs.py +++ b/scipy/linalg/_matfuncs.py @@ -9,7 +9,10 @@ from numpy import (Inf, dot, diag, prod, logical_not, ravel, transpose, conjugate, absolute, amax, sign, isfinite, single) +from numpy.lib.scimath import sqrt as csqrt import numpy as np +from itertools import product +from scipy.linalg import LinAlgError, bandwidth # Local imports from ._misc import norm @@ -19,7 +22,7 @@ from ._decomp_schur import schur, rsf2csf from ._expm_frechet import expm_frechet, expm_cond from ._matfuncs_sqrtm import sqrtm - +from ._matfuncs_expm import pick_pade_structure, pade_UV_calc eps = np.finfo(float).eps feps = np.finfo(single).eps @@ -208,25 +211,42 @@ def logm(A, disp=True): def expm(A): - """ - Compute the matrix exponential using Pade approximation. + """Compute the matrix exponential of an array. Parameters ---------- - A : (N, N) array_like or sparse matrix - Matrix to be exponentiated. + A : ndarray + Input with last two dimensions are square ``(..., n, n)``. Returns ------- - expm : (N, N) ndarray - Matrix exponential of `A`. + eA : ndarray + The resulting matrix exponential with the same shape of A + + Notes + ----- + Implements the algorithm given in [1], which is essentially a careful Pade + approximation and thus higher powers (up to eight) of the input are + computed. This has to be taken into account as it is not difficult to have + exponential growth leading to floating point overflows. For input with size + ``n``, the memory usage is in the worst case in the order of ``8*(n**2)``. + If the input data is not of single and double precision of real and complex + dtypes, it is copied to a new array. Same applies for noncontiguous inputs. + + For cases ``n >= 400``, the exact 1-norm computation cost, breaks even with + 1-norm estimation and from that point on the estimation scheme given in + [2] is used. References ---------- - .. [1] Awad H. Al-Mohy and Nicholas J. Higham (2009) - "A New Scaling and Squaring Algorithm for the Matrix Exponential." - SIAM Journal on Matrix Analysis and Applications. - 31 (3). pp. 970-989. ISSN 1095-7162 + .. [1] Awad H. Al-Mohy and Nicholas J. Higham, (2009), "A New Scaling + and Squaring Algorithm for the Matrix Exponential", SIAM J. Matrix + Anal. Appl. 31(3):970-989, :doi:`10.1137/09074721X` + + .. [2] Nicholas J. Higham and Francoise Tisseur (2000), "A Block Algorithm + for Matrix 1-Norm Estimation, with an Application to 1-Norm + Pseudospectra." SIAM J. Matrix Anal. Appl. 21(4):1185-1201, + :doi:`10.1137/S0895479899356080` Examples -------- @@ -250,9 +270,141 @@ def expm(A): [ 1.06860742+0.48905626j, -1.71075555+0.91406299j]]) """ - # Input checking and conversion is provided by sparse.linalg.expm(). - import scipy.sparse.linalg - return scipy.sparse.linalg.expm(A) + a = np.asarray(A) + if a.size == 1 and a.ndim < 2: + return np.array([[np.exp(a.ravel()[0])]]) + + if a.ndim < 2: + raise LinAlgError('The input array must be at least two-dimensional') + if a.shape[-1] != a.shape[-2]: + raise LinAlgError('Last 2 dimensions of the array must be square') + n = a.shape[-1] + # Empty array + if min(*a.shape) == 0: + return np.empty_like(a) + + # Scalar case + if a.shape[-2:] == (1, 1): + return np.exp(a) + + if not np.issubdtype(a.dtype, np.inexact): + a = a.astype(float) + elif a.dtype == np.float16: + a = a.astype(np.float32) + + # Explicit formula for 2x2 case, formula (2.2) in [1] + # without Kahan's method numerical instabilities can occur. + if a.shape[-2:] == (2, 2): + if a.ndim == 2: + + p, r, s, t = a[0, 0], a[0, 1], a[1, 0], a[1, 1] + mu = csqrt((p-t)**2 + 4*r*s)/2 + coshMu, sinchMu = np.cosh(mu), np.sinh(mu)/mu if mu != 0. else 1. + eA = np.empty([2, 2], dtype=mu.dtype) + eA[0, 0] = coshMu + 0.5*(p-t)*sinchMu + eA[0, 1] = r*sinchMu + eA[1, 0] = s*sinchMu + eA[1, 1] = coshMu - 0.5*(p-t)*sinchMu + eA *= np.exp((p + t)/2.) + if np.isrealobj(a): + return eA.real + return eA + + else: + a1, a2, a3, a4 = (a[..., 0, 0], + a[..., 0, 1], + a[..., 1, 0], + a[..., 1, 1]) + mu = (a[..., 0, 0]-a[..., 1, 1])**2 + 4*a[..., 0, 1]*a[..., 1, 0] + mu = csqrt(mu)/2. + eApD2 = np.exp((a[..., 0, 0]+a[..., 1, 1])/2.) + AmD2 = (a1 - a4)/2. + coshMu = np.cosh(mu), + sinchMu = _sinch(mu) + eA = np.empty((a.shape), dtype=mu.dtype) + eA[..., 0, 0] = eApD2 * (coshMu + AmD2*sinchMu) + eA[..., 0, 1] = eApD2 * a2 * sinchMu + eA[..., 1, 0] = eApD2 * a3 * sinchMu + eA[..., 1, 1] = eApD2 * (coshMu - AmD2*sinchMu) + if np.isrealobj(a): + return eA.real + return eA + + # Generic case with unspecified stacked dimensions. + # array to hold the result + n = a.shape[-1] + eA = np.empty(a.shape, dtype=a.dtype) + # working memory to hold intermediate arrays + Am = np.empty((5, n, n), dtype=a.dtype) + + for ind in product(*[range(x) for x in a.shape[:-2]]): + aw = a[ind] + + lu = bandwidth(aw) + if not any(lu): # a is diagonal? + eA[ind] = np.diag(np.exp(np.diag(aw))) + continue + + # # Generic/triangular case; copy the slice into scratch and send + Am[0, :, :] = aw + m, s = pick_pade_structure(Am) + + if s != 0: # scaling needed + Am[:4] *= [[[2**(-s)]], [[4**(-s)]], [[16**(-s)]], [[64**(-s)]]] + + pade_UV_calc(Am, n, m) + eAw = Am[0] + + if s != 0: # squaring needed + + if lu[1] == 0: # lower triangular + diag_aw = np.diag(aw) + # einsum returns a writable view + np.einsum('ii->i', eAw)[:] = np.exp(diag_aw * 2**(-s)) + for i in range(s-1, -1, -1): + eAw = eAw @ eAw + # diagonal + np.einsum('ii->i', eAw)[:] = np.exp(diag_aw * 2**(-i)) + # superdiagonal + sd = np.diag(aw, k=-1) * 2**(-i) + l1_plus_l2 = (diag_aw[:-1] + diag_aw[1:]) * 2**(-i-1) + l1_minus_l2 = (diag_aw[:-1] - diag_aw[1:]) * 2**(-i-1) + esd = sd*np.exp(l1_plus_l2)*_sinch(l1_minus_l2) + np.einsum('ii->i', eAw[1:, :-1])[:] = esd + + elif lu[0] == 0: # upper triangular + diag_aw = np.diag(aw) + # einsum returns a writable view + np.einsum('ii->i', eAw)[:] = np.exp(diag_aw * 2**(-s)) + for i in range(s-1, -1, -1): + eAw = eAw @ eAw + # diagonal + np.einsum('ii->i', eAw)[:] = np.exp(diag_aw * 2**(-i)) + # superdiagonal + sd = np.diag(aw, k=1) * 2**(-i) + l1_plus_l2 = (diag_aw[:-1] + diag_aw[1:]) * 2**(-i-1) + l1_minus_l2 = (diag_aw[:-1] - diag_aw[1:]) * 2**(-i-1) + esd = sd*np.exp(l1_plus_l2)*_sinch(l1_minus_l2) + np.einsum('ii->i', eAw[:-1, 1:])[:] = esd + + else: # generic + for _ in range(s): + eAw = eAw @ eAw + + # Zero out the entries from np.empty in case of triangular input + if (lu[0] == 0) or (lu[1] == 0): + eA[ind] = np.triu(eAw) if lu[0] == 0 else np.tril(eAw) + else: + eA[ind] = eAw + + return eA + + +def _sinch(x): + m = x == 0. + x[m] = 1. + x[~m] = np.sinh(x[~m])/x[~m] + return x def cosm(A): From d40fa4f80b8578a9e9b754e9012de5e21974cb57 Mon Sep 17 00:00:00 2001 From: Ilhan Polat Date: Sun, 21 Nov 2021 23:56:20 +0100 Subject: [PATCH 06/22] TST:linalg: Tweaks to tests for the new expm() --- scipy/linalg/tests/test_matfuncs.py | 33 ++++++----------------------- 1 file changed, 7 insertions(+), 26 deletions(-) diff --git a/scipy/linalg/tests/test_matfuncs.py b/scipy/linalg/tests/test_matfuncs.py index 9ff68beae506..97e52886ed51 100644 --- a/scipy/linalg/tests/test_matfuncs.py +++ b/scipy/linalg/tests/test_matfuncs.py @@ -9,10 +9,8 @@ import numpy as np from numpy import array, identity, dot, sqrt -from numpy.testing import ( - assert_array_equal, assert_array_less, assert_equal, - assert_array_almost_equal, - assert_allclose, assert_, assert_warns) +from numpy.testing import (assert_array_almost_equal, assert_allclose, assert_, + assert_array_less, assert_array_equal, assert_warns) import pytest import scipy.linalg @@ -108,7 +106,7 @@ def test_al_mohy_higham_2012_experiment_1_logm(self): A = _get_al_mohy_higham_2012_experiment_1() A_logm, info = logm(A, disp=False) A_round_trip = expm(A_logm) - assert_allclose(A_round_trip, A, rtol=1e-5, atol=1e-14) + assert_allclose(A_round_trip, A, rtol=5e-5, atol=1e-14) def test_al_mohy_higham_2012_experiment_1_funm_log(self): # The raw funm with np.log does not complete the round trip. @@ -604,25 +602,8 @@ def test_zero(self): assert_array_almost_equal(expm(a),[[1,0],[0,1]]) def test_single_elt(self): - # See gh-5853 - from scipy.sparse import csc_matrix - - vOne = -2.02683397006j - vTwo = -2.12817566856j - - mOne = csc_matrix([[vOne]], dtype='complex') - mTwo = csc_matrix([[vTwo]], dtype='complex') - - outOne = expm(mOne) - outTwo = expm(mTwo) - - assert_equal(type(outOne), type(mOne)) - assert_equal(type(outTwo), type(mTwo)) - - assert_allclose(outOne[0, 0], complex(-0.44039415155949196, - -0.8978045395698304)) - assert_allclose(outTwo[0, 0], complex(-0.52896401032626006, - -0.84864425749518878)) + elt = expm(1) + assert_allclose(elt, np.array([[np.e]])) def test_empty_matrix_input(self): # handle gh-11082 @@ -713,8 +694,8 @@ def test_fuzz(self): expected_expm = scipy.linalg.expm(A) expected_frechet = scipy.linalg.expm(M)[:n, n:] observed_expm, observed_frechet = expm_frechet(A, E) - assert_allclose(expected_expm, observed_expm) - assert_allclose(expected_frechet, observed_frechet) + assert_allclose(expected_expm, observed_expm, atol=5e-8) + assert_allclose(expected_frechet, observed_frechet, atol=1e-7) def test_problematic_matrix(self): # this test case uncovered a bug which has since been fixed From 4ef8e3d82925fc3bbd50ca040a9eec7ef5d413a2 Mon Sep 17 00:00:00 2001 From: Ilhan Polat Date: Mon, 22 Nov 2021 12:07:00 +0100 Subject: [PATCH 07/22] TST:sparse:linalg: Use sparse.linalg own expm in tests --- .../sparse/linalg/tests/test_expm_multiply.py | 27 ++++++++++--------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/scipy/sparse/linalg/tests/test_expm_multiply.py b/scipy/sparse/linalg/tests/test_expm_multiply.py index 0e00ad291ab6..250668c87e68 100644 --- a/scipy/sparse/linalg/tests/test_expm_multiply.py +++ b/scipy/sparse/linalg/tests/test_expm_multiply.py @@ -6,6 +6,7 @@ suppress_warnings) from scipy.sparse import SparseEfficiencyWarning import scipy.linalg +from scipy.sparse.linalg import expm as sp_expm from scipy.sparse.linalg._expm_multiply import (_theta, _compute_p_max, _onenormest_matrix_power, expm_multiply, _expm_multiply_simple, _expm_multiply_interval) @@ -63,7 +64,7 @@ def test_expm_multiply(self): A = scipy.linalg.inv(np.random.randn(n, n)) B = np.random.randn(n, k) observed = expm_multiply(A, B) - expected = np.dot(scipy.linalg.expm(A), B) + expected = np.dot(sp_expm(A), B) assert_allclose(observed, expected) def test_matrix_vector_multiply(self): @@ -74,7 +75,7 @@ def test_matrix_vector_multiply(self): A = scipy.linalg.inv(np.random.randn(n, n)) v = np.random.randn(n) observed = expm_multiply(A, v) - expected = np.dot(scipy.linalg.expm(A), v) + expected = np.dot(sp_expm(A), v) assert_allclose(observed, expected) def test_scaled_expm_multiply(self): @@ -88,7 +89,7 @@ def test_scaled_expm_multiply(self): A = scipy.linalg.inv(np.random.randn(n, n)) B = np.random.randn(n, k) observed = _expm_multiply_simple(A, B, t=t) - expected = np.dot(scipy.linalg.expm(t*A), B) + expected = np.dot(sp_expm(t*A), B) assert_allclose(observed, expected) def test_scaled_expm_multiply_single_timepoint(self): @@ -99,7 +100,7 @@ def test_scaled_expm_multiply_single_timepoint(self): A = np.random.randn(n, n) B = np.random.randn(n, k) observed = _expm_multiply_simple(A, B, t=t) - expected = scipy.linalg.expm(t*A).dot(B) + expected = sp_expm(t*A).dot(B) assert_allclose(observed, expected) def test_sparse_expm_multiply(self): @@ -115,8 +116,9 @@ def test_sparse_expm_multiply(self): sup.filter(SparseEfficiencyWarning, "splu requires CSC matrix format") sup.filter(SparseEfficiencyWarning, - "spsolve is more efficient when sparse b is in the CSC matrix format") - expected = scipy.linalg.expm(A).dot(B) + "spsolve is more efficient when sparse b is in the" + " CSC matrix format") + expected = sp_expm(A).dot(B) assert_allclose(observed, expected) def test_complex(self): @@ -155,8 +157,7 @@ def test_sparse_expm_multiply_interval(self): sup.filter(SparseEfficiencyWarning, "spsolve is more efficient when sparse b is in the CSC matrix format") for solution, t in zip(X, samples): - assert_allclose(solution, - scipy.linalg.expm(t*A).dot(target)) + assert_allclose(solution, sp_expm(t*A).dot(target)) def test_expm_multiply_interval_vector(self): np.random.seed(1234) @@ -172,7 +173,7 @@ def test_expm_multiply_interval_vector(self): samples = np.linspace(start=start, stop=stop, num=num, endpoint=endpoint) for solution, t in zip(X, samples): - assert_allclose(solution, scipy.linalg.expm(t*A).dot(v)) + assert_allclose(solution, sp_expm(t*A).dot(v)) def test_expm_multiply_interval_matrix(self): np.random.seed(1234) @@ -189,7 +190,7 @@ def test_expm_multiply_interval_matrix(self): samples = np.linspace(start=start, stop=stop, num=num, endpoint=endpoint) for solution, t in zip(X, samples): - assert_allclose(solution, scipy.linalg.expm(t*A).dot(B)) + assert_allclose(solution, sp_expm(t*A).dot(B)) def test_sparse_expm_multiply_interval_dtypes(self): # Test A & B int @@ -197,13 +198,13 @@ def test_sparse_expm_multiply_interval_dtypes(self): B = np.ones(5, dtype=int) Aexpm = scipy.sparse.diags(np.exp(np.arange(5)),format='csr') assert_allclose(expm_multiply(A,B,0,1)[-1], Aexpm.dot(B)) - + # Test A complex, B int A = scipy.sparse.diags(-1j*np.arange(5),format='csr', dtype=complex) B = np.ones(5, dtype=int) Aexpm = scipy.sparse.diags(np.exp(-1j*np.arange(5)),format='csr') assert_allclose(expm_multiply(A,B,0,1)[-1], Aexpm.dot(B)) - + # Test A int, B complex A = scipy.sparse.diags(np.arange(5),format='csr', dtype=int) B = np.full(5, 1j, dtype=complex) @@ -243,7 +244,7 @@ def _help_test_specific_expm_interval_status(self, target_status): samples = np.linspace(start=start, stop=stop, num=num, endpoint=endpoint) for solution, t in zip(X, samples): - assert_allclose(solution, scipy.linalg.expm(t*A).dot(B)) + assert_allclose(solution, sp_expm(t*A).dot(B)) nsuccesses += 1 if not nsuccesses: msg = 'failed to find a status-' + str(target_status) + ' interval' From e91e88ca9aa275aae2259db7a24692a4b9c1a80c Mon Sep 17 00:00:00 2001 From: Ilhan Polat Date: Tue, 23 Nov 2021 02:33:26 +0100 Subject: [PATCH 08/22] MAINT:linalg: expm() code branching cleanup --- scipy/linalg/_matfuncs.py | 88 ++++++++++++++------------------------- 1 file changed, 31 insertions(+), 57 deletions(-) diff --git a/scipy/linalg/_matfuncs.py b/scipy/linalg/_matfuncs.py index 93d038a8649f..750b5f413b04 100644 --- a/scipy/linalg/_matfuncs.py +++ b/scipy/linalg/_matfuncs.py @@ -272,7 +272,7 @@ def expm(A): """ a = np.asarray(A) if a.size == 1 and a.ndim < 2: - return np.array([[np.exp(a.ravel()[0])]]) + return np.array([[np.exp(a.item())]]) if a.ndim < 2: raise LinAlgError('The input array must be at least two-dimensional') @@ -295,48 +295,31 @@ def expm(A): # Explicit formula for 2x2 case, formula (2.2) in [1] # without Kahan's method numerical instabilities can occur. if a.shape[-2:] == (2, 2): - if a.ndim == 2: - - p, r, s, t = a[0, 0], a[0, 1], a[1, 0], a[1, 1] - mu = csqrt((p-t)**2 + 4*r*s)/2 - coshMu, sinchMu = np.cosh(mu), np.sinh(mu)/mu if mu != 0. else 1. - eA = np.empty([2, 2], dtype=mu.dtype) - eA[0, 0] = coshMu + 0.5*(p-t)*sinchMu - eA[0, 1] = r*sinchMu - eA[1, 0] = s*sinchMu - eA[1, 1] = coshMu - 0.5*(p-t)*sinchMu - eA *= np.exp((p + t)/2.) - if np.isrealobj(a): - return eA.real - return eA - - else: - a1, a2, a3, a4 = (a[..., 0, 0], - a[..., 0, 1], - a[..., 1, 0], - a[..., 1, 1]) - mu = (a[..., 0, 0]-a[..., 1, 1])**2 + 4*a[..., 0, 1]*a[..., 1, 0] - mu = csqrt(mu)/2. - eApD2 = np.exp((a[..., 0, 0]+a[..., 1, 1])/2.) - AmD2 = (a1 - a4)/2. - coshMu = np.cosh(mu), - sinchMu = _sinch(mu) - eA = np.empty((a.shape), dtype=mu.dtype) - eA[..., 0, 0] = eApD2 * (coshMu + AmD2*sinchMu) - eA[..., 0, 1] = eApD2 * a2 * sinchMu - eA[..., 1, 0] = eApD2 * a3 * sinchMu - eA[..., 1, 1] = eApD2 * (coshMu - AmD2*sinchMu) - if np.isrealobj(a): - return eA.real - return eA - - # Generic case with unspecified stacked dimensions. - # array to hold the result + a1, a2, a3, a4 = (a[..., 0, 0], + a[..., 0, 1], + a[..., 1, 0], + a[..., 1, 1]) + mu = csqrt((a1-a4)**2 + 4*a2*a3)/2. + eApD2 = np.exp((a1+a4)/2.) + AmD2 = (a1 - a4)/2. + coshMu = np.cosh(mu) + sinchMu = _sinch(mu) + eA = np.empty((a.shape), dtype=mu.dtype) + eA[..., 0, 0] = eApD2 * (coshMu + AmD2*sinchMu) + eA[..., 0, 1] = eApD2 * a2 * sinchMu + eA[..., 1, 0] = eApD2 * a3 * sinchMu + eA[..., 1, 1] = eApD2 * (coshMu - AmD2*sinchMu) + if np.isrealobj(a): + return eA.real + return eA + + # larger problem with unspecified stacked dimensions. n = a.shape[-1] eA = np.empty(a.shape, dtype=a.dtype) # working memory to hold intermediate arrays Am = np.empty((5, n, n), dtype=a.dtype) + # Main loop to go through the slices of an ndarray and passing to expm for ind in product(*[range(x) for x in a.shape[:-2]]): aw = a[ind] @@ -345,7 +328,8 @@ def expm(A): eA[ind] = np.diag(np.exp(np.diag(aw))) continue - # # Generic/triangular case; copy the slice into scratch and send + # Generic/triangular case; copy the slice into scratch and send. + # Am will be mutated by pick_pade_structure Am[0, :, :] = aw m, s = pick_pade_structure(Am) @@ -357,35 +341,25 @@ def expm(A): if s != 0: # squaring needed - if lu[1] == 0: # lower triangular + if (lu[1] == 0) or (lu[0] == 0): # lower/upper triangular diag_aw = np.diag(aw) # einsum returns a writable view np.einsum('ii->i', eAw)[:] = np.exp(diag_aw * 2**(-s)) for i in range(s-1, -1, -1): eAw = eAw @ eAw - # diagonal - np.einsum('ii->i', eAw)[:] = np.exp(diag_aw * 2**(-i)) - # superdiagonal - sd = np.diag(aw, k=-1) * 2**(-i) - l1_plus_l2 = (diag_aw[:-1] + diag_aw[1:]) * 2**(-i-1) - l1_minus_l2 = (diag_aw[:-1] - diag_aw[1:]) * 2**(-i-1) - esd = sd*np.exp(l1_plus_l2)*_sinch(l1_minus_l2) - np.einsum('ii->i', eAw[1:, :-1])[:] = esd - elif lu[0] == 0: # upper triangular - diag_aw = np.diag(aw) - # einsum returns a writable view - np.einsum('ii->i', eAw)[:] = np.exp(diag_aw * 2**(-s)) - for i in range(s-1, -1, -1): - eAw = eAw @ eAw # diagonal np.einsum('ii->i', eAw)[:] = np.exp(diag_aw * 2**(-i)) - # superdiagonal - sd = np.diag(aw, k=1) * 2**(-i) + # super/sub diagonal + sd = np.diag(aw, k=-1 if lu[1] == 0 else 1) * 2**(-i) + l1_plus_l2 = (diag_aw[:-1] + diag_aw[1:]) * 2**(-i-1) l1_minus_l2 = (diag_aw[:-1] - diag_aw[1:]) * 2**(-i-1) esd = sd*np.exp(l1_plus_l2)*_sinch(l1_minus_l2) - np.einsum('ii->i', eAw[:-1, 1:])[:] = esd + if lu[1] == 0: # lower + np.einsum('ii->i', eAw[1:, :-1])[:] = esd + else: # upper + np.einsum('ii->i', eAw[:-1, 1:])[:] = esd else: # generic for _ in range(s): From 74cdf624f83cede973b31500863c84077abfdb67 Mon Sep 17 00:00:00 2001 From: Ilhan Polat Date: Tue, 23 Nov 2021 11:51:42 +0100 Subject: [PATCH 09/22] MAINT:linalg:Adjust array slicing for 2x2 arrays --- scipy/linalg/_matfuncs.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/scipy/linalg/_matfuncs.py b/scipy/linalg/_matfuncs.py index 750b5f413b04..96ca25957753 100644 --- a/scipy/linalg/_matfuncs.py +++ b/scipy/linalg/_matfuncs.py @@ -295,20 +295,20 @@ def expm(A): # Explicit formula for 2x2 case, formula (2.2) in [1] # without Kahan's method numerical instabilities can occur. if a.shape[-2:] == (2, 2): - a1, a2, a3, a4 = (a[..., 0, 0], - a[..., 0, 1], - a[..., 1, 0], - a[..., 1, 1]) + a1, a2, a3, a4 = (a[..., [0], [0]], + a[..., [0], [1]], + a[..., [1], [0]], + a[..., [1], [1]]) mu = csqrt((a1-a4)**2 + 4*a2*a3)/2. eApD2 = np.exp((a1+a4)/2.) AmD2 = (a1 - a4)/2. coshMu = np.cosh(mu) sinchMu = _sinch(mu) eA = np.empty((a.shape), dtype=mu.dtype) - eA[..., 0, 0] = eApD2 * (coshMu + AmD2*sinchMu) - eA[..., 0, 1] = eApD2 * a2 * sinchMu - eA[..., 1, 0] = eApD2 * a3 * sinchMu - eA[..., 1, 1] = eApD2 * (coshMu - AmD2*sinchMu) + eA[..., [0], [0]] = eApD2 * (coshMu + AmD2*sinchMu) + eA[..., [0], [1]] = eApD2 * a2 * sinchMu + eA[..., [1], [0]] = eApD2 * a3 * sinchMu + eA[..., [1], [1]] = eApD2 * (coshMu - AmD2*sinchMu) if np.isrealobj(a): return eA.real return eA From 1e4793c8d4e6b9639e6018bf7a2d958ea7d12d13 Mon Sep 17 00:00:00 2001 From: Ilhan Polat Date: Tue, 23 Nov 2021 03:31:54 +0100 Subject: [PATCH 10/22] MAINT:linalg: modify _sinch function for scalars --- scipy/linalg/_matfuncs.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scipy/linalg/_matfuncs.py b/scipy/linalg/_matfuncs.py index 96ca25957753..feeb1e5538ce 100644 --- a/scipy/linalg/_matfuncs.py +++ b/scipy/linalg/_matfuncs.py @@ -375,6 +375,8 @@ def expm(A): def _sinch(x): + if np.isscalar(x): + return 1. if x == 0. else np.sinh(x)/x m = x == 0. x[m] = 1. x[~m] = np.sinh(x[~m])/x[~m] From 1bf2177acc502d9b9405abe542d504ea2948bc05 Mon Sep 17 00:00:00 2001 From: Ilhan Polat Date: Wed, 24 Nov 2021 02:26:13 +0100 Subject: [PATCH 11/22] MAINT:linalg: fix _norm1est call argument type --- scipy/linalg/_matfuncs.py | 55 +++++++------ scipy/linalg/_matfuncs_expm.pyx.in | 120 ++++++++++------------------- 2 files changed, 73 insertions(+), 102 deletions(-) diff --git a/scipy/linalg/_matfuncs.py b/scipy/linalg/_matfuncs.py index feeb1e5538ce..06fb448fc6b7 100644 --- a/scipy/linalg/_matfuncs.py +++ b/scipy/linalg/_matfuncs.py @@ -1,20 +1,15 @@ # # Author: Travis Oliphant, March 2002 # +from itertools import product -__all__ = ['expm','cosm','sinm','tanm','coshm','sinhm', - 'tanhm','logm','funm','signm','sqrtm', - 'expm_frechet', 'expm_cond', 'fractional_matrix_power', - 'khatri_rao'] - -from numpy import (Inf, dot, diag, prod, logical_not, ravel, - transpose, conjugate, absolute, amax, sign, isfinite, single) -from numpy.lib.scimath import sqrt as csqrt import numpy as np -from itertools import product -from scipy.linalg import LinAlgError, bandwidth +from numpy import (Inf, dot, diag, prod, logical_not, ravel, transpose, + conjugate, absolute, amax, sign, isfinite) +from numpy.lib.scimath import sqrt as csqrt # Local imports +from scipy.linalg import LinAlgError, bandwidth from ._misc import norm from ._basic import solve, inv from ._special_matrices import triu @@ -23,8 +18,14 @@ from ._expm_frechet import expm_frechet, expm_cond from ._matfuncs_sqrtm import sqrtm from ._matfuncs_expm import pick_pade_structure, pade_UV_calc -eps = np.finfo(float).eps -feps = np.finfo(single).eps + +__all__ = ['expm','cosm','sinm','tanm','coshm','sinhm', + 'tanhm','logm','funm','signm','sqrtm', + 'expm_frechet', 'expm_cond', 'fractional_matrix_power', + 'khatri_rao'] + +eps = np.finfo('d').eps +feps = np.finfo('f').eps _array_precision = {'i': 1, 'l': 1, 'f': 0, 'd': 1, 'F': 0, 'D': 1} @@ -221,21 +222,21 @@ def expm(A): Returns ------- eA : ndarray - The resulting matrix exponential with the same shape of A + The resulting matrix exponential with the same shape of ``A`` Notes ----- - Implements the algorithm given in [1], which is essentially a careful Pade - approximation and thus higher powers (up to eight) of the input are - computed. This has to be taken into account as it is not difficult to have - exponential growth leading to floating point overflows. For input with size - ``n``, the memory usage is in the worst case in the order of ``8*(n**2)``. - If the input data is not of single and double precision of real and complex - dtypes, it is copied to a new array. Same applies for noncontiguous inputs. + Implements the algorithm given in [1], which is essentially a Pade + approximation with a variable order that is decided based on the array + data. + + For input with size ``n``, the memory usage is in the worst case in the + order of ``8*(n**2)``. If the input data is not of single and double + precision of real and complex dtypes, it is copied to a new array. For cases ``n >= 400``, the exact 1-norm computation cost, breaks even with 1-norm estimation and from that point on the estimation scheme given in - [2] is used. + [2] is used to decide on the approximation order. References ---------- @@ -254,9 +255,15 @@ def expm(A): Matrix version of the formula exp(0) = 1: - >>> expm(np.zeros((2,2))) - array([[ 1., 0.], - [ 0., 1.]]) + >>> expm(np.zeros((3, 2, 2))) + array([[[1., 0.], + [0., 1.]], + + [[1., 0.], + [0., 1.]], + + [[1., 0.], + [0., 1.]]]) Euler's identity (exp(i*theta) = cos(theta) + i*sin(theta)) applied to a matrix: diff --git a/scipy/linalg/_matfuncs_expm.pyx.in b/scipy/linalg/_matfuncs_expm.pyx.in index d4fded2d2fe0..047d5c4a9ecc 100644 --- a/scipy/linalg/_matfuncs_expm.pyx.in +++ b/scipy/linalg/_matfuncs_expm.pyx.in @@ -1,9 +1,9 @@ # cython: language_level=3 cimport cython +import math +import random cimport numpy as cnp import numpy as np -import random -import math from libc.stdlib cimport malloc, free from libc.math cimport fabs, ceil, log2, pow from scipy.linalg.cython_lapack cimport (sgetrf, sgetrs, dgetrf, dgetrs, @@ -200,7 +200,7 @@ cdef pick_pade_structure_{{ pfx }}({{ tname }}[:, :, ::1] Am): {{ pfx }}gemm(cN, cN, &intn, &intn, &intn, &one, &Amv[2, 0, 0], &intn, &Amv[2, 0, 0], &intn, &zero, &Amv[4, 0, 0], &intn) d8 = norm1(Amv[4, :, :]) ** (1./8.) else: - d8 = _norm1est(Am[0], m=8) ** (1./8.) + d8 = _norm1est(np.asarray(Am[0]), m=8) ** (1./8.) eta2 = max(d6, d8) temp = kth_power_norm_d(absA, work, work2, n, 4) @@ -220,7 +220,7 @@ cdef pick_pade_structure_{{ pfx }}({{ tname }}[:, :, ::1] Am): {{ pfx }}gemm(cN, cN, &intn, &intn, &intn, &one, &Amv[3, 0, 0], &intn, &Amv[2, 0, 0], &intn, &zero, &Amv[4, 0, 0], &intn) d10 = norm1(Amv[4, :, :]) ** (1./10.) else: - d10 = _norm1est(Am[0], m=10) ** (1./10.) + d10 = _norm1est(np.asarray(Am[0]), m=10) ** (1./10.) eta3 = max(d8, d10) eta4 = min(eta2, eta3) @@ -500,8 +500,23 @@ cdef void pade_13_UV_calc_{{ pfx }}({{ tname }}[:, :, ::1]Am, int n) nogil: # ============================================================================ # ====================== norm1est ============================================ + def _norm1est(A, m=1, t=2, max_iter=5): - """Compute a lower bound for the 1-norm of 2D matrix A or its powers + """Compute a lower bound for the 1-norm of 2D matrix A or its powers. + + Computing the 1-norm of 8th or 10th power of a very large array is a very + wasteful computation if we explicitly compute the actual power. The + estimation exploits (in a nutshell) the following: + + (A @ A @ ... A) @ = (A @ (A @ (... @ (A @ ))) + + And in fact all the rest is practically Ward's power method with ``t`` + starting vectors, hence, thin array and smarter selection of those vectors. + + Thus at some point ``expm`` which uses this function to scale-square, will + switch to estimating when ``np.abs(A).sum(axis=0).max()`` becomes slower + than the estimate (``linalg.norm`` is even slower). Currently the switch + is chosen to be ``n=400``. Parameters ---------- @@ -527,21 +542,21 @@ def _norm1est(A, m=1, t=2, max_iter=5): Fortran code given in [2]. The algorithm involves randomized elements and hence if needed, the seed - of the Python built-in random module can be set for reproducible results. + of the Python built-in "random" module can be set for reproducible results. References ---------- - .. [1] Nicholas J. Higham and Françoise Tisseur (2000), - "A Block Algorithm for Matrix 1-Norm Estimation, - with an Application to 1-Norm Pseudospectra." - SIAM J. Matrix Anal. Appl. Vol. 21, No. 4, pp. 1185-1201. + .. [1] Nicholas J. Higham and Francoise Tisseur (2000), "A Block Algorithm + for Matrix 1-Norm Estimation, with an Application to 1-Norm + Pseudospectra." SIAM J. Matrix Anal. Appl. 21(4):1185-1201, + :doi:`10.1137/S0895479899356080` .. [2] Sheung Hun Cheng, Nicholas J. Higham (2001), "Implementation for LAPACK of a Block Algorithm for Matrix 1-Norm Estimation", NA Report 393 """ - # Ref [1] skips parallel col test for complex inputs + # We skip parallel col test for complex inputs real_A = np.isrealobj(A) n = A.shape[0] est_old = 0 @@ -573,14 +588,27 @@ def _norm1est(A, m=1, t=2, max_iter=5): # w = Y[:, best_j] est_old = est S[:, :t] = S[:, t:] - _custom_signum(Y, S[:, t:], not real_A) + if real_A: + S[:, t:] = np.signbit(Y) + else: + S[:, t:].fill(1) + mask = Y != 0. + S[:, t:][mask] = Y[mask] / np.abs(Y[mask]) + if t > 1 and real_A: # (2) if ((S[:, t:].T @ S[:, :t]).max(axis=1) == n).all() and k > 0: break else: - # replaces S in-place - _replace_parallel_cols(S) + max_spin = math.ceil(n / t) + for col in range(t): + curr_col = t + col + n_it = 0 + while round(np.abs(S[:, col] @ S[:, :curr_col]).max()) == n: + S[:, col] = random.choices([1, -1], k=n) + n_it += 1 + if n_it > max_spin: + break # (3) Z = A.conj().T @ S @@ -609,70 +637,6 @@ def _norm1est(A, m=1, t=2, max_iter=5): return est # , v, w -def _replace_parallel_cols(S): - """Helper function for 1-norm condition number estimator. - - This function is akin to ?LARPC function given in Cheng, Higham (see Ref - section of expm, [2]). - - Parameters - ---------- - S : ndarray - Working array of the current iteration - S_old : ndarray - Working array of the previous iteration. Pass None if only S exists. - rng : Generator - The random generator used inside the body of condition estimator - - Returns - ------- - None - - """ - n, t = S.shape - max_spin = math.ceil(n / t) - - for col in range(t): - curr_col = t + col - n_it = 0 - while round(np.abs(S[:, col] @ S[:, :curr_col]).max()) == n: - S[:, col] = random.choices([1, -1], k=n) - n_it += 1 - if n_it > max_spin: - break - - return - - -def _custom_signum(M, sgnM, is_complex=False): - """Helper function for 1-norm condition number estimator. - - This function is defined to comply with ref. [1] of condest. For reals, - nonnegative inputs are mapped to 1 and to -1 otherwise. For complex inputs - sgn(x) = x / ||x|| and sgn(0 + 0j) := 1. - - Parameters - ---------- - M : ndarray - Input array - - sgnM : ndarray - Output array that will be overwritten - - Returns - ------- - sgnM : ndarray - The resulting sign(M) array. - - """ - if is_complex: - sgnM.fill(1.) - mask = M != 0. - sgnM[mask] = M[mask] / np.abs(M[mask]) - else: - np.signbit(M, out=sgnM) - - @cython.initializedcheck(False) def pade_UV_calc(lapack_t[:, :, ::1]Am, int n, int m): """Helper functions for expm to solve the final polynomial evaluation""" From 89bad715c1d0f64ccf422c427d64297d2514d58a Mon Sep 17 00:00:00 2001 From: Ilhan Polat Date: Tue, 23 Nov 2021 03:27:35 +0100 Subject: [PATCH 12/22] TST:linalg: add 2x2 array tests to expm() --- scipy/linalg/tests/test_matfuncs.py | 32 +++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/scipy/linalg/tests/test_matfuncs.py b/scipy/linalg/tests/test_matfuncs.py index 97e52886ed51..fbbbd915958a 100644 --- a/scipy/linalg/tests/test_matfuncs.py +++ b/scipy/linalg/tests/test_matfuncs.py @@ -611,6 +611,38 @@ def test_empty_matrix_input(self): result = expm(A) assert result.size == 0 + def test_2x2_input(self): + E = np.e + a = array([[1, 4], [1, 1]]) + aa = (E**4 + 1)/(2*E) + bb = (E**4 - 1)/E + assert_allclose(expm(a), array([[aa, bb], [bb/4, aa]])) + assert expm(a.astype(np.complex64)).dtype.char == 'F' + assert expm(a.astype(np.float32)).dtype.char == 'f' + + def test_nx2x2_input(self): + E = np.e + # These are integer matrices with integer eigenvalues + a = np.array([[[1, 4], [1, 1]], + [[1, 3], [1, -1]], + [[1, 3], [4, 5]], + [[1, 3], [5, 3]], + [[4, 5], [-3, -4]]], order='F') + # Exact results are computed symbolically + a_res = np.array([ + [[(E**4+1)/(2*E), (E**4-1)/E], + [(E**4-1)/4/E, (E**4+1)/(2*E)]], + [[1/(4*E**2)+(3*E**2)/4, (3*E**2)/4-3/(4*E**2)], + [E**2/4-1/(4*E**2), 3/(4*E**2)+E**2/4]], + [[3/(4*E)+E**7/4, -3/(8*E)+(3*E**7)/8], + [-1/(2*E)+E**7/2, 1/(4*E)+(3*E**7)/4]], + [[5/(8*E**2)+(3*E**6)/8, -3/(8*E**2)+(3*E**6)/8], + [-5/(8*E**2)+(5*E**6)/8, 3/(8*E**2)+(5*E**6)/8]], + [[-3/(2*E)+(5*E)/2, -5/(2*E)+(5*E)/2], + [3/(2*E)-(3*E)/2, 5/(2*E)-(3*E)/2]] + ]) + assert_allclose(expm(a), a_res) + class TestExpmFrechet: From ba356d9ca6cc1f74c0c9871d3346f22b51add383 Mon Sep 17 00:00:00 2001 From: Ilhan Polat Date: Sat, 27 Nov 2021 13:05:45 +0100 Subject: [PATCH 13/22] DOC:add BLANKLINE to 3d example --- scipy/linalg/_matfuncs.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/scipy/linalg/_matfuncs.py b/scipy/linalg/_matfuncs.py index 06fb448fc6b7..4480e3f77fbf 100644 --- a/scipy/linalg/_matfuncs.py +++ b/scipy/linalg/_matfuncs.py @@ -19,10 +19,9 @@ from ._matfuncs_sqrtm import sqrtm from ._matfuncs_expm import pick_pade_structure, pade_UV_calc -__all__ = ['expm','cosm','sinm','tanm','coshm','sinhm', - 'tanhm','logm','funm','signm','sqrtm', - 'expm_frechet', 'expm_cond', 'fractional_matrix_power', - 'khatri_rao'] +__all__ = ['expm', 'cosm', 'sinm', 'tanm', 'coshm', 'sinhm', 'tanhm', 'logm', + 'funm', 'signm', 'sqrtm', 'fractional_matrix_power','expm_frechet', + 'expm_cond', 'khatri_rao'] eps = np.finfo('d').eps feps = np.finfo('f').eps @@ -258,10 +257,10 @@ def expm(A): >>> expm(np.zeros((3, 2, 2))) array([[[1., 0.], [0., 1.]], - + [[1., 0.], [0., 1.]], - + [[1., 0.], [0., 1.]]]) From 1b0678190415280bfd689c179525c92a0638c471 Mon Sep 17 00:00:00 2001 From: Ilhan Polat Date: Sat, 27 Nov 2021 13:06:38 +0100 Subject: [PATCH 14/22] MAINT:linalg: fixed the transposed gemm calls in expm --- scipy/linalg/_matfuncs_expm.pyx.in | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/scipy/linalg/_matfuncs_expm.pyx.in b/scipy/linalg/_matfuncs_expm.pyx.in index 047d5c4a9ecc..e8a2f0cb7f32 100644 --- a/scipy/linalg/_matfuncs_expm.pyx.in +++ b/scipy/linalg/_matfuncs_expm.pyx.in @@ -249,6 +249,10 @@ cdef pick_pade_structure_{{ pfx }}({{ tname }}[:, :, ::1] Am): # Note: MSVC does not accept "Memview[i, j, k] += 1.+0.j" as a valid operation # and results with error "C2088: '+=': illegal for struct". # OCD is unbearable but we do explicit addition for that reason. + +# Note: gemm calls for A @ B is done via dgemm(.., B, .., A, ..) because arrays +# are in C-contiguous memory layout. This also the reason why getrs has 'T' as +# the argument for TRANSA. {{for tname, pfx, prec, sc in zip(typenames, prefixes, precision, ['s', 'd', 'cs', 'zd'])}} @cython.wraparound(False) @cython.boundscheck(False) @@ -288,7 +292,7 @@ cdef void pade_357_UV_calc_{{ pfx }}({{ tname }}[:, :, ::]Am, int n, int m) nogi if m == 3: # U = Am[0] @ Am[1] + 60.*Am[0] {{ pfx }}copy(&n2, &Am[0, 0, 0], &int_one, &Am[3, 0, 0], &int_one) - {{ pfx }}gemm('N', 'N', &n, &n, &n, &one, &Am[0, 0, 0], &n, &Am[1, 0, 0], &n, &b[1], &Am[3, 0, 0], &n) + {{ pfx }}gemm('N', 'N', &n, &n, &n, &one, &Am[1, 0, 0], &n, &Am[0, 0, 0], &n, &b[1], &Am[3, 0, 0], &n) # V = 12.*Am[1] + 120*I_n {{ pfx }}scal(&n2, &b[2], &Am[1, 0, 0], &int_one) for i in range(n): @@ -301,7 +305,7 @@ cdef void pade_357_UV_calc_{{ pfx }}({{ tname }}[:, :, ::]Am, int n, int m) nogi {{ pfx }}axpy(&n2, &b[5], &Am[2, 0, 0], &int_one, &Am[4, 0, 0], &int_one) for i in range(n): Am[4, i, i] = Am[4, i, i] + b[1] - {{ pfx }}gemm('N', 'N', &n, &n, &n, &one, &Am[0, 0, 0], &n, &Am[4, 0, 0], &n, &zero, &Am[3, 0, 0], &n) + {{ pfx }}gemm('N', 'N', &n, &n, &n, &one, &Am[4, 0, 0], &n, &Am[0, 0, 0], &n, &zero, &Am[3, 0, 0], &n) # V = b[4]*Am[2] + b[2]*Am[1] + b[0]*I_n {{ pfx }}scal(&n2, &b[2], &Am[1, 0, 0], &int_one) {{ pfx }}axpy(&n2, &b[4], &Am[2, 0, 0], &int_one, &Am[1, 0, 0], &int_one) @@ -324,7 +328,7 @@ cdef void pade_357_UV_calc_{{ pfx }}({{ tname }}[:, :, ::]Am, int n, int m) nogi for i in range(n): Am[1, i, i] = Am[1, i, i] + b[0] # Now we can scratch A[2] or A[3] - {{ pfx }}gemm('N', 'N', &n, &n, &n, &one, &Am[0, 0, 0], &n, &Am[4, 0, 0], &n, &zero, &Am[3, 0, 0], &n) + {{ pfx }}gemm('N', 'N', &n, &n, &n, &one, &Am[4, 0, 0], &n, &Am[0, 0, 0], &n, &zero, &Am[3, 0, 0], &n) # inv(V-U) (V+U) = inv(V-U) (V-U+2V) = I + 2 inv(V-U) U {{ pfx }}axpy(&n2, &neg_one, &Am[3, 0, 0], &int_one, &Am[1, 0, 0], &int_one) @@ -388,7 +392,7 @@ cdef void pade_9_UV_calc_{{ pfx }}({{ tname }}[:, :, ::]Am, int n) nogil: for i in range(n): Am[1, i, i] = Am[1, i, i] + b[0] # Now we can scratch A[2] or A[3] - {{ pfx }}gemm('N', 'N', &n, &n, &n, &one, &Am[0, 0, 0], &n, &work[0], &n, &zero, &Am[3, 0, 0], &n) + {{ pfx }}gemm('N', 'N', &n, &n, &n, &one, &work[0], &n, &Am[0, 0, 0], &n, &zero, &Am[3, 0, 0], &n) # inv(V-U) (V+U) = inv(V-U) (V-U+2V) = I + 2 inv(V-U) U {{ pfx }}axpy(&n2, &neg_one, &Am[3, 0, 0], &int_one, &Am[1, 0, 0], &int_one) From 196f522e33899d96fda12f30428cb6985058ec7d Mon Sep 17 00:00:00 2001 From: Ilhan Polat Date: Wed, 1 Dec 2021 19:00:17 +0100 Subject: [PATCH 15/22] MAINT:linalg: replaced sinch function for expm --- scipy/linalg/_matfuncs.py | 40 +++++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/scipy/linalg/_matfuncs.py b/scipy/linalg/_matfuncs.py index 4480e3f77fbf..8195f3f8882f 100644 --- a/scipy/linalg/_matfuncs.py +++ b/scipy/linalg/_matfuncs.py @@ -305,11 +305,14 @@ def expm(A): a[..., [0], [1]], a[..., [1], [0]], a[..., [1], [1]]) - mu = csqrt((a1-a4)**2 + 4*a2*a3)/2. + mu = csqrt((a1-a4)**2 + 4*a2*a3)/2. # csqrt slow but handles neg.vals + eApD2 = np.exp((a1+a4)/2.) AmD2 = (a1 - a4)/2. coshMu = np.cosh(mu) - sinchMu = _sinch(mu) + sinchMu = np.ones_like(coshMu) + mask = mu != 0 + sinchMu[mask] = np.sinh(mu[mask]) / mu[mask] eA = np.empty((a.shape), dtype=mu.dtype) eA[..., [0], [0]] = eApD2 * (coshMu + AmD2*sinchMu) eA[..., [0], [1]] = eApD2 * a2 * sinchMu @@ -348,24 +351,24 @@ def expm(A): if s != 0: # squaring needed if (lu[1] == 0) or (lu[0] == 0): # lower/upper triangular + # This branch implements Code Fragment 2.1 of [1] + diag_aw = np.diag(aw) # einsum returns a writable view np.einsum('ii->i', eAw)[:] = np.exp(diag_aw * 2**(-s)) + # super/sub diagonal + sd = np.diag(aw, k=-1 if lu[1] == 0 else 1) + for i in range(s-1, -1, -1): eAw = eAw @ eAw # diagonal - np.einsum('ii->i', eAw)[:] = np.exp(diag_aw * 2**(-i)) - # super/sub diagonal - sd = np.diag(aw, k=-1 if lu[1] == 0 else 1) * 2**(-i) - - l1_plus_l2 = (diag_aw[:-1] + diag_aw[1:]) * 2**(-i-1) - l1_minus_l2 = (diag_aw[:-1] - diag_aw[1:]) * 2**(-i-1) - esd = sd*np.exp(l1_plus_l2)*_sinch(l1_minus_l2) + np.einsum('ii->i', eAw)[:] = np.exp(diag_aw * 2.**(-i)) + exp_sd = _exp_sinch(diag_aw * (2.**(-i))) * (sd * 2**(-i)) if lu[1] == 0: # lower - np.einsum('ii->i', eAw[1:, :-1])[:] = esd + np.einsum('ii->i', eAw[1:, :-1])[:] = exp_sd else: # upper - np.einsum('ii->i', eAw[:-1, 1:])[:] = esd + np.einsum('ii->i', eAw[:-1, 1:])[:] = exp_sd else: # generic for _ in range(s): @@ -380,13 +383,14 @@ def expm(A): return eA -def _sinch(x): - if np.isscalar(x): - return 1. if x == 0. else np.sinh(x)/x - m = x == 0. - x[m] = 1. - x[~m] = np.sinh(x[~m])/x[~m] - return x +def _exp_sinch(x): + # Higham's formula (10.42), might overflow, see GH-11839 + lexp_diff = np.diff(np.exp(x)) + l_diff = np.diff(x) + mask_z = l_diff == 0. + lexp_diff[~mask_z] /= l_diff[~mask_z] + lexp_diff[mask_z] = np.exp(x[:-1][mask_z]) + return lexp_diff def cosm(A): From f26a9d00f66266ac119af410b7888c8653cd5894 Mon Sep 17 00:00:00 2001 From: Ilhan Polat Date: Thu, 2 Dec 2021 09:44:00 +0100 Subject: [PATCH 16/22] PEP8:linalg: correct spacing _matfuncs.py --- scipy/linalg/_matfuncs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scipy/linalg/_matfuncs.py b/scipy/linalg/_matfuncs.py index 8195f3f8882f..0f650719b43e 100644 --- a/scipy/linalg/_matfuncs.py +++ b/scipy/linalg/_matfuncs.py @@ -20,7 +20,7 @@ from ._matfuncs_expm import pick_pade_structure, pade_UV_calc __all__ = ['expm', 'cosm', 'sinm', 'tanm', 'coshm', 'sinhm', 'tanhm', 'logm', - 'funm', 'signm', 'sqrtm', 'fractional_matrix_power','expm_frechet', + 'funm', 'signm', 'sqrtm', 'fractional_matrix_power', 'expm_frechet', 'expm_cond', 'khatri_rao'] eps = np.finfo('d').eps From 3d4f5dc843e04e047e2c6f8a7110f9dd945f2a00 Mon Sep 17 00:00:00 2001 From: Ilhan Polat Date: Thu, 2 Dec 2021 09:45:51 +0100 Subject: [PATCH 17/22] MAINT:linalg: Fix access in pade structure --- scipy/linalg/_matfuncs_expm.pyx.in | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/scipy/linalg/_matfuncs_expm.pyx.in b/scipy/linalg/_matfuncs_expm.pyx.in index e8a2f0cb7f32..8c218a1b30a1 100644 --- a/scipy/linalg/_matfuncs_expm.pyx.in +++ b/scipy/linalg/_matfuncs_expm.pyx.in @@ -196,7 +196,7 @@ cdef pick_pade_structure_{{ pfx }}({{ tname }}[:, :, ::1] Am): return 5, s # m = 7 - if n < 360: + if n < 400: {{ pfx }}gemm(cN, cN, &intn, &intn, &intn, &one, &Amv[2, 0, 0], &intn, &Amv[2, 0, 0], &intn, &zero, &Amv[4, 0, 0], &intn) d8 = norm1(Amv[4, :, :]) ** (1./8.) else: @@ -216,7 +216,7 @@ cdef pick_pade_structure_{{ pfx }}({{ tname }}[:, :, ::1] Am): # m = 13 # Scale-square - if n < 360: + if n < 400: {{ pfx }}gemm(cN, cN, &intn, &intn, &intn, &one, &Amv[3, 0, 0], &intn, &Amv[2, 0, 0], &intn, &zero, &Amv[4, 0, 0], &intn) d10 = norm1(Amv[4, :, :]) ** (1./10.) else: @@ -264,7 +264,7 @@ cdef void pade_357_UV_calc_{{ pfx }}({{ tname }}[:, :, ::]Am, int n, int m) nogi cdef {{ prec }} two=2.0 cdef {{ tname }} one = 1.0, zero = 0.0, neg_one = -1.0 if not ipiv: - raise MemoryError('Internal function "pade_357_UV_calc_c" failed to allocate memory.') + raise MemoryError('Internal function "pade_357_UV_calc" failed to allocate memory.') try: # b[m] is always 1. hence skipped if m == 3: @@ -286,7 +286,7 @@ cdef void pade_357_UV_calc_{{ pfx }}({{ tname }}[:, :, ::]Am, int n, int m) nogi b[5] = 1512. b[6] = 56. else: - raise ValueError(f'Internal function "pade_357_UV_calc_c" received an invalid value {m}') + raise ValueError(f'Internal function "pade_357_UV_calc" received an invalid value {m}') # Utilize the unused powers of Am as scratch memory if m == 3: @@ -302,7 +302,7 @@ cdef void pade_357_UV_calc_{{ pfx }}({{ tname }}[:, :, ::]Am, int n, int m) nogi # U = Am[0] @ (b[5]*Am[2] + b[3]*Am[1] + b[1]*I_n) {{ pfx }}copy(&n2, &Am[1, 0, 0], &int_one, &Am[4, 0, 0], &int_one) {{ pfx }}scal(&n2, &b[3], &Am[4, 0, 0], &int_one) - {{ pfx }}axpy(&n2, &b[5], &Am[2, 0, 0], &int_one, &Am[4, 0, 0], &int_one) + {{ pfx }}axpy(&n2, &one, &Am[2, 0, 0], &int_one, &Am[4, 0, 0], &int_one) for i in range(n): Am[4, i, i] = Am[4, i, i] + b[1] {{ pfx }}gemm('N', 'N', &n, &n, &n, &one, &Am[4, 0, 0], &n, &Am[0, 0, 0], &n, &zero, &Am[3, 0, 0], &n) @@ -317,7 +317,7 @@ cdef void pade_357_UV_calc_{{ pfx }}({{ tname }}[:, :, ::]Am, int n, int m) nogi {{ pfx }}copy(&n2, &Am[1, 0, 0], &int_one, &Am[4, 0, 0], &int_one) {{ pfx }}scal(&n2, &b[3], &Am[4, 0, 0], &int_one) {{ pfx }}axpy(&n2, &b[5], &Am[2, 0, 0], &int_one, &Am[4, 0, 0], &int_one) - {{ pfx }}axpy(&n2, &b[7], &Am[3, 0, 0], &int_one, &Am[4, 0, 0], &int_one) + {{ pfx }}axpy(&n2, &one, &Am[3, 0, 0], &int_one, &Am[4, 0, 0], &int_one) for i in range(n): Am[4, i, i] = Am[4, i, i] + b[1] # We ran out of space for dgemm; first compute V and then reuse space for U From 48c5966b3960b63479a51d40b1b7d9c96c86d2e9 Mon Sep 17 00:00:00 2001 From: Ilhan Polat Date: Thu, 2 Dec 2021 09:46:23 +0100 Subject: [PATCH 18/22] ENH:linalg: Typing stubs for expm cython funcs --- scipy/linalg/_matfuncs_expm.pyi | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 scipy/linalg/_matfuncs_expm.pyi diff --git a/scipy/linalg/_matfuncs_expm.pyi b/scipy/linalg/_matfuncs_expm.pyi new file mode 100644 index 000000000000..880c69558ec8 --- /dev/null +++ b/scipy/linalg/_matfuncs_expm.pyi @@ -0,0 +1,6 @@ +from numpy.typing import NDArray +from typing import Any, Tuple + +def pick_pade_structure(a: NDArray[Any]) -> Tuple[int, int]: ... + +def pade_UV_calc(Am: NDArray[Any], n: int, m: int) -> None: ... From 60f7e99d1655554efacd9d1c4999d8e8568e5bbf Mon Sep 17 00:00:00 2001 From: ilayn Date: Sat, 12 Mar 2022 15:00:13 +0100 Subject: [PATCH 19/22] BLD:linalg: Add _matfuncs_expm.pyx to meson.build --- scipy/linalg/meson.build | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/scipy/linalg/meson.build b/scipy/linalg/meson.build index b432b7349f5b..e61a596818fd 100644 --- a/scipy/linalg/meson.build +++ b/scipy/linalg/meson.build @@ -244,6 +244,22 @@ py3.extension_module('_decomp_update', subdir: 'scipy/linalg' ) +# _matfuncs_expm: +_matfuncs_expm_pyx = custom_target('_matfuncs_expm', + output: '_matfuncs_expm.pyx', + input: '_matfuncs_expm.pyx.in', + command: [py3, tempita, '@INPUT@', '-o', '@OUTDIR@'] +) + +py3.extension_module('_matfuncs_expm', + linalg_init_cython_gen.process(_matfuncs_expm_pyx), + c_args: cython_c_args, + include_directories: inc_np, + dependencies: py3_dep, + install: true, + subdir: 'scipy/linalg' +) + _cythonized_array_utils = py3.extension_module('_cythonized_array_utils', linalg_init_cython_gen.process('_cythonized_array_utils.pyx'), c_args: [cython_c_args, c_undefined_ok], From 93d9b56ecae681982d200c115819a48dd3ed3499 Mon Sep 17 00:00:00 2001 From: ilayn Date: Sun, 13 Mar 2022 15:31:12 +0100 Subject: [PATCH 20/22] BENCH:linalg: Adjust tests for new dense expm --- benchmarks/benchmarks/linalg.py | 2 -- benchmarks/benchmarks/sparse_linalg_expm.py | 3 ++- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/benchmarks/benchmarks/linalg.py b/benchmarks/benchmarks/linalg.py index f7f674ce1497..2e51d93939d4 100644 --- a/benchmarks/benchmarks/linalg.py +++ b/benchmarks/benchmarks/linalg.py @@ -233,8 +233,6 @@ def time_invpascal(self, size): def time_toeplitz(self, size): sl.toeplitz(self.x) - def time_tri(self, size): - sl.tri(size) class GetFuncs(Benchmark): diff --git a/benchmarks/benchmarks/sparse_linalg_expm.py b/benchmarks/benchmarks/sparse_linalg_expm.py index 1dc25090f975..fac933f2c192 100644 --- a/benchmarks/benchmarks/sparse_linalg_expm.py +++ b/benchmarks/benchmarks/sparse_linalg_expm.py @@ -6,6 +6,7 @@ with safe_import(): import scipy.linalg + from scipy.sparse.linalg import expm as sp_expm from scipy.sparse.linalg import expm_multiply @@ -67,6 +68,6 @@ def setup(self, n, format): def time_expm(self, n, format): if format == 'sparse': - scipy.linalg.expm(self.A_sparse) + sp_expm(self.A_sparse) elif format == 'dense': scipy.linalg.expm(self.A_dense) From a6f95fffdb409302b67b1ceb02abdc75ee083c16 Mon Sep 17 00:00:00 2001 From: ilayn Date: Sun, 13 Mar 2022 20:45:54 +0100 Subject: [PATCH 21/22] MAINT:PEP8:linalg: Remove extra blank line [ci skip] [skip ci] [skip azp] --- benchmarks/benchmarks/linalg.py | 1 - 1 file changed, 1 deletion(-) diff --git a/benchmarks/benchmarks/linalg.py b/benchmarks/benchmarks/linalg.py index 2e51d93939d4..f870e8049038 100644 --- a/benchmarks/benchmarks/linalg.py +++ b/benchmarks/benchmarks/linalg.py @@ -234,7 +234,6 @@ def time_toeplitz(self, size): sl.toeplitz(self.x) - class GetFuncs(Benchmark): def setup(self): self.x = np.eye(1) From 41b9362fcad54fd6d0c74e4b0843497339027a1d Mon Sep 17 00:00:00 2001 From: ilayn Date: Sun, 13 Mar 2022 23:49:25 +0100 Subject: [PATCH 22/22] MAINT:linalg: Add cnp.import_array() to expm pyx file --- scipy/linalg/_matfuncs_expm.pyx.in | 3 +++ 1 file changed, 3 insertions(+) diff --git a/scipy/linalg/_matfuncs_expm.pyx.in b/scipy/linalg/_matfuncs_expm.pyx.in index 8c218a1b30a1..05f712ac93b7 100644 --- a/scipy/linalg/_matfuncs_expm.pyx.in +++ b/scipy/linalg/_matfuncs_expm.pyx.in @@ -16,6 +16,9 @@ from scipy.linalg.cython_blas cimport (sgemm, saxpy, sscal, scopy, sgemv, __all__ = ['pick_pade_structure', 'pade_UV_calc'] +# See GH-14813 +cnp.import_array() + ctypedef fused lapack_t: float double