In [103]:
import numpy as np
import numba as nb
import pandas as pd
arr = np.random.randn(1000000,3)

@nb.jit(fastmath=True, forceinline=True, looplift=True, inline='always', target_backend='host', no_cfunc_wrapper=True, no_rewrites=True ,nopython=True, nogil=True)
def temporary(z):
    return (z[0] + z[2])/ z[1]

@nb.jit(fastmath=True, forceinline=True, looplift=True, inline='always', target_backend='host', no_cfunc_wrapper=True, no_rewrites=True ,nopython=True, nogil=True)
def temporarysum(z):
    return np.sum(z)
    
@nb.jit(fastmath=True, forceinline=True, looplift=True, inline='always', target_backend='host', no_cfunc_wrapper=True, no_rewrites=True ,nopython=True, nogil=True, parallel = True)
def func_int_identifier_loop(Z):
    n = Z.shape[0]
    res = np.zeros((n, 1))
    for i in nb.prange(n):
        res[i, 0] = temporarysum(Z[i, :])
    return res

%timeit func_int_identifier_loop(arr)

614 µs ± 195 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [2]:
@nb.jit(nopython=True, nogil=True)
def temporaryD(z):
    return (z[0] + z[2])/ z[1]
    
@nb.jit(fastmath=True, forceinline=True, looplift=True, inline='always', target_backend='host', no_cfunc_wrapper=True, no_rewrites=True ,nopython=True, nogil=True, parallel = True)
def func_int_identifier_loopD(Z):
    n = Z.shape[0]
    res = np.zeros((n, 1))
    for i in nb.prange(n):
        res[i, 0] = temporaryD(Z[i, :])
    return res

%timeit func_int_identifier_loopD(arr)

686 µs ± 192 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [3]:
@nb.jit(fastmath=True, forceinline=True, looplift=True, inline='always', target_backend='host', no_cfunc_wrapper=True, no_rewrites=True ,nopython=True, nogil=True, parallel = True)
def temporaryE(z):
    return (z[0] + z[2])/ z[1]
    
@nb.jit(nopython=True, nogil=True)
def func_int_identifier_loopE(Z):
    n = Z.shape[0]
    res = np.zeros((n, 1))
    for i in nb.prange(n):
        res[i, 0] = temporaryE(Z[i, :])
    return res

%timeit func_int_identifier_loopE(arr)

1.34 ms ± 8.45 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [4]:
from numba import vectorize, float64

@vectorize([float64(float64, float64, float64)])
def vectorized_function(x, y, z):
    return (x + z) / y

%timeit vectorized_function(arr[:,0], arr[:,1], arr[:,2])


1.06 ms ± 43.7 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [5]:
import numpy as np
import numba as nb

arr = np.random.randn(1000000,3)



In [42]:
%timeit np.sum(arr)

422 µs ± 12.7 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [7]:
%timeit np.sum(arr.T)

460 µs ± 17 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [8]:
arr = np.random.randn(1000000,3)
x=arr.copy(order='F')
%timeit np.sum(x.copy(order='F'))
x=arr.copy(order='C')
%timeit np.sum(x.copy(order='C'))


1.71 ms ± 6.43 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
1.72 ms ± 23 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [9]:
arr = np.random.randn(1000000,3)
x=arr.copy(order='F')
%timeit np.sum(x.copy(order='F'), axis = 1)
x=arr.copy(order='C')
%timeit np.sum(x.copy(order='C'), axis = 1)

2.09 ms ± 97.7 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
14.4 ms ± 24.6 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [10]:
for k in range(10):
    arr = np.random.randn(int(10**k),5)
    
    x=arr.copy(order='C')
    %timeit np.sum(x, axis = 1)
    x=arr.copy(order='C')
    %timeit np.sum(x.copy(order='F'), axis = 1)
    
    
    x=arr.copy(order='F')
    %timeit np.sum(x, axis = 0)
    x=arr.copy(order='F')
    %timeit np.sum(x.copy(order='C'), axis = 0)
    
    print()
    print()

2.48 µs ± 61.4 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)
2.78 µs ± 53.1 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)
2.47 µs ± 48.6 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)
2.79 µs ± 17.9 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)


2.72 µs ± 11.3 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)
2.96 µs ± 19.6 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)
2.6 µs ± 15.1 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)
2.99 µs ± 16.2 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)


4.12 µs ± 15.4 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)
3.37 µs ± 16.3 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)
2.81 µs ± 17.8 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)
4.68 µs ± 20.9 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)


17 µs ± 8.95 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)
5.78 µs ±

In [43]:
arr = np.random.randn(int(10**1),3)

x=arr.copy(order='C')
func_int_identifier_loop(x.T.copy(order='F'))
#func_int_identifier_loop(x.copy(order='F'))

array([[-0.57280297],
       [ 1.46045317],
       [ 1.27275171]])

In [39]:
x

array([[ 1.14503651,  0.44101358,  0.76099434],
       [ 1.52688392, -0.35970674, -2.20340554],
       [-0.51794687, -0.34889599, -1.13465599],
       ...,
       [-0.33717176,  0.75993405, -1.31054421],
       [-0.87156672,  0.71320783,  1.07239225],
       [-0.17289965, -0.06576006,  0.63412154]])

In [46]:
np.sum(x, axis=1).reshape(-1,1) - func_int_identifier_loop(x)

array([[0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.]])

In [45]:
func_int_identifier_loop(x)

array([[-1.14945893],
       [-2.42084956],
       [ 1.63433516],
       [-1.03034721],
       [ 1.40409159],
       [ 1.92513534],
       [ 0.46273955],
       [-0.48655743],
       [ 2.12282142],
       [-0.30150803]])

In [116]:
@nb.jit(nopython=True, fastmath=True, forceinline=True, looplift=True, inline='always', target_backend='host', no_rewrites=True, nogil=True, parallel=True)
def temporarysum(z):
    return np.sum(z)
    
@nb.jit(nopython=True, nogil=True, parallel = True, fastmath=True, looplift=True, inline='always', target_backend='host', no_rewrites=True)
def func_int_identifier_loop(Z):
    n = Z.shape[0]
    res = np.zeros((n, 1))
    for i in nb.prange(n):
        res[i, 0] = temporarysum(Z[i, :])
    return res

@nb.jit(nopython=True, nogil=True, parallel = True, fastmath=True, looplift=True, inline='always', target_backend='host', no_rewrites=True)
def func_int_identifier_loop_dtype(Z):
    n = Z.shape[0]
    res = np.zeros((n, 1), dtype=np.float32)
    for i in nb.prange(n):
        res[i, 0] = temporarysum(Z[i, :])
    return res

@nb.jit(nopython=True, nogil=True, parallel = True, fastmath=True, looplift=True, inline='always', target_backend='host', no_rewrites=True)
def func_int_identifier_loop_empty(Z):
    n = Z.shape[0]
    res = np.empty((n, 1), dtype=np.float32)
    for i in nb.prange(n):
        res[i, 0] = temporarysum(Z[i, :])
    return res




@nb.jit(nb.float32[:](nb.float32[:,:], nb.float32[:,:]), nopython=True, nogil=True, parallel = True,fastmath=True, looplift=True, target_backend='host', inline='always')
def func_int_identifier_loopK(Z, res):
    n = Z.shape[0]
    for i in nb.prange(n):
        res[i, 0] = temporarysum(Z[i, :])
    return res[:n, 0]

arr = np.random.randn(10_00_000,5)
resF = np.zeros(tuple(s * 2 for s in arr.shape)).astype(np.float32).copy(order='F')
resC = resF.copy(order='F')

print('division of column over F-contiguous array')
x=arr.astype(np.float32).copy(order='F')
print('numba base', end=' ')
%timeit func_int_identifier_loop(x)
print('numba type initialized returned array', end=' ')
%timeit func_int_identifier_loop_dtype(x)
print('numba type initialized returned empty array', end=' ')
%timeit func_int_identifier_loop_empty(x)
print('numpy sum', end=' ')
%timeit np.sum(x, axis=1)
print('numpy copied for contiguity', end='')
%timeit np.sum(x.copy(order='F'), axis=1)
print('numba giving an output array F-contiguous', end=' ')
%timeit func_int_identifier_loopK(x, resF)
print('numba giving an output array C-contiguous', end=' ')
%timeit func_int_identifier_loopK(x, resC)

print()
print('division of column over C-contiguous array')
x=arr.astype(np.float32).copy(order='C')
print('numba base', end=' ')
%timeit func_int_identifier_loop(x)
print('numba type initialized returned array', end=' ')
%timeit func_int_identifier_loop_dtype(x)
print('numba type initialized returned empty array', end=' ')
%timeit func_int_identifier_loop_empty(x)
print('numpy sum', end=' ')
%timeit np.sum(x, axis=1)
print('numpy copied for contiguity', end=' ')
%timeit np.sum(x.copy(order='F'), axis=1)
print('numba giving an output array F-contiguous', end=' ')
%timeit func_int_identifier_loopK(x, resF)
print('numba giving an output array C-contiguous', end=' ')
%timeit func_int_identifier_loopK(x, resC)

Sum of column over F-contiguous array
numba base 20.4 ms ± 1.74 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
numba type initialized returned array 13.1 ms ± 555 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)
numba type initialized returned empty array 10.5 ms ± 343 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)
numpy sum 14.5 ms ± 85.5 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
numpy copied for contiguity37.4 ms ± 445 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
numba giving an output array F-contiguous 7.45 ms ± 102 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
numba giving an output array C-contiguous 7.46 ms ± 83.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)

Sum of column over C-contiguous array
numba base 19.5 ms ± 438 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)
numba type initialized returned array 13.6 ms ± 493 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)
numba type initialized returned em

In [183]:
import numpy as np
import numba as nb

@nb.guvectorize([(nb.float32[:, :], nb.float32[:])], '(n,k)->(n)', nopython=True)
def summatrix(computed_matrix, result_array):
    for i in range(computed_matrix.shape[0]):
        sum_ = 0.0
        for j in nb.prange(computed_matrix.shape[1]):
            sum_ += computed_matrix[i, j]
        result_array[i] = sum_
        
%timeit summatrix(x)
%timeit np.sum(x)

22.3 ms ± 163 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
7.48 ms ± 56.5 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [198]:
import numpy as np
import numba as nb

@nb.vectorize#([(nb.float32[:])], '(n)->()', nopython=True)
def sumarray(computed_array):
    result_value = 0.0
    for i in range(computed_array.shape[0]):
        result_value += computed_array[i]
    return result_value

@nb.jit(nb.float32[:](nb.float32[:,:], nb.float32[:,:]), nopython=True)
def summatrix(computed_matrix, result_array):
    for i in range(computed_matrix.shape[0]):
        result_array[i] = sumarray(computed_matrix[i, :])


TypingError: Failed in nopython mode pipeline (step: nopython frontend)
No implementation of function Function(<numba._DUFunc 'sumarray'>) found for signature:
 
 >>> sumarray(array(float32, 1d, A))
 
There are 2 candidate implementations:
  - Of which 2 did not match due to:
  Overload in function 'DUFunc._type_me': File: numba/np/ufunc/dufunc.py: Line 606.
    With argument(s): '(array(float32, 1d, A))':
   Rejected as the implementation raised a specific error:
     TypingError: Failed in nopython mode pipeline (step: nopython frontend)
   Unknown attribute 'shape' of type float32
   
   File "../../../../../tmp/ipykernel_6532/3226392303.py", line 7:
   <source missing, REPL/exec in use?>
   
   During: typing of get attribute at /tmp/ipykernel_6532/3226392303.py (7)
   
   File "../../../../../tmp/ipykernel_6532/3226392303.py", line 7:
   <source missing, REPL/exec in use?>

  raised from /home/remi/.pyenv/versions/3.12.1/lib/python3.12/site-packages/numba/core/typeinfer.py:1091

During: resolving callee type: Function(<numba._DUFunc 'sumarray'>)
During: typing of call at /tmp/ipykernel_6532/3226392303.py (14)


File "../../../../../tmp/ipykernel_6532/3226392303.py", line 14:
<source missing, REPL/exec in use?>


In [176]:
@nb.guvectorize([(nb.float32[:], nb.float32)], '(n)->()')
def sumarray(computed_array, result_value):
    for i in range(computed_array.shape[0]):
        result_value += computed_array[i]

@nb.guvectorize([(nb.float32[:,:], nb.float32[:,:])], '(n,k)->(n,)')
def summatrix(computed_matrix, result_array):
    for i in nb.prange(computed_matrix.shape[0]):
        sumarray(computed_matrix[i, :], result_array[i])




TypingError: Failed in nopython mode pipeline (step: nopython frontend)
Untyped global name 'sumarray': Cannot determine Numba type of <class 'numba.np.ufunc.gufunc.GUFunc'>

File "../../../../../tmp/ipykernel_6532/2071512261.py", line 9:
<source missing, REPL/exec in use?>


In [164]:
x[tuple((0, None))]

array([[-2.8405788,  2.519    , -0.5060638, -0.998097 ,  2.14208  ]],
      dtype=float32)

In [158]:

# @nb.guvectorize((nb.float32[:,:], nb.float32[:,:]))
@nb.guvectorize([(nb.float32[:,:], nb.float32[:,:])], '(n,k)->(n)')
def func_int_identifier_loopQQK(Z, res):
    n, k = Z.shape
    for j in nb.prange(n):
        res[i, 0] = temporarysum(Z[i, j])
    # return res[:n, 0]

arr = np.random.randn(10_00_000,5)
resF = np.zeros(tuple(s * 2 for s in arr.shape)).astype(np.float32).copy(order='F')
resC = resF.copy(order='F')
x = arr.astype(np.float32).copy(order='F')
print('numba giving an output array C-contiguous', end=' ')

func_int_identifier_loopQQK(x, resC[:,0])

TypingError: Failed in nopython mode pipeline (step: nopython frontend)
No implementation of function Function(<function sum at 0x7f358812ade0>) found for signature:
 
 >>> sum(float32)
 
There are 2 candidate implementations:
      - Of which 2 did not match due to:
      Overload of function 'sum': File: numba/core/typing/npydecl.py: Line 373.
        With argument(s): '(float32)':
       No match.

During: resolving callee type: Function(<function sum at 0x7f358812ade0>)
During: typing of call at /tmp/ipykernel_6532/4278288295.py (3)


File "../../../../../tmp/ipykernel_6532/4278288295.py", line 3:
<source missing, REPL/exec in use?>


In [123]:


x_c=arr.astype(np.float32).copy(order='C')
x_f=arr.astype(np.float32).copy(order='F')

df = pd.DataFrame(x_c)
df2= pd.DataFrame(x_f)


# print('division of DataFrame creates over :')

# %timeit df+df2
# %timeit df.to_numpy()+df.to_numpy()
# %timeit df.to_numpy().copy(order='C')+df2.to_numpy().copy(order='C')
# %timeit df2.to_numpy()+df2.to_numpy()
# %timeit df.to_numpy().copy(order='F')+df2.to_numpy().copy(order='F')
# %timeit x_c+x_f
# %timeit x_c+x_c
# %timeit x_f+x_f


    

84.6 ms ± 2.01 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [144]:
@nb.jit(nb.float32[::,:](nb.float32[:,:], nb.float32[:,:], nb.float32[:,:]), nopython=True, nogil=True, parallel = True,fastmath=True, looplift=True, target_backend='host', inline='always')
def msum(A, B, res):
    n, k = A.shape
    for j in nb.prange(k):
        for i in nb.prange(n):
            res[i, j] = A[i, j] + B[i, j]
    return res[:n, :k]


arr = np.random.randn(10_000_000,3)

resF = np.empty(tuple(s * 2 for s in arr.shape), dtype=np.float32)

x_c=arr.astype(np.float32).copy(order='C')
x_c1=arr.astype(np.float32).copy(order='C')
x_f=arr.astype(np.float32).copy(order='F')
x_f1=arr.astype(np.float32).copy(order='F')

df = pd.DataFrame(x_c)
df1b = pd.DataFrame(x_c1)
df2= pd.DataFrame(x_f)

print('division of DataFrame creates over arrays with same contiguity', end=' ')
%timeit df+df1b
print('division of DataFrame creates over same arrays copies with different contiguity', end=' ')
%timeit df+df2
print('division underlying numpy of DataFrame creates with same C-contiguity', end=' ')
%timeit df.to_numpy()+df1b.to_numpy()
print('division underlying numpy of DataFrame creates with different contiguity', end=' ')
%timeit df.to_numpy()+df2.to_numpy()
print('division underlying numpy of DataFrame creates with different contiguity copied in C order', end=' ')
%timeit df.to_numpy().copy(order='C')+df2.to_numpy().copy(order='C')
print('division underlying numpy of DataFrame creates with same F-contiguity', end=' ')
%timeit df2.to_numpy()+df2.to_numpy()
print('division underlying numpy of DataFrame creates with different contiguity copied in F order', end=' ')
%timeit df.to_numpy().copy(order='F')+df2.to_numpy().copy(order='F')
print('division of numpy array with himself', end=' ')
%timeit x_c+x_c
print('division of raw numpy arrays with different contiguity', end=' ')
%timeit x_c+x_f
print('division of raw numpy arrays with same C-contiguity', end=' ')
%timeit x_c+x_c1
print('division of raw numpy arrays with same F-contiguity', end=' ')
%timeit x_f+x_f
print('using numba mdivision over to numpy of same contiguity df')
%timeit msum(df.to_numpy(), df1b.to_numpy(), resF)
print('using numba mdivision over to numpy of different contiguity df')
%timeit msum(df.to_numpy(), df2.to_numpy(), resF)
print('using numba mdivision same array')
%timeit msum(x_c, x_c, resF)
print('using numba mdivision same contiguityt array')
%timeit msum(x_c, x_c1, resF)
print('using numba mdivision different contiguityt array')
%timeit msum(x_c, x_f, resF)

division of DataFrame creates over arrays with same contiguity 16.3 ms ± 132 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
division of DataFrame creates over same arrays copies with different contiguity 26.3 ms ± 473 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
division underlying numpy of DataFrame creates with same C-contiguity 16.1 ms ± 149 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
division underlying numpy of DataFrame creates with different contiguity 149 ms ± 13.2 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
division underlying numpy of DataFrame creates with different contiguity copied in C order 72 ms ± 652 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
division underlying numpy of DataFrame creates with same F-contiguity 14.2 ms ± 136 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
division underlying numpy of DataFrame creates with different contiguity copied in F order 49.4 ms ± 409 µs per loop (mean ± std. de

In [141]:
@nb.jit(nb.float32[::,:](nb.float32[:,:], nb.float32[:,:], nb.float32[:,:]), nopython=True, nogil=True, parallel = True,fastmath=True, looplift=True, target_backend='host', inline='always')
def msum(A, B, res):
    n, k = A.shape
    for j in nb.prange(k):
        for i in nb.prange(n):
            res[i, j] = A[i, j] / B[i, j]
    return res[:n, :k]


arr = np.random.randn(10_000_000,5)

resF = np.empty(tuple(s * 2 for s in arr.shape), dtype=np.float32)

x_c=arr.astype(np.float32).copy(order='C')
x_c1=arr.astype(np.float32).copy(order='C')
x_f=arr.astype(np.float32).copy(order='F')
x_f1=arr.astype(np.float32).copy(order='F')

df = pd.DataFrame(x_c)
df1b = pd.DataFrame(x_c1)
df2= pd.DataFrame(x_f)
df2b= pd.DataFrame(x_f1)

print('division of DataFrame creates over arrays with same C-contiguity', end=' ')
%timeit df/df1b
print('division of DataFrame creates over arrays with same F-contiguity', end=' ')
%timeit df2/df2b
print('division of DataFrame creates over same arrays copies with different contiguity', end=' ')
%timeit df/df2
print('division underlying numpy of DataFrame creates with same C-contiguity', end=' ')
%timeit df.to_numpy()/df1b.to_numpy()
print('division underlying numpy of DataFrame creates with different contiguity', end=' ')
%timeit df.to_numpy()/df2.to_numpy()
print('division underlying numpy of DataFrame creates with different contiguity copied in C order', end=' ')
%timeit df.to_numpy().copy(order='C')/df2b.to_numpy().copy(order='C')
print('division underlying numpy of DataFrame creates with same F-contiguity', end=' ')
%timeit df2.to_numpy()/df2b.to_numpy()
print('division underlying numpy of DataFrame creates with different contiguity copied in F order', end=' ')
%timeit df.to_numpy().copy(order='F')/df2b.to_numpy().copy(order='F')
print('division of numpy array with himself', end=' ')
%timeit x_c/x_c
print('division of raw numpy arrays with different contiguity', end=' ')
%timeit x_c/x_f
print('division of raw numpy arrays with same C-contiguity', end=' ')
%timeit x_c/x_c1
print('division of raw numpy arrays with same F-contiguity', end=' ')
%timeit x_f/x_f
print('using numba mdivision over to numpy of same contiguity df')
%timeit msum(df.to_numpy(), df1b.to_numpy(), resF)
print('using numba mdivision over to numpy of different contiguity df')
%timeit msum(df.to_numpy(), df2.to_numpy(), resF)
print('using numba mdivision same array')
%timeit msum(x_c, x_c, resF)
print('using numba mdivision same contiguityt array')
%timeit msum(x_c, x_c1, resF)
print('using numba mdivision different contiguityt array')
%timeit msum(x_c, x_f, resF)

division of DataFrame creates over arrays with same C-contiguity 28.6 ms ± 411 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
division of DataFrame creates over arrays with same F-contiguity 26.8 ms ± 731 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
division of DataFrame creates over same arrays copies with different contiguity 57.4 ms ± 703 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
division underlying numpy of DataFrame creates with same C-contiguity 27 ms ± 177 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
division underlying numpy of DataFrame creates with different contiguity 92.3 ms ± 289 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
division underlying numpy of DataFrame creates with different contiguity copied in C order 97.7 ms ± 124 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
division underlying numpy of DataFrame creates with same F-contiguity 25.4 ms ± 362 µs per loop (mean ± std. dev. of 7 runs, 10 loops each

In [143]:
@nb.jit(nb.float32[::,:](nb.float32[:,:], nb.float32[:,:], nb.float32[:,:]), nopython=True, nogil=True, parallel = True,fastmath=True, looplift=True, target_backend='host', inline='always')
def msum(A, B, res):
    n, k = A.shape
    for j in nb.prange(k):
        for i in nb.prange(n):
            res[i, j] = A[i, j] / B[i, j]
    return res[:n, :k]


arr = np.random.randn(10_000_000,5)

resF = np.empty(tuple(s * 2 for s in arr.shape), dtype=np.float32)

x_c=arr.astype(np.float32).copy(order='F')

df = pd.DataFrame(x_c)

%timeit df / df.shift()
%timeit x_c[1:] / x_c[:-1]


38.1 ms ± 450 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
9.18 ms ± 116 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [140]:
x_c.flags.c_contiguous

True

In [60]:


for k in range(7,8):
    arr = np.random.randn(int(10**k),3)

    print()
    print(f'k={k}')
    print('C-contiguous')
    x=arr.copy(order='C')
    print('numpy')
    %timeit np.sum(x, axis=1)
    print('numba')
    %timeit func_int_identifier_loop(x)

    print()
    print('F-contiguous')
    x=arr.copy(order='F')
    print('numpy')
    %timeit np.sum(x, axis=1)
    print('numba')
    %timeit func_int_identifier_loop(x)

    print()
    print('C contigous with conversion')
    x=arr.copy(order='C')
    print('numpy')
    %timeit np.sum(x.copy(order='F'), axis=1)
    print('numba')
    %timeit func_int_identifier_loop(x.copy(order='F'))

    print()
    print()
    print()



k=7
C-contiguous
numpy
141 ms ± 846 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
numba


TypeError: No matching definition for argument type(s) array(float64, 2d, C)

In [33]:
for k in range(3, 10):
    arr = np.random.randn(int(10**k),3)
    
    x=arr.copy(order='C')
    %timeit func_int_identifier_loop(x)
    x=arr.copy(order='F')
    %timeit func_int_identifier_loop(x)
    # x=arr.copy(order='F')
    # %timeit func_int_identifier_loop(x.copy(order='C'))

    # print()
    
    # x=arr.copy(order='F')
    # %timeit func_int_identifier_loop(x)
    # x=arr.copy(order='F')
    # %timeit func_int_identifier_loop(x.copy(order='C'))
    
    # print()
    # print()

24.2 µs ± 475 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
24 µs ± 597 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
29.1 µs ± 391 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)

24.2 µs ± 476 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
28.9 µs ± 553 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


89.2 µs ± 451 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
89.5 µs ± 498 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


KeyboardInterrupt: 

In [None]:
for k in range(10):
    arr = np.random.randn(int(10**k),3)
    
    x=arr.copy(order='C')
    %timeit func_int_identifier_loopD(x.copy(order='C'), axis = 1)
    x=arr.copy(order='C')
    %timeit func_int_identifier_loopD(x.copy(order='C').copy(order='F'), axis = 1)
    
    
    x=arr.copy(order='F')
    %timeit func_int_identifier_loopD(x.copy(order='F'), axis = 0)
    x=arr.copy(order='F')
    %timeit func_int_identifier_loopD(x.copy(order='F').copy(order='C'), axis = 0)
    
    print()
    print()

In [None]:
for k in range(10):
    arr = np.random.randn(int(10**k),3)
    
    x=arr.copy(order='C')
    %timeit func_int_identifier_loop(x.copy(order='C'), axis = 1)
    x=arr.copy(order='C')
    %timeit func_int_identifier_loop(x.copy(order='C').copy(order='F'), axis = 1)
    
    
    x=arr.copy(order='F')
    %timeit func_int_identifier_loop(x.copy(order='F'), axis = 0)
    x=arr.copy(order='F')
    %timeit func_int_identifier_loop(x.copy(order='F').copy(order='C'), axis = 0)
    
    print()
    print()

In [2]:
# import numpy as np

# %timeit x = np.zeros((1_000_000_000, 4), dtype = np.float32)
# %timeit x = np.empty((1_000_000_000, 4), dtype = np.float32)


15.7 µs ± 26.8 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)
18.4 µs ± 33 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)


In [None]:
arr = np.random.randn(1000000,3)

x=arr.copy(order='F')
%timeit np.sum(x.copy(order='F'), axis = 0)
x=arr.copy(order='F')
%timeit np.sum(x.copy(order='F').copy(order='C'), axis = 0)

In [None]:
arr = np.random.randn(1000000,3)
x=arr.copy(order='F')
%timeit np.sum(x.copy(order='F'), axis = 0)
x=arr.copy(order='C')
%timeit np.sum(x.copy(order='C'), axis = 0)

In [None]:
from numba import guvectorize, float64

@guvectorize([(float64[:], float64[:])], '(n)->(n)', target='cpu')
def guvectorized_function(x, out):
    for i in range(x.shape[0]):
        out[i] = (x[i] + x[i]) / x[i]  

guvectorized_function


In [None]:
@nb.jit(fastmath=True, forceinline=True, looplift=True, inline='always', target_backend='host', no_cfunc_wrapper=True, no_rewrites=True ,nopython=True, nogil=True, parallel = True)
def temporaryE(z):
    return (z[0] + z[2])/ z[1]
    
@nb.jit(nopython=True, nogil=True)
def func_int_identifier_loopE(Z):
    n = Z.shape[0]
    res = np.zeros((n, 1))
    for i in nb.prange(n):
        res[i, 0] = temporaryE(Z[i, :])
    return res

%timeit func_int_identifier_loopE(arr)

In [None]:
@nb.jit(fastmath=True, forceinline=True, looplift=True, inline='always', target_backend='host', no_cfunc_wrapper=True, no_rewrites=True ,nopython=True, nogil=True, parallel = True)
def temporaryE(z):
    return (z[0] + z[2])/ z[1]
    
@nb.guvectorize
def func_int_identifier_loopE(Z):
    n = Z.shape[0]
    res = np.zeros((n, 1))
    for i in nb.prange(n):
        res[i, 0] = temporaryE(Z[i, :])
    return res

%timeit func_int_identifier_loopE(arr)