# Understanding how the following works:

In [1]:
import numpy as np
import numba as nb
import pandas as pd
import logging
from typing import Callable, Type, Dict, Tuple, Any
from numpy.lib.stride_tricks import sliding_window_view
from IPython.display import display

In [2]:
# Random float32 test matrix: 100 000 rows, 6 columns.
arr = np.random.randn(100_000,6).astype(np.float32)

`sliding_window_view` creates a view on the initial array, so no copy is needed here.

In [3]:
# Number of consecutive rows covered by each rolling window.
window=30

# Windowed view taken along axis 0 (not referenced again in the visible cells).
data = sliding_window_view(arr, window, axis=0)

# Same values wrapped in a DataFrame, used for the pandas comparisons.
df = pd.DataFrame(arr, columns = ['A', 'B', 'C', 'D', 'E', 'F'])

# Standard case - summing along axis = 0: the aggregation is performed over each column independently; it is the only mode currently available in pandas

In [4]:
def agg_sum(x):
    """Return the sum of all elements of ``x`` as computed by ``np.sum``."""
    total = np.sum(x)
    return total

@nb.njit(
    nb.float32(nb.float32[:]),
    cache=True,
    fastmath=False,
    forceinline=True,
    looplift=True,
    inline='always',
    target_backend='host',
    nogil=True,
)
def agg_sumba(x):
    """Numba-compiled sum of a 1-D float32 array, returned as float32."""
    total = np.sum(x)
    return total

@nb.njit(
    (nb.float32[:, :], nb.float32[:, :], nb.types.uint32, nb.types.uint32),
    cache=True,
    fastmath=True,
    forceinline=True,
    looplift=True,
    inline='always',
    target_backend='host',
    nogil=True,
)
def vl(z, r, i, m):
    """Store the sum of each of the first ``m`` rows of ``z`` in row ``i`` of ``r``.

    ``r[i, col]`` receives ``agg_sumba(z[col, :])`` for every ``col < m``.
    """
    for col in nb.prange(m):
        row_values = z[col, :]
        r[i, col] = agg_sumba(row_values)


@nb.njit(
    (
        nb.types.Array(nb.types.float32, 3, 'A', readonly=True),
        nb.float32[:, :],
        nb.types.uint32,
        nb.types.uint32,
    ),
    cache=True,
    parallel=True,
    fastmath=True,
    forceinline=True,
    looplift=True,
    inline='always',
    target_backend='host',
    no_cfunc_wrapper=True,
    no_rewrites=True,
    nogil=True,
)
def cvlopt(z, r, n, m):
    """Fill ``r`` row by row from the first ``n`` 2-D slices of ``z``.

    Each slice ``z[idx, :, :]`` is handed to ``vl`` together with the output
    array, the row index and the column count ``m``. The outer loop is a
    ``prange`` in a ``parallel=True`` kernel.
    """
    for idx in nb.prange(n):
        window_block = z[idx, :, :]
        vl(window_block, r, idx, m)

def vlopt(arr, window):
    """Compute the rolling column-wise sum of ``arr`` over ``window`` rows.

    Builds a sliding-window view along axis 0, allocates a float32 result of
    shape ``(arr.shape[0] - window + 1, arr.shape[1])`` and lets ``cvlopt``
    fill it in place.

    Returns:
        The filled float32 result array.
    """
    windows = sliding_window_view(arr, window, axis=0)
    n_windows = arr.shape[0] - window + 1
    n_cols = arr.shape[1]
    out = np.empty((n_windows, n_cols), dtype=np.float32)
    cvlopt(windows, out, np.uint32(n_windows), np.uint32(n_cols))
    return out

# Numba rolling sum: one output row per complete window.
display(pd.DataFrame(vlopt(df.to_numpy(), window)))
# pandas reference: rolling sum over the same window size.
display(df.rolling(window).sum())

Unnamed: 0,0,1,2,3,4,5
0,-1.887958,6.930119,-5.475718,0.004122,-2.412687,-5.947734
1,-1.889852,6.911351,-5.941976,1.556753,-1.634506,-3.693544
2,-0.096585,6.817742,-4.050787,1.926658,-0.634865,-3.352806
3,1.400353,8.393235,-4.924432,2.198602,-0.085664,-4.108313
4,2.579949,13.503088,-4.513248,2.628873,-1.334002,-2.266179
...,...,...,...,...,...,...
99966,-6.325395,4.436402,3.171073,1.035852,-8.646391,5.627163
99967,-6.889873,2.408478,2.821614,-0.653793,-8.600154,6.063855
99968,-5.478326,3.870380,3.870669,-0.805432,-9.869699,5.817556
99969,-5.928212,5.575984,1.919345,-0.444640,-9.074961,5.937686


Unnamed: 0,A,B,C,D,E,F
0,,,,,,
1,,,,,,
2,,,,,,
3,,,,,,
4,,,,,,
...,...,...,...,...,...,...
99995,-6.325395,4.436402,3.171073,1.035852,-8.646391,5.627162
99996,-6.889873,2.408477,2.821613,-0.653793,-8.600154,6.063855
99997,-5.478325,3.870380,3.870669,-0.805432,-9.869700,5.817556
99998,-5.928213,5.575983,1.919345,-0.444640,-9.074961,5.937685


In [5]:

# Numba sliding-window implementation.
%timeit vlopt(df.to_numpy(), window)
# Generic pandas rolling apply with a Python-level callback.
%timeit df.rolling(window).apply(lambda x: agg_sum(x))
# Built-in pandas rolling sum.
%timeit df.rolling(window).sum()


KeyboardInterrupt



### The obtained performance exceeds that of the specialized pandas function while allowing any custom aggregation function, which would perform terribly with the standard pandas `apply`, as seen above

# Function with multiple columns 

In [None]:
def func_base(x):
    """Return mean(A * C) minus std(B / 2) for a window ``x`` keyed by column name."""
    mean_ac = np.mean(x['A'] * x['C'])
    std_half_b = np.std(x['B'] / 2)
    return mean_ac - std_half_b

# Transcoder's job: translate the column names into positional indices, as done here
@nb.njit(
    nb.float32(nb.types.Array(nb.types.float32, 2, 'A', readonly=True)),
    cache=True,
    fastmath=False,
    forceinline=True,
    looplift=True,
    inline='always',
    target_backend='host',
    nogil=True,
)
def func_modified(x):
    """Index-based counterpart of ``func_base`` for a 2-D float32 window.

    Rows 0, 2 and 1 of ``x`` take the place of ``x['A']``, ``x['C']`` and
    ``x['B']`` respectively.
    """
    mean_ac = np.mean(x[0, :] * x[2, :])
    std_half_b = np.std(x[1, :] / 2)
    return mean_ac - std_half_b

@nb.njit(
    (
        nb.types.Array(nb.types.float32, 3, 'A', readonly=True),
        nb.float32[:],
        nb.types.uint32,
    ),
    cache=True,
    parallel=True,
    fastmath=True,
    forceinline=True,
    looplift=True,
    inline='always',
    target_backend='host',
    no_cfunc_wrapper=True,
    no_rewrites=True,
    nogil=True,
)
def cvlopt(z, r, n):
    """Store ``func_modified(z[idx, :, :])`` in ``r[idx]`` for every ``idx < n``.

    The loop is a ``prange`` in a ``parallel=True`` kernel.
    """
    for idx in nb.prange(n):
        window_block = z[idx, :, :]
        r[idx] = func_modified(window_block)

def vlopt(arr, window):
    """Apply the multi-column aggregation to every rolling window of ``arr``.

    Builds a sliding-window view along axis 0, allocates a 1-D float32 result
    with ``arr.shape[0] - window + 1`` entries and lets ``cvlopt`` fill it in
    place.

    Returns:
        The filled float32 result array.
    """
    windows = sliding_window_view(arr, window, axis=0)
    n_windows = arr.shape[0] - window + 1
    out = np.empty(n_windows, dtype=np.float32)
    cvlopt(windows, out, np.uint32(n_windows))
    return out

# Numba result of the multi-column aggregation.
display(pd.DataFrame(vlopt(df.to_numpy(), window)))
# pandas counterpart built from a rolling mean and a rolling std.
display((df['A'] * df['C']).rolling(window).mean() - (df['B'] / 2).rolling(window).std())

### The slight difference above comes from the different standard deviation definitions in pandas and numpy: pandas uses the unbiased estimator whereas numpy does not.
Below is the proof of equality when the same metric is used

In [None]:
# Same pandas expression, but with np.std applied per window so both sides use numpy's standard deviation.
display(((df['A'] * df['C']).rolling(window).mean() - (df['B'] / 2).rolling(window).apply(lambda x: np.std(x))).dropna())

## And now the performance

In [None]:
# Numba sliding-window implementation.
%timeit vlopt(arr, window)
# pandas with a row-wise apply for the A * C product.
%timeit df.apply(lambda x: x['A'] * x['C'], axis = 1).rolling(window).mean() - (df['B'] / 2).rolling(window).std()
# pandas with a vectorised product and the built-in rolling std.
%timeit ((df['A'] * df['C']).rolling(window).mean() - (df['B'] / 2).rolling(window).std()).dropna()
# pandas with a vectorised product and np.std through rolling apply.
%timeit ((df['A'] * df['C']).rolling(window).mean() - (df['B'] / 2).rolling(window).apply(lambda x: np.std(x))).dropna()

## And thus we show how the new method can improve the readability of rolling aggregations while, in the meantime, greatly improving performance and flexibility

# Aggregation example

In [None]:
def func_base(x):
    """Return mean(A * C) minus std(B / 2) for a window ``x`` keyed by column name."""
    mean_ac = np.mean(x['A'] * x['C'])
    std_half_b = np.std(x['B'] / 2)
    return mean_ac - std_half_b

def func_base_2(x):
    """Fold the values of ``x['A']`` into an accumulator and scale by mean(B).

    Starting from 0, the accumulator is divided by each strictly positive
    value and has every other value subtracted from it. The final accumulator
    is multiplied by ``np.mean(x['B'])``.
    """
    acc = 0
    for value in x['A']:
        acc = acc / value if value > 0 else acc - value
    scale = np.mean(x['B'])
    return acc * scale
    
# Transcoder's job: translate the column names into positional indices, as done here
@nb.njit(
    nb.float32(nb.types.Array(nb.types.float32, 2, 'A', readonly=True)),
    cache=True,
    fastmath=False,
    forceinline=True,
    looplift=True,
    inline='always',
    target_backend='host',
    nogil=True,
)
def func_modified(x):
    """Index-based counterpart of ``func_base`` for a 2-D float32 window.

    Rows 0, 2 and 1 of ``x`` take the place of ``x['A']``, ``x['C']`` and
    ``x['B']`` respectively.
    """
    mean_ac = np.mean(x[0, :] * x[2, :])
    std_half_b = np.std(x[1, :] / 2)
    return mean_ac - std_half_b


@nb.njit(
    nb.float32(nb.types.Array(nb.types.float32, 2, 'A', readonly=True)),
    cache=True,
    fastmath=False,
    forceinline=True,
    looplift=True,
    inline='always',
    target_backend='host',
    nogil=True,
)
def func_modified_2(x):
    """Index-based counterpart of ``func_base_2`` for a 2-D float32 window.

    ``x`` is one window with a row per column of the source data and the
    window positions along the last axis, so row 0 stands for ``x['A']`` and
    row 1 for ``x['B']``.

    Starting from 0, the accumulator is divided by each strictly positive
    sample of row 0 and has every other sample subtracted from it. The result
    is the accumulator multiplied by the mean of row 1.
    """
    r = 0
    # Iterate over the whole window of column 'A' (row 0). ``x[:, 0]`` would
    # instead pick the first sample of every column.
    for item in x[0, :]:
        if item > 0:
            r /= item
        else:
            r -= item
    return r * np.mean(x[1, :])



@nb.njit(
    (nb.float32[:, :], nb.float32[:, :], nb.types.uint32),
    cache=True,
    fastmath=True,
    forceinline=True,
    looplift=True,
    inline='always',
    target_backend='host',
    nogil=True,
)
def vl(z, r, i):
    """Write both aggregations of the window ``z`` into row ``i`` of ``r``.

    Column 0 receives ``func_modified(z)`` and column 1 receives
    ``func_modified_2(z)``.
    """
    # Two direct assignments replace a two-iteration loop that dispatched on
    # its own index; the results written to ``r`` are the same.
    r[i, 0] = func_modified(z)
    r[i, 1] = func_modified_2(z)

@nb.njit(
    (
        nb.types.Array(nb.types.float32, 3, 'A', readonly=True),
        nb.float32[:, :],
        nb.types.uint32,
    ),
    cache=True,
    parallel=True,
    fastmath=True,
    forceinline=True,
    looplift=True,
    inline='always',
    target_backend='host',
    no_cfunc_wrapper=True,
    no_rewrites=True,
    nogil=True,
)
def cvlopt(z, r, n):
    """Hand each of the first ``n`` 2-D slices of ``z`` to ``vl``.

    ``vl`` receives the slice ``z[idx, :, :]``, the output array ``r`` and the
    row index ``idx``. The loop is a ``prange`` in a ``parallel=True`` kernel.
    """
    for idx in nb.prange(n):
        window_block = z[idx, :, :]
        vl(window_block, r, idx)

def vlopt(arr, window):
    """Run the two aggregations over every rolling window of ``arr``.

    Builds a sliding-window view along axis 0, allocates a float32 result of
    shape ``(arr.shape[0] - window + 1, 2)`` and lets ``cvlopt`` fill it in
    place.

    Returns:
        The filled float32 result array, one column per aggregation.
    """
    windows = sliding_window_view(arr, window, axis=0)
    n_windows = arr.shape[0] - window + 1
    out = np.empty((n_windows, 2), dtype=np.float32)
    cvlopt(windows, out, np.uint32(n_windows))
    return out

# Show the two-column aggregation result, one row per complete window.
display(pd.DataFrame(vlopt(df.to_numpy(), window)))
