# Understanding how the follwing works :

In [1]:
import numpy as np
import numba as nb
import pandas as pd
import logging
from typing import Callable, Type, Dict, Tuple, Any
from numpy.lib.stride_tricks import sliding_window_view
from IPython.display import display

In [2]:
arr = np.random.randn(100_000,6).astype(np.float32)

sliding_window_view creates a view on the initial array enabling to not need any copy here

In [5]:
window=30

data = sliding_window_view(arr, window, axis=0)

df = pd.DataFrame(arr, columns = ['A', 'B', 'C', 'D', 'E', 'F'])

# Standard case - summing axis = 0 : The aggregation is performed over each columns independantly, it is the only mode available in pandas currently 

In [56]:
def agg_sum(x):
    return np.sum(x)

@nb.njit(nb.float32(nb.float32[:]), cache=True, fastmath=False, forceinline=True, looplift=True, inline='always', target_backend='host', nogil=True)
def agg_sumba(x):
    return np.sum(x)

@nb.njit((nb.float32[:, :], nb.float32[:, :],nb.types.uint32,nb.types.uint32), cache=True, fastmath=True, forceinline=True, looplift=True, inline='always', target_backend='host', nogil=True)
def vl(z, r, i, m):
    for j in nb.prange(m):
        r[i, j] = agg_sumba(z[j, :])


@nb.njit((nb.types.Array(nb.types.float32, 3, 'A', readonly=True), nb.float32[:, :],nb.types.uint32, nb.types.uint32), cache=True, parallel=True, fastmath=True, forceinline=True, looplift=True, inline='always', target_backend='host', no_cfunc_wrapper=True, no_rewrites=True, nogil=True)
def cvlopt(z, r, n, m):
    for i in nb.prange(n):
        vl(z[i,:,:], r, i, m)

def vlopt(arr, window):
    cvlopt(sliding_window_view(arr, window, axis=0), r:=np.empty((n:=arr.shape[0]- window + 1, m:=arr.shape[1]), dtype=np.float32), np.uint32(n), np.uint32(m))
    return r

display(pd.DataFrame(vlopt(df.to_numpy(), window)))
display(df.rolling(window).sum())

In [62]:

%timeit vlopt(df.to_numpy(), window)
%timeit df.rolling(window).apply(lambda x: agg_sum(x))
%timeit df.rolling(window).sum()

86 µs ± 985 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
209 ms ± 785 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)
262 µs ± 4.17 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


### Obtained performance outperforms specialized function of pandas while allowing for any custom aggregation function, which would lead to terrible performance using the standard pandas apply as seen above

# Function with multiple columns 

In [39]:
def func_base(x):
    return np.mean(x['A'] * x['C']) - np.std(x['B'] / 2)

#transfcoder jobs to transform columns name as this
@nb.njit(nb.float32(nb.types.Array(nb.types.float32, 2, 'A', readonly=True)), cache=True, fastmath=False, forceinline=True, looplift=True, inline='always', target_backend='host', nogil=True)
def func_modified(x):
    return np.mean(x[0, :] * x[2, :]) - np.std(x[1, :] / 2)

@nb.njit((nb.types.Array(nb.types.float32, 3, 'A', readonly=True), nb.float32[:],nb.types.uint32), cache=True, parallel=True, fastmath=True, forceinline=True, looplift=True, inline='always', target_backend='host', no_cfunc_wrapper=True, no_rewrites=True, nogil=True)
def cvlopt(z, r, n):
    for i in nb.prange(n):
        r[i] = func_modified(z[i,:,:])

def vlopt(arr, window):
    cvlopt(sliding_window_view(arr, window, axis=0), r:=np.empty(n:=arr.shape[0]- window + 1, dtype=np.float32), np.uint32(n))
    return r

display(pd.DataFrame(vlopt(df.to_numpy(), window)))
display((df['A'] * df['C']).rolling(window).mean() - (df['B'] / 2).rolling(window).std())

Unnamed: 0,0
0,-0.452091
1,-0.425316
2,-0.374303
3,-0.374494
4,-0.388132
...,...
99966,-0.517406
99967,-0.475784
99968,-0.518359
99969,-0.462582


0             NaN
1             NaN
2             NaN
3             NaN
4             NaN
           ...   
99995   -0.526344
99996   -0.484738
99997   -0.527226
99998   -0.471530
99999   -0.467814
Length: 100000, dtype: float64

### The slight difference above come from the difference of standard deviation between pandas and numpy, where pandas used the unbiased estimator whereas numpy does not.
Bellow the proof of equality with same metric used

In [40]:
display(((df['A'] * df['C']).rolling(window).mean() - (df['B'] / 2).rolling(window).apply(lambda x: np.std(x))).dropna())

29      -0.452091
30      -0.425316
31      -0.374303
32      -0.374494
33      -0.388132
           ...   
99995   -0.517406
99996   -0.475784
99997   -0.518359
99998   -0.462582
99999   -0.458891
Length: 99971, dtype: float64

## and so the performance

In [41]:
%timeit vlopt(arr, window)
%timeit df.apply(lambda x: x['A'] * x['C'], axis = 1).rolling(window).mean() - (df['B'] / 2).rolling(window).std()
%timeit ((df['A'] * df['C']).rolling(window).mean() - (df['B'] / 2).rolling(window).std()).dropna()
%timeit ((df['A'] * df['C']).rolling(window).mean() - (df['B'] / 2).rolling(window).apply(lambda x: np.std(x))).dropna()

1.09 ms ± 5.7 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
466 ms ± 2.27 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
2.9 ms ± 22.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
5.39 s ± 27.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## and thus we show how the new method can improved readability of rolling aggregation while in the meatime improve greatly the performance and flexibility

# Aggregation example

In [49]:
def func_base(x):
    return np.mean(x['A'] * x['C']) - np.std(x['B'] / 2)

def func_base_2(x):
    r = 0
    for item in x['A']:
        if item > 0:
            r /= item
        else:
            r -= item
    return r * np.mean(x['B'])
    
#transfcoder jobs to transform columns name as this
@nb.njit(nb.float32(nb.types.Array(nb.types.float32, 2, 'A', readonly=True)), cache=True, fastmath=False, forceinline=True, looplift=True, inline='always', target_backend='host', nogil=True)
def func_modified(x):
    return np.mean(x[0, :] * x[2, :]) - np.std(x[1, :] / 2)


@nb.njit(nb.float32(nb.types.Array(nb.types.float32, 2, 'A', readonly=True)), cache=True, fastmath=False, forceinline=True, looplift=True, inline='always', target_backend='host', nogil=True)
def func_modified_2(x):
    r = 0
    for item in x[:,0]:
        if item > 0:
            r /= item
        else:
            r -= item
    return r * np.mean(x[1, :])



@nb.njit((nb.float32[:,:], nb.float32[:, :],nb.types.uint32), cache=True, fastmath=True, forceinline=True, looplift=True, inline='always', target_backend='host', nogil=True)
def vl(z, r, i):
    for k in nb.prange(2):
        if k == 0:
            r[i, k] = func_modified(z)
        elif k == 1:
            r[i, k] = func_modified_2(z)

@nb.njit((nb.types.Array(nb.types.float32, 3, 'A', readonly=True), nb.float32[:, :],nb.types.uint32), cache=True, parallel=True, fastmath=True, forceinline=True, looplift=True, inline='always', target_backend='host', no_cfunc_wrapper=True, no_rewrites=True, nogil=True)
def cvlopt(z, r, n):
    for i in nb.prange(n):
        vl(z[i,:,:], r, i)

def vlopt(arr, window):
    cvlopt(sliding_window_view(arr, window, axis=0), r:=np.empty((n:=arr.shape[0]- window + 1, 2), dtype=np.float32), np.uint32(n))
    return r

display(pd.DataFrame(vlopt(df.to_numpy(), window)))


Unnamed: 0,0,1
0,-0.452091,-0.607906
1,-0.425316,-3.129244
2,-0.374303,-0.079888
3,-0.374494,0.006210
4,-0.388132,-0.302323
...,...,...
99966,-0.517406,0.360413
99967,-0.475784,0.297359
99968,-0.518359,0.962592
99969,-0.462582,1.288945
