In [1]:
import pandas as pd
import pandopt as pdo
import numpy as np
import numba as nb
import time

njit=nb.jit(parallel=True, fastmath=True, forceinline=True, looplift=True, inline='always', target_backend='host', no_cfunc_wrapper=True, no_rewrites=True ,nopython=True, nogil=True)

df = pdo.DataFrame(np.random.randn(100_000_000, 4), columns=['A', 'B', 'C', 'D'])#.astype(np.float32)
df['E'] = np.random.randint(0, 250, 100_000_000, dtype=np.int16)


In [2]:
# Define the column-function mapping with custom functions@nb.jit
def jcustom_sum(array):
    return np.sum(array)

def jcustom_mean(array):
    return np.mean(array)

def jcustom_mix(array):
    return np.mix(array)
    
def jcustom_max(array):
    return np.max(array)
    
    
col_func_map = {
    'A': ['sum', 'mean'],
    'B': ['max', 'min'],
    'C': ['sum', 'mean'],
    'D': ['max', 'min'],
}

print('pandas classic')
%timeit df.to_pandas().groupby('E').aggregate(col_func_map)
res=df.to_pandas().groupby('E').aggregate(col_func_map)
res.shape

pandas classic
2.37 s ± 11.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


(250, 8)

In [11]:
%timeit res=df.to_pandas().groupby('E').aggregate(col_func_map, engine='numba', engine_kwargs=dict(parallel=True, fastmath=True, forceinline=True, looplift=True, inline='always', target_backend='host', nopython=True, nogil=True))
res=df.to_pandas().groupby('E').aggregate(col_func_map)
res.shape

1.5 s ± 11.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


(250, 8)

In [3]:
@nb.njit
def jcustom_mean(array):
    return np.mean(array)
    
def grp(df, by, func):
    df['grpid'] = df.groupby([by]).ngroup()
    @njit
    def groupby_apply_njit(data, func):
        """The first column of data is group id. The second is y and the third is x."""
        ngroups = int(data[-1,0])+1   # Number of groups
        nrows = data.shape[0]    # Number of rows
        reslist = []
        istart = 0
        for k in range(ngroups):
            # Find start and end rows of the group
            # (istart point to the start and iend-1 point to the end
            iend = istart + 1
            while iend < nrows and data[iend-1,0] == data[iend,0]:
                iend += 1
            # Apply the function to the numpy array in the group
            res = func(data[istart:iend,1:])
            reslist.append(np.hstack((np.array([k]), res)))
            # Move to the next group
            istart = iend
        return reslist

    
    resarr = groupby_apply_njit(df[['grpid', 'y', 'x']].to_numpy(), func)
    res_ga_njit = df[[by, 'grpid']].drop_duplicates().merge(
        pd.DataFrame(resarr, columns=['grpid', 'const', 'x']), 
        on='grpid', 
        how='left'
    ).set_index([by]).drop(columns=['grpid'])
    return res_ga_njit

grp(df, 'E', jcustom_mean)

KeyError: "['y', 'x'] not in index"

In [7]:
import functools

@nb.jit#(parallel=True, fastmath=True, forceinline=True, looplift=True, inline='always', target_backend='host', no_cfunc_wrapper=True, no_rewrites=True ,nopython=True, nogil=True)
def jcustom_sum(array):
    return np.sum(array)

@nb.jit#(parallel=True, fastmath=True, forceinline=True, looplift=True, inline='always', target_backend='host', no_cfunc_wrapper=True, no_rewrites=True ,nopython=True, nogil=True)
def jcustom_mean(array):
    return np.mean(array)

@nb.jit#(parallel=True, fastmath=True, forceinline=True, looplift=True, inline='always', target_backend='host', no_cfunc_wrapper=True, no_rewrites=True ,nopython=True, nogil=True)
def jcustom_max(array):
    return np.max(array)

@nb.jit#(parallel=True, fastmath=True, forceinline=True, looplift=True, inline='always', target_backend='host', no_cfunc_wrapper=True, no_rewrites=True ,nopython=True, nogil=True)
def jcustom_min(array):
    return np.min(array)

@nb.jit(parallel=True, fastmath=True, forceinline=True, looplift=True, inline='always', target_backend='host', no_cfunc_wrapper=True, no_rewrites=True ,nopython=True, nogil=True)
def reorder(_tmp_array, rawarray, groups):
    n = len(groups)
    farray = np.zeros(len(groups) + 1, dtype=np.int64)
    c = 0
    for j in range(n): #dont put nb.prange here --> counter c wont work 
        m = len(groups[j])
        for i in nb.prange(m):
            _tmp_array[c+i] = rawarray[groups[j][i]]
        c += m
        farray[j + 1] = c
    return farray


def apply(funcs):
    m=len(funcs)
    ffunc = f"""
@njit
def wapply(_tmp_array, result_array, group_sep):
    n = len(group_sep) 
    c = 0
    for j in nb.prange(n):
        select = _tmp_array[group_sep[j]:group_sep[j+1]]
{'\n'.join(f'        result_array[j, {j}]={funcs[j]}(select)' for j in range(m))}
        """
    exec(ffunc, globals())
    return globals()['wapply']


# Define the column-function mapping with custom functions
col_func_map = {
    ('A',): [jcustom_sum, jcustom_mean],
    ('B',): [jcustom_max, jcustom_min],
    ('C',): [jcustom_sum, jcustom_mean],
    ('D',): [jcustom_max, jcustom_min],
}


def aggregate(df, by, col_func_map):
    name = by+functools.reduce(lambda x,y: x[0]+y[0], col_func_map.keys())
    ucols = [key for keys in col_func_map.keys() for key in keys]
    t1 = time.time()
    groups = [g.values for g in df.groupby('E', sort=False).groups.values()]
    array = df[ucols].to_numpy()    
    pdtype = {}
    sfuncs = []
    scols = []
    print(time.time()-t1)
    t1 = time.time()
    dtypes = df.dtypes.to_dict()
    for cols, func_list in col_func_map.items():
        col_idx = [df.colname_to_colnum[col] for col in cols]
        dtype = set(dtypes[col] for col in cols)
        if len(dtype) > 1:
            raise ValueError
        dtype=dtype.pop()
        scols, sfuncs=pdtype.get(dtype, [[], []])
        for func in func_list:
            scols.append(col_idx)
            sfuncs.append(func.__qualname__)
        pdtype[dtype] = [scols, sfuncs]
    print(time.time()-t1)
    t1 = time.time()
    for dtype, (scols, sfuncs_name) in pdtype.items():
        # Getting unique columns to avoid repetition
        unique_cols = set(col for sublist in scols for col in sublist)
        n, k = len(array), len(unique_cols)
        # Create a temporary array for contiguous memory
        _tmp_array = np.zeros((n, k), dtype=dtype, order='C')
        result_array = np.zeros((len(groups), len(sfuncs_name)), dtype=dtype, order='C')
        group_sep = reorder(_tmp_array, rawarray=array, groups=groups)
        if name not in compiled_groupby:
            compiled_groupby[name] = apply(sfuncs_name)
        compiled_groupby[name](_tmp_array, result_array, group_sep)
    print(time.time()-t1)
    t1 = time.time()
    return result_array

%timeit aggregate(df, 'E', col_func_map)
res=aggregate(df, 'E', col_func_map)
res.shape

1.932863473892212
0.00020647048950195312
5.722116708755493
1.7335107326507568
0.0001888275146484375
1.1779735088348389
1.7515480518341064
0.00016236305236816406
1.1637158393859863
1.7893457412719727
0.00016450881958007812
1.2380762100219727
1.7665760517120361
0.00017142295837402344
1.1544365882873535
1.768754005432129
0.0001709461212158203
1.1515378952026367
1.768615961074829
0.00017261505126953125
1.1625056266784668
1.7636134624481201
0.0001685619354248047
1.1485049724578857
3.17 s ± 38.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


(250, 8)

In [9]:
import functools

@nb.jit(parallel=True, fastmath=True, forceinline=True, looplift=True, inline='always', target_backend='host', no_cfunc_wrapper=True, no_rewrites=True ,nopython=True, nogil=True)
def jcustom_sum(array):
    return np.sum(array)

@nb.jit(parallel=True, fastmath=True, forceinline=True, looplift=True, inline='always', target_backend='host', no_cfunc_wrapper=True, no_rewrites=True ,nopython=True, nogil=True)
def jcustom_mean(array):
    return np.mean(array)

@nb.jit(parallel=True, fastmath=True, forceinline=True, looplift=True, inline='always', target_backend='host', no_cfunc_wrapper=True, no_rewrites=True ,nopython=True, nogil=True)
def jcustom_max(array):
    return np.max(array)

@nb.jit(parallel=True, fastmath=True, forceinline=True, looplift=True, inline='always', target_backend='host', no_cfunc_wrapper=True, no_rewrites=True ,nopython=True, nogil=True)
def jcustom_min(array):
    return np.min(array)

@nb.jit(parallel=True, fastmath=True, forceinline=True, looplift=True, inline='always', target_backend='host', no_cfunc_wrapper=True, no_rewrites=True ,nopython=True, nogil=True)
def reordernew(_tmp_array, rawarray, groups):
    n = len(groups)
    farray = np.zeros(len(groups) + 1, dtype=np.int64)
    c = 0
    for j in range(n): #dont put nb.prange here --> counter c wont work 
        m = len(groups[j])
        for i in nb.prange(m):
            _tmp_array[c+i] = rawarray[groups[j][i]]
        c += m
        farray[j + 1] = c
    return farray


def apply(funcs):
    m=len(funcs)
    ffunc = f"""
@njit
def wapply(_tmp_array, result_array, group_sep):
    n = len(group_sep) 
    c = 0
    for j in nb.prange(n):
        select = _tmp_array[group_sep[j]:group_sep[j+1]]
{'\n'.join(f'        result_array[j, {j}]={funcs[j]}(select)' for j in range(m))}
        """
    exec(ffunc, globals())
    return globals()['wapply']


# Define the column-function mapping with custom functions
col_func_map = {
    ('A',): [jcustom_sum, jcustom_mean],
    ('B',): [jcustom_max, jcustom_min],
    ('C',): [jcustom_sum, jcustom_mean],
    ('D',): [jcustom_max, jcustom_min],
}

compiled_groupby = {}

def aggregate(df, by, col_func_map):
    name = by+functools.reduce(lambda x,y: x[0]+y[0], col_func_map.keys())
    ucols = [key for keys in col_func_map.keys() for key in keys]
    groups = list(map(lambda x : x.values, df.groupby('E', sort=False).groups.values()))
    array = df[ucols].to_numpy()    
    pdtype = {}
    sfuncs = []
    scols = []
    dtypes = df.dtypes.to_dict()
    for cols, func_list in col_func_map.items():
        col_idx = [df.colname_to_colnum[col] for col in cols]
        dtype = set(dtypes[col] for col in cols)
        if len(dtype) > 1:
            raise ValueError
        dtype=dtype.pop()
        scols, sfuncs=pdtype.get(dtype, [[], []])
        for func in func_list:
            scols.append(col_idx)
            sfuncs.append(func.__qualname__)
        pdtype[dtype] = [scols, sfuncs]
    t1c=time.time()
    for dtype, (scols, sfuncs_name) in pdtype.items():
        # Getting unique columns to avoid repetition
        unique_cols = set(col for sublist in scols for col in sublist)
        n, k = len(array), len(unique_cols)
        # Create a temporary array for contiguous memory
        _tmp_array = np.zeros((n, k), dtype=dtype, order='C')
        result_array = np.zeros((len(groups), len(sfuncs_name)), dtype=dtype, order='F')
        group_sep = reordernew(_tmp_array, rawarray=array, groups=groups)
        if name not in compiled_groupby:
            compiled_groupby[name] = apply(sfuncs_name)
        compiled_groupby[name](_tmp_array, result_array, group_sep)
    return result_array

%timeit aggregate(df, 'E', col_func_map)

3.16 s ± 18.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [6]:
%timeit groups = np.unique(df['E'].to_numpy().copy(order='C'))
%timeit groups = np.unique(df['E'].to_numpy().copy(order='F'))

1.61 s ± 11 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
1.57 s ± 27.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [7]:
%timeit groups = np.argsort(df['E'].to_numpy().copy(order='C'))
%timeit groups = np.argsort(df['E'].to_numpy().copy(order='F'))

2.28 s ± 15.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
2.33 s ± 32.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [8]:
%timeit groups = np.sort(df['E'].to_numpy().copy(order='C'))
%timeit groups = np.sort(df['E'].to_numpy().copy(order='F'))

1.51 s ± 7.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
1.51 s ± 6.57 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [10]:
%timeit groups = [g.values for g in df.groupby('E').groups.values()]
%timeit groups = [g.values for g in df.groupby('E', sort=False).groups.values()]

1.4 s ± 8.21 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
1.29 s ± 11.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [14]:
%timeit groups = list(map(lambda x : x.values, df.groupby('E', sort=False).groups.values()))

1.31 s ± 7.84 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [16]:
%%timeit
group_idx = np.ascontiguousarray(df['E'].to_numpy().copy(order='C'))
sortidx = np.argsort(group_idx, kind="mergesort")

3.34 s ± 21.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [17]:
%%timeit
group_idx = np.ascontiguousarray(df['E'].to_numpy().copy(order='F'))
sortidx = np.argsort(group_idx, kind="mergesort")

3.35 s ± 19.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [10]:
import scipy
%timeit scipy.ndimage.find_objects(df['E'].to_numpy())

[(slice(12, 99999983, None),),
 (slice(2, 99999995, None),),
 (slice(6, 99999999, None),),
 (slice(0, 100000000, None),),
 (slice(97, 99999959, None),)]

In [None]:
roup_idx = np.ascontiguousarray(group_idx)

sortidx = np.argsort(group_idx, kind="mergesort")
self._jitfunc(sortidx, group_idx, a, ret)

In [15]:
%%timeit
group_idx = np.ascontiguousarray(df['E'].to_numpy().copy(order='C'))
sortidx = np.argsort(group_idx, kind="mergesort")

3.35 s ± 34.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [40]:
@nb.jit(parallel=True, fastmath=True, forceinline=True, looplift=True, inline='always', target_backend='host', no_cfunc_wrapper=True, no_rewrites=True ,nopython=True, nogil=True)
def reordernew(_tmp_array, rawarray, arr, uniq):
    n = len(uniq)
    farray = np.zeros(len(groups) + 1, dtype=np.int32)
    c = 0
    for j in range(n): #dont put nb.prange here --> counter c wont work 
        m = len(groups[j])
        for i in nb.prange(m):
            _tmp_array[c+i] = rawarray[groups[j][i]]
        c += m
        farray[j + 1] = c
    return farray


def apply(funcs):
    m=len(funcs)
    ffunc = f"""
@njit
def wapply(_tmp_array, result_array, group_sep):
    n = len(group_sep) 
    c = 0
    for j in nb.prange(n):
        select = _tmp_array[group_sep[j]:group_sep[j+1]]
{'\n'.join(f'        result_array[j, {j}]={funcs[j]}(select)' for j in range(m))}
        """
    exec(ffunc, globals())
    return globals()['wapply']


SyntaxError: invalid syntax (1481586267.py, line 1)

In [46]:
arr,uniq

(<pyarrow.lib.Int64Array object at 0x7f5c7bfdf100>
 [
   5,
   5,
   5,
   5,
   5,
   5,
   5,
   5,
   5,
   5,
   ...
   25,
   25,
   25,
   25,
   25,
   25,
   25,
   25,
   25,
   25
 ],
 <pyarrow.lib.Int64Array object at 0x7f5c83312d40>
 [
   5,
   10,
   15,
   20,
   25
 ])

In [86]:
import pyarrow as pa

@nb.jit(parallel=True, fastmath=True, forceinline=True, looplift=True, inline='always', target_backend='host', no_cfunc_wrapper=True, no_rewrites=True ,nopython=True, nogil=True)
def reorder(_tmp_array, rawarray, arr, uniq, dtype):
    n = len(arr)
    m = _tmp_array.shape[1]
    c = 0    
    for j in range(1, n): #dont put nb.prange here --> counter c wont work
        for i in nb.prange(m):
            _tmp_array[0, i] = rawarray[c, i]
        if arr[j-1]!=arr[j]:
            uniq[c] = j

arr = pa.array(df['E'].to_numpy()*5).sort()
uniq = arr.unique()

ucols = [key for keys in col_func_map.keys() for key in keys]
array = df[ucols].to_numpy()    
pdtype = {}
sfuncs = []
scols = []
dtypes = df.dtypes.to_dict()
for cols, func_list in col_func_map.items():
    col_idx = [df.colname_to_colnum[col] for col in cols]
    dtype = set(dtypes[col] for col in cols)
    if len(dtype) > 1:
        raise ValueError
    dtype=dtype.pop()
    scols, sfuncs=pdtype.get(dtype, [[], []])
    for func in func_list:
        scols.append(col_idx)
        sfuncs.append(func.__qualname__)
    pdtype[dtype] = [scols, sfuncs]
    
t1c=time.time()
for dtype, (scols, sfuncs_name) in pdtype.items():
    # Getting unique columns to avoid repetition
    unique_cols = set(col for sublist in scols for col in sublist)
    n, k = len(array), len(unique_cols)
    # Create a temporary array for contiguous memory
    result_array = np.zeros((len(groups), len(sfuncs_name)), dtype=dtype, order='F')
    _tmp_array = np.zeros((n, k), dtype=dtype, order='C')
    group_sep = reorder(_tmp_array = _tmp_array, rawarray=array, arr=arr.to_numpy(), uniq=uniq.to_numpy(), dtype=dtype)
    if name not in compiled_groupby:
        compiled_groupby[name] = apply(sfuncs_name)
    #compiled_groupby[name](_tmp_array, result_array, group_sep)
#result_array

TypingError: Failed in nopython mode pipeline (step: nopython frontend)
No implementation of function Function(<built-in function setitem>) found for signature:
 
 >>> setitem(readonly array(int64, 1d, C), Literal[int](0), int64)
 
There are 16 candidate implementations:
      - Of which 14 did not match due to:
      Overload of function 'setitem': File: <numerous>: Line N/A.
        With argument(s): '(readonly array(int64, 1d, C), int64, int64)':
       No match.
      - Of which 2 did not match due to:
      Overload in function 'SetItemBuffer.generic': File: numba/core/typing/arraydecl.py: Line 219.
        With argument(s): '(readonly array(int64, 1d, C), int64, int64)':
       Rejected as the implementation raised a specific error:
         NumbaTypeError: Cannot modify readonly array of type: readonly array(int64, 1d, C)
  raised from /home/remi/.pyenv/versions/3.12.2/lib/python3.12/site-packages/numba/core/typing/arraydecl.py:226

During: typing of setitem at /tmp/ipykernel_613237/198987023.py (12)

File "../../../../tmp/ipykernel_613237/198987023.py", line 12:
<source missing, REPL/exec in use?>


In [64]:
import pyarrow as pa
pa.chunked_array(df['E'].to_numpy())
%timeit groups = [g for g in df.groupby(level=0).groups]

TypeError: 'numpy.int64' object is not iterable

In [101]:
import pyarrow as pa
import pandas as pd
n_legs = pa.array([2, 4, 5, 100])
animals = pa.array(["Flamingo", "Horse", "Brittle stars", "Centipede"])
names = ["n_legs", "animals"]

In [98]:
df = pd.DataFrame({'year': [2020, 2022, 2019, 2021],
                   'n_legs': [2, 4, 5, 100],
                   'animals': ["Flamingo", "Horse", "Brittle stars", "Centipede"]})

In [102]:
my_schema = pa.schema([
    pa.field('n_legs', pa.int64()),
    pa.field('animals', pa.string())],
    metadata={"n_legs": "Number of legs per animal"})

In [100]:
n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]])
animals = pa.chunked_array([["Flamingo", "Parrot", "Dog"], ["Horse", "Brittle stars", "Centipede"]])
table = pa.table([n_legs, animals], names=names)

In [None]:

@nb.jit(parallel=True, fastmath=True, forceinline=True, looplift=True, inline='always', target_backend='host', no_cfunc_wrapper=True, no_rewrites=True ,nopython=True, nogil=True)
def reordernew(_tmp_array, rawarray, groups):
    n = len(groups)
    farray = np.zeros(len(groups) + 1, dtype=np.int64)
    c = 0
    for j in range(n): #dont put nb.prange here --> counter c wont work 
        m = len(groups[j])
        for i in nb.prange(m):
            _tmp_array[c+i] = rawarray[groups[j][i]]
        c += m
        farray[j + 1] = c
    return farray


def apply(funcs):
    m=len(funcs)
    ffunc = f"""
@njit
def wapply(_tmp_array, result_array, group_sep):
    n = len(group_sep) 
    c = 0
    for j in nb.prange(n):
        select = _tmp_array[group_sep[j]:group_sep[j+1]]
{'\n'.join(f'        result_array[j, {j}]={funcs[j]}(select)' for j in range(m))}
        """
    exec(ffunc, globals())
    return globals()['wapply']


# Define the column-function mapping with custom functions
col_func_map = {
    ('A',): [jcustom_sum, jcustom_mean],
    ('B',): [jcustom_max, jcustom_min],
    ('C',): [jcustom_sum, jcustom_mean],
    ('D',): [jcustom_max, jcustom_min],
}

compiled_groupby = {}

def aggregate(df, by, col_func_map):
    name = by+functools.reduce(lambda x,y: x[0]+y[0], col_func_map.keys())
    ucols = [key for keys in col_func_map.keys() for key in keys]
    groups = list(map(lambda x : x.to_numpy(), df.groupby(by).groups.values()))
    array = df[ucols].to_numpy()    
    pdtype = {}
    sfuncs = []
    scols = []
    dtypes = df.dtypes.to_dict()
    for cols, func_list in col_func_map.items():
        col_idx = [df.colname_to_colnum[col] for col in cols]
        dtype = set(dtypes[col] for col in cols)
        if len(dtype) > 1:
            raise ValueError
        dtype=dtype.pop()
        scols, sfuncs=pdtype.get(dtype, [[], []])
        for func in func_list:
            scols.append(col_idx)
            sfuncs.append(func.__qualname__)
        pdtype[dtype] = [scols, sfuncs]
    t1c=time.time()
    for dtype, (scols, sfuncs_name) in pdtype.items():
        # Getting unique columns to avoid repetition
        unique_cols = set(col for sublist in scols for col in sublist)
        n, k = len(array), len(unique_cols)
        # Create a temporary array for contiguous memory
        _tmp_array = np.zeros((n, k), dtype=dtype, order='C')
        result_array = np.zeros((len(groups), len(sfuncs_name)), dtype=dtype, order='F')
        group_sep = reordernew(_tmp_array, rawarray=array, groups=groups)
        if name not in compiled_groupby:
            compiled_groupby[name] = apply(sfuncs_name)
        compiled_groupby[name](_tmp_array, result_array, group_sep)
    return result_array

%timeit aggregate(df, 'E', col_func_map)

In [None]:
# Define the column-function mapping with custom functions@nb.jit
def jcustom_sum(array):
    return np.sum(array)

def jcustom_mean(array):
    return np.mean(array)

def jcustom_max(array):
    return np.max(array)
    
col_func_map = {
    'A': [jcustom_sum, jcustom_mean],
    'B': [jcustom_max],
}
print('pandas classic')
%timeit df.to_pandas().groupby('E').aggregate(col_func_map)


@njit
def jcustom_sum(array):
    return np.sum(array)

@njit
def jcustom_mean(array):
    return np.mean(array)

@njit
def jcustom_max(array):
    return np.max(array)

@njit
def reorder(_tmp_array, rawarray, groups):
    n = len(groups)
    farray = np.zeros(len(groups) + 1, dtype=np.int64)
    c = 0
    for j in range(n): #dont put nb.prange here --> counter c wont work 
        m = len(groups[j])
        for i in nb.prange(m):
            _tmp_array[c+i] = rawarray[groups[j][i]]
        c += m
        farray[j + 1] = c
    return farray



def apply(funcs):
    m=len(funcs)
    ffunc = f"""
@njit
def wapply(_tmp_array, result_array, group_sep):
    n = len(group_sep) 
    c = 0
    for j in nb.prange(n):
        select = _tmp_array[group_sep[j]:group_sep[j+1]]
{'\n'.join(f'        result_array[j, {j}]={funcs[j]}(select)' for j in range(m))}
        """
    exec(ffunc, globals())
    return globals()['wapply']


# Define the column-function mapping with custom functions
col_func_map = {
    ('A',): [jcustom_sum, jcustom_mean],
    ('B',): [jcustom_max],
}

compiled_groupby = {}

def aggregate(df, col_func_map, name):
    t1=time.time()
    ucols = [key for keys in col_func_map.keys() for key in keys]
    
    groups = np.unique(df['E'].to_numpy(),return_inverse=True)
    array = df[ucols].to_numpy()
    t1=time.time()
    pdtype = {}
    sfuncs = []
    scols = []
    dtypes = df.dtypes.to_dict()
    for cols, func_list in col_func_map.items():
        col_idx = [df.colname_to_colnum[col] for col in cols]
        dtype = set(dtypes[col] for col in cols)
        if len(dtype) > 1:
            raise ValueError
        dtype=dtype.pop()
        scols, sfuncs=pdtype.get(dtype, [[], []])
        for func in func_list:
            scols.append(col_idx)
            sfuncs.append(func.__qualname__)
        pdtype[dtype] = [scols, sfuncs]
    
    for dtype, (scols, sfuncs_name) in pdtype.items():
        # Getting unique columns to avoid repetition
        unique_cols = set(col for sublist in scols for col in sublist)
        n, k = len(array), len(unique_cols)
        
        # Create a temporary array for contiguous memory
        _tmp_array = np.zeros((n, k), dtype=dtype, order='C')
        result_array = np.zeros((len(groups), len(sfuncs_name)), dtype=dtype, order='F')
        t2=time.time()
        group_sep = reorder(_tmp_array, rawarray=array[:, list(unique_cols)], groups=groups)
        t3=time.time()
        if name not in compiled_groupby:
            compiled_groupby[name] = apply(sfuncs_name)
        compiled_groupby[name](_tmp_array, result_array, group_sep)
    t4=time.time()
    print(t2-t1, t3-t2, t4-t3)
    return result_array

name = "cacaoweb3"
%timeit aggregate(df, col_func_map, name)

In [38]:

%timeit groups = np.unique(df['E'].to_numpy(),return_inverse=True)
%timeit groups = df.groupby('E').groups.values()

569 ms ± 2.42 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
203 ms ± 359 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [41]:
groups = df.groupby('E').groups.values()


TypeError: 'dict_values' object is not subscriptable

In [29]:
_, g1 = np.unique(np.ravel_multi_index((arr_slice:=df[['E']].values).T,arr_slice.max(0)+1),return_inverse=True)
g2 = np.unique(df['E'].values,return_inverse=True)

In [31]:
g

array([2, 3, 3, ..., 3, 2, 2])

In [27]:

%timeit groups = np.unique(np.ravel_multi_index(arr_slice.T,arr_slice.max(0)+1),return_inverse=True)
%timeit groups = np.unique(df['E'].values,return_inverse=True)


TypeError: unique() got an unexpected keyword argument 'lidx'

In [None]:
group.get_group(1)

In [19]:
a = set((5,))
a = a.pop()
a

5

In [11]:
t1=time.time()

groups = [group.to_numpy() for group in df.groupby('E').groups.values()]

pdtype = {}
sfuncs = []
scols = []
for cols, func_list in col_func_map.items():
    col_idx = [df.columns.get_loc(col) for col in cols]
    dtype = df.dtypes.loc[list(cols)]
    if dtype.nunique() > 1:
        raise ValueError
    dtype=dtype.iloc[0]
    scols, sfuncs=pdtype.get(dtype, [[], []])
    for func in func_list:
        scols.append(col_idx)
        sfuncs.append(func)
    pdtype[dtype] = [scols, sfuncs]

t1b=time.time()


['A', 'B']

In [3]:

def aggregate(df, col_func_map, name):
    t1=time.time()
    
    groups = [group.to_numpy() for group in df.groupby('E').groups.values()]
    
    pdtype = {}
    sfuncs = []
    scols = []
    for cols, func_list in col_func_map.items():
        col_idx = [df.columns.get_loc(col) for col in cols]
        dtype = df.dtypes.loc[list(cols)]
        if dtype.nunique() > 1:
            raise ValueError
        dtype=dtype.iloc[0]
        scols, sfuncs=pdtype.get(dtype, [[], []])
        for func in func_list:
            scols.append(col_idx)
            sfuncs.append(func)
        pdtype[dtype] = [scols, sfuncs]
    
    t1b=time.time()
    
    for dtype, (scols, sfuncs) in pdtype.items():
        # Getting unique columns to avoid repetition
        unique_cols = tuple(set(col for sublist in scols for col in sublist))
        array = df[unique_cols].to_numpy()
        n, k = len(array), len(unique_cols)
        
        # Create a temporary array for contiguous memory
        _tmp_array = np.zeros((n, k), dtype=dtype, order='C')
        result_array = np.zeros((len(groups), len(sfuncs)), dtype=dtype, order='C')
        t2=time.time()
        group_sep = reorder(_tmp_array, rawarray=array[:, list(unique_cols)], groups=groups)
        t3=time.time()
        if name not in compiled_groupby:
            compiled_groupby[name] = apply(sfuncs)
        compiled_groupby[name](_tmp_array, result_array, group_sep)
    t4=time.time()
    print(t1b-t1, t2-t1b, t3-t2, t4-t3)
    return result_array

name = "cacaoweb"
%timeit aggregate(df, col_func_map, name)

KeyError: (0, 1)

In [3]:
import pandopt as pdo
import pandas as pd
import numpy as np
dfa=pd.DataFrame(np.random.randn(10000, 4), columns=['A', 'B', 'C', 'D'])
dfa['E'] = dfa['A'].apply(lambda x: 1 if x <-2 else 2 if x <-1 else 3 if x<0 else 4 if x<2 else 5)
dfa.loc[dfa.groupby('E').groups[1],:]

             A         B         C         D  E
79   -2.038734 -1.232376 -0.380578  0.448743  1
123  -2.493969 -0.799089  1.182274 -0.606533  1
191  -2.406897  1.235835  0.950062 -0.704273  1
195  -2.093371  0.762015 -1.655454  0.327814  1
209  -2.661153  0.904761 -0.809708  0.480497  1
...        ...       ...       ...       ... ..
9776 -2.321040 -1.319001  0.628674 -0.288920  1
9813 -2.896840 -0.671568  0.288008  0.974226  1
9831 -2.493081 -1.314321 -1.240508 -1.174425  1
9948 -2.731723 -0.321066  1.050955 -0.551725  1
9949 -2.094642  0.042306 -0.306481  0.117431  1

[241 rows x 5 columns]

In [4]:
df = pdo.DataFrame(np.random.randn(10000, 4), columns=['A', 'B', 'C', 'D'])#.astype(np.float32)
df['E'] = df['A'].apply(lambda x: 1 if x <-2 else 2 if x <-1 else 3 if x<0 else 4 if x<2 else 5)
groups = df.groupby('E').groups

A
E


In [None]:
{[col_names]: [funcs], ...}


In [None]:
def slice(array, func):
    ...

def optigroup(array, slices, jitfuncs):
    ...    

def group_result(array, groups, funcs):
    result = np.empty((len(groups), len(funcs)))
    

In [5]:
import numpy as np
import numba as nb

# This function slices the array based on the provided indices
def slice(array, indices):
    return array[indices]

# This function organizes the array slices and functions
def optigroup(array, slices, jitfuncs):
    results = []
    for indices in slices:
        group = slice(array, indices)
        group_results = [func(group) for func in jitfuncs]
        results.append(group_results)
    return results


@nb.jit(parallel=True, fastmath=True, forceinline=True, looplift=True, inline='always', target_backend='host', no_cfunc_wrapper=True, no_rewrites=True ,nopython=True, nogil=True)
def group_result(array, groups, agg_sum, agg_mean):
    result = np.empty((len(groups), 2))  # Assuming 2 functions: agg_sum and agg_mean
    for i in prange(len(groups)):
        group_array = array[groups[i]]
        result[i, 0] = agg_sum(group_array)  # Apply agg_sum
        result[i, 1] = agg_mean(group_array)  # Apply agg_mean
    return result
    
def agg_sum(x):
    return np.sum(x)

def agg_mean(x):
    return np.mean(x)

njit=nb.jit(parallel=True, fastmath=True, forceinline=True, looplift=True, inline='always', target_backend='host', no_cfunc_wrapper=True, no_rewrites=True ,nopython=True, nogil=True)
# Example usage:
df = pdo.DataFrame(np.random.randn(1000000, 4), columns=['A', 'B', 'C', 'D'])#.astype(np.float32)
df['E'] = df['A'].apply(lambda x: 1 if x <-2 else 2 if x <-1 else 3 if x<0 else 4 if x<2 else 5)
array = df[['A', 'B', 'C', 'D']].to_numpy()
groups = [group.to_numpy() for group in df.groupby('E').groups.values()]
jit_agg_sum = njit(agg_sum)
jit_agg_mean = njit(agg_mean)

%timeit results = optigroup(array, groups, [jit_agg_sum, jit_agg_mean])
%timeit parallel_results = group_result(array, groups, jit_agg_sum, jit_agg_mean)


A
['A', 'B', 'C', 'D']
E
31.4 ms ± 1.01 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


TypingError: Failed in nopython mode pipeline (step: nopython frontend)
Untyped global name 'prange': Cannot determine Numba type of <class 'numba.core.ir.UndefinedType'>

File "../../../../tmp/ipykernel_1473150/657794856.py", line 21:
<source missing, REPL/exec in use?>

'prange' looks like a Numba internal function, has it been imported (i.e. 'from numba import prange')?


In [None]:
array = df[['A']].to_numpy()
groups = [group.to_numpy() for group in df.groupby('E').groups.values()]
jit_agg_sum = njit(agg_sum)
jit_agg_mean = njit(agg_mean)

%timeit results = optigroup(array, groups, [jit_agg_sum, jit_agg_mean])
%timeit parallel_results = group_result(array, groups, jit_agg_sum, jit_agg_mean)

In [35]:
dfa=df.to_pandas()
%timeit dfa.groupby('E')[['A']].aggregate(['mean', 'sum'])

11.7 ms ± 117 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [138]:
array[tuple((list(groups[0]), tuple(0,1)))]

TypeError: tuple expected at most 1 argument, got 2

In [153]:
set(*sc if type(sc)==list else sc for sc in scalcs)

SyntaxError: iterable unpacking cannot be used in comprehension (1991996339.py, line 1)

In [104]:

@nb.jit
def jcustom_sum(array):
    return np.sum(array)

@nb.jit
def jcustom_mean(array):
    return np.mean(array)

@nb.jit
def jcustom_max(array):
    return np.max(array)

@nb.jit(nopython=True, parallel=True)
def reorder(_tmp_array, rawarray, groups):
    n = len(groups)
    farray = np.zeros(len(groups) + 1, dtype=np.int64)
    c = 0
    for j in range(n): #dont put nb.prange here --> counter c wont work 
        m = len(groups[j])
        for i in nb.prange(m):
            _tmp_array[c+i] = rawarray[groups[j][i]]
        c += m
        farray[j + 1] = c
    return farray



def apply(funcs):
    m=len(sfuncs)
    ffunc = f"""
@nb.jit(nopython=True, parallel=True)
def wapply(_tmp_array, result_array, group_sep):
    n = len(group_sep) 
    c = 0
    for j in nb.prange(n):
        select = _tmp_array[group_sep[j]:group_sep[j+1]]
{'\n'.join(f'        result_array[j, {j}]={sfuncs[j].__qualname__}(select)' for j in range(m))}
        """
    exec(ffunc, globals())
    return globals()['wapply']


# Define the column-function mapping with custom functions
col_func_map = {
    ('A',): [jcustom_sum, jcustom_mean],
    ('B',): [jcustom_max],
}

df = pdo.DataFrame(np.random.randn(1000000, 4), columns=['A', 'B', 'C', 'D'])#.astype(np.float32)
df['E'] = df['A'].apply(lambda x: 1 if x <-2 else 2 if x <-1 else 3 if x<0 else 4 if x<2 else 5)
array = df[['A', 'B', 'C', 'D']].to_numpy()
groups = [group.to_numpy() for group in df.groupby('E').groups.values()]

pdtype = {}
sfuncs = []
scols = []
for cols, func_list in col_func_map.items():
    col_idx = [df.columns.get_loc(col) for col in cols]
    dtype = df.dtypes.loc[list(cols)]
    if dtype.nunique() > 1:
        raise ValueError
    dtype=dtype.iloc[0]
    scols, sfuncs=pdtype.get(dtype, [[], []])
    for func in func_list:
        scols.append(col_idx)
        sfuncs.append(func)
    pdtype[dtype] = [scols, sfuncs]



for dtype, (scols, sfuncs) in pdtype.items():
    # Getting unique columns to avoid repetition
    unique_cols = set(col for sublist in scols for col in sublist)
    n, k = len(array), len(unique_cols)
    
    # Create a temporary array for contiguous memory
    _tmp_array = np.zeros((n, k), dtype=dtype, order='C')
    result_array = np.zeros((len(groups), len(sfuncs)), dtype=dtype, order='F')
    group_sep = reorder(_tmp_array, rawarray=array[:, list(unique_cols)], groups=groups)
    apply(sfuncs)(_tmp_array, result_array, group_sep)
    # Assuming 'groups' is a list of indices for each group
    
print(result_array)



A
['A', 'B', 'C', 'D']
E
[[-5.41751697e+04 -1.18982627e+00  4.05851440e+00]
 [-1.87652056e+05 -6.92238659e-01  4.43131352e+00]
 [-1.56867612e+05 -2.29648842e-01  4.72470058e+00]
 [ 3.45192172e+05  3.61610746e-01  4.64376796e+00]
 [ 5.41531045e+04  1.18455474e+00  4.57548134e+00]]


A
pandas classic
53.3 ms ± 203 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
['A', 'B', 'C', 'D']
E
['A', 'B', 'C', 'D']
E
['A', 'B', 'C', 'D']
E
['A', 'B', 'C', 'D']
E
['A', 'B', 'C', 'D']
E
['A', 'B', 'C', 'D']
E
['A', 'B', 'C', 'D']
E
['A', 'B', 'C', 'D']
E
36 ms ± 2.96 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
@nb.jit
def jcustom_sum(array):
    return np.sum(array)

@nb.jit
def jcustom_mean(array):
    return np.mean(array)

@nb.jit
def jcustom_max(array):
    return np.max(array)

@nb.jit(nopython=True, parallel=True)
def reorder(_tmp_array, rawarray, groups):
    n = len(groups)
    farray = np.zeros(len(groups) + 1, dtype=np.int64)
    c = 0
    for j in range(n): #dont put nb.prange here --> counter c wont work 
        m = len(groups[j])
        for i in nb.prange(m):
            _tmp_array[c+i] = rawarray[groups[j][i]]
        c += m
        farray[j + 1] = c
    return farray



def apply(funcs):
    m=len(sfuncs)
    ffunc = f"""
@nb.jit(nopython=True, parallel=True)
def wapply(_tmp_array, result_array, group_sep):
    n = len(group_sep) 
    c = 0
    for j in nb.prange(n):
        select = _tmp_array[group_sep[j]:group_sep[j+1]]
{'\n'.join(f'        result_array[j, {j}]={sfuncs[j].__qualname__}(select)' for j in range(m))}
        """
    exec(ffunc, globals())
    return globals()['wapply']


# Define the column-function mapping with custom functions
col_func_map = {
    ('A',): [jcustom_sum, jcustom_mean],
    ('B',): [jcustom_max],
}

df = pdo.DataFrame(np.random.randn(1000000, 4), columns=['A', 'B', 'C', 'D'])#.astype(np.float32)
df['E'] = df['A'].apply(lambda x: 1 if x <-2 else 2 if x <-1 else 3 if x<0 else 4 if x<2 else 5)
array = df[['A', 'B', 'C', 'D']].to_numpy()
groups = [group.to_numpy() for group in df.groupby('E').groups.values()]

pdtype = {}
sfuncs = []
scols = []
for cols, func_list in col_func_map.items():
    col_idx = [df.columns.get_loc(col) for col in cols]
    dtype = df.dtypes.loc[list(cols)]
    if dtype.nunique() > 1:
        raise ValueError
    dtype=dtype.iloc[0]
    scols, sfuncs=pdtype.get(dtype, [[], []])
    for func in func_list:
        scols.append(col_idx)
        sfuncs.append(func)
    pdtype[dtype] = [scols, sfuncs]



for dtype, (scols, sfuncs) in pdtype.items():
    # Getting unique columns to avoid repetition
    unique_cols = set(col for sublist in scols for col in sublist)
    n, k = len(array), len(unique_cols)
    
    # Create a temporary array for contiguous memory
    _tmp_array = np.zeros((n, k), dtype=dtype, order='C')
    result_array = np.zeros((len(groups), len(sfuncs)), dtype=dtype, order='F')
    group_sep = reorder(_tmp_array, rawarray=array[:, list(unique_cols)], groups=groups)
    apply(sfuncs)(_tmp_array, result_array, group_sep)
    # Assuming 'groups' is a list of indices for each group
    
print(result_array)

In [28]:
@nb.jit(nopython=True, parallel=True)
def apply(_tmp_array, result_array, group_sep, func_keys):
    n = len(group_sep) - 1
    m = len(func_keys)
    for j in nb.prange(n):
        select = _tmp_array[group_sep[j]:group_sep[j+1]]
        for k in nb.prange(m):
            func_key = func_keys[k]
            result_array[j, k] = func_dict[func_key](select)

'jcustom_sum'

In [86]:
m=len(sfuncs)
ffunc = f"""
@nb.jit(nopython=True, parallel=True)
def wapply(_tmp_array, result_array, group_sep):
    n = len(group_sep) 
    c = 0
    for j in nb.prange(n):
        select = _tmp_array[group_sep[j]:group_sep[j+1]]
{'\n'.join(f'        result_array[j, {j}]={sfuncs[j]}(select)' for j in range(m))}
        """

print(result_array)
exec(ffunc, globals())
globals()['wapply'](_tmp_array, result_array, group_sep)
result_array

[[-1.10320418e+02 -4.83119852e-03  4.38267714e+00]
 [ 2.00368725e+02  1.46943138e-03  4.13991635e+00]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00]]


array([[-1.10320418e+02, -4.83119852e-03,  4.38267714e+00],
       [ 2.00368725e+02,  1.46943138e-03,  4.13991635e+00],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00]])

In [87]:
_tmp_array

array([[0.],
       [0.],
       [0.],
       ...,
       [0.],
       [0.],
       [0.]])

In [76]:
def wapply(result_array, _tmp_array, group_sep):
    n = len(group_sep) - 1
    c = 0
    for j in nb.prange(n):
        select = _tmp_array[group_sep[j]:group_sep[j+1]]
        print(j,group_sep[j], group_sep[j+1], select)
        result_array[:, 0]=jcustom_sum(select)
        result_array[:, 1]=jcustom_mean(select)
        result_array[:, 2]=jcustom_max(select)

wapply(result_array, _tmp_array, group_sep)

0 0 22835 [[-0.31432851]
 [ 2.45626533]
 [ 0.15953227]
 ...
 [-2.39505003]
 [ 1.31338695]
 [-0.03802477]]
1 22835 159193 [[ 0.20105452]
 [ 0.06233486]
 [-1.05581198]
 ...
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]]
2 159193 500937 [[0.]
 [0.]
 [0.]
 ...
 [0.]
 [0.]
 [0.]]
3 500937 977327 [[0.]
 [0.]
 [0.]
 ...
 [0.]
 [0.]
 [0.]]
4 977327 1000000 [[0.]
 [0.]
 [0.]
 ...
 [0.]
 [0.]
 [0.]]


In [54]:
groups

[array([    12,     82,    124, ..., 999906, 999964, 999998]),
 array([     1,      2,      3, ..., 999996, 999997, 999999]),
 array([     4,      5,     10, ..., 999987, 999990, 999995]),
 array([     0,      6,      8, ..., 999992, 999993, 999994]),
 array([    72,    129,    148, ..., 999598, 999629, 999862])]

In [73]:

@nb.jit
def jcustom_sum(array):
    return np.sum(array)

@nb.jit
def jcustom_mean(array):
    return np.mean(array)

@nb.jit
def jcustom_max(array):
    return np.max(array)

@nb.jit(nopython=True, parallel=True)
def reorder(_tmp_array, rawarray, groups):
    n = len(groups)
    farray = np.zeros(len(groups) + 1, dtype=np.int64)
    c = 0
    for j in range(n): #dont put nb.prange here --> counter c wont work 
        m = len(groups[j])
        for i in nb.prange(m):
            _tmp_array[c] = rawarray[groups[j][i]]
            c += 1
        farray[j + 1] = c
    return farray


# Dictionary of JIT-compiled functions
func_dict = {
    'sum': jcustom_sum,
    'mean': jcustom_mean,
    'max': jcustom_max
}

@nb.jit(nopython=True, parallel=True)
def apply(_tmp_array, result_array, group_sep, func_keys):
    n = len(group_sep) - 1
    m = len(func_keys)
    for j in nb.prange(n):
        select = _tmp_array[group_sep[j]:group_sep[j+1]]
        for k in nb.prange(m):
            func_key = func_keys[k]
            result_array[j, k] = func_dict[func_key](select)



pdtype = {}
sfuncs = []
scols = []
for cols, func_list in col_func_map.items():
    # Your existing code to get col_idx and dtype
    scols, sfuncs = pdtype.get(dtype, [[], []])
    for func in func_list:
        scols.append(col_idx)
        sfuncs.append(func.__name__)  # Append the function name as a string
    pdtype[dtype] = [scols, sfuncs]

for dtype, (scols, sfuncs) in pdtype.items():
    # Getting unique columns to avoid repetition
    unique_cols = set(col for sublist in scols for col in sublist)
    n, k = len(array), len(unique_cols)
    
    # Create a temporary array for contiguous memory
    _tmp_array = np.zeros((n, k), dtype=dtype, order='C')
    result_array = np.zeros((len(groups), len(sfuncs)), dtype=dtype, order='F')
    group_sep = reorder(_tmp_array, rawarray=array[:, list(unique_cols)], groups=groups)
    print(group_sep)
    apply(_tmp_array, result_array, group_sep, sfuncs)
    # Your existing result handling code


[      0   22835  159193  500937  977327 1000000]


In [79]:
pdtype

{dtype('float64'): [[[1], [1], [1]],
  ['jcustom_sum', 'jcustom_mean', 'jcustom_max']]}

In [12]:
import numpy as np
import pandas as pd
import numba as nb
x = np.zeros((10000, 4), dtype=np.float32, order='F')



KeyError: 'Can only index numba types with slices with no start or stop, got (slice(None, None, 10000), slice(None, 4, None)).'

In [84]:
import numpy as np
import pandas as pd
import numba as nb

# Helper function to slice the array based on provided indices
def slice(array, indices):
    return array[indices]

@nb.jit(nb.types.float32(nb.types.float32[:]), nopython=True)
def jcustom_sum(array):
    return np.sum(array)

@nb.jit(nb.types.float32(nb.types.float32[:]), nopython=True)
def jcustom_mean(array):
    return np.mean(array)

@nb.jit(nb.types.float32(nb.types.float32[:]), nopython=True)
def jcustom_max(array):
    return np.max(array)

col_func_map = {
    'A': [jcustom_sum, jcustom_mean],
    'B': [jcustom_max],
    ('A', 'B'): ['max']
    # Add more mappings as needed
}

# Define a single aggregation function with a flag
@nb.jit(nb.types.float32(nb.types.float32[:], nb.types.unicode_type), nopython=True)
def aggregate(array, operation):
    if operation == 0:
        return jcustom_sum(array)
    elif operation == 1:
        return jcustom_mean(array)
    elif operation == 2:
        return jcustom_max(array)
    # Add more operations as needed
    else:
        raise ValueError("Unsupported operation")


# Modified group_result to handle multiple columns and functions
@nb.njit(parallel=True)
def group_result(array, groups, column_indices):
    num_groups = len(groups)
    num_funcs = len(funcs)
    result = np.empty((num_groups, len(column_indices) * num_funcs)).astype(np.float32)

    for j, col_idx in enumerate(column_indices):
        trunc_col = array[:, col_idx]
        for k, func in enumerate(funcs[j]):
            for i, gidx in enumerate(num_groups):
                result_idx = j * num_funcs + k
                result[i, result_idx] = aggregate(trunc_col[gidx], func)

    return result

# Example usage:
df = pd.DataFrame(np.random.randn(100000, 4), columns=['A', 'B', 'C', 'D']).astype(np.float32)
df['E'] = df['A'].apply(lambda x: 1 if x < -2 else 2 if x < -1 else 3 if x < 0 else 4 if x < 2 else 5)
array = df.to_numpy()

# Group indices
groups = [group.to_numpy() for group in df.groupby('E').groups.values()]

# Define the column-function mapping with custom functions
col_func_map = {
    'A': [jcustom_sum, jcustom_mean],
    'B': [jcustom_max]
    # Add more mappings as needed
}

# Define the column-function mapping with custom functions
col_func_map = {
    'A': ['max', 'mean'],
    'B': ['sum']
    # Add more mappings as needed
}

# Prepare column indices and functions for JIT
column_indices = []
funcs = []

for col, func_list in col_func_map.items():
    col_idx = np.int8(df.columns.get_loc(col))
    column_indices.append(col_idx)
    # jit_funcs = [nb.jit(f) for f in func_list]
    funcs.append(func_list)

# Execute the computation
# result = group_result(array, groups, column_indices, funcs)
array = array.astype(np.float32)
# Timing the operation
result = group_result(array, groups, column_indices, funcs)


A
E


TypingError: Failed in nopython mode pipeline (step: nopython frontend)
No implementation of function Function(<class 'enumerate'>) found for signature:
 
 >>> enumerate(int64)
 
There are 2 candidate implementations:
  - Of which 2 did not match due to:
  Overload of function 'enumerate': File: numba/core/typing/builtins.py: Line 1042.
    With argument(s): '(int64)':
   No match.

During: resolving callee type: Function(<class 'enumerate'>)
During: typing of call at /tmp/ipykernel_216863/2214971155.py (51)


File "../../../../tmp/ipykernel_216863/2214971155.py", line 51:
<source missing, REPL/exec in use?>


In [77]:
import numpy as np
import pandas as pd
import numba as nb


@nb.jit(nb.types.float32(nb.types.float32[:]), nopython=True)
def jcustom_sum(array):
    return np.sum(array)

@nb.jit(nb.types.float32(nb.types.float32[:]), nopython=True)
def jcustom_mean(array):
    return np.mean(array)

@nb.jit(nb.types.float32(nb.types.float32[:]), nopython=True)
def jcustom_max(array):
    return np.max(array)



[0, 1]

In [None]:
def agg_sum(x):
    return np.sum(x)


df.apply(agg_sum, axis=0)

In [None]:
df.apply(lambda x: np.sum(x))

In [None]:
df.apply(lambda x: np.sum(x))

In [None]:
df.groupby('A').apply(agg_sum)

In [None]:
df = pdo.DataFrame(np.random.randn(10000, 4), columns=['A', 'B', 'C', 'D'])#.astype(np.float32)
type(df)

In [None]:
type(df[['A', 'B']])

In [None]:
df.dtypes

In [None]:
print(type(df), type(df[['A']]))
type(df.apply(np.sum))

In [None]:
def qqq(x):
    return np.sum(x)

def zzz(x):
    return x["A"]

df.apply(zzz, axis=1)

#df[['A', 'B']].apply(apb, axis=1)


In [None]:
    arr_slice = df[['Client', 'Month']].values
    lidx = np.ravel_multi_index(arr_slice.T,arr_slice.max(0)+1)
    unq,unqtags,counts = np.unique(lidx,return_inverse=True,return_counts=True)
    df["Nbcontrats"] = counts[unqtags]

In [None]:
df=pa,das

In [None]:
    arr_slice = df[['Client', 'Month']].values
    lidx = np.ravel_multi_index(arr_slice.T,arr_slice.max(0)+1)
    unq,unqtags,counts = np.unique(lidx,return_inverse=True,return_counts=True)
    df["Nbcontrats"] = counts[unqtags]