In [17]:
import numpy as np
import pandas as pd
from tqdm import tqdm

In [18]:
data = np.random.randint(1, 100, size=(100000, 3))
df = pd.DataFrame(data, columns=['a', 'b', 'c'])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype
---  ------  --------------   -----
 0   a       100000 non-null  int64
 1   b       100000 non-null  int64
 2   c       100000 non-null  int64
dtypes: int64(3)
memory usage: 2.3 MB


In [19]:
df.head()

Unnamed: 0,a,b,c
0,81,56,93
1,33,78,91
2,39,78,84
3,6,34,38
4,40,76,83


In [20]:
grouped = df.groupby(['b', 'c'], sort=False)['a'].agg(['median', 'mean', 'min', 'max', 'std', 'count'])
grouped.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,median,mean,min,max,std,count
b,c,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
56,93,77.5,71.0,25,92,21.672893,8
78,91,44.0,48.6,22,98,24.107168,10
78,84,59.0,54.272727,3,93,28.820447,11
34,38,18.0,27.625,6,95,30.170646,8
76,83,55.5,50.833333,3,95,35.626429,12


In [21]:
dict(grouped.loc[(1, 1)])

{'median': 35.5,
 'mean': 44.416666666666664,
 'min': 2.0,
 'max': 95.0,
 'std': 32.56729629365612,
 'count': 12.0}

In [22]:
quantiles = df.quantile([i / 10 for i in range(1, 10)], interpolation='linear')
quantiles.head(10)

Unnamed: 0,a,b,c
0.1,10.0,10.0,10.0
0.2,20.0,20.0,20.0
0.3,30.0,30.0,30.0
0.4,40.0,40.0,40.0
0.5,50.0,50.0,50.0
0.6,60.0,60.0,60.0
0.7,70.0,70.0,70.0
0.8,80.0,80.0,80.0
0.9,90.0,90.0,90.0


In [23]:
dict(quantiles.loc[0.2])

{'a': 20.0, 'b': 20.0, 'c': 20.0}

In [24]:
rows = []
for t in tqdm(df.itertuples()):
    row = t._asdict()
    g = grouped.loc[(row['b'], row['c'])]
    row['median'] = g['median']
    row['mean'] = g['mean']
    row['min'] = g['min']
    row['max'] = g['max']
    row['std'] = g['std']
    rows.append(row)

100000it [00:18, 5432.72it/s]


In [25]:
%%time
res = pd.DataFrame.from_records(rows)
# prevent division-by-zero error
eps = np.finfo(np.float32).eps
res['std'].fillna(eps, inplace=True)
res.set_index('Index', drop=True, inplace=True)
res.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100000 entries, 0 to 99999
Data columns (total 8 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   a       100000 non-null  int64  
 1   b       100000 non-null  int64  
 2   c       100000 non-null  int64  
 3   median  100000 non-null  float64
 4   mean    100000 non-null  float64
 5   min     100000 non-null  float64
 6   max     100000 non-null  float64
 7   std     100000 non-null  float64
dtypes: float64(5), int64(3)
memory usage: 6.9 MB
CPU times: user 219 ms, sys: 62.5 ms, total: 281 ms
Wall time: 288 ms


In [26]:
res.head()

Unnamed: 0_level_0,a,b,c,median,mean,min,max,std
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,81,56,93,77.5,71.0,25.0,92.0,21.672893
1,33,78,91,44.0,48.6,22.0,98.0,24.107168
2,39,78,84,59.0,54.272727,3.0,93.0,28.820447
3,6,34,38,18.0,27.625,6.0,95.0,30.170646
4,40,76,83,55.5,50.833333,3.0,95.0,35.626429
