In [17]:
# https://dplyr.tidyverse.org/reference/summarise.html

from datar.datasets import mtcars, starwars
from datar.all import *

# Execute the local helper script in this kernel.
# NOTE(review): `nb_header` is presumably defined there -- confirm in nb_helpers.py.
%run nb_helpers.py
# NOTE(review): appears to render the docstring of `summarise` as the markdown
# shown in this cell's output -- confirm against nb_helpers.py.
nb_header(summarise)



### # summarise  

##### Summarise each group to fewer rows

See https://dplyr.tidyverse.org/reference/summarise.html  

Both input and the summarised data can be recycled, but separately.  

Aliases - `summarize`  

##### Examples:
&emsp;&emsp;>>> df = tibble(x=[1,2,3,4])  
&emsp;&emsp;>>> df >> summarise(y=sum(f.x), z=f.y*2)  
&emsp;&emsp;>>> #   y  z  
&emsp;&emsp;>>> # 0 10 20  
&emsp;&emsp;>>> df >> summarise(y=sum(f.x), z=f.x+f.y) # fail  

&emsp;&emsp;However, input columns and summarised values should not be mixed in a later argument. For example:  
&emsp;&emsp;>>> df = tibble(x=[1,2,3,4], g=list('aabb')) >> group_by(f.g)  
&emsp;&emsp;>>> df >> summarise(n=n() + f.x)  
&emsp;&emsp;>>> # as expected:  
&emsp;&emsp;>>> #    g  n  
&emsp;&emsp;>>> # 0  a  3  
&emsp;&emsp;>>> # 1  a  4  
&emsp;&emsp;>>> # 2  b  5  
&emsp;&emsp;>>> # 3  b  6  
&emsp;&emsp;>>> # [Groups: ['g'] (n=2)]  
&emsp;&emsp;>>> # However:  
&emsp;&emsp;>>> df >> summarise(y=1, n=n() + f.y)  
&emsp;&emsp;>>> # n() is now evaluated on the summarised output instead of the input  
&emsp;&emsp;>>> #    g  y  n  
&emsp;&emsp;>>> # 0  a  1  2  
&emsp;&emsp;>>> # 1  b  1  2  

##### Args:
&emsp;&emsp;`_groups`: Grouping structure of the result.  
&emsp;&emsp;&emsp;&emsp;- "drop_last": dropping the last level of grouping.

&emsp;&emsp;&emsp;&emsp;- "drop": All levels of grouping are dropped.

&emsp;&emsp;&emsp;&emsp;- "keep": Same grouping structure as _data.

&emsp;&emsp;&emsp;&emsp;- "rowwise": Each row is its own group.

&emsp;&emsp;`*args`, `**kwargs`: Name-value pairs, where each value is the summarized  
&emsp;&emsp;&emsp;&emsp;data for each group  

##### Returns:
&emsp;&emsp;The summary dataframe.  


In [18]:
# Ungrouped input: the whole frame collapses into a single summary row.
(
    mtcars
    >> summarise(mean=mean(f.disp), n=n())
)

Unnamed: 0,mean,n
,<float64>,<int64>
0.0,230.721875,32


In [19]:
# Grouped input: one summary row per distinct `cyl` value.
(
    mtcars
    >> group_by(f.cyl)
    >> summarise(mean=mean(f.disp), n=n())
)

Unnamed: 0,cyl,mean,n
,<int64>,<float64>,<int64>
0.0,4,105.136364,11
1.0,6,183.314286,7
2.0,8,353.100000,14


In [20]:
# A summary expression may yield several values per group: each `cyl` group
# contributes two rows here, one per requested probability.
(
    mtcars
    >> group_by(f.cyl)
    >> summarise(
        qs=quantile(f.disp, c(0.25, 0.75)),
        prob=c(0.25, 0.75),
    )
)

[2021-07-07 00:21:03][datar][   INFO] `summarise()` has grouped output by ['cyl'] (override with `_groups` argument)


Unnamed: 0,cyl,qs,prob
,<int64>,<float64>,<float64>
0.0,4,78.85,0.25
1.0,4,120.65,0.75
2.0,6,160.00,0.25
3.0,6,196.30,0.75
4.0,8,301.75,0.25
5.0,8,390.00,0.75


In [21]:
# Same summary as the previous cell, but with the
# "`summarise()` has grouped output by ..." INFO message switched off
# for the duration of the `with` block.
with options_context(dplyr_summarise_inform=False):
    # Build the result inside the block so the option applies while
    # `summarise` runs.
    quantiles_by_cyl = (
        mtcars
        >> group_by(f.cyl)
        >> summarise(
            qs=quantile(f.disp, c(0.25, 0.75)),
            prob=c(0.25, 0.75),
        )
    )

# Display from the top level of the cell: a bare expression nested inside a
# `with` block is not shown under IPython's default `last_expr` interactivity,
# so the table would only appear if the kernel had been reconfigured.
quantiles_by_cyl

Unnamed: 0,cyl,qs,prob
,<int64>,<float64>,<float64>
0.0,4,78.85,0.25
1.0,4,120.65,0.75
2.0,6,160.00,0.25
3.0,6,196.30,0.75
4.0,8,301.75,0.25
5.0,8,390.00,0.75


In [22]:
from pipda import register_func, Context

def my_quantile(x, probs):
    """Build a two-column tibble of quantiles.

    Args:
        x: Values handed to `quantile` as its first argument.
        probs: Probabilities handed to `quantile`; also echoed back
            as a column of the result.

    Returns:
        A tibble with column `x` (the result of `quantile(x, probs)`)
        and column `probs`.
    """
    quantiles = quantile(x, probs)
    return tibble(x=quantiles, probs=probs)


# Re-bind the name to the pipda-registered wrapper so the helper can be called
# with `f.<column>` expressions inside a verb (as done in the pipeline below).
# NOTE(review): the leading `None` presumably tells pipda that the function
# does not take the data frame as its first argument -- confirm in pipda docs.
my_quantile = register_func(None, context=Context.EVAL, func=my_quantile)

# A data-frame-valued expression passed positionally: its columns
# (`x` and `probs`) become columns of the summary.
(
    mtcars
    >> group_by(f.cyl)
    >> summarise(my_quantile(f.disp, c(0.25, 0.75)))
)


[2021-07-07 00:21:06][datar][   INFO] `summarise()` has grouped output by ['cyl'] (override with `_groups` argument)


Unnamed: 0,cyl,x,probs
,<int64>,<float64>,<float64>
0.0,4,78.85,0.25
1.0,4,120.65,0.75
2.0,6,160.00,0.25
3.0,6,196.30,0.75
4.0,8,301.75,0.25
5.0,8,390.00,0.75


In [23]:
# Group by two variables, summarise, then list the grouping variables that
# remain on the result.
(
    mtcars
    >> group_by(f.cyl, f.vs)
    >> summarise(cyl_n=n())
    >> group_vars()
)

[2021-07-07 00:21:11][datar][   INFO] `summarise()` has grouped output by ['cyl'] (override with `_groups` argument)


['cyl']

In [24]:
# Reusing a name: `disp` is assigned first, then `sd` reads `f.disp`.
# The intent stated here originally: unlike dplyr's summarise, f.disp can be
# reused (i.e. it should still refer to the input column).
# NOTE(review): the recorded output below has an empty `sd` column, which
# suggests `f.disp` inside sd() resolved to the freshly summarised one-value
# `disp` rather than the input column. Re-run against the installed datar
# version and update either this comment or the saved output.
mtcars >> \
  group_by(f.cyl) >> \
  summarise(disp=mean(f.disp), sd=sd(f.disp)) 

Unnamed: 0,cyl,disp,sd
,<int64>,<float64>,<float64>
0.0,4,105.136364,
1.0,6,183.314286,
2.0,8,353.100000,


In [25]:
# Create a temporary variable: `disp_m2_` (trailing underscore) holds the
# group mean and is read back as `f.disp_m2`; only `disp_m2` shows up in the
# result. NOTE(review): the trailing-underscore convention is inferred from
# the recorded output -- confirm in the datar docs.
(
    mtcars
    >> group_by(f.cyl)
    >> summarise(
        disp_m2_=mean(f.disp),
        disp_m2=f.disp_m2 * 2,
    )
)

Unnamed: 0,cyl,disp_m2
,<int64>,<float64>
0.0,4,210.272727
1.0,6,366.628571
2.0,8,706.200000


In [26]:
# Select the column through a name held in a variable, using `f[...]`
# instead of attribute access.
var = "mass"
(
    starwars
    >> summarise(avg=mean(f[var], na_rm=TRUE))
)

Unnamed: 0,avg
,<float64>
0.0,97.311864
