In [1]:
# https://dplyr.tidyverse.org/reference/group_by.html
from datar.datasets import mtcars
from datar.all import *

%run nb_helpers.py
# Display the API documentation for the two verbs covered in this notebook.
# NOTE(review): `nb_header` is presumably defined by the `%run nb_helpers.py` line above — confirm.
nb_header(group_by, ungroup)

### # group_by  

##### Takes an existing tbl and converts it into a grouped tbl where
operations are performed "by group"  

See https://dplyr.tidyverse.org/reference/group_by.html  

Note that this does not return a `pandas.DataFrameGroupBy` object but a  
`datar.core.grouped.DataFrameGroupBy` object, which is a subclass of  
`DataFrame`. This makes it easier to implement the APIs that relate  
to grouped data.  

##### Args:
&emsp;&emsp;`_data`: The dataframe  
&emsp;&emsp;`_add`: When False, the default, `group_by()` will override  
&emsp;&emsp;&emsp;&emsp;existing groups. To add to the existing groups, use `_add=True`.  

&emsp;&emsp;`_drop`: Drop groups formed by factor levels that don't appear in the  
&emsp;&emsp;&emsp;&emsp;data? The default is True except when `_data` has been previously  
&emsp;&emsp;&emsp;&emsp;grouped with `_drop=False`.  

&emsp;&emsp;`*args`: variables or computations to group by.  
&emsp;&emsp;&emsp;&emsp;Note that columns here cannot be selected by index, because they are  
&emsp;&emsp;&emsp;&emsp;treated as computations to be added as new columns.  
&emsp;&emsp;&emsp;&emsp;Therefore, no `_base0` argument is supported.  

&emsp;&emsp;`**kwargs`: Extra variables to group the dataframe  

##### Return:
&emsp;&emsp;A `datar.core.grouped.DataFrameGroupBy` object  


### # ungroup  

##### Ungroup grouped data

See https://dplyr.tidyverse.org/reference/group_by.html  

##### Args:
&emsp;&emsp;`x`: The data frame  
&emsp;&emsp;`*cols`: Variables to remove from the grouping variables.  
&emsp;&emsp;`_base0`: If columns are selected with indexes, whether they are 0-based.  
&emsp;&emsp;&emsp;&emsp;If not given, will use `datar.base.get_option('index.base.0')`  

##### Returns:
&emsp;&emsp;A data frame with selected columns removed from the grouping variables.  


In [2]:
# Group the cars by their `cyl` column; the grouped object still displays as a table.
by_cyl = mtcars >> group_by(f.cyl) 
by_cyl

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
0,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4
1,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4,4
2,22.8,4,108.0,93,3.85,2.32,18.61,1,1,4,1
3,21.4,6,258.0,110,3.08,3.215,19.44,1,0,3,1
4,18.7,8,360.0,175,3.15,3.44,17.02,0,0,3,2
5,18.1,6,225.0,105,2.76,3.46,20.22,1,0,3,1
6,14.3,8,360.0,245,3.21,3.57,15.84,0,0,3,4
7,24.4,4,146.7,62,3.69,3.19,20.0,1,0,4,2
8,22.8,4,140.8,95,3.92,3.15,22.9,1,0,4,2
9,19.2,6,167.6,123,3.92,3.44,18.3,1,0,4,4


In [3]:
# List the names of the columns that `by_cyl` is grouped by.
by_cyl >> group_vars()

['cyl']

In [4]:
# Average displacement and horsepower, computed separately for each `cyl` group
# (one output row per group).
by_cyl >> summarise(disp=mean(f.disp), hp=mean(f.hp))

Unnamed: 0,cyl,disp,hp
0,4,105.136364,82.636364
1,6,183.314286,122.285714
2,8,353.1,209.214286


In [5]:
# The comparison is evaluated per group: each `cyl` group keeps the row whose
# `disp` equals that group's own maximum, not the overall maximum.
# NOTE(review): `filter` and `max` here are presumably the datar versions pulled in
# by `from datar.all import *`, shadowing the Python builtins — confirm.
by_cyl >> filter(f.disp == max(f.disp))

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
0,21.4,6,258.0,110,3.08,3.215,19.44,1,0,3,1
1,24.4,4,146.7,62,3.69,3.19,20.0,1,0,4,2
2,10.4,8,472.0,205,2.93,5.25,17.98,0,0,3,4


In [6]:
# Group by two columns, then count the rows in each (vs, am) combination.
by_vs_am = mtcars >> group_by(f.vs, f.am)
# As the INFO log in the output reports, the summarised result stays grouped by `vs`.
by_vs = by_vs_am >> summarise(n=n())
by_vs

[2021-05-25 11:05:32][datar][   INFO] `summarise()` has grouped output by ['vs'] (override with `_groups` argument)


Unnamed: 0,vs,am,n
0,0,0,12
1,0,1,6
2,1,0,7
3,1,1,7


In [7]:
# `by_vs` is still grouped by `vs`, so the counts are totalled per `vs` value.
by_vs >> summarise(n=sum(f.n))

Unnamed: 0,vs,n
0,0,18
1,1,14


In [8]:
# Remove the grouping first so that the sum runs over all rows and yields a
# single grand total.
(
    by_vs
    >> ungroup()
    >> summarise(n=sum(f.n))
)

Unnamed: 0,n
0,32


In [9]:
# Group by a computed expression: the keyword names the result (`vsam`), which is
# added to the data as a new column and used as the grouping variable.
mtcars_vsam = mtcars >> group_by(vsam=f.vs + f.am) 
mtcars_vsam 

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb,vsam
0,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4,1
1,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4,4,1
2,22.8,4,108.0,93,3.85,2.32,18.61,1,1,4,1,2
3,21.4,6,258.0,110,3.08,3.215,19.44,1,0,3,1,1
4,18.7,8,360.0,175,3.15,3.44,17.02,0,0,3,2,0
5,18.1,6,225.0,105,2.76,3.46,20.22,1,0,3,1,1
6,14.3,8,360.0,245,3.21,3.57,15.84,0,0,3,4,0
7,24.4,4,146.7,62,3.69,3.19,20.0,1,0,4,2,1
8,22.8,4,140.8,95,3.92,3.15,22.9,1,0,4,2,1
9,19.2,6,167.6,123,3.92,3.44,18.3,1,0,4,4,1


In [10]:
# By default, a new group_by() replaces the existing grouping (`cyl`) rather
# than extending it.
(
    by_cyl
    >> group_by(f.vs, f.am)
    >> group_vars()
)

['vs', 'am']

In [11]:
# With `_add=True`, the new variables are appended to the existing grouping
# (`cyl`) instead of replacing it.
(
    by_cyl
    >> group_by(f.vs, f.am, _add=True)
    >> group_vars()
)

['cyl', 'vs', 'am']