In [1]:
# https://dplyr.tidyverse.org/reference/distinct.html
# This notebook walks through the examples from the dplyr reference above
# using datar's Python port.
# NOTE(review): nb_helpers.py presumably defines `nb_header`; it is not
# imported anywhere else in this cell -- confirm.
%run nb_helpers.py
from datar.data import starwars
# The star import supplies the names used in the cells below
# (f, tibble, sample, nrow, distinct, n_distinct, across, contains, group_by).
from datar.all import *

# Render the API documentation of the functions demonstrated in this notebook.
nb_header(distinct, n_distinct, book='distinct')

### <div style="background-color: #EEE; padding: 5px 0 8px 0">★ distinct</div>

##### Select only unique/distinct rows from a data frame

The original API:  
https://dplyr.tidyverse.org/reference/distinct.html  

##### Args:
&emsp;&emsp;`_data`: A data frame  
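&emsp;&emsp;`*args`: Optional variables (or expressions) to use when determining uniqueness. If omitted, all columns are used.  
&emsp;&emsp;`_keep_all`: If `True`, keep all columns of `_data`, not only those used to determine uniqueness.  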
&emsp;&emsp;`_preserve`: If `True`, keep grouping variables even if they are not used.  

##### Returns:
&emsp;&emsp;A data frame containing only the distinct rows of `_data`  


### <div style="background-color: #EEE; padding: 5px 0 8px 0">★ n_distinct</div>

##### Count the number of distinct values

The original API:  
https://dplyr.tidyverse.org/reference/distinct.html  

##### Args:
&emsp;&emsp;`_data`: A data frame  
&emsp;&emsp;`na_rm`: If `True`, remove missing values before counting.  

##### Returns:
&emsp;&emsp;The number of distinct values  


In [2]:
# Seed the random number generator in the same cell that samples, so the
# data -- and every distinct() result derived from it below -- is identical
# on each Restart & Run All, and re-running this cell alone is idempotent.
set_seed(8525)

# 100 rows of two integer columns drawn with replacement from 0..9, which
# guarantees plenty of duplicated values and duplicated (x, y) pairs.
df = tibble(
    x=sample(range(10), 100, replace=True),
    y=sample(range(10), 100, replace=True),
)
nrow(df)

100

In [3]:
# Plain function-call form: with no columns given, distinct() compares whole
# rows, and nrow() counts how many unique (x, y) rows remain.
nrow(distinct(df))

59

In [4]:
# Pipe form of the previous cell, naming both columns explicitly; since x and
# y are the only columns, the count matches.
df >> distinct(f.x, f.y) >> nrow()

59

In [5]:
# Unique values of x only; the result contains just the x column.
df >> distinct(f.x)

Unnamed: 0,x
,<int64>
0.0,4
1.0,6
2.0,1
3.0,8
4.0,5
6.0,9
14.0,2
19.0,7
22.0,0


In [6]:
# Unique values of y only; the result contains just the y column.
df >> distinct(f.y)

Unnamed: 0,y
,<int64>
0.0,4
1.0,3
2.0,1
3.0,7
4.0,8
6.0,6
10.0,9
13.0,0
16.0,5


In [7]:
# Uniqueness is still judged on x alone, but _keep_all=True retains the other
# columns as well (one row is shown per distinct x).
df >> distinct(f.x, _keep_all=True)

Unnamed: 0,x,y
,<int64>,<int64>
0.0,4,4
1.0,6,3
2.0,1,1
3.0,8,7
4.0,5,8
6.0,9,6
14.0,2,0
19.0,7,8
22.0,0,6


In [8]:
# Same as above, but uniqueness is judged on y; x is carried along.
df >> distinct(f.y, _keep_all=True)

Unnamed: 0,x,y
,<int64>,<int64>
0.0,4,4
1.0,6,3
2.0,1,1
3.0,8,7
4.0,5,8
6.0,9,6
10.0,1,9
13.0,6,0
16.0,4,5


In [9]:
# Uniqueness can be judged on a computed expression; the keyword (`diff`)
# becomes the name of the resulting column.
df >> distinct(diff=abs(f.x - f.y))

Unnamed: 0,diff
,<int64>
0.0,0
1.0,3
3.0,1
8.0,5
10.0,8
13.0,6
14.0,2
18.0,4
36.0,7


In [10]:
# Distinct combinations over every column whose name contains "color"
# (hair_color, skin_color, eye_color in the starwars data).
starwars >> distinct(across(contains("color")))

Unnamed: 0,hair_color,skin_color,eye_color
,<object>,<object>,<object>
0,blond,fair,blue
1,,gold,yellow
2,,"white, blue",red
3,none,white,yellow
...,...,...,...
4,brown,light,brown
79,none,pale,white
81,black,dark,dark
82,brown,light,hazel


In [11]:
# Grouped input: the grouping column g is kept in the result even though only
# x is passed to distinct().
df = (
    tibble(g=[1, 1, 2, 2], x=[1, 1, 2, 1])
    >> group_by(f.g)
)

df >> distinct(f.x)

Unnamed: 0,g,x
,<int64>,<int64>
0.0,1,1
2.0,2,2
