# [dplython](https://github.com/dodger487/dplython): dplyr in python

For comparison, each dplython example is followed by the equivalent pandas code.

In [1]:
from dplython import *

In [2]:
# dplython way: pipe the frame through select(), then keep the first 5 rows
(diamonds
  >> select(X.carat, X.cut, X.price)
  >> head(5))

Unnamed: 0,carat,cut,price
0,0.23,Ideal,326
1,0.21,Premium,326
2,0.23,Good,327
3,0.29,Premium,334
4,0.31,Good,335


In [3]:
# pandas way: index with a list of column names, then take the first 5 rows
(diamonds
 [['carat', 'cut', 'price']]
 .head(5))

Unnamed: 0,carat,cut,price
0,0.23,Ideal,326
1,0.21,Premium,326
2,0.23,Good,327
3,0.29,Premium,334
4,0.31,Good,335


In [4]:
# dplython way: keep rows where carat exceeds 4, then narrow to four columns
(diamonds
  >> dfilter(X.carat > 4)
  >> select(X.carat, X.cut, X.depth, X.price))

Unnamed: 0,carat,cut,depth,price
25998,4.01,Premium,61.0,15223
25999,4.01,Premium,62.5,15223
27130,4.13,Fair,64.8,17329
27415,5.01,Fair,65.5,18018
27630,4.5,Fair,65.8,18531


In [5]:
# pandas way: filter rows with query(), then narrow to four columns
(diamonds
 .query("carat > 4")
 [['carat', 'cut', 'depth', 'price']])

Unnamed: 0,carat,cut,depth,price
25998,4.01,Premium,61.0,15223
25999,4.01,Premium,62.5,15223
27130,4.13,Fair,64.8,17329
27415,5.01,Fair,65.5,18018
27630,4.5,Fair,65.8,18531


In [6]:
# dplython way: sample 10 random records, then sort by the carat column,
# then return only the listed columns
(diamonds
  >> sample_n(10)
  >> arrange(X.carat)
  >> select(X.carat, X.cut, X.depth, X.price))

  return lambda df: DplyFrame(df.sort(names))


Unnamed: 0,carat,cut,depth,price
37842,0.33,Ideal,62.3,1002
31009,0.35,Ideal,62.2,748
52108,0.61,Ideal,59.4,2461
46058,0.64,Fair,59.5,1733
672,0.72,Ideal,60.3,2847
7807,1.05,Very Good,63.5,4292
19618,1.07,Ideal,62.4,8235
23217,1.14,Ideal,60.1,11226
18747,1.2,Premium,61.6,7661
22487,1.54,Very Good,62.7,10518


In [7]:
# pandas way: draw the sample first, then pick the listed columns,
# and finally sort by the carat column
(diamonds
 .sample(n=10)
 [['carat', 'cut', 'depth', 'price']]
 .sort_values(by='carat'))

Unnamed: 0,carat,cut,depth,price
49,0.29,Very Good,60.7,404
33804,0.32,Very Good,58.1,842
40481,0.33,Ideal,61.8,1141
39781,0.36,Ideal,61.6,1094
30164,0.4,Very Good,60.5,720
42543,0.53,Premium,61.4,1324
9222,1.05,Premium,62.6,4557
9561,1.1,Premium,60.9,4622
27619,2.03,Very Good,61.7,18507
26940,2.27,Premium,62.4,16994


In [8]:
# dplython way: sample 6 rows, keep carat and price, then transpose
# by piping into the frame's own .T attribute via X._
(diamonds
  >> sample_n(6)
  >> select(X.carat, X.price)
  >> X._.T)

Unnamed: 0,34991,1780,32042,1205,37420,6248
carat,0.4,0.72,0.32,0.82,0.33,0.9
price,881.0,3048.0,779.0,2937.0,984.0,4008.0


In [9]:
# pandas way: sample 6 rows, keep carat and price, then transpose
(diamonds
 .sample(n=6)
 [['carat', 'price']]
 .T)

Unnamed: 0,9752,20344,32959,32415,49302,35763
carat,0.94,1.51,0.31,0.41,0.32,0.4
price,4661.0,8742.0,461.0,791.0,540.0,912.0


In [10]:
# dplython way: add a rounded-carat column, group by cut and that column,
# then compute the mean price of each group
(diamonds
  >> mutate(carat_bin=X.carat.round())
  >> group_by(X.cut, X.carat_bin)
  >> summarize(avg_price=X.price.mean()))

Unnamed: 0,avg_price,carat_bin,cut
0,4213.864948,1.0,Ideal
1,12337.020064,2.0,Premium
2,15842.666667,4.0,Fair
3,15053.555556,3.0,Very Good
4,786.054191,0.0,Good
5,12587.0,4.0,Ideal
6,4135.271007,1.0,Very Good
7,11096.950321,2.0,Good
8,13466.823529,3.0,Fair
9,12838.984078,2.0,Ideal


In [14]:
# pandas way: add a rounded-carat column, group by it together with cut,
# average the price per group, then rename the result column
(
    diamonds
    .assign(carat_bin=diamonds.carat.round())
    .groupby(['carat_bin', 'cut'])[['price']]
    .mean()
    .rename(columns={'price': 'avg_price'})
)

Unnamed: 0_level_0,Unnamed: 1_level_0,avg_price
carat_bin,cut,Unnamed: 2_level_1
0.0,Fair,1027.979275
0.0,Good,786.054191
0.0,Ideal,863.908535
0.0,Premium,863.329085
0.0,Very Good,766.35459
1.0,Fair,3305.754579
1.0,Good,3815.307879
1.0,Ideal,4213.864948
1.0,Premium,4382.906453
1.0,Very Good,4135.271007


#### Not sure if it's because I've been using pandas for a while, but I still don't mind how things are done with pandas.  YMMV!

However, I do wish that pandas' query() method supported SQL-like pattern matching (example: df.query("column like '%NAME%'")) and some way to filter on null / not-null values.  Then I would consider the query() method pretty much feature complete with respect to what can be done using SQL's WHERE clause.