In [1]:
using DataFrames, Chain, Statistics

In [2]:
# Demo table with four columns of different element types:
# odd integers, short strings, floats, and alternating-sign rationals.
data = DataFrame(;
    A = 1:2:13,
    B = ["M", "F", "F", "M", "X", "F", "M"],
    C = [3.0, 2.5, π, -2.3, 1 / 3, 56, 100],
    D = [(-1)^k // k for k in 1:7],
)

Row,A,B,C,D
Unnamed: 0_level_1,Int64,String,Float64,Rational…
1,1,M,3.0,-1//1
2,3,F,2.5,1//2
3,5,F,3.14159,-1//3
4,7,M,-2.3,1//4
5,9,X,0.333333,-1//5
6,11,F,56.0,1//6
7,13,M,100.0,-1//7


In [3]:
# Pick columns A and D by indexing, naming the columns with strings.
data[!, ["A", "D"]]

Row,A,D
Unnamed: 0_level_1,Int64,Rational…
1,1,-1//1
2,3,1//2
3,5,-1//3
4,7,1//4
5,9,-1//5
6,11,1//6
7,13,-1//7


In [4]:
# Same column pick as above, expressed through `select` with a vector selector.
select(data, [:A, :D])

Row,A,D
Unnamed: 0_level_1,Int64,Rational…
1,1,-1//1
2,3,1//2
3,5,-1//3
4,7,1//4
5,9,-1//5
6,11,1//6
7,13,-1//7


In [5]:
# Keep every column except B; `_` marks where the piped table is inserted.
@chain data begin
    select(_, Not(:B))
end

Row,A,C,D
Unnamed: 0_level_1,Int64,Float64,Rational…
1,1,3.0,-1//1
2,3,2.5,1//2
3,5,3.14159,-1//3
4,7,-2.3,1//4
5,9,0.333333,-1//5
6,11,56.0,1//6
7,13,100.0,-1//7


In [6]:
# Keep the contiguous run of columns from B through D.
@chain data begin
    select(_, Between(:B, :D))
end

Row,B,C,D
Unnamed: 0_level_1,String,Float64,Rational…
1,M,3.0,-1//1
2,F,2.5,1//2
3,F,3.14159,-1//3
4,M,-2.3,1//4
5,X,0.333333,-1//5
6,F,56.0,1//6
7,M,100.0,-1//7


In [7]:
# Keep the rows whose A value is below 10, testing one row at a time.
@chain data begin
    subset(_, :A => ByRow(<(10)))
end

Row,A,B,C,D
Unnamed: 0_level_1,Int64,String,Float64,Rational…
1,1,M,3.0,-1//1
2,3,F,2.5,1//2
3,5,F,3.14159,-1//3
4,7,M,-2.3,1//4
5,9,X,0.333333,-1//5


In [8]:
# Keep the rows whose B value is exactly "F", testing one row at a time.
@chain data begin
    subset(_, :B => ByRow(==("F")))
end

Row,A,B,C,D
Unnamed: 0_level_1,Int64,String,Float64,Rational…
1,3,F,2.5,1//2
2,5,F,3.14159,-1//3
3,11,F,56.0,1//6


In [9]:
# Keep rows where C * D exceeds 2; the predicate receives whole columns,
# so every operator is explicitly broadcast.
@chain data begin
    subset(_, [:C, :D] => (cvals, dvals) -> (cvals .* dvals) .> 2)
end

Row,A,B,C,D
Unnamed: 0_level_1,Int64,String,Float64,Rational…
1,11,F,56.0,1//6


In [10]:
# Same filter as the vectorised form, but `ByRow` hands the predicate
# one (C, D) pair of scalars at a time.
@chain data begin
    subset(_, [:C, :D] => ByRow((x, y) -> x * y > 2))
end

Row,A,B,C,D
Unnamed: 0_level_1,Int64,String,Float64,Rational…
1,11,F,56.0,1//6


In [11]:
# Same filter again; `@.` broadcasts every operation in the predicate body.
@chain data begin
    subset(_, [:C, :D] => (cvals, dvals) -> @. (cvals * dvals) > 2)
end

Row,A,B,C,D
Unnamed: 0_level_1,Int64,String,Float64,Rational…
1,11,F,56.0,1//6


In [12]:
# Square column A. No target name is given, so the result column
# gets an automatically generated name.
@chain data begin
    select(_, :A => (col -> col .^ 2))
end

Row,A_function
Unnamed: 0_level_1,Int64
1,1
2,9
3,25
4,49
5,81
6,121
7,169


In [13]:
# Append a column Asq holding the square of A, keeping all existing columns.
@chain data begin
    transform(_, :A => ByRow(x -> x^2) => :Asq)
end

Row,A,B,C,D,Asq
Unnamed: 0_level_1,Int64,String,Float64,Rational…,Int64
1,1,M,3.0,-1//1,1
2,3,F,2.5,1//2,9
3,5,F,3.14159,-1//3,25
4,7,M,-2.3,1//4,49
5,9,X,0.333333,-1//5,81
6,11,F,56.0,1//6,121
7,13,M,100.0,-1//7,169


In [14]:
# Produce a single column `prod` holding the row-wise product of C and D.
@chain data begin
    select(_, [:C, :D] => ByRow(*) => :prod)
end

Row,prod
Unnamed: 0_level_1,Float64
1,-3.0
2,1.25
3,-1.0472
4,-0.575
5,-0.0666667
6,9.33333
7,-14.2857


In [15]:
# Order the rows by ascending C.
sort(data, [:C])

Row,A,B,C,D
Unnamed: 0_level_1,Int64,String,Float64,Rational…
1,7,M,-2.3,1//4
2,9,X,0.333333,-1//5
3,3,F,2.5,1//2
4,1,M,3.0,-1//1
5,5,F,3.14159,-1//3
6,11,F,56.0,1//6
7,13,M,100.0,-1//7


In [16]:
# Order the rows by the string column B.
@chain data begin
    sort(_, :B)
end

Row,A,B,C,D
Unnamed: 0_level_1,Int64,String,Float64,Rational…
1,3,F,2.5,1//2
2,5,F,3.14159,-1//3
3,11,F,56.0,1//6
4,1,M,3.0,-1//1
5,7,M,-2.3,1//4
6,13,M,100.0,-1//7
7,9,X,0.333333,-1//5


In [17]:
# Order the rows by B first, breaking ties with D.
@chain data begin
    sort(_, [:B, :D])
end

Row,A,B,C,D
Unnamed: 0_level_1,Int64,String,Float64,Rational…
1,5,F,3.14159,-1//3
2,11,F,56.0,1//6
3,3,F,2.5,1//2
4,1,M,3.0,-1//1
5,13,M,100.0,-1//7
6,7,M,-2.3,1//4
7,9,X,0.333333,-1//5


In [18]:
# Character table. `missing` marks entries with no value for that character.
simpsons = DataFrame(;
    name = [
        "Homer", "Marge", "Lisa", "Bart", "Maggie", "Apu", "Moe", "Milhouse", "Patty",
    ],
    age = [45, 42, 8, 10, 1, 38, 59, 1, 46],
    current_school_grade = [
        missing, missing, 2, 4, missing, missing, missing, 4, missing,
    ],
    favorite_food = [
        "pork chops", "casserole", "salad", "hamburger", missing,
        "saag paneer", "peanuts", missing, "Lady Laramie 100s",
    ],
)

Row,name,age,current_school_grade,favorite_food
Unnamed: 0_level_1,String,Int64,Int64?,String?
1,Homer,45,missing,pork chops
2,Marge,42,missing,casserole
3,Lisa,8,2,salad
4,Bart,10,4,hamburger
5,Maggie,1,missing,missing
6,Apu,38,missing,saag paneer
7,Moe,59,missing,peanuts
8,Milhouse,1,4,missing
9,Patty,46,missing,Lady Laramie 100s


In [19]:
# Employment table keyed by `name`; one salary is recorded as `missing`.
jobs = DataFrame(;
    name = ["Homer", "Marge", "Apu", "Moe", "Patty", "Wiggam"],
    job = [
        "nuclear technician", "housewife", "store owner",
        "bartender", "DMV clerk", "police chief",
    ],
    salary = [50_000, 25_000, 60_000, 15_000, missing, 75_000],
)

Row,name,job,salary
Unnamed: 0_level_1,String,String,Int64?
1,Homer,nuclear technician,50000
2,Marge,housewife,25000
3,Apu,store owner,60000
4,Moe,bartender,15000
5,Patty,DMV clerk,missing
6,Wiggam,police chief,75000


In [39]:
# Keep only the names that appear in both tables.
innerjoin(simpsons, jobs; on = :name)

Row,name,age,current_school_grade,favorite_food,job,salary
Unnamed: 0_level_1,String,Int64,Int64?,String?,String,Int64?
1,Homer,45,missing,pork chops,nuclear technician,50000
2,Marge,42,missing,casserole,housewife,25000
3,Apu,38,missing,saag paneer,store owner,60000
4,Moe,59,missing,peanuts,bartender,15000
5,Patty,46,missing,Lady Laramie 100s,DMV clerk,missing


In [40]:
# Keep every name from either table; columns with no match become `missing`.
outerjoin(simpsons, jobs; on = :name)

Row,name,age,current_school_grade,favorite_food,job,salary
Unnamed: 0_level_1,String,Int64?,Int64?,String?,String?,Int64?
1,Homer,45,missing,pork chops,nuclear technician,50000
2,Marge,42,missing,casserole,housewife,25000
3,Apu,38,missing,saag paneer,store owner,60000
4,Moe,59,missing,peanuts,bartender,15000
5,Patty,46,missing,Lady Laramie 100s,DMV clerk,missing
6,Lisa,8,2,salad,missing,missing
7,Bart,10,4,hamburger,missing,missing
8,Maggie,1,missing,missing,missing,missing
9,Milhouse,1,4,missing,missing,missing
10,Wiggam,missing,missing,missing,police chief,75000


In [41]:
# One-row summary: mean of C and D, then standard deviation and length of C.
combine(data, [:C, :D] .=> mean, :C .=> [std, length])

Row,C_mean,D_mean,C_std,C_length
Unnamed: 0_level_1,Float64,Rational…,Float64,Int64
1,23.2393,-319//2940,39.5518,7


In [48]:
# Rank employed characters by `happiness = salary - 400 * age`.
@chain simpsons begin
    # Attach job and salary to each character that has a row in `jobs`.
    innerjoin(_, jobs; on = :name)
    # A missing salary propagates, giving a missing happiness for that row.
    transform(
        _,
        [:age, :salary] => ByRow((age, pay) -> pay - 400 * age) => :happiness,
    )
    # Remove the rows with missing happiness before the comparison below.
    dropmissing(_, :happiness)
    subset(_, :happiness => ByRow(>(0)))
    select(_, [:name, :age, :favorite_food, :happiness])
    # Highest happiness first.
    sort(_, :happiness; rev = true)
end

Row,name,age,favorite_food,happiness
Unnamed: 0_level_1,String,Int64,String?,Int64
1,Apu,38,saag paneer,44800
2,Homer,45,pork chops,32000
3,Marge,42,casserole,8200


In [49]:
using RDatasets

In [52]:
# Load the iris table from the "datasets" package bundled with RDatasets.
iris = dataset("datasets", "iris")

Row,SepalLength,SepalWidth,PetalLength,PetalWidth,Species
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,Cat…
1,5.1,3.5,1.4,0.2,setosa
2,4.9,3.0,1.4,0.2,setosa
3,4.7,3.2,1.3,0.2,setosa
4,4.6,3.1,1.5,0.2,setosa
5,5.0,3.6,1.4,0.2,setosa
6,5.4,3.9,1.7,0.4,setosa
7,4.6,3.4,1.4,0.3,setosa
8,5.0,3.4,1.5,0.2,setosa
9,4.4,2.9,1.4,0.2,setosa
10,4.9,3.1,1.5,0.1,setosa


In [56]:
# Mean sepal length for each species.
@chain iris begin
    groupby(_, :Species)
    combine(_, :SepalLength => mean)
end

Row,Species,SepalLength_mean
Unnamed: 0_level_1,Cat…,Float64
1,setosa,5.006
2,versicolor,5.936
3,virginica,6.588


In [57]:
# Number of rows recorded for each species.
@chain iris begin
    groupby(_, :Species)
    combine(_, nrow)
end

Row,Species,nrow
Unnamed: 0_level_1,Cat…,Int64
1,setosa,50
2,versicolor,50
3,virginica,50


In [59]:
# Per-species summary: row count, mean sepal length, sepal-width standard
# deviation, and largest petal width.
@chain iris begin
    groupby(_, :Species)
    combine(
        _,
        nrow,
        :SepalLength => mean,
        :SepalWidth => std,
        :PetalWidth => maximum,
    )
end

Row,Species,nrow,SepalLength_mean,SepalWidth_std,PetalWidth_maximum
Unnamed: 0_level_1,Cat…,Int64,Float64,Float64,Float64
1,setosa,50,5.006,0.379064,0.6
2,versicolor,50,5.936,0.313798,1.8
3,virginica,50,6.588,0.322497,2.5
