### Load Modules

In [1]:
using Random
using AutoMLPipeline
using DataFrames
using AbstractTrees
ENV["COLUMNS"]=1000;

### Load dataset
- pro-football dataset
- predict if the game is played home/away

In [2]:
using CSV
profbdata = CSV.read("profb.csv",DataFrame)
first(profbdata,5)

Unnamed: 0_level_0,Home.Away,Favorite_Points,Underdog_Points,Pointspread,Favorite_Name,Underdog_name,Year
Unnamed: 0_level_1,String,Int64,Int64,Float64,String,String,Int64
1,away,27,24,4.0,BUF,MIA,89
2,at_home,17,14,3.0,CHI,CIN,89
3,away,51,0,2.5,CLE,PIT,89
4,at_home,28,0,5.5,NO,DAL,89
5,at_home,38,7,5.5,MIN,HOU,89


### Split data into input features and target output

In [3]:
X = profbdata[:,2:end] 
Y = profbdata[:,1] |> Vector;


### Load the building blocks of modeling

In [4]:
# decomposition
pca = SKPreprocessor("PCA")
fa = SKPreprocessor("FactorAnalysis")
ica = SKPreprocessor("FastICA")
# Scaler 
rb = SKPreprocessor("RobustScaler")
pt = SKPreprocessor("PowerTransformer")
norm = SKPreprocessor("Normalizer")
mx = SKPreprocessor("MinMaxScaler")
# categorical preprocessing
ohe = OneHotEncoder()
# Column selector
disc = CatNumDiscriminator()
catf = CatFeatureSelector()
numf = NumFeatureSelector()
# Learners
rf = SKLearner("RandomForestClassifier")
gb = SKLearner("GradientBoostingClassifier")
lsvc = SKLearner("LinearSVC")
svc = SKLearner("SVC")
mlp = SKLearner("MLPClassifier")
ada = SKLearner("AdaBoostClassifier");
jrf = RandomForest();
vote = VoteEnsemble();
stack = StackEnsemble();
best = BestLearner();

### Define a pipeline composed of transformers and a learner at the end
#### Symbolic expression exploiting Julia Macro (allows symbolic manipulation)

In [5]:
pvote = @pipeline  (catf |> ohe) + (numf) |> vote
pred = fit_transform!(pvote,X,Y)
score(:accuracy,pred,Y)

100.0

In [6]:
a=:(2+2)

:(2 + 2)

In [7]:
eval(a)

4

In [8]:
@pipelinex (catf |> ohe) + (numf) |> vote

:(Pipeline(ComboPipeline(Pipeline(catf, ohe), numf), vote))

In [9]:
print_tree(@pipelinex (catf |> ohe) + (numf) |> vote)

Expr(:call)
├─ :Pipeline
├─ Expr(:call)
│  ├─ :ComboPipeline
│  ├─ Expr(:call)
│  │  ├─ :Pipeline
│  │  ├─ :catf
│  │  └─ :ohe
│  └─ :numf
└─ :vote


In [10]:
pohe = @pipeline  numf + catf |> ohe 
pred = fit_transform!(pvote,X,Y)

672-element Vector{String}:
 "away"
 "at_home"
 "away"
 "at_home"
 "at_home"
 "at_home"
 "away"
 "at_home"
 "away"
 "at_home"
 "away"
 "at_home"
 "at_home"
 ⋮
 "at_home"
 "at_home"
 "away"
 "at_home"
 "away"
 "at_home"
 "at_home"
 "at_home"
 "away"
 "away"
 "at_home"
 "at_home"

#### Corresponding function call

In [11]:
@pipelinex (catf |> ohe) + (numf) |> vote

:(Pipeline(ComboPipeline(Pipeline(catf, ohe), numf), vote))

In [12]:
(@pipelinex (catf |> ohe) + (numf) |> vote) |> print_tree

Expr(:call)
├─ :Pipeline
├─ Expr(:call)
│  ├─ :ComboPipeline
│  ├─ Expr(:call)
│  │  ├─ :Pipeline
│  │  ├─ :catf
│  │  └─ :ohe
│  └─ :numf
└─ :vote


### Evaluate performance by 5-fold cross-validation

In [13]:
crossvalidate(pvote,X,Y,"accuracy_score",5)

fold: 1, 0.5746268656716418
fold: 2, 0.6222222222222222
fold: 3, 0.5447761194029851
fold: 4, 0.6074074074074074
fold: 5, 0.5970149253731343
errors: 0


(mean = 0.5892095080154782, std = 0.030285751035013225, folds = 5, errors = 0)

### Use similar workflow to discover optimal pipeline

### RandomForest learner

In [14]:
prf = @pipeline  (numf |> rb |> pca) + (numf |> rb |> ica) + (catf |> ohe) + (numf |> rb |> fa) |> rf
pred = fit_transform!(prf,X,Y)
score(:accuracy,pred,Y)

100.0

In [15]:
@pipelinex (numf |> rb |> pca) + (numf |> rb |> ica) + (catf |> ohe) + (numf |> rb |> fa) |> rf

:(Pipeline(ComboPipeline(Pipeline(Pipeline(numf, rb), pca), Pipeline(Pipeline(numf, rb), ica), Pipeline(catf, ohe), Pipeline(Pipeline(numf, rb), fa)), rf))

In [16]:
crossvalidate(prf,X,Y,"accuracy_score",5)

fold: 1, 0.753731343283582
fold: 2, 0.5703703703703704
fold: 3, 0.7313432835820896
fold: 4, 0.6962962962962963
fold: 5, 0.6492537313432836
errors: 0


(mean = 0.6801990049751244, std = 0.0729756872843776, folds = 5, errors = 0)

### Gradient Boost Learner

In [17]:
pgb = @pipeline (numf |> rb |> pca) + (numf|>rb|>ica)  + (numf|>rb|>fa) |> gb
pred = fit_transform!(pgb,X,Y)
score(:accuracy,pred,Y)

91.22023809523809

In [18]:
crossvalidate(pgb,X,Y,"accuracy_score",5)

fold: 1, 0.6492537313432836
fold: 2, 0.6518518518518519
fold: 3, 0.7014925373134329
fold: 4, 0.6148148148148148
fold: 5, 0.5671641791044776
errors: 0


(mean = 0.6369154228855721, std = 0.04974738023202397, folds = 5, errors = 0)

### Linear Support Vector Machine for Classification

In [19]:
plsvc = @pipeline ((numf |> rb |> pca)+(numf |> rb |> fa)+(numf |> rb |> ica)+(catf |> ohe )) |> lsvc
pred = fit_transform!(plsvc,X,Y)
score(:accuracy,pred,Y)

79.91071428571429

In [20]:
crossvalidate(plsvc,X,Y,"accuracy_score",5)

fold: 1, 0.6492537313432836
fold: 2, 0.7555555555555555
fold: 3, 0.746268656716418
fold: 4, 0.7407407407407407
fold: 5, 0.7313432835820896
errors: 0


(mean = 0.7246323935876174, std = 0.0430430814491215, folds = 5, errors = 0)

### RBF Kernel Support Vector Machine for Classification

In [21]:
psvc = @pipeline (numf |> pca) + numf + (numf|> ica) + (catf |> ohe) |> svc
pred = fit_transform!(psvc,X,Y)
score(:accuracy,pred,Y)

66.66666666666666

In [22]:
crossvalidate(psvc,X,Y,"accuracy_score",5)

fold: 1, 0.6865671641791045
fold: 2, 0.674074074074074
fold: 3, 0.6417910447761194
fold: 4, 0.6518518518518519
fold: 5, 0.6791044776119403
errors: 0


(mean = 0.666677722498618, std = 0.018999166548390053, folds = 5, errors = 0)