### Load Modules

In [1]:
using Random
using AutoMLPipeline
using DataFrames
using AbstractTrees
# NOTE(review): presumably widens the assumed display width so wide tables are not
# truncated in the cell output — confirm.
ENV["COLUMNS"]=1000;

### Load dataset
- pro-football dataset
- predict if the game is played home/away

In [2]:
using CSV
# Read the pro-football CSV file into a DataFrame and preview its first five rows.
profbdata = CSV.read("profb.csv", DataFrame)
first(profbdata, 5)

Unnamed: 0_level_0,Home.Away,Favorite_Points,Underdog_Points,Pointspread,Favorite_Name,Underdog_name,Year
Unnamed: 0_level_1,String,Int64,Int64,Float64,String,String,Int64
1,away,27,24,4.0,BUF,MIA,89
2,at_home,17,14,3.0,CHI,CIN,89
3,away,51,0,2.5,CLE,PIT,89
4,at_home,28,0,5.5,NO,DAL,89
5,at_home,38,7,5.5,MIN,HOU,89


### Split data into input features and target output

In [3]:
# The first column is the prediction target; every remaining column is an input feature.
X = profbdata[:, 2:end]
Y = Vector(profbdata[:, 1]);

### Load the building blocks of modeling

In [4]:
# Decomposition
pca  = SKPreprocessor("PCA")
fa   = SKPreprocessor("FactorAnalysis")
ica  = SKPreprocessor("FastICA")

# Scalers
rb   = SKPreprocessor("RobustScaler")
pt   = SKPreprocessor("PowerTransformer")
norm = SKPreprocessor("Normalizer")
mx   = SKPreprocessor("MinMaxScaler")

# Categorical preprocessing
ohe  = OneHotEncoder()

# Column selectors
disc = CatNumDiscriminator()
catf = CatFeatureSelector()
numf = NumFeatureSelector()

# Learners
rf    = SKLearner("RandomForestClassifier")
gb    = SKLearner("GradientBoostingClassifier")
lsvc  = SKLearner("LinearSVC")
svc   = SKLearner("SVC")
mlp   = SKLearner("MLPClassifier")
ada   = SKLearner("AdaBoostClassifier")
jrf   = RandomForest()
vote  = VoteEnsemble()
stack = StackEnsemble()
best  = BestLearner();

### Define a pipeline composed of transformers and a learner at the end

In [5]:
# One-hot encode the categorical columns, join them with the numeric columns,
# and feed the combined features to the voting ensemble.
pvote = ((catf |> ohe) + numf) |> vote
# The pipeline is fitted and evaluated on the same rows, so this is training accuracy.
pred = fit_transform!(pvote, X, Y)
score(:accuracy, pred, Y)

100.0

In [6]:
# `+` binds tighter than `|>`, so the encoder has to be parenthesized together with
# the categorical selector; otherwise `ohe` is applied to the combined output of
# `numf + catf` instead of only to the categorical columns.
pohe = numf + (catf |> ohe)
# NOTE(review): `pohe` is not used below; the predictions shown come from `pvote`.
pred = fit_transform!(pvote, X, Y)

672-element Vector{String}:
 "away"
 "at_home"
 "away"
 "at_home"
 "at_home"
 "at_home"
 "away"
 "at_home"
 "away"
 "at_home"
 "away"
 "at_home"
 "at_home"
 ⋮
 "at_home"
 "at_home"
 "away"
 "at_home"
 "away"
 "at_home"
 "at_home"
 "at_home"
 "away"
 "away"
 "at_home"
 "at_home"

#### Corresponding function call

In [7]:
# Show the nested Pipeline/ComboPipeline call that the pipeline expression expands to.
@pipelinex (catf |> ohe) + (numf) |> vote

:(Pipeline(ComboPipeline(Pipeline(catf, ohe), numf), vote))

#### Tree representation

In [8]:
# Print the expanded pipeline expression as a tree.
(@pipelinex (catf |> ohe) + (numf) |> vote) |> print_tree

Expr(:call)
├─ :Pipeline
├─ Expr(:call)
│  ├─ :ComboPipeline
│  ├─ Expr(:call)
│  │  ├─ :Pipeline
│  │  ├─ :catf
│  │  └─ :ohe
│  └─ :numf
└─ :vote


### Evaluate performance by 10-fold cross-validation

In [9]:
# 10-fold cross-validation of the voting pipeline, scored with accuracy.
crossvalidate(pvote,X,Y,"accuracy_score",10)

fold: 1, 0.6417910447761194
fold: 2, 0.5970149253731343
fold: 3, 0.5
fold: 4, 0.6865671641791045
fold: 5, 0.5223880597014925
fold: 6, 0.5522388059701493
fold: 7, 0.6716417910447762
fold: 8, 0.5588235294117647
fold: 9, 0.6268656716417911
fold: 10, 0.5074626865671642
errors: 0


(mean = 0.5864793678665496, std = 0.06818187495509788, folds = 10, errors = 0)

### Use similar workflow to discover optimal pipeline

### RandomForest learner

In [10]:
# Three decompositions of the robust-scaled numeric features plus the one-hot
# encoded categorical features, classified by the random forest.
prf = (
    (numf |> rb |> pca) +
    (numf |> rb |> ica) +
    (catf |> ohe) +
    (numf |> rb |> fa)
) |> rf
# Fitted and scored on the same rows: this is training accuracy.
pred = fit_transform!(prf, X, Y)
score(:accuracy, pred, Y)

│   caller = npyinitialize() at numpy.jl:67
└ @ PyCall /Users/ppalmes/.julia/packages/PyCall/L0fLP/src/numpy.jl:67


100.0

In [11]:
# Show the nested call that the random-forest pipeline expression expands to.
@pipelinex (numf |> rb |> pca) + (numf |> rb |> ica) + (catf |> ohe) + (numf |> rb |> fa) |> rf

:(Pipeline(ComboPipeline(Pipeline(Pipeline(numf, rb), pca), Pipeline(Pipeline(numf, rb), ica), Pipeline(catf, ohe), Pipeline(Pipeline(numf, rb), fa)), rf))

In [12]:
# 10-fold cross-validation of the random-forest pipeline, scored with accuracy.
crossvalidate(prf,X,Y,"accuracy_score",10)

fold: 1, 0.6417910447761194
fold: 2, 0.8507462686567164
fold: 3, 0.6617647058823529
fold: 4, 0.6268656716417911
fold: 5, 0.5970149253731343
fold: 6, 0.6268656716417911
fold: 7, 0.7164179104477612
fold: 8, 0.6911764705882353
fold: 9, 0.7164179104477612
fold: 10, 0.6865671641791045
errors: 0


(mean = 0.6815627743634767, std = 0.07170612997803955, folds = 10, errors = 0)

### Gradient Boost Learner

In [13]:
# Gradient boosting on three decompositions of the robust-scaled numeric features
# (no categorical features in this pipeline).
pgb = @pipeline (numf |> rb |> pca) + (numf |> rb |> ica) + (numf |> rb |> fa) |> gb
# Fitted and scored on the same rows: this is training accuracy.
pred = fit_transform!(pgb, X, Y)
score(:accuracy, pred, Y)

90.17857142857143

In [14]:
# 10-fold cross-validation of the gradient-boosting pipeline, scored with accuracy.
crossvalidate(pgb,X,Y,"accuracy_score",10)

fold: 1, 0.6716417910447762
fold: 2, 0.6268656716417911
fold: 3, 0.7058823529411765
fold: 4, 0.6865671641791045
fold: 5, 0.6716417910447762
fold: 6, 0.7313432835820896
fold: 7, 0.6417910447761194
fold: 8, 0.6176470588235294
fold: 9, 0.746268656716418
fold: 10, 0.6119402985074627
errors: 0


(mean = 0.6711589113257243, std = 0.047024763742709405, folds = 10, errors = 0)

### Linear Support Vector Machine for Classification

In [15]:
# Linear SVC on three decompositions of the robust-scaled numeric features plus
# the one-hot encoded categorical features.
plsvc = @pipeline ((numf |> rb |> pca) + (numf |> rb |> fa) + (numf |> rb |> ica) + (catf |> ohe)) |> lsvc
# Fitted and scored on the same rows: this is training accuracy.
pred = fit_transform!(plsvc, X, Y)
score(:accuracy, pred, Y)

79.91071428571429

In [16]:
# 10-fold cross-validation of the linear SVC pipeline, scored with accuracy.
crossvalidate(plsvc,X,Y,"accuracy_score",10)

fold: 1, 0.8059701492537313
fold: 2, 0.7014925373134329
fold: 3, 0.6911764705882353
fold: 4, 0.7164179104477612
fold: 5, 0.6716417910447762
fold: 6, 0.8059701492537313
fold: 7, 0.7761194029850746
fold: 8, 0.7205882352941176
fold: 9, 0.7313432835820896
fold: 10, 0.8059701492537313
errors: 0


(mean = 0.7426690079016681, std = 0.051463939642796545, folds = 10, errors = 0)

### RBF Kernel Support Vector Machine for Classification

In [17]:
# SVC on the unscaled numeric features, their PCA and ICA decompositions, and the
# one-hot encoded categorical features.
psvc = @pipeline (numf |> pca) + numf + (numf |> ica) + (catf |> ohe) |> svc
# Fitted and scored on the same rows: this is training accuracy.
pred = fit_transform!(psvc, X, Y)
score(:accuracy, pred, Y)

66.66666666666666

In [18]:
# 10-fold cross-validation of the SVC pipeline, scored with accuracy.
crossvalidate(psvc,X,Y,"accuracy_score",10)

fold: 1, 0.746268656716418
fold: 2, 0.6417910447761194
fold: 3, 0.7352941176470589
fold: 4, 0.6119402985074627
fold: 5, 0.582089552238806
fold: 6, 0.6417910447761194
fold: 7, 0.746268656716418
fold: 8, 0.6323529411764706
fold: 9, 0.6268656716417911
fold: 10, 0.7014925373134329
errors: 0


(mean = 0.6666154521510098, std = 0.060316667544051315, folds = 10, errors = 0)

In [19]:
using AutoMLPipeline
using Distributed
using DataFrames

# disable truncation of dataframes columns
# NOTE(review): these definitions add methods to Base.show for AbstractDataFrame, a
# type this notebook does not own, and may replace DataFrames' own methods that share
# the same positional signature — confirm that the inner `show(...; truncate=0)` calls
# still resolve to the DataFrames implementation as intended.
import Base.show
show(df::AbstractDataFrame) = show(df,truncate=0)
show(io::IO,df::AbstractDataFrame) = show(io,df;truncate=0)

# Add workers
# Only spawn workers when this is still the sole process; the `--project` flag is
# passed to each worker's julia executable.
nprocs() == 1 && addprocs(;  exeflags="--project");

# Load the packages on every process so the pipeline code can run on the workers.
@sync @everywhere using AutoMLPipeline
@sync @everywhere using DataFrames

# Feature selectors
catf = CatFeatureSelector()
numf = NumFeatureSelector()

# One-hot encoder
ohe = OneHotEncoder()

# Feature scalers
rb   = SKPreprocessor("RobustScaler")
pt   = SKPreprocessor("PowerTransformer")
mx   = SKPreprocessor("MinMaxScaler")
std  = SKPreprocessor("StandardScaler")
norm = SKPreprocessor("Normalizer")

# Feature extractors
pca = SKPreprocessor("PCA", Dict(:autocomponent => true))
ica = SKPreprocessor("FastICA", Dict(:autocomponent => true))
fa  = SKPreprocessor("FactorAnalysis", Dict(:autocomponent => true))

# Learners
rf     = SKLearner("RandomForestClassifier", Dict(:impl_args => Dict(:n_estimators => 10)))
gb     = SKLearner("GradientBoostingClassifier")
lsvc   = SKLearner("LinearSVC")
mlp    = SKLearner("MLPClassifier")
stack  = StackEnsemble()
rbfsvc = SKLearner("SVC")
ada    = SKLearner("AdaBoostClassifier")
vote   = VoteEnsemble()
best   = BestLearner()
tree   = PrunedTree()
sgd    = SKLearner("SGDClassifier")

# Pass-through element used as the "no scaler" / "no extractor" option in the search
noop = Identity(Dict(:name => "Noop"));

# Parallel search for the best-performing pipelines.
#
# Cross-validates (3 folds, accuracy) every learner x scaler x extractor combination
# using nested distributed loops, collects one DataFrame row per pipeline, and returns
# the table sorted by mean score, best first. Reads the globals `X`, `Y` and the
# pipeline building blocks defined in this cell.
function prpsearch()
    learners = [rf, ada, sgd, tree, rbfsvc, lsvc, gb]
    scalers = [rb, pt, norm, std, mx, noop]
    extractors = [pca, ica, fa, noop]
    results = @sync @distributed (vcat) for lr in learners
        @distributed (vcat) for sc in scalers
            @distributed (vcat) for xt in extractors
                pipe = @pipeline (catf |> ohe) + (numf |> sc |> xt) |> lr
                # Drop the last four characters of each component name for the label.
                scn = sc.name[1:(end - 4)]
                xtn = xt.name[1:(end - 4)]
                lrn = lr.name[1:(end - 4)]
                pname = "$scn |> $xtn |> $lrn"
                # Time the cross-validation of this single pipeline.
                ptime = @elapsed begin
                    avg, sd, nfolds, _ = crossvalidate(pipe, X, Y, "accuracy_score", 3)
                end
                DataFrame(pipeline=pname, mean=avg, sd=sd, time=ptime, folds=nfolds)
            end
        end
    end
    sort!(results, :mean; rev=true)
    return results
end
# Wall-clock time of the whole parallel search; `df` holds the result table.
runtime = @elapsed begin
    df = prpsearch()
end;
# Sum of the per-pipeline cross-validation times recorded in the table.
serialtime = sum(df.time);
(serialtime = "$(round(serialtime / 60.0)) minutes", paralleltime = "$(round(runtime)) seconds")

      From worker 2:	[33m[1m│ [22m[39m  caller = npyinitialize() at numpy.jl:67
      From worker 2:	[33m[1m└ [22m[39m[90m@ PyCall ~/.julia/packages/PyCall/L0fLP/src/numpy.jl:67[39m
      From worker 5:	[33m[1m│ [22m[39m  caller = npyinitialize() at numpy.jl:67
      From worker 5:	[33m[1m└ [22m[39m[90m@ PyCall ~/.julia/packages/PyCall/L0fLP/src/numpy.jl:67[39m
      From worker 3:	[33m[1m│ [22m[39m  caller = npyinitialize() at numpy.jl:67
      From worker 3:	[33m[1m└ [22m[39m[90m@ PyCall ~/.julia/packages/PyCall/L0fLP/src/numpy.jl:67[39m
      From worker 8:	[33m[1m│ [22m[39m  caller = npyinitialize() at numpy.jl:67
      From worker 8:	[33m[1m└ [22m[39m[90m@ PyCall ~/.julia/packages/PyCall/L0fLP/src/numpy.jl:67[39m
      From worker 4:	[33m[1m│ [22m[39m  caller = npyinitialize() at numpy.jl:67
      From worker 4:	[33m[1m└ [22m[39m[90m@ PyCall ~/.julia/packages/PyCall/L0fLP/src/numpy.jl:67[39m
      From worker 6:	[33m[1m│ [22m[3

      From worker 3:	[33m[1m│ [22m[39m               for entry (202, 2) = SF.
      From worker 3:	[33m[1m│ [22m[39m               Patching value to MIA.
      From worker 3:	[33m[1m└ [22m[39m[90m@ AMLPipelineBase.BaseFilters ~/.julia/packages/AMLPipelineBase/c6Pzl/src/basefilters.jl:106[39m
      From worker 5:	fold: 2, 0.6741071428571429
      From worker 2:	
      From worker 2:	[33m[1m│ [22m[39m               for entry (192, 2) = SF.
      From worker 2:	[33m[1m│ [22m[39m               Patching value to MIA.
      From worker 2:	[33m[1m└ [22m[39m[90m@ AMLPipelineBase.BaseFilters ~/.julia/packages/AMLPipelineBase/c6Pzl/src/basefilters.jl:106[39m
      From worker 5:	fold: 3, 0.6964285714285714
      From worker 5:	errors: 0
      From worker 5:	fold: 1, 0.6696428571428571
      From worker 5:	fold: 2, 0.6875
      From worker 2:	fold: 2, 0.7276785714285714
      From worker 2:	fold: 3, 0.6339285714285714
      From worker 2:	errors: 0
      From worker 2:

      From worker 5:	fold: 3, 0.6160714285714286
      From worker 5:	errors: 0
      From worker 5:	fold: 1, 0.6964285714285714
      From worker 2:	fold: 3, 0.6830357142857143
      From worker 2:	errors: 0
      From worker 2:	fold: 1, 0.5892857142857143
      From worker 4:	fold: 3, 0.65625
      From worker 4:	errors: 0
      From worker 4:	fold: 1, 0.7053571428571429
      From worker 4:	fold: 2, 0.6696428571428571
      From worker 4:	fold: 3, 0.7142857142857143
      From worker 4:	errors: 0
      From worker 4:	fold: 1, 0.6830357142857143
      From worker 2:	fold: 2, 0.6339285714285714
      From worker 2:	fold: 3, 0.59375
      From worker 2:	errors: 0
      From worker 2:	fold: 1, 0.6428571428571429
      From worker 5:	fold: 2, 0.6741071428571429
      From worker 3:	fold: 3, 0.6026785714285714
      From worker 3:	errors: 0
      From worker 3:	fold: 1, 0.5625
      From worker 2:	fold: 2, 0.6294642857142857
      From worker 3:	fold: 2, 0.5267857142857143
      From work

      From worker 4:	[33m[1m└ [22m[39m[90m@ AMLPipelineBase.BaseFilters ~/.julia/packages/AMLPipelineBase/c6Pzl/src/basefilters.jl:106[39m
      From worker 4:	[33m[1m│ [22m[39m               for entry (42, 1) = NE.
      From worker 4:	[33m[1m│ [22m[39m               Patching value to BUF.
      From worker 4:	[33m[1m└ [22m[39m[90m@ AMLPipelineBase.BaseFilters ~/.julia/packages/AMLPipelineBase/c6Pzl/src/basefilters.jl:106[39m
      From worker 4:	[33m[1m│ [22m[39m               for entry (66, 1) = NE.
      From worker 4:	[33m[1m│ [22m[39m               Patching value to BUF.
      From worker 4:	[33m[1m└ [22m[39m[90m@ AMLPipelineBase.BaseFilters ~/.julia/packages/AMLPipelineBase/c6Pzl/src/basefilters.jl:106[39m
      From worker 4:	[33m[1m│ [22m[39m               for entry (120, 1) = NE.
      From worker 4:	[33m[1m│ [22m[39m               Patching value to BUF.
      From worker 4:	[33m[1m└ [22m[39m[90m@ AMLPipelineBase.BaseFilters ~/.j

      From worker 4:	fold: 2, 0.65625
      From worker 3:	fold: 2, 0.6026785714285714
      From worker 3:	fold: 3, 0.6428571428571429
      From worker 3:	errors: 0
      From worker 3:	fold: 1, 0.6651785714285714
      From worker 3:	fold: 2, 0.6294642857142857
      From worker 3:	fold: 3, 0.6785714285714286
      From worker 3:	errors: 0
      From worker 3:	fold: 1, 0.5714285714285714
      From worker 4:	fold: 3, 0.6741071428571429
      From worker 4:	errors: 0
      From worker 4:	fold: 1, 0.6651785714285714
      From worker 3:	fold: 2, 0.6071428571428571
      From worker 4:	fold: 2, 0.6875
      From worker 3:	fold: 3, 0.6071428571428571
      From worker 3:	errors: 0
      From worker 3:	fold: 1, 0.5848214285714286
      From worker 4:	fold: 3, 0.6473214285714286
      From worker 4:	errors: 0
      From worker 4:	fold: 1, 0.6607142857142857
      From worker 3:	fold: 2, 0.5803571428571429
      From worker 4:	fold: 2, 0.6741071428571429
      From worker 3:	fold: 3, 0.549

(serialtime = "21.0 minutes", paralleltime = "51.0 seconds")

#### Best pipelines

In [20]:
# Display the search results; prpsearch() sorted them by mean score, best first.
df

Unnamed: 0_level_0,pipeline,mean,sd,time,folds
Unnamed: 0_level_1,String,Float64,Float64,Float64,Int64
1,PowerTransformer |> Noop |> LinearSVC,0.730655,0.0206197,3.34627,3
2,MinMaxScaler |> Noop |> LinearSVC,0.72619,0.0143507,2.30587,3
3,RobustScaler |> Noop |> LinearSVC,0.717262,0.0313561,16.2657,3
4,StandardScaler |> Noop |> LinearSVC,0.715774,0.0363595,1.60664,3
5,PowerTransformer |> Noop |> SGDClassifier,0.715774,0.0582642,3.38227,3
6,RobustScaler |> Noop |> SVC,0.708333,0.0405079,10.0467,3
7,MinMaxScaler |> Noop |> SGDClassifier,0.706845,0.0112349,10.3353,3
8,RobustScaler |> FactorAnalysis |> SGDClassifier,0.703869,0.0180422,12.4883,3
9,PowerTransformer |> FactorAnalysis |> LinearSVC,0.702381,0.0405079,10.3149,3
10,StandardScaler |> FactorAnalysis |> LinearSVC,0.702381,0.0405079,9.28563,3
