### Load Modules

In [None]:
using Random
using AutoMLPipeline
using DataFrames
using AbstractTrees
# Set the COLUMNS environment variable to 1000; presumably this widens the
# display so that wide tables are not cut off — TODO confirm.
ENV["COLUMNS"]=1000;

### Load dataset
- pro-football dataset
- predict if the game is played home/away

In [None]:
using CSV
# Read the CSV file into a DataFrame, then preview its first five rows.
profbdata = CSV.read("profb.csv", DataFrame)
first(profbdata, 5)

### Split data into input features and target output

In [None]:
# Input features: every column from the second one onward.
X = profbdata[:, 2:end]
# Target: the first column, converted to a plain Vector.
Y = Vector(profbdata[:, 1]);

### Load the building blocks of modeling

In [None]:
# --- Decomposition / feature extraction ---
pca  = SKPreprocessor("PCA")
fa   = SKPreprocessor("FactorAnalysis")
ica  = SKPreprocessor("FastICA")

# --- Feature scalers ---
rb   = SKPreprocessor("RobustScaler")
pt   = SKPreprocessor("PowerTransformer")
norm = SKPreprocessor("Normalizer")
mx   = SKPreprocessor("MinMaxScaler")

# --- Categorical preprocessing ---
ohe  = OneHotEncoder()

# --- Column selectors ---
disc = CatNumDiscriminator()
catf = CatFeatureSelector()
numf = NumFeatureSelector()

# --- Learners and ensembles ---
rf    = SKLearner("RandomForestClassifier")
gb    = SKLearner("GradientBoostingClassifier")
lsvc  = SKLearner("LinearSVC")
svc   = SKLearner("SVC")
mlp   = SKLearner("MLPClassifier")
ada   = SKLearner("AdaBoostClassifier")
jrf   = RandomForest()
vote  = VoteEnsemble()
stack = StackEnsemble()
best  = BestLearner();

### Define a pipeline composed of transformers and a learner at the end

In [None]:
# One-hot encode the categorical columns, append the numeric columns, and
# feed the combined features to the voting ensemble.
pvote = ((catf |> ohe) + numf) |> vote
# Fit on (X, Y) and get the pipeline output for the same data.
pred = fit_transform!(pvote, X, Y)
# Accuracy of that output against the targets it was fitted on.
score(:accuracy, pred, Y)

In [None]:
# Feature-only pipeline (no learner at the end). `+` binds tighter than `|>`,
# so `numf + catf |> ohe` parses as `(numf + catf) |> ohe`; the parentheses
# below only make that grouping explicit.
# NOTE(review): if the intent was `numf + (catf |> ohe)`, regroup accordingly.
pohe = (numf + catf) |> ohe
# Fit and apply `pohe` itself (not `pvote`), so this cell shows the output of
# the pipeline it defines. With no learner at the end, `pred` presumably holds
# transformed features rather than predictions — confirm against the library.
pred = fit_transform!(pohe, X, Y)

#### Corresponding function call

In [None]:
# Show the expression that @pipelinex produces for this pipeline.
@pipelinex((catf |> ohe) + (numf) |> vote)

#### Tree representation

In [None]:
# Print the @pipelinex result for the same pipeline as a tree.
print_tree(@pipelinex((catf |> ohe) + (numf) |> vote))

### Evaluate performance by 10-fold cross-validation

In [None]:
# Cross-validate the voting pipeline using the "accuracy_score" metric;
# the trailing 10 is presumably the number of folds.
crossvalidate(pvote, X, Y, "accuracy_score", 10)

### Use similar workflow to discover optimal pipeline

### RandomForest learner

In [None]:
# Four feature branches are concatenated and passed to the random-forest learner:
# robust-scaled PCA, robust-scaled ICA, one-hot categoricals, robust-scaled FA.
prf = (numf |> rb |> pca) + (numf |> rb |> ica) + (catf |> ohe) + (numf |> rb |> fa) |> rf
# Fit on (X, Y), then score the output against the same targets.
pred = fit_transform!(prf, X, Y)
score(:accuracy, pred, Y)

In [None]:
# Show the expression that @pipelinex produces for the random-forest pipeline.
@pipelinex((numf |> rb |> pca) + (numf |> rb |> ica) + (catf |> ohe) + (numf |> rb |> fa) |> rf)

In [None]:
# Cross-validate the random-forest pipeline using the "accuracy_score" metric;
# the trailing 10 is presumably the number of folds.
crossvalidate(prf, X, Y, "accuracy_score", 10)

### Gradient Boost Learner

In [None]:
# Gradient-boosting pipeline built from three robust-scaled numeric branches
# (PCA, ICA, FA); this pipeline has no categorical branch.
pgb = @pipeline (numf |> rb |> pca) + (numf |> rb |> ica) + (numf |> rb |> fa) |> gb
# Fit on (X, Y), then score the output against the same targets.
pred = fit_transform!(pgb, X, Y)
score(:accuracy, pred, Y)

In [None]:
# Cross-validate the gradient-boosting pipeline using the "accuracy_score"
# metric; the trailing 10 is presumably the number of folds.
crossvalidate(pgb, X, Y, "accuracy_score", 10)

### Linear Support Vector Machine for Classification

In [None]:
# Linear SVC pipeline: robust-scaled PCA, FA and ICA branches plus one-hot
# encoded categoricals, concatenated and passed to the learner.
plsvc = @pipeline ((numf |> rb |> pca) + (numf |> rb |> fa) + (numf |> rb |> ica) + (catf |> ohe)) |> lsvc
# Fit on (X, Y), then score the output against the same targets.
pred = fit_transform!(plsvc, X, Y)
score(:accuracy, pred, Y)

In [None]:
# Cross-validate the linear SVC pipeline using the "accuracy_score" metric;
# the trailing 10 is presumably the number of folds.
crossvalidate(plsvc, X, Y, "accuracy_score", 10)

### RBF Kernel Support Vector Machine for Classification

In [None]:
# SVC pipeline: unscaled PCA, raw numeric columns, unscaled ICA, and one-hot
# encoded categoricals, concatenated and passed to the learner.
psvc = @pipeline (numf |> pca) + numf + (numf |> ica) + (catf |> ohe) |> svc
# Fit on (X, Y), then score the output against the same targets.
pred = fit_transform!(psvc, X, Y)
score(:accuracy, pred, Y)

In [None]:
# Cross-validate the SVC pipeline using the "accuracy_score" metric;
# the trailing 10 is presumably the number of folds.
crossvalidate(psvc, X, Y, "accuracy_score", 10)

In [None]:
using AutoMLPipeline
using Distributed
using DataFrames

# disable truncation of dataframes columns
# NOTE(review): the two methods below extend Base.show for AbstractDataFrame,
# a type this notebook does not define (type piracy). They presumably affect
# how every DataFrame is displayed for the rest of the session — confirm this
# is intended before reusing the snippet elsewhere.
import Base.show
# One-argument form: forwards to show with the keyword truncate=0.
show(df::AbstractDataFrame) = show(df,truncate=0)
# IO form: forwards to show on the given io with the keyword truncate=0.
show(io::IO,df::AbstractDataFrame) = show(io,df;truncate=0)

# Start worker processes only when nprocs() reports a single process, passing
# the "--project" flag to each worker.
if nprocs() == 1
    addprocs(; exeflags="--project")
end

# Load the packages on every process.
@sync @everywhere using AutoMLPipeline
@sync @everywhere using DataFrames

# --- Feature selectors ---
catf = CatFeatureSelector();
numf = NumFeatureSelector();

# --- One-hot encoder ---
ohe = OneHotEncoder();

# --- Feature scalers ---
rb = SKPreprocessor("RobustScaler");
pt = SKPreprocessor("PowerTransformer");
mx = SKPreprocessor("MinMaxScaler");
std = SKPreprocessor("StandardScaler");
norm = SKPreprocessor("Normalizer");

# --- Feature extractors (created with :autocomponent => true) ---
pca = SKPreprocessor("PCA", Dict(:autocomponent => true));
ica = SKPreprocessor("FastICA", Dict(:autocomponent => true));
fa = SKPreprocessor("FactorAnalysis", Dict(:autocomponent => true));

# --- Learners ---
# The random forest is given n_estimators = 10 through :impl_args.
rf = SKLearner("RandomForestClassifier", Dict(:impl_args => Dict(:n_estimators => 10)));
gb = SKLearner("GradientBoostingClassifier");
lsvc = SKLearner("LinearSVC");
mlp = SKLearner("MLPClassifier");
stack = StackEnsemble();
rbfsvc = SKLearner("SVC");
ada = SKLearner("AdaBoostClassifier");
vote = VoteEnsemble();
best = BestLearner();
tree = PrunedTree();
sgd = SKLearner("SGDClassifier");

# Pass-through element named "Noop"; used below as the "no scaler" /
# "no extractor" choice in the search lists.
noop = Identity(Dict(:name => "Noop"));

# Parallel Search for Datamining Optimal Pipelines
"""
    prpsearch()

Cross-validate one pipeline for every learner × scaler × extractor combination
and return the results as a `DataFrame` sorted by `:mean` in descending order.

Each pipeline has the form `(catf |> ohe) + (numf |> scaler |> extractor) |> learner`
and is evaluated with `crossvalidate(pipe, X, Y, "accuracy_score", 3)`. The result
has one row per pipeline with columns `pipeline`, `mean`, `sd`, `time`, `folds`.

Reads the globals `X`, `Y`, `catf`, `numf`, `ohe` and the learner, scaler and
extractor globals defined in this notebook. The three loops are nested
`@distributed (vcat)` reductions.
"""
function prpsearch()
    candidate_learners = [rf, ada, sgd, tree, rbfsvc, lsvc, gb]
    candidate_scalers = [rb, pt, norm, std, mx, noop]
    candidate_extractors = [pca, ica, fa, noop]
    results = @sync @distributed (vcat) for learner in candidate_learners
        @distributed (vcat) for scaler in candidate_scalers
            @distributed (vcat) for extractor in candidate_extractors
                pipe = @pipeline (catf |> ohe) + (numf |> scaler |> extractor) |> learner
                # Drop the last four characters of each component name
                # (presumably an auto-generated suffix — TODO confirm).
                scaler_tag = scaler.name[1:end - 4]
                extractor_tag = extractor.name[1:end - 4]
                learner_tag = learner.name[1:end - 4]
                label = "$scaler_tag |> $extractor_tag |> $learner_tag"
                # Time the cross-validation of this single pipeline.
                elapsed = @elapsed begin
                    avg, dev, nfolds, _ = crossvalidate(pipe, X, Y, "accuracy_score", 3)
                end
                DataFrame(pipeline=label, mean=avg, sd=dev, time=elapsed, folds=nfolds)
            end
        end
    end
    # Best mean score first.
    sort!(results, :mean, rev=true)
    return results
end
# Wall-clock time of the whole search; `df` receives the sorted results table.
runtime = @elapsed begin
    df = prpsearch()
end;
# Sum of the per-pipeline times recorded in the `time` column.
serialtime = sum(df.time);
# Report the summed time in minutes next to the measured wall-clock seconds.
(serialtime = "$(round(serialtime / 60.0)) minutes", paralleltime = "$(round(runtime)) seconds")

#### Best pipelines

In [None]:
# Display the results table returned by prpsearch(), which sorts it by :mean, descending.
df