### Load Modules

In [2]:
using Random
using AutoMLPipeline
using AutoMLPipeline.FeatureSelectors
using AutoMLPipeline.EnsembleMethods
using AutoMLPipeline.CrossValidators
using AutoMLPipeline.DecisionTreeLearners
using AutoMLPipeline.Pipelines
using AutoMLPipeline.BaseFilters
using AutoMLPipeline.SKPreprocessors
using AutoMLPipeline.Utils
# Set the COLUMNS environment variable to 1000 — presumably so that wide tables
# (such as the dataset preview) are not truncated when displayed.
ENV["COLUMNS"]=1000;

### Load dataset
- Pro-football dataset (`profb.csv`)
- Goal: predict whether the game was played at home or away (the `Home.Away` column)

In [3]:
using CSV
# Read profb.csv (path is relative to the current working directory) into `profbdata`.
# NOTE(review): the single-argument form of CSV.read depends on the installed CSV.jl
# version; newer releases appear to require an explicit sink argument — confirm before
# upgrading CSV.jl.
profbdata = CSV.read("profb.csv")
# Show the first five rows as a quick preview of the columns.
first(profbdata,5)

Unnamed: 0_level_0,Home.Away,Favorite_Points,Underdog_Points,Pointspread,Favorite_Name,Underdog_name,Year
Unnamed: 0_level_1,String,Int64,Int64,Float64,String,String,Int64
1,away,27,24,4.0,BUF,MIA,89
2,at_home,17,14,3.0,CHI,CIN,89
3,away,51,0,2.5,CLE,PIT,89
4,at_home,28,0,5.5,NO,DAL,89
5,at_home,38,7,5.5,MIN,HOU,89


### Split data into input features and target output

In [4]:
# Input features: every column except the first one.
X = profbdata[:, 2:end]
# Target: the first column (Home.Away), converted to a plain Vector.
Y = Vector(profbdata[:, 1]);

### Load the building blocks of modeling

In [5]:
# Instantiate the transformers, column selectors, and learners that the cells below
# combine into pipelines. Not every object created here is used in the cells shown.

# Decomposition transformers (selected by name through SKPreprocessor)
pca = SKPreprocessor("PCA")
fa = SKPreprocessor("FactorAnalysis")
ica = SKPreprocessor("FastICA")
# Scalers / normalizers (selected by name through SKPreprocessor)
rb = SKPreprocessor("RobustScaler")
pt = SKPreprocessor("PowerTransformer")
norm = SKPreprocessor("Normalizer")
mx = SKPreprocessor("MinMaxScaler")
# Categorical preprocessing: one-hot encoding
ohe = OneHotEncoder()
# Column selectors: categorical/numeric discriminator and per-kind feature selectors
disc = CatNumDiscriminator()
catf = CatFeatureSelector()
numf = NumFeatureSelector()
# Learners selected by name through SKLearner
rf = SKLearner("RandomForestClassifier")
gb = SKLearner("GradientBoostingClassifier")
lsvc = SKLearner("LinearSVC")
svc = SKLearner("SVC")
mlp = SKLearner("MLPClassifier")
ada = SKLearner("AdaBoostClassifier");
# Learners and ensembles constructed directly with their default settings
jrf = RandomForest();
vote = VoteEnsemble();
stack = StackEnsemble();
best = BestLearner();

### Define a pipeline composed of transformers and a learner at the end
#### Symbolic expression using a Julia macro (allows symbolic manipulation)

In [6]:
# One-hot encode the categorical columns, concatenate them with the numeric columns,
# and feed the combined features to the voting ensemble.
pvote = @pipeline (catf |> ohe) + numf |> vote
# Fit on the full dataset and predict on the same data.
pred = fit_transform!(pvote, X, Y)
# Accuracy against the labels used for fitting, i.e. a training-set score.
score(:accuracy, pred, Y)

100.0

#### Corresponding function call

In [7]:
# Show the function-call expression that the symbolic pipeline expands to.
@pipelinex (catf |> ohe) + numf |> vote

:(Pipeline(ComboPipeline(Pipeline(catf, ohe), numf), vote))

### Evaluate performance by 5-fold cross-validation

In [8]:
# 5-fold cross-validation of the voting pipeline, scored with accuracy_score.
crossvalidate(pvote, X, Y, "accuracy_score", 5)

fold: 1, 0.5597014925373134
fold: 2, 0.6148148148148148
fold: 3, 0.5746268656716418
fold: 4, 0.6444444444444445
fold: 5, 0.6567164179104478


(mean = 0.6100608070757325, std = 0.04234739040027941, folds = 5)

### Use a similar workflow to discover the optimal pipeline

### RandomForest learner

In [9]:
# Random-forest pipeline: robust-scaled numeric columns go through PCA, FastICA and
# FactorAnalysis; one-hot encoded categorical columns are concatenated alongside them.
prf = @pipeline (numf |> rb |> pca) +
                (numf |> rb |> ica) +
                (catf |> ohe) +
                (numf |> rb |> fa) |> rf
# Fit on the full dataset and predict on the same data.
pred = fit_transform!(prf, X, Y)
# Accuracy against the labels used for fitting, i.e. a training-set score.
score(:accuracy, pred, Y)

97.91666666666666

In [10]:
# Show the function-call expression that the random-forest pipeline expands to.
@pipelinex (numf |> rb |> pca) +
           (numf |> rb |> ica) +
           (catf |> ohe) +
           (numf |> rb |> fa) |> rf

:(Pipeline(ComboPipeline(Pipeline(Pipeline(numf, rb), pca), Pipeline(Pipeline(numf, rb), ica), Pipeline(catf, ohe), Pipeline(Pipeline(numf, rb), fa)), rf))

In [11]:
# 5-fold cross-validation of the random-forest pipeline, scored with accuracy_score.
crossvalidate(prf, X, Y, "accuracy_score", 5)

fold: 1, 0.7089552238805971
fold: 2, 0.6666666666666666
fold: 3, 0.6343283582089553
fold: 4, 0.6222222222222222
fold: 5, 0.6940298507462687


(mean = 0.665240464344942, std = 0.03724234983449279, folds = 5)

### Gradient Boosting learner

In [12]:
# Gradient-boosting pipeline: numeric columns only, robust-scaled and then decomposed
# with PCA, FastICA and FactorAnalysis; the three outputs are concatenated.
pgb = @pipeline (numf |> rb |> pca) + (numf |> rb |> ica) + (numf |> rb |> fa) |> gb
# Fit on the full dataset and predict on the same data.
pred = fit_transform!(pgb, X, Y)
# Accuracy against the labels used for fitting, i.e. a training-set score.
score(:accuracy, pred, Y)

89.58333333333334

In [13]:
# 5-fold cross-validation of the gradient-boosting pipeline, scored with accuracy_score.
crossvalidate(pgb, X, Y, "accuracy_score", 5)

fold: 1, 0.6268656716417911
fold: 2, 0.6962962962962963
fold: 3, 0.5597014925373134
fold: 4, 0.6888888888888889
fold: 5, 0.6194029850746269


(mean = 0.6382310668877833, std = 0.05609890533175638, folds = 5)

### Linear Support Vector Machine for Classification

In [14]:
# Linear-SVC pipeline: robust-scaled numeric columns through PCA, FactorAnalysis and
# FastICA, concatenated with the one-hot encoded categorical columns.
plsvc = @pipeline (
    (numf |> rb |> pca) +
    (numf |> rb |> fa) +
    (numf |> rb |> ica) +
    (catf |> ohe)
) |> lsvc
# Fit on the full dataset and predict on the same data.
pred = fit_transform!(plsvc, X, Y)
# Accuracy against the labels used for fitting, i.e. a training-set score.
score(:accuracy, pred, Y)

79.91071428571429

In [15]:
# 5-fold cross-validation of the linear-SVC pipeline, scored with accuracy_score.
crossvalidate(plsvc, X, Y, "accuracy_score", 5)

fold: 1, 0.7238805970149254
fold: 2, 0.6444444444444445
fold: 3, 0.746268656716418
fold: 4, 0.7185185185185186
fold: 5, 0.7611940298507462


(mean = 0.7188612493090105, std = 0.045017781073784874, folds = 5)

### RBF Kernel Support Vector Machine for Classification

In [16]:
# SVC pipeline: PCA and FastICA of the numeric columns (no scaling step here), the raw
# numeric columns themselves, and the one-hot encoded categorical columns.
psvc = @pipeline (numf |> pca) + numf + (numf |> ica) + (catf |> ohe) |> svc
# Fit on the full dataset and predict on the same data.
pred = fit_transform!(psvc, X, Y)
# Accuracy against the labels used for fitting, i.e. a training-set score.
score(:accuracy, pred, Y)

74.40476190476191

In [17]:
# 5-fold cross-validation of the SVC pipeline, scored with accuracy_score.
crossvalidate(psvc, X, Y, "accuracy_score", 5)

fold: 1, 0.6417910447761194
fold: 2, 0.7407407407407407
fold: 3, 0.6343283582089553
fold: 4, 0.6444444444444445
fold: 5, 0.7014925373134329


(mean = 0.6725594250967386, std = 0.04659553794490995, folds = 5)