In [17]:
using RCall
using DataFrames, DataFramesMeta
using CSV
using DataVoyager
using AMLPipelineBase
using AutoMLPipeline
using Statistics

using Distributed
nprocs() == 1 && addprocs(;exeflags="--project")
@everywhere begin
    using AutoMLPipeline
    using DataFrames
    using Statistics
end

ENV["LINES"] = 10
ENV["COLUMNS"]=10000;

### Load Covid Data

source: https://covid.ourworldindata.org/data/owid-covid-data.csv

In [18]:
function getdata()
   df = CSV.read("../data/owid-covid-data.csv",DataFrame);
   @rput df
   R"""
      library(tidyverse)
      library(mice)

      sdf = df %>% group_by(location) %>% 
      summarise(
        mnewcases = mean(new_cases,na.rm=T),
        mnewdeaths = mean(new_deaths,na.rm=T),
        mpopulation = mean(population,na.rm=T),
        mcardio = mean(cardiovasc_death_rate,na.rm=T),
        mdiab = mean(diabetes_prevalence,na.rm=T),
        mhandwash = mean(handwashing_facilities,na.rm=T),
        mhbed = mean(hospital_beds_per_thousand,na.rm=T),
        mpatient = mean(hosp_patients,na.rm=T),
        madmission = mean(weekly_hosp_admissions,na.rm=T),
        mhdi = mean(human_development_index,na.rm=T),
        mle = mean(life_expectancy,na.rm=T)
      ) 
      imputed=mice(sdf,meth='sample',printFlag=F)
      complete(imputed)
   """ |> rcopy
end;

In [19]:
dfimp = getdata()
X = dfimp[:,Not([:location,:mnewdeaths])]
Y = dfimp.mnewdeaths;

└ @ RCall /Users/ppalmes/.julia/packages/RCall/eRsxl/src/io.jl:160
└ @ RCall /Users/ppalmes/.julia/packages/RCall/eRsxl/src/io.jl:160


### Baseline performance using Random Forest

In [20]:
R"""
library(caret)

fitControl <- trainControl(method = "repeatedcv",number = 5)
rfcaret <- train($X,$Y, method = "rf",trControl = fitControl)
rfcaret
"""

RObject{VecSxp}
Random Forest 

190 samples
 10 predictor

No pre-processing
Resampling: Cross-Validated (5 fold, repeated 1 times) 
Summary of sample sizes: 151, 153, 153, 151, 152 
Resampling results across tuning parameters:

  mtry  RMSE      Rsquared   MAE     
   2    61.02000  0.6183065  23.48515
   6    51.84704  0.6960727  17.08586
  10    52.35961  0.7024063  15.34290

RMSE was used to select the optimal model using the smallest value.
The final value used for the model was mtry = 6.


#### NOTE: Best RMSE is around 44

### Use AutoML to find better RMSE by searching optimal pipelines

In [21]:
using AutoMLPipeline

#### Decomposition
pca = SKPreprocessor("PCA")
fa  = SKPreprocessor("FactorAnalysis")
ica = SKPreprocessor("FastICA")

#### Scaler 
rb   = SKPreprocessor("RobustScaler")
pt   = SKPreprocessor("PowerTransformer")
norm = SKPreprocessor("Normalizer")
mx   = SKPreprocessor("MinMaxScaler")
mstd    = SKPreprocessor("StandardScaler")

#### categorical preprocessing
ohe = OneHotEncoder()
noop = Identity(Dict(:name=>"Noop"))

#### Column selector
catf = CatFeatureSelector()
numf = NumFeatureSelector()

#### Learners
sk_rf       = SKLearner("RandomForestRegressor")
sk_gb       = SKLearner("GradientBoostingRegressor")
sk_svr      = SKLearner("SVR")
sk_gp       = SKLearner("GaussianProcessRegressor")
sk_ard      = SKLearner("ARDRegression")
jl_rf       = RandomForest()
jl_vote     = VoteEnsemble()
jl_stack    = StackEnsemble()
jl_best     = BestLearner();

In [22]:
plsvc = @pipeline  numf |> rb |> sk_svr
pred = fit_transform!(plsvc,X,Y)

190-element Array{Float64,1}:
 1.545499224143862
 1.6262378864793403
 4.359239387528533
 ⋮
 1.0221421504968689
 0.8491585976134459

In [23]:
@everywhere rmse(x,y) = sqrt(mean((x .- y).^2))

plsvc = @pipeline ((numf |> rb |> ica )) |> sk_gb
crossvalidate(plsvc,X,Y,rmse)

fold: 1, 6.743727618927776
fold: 2, 11.412466967613046
fold: 3, 18.20752085294149
fold: 4, 33.41345125291111
fold: 5, 74.08810729497955
fold: 6, 93.00217981052387
fold: 7, 38.997278580339966
fold: 8, 62.49401848294808
fold: 9, 8.584518289085185
fold: 10, 64.16899323574091
errors: 0


(mean = 41.1112262386011, std = 30.692855481251648, folds = 10, errors = 0)

### Parallel algo to search optimal combinations of preprocessing elements and learner

In [24]:
function prpsearch(X,Y)
    learners = [sk_rf,sk_gb,sk_svr,sk_ard,sk_gp,jl_rf];
    scalers = [rb,pt,norm,mx,mstd,noop];
    extractors = [ica,noop];
    dftable = @sync @distributed (vcat) for lr in learners
    @distributed (vcat) for sc in scalers
     @distributed (vcat) for xt  in extractors
          pipe  = @pipeline (numf |> sc |> xt)  |> lr
          scn   = sc.name[1:end - 4]; xtn = xt.name[1:end - 4]; lrn = lr.name[1:end - 4]
          pname = "$scn |> $xtn |> $lrn"
          ptime = @elapsed begin
             mean, sd, kfold, _ = crossvalidate(pipe, X, Y, rmse, 5,true)
          end
          DataFrame(pipeline=pname, mean=mean, sd=sd, time=ptime, folds=kfold)
        end
     end
    end
    sort!(dftable, :mean, rev=false);
    dftable
end

prpsearch (generic function with 1 method)

In [25]:
ENV["LINES"]=1000

runtime = @elapsed begin
    dfres = prpsearch(X,Y)
end;
serialtime = dfres.time |> sum;

(serialtime = "$(round(serialtime / 60.0)) minutes", 
    paralleltime = "$(round(runtime)) seconds")

      From worker 4:	fold: 1, 55.618515824596095
      From worker 4:	fold: 2, 117.07759225697248
      From worker 5:	fold: 1, 84.96770590460763
      From worker 3:	fold: 1, 50.390389333583684
      From worker 4:	fold: 3, 65.74017868600878
      From worker 6:	fold: 1, 48.45460977174831
      From worker 7:	fold: 1, 21.74677563821194
      From worker 4:	fold: 4, 38.49457713236518
      From worker 5:	fold: 2, 40.31302851256214
      From worker 6:	fold: 2, 903.580332927692
      From worker 7:	fold: 1, 176.26646773606478
      From worker 5:	fold: 3, 47.98734099496711
      From worker 7:	fold: 2, 43.7269520324854
      From worker 4:	fold: 5, 149.37835777658435
      From worker 6:	fold: 3, 212.45898992586095
      From worker 4:	errors: 0
      From worker 4:	fold: 1, 63.35747514525789
      From worker 7:	fold: 3, 83.36026580487345
      From worker 6:	fold: 4, 67.07236700024602
      From worker 7:	fold: 4, 62.76969491024395
      From worker 5:	fold: 4, 12.899674727306195
    

      From worker 3:	fold: 2, 23.02324063257822
      From worker 2:	fold: 5, 141.34920466178056
      From worker 2:	errors: 0
      From worker 2:	fold: 1, 49.859515022765635
      From worker 2:	fold: 1, 22.05612906440077
      From worker 2:	fold: 2, 166.70227025662803
      From worker 2:	fold: 3, 53.43886635202109
      From worker 2:	fold: 4, 106.33094219828753
      From worker 2:	fold: 5, 32.020497000961605
      From worker 2:	errors: 0
      From worker 2:	fold: 1, 18.150891146442007
      From worker 3:	fold: 3, 29.460747881457724
      From worker 2:	fold: 2, 12.441555715107878
      From worker 3:	fold: 4, 91.36108982675718
      From worker 2:	fold: 3, 56.42993632309373
      From worker 3:	fold: 5, 53.4382110536144
      From worker 3:	errors: 0
      From worker 3:	fold: 1, 93.26520350652663
      From worker 3:	fold: 2, 148.10610853758882
      From worker 3:	fold: 3, 53.78745388131175
      From worker 3:	fold: 4, 42.49676270411647
      From worker 3:	fold: 5, 91.60

      From worker 2:	fold: 5, 40.068476263260635
      From worker 2:	errors: 0
      From worker 2:	fold: 1, 34.46212246160417
      From worker 2:	fold: 2, 18.921653397684807
      From worker 2:	fold: 3, 10.138098268494083
      From worker 2:	fold: 4, 28.687028737689662
      From worker 2:	fold: 5, 91.11167869387165
      From worker 2:	errors: 0
      From worker 2:	fold: 1, 125.4733888719907
      From worker 2:	fold: 1, 78.38546172359223
      From worker 2:	fold: 2, 151.3619555369334
      From worker 2:	fold: 3, 29.88940070484285
      From worker 2:	fold: 4, 109.44244728673875
      From worker 2:	fold: 5, 47.23131369304999
      From worker 2:	errors: 0
      From worker 2:	fold: 1, 36.3153363538389
      From worker 2:	fold: 2, 21.97991674073669
      From worker 2:	fold: 3, 35.791058588139194
      From worker 2:	fold: 4, 40.83654343709692
      From worker 2:	fold: 5, 101.1188595443183
      From worker 2:	errors: 0
      From worker 2:	fold: 1, 34.88776586064804
      F

(serialtime = "8.0 minutes", paralleltime = "15.0 seconds")

In [26]:
dfres

Unnamed: 0_level_0,pipeline,mean,sd,time,folds
Unnamed: 0_level_1,String,Float64,Float64,Float64,Int64
1,Noop |> Noop |> ARDRegression,27.9648,13.7292,10.3579,5
2,MinMaxScaler |> FastICA |> ARDRegression,30.2844,15.4899,3.57426,5
3,Noop |> FastICA |> ARDRegression,31.0432,14.6194,4.17557,5
4,MinMaxScaler |> Noop |> ARDRegression,32.6249,14.2134,8.75153,5
5,StandardScaler |> FastICA |> ARDRegression,33.4454,8.99663,4.6386,5
6,StandardScaler |> Noop |> ARDRegression,36.6641,31.8255,9.94754,5
7,RobustScaler |> Noop |> ARDRegression,39.0926,26.2172,6.26191,5
8,PowerTransformer |> Noop |> RandomForestRegressor,39.2263,31.1596,8.80404,5
9,RobustScaler |> FastICA |> ARDRegression,39.7136,29.9239,1.84589,5
10,MinMaxScaler |> Noop |> GradientBoostingRegressor,41.0551,25.5872,9.38505,5


### Extend the pipeline with R ML implementations

In [27]:
module RMachine

using RCall
using DataFrames
using Random

using AMLPipelineBase
using AMLPipelineBase.AbsTypes
using AMLPipelineBase.Utils

import AMLPipelineBase.AbsTypes: fit!, transform!

export CRTLearner,fit!,transform!,fit_transform!
export caretrun, crossvalidate

mutable struct CRTLearner <: Learner
   name
   model
   function CRTLearner(args::Dict{Symbol,<:Any} = Dict())
      fitControl="trainControl(method = 'none')"
      default_args = Dict{Symbol,Any}(
                      :name => "R_learner",
                      :learner => "rf",
                      :fitControl => fitControl,
                      :impl_args => Dict{Symbol,Any}()
                     )
      cargs = nested_dict_merge(default_args, args)
      cargs[:name] = cargs[:name]*"_"*randstring(3)
      rl = cargs[:learner]
      new(cargs[:name],cargs)
   end
end

function fit!(crt::CRTLearner,x::DataFrame,y::Vector)
   rmodel = rcall(:train,x,y,method=crt.model[:learner],trControl = reval(crt.model[:fitControl]))
   crt.model[:rmodel] = rmodel
end

function transform!(crt::CRTLearner,x::DataFrame)
   res = rcall(:predict,crt.model[:rmodel],x)
   return rcopy(res) |> Array
end


end



Main.RMachine

In [28]:
module RTest
using Test
using DataFrames
using RDatasets

using ..RMachine

function test_r()
   rf = CRTLearner(Dict(:learner=>"rf"))
   gbm = CRTLearner(Dict(:learner=>"gbm"))
   iris=dataset("datasets","iris")
   x=iris[:,1:4]
   y=iris[:,5] |> Array{String}
   @test (fit_transform!(rf,x,y) .== y ) |> sum == 150
   @test crossvalidate(rf,x,y,"accuracy_score",3,false).mean > 0.80
end
@testset "rmachine tests" begin
   test_r()
end
end

[37m[1mTest Summary:  | [22m[39m[32m[1mPass  [22m[39m[36m[1mTotal[22m[39m
rmachine tests | [32m   2  [39m[36m    2[39m




Main.RTest

In [29]:
@everywhere using AMLPipelineR

In [30]:
r_rf = CRTLearner(Dict(:name=>"RF_fromR",:learner=>"rf"));

In [31]:
function prpsearch(X,Y)
    learners = [sk_rf,sk_gb,sk_svr,sk_ard,sk_gp,jl_rf,r_rf];
    scalers = [rb,pt,norm,mx,mstd,noop];
    extractors = [ica,noop];
    dftable = @sync @distributed (vcat) for lr in learners
    @distributed (vcat) for sc in scalers
     @distributed (vcat) for xt  in extractors
          pipe  = @pipeline (numf |> sc |> xt)  |> lr
          scn   = sc.name[1:end - 4]; xtn = xt.name[1:end - 4]; lrn = lr.name[1:end - 4]
          pname = "$scn |> $xtn |> $lrn"
          ptime = @elapsed begin
             mean, sd, kfold, _ = crossvalidate(pipe, X, Y, rmse, 3,true)
          end
          DataFrame(pipeline=pname, mean=mean, sd=sd, time=ptime, folds=kfold)
        end
     end
    end
    sort!(dftable, :mean, rev=false);
    dftable
end

runtime = @elapsed begin
    dfres = prpsearch(X,Y)
end;
serialtime = dfres.time |> sum;

(serialtime = "$(round(serialtime / 60.0)) minutes", 
    paralleltime = "$(round(runtime)) seconds")

      From worker 7:	fold: 1, 47.60562200265507
      From worker 7:	fold: 1, 18.95241792939585
      From worker 7:	fold: 2, 65.6203915646111
      From worker 8:	fold: 1, 109.8191179594075
      From worker 7:	fold: 3, 55.36644826502138
      From worker 7:	errors: 0
      From worker 7:	fold: 1, 99.75013961662336
      From worker 7:	fold: 2, 23.93526649100652
      From worker 7:	fold: 3, 128.84891796678903
      From worker 7:	errors: 0
      From worker 7:	fold: 1, 39.43892752357571
      From worker 7:	fold: 2, 71.53869219006133
      From worker 3:	fold: 1, 32.749056365322986
      From worker 7:	fold: 3, 26.664310479628487
      From worker 7:	errors: 0
      From worker 7:	fold: 1, 75.87245315842128
      From worker 7:	fold: 2, 83.78577840544183
      From worker 8:	fold: 2, 46.5246963889781
      From worker 4:	fold: 1, 63.53362389755619
      From worker 7:	fold: 3, 348.25221105259635
      From worker 7:	errors: 0
      From worker 7:	fold: 1, 53.99459050484783
      From

      From worker 2:	fold: 3, 61.194619010776755
      From worker 2:	errors: 0
      From worker 2:	fold: 1, 44.29177581077011
      From worker 2:	fold: 2, 153.387778218188
      From worker 2:	fold: 3, 56.535393762542824
      From worker 2:	errors: 0
      From worker 2:	fold: 1, 83.51182088784711
      From worker 2:	fold: 2, 33.8453406678255
      From worker 2:	fold: 3, 137.27004741907007
      From worker 2:	errors: 0
      From worker 2:	fold: 1, 49.69616929203554
      From worker 2:	fold: 2, 55.22355443742835
      From worker 2:	fold: 3, 10.100231686457413
      From worker 2:	errors: 0
      From worker 2:	fold: 1, 47.719848651459024
      From worker 2:	fold: 1, 79.66754248490545
      From worker 2:	fold: 2, 58.423479270448205
      From worker 2:	fold: 3, 35.21902338745926
      From worker 2:	errors: 0
      From worker 2:	fold: 1, 71.28759915912616
      From worker 2:	fold: 2, 77.04705796020792
      From worker 2:	fold: 3, 36.25607533636381
      From worker 2:	erro

(serialtime = "9.0 minutes", paralleltime = "20.0 seconds")

In [32]:
dfres

Unnamed: 0_level_0,pipeline,mean,sd,time,folds
Unnamed: 0_level_1,String,Float64,Float64,Float64,Int64
1,Noop |> Noop |> ARDRegression,26.8347,14.1514,11.8771,3
2,MinMaxScaler |> Noop |> ARDRegression,30.2363,11.1938,3.14222,3
3,RobustScaler |> Noop |> ARDRegression,31.4547,8.30047,12.8923,3
4,RobustScaler |> FastICA |> ARDRegression,37.2723,5.8069,4.33922,3
5,StandardScaler |> Noop |> ARDRegression,38.34,24.612,3.75042,3
6,MinMaxScaler |> FastICA |> GradientBoostingRegressor,42.3267,19.6701,3.08776,3
7,StandardScaler |> FastICA |> RandomForestRegressor,44.177,23.3355,4.40624,3
8,Noop |> FastICA |> ARDRegression,45.8806,23.1203,2.10264,3
9,Noop |> FastICA |> RandomForestRegressor,46.6464,24.5256,2.17816,3
10,RobustScaler |> FastICA |> RandomForestRegressor,48.5624,28.9416,9.83627,3
