In [87]:
using RCall
using DataFrames, DataFramesMeta
using CSV
using DataVoyager
using AMLPipelineBase
using AutoMLPipeline
using Statistics

using Distributed
# Add worker processes only when the session is still single-process, so that
# re-running this cell does not keep spawning workers.
if nprocs() == 1
    addprocs(; exeflags="--project")
end

# Make the packages used inside the distributed loops available on every process.
@everywhere using AutoMLPipeline, DataFrames, Statistics

# Display-size hints read from the environment (few rows, very wide lines).
ENV["LINES"] = 10
ENV["COLUMNS"] = 10000;

### Load Covid Data

Source: Our World in Data COVID-19 dataset, https://covid.ourworldindata.org/data/owid-covid-data.csv

In [7]:
"""
    getdata(path="../data/owid-covid-data.csv")

Read the OWID COVID CSV at `path`, aggregate it per `location` in R and return
the imputed summary table as a Julia object (via `rcopy`).

The data frame is copied into the R session under the name `df` (`@rput`), so
any existing R variable called `df` is overwritten. In R, every listed column
is averaged per location with `na.rm=T`, and the remaining gaps are filled by
`mice` with `meth='sample'`.

# Arguments
- `path`: location of the CSV file; defaults to the path used by this notebook.

# Notes
- Requires the R packages `tidyverse` and `mice`.
- NOTE(review): `meth='sample'` presumably draws replacement values at random,
  so results may differ between runs unless the R seed is fixed — confirm.
"""
function getdata(path::AbstractString="../data/owid-covid-data.csv")
   df = CSV.read(path,DataFrame);
   # The R code below refers to this table by the name `df`.
   @rput df
   R"""
      library(tidyverse)
      library(mice)

      sdf = df %>% group_by(location) %>% 
      summarise(
        mnewcases = mean(new_cases,na.rm=T),
        mnewdeaths = mean(new_deaths,na.rm=T),
        mpopulation = mean(population,na.rm=T),
        mcardio = mean(cardiovasc_death_rate,na.rm=T),
        mdiab = mean(diabetes_prevalence,na.rm=T),
        mhandwash = mean(handwashing_facilities,na.rm=T),
        mhbed = mean(hospital_beds_per_thousand,na.rm=T),
        mpatient = mean(hosp_patients,na.rm=T),
        madmission = mean(weekly_hosp_admissions,na.rm=T),
        mhdi = mean(human_development_index,na.rm=T),
        mle = mean(life_expectancy,na.rm=T)
      ) 
      imputed=mice(sdf,meth='sample',printFlag=F)
      complete(imputed)
   """ |> rcopy
end;

In [35]:
# Imputed per-location summary table.
dfimp = getdata()

# Predictors: every column except the location label and the target itself.
X = dfimp[:, Not([:location, :mnewdeaths])]

# Target: mean new deaths per location.
Y = dfimp[!, :mnewdeaths];

└ @ RCall /Users/ppalmes/.julia/packages/RCall/eRsxl/src/io.jl:160
└ @ RCall /Users/ppalmes/.julia/packages/RCall/eRsxl/src/io.jl:160


In [10]:
# Baseline in R: train a caret model with `method = "rf"` on the Julia objects
# X and Y, which RCall interpolates into the R code through `$X` and `$Y`.
# Resampling uses trainControl(method = "repeatedcv", number = 5); the final
# `rfcaret` line makes the fitted caret object the value of the cell.
R"""
library(caret)

fitControl <- trainControl(## 5-fold CV
                           method = "repeatedcv",
                           number = 5,
                          )
rfcaret <- train($X,$Y,  
                 method = "rf", 
                 trControl = fitControl,
                 )
rfcaret
"""

│ 
│ Attaching package: ‘caret’
│ 
│ The following object is masked from ‘package:purrr’:
│ 
│     lift
│ 
└ @ RCall /Users/ppalmes/.julia/packages/RCall/eRsxl/src/io.jl:160


RObject{VecSxp}
Random Forest 

190 samples
 10 predictor

No pre-processing
Resampling: Cross-Validated (5 fold, repeated 1 times) 
Summary of sample sizes: 152, 151, 153, 152, 152 
Resampling results across tuning parameters:

  mtry  RMSE      Rsquared   MAE     
   2    59.08423  0.6733149  21.33716
   6    49.07606  0.7537993  14.91031
  10    48.15070  0.7846335  13.48423

RMSE was used to select the optimal model using the smallest value.
The final value used for the model was mtry = 10.


In [71]:
using AutoMLPipeline

#### Decomposition (scikit-learn preprocessors selected by name)
pca = SKPreprocessor("PCA")
fa = SKPreprocessor("FactorAnalysis")
ica = SKPreprocessor("FastICA")

#### Scalers
rb = SKPreprocessor("RobustScaler")
pt = SKPreprocessor("PowerTransformer")
norm = SKPreprocessor("Normalizer")
mx = SKPreprocessor("MinMaxScaler")
mstd = SKPreprocessor("StandardScaler")

#### Categorical preprocessing; `noop` is an Identity element named "Noop"
ohe = OneHotEncoder()
noop = Identity(Dict(:name => "Noop"))

#### Column selectors
catf = CatFeatureSelector()
numf = NumFeatureSelector()

#### Learners: scikit-learn regressors (sk_*) and Julia-side learners/ensembles (jl_*)
sk_rf = SKLearner("RandomForestRegressor")
sk_gb = SKLearner("GradientBoostingRegressor")
sk_svr = SKLearner("SVR")
sk_gp = SKLearner("GaussianProcessRegressor")
sk_ard = SKLearner("ARDRegression")
jl_rf = RandomForest()
jl_vote = VoteEnsemble()
jl_stack = StackEnsemble()
jl_best = BestLearner();

In [72]:
# Numeric columns -> RobustScaler -> SVR; fit on (X, Y) and predict the same rows.
plsvc = @pipeline numf |> rb |> sk_svr
pred = fit_transform!(plsvc, X, Y)

190-element Array{Float64,1}:
  1.9947016077660855
  1.563442127520517
  3.4132272470427374
  0.8165241542589561
  1.4180225483173476
  0.97321360257612
 16.78504012363476
  2.567986932013074
  3.9583847301788952
  4.710616972286513
  2.880876084994698
  0.6787592751847766
  2.0291946736187603
 10.051261959262849
  1.8186288749064978
  3.7577187394637956
  8.545811513665335
  0.9985165935001552
  0.45993993880024675
  0.7836066497892746
  2.4289669007137675
  2.342838246753356
  0.9558367140371962
  7.707238226696546
  1.5067009570712955
  3.691756834293649
  0.32078481560837346
  0.5240855036773739
  1.1141984987107048
  1.6079858605584478
  7.177311302570495
  3.7759189733698615
  0.29266047847861465
  0.027356154088445273
  7.589918796681051
  7.676588011864249
 15.581892802530605
  0.3385358630418498
  0.7885790194452911
  2.823995876789053
  1.3142218946972086
  3.4331242811942784
  2.114963927460807
  1.6581760692361422
  7.912986451407457
  2.8497855613142957
  2.805684919614369

In [84]:
# Root-mean-square error between x and y, defined on every process so that
# code running on the workers can call it.
@everywhere function rmse(x, y)
    residuals = x .- y
    return sqrt(mean(residuals .^ 2))
end

# Numeric columns -> RobustScaler -> FastICA -> random-forest regressor.
# (The variable keeps the name `plsvc` although the learner here is `sk_rf`.)
plsvc = @pipeline ((numf |> rb |> ica )) |> sk_rf
# Score with `rmse`, the metric defined with @everywhere in this cell; no
# other metric function is defined in this notebook.
crossvalidate(plsvc,X,Y,rmse)

fold: 1, 15.385363620728153
fold: 2, 16.15377894552439
fold: 3, 56.57020873254884
fold: 4, 17.367717620616904
fold: 5, 8.615273868958266
fold: 6, 5.344016547484432
fold: 7, 46.401204634717985
fold: 8, 53.941357896864595
fold: 9, 26.814311295547505
fold: 10, 137.21184482441765
errors: 0


(mean = 38.38050779874087, std = 39.41300873914402, folds = 10, errors = 0)

In [85]:
# Cross-validate every learner x scaler x extractor combination on (X, Y) with
# nested distributed loops and return one row per pipeline (name, mean and sd
# of the metric, elapsed seconds, number of folds), sorted by ascending mean.
function prpsearch(X, Y)
    learners = [sk_rf, sk_gb, sk_svr, sk_ard, sk_gp, jl_rf]
    scalers = [rb, pt, norm, mx, mstd, noop]
    extractors = [ica, noop]
    results = @sync @distributed (vcat) for lr in learners
        @distributed (vcat) for sc in scalers
            @distributed (vcat) for xt in extractors
                candidate = @pipeline (numf |> sc |> xt) |> lr
                # Drop the last 4 characters of each element name
                # (presumably a generated "_xxx" suffix — confirm).
                scalertag = sc.name[1:end-4]
                extractortag = xt.name[1:end-4]
                learnertag = lr.name[1:end-4]
                label = "$scalertag |> $extractortag |> $learnertag"
                elapsed = @elapsed begin
                    avg, spread, nfolds, _ = crossvalidate(candidate, X, Y, rmse, 5, true)
                end
                DataFrame(pipeline=label, mean=avg, sd=spread, time=elapsed, folds=nfolds)
            end
        end
    end
    sort!(results, :mean, rev=false)
    return results
end

prpsearch (generic function with 1 method)

In [88]:
ENV["LINES"] = 1000

# Wall-clock time of the whole distributed search.
runtime = @elapsed (dfres = prpsearch(X, Y));

# Sum of the per-pipeline timings, i.e. the cost if they had run one after another.
serialtime = sum(dfres.time);

(serialtime = "$(round(serialtime / 60.0)) minutes",
 paralleltime = "$(round(runtime)) seconds")

      From worker 3:	fold: 1, 13.604523452548
      From worker 5:	fold: 1, 24.31309619750921
      From worker 4:	fold: 1, 80.26477808844527
      From worker 7:	fold: 1, 67.11608408502565
      From worker 5:	fold: 2, 9.456298568178553
      From worker 7:	fold: 1, 33.67827580789317
      From worker 5:	fold: 3, 21.8788945509615
      From worker 5:	fold: 4, 60.10066061996548
      From worker 4:	fold: 2, 29.704466641354166
      From worker 4:	fold: 3, 151.52456273243888
      From worker 6:	fold: 1, 137.98782722612634
      From worker 7:	fold: 2, 23.746907647917336
      From worker 4:	fold: 4, 109.05667844126138
      From worker 5:	fold: 5, 25.532258549816227
      From worker 5:	errors: 0
      From worker 5:	fold: 1, 85.44986902233022
      From worker 6:	fold: 2, 130.30846320194348
      From worker 4:	fold: 5, 52.14163983022657
      From worker 4:	errors: 0
      From worker 4:	fold: 1, 72.83927172792453
      From worker 6:	fold: 3, 50.01266807552282
      From worker 5:	f

      From worker 4:	fold: 2, 51.293384888574145
      From worker 5:	fold: 4, 49.138620486618
      From worker 4:	fold: 3, 541.8309940668594
      From worker 6:	fold: 5, 27.34563450758119
      From worker 6:	errors: 0
      From worker 6:	fold: 1, 152.4331495055207
      From worker 5:	fold: 5, 20.083815604747922
      From worker 5:	errors: 0
      From worker 5:	fold: 1, 132.464390873328
      From worker 7:	fold: 2, 53.223668003989346
      From worker 5:	fold: 2, 1114.9463121440872
      From worker 5:	fold: 3, 503.525821650417
      From worker 4:	fold: 4, 693.3968211731109
      From worker 5:	fold: 4, 24.170172353983137
      From worker 6:	fold: 2, 22.38468940130868
      From worker 5:	fold: 5, 960.3228097511677
      From worker 5:	errors: 0
      From worker 5:	fold: 1, 66.45110141516525
      From worker 4:	fold: 5, 119.38173295416856
      From worker 4:	errors: 0
      From worker 4:	fold: 1, 68.5012381279177
      From worker 6:	fold: 3, 87.42563751580553
      From 

      From worker 2:	fold: 4, 73.61335298322689
      From worker 2:	fold: 5, 42.29602767082701
      From worker 2:	errors: 0
      From worker 2:	fold: 1, 79.20768892341047
      From worker 2:	fold: 2, 102.36390670555585
      From worker 2:	fold: 3, 104.44604831879721
      From worker 2:	fold: 4, 96.04298591979064
      From worker 2:	fold: 5, 64.668394361083
      From worker 2:	errors: 0
      From worker 2:	fold: 1, 50.673297057912336
      From worker 2:	fold: 1, 126.625647549624
      From worker 2:	fold: 1, 28.67919398050893
      From worker 2:	fold: 2, 91.6254556538764
      From worker 2:	fold: 3, 31.386672141244382
      From worker 2:	fold: 4, 40.82221814847387
      From worker 2:	fold: 5, 94.17824235549249
      From worker 2:	errors: 0
      From worker 2:	fold: 1, 55.38930941739582
      From worker 2:	fold: 2, 101.38611300413702
      From worker 2:	fold: 3, 14.00027798103735
      From worker 2:	fold: 4, 90.26097449610631
      From worker 2:	fold: 5, 22.982639465

(serialtime = "6.0 minutes", paralleltime = "12.0 seconds")

In [89]:
dfres

Unnamed: 0_level_0,pipeline,mean,sd,time,folds
Unnamed: 0_level_1,String,Float64,Float64,Float64,Int64
1,RobustScaler |> FastICA |> ARDRegression,28.2562,18.9152,0.884838,5
2,Noop |> Noop |> ARDRegression,29.4067,11.7404,7.73995,5
3,RobustScaler |> Noop |> ARDRegression,29.485,16.6679,4.51522,5
4,MinMaxScaler |> FastICA |> ARDRegression,29.5211,18.6372,2.46762,5
5,Noop |> FastICA |> ARDRegression,29.7173,16.0288,2.90426,5
6,StandardScaler |> FastICA |> ARDRegression,31.7412,10.1188,3.06498,5
7,StandardScaler |> Noop |> ARDRegression,34.7005,34.0469,8.18524,5
8,Noop |> Noop |> GradientBoostingRegressor,35.4714,26.1603,7.51235,5
9,RobustScaler |> Noop |> GradientBoostingRegressor,38.3182,24.9221,4.50711,5
10,PowerTransformer |> Noop |> GradientBoostingRegressor,38.6402,28.299,4.16924,5


In [139]:
module RMachine

using RCall
using DataFrames
using Random

using AMLPipelineBase
using AMLPipelineBase.AbsTypes
using AMLPipelineBase.Utils


import AMLPipelineBase.AbsTypes: fit!, transform!

export CRTLearner,fit!,transform!,fit_transform!
# NOTE(review): `caretrun` is not defined in this module, and `crossvalidate`
# is presumably a re-export from AMLPipelineBase — confirm both are intended.
export caretrun, crossvalidate

# Learner identifiers accepted by CRTLearner (passed to R as `method`).
const RLearners=["rf","gbm"]

"""
    CRTLearner(args::Dict{Symbol,<:Any} = Dict{Symbol,Any}())

Learner that delegates training and prediction to R through RCall.

Recognised keys in `args` (merged over the defaults with `nested_dict_merge`):
- `:name`: base name; `"_"` plus 3 random characters is appended (default `"rlearner"`).
- `:learner`: one of `RLearners` (default `"rf"`).
- `:fitControl`: string of R code evaluated at fit time to build the
  `trControl` argument (default `"trainControl(method = 'none')"`).
- `:impl_args`: stored but not read anywhere in this module.

Throws `ArgumentError` when `:learner` is not in `RLearners`.
"""
mutable struct CRTLearner <: Learner
   name
   model
   # The default must be a Dict{Symbol,Any}: a bare `Dict()` is a Dict{Any,Any},
   # which does not match `Dict{Symbol,<:Any}`, so `CRTLearner()` would raise a
   # MethodError instead of using the defaults.
   function CRTLearner(args::Dict{Symbol,<:Any} = Dict{Symbol,Any}())
      fitControl="trainControl(method = 'none')"
      default_args = Dict{Symbol,Any}(
                      :name => "rlearner",
                      :learner => "rf",
                      :fitControl => fitControl,
                      :impl_args => Dict{Symbol,Any}()
                     )
      cargs = nested_dict_merge(default_args, args)
      # Random suffix so that separately created learners get distinct names.
      cargs[:name] = cargs[:name]*"_"*randstring(3)
      rl = cargs[:learner]
      if !(rl in RLearners)
         println("$rl is not supported.")
         println()
         throw(ArgumentError("Argument keyword error"))
      end
      new(cargs[:name],cargs)
   end
end

"""
    fit!(crt::CRTLearner, x::DataFrame, y::Vector)

Call R's `train` on `x` and `y` with `method = crt.model[:learner]` and
`trControl` obtained by evaluating `crt.model[:fitControl]` as R code, then
store the fitted R object in `crt.model[:rmodel]`.

This module never loads an R package itself, so `train` (presumably caret's)
must already be available in the R session.
"""
function fit!(crt::CRTLearner,x::DataFrame,y::Vector)
   # NOTE: :fitControl is evaluated as R code — never pass untrusted strings.
   rmodel = rcall(:train,x,y,method=crt.model[:learner],trControl = reval(crt.model[:fitControl]))
   crt.model[:rmodel] = rmodel
end

"""
    transform!(crt::CRTLearner, x::DataFrame)

Run R's `predict` with the model stored by `fit!` on `x` and return the
predictions converted to a Julia `Array`. `fit!` must have been called first,
otherwise `crt.model[:rmodel]` does not exist.
"""
function transform!(crt::CRTLearner,x::DataFrame)
   res = rcall(:predict,crt.model[:rmodel],x)
   return rcopy(res) |> Array
end


end



Main.RMachine

In [141]:
module RTest
using Test
using DataFrames
using RDatasets

using ..RMachine

# Smoke test for CRTLearner on the iris data: predicting the training rows
# must reproduce all 150 labels, and 3-fold cross-validated accuracy must
# exceed 0.80.
function caretrun()
    rf = CRTLearner(Dict(:learner => "rf"))
    # Constructed only; this learner is not fitted below.
    gbm = CRTLearner(Dict(:learner => "gbm"))
    iris = dataset("datasets", "iris")
    features = iris[:, 1:4]
    labels = Array{String}(iris[:, 5])
    @test sum(fit_transform!(rf, features, labels) .== labels) == 150
    cv = crossvalidate(rf, features, labels, "accuracy_score", 3, false)
    @test cv.mean > 0.80
end

@testset "rmachine tests" begin
    caretrun()
end
end

[37m[1mTest Summary:  | [22m[39m[32m[1mPass  [22m[39m[36m[1mTotal[22m[39m
rmachine tests | [32m   2  [39m[36m    2[39m




Main.RTest

In [150]:
@everywhere using JuliaR

In [161]:
# R-backed random-forest learner from JuliaR, created with base name "RFfromR".
r_rf = JuliaR.CRTLearner(Dict(:name => "RFfromR", :learner => "rf"));

In [162]:
# Cross-validate every learner x scaler x extractor combination on (X, Y) —
# now including the R-backed learner `r_rf` — with nested distributed loops.
# Returns one row per pipeline (name, mean and sd of the metric, elapsed
# seconds, number of folds), sorted by ascending mean.
function prpsearch(X, Y)
    learners = [sk_rf, sk_gb, sk_svr, sk_ard, sk_gp, jl_rf, r_rf]
    scalers = [rb, pt, norm, mx, mstd, noop]
    extractors = [ica, noop]
    results = @sync @distributed (vcat) for lr in learners
        @distributed (vcat) for sc in scalers
            @distributed (vcat) for xt in extractors
                candidate = @pipeline (numf |> sc |> xt) |> lr
                # Drop the last 4 characters of each element name
                # (presumably a generated "_xxx" suffix — confirm).
                scalertag = sc.name[1:end-4]
                extractortag = xt.name[1:end-4]
                learnertag = lr.name[1:end-4]
                label = "$scalertag |> $extractortag |> $learnertag"
                elapsed = @elapsed begin
                    avg, spread, nfolds, _ = crossvalidate(candidate, X, Y, rmse, 5, true)
                end
                DataFrame(pipeline=label, mean=avg, sd=spread, time=elapsed, folds=nfolds)
            end
        end
    end
    sort!(results, :mean, rev=false)
    return results
end

# Wall-clock time of the whole distributed search.
runtime = @elapsed (dfres = prpsearch(X, Y));

# Sum of the per-pipeline timings, i.e. the cost if they had run one after another.
serialtime = sum(dfres.time);

(serialtime = "$(round(serialtime / 60.0)) minutes",
 paralleltime = "$(round(runtime)) seconds")

      From worker 8:	fold: 1, 17.37625575290625
      From worker 7:	fold: 1, 16.07191851750751
      From worker 7:	fold: 1, 19.181015487619263
      From worker 7:	fold: 2, 98.85924279948307
      From worker 8:	fold: 2, 71.20823953939491
      From worker 7:	fold: 3, 38.5244147269735
      From worker 4:	fold: 1, 95.15695930007472
      From worker 5:	fold: 1, 9.240200905707054
      From worker 3:	fold: 1, 39.82829273265745
      From worker 4:	fold: 2, 14.615407461674936
      From worker 5:	fold: 2, 27.0093095126894
      From worker 4:	fold: 3, 101.40513138011558
      From worker 6:	fold: 1, 77.23381225142415
      From worker 4:	fold: 4, 158.35594186613275
      From worker 4:	fold: 5, 18.341582595914815
      From worker 4:	errors: 0
      From worker 4:	fold: 1, 71.8370705318638
      From worker 6:	fold: 2, 1436.3862661093217
      From worker 5:	fold: 3, 76.94722996327246
      From worker 6:	fold: 3, 937.4681504106672
      From worker 5:	fold: 4, 18.354640985784016
     

      From worker 3:	fold: 2, 48580.62637539865
      From worker 5:	fold: 3, 101.79749564946384
      From worker 6:	fold: 5, 30.018291038972464
      From worker 6:	errors: 0
      From worker 6:	fold: 1, 38.638824819774314
      From worker 6:	fold: 2, 34.313481464378576
      From worker 5:	fold: 4, 52.504349872796894
      From worker 3:	fold: 3, 39085.61944481998
      From worker 5:	fold: 5, 9.688070360547842
      From worker 5:	errors: 0
      From worker 3:	fold: 4, 97260.07305294329
      From worker 6:	fold: 3, 47.98165793034724
      From worker 3:	fold: 5, 39292.132179397966
      From worker 3:	errors: 0
      From worker 3:	fold: 1, 140.98329876579325
      From worker 6:	fold: 4, 178.17533300924734
      From worker 3:	fold: 1, 42.752850936412045
      From worker 6:	fold: 5, 90.16055809198498
      From worker 6:	errors: 0
      From worker 6:	fold: 1, 38.53542100812605
      From worker 6:	fold: 2, 118.17097538274744
      From worker 6:	fold: 3, 53.20845875349788
  

      From worker 2:	fold: 2, 58.988138420919704
      From worker 2:	fold: 3, 18.047818069642798
      From worker 2:	fold: 4, 39.223229599951665
      From worker 2:	fold: 5, 55.102607715149915
      From worker 2:	errors: 0
      From worker 2:	fold: 1, 135.4113343319633
      From worker 2:	fold: 2, 44.736533153638455
      From worker 2:	fold: 3, 24.243062192914376
      From worker 2:	fold: 4, 149.10954840089857
      From worker 2:	fold: 5, 39.81740835889948
      From worker 2:	errors: 0
      From worker 2:	fold: 1, 6.471334328891898
      From worker 2:	fold: 1, 105.81622378155014
      From worker 2:	fold: 2, 61.44529763745993
      From worker 2:	fold: 3, 41.01679261859065
      From worker 2:	fold: 4, 87.3808246094166
      From worker 2:	fold: 5, 50.031485200093684
      From worker 2:	errors: 0
      From worker 2:	fold: 1, 19.069973112004707
      From worker 2:	fold: 2, 134.82431108027032
      From worker 2:	fold: 3, 120.92580245847199
      From worker 2:	fold: 4, 18

(serialtime = "14.0 minutes", paralleltime = "29.0 seconds")

In [163]:
dfres

Unnamed: 0_level_0,pipeline,mean,sd,time,folds
Unnamed: 0_level_1,String,Float64,Float64,Float64,Int64
1,Noop |> Noop |> ARDRegression,27.5552,33.9223,18.1298,5
2,RobustScaler |> FastICA |> ARDRegression,29.7369,27.1329,2.37118,5
3,MinMaxScaler |> FastICA |> ARDRegression,30.4747,12.9025,3.57303,5
4,Noop |> FastICA |> ARDRegression,31.6987,22.1974,2.89394,5
5,MinMaxScaler |> FastICA |> GradientBoostingRegressor,34.3902,32.2186,4.21285,5
6,StandardScaler |> FastICA |> ARDRegression,34.7098,20.0542,4.42542,5
7,MinMaxScaler |> Noop |> ARDRegression,36.2061,20.0381,14.8857,5
8,Noop |> Noop |> GradientBoostingRegressor,37.87,23.9057,18.1809,5
9,RobustScaler |> Noop |> GradientBoostingRegressor,39.0068,24.4975,9.57602,5
10,RobustScaler |> Noop |> ARDRegression,39.7233,24.7903,9.56614,5
