# Meta-Modeling: Combine.jl

The ideas in this notebook are adapted from Samuel Jenkins's Orchestra.jl (https://github.com/svs14/Orchestra.jl).

In [19]:
using Combine.Util
using Combine.Transformers
import RDatasets

In [20]:
# Load the iris dataset; the four measurement columns become the feature
# matrix X and the Species column becomes the label vector y.
iris = RDatasets.dataset("datasets", "iris")
feature_columns = [:SepalLength, :SepalWidth, :PetalLength, :PetalWidth]
X = convert(Array, iris[feature_columns])
y = convert(Array, iris[:Species]);

# Partition the row indices of X into training and test index sets.
train_ind, test_ind = holdout(size(X, 1), 0.3)

([9,41,129,131,118,92,1,89,95,44  …  120,126,101,13,133,53,96,78,74,21],[85,116,19,88,50,49,69,147,108,109  …  86,112,22,38,8,140,122,73,91,102])

In [3]:
# Construct a PrunedTree learner with no explicit options; the options it ends
# up with are visible in the displayed value below.
prunedTreeLearner = PrunedTree()

Combine.Transformers.DecisionTreeWrapper.PrunedTree(nothing,Dict{Symbol,Any}(Pair{Symbol,Any}(:impl_options,Dict(:purity_threshold=>1.0)),Pair{Symbol,Any}(:output,:class)))

In [4]:
# Assemble the transformer list (preprocessing steps first, learner last)
# and wrap it in a Pipeline.
pipeline_steps = [
    OneHotEncoder(),      # Encodes nominal features into numeric
    Imputer(),            # Imputes NA values
    #StandardScaler(),    # Standardizes features
    prunedTreeLearner     # Predicts labels on instances
]
pipeline = Pipeline(Dict(:transformers => pipeline_steps))

Combine.Transformers.CombineTransformers.Pipeline(nothing,Dict{Symbol,Any}(Pair{Symbol,Any}(:transformers,Combine.Types.Transformer[Combine.Transformers.CombineTransformers.OneHotEncoder(nothing,Dict(:nominal_columns=>nothing,:nominal_column_values_map=>nothing)),Combine.Transformers.CombineTransformers.Imputer(nothing,Dict(:strategy=>mean)),Combine.Transformers.DecisionTreeWrapper.PrunedTree(nothing,Dict{Symbol,Any}(Pair{Symbol,Any}(:impl_options,Dict(:purity_threshold=>1.0)),Pair{Symbol,Any}(:output,:class)))]),Pair{Symbol,Any}(:transformer_options,nothing)))

In [5]:
# Train: fit the pipeline on the training rows of X and their labels.
fit!(pipeline, X[train_ind, :], y[train_ind]);

In [6]:
# Predict: run the fitted pipeline on the held-out test rows of X.
predictions = transform!(pipeline, X[test_ind, :]);

In [7]:
# Manual accuracy: percentage of test predictions equal to the true labels.
num_correct = sum(predictions .== y[test_ind])
num_correct / length(predictions) * 100

97.77777777777777

In [8]:
# Compute the same accuracy through `score`; the printed value matches the
# manual percentage calculated in the previous cell.
result = score(:accuracy, y[test_ind], predictions)
println(result)

97.77777777777777


In [9]:
# Train and evaluate `learner` on a fresh holdout split of the iris dataset.
#
# Arguments:
#   learner            - transformer placed last in the pipeline; it predicts
#                        the labels.
#   holdout_proportion - keyword; passed to `holdout` as its second argument.
#                        Defaults to 0.3, the value that was previously
#                        hard-coded, so existing one-argument calls behave
#                        exactly as before.
#
# Returns the value of `score(:accuracy, ...)` computed on the held-out rows.
#
# Every call reloads the dataset and calls `holdout` again, so the score can
# differ from one call to the next.
# NOTE(review): `fit!` is applied to a pipeline that contains the caller's
# `learner` object itself (no copy is made) - confirm that reusing one learner
# across repeated calls is intended.
function processModel(learner; holdout_proportion=0.3)
    iris = RDatasets.dataset("datasets", "iris")
    X = convert(Array, iris[[:SepalLength, :SepalWidth, :PetalLength, :PetalWidth]])
    y = convert(Array, iris[:Species])
    (train_ind, test_ind) = holdout(size(X, 1), holdout_proportion)
    pipeline = Pipeline(Dict(
            :transformers => [
                OneHotEncoder(), # Encodes nominal features into numeric
                Imputer(), # Imputes NA values
                #StandardScaler(), # Standardizes features
                learner # Predicts labels on instances
            ]
        ))
    # Train on the training rows only.
    fit!(pipeline, X[train_ind, :], y[train_ind])
    # Predict on the held-out rows and score against their true labels.
    predictions = transform!(pipeline, X[test_ind, :])
    return score(:accuracy, y[test_ind], predictions)
end

processModel (generic function with 1 method)

In [10]:
# Options specific to this implementation: number of boosting iterations.
ada_impl_options = Dict(:num_iterations => 7)

# DecisionStumpAdaboost learner trained against the class output (:class).
adaLearner = DecisionStumpAdaboost(Dict(
    :output => :class,
    :impl_options => ada_impl_options
))
processModel(adaLearner)

71.11111111111111

In [11]:
# Implementation options handed to the RandomForest learner.
rf_impl_options = Dict(
    :num_subfeatures => nothing,
    :num_trees => 10,
    :partial_sampling => 0.7
)

# RandomForest learner trained against the class output, then evaluated.
rfLearner = RandomForest(Dict(
    :output => :class,
    :impl_options => rf_impl_options
))
processModel(rfLearner)

97.77777777777777

In [12]:
using ScikitLearn
@sk_import neighbors: KNeighborsClassifier
@sk_import svm: SVC

# SKLLearner configured by learner name; "SVC" is active and
# "KNeighborsClassifier" is kept as a commented-out alternative.
# No implementation options are supplied (empty Dict).
sk_options = Dict(
    :output => :class,
    #:learner => "KNeighborsClassifier",
    :learner => "SVC",
    :impl_options => Dict()
)
skLearner = SKLLearner(sk_options)
processModel(skLearner)

[1m[34mINFO: Recompiling stale cache file /Users/ppalmes/.julia/lib/v0.5/ScikitLearn.ji for module ScikitLearn.


95.55555555555556

In [13]:
# Learners in the voting committee, each constructed with default options.
vote_committee = [RandomForest(), PrunedTree(), DecisionStumpAdaboost()]

# VoteEnsemble over the committee, trained against the class output.
voteLearner = VoteEnsemble(Dict(
    :output => :class,
    :learners => vote_committee
))
processModel(voteLearner)

97.77777777777777

In [14]:
# Partition generator: k-fold partitions (k = 5) over the number of rows in X.
make_partitions = (X, y) -> kfold(size(X, 1), 5)

# Selection function: average the score matrix along dimension 2 and return
# the index of the largest mean.
# NOTE(review): presumably one row per candidate learner - confirm.
pick_best = (learner_partition_scores) -> findmax(mean(learner_partition_scores, 2))[2]

# Candidate learners, each constructed with default options.
best_candidates = [PrunedTree(), DecisionStumpAdaboost(), RandomForest()]

bestLearner = BestLearner(Dict(
    :output => :class,
    :partition_generator => make_partitions,
    :selection_function => pick_best,
    :score_type => Real,
    :learners => best_candidates,
    :learner_options_grid => nothing
))
processModel(bestLearner)

95.55555555555556

In [15]:
# Base learners for the stack: three default-constructed learners plus the
# SKL, voting and best-learner instances configured in earlier cells.
stack_base_learners = [
    PrunedTree(),
    DecisionStumpAdaboost(),
    RandomForest(),
    skLearner,
    voteLearner,
    bestLearner
]

# StackEnsemble with a RandomForest stacker on top of the base learners.
stackLearner = StackEnsemble(Dict(
    :output => :class,
    :learners => stack_base_learners,
    :stacker => RandomForest(),
    # Proportion of training set left to train stacker itself.
    :stacker_training_proportion => 0.3,
    :keep_original_features => false
))
processModel(stackLearner)

100.0

In [16]:
# Evaluate the stacked ensemble 30 times; `vcat` reduces the per-iteration
# scores into a single vector.
results = @parallel (vcat) for trial = 1:30
    processModel(stackLearner)
end

# Report the rounded mean and standard deviation of the collected scores.
mean_accuracy = round(mean(results))
std_accuracy = round(std(results))
println("acc = ", mean_accuracy, " +/- ", std_accuracy)

acc = 94.0 +/- 3.0


In [17]:
# Display the individual accuracy scores collected by the loop above.
results

30-element Array{Float64,1}:
 97.7778
 95.5556
 93.3333
 93.3333
 91.1111
 93.3333
 93.3333
 97.7778
 91.1111
 97.7778
 88.8889
 93.3333
 91.1111
  ⋮     
 95.5556
 95.5556
 95.5556
 88.8889
 84.4444
 93.3333
 97.7778
 91.1111
 91.1111
 91.1111
 93.3333
 95.5556

In [18]:
#svmcrt = CRTLearner(Dict(
  # Output to train against
  # (:class).
  #:output => :class,
  #:learner => "rf",
  #:learner => "svmLinear2",
  #:learner => "rpart",
  #:learner => "lda",
  #:impl_options => Dict()
#))