In [16]:
using CombineML.Util
using CombineML.Transformers
import RDatasets

In [17]:
iris = RDatasets.dataset("datasets", "iris")
X = convert(Array, iris[[:SepalLength, :SepalWidth, :PetalLength, :PetalWidth]])
y = convert(Array, iris[:Species]);

# Split into training and test sets
(train_ind, test_ind) = holdout(size(X, 1), 0.3)

([42, 17, 18, 149, 89, 3, 23, 115, 52, 83  …  113, 48, 28, 141, 40, 94, 71, 41, 33, 27], [123, 132, 125, 106, 63, 68, 95, 31, 96, 15  …  143, 139, 73, 90, 150, 38, 126, 47, 58, 39])

In [18]:
prunedTreeLearner = PrunedTree()

PrunedTree(nothing, Dict{Symbol,Any}(:output=>:class,:impl_options=>Dict{Symbol,Real}(:purity_threshold=>1.0,:min_samples_split=>2,:min_samples_leaf=>1,:min_purity_increase=>0.0,:max_depth=>-1)))

In [19]:
pipeline1 = Pipeline(Dict(
        :transformers => [
            OneHotEncoder(), # Encodes nominal features into numeric
            Imputer(), # Imputes NA values
            #StandardScaler(), # Standardizes features 
            prunedTreeLearner # Predicts labels on instances
        ]
    ))

Pipeline(nothing, Dict{Symbol,Union{Nothing, Array{Transformer,1}}}(:transformers=>Transformer[OneHotEncoder(nothing, Dict(:nominal_column_values_map=>nothing,:nominal_columns=>nothing)), Imputer(nothing, Dict(:strategy=>mean)), PrunedTree(nothing, Dict{Symbol,Any}(:output=>:class,:impl_options=>Dict{Symbol,Real}(:purity_threshold=>1.0,:min_samples_split=>2,:min_samples_leaf=>1,:min_purity_increase=>0.0,:max_depth=>-1)))],:transformer_options=>nothing))

In [20]:
# Train
fit!(pipeline1, X[train_ind, :], y[train_ind]);

In [21]:
# Predict
predictions = transform!(pipeline1, X[test_ind, :]);

In [22]:
sum(predictions .== y[test_ind])/length(predictions)*100

91.11111111111111

In [23]:
result = score(:accuracy, y[test_ind], predictions)
println(result)

91.11111111111111


In [24]:
function processModel(learner)
    iris = RDatasets.dataset("datasets", "iris")
    X = convert(Array, iris[[:SepalLength, :SepalWidth, :PetalLength, :PetalWidth]])
    y = convert(Array, iris[:Species]);
    (train_ind, test_ind) = holdout(size(X, 1), 0.3)
    pipeline = Pipeline(Dict(
            :transformers => [
                OneHotEncoder(), # Encodes nominal features into numeric
                Imputer(), # Imputes NA values
                #StandardScaler(), # Standardizes features 
                learner # Predicts labels on instances
            ]
        ))
    # Train
    fit!(pipeline, X[train_ind, :], y[train_ind]);
    # Predict
    predictions = transform!(pipeline, X[test_ind, :]);
    result = score(:accuracy, y[test_ind], predictions)
    return(result)
end

processModel (generic function with 1 method)

In [25]:
adaLearner = DecisionStumpAdaboost(Dict(
  # Output to train against
  # (:class).
  :output => :class,
  # Options specific to this implementation.
  :impl_options => Dict(
    # Number of boosting iterations.
    :num_iterations => 7
  )
))
processModel(adaLearner)

91.11111111111111

In [26]:
rfLearner = RandomForest(Dict(
  :output => :class,
  :impl_options => Dict(
    :num_subfeatures => 0,
    :num_trees => 10,
    :partial_sampling => 0.7
  )
))
processModel(rfLearner)

95.55555555555556

In [27]:
using ScikitLearn
@sk_import neighbors: KNeighborsClassifier
@sk_import svm: SVC

skLearner = SKLLearner(Dict(
  :output => :class,
  #:learner => "KNeighborsClassifier",
  :learner => "SVC",
  :impl_options => Dict()
))
processModel(skLearner)



UndefVarError: UndefVarError: SKLLearner not defined

In [28]:
voteLearner = VoteEnsemble(Dict(
  :output => :class,
  # Learners in voting committee.
  :learners => [RandomForest(),PrunedTree(), DecisionStumpAdaboost()]
))
processModel(voteLearner)

97.77777777777777

In [32]:
using Statistics
bestLearner = BestLearner(Dict(
  :output => :class,
  :partition_generator => (X, y) -> kfold(size(X, 1), 5),
  :selection_function => (learner_partition_scores) -> findmax(mean(learner_partition_scores, dims=2))[2],      
  :score_type => Real,
  :learners => [PrunedTree(), DecisionStumpAdaboost(), RandomForest()],
  :learner_options_grid => nothing
))
processModel(bestLearner)

91.11111111111111

In [33]:
stackLearner = StackEnsemble(Dict(
  :output => :class,
  :learners => [PrunedTree(), DecisionStumpAdaboost(), RandomForest(),voteLearner,bestLearner],
  :stacker => RandomForest(),
  # Proportion of training set left to train stacker itself.
  :stacker_training_proportion => 0.3,
  :keep_original_features => false
))
processModel(stackLearner)

86.66666666666667

In [37]:
using Distributed

results = @parallel (vcat) for i=1:30
   processModel(stackLearner)
end
println("acc = ",round(mean(results))," +/- ",round(std(results)))

LoadError: UndefVarError: @parallel not defined

In [22]:
results

30-element Array{Float64,1}:
 100.0   
  93.3333
  95.5556
  88.8889
  91.1111
  93.3333
  93.3333
  95.5556
  97.7778
  95.5556
  88.8889
  93.3333
  91.1111
   ⋮     
  97.7778
  93.3333
  95.5556
  84.4444
  95.5556
  93.3333
  93.3333
  95.5556
  97.7778
  93.3333
  95.5556
  91.1111

In [17]:
#svmcrt = CRTLearner(Dict(
  # Output to train against
  # (:class).
  #:output => :class,
  #:learner => "rf",
  #:learner => "svmLinear2",
  #:learner => "rpart",
  #:learner => "lda",
  #:impl_options => Dict()
#))