# Ensembles  
Credits: https://cran.r-project.org/web/packages/caretEnsemble/vignettes/caretEnsemble-intro.html

In [45]:
# Install caretEnsemble only when it is not already available, so that
# re-running the notebook does not download and reinstall it every time.
if (!requireNamespace("caretEnsemble", quietly = TRUE)) {
  install.packages("caretEnsemble")
}

Installing package into '/home/nbcommon/R'
(as 'lib' is unspecified)


In [46]:
# Load the libraries used in this example
library("caret")
library("mlbench")
library("pROC")

In [47]:
# Bring the Sonar data set into the session and divide it into a training
# and a test partition. The seed is fixed so the random partition is repeatable.
data("Sonar")
set.seed(98052)
inTrain <- createDataPartition(y = Sonar[["Class"]], p = 0.75, list = FALSE)
# Rows selected by createDataPartition() are used for training;
# every remaining row is held out for testing.
training <- Sonar[inTrain, ]
testing <- Sonar[-inTrain, ]

In [48]:
# Training parameters: 5-fold cross-validation
caretList_control <- trainControl(
  method = "cv",
  number = 5
)

In [49]:
library("rpart")
library("caretEnsemble")


## Using caretEnsemble  
Greedy Linear Optimization on AUC 

In [50]:
library("caTools")

In [51]:
# Combine the models held in model_list into a single ensemble, using ROC
# as the metric. twoClassSummary is the summary function, with class
# probabilities switched on for it.
greedy_ensemble <- caretEnsemble(
  model_list,
  metric = "ROC",
  trControl = trainControl(
    classProbs = TRUE,
    summaryFunction = twoClassSummary,
    number = 2
  )
)
# Print the ensemble summary
summary(greedy_ensemble)

The following models were ensembled: rpart, glm 
They were weighted: 
1.3745 -1.92 -0.9703
The resulting ROC is: 0.7745
The fit for each individual model on the ROC is: 
 method       ROC      ROCSD
  rpart 0.7156265 0.05991560
    glm 0.6831013 0.07740682


In [52]:
# For every model in model_list, predict class probabilities on the test set
# and keep only the "M" column; the result has one column per model.
model_preds <- data.frame(
  lapply(model_list, function(fit) {
    class_probs <- predict(fit, newdata = testing, type = "prob")
    class_probs[, "M"]
  })
)

# Add the predictions of the greedy ensemble defined above as a further column
ens_preds <- predict(greedy_ensemble, newdata = testing, type = "prob")
model_preds$ensemble <- ens_preds

head(model_preds)
# Area under the ROC curve of each column against the test-set classes
colAUC(model_preds, testing$Class)

rpart,glm,ensemble
0.2089552,1.0,0.4639712
0.2089552,2.220446e-16,0.2469954
0.8873239,1.0,0.7609905
0.8873239,2.220446e-16,0.5468078
0.2089552,2.220446e-16,0.2469954
0.2089552,2.220446e-16,0.2469954


Unnamed: 0,rpart,glm,ensemble
M vs. R,0.7307099,0.75,0.7932099


## Using caretStack  
Combine several predictive models via stacking

In [53]:
# Stack the models in model_list with a "glm" meta-model, using ROC as the
# metric. The meta-model is resampled with 10 bootstrap iterations and its
# final predictions are saved.
glm_ensemble <- caretStack(
  model_list,
  method = "glm",
  metric = "ROC",
  trControl = trainControl(
    method = "boot",
    number = 10,
    summaryFunction = twoClassSummary,
    classProbs = TRUE,
    savePredictions = "final"
  )
)

# Print the summary of the stacked model
summary(glm_ensemble)



Call:
NULL

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.7888  -0.9671  -0.6384   0.8520   1.8517  

Coefficients:
            Estimate Std. Error z value Pr(>|z|)    
(Intercept)   1.3745     0.1110  12.385  < 2e-16 ***
rpart        -1.9200     0.1474 -13.024  < 2e-16 ***
glm          -0.9703     0.1212  -8.006 1.18e-15 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 1975.4  on 1429  degrees of freedom
Residual deviance: 1662.9  on 1427  degrees of freedom
AIC: 1668.9

Number of Fisher Scoring iterations: 4


In [55]:
# Start from a copy of the earlier prediction table and overwrite its
# ensemble column with the predictions of the stacked GLM ensemble.
model_preds2 <- model_preds
model_preds2$ensemble <- predict(
  glm_ensemble,
  newdata = testing,
  type = "prob"
)
# Coefficients of the stack's final model with the first entry dropped
# (the intercept, per the summary printed for glm_ensemble)
CF <- coef(glm_ensemble$ens_model$finalModel)[-1]

head(model_preds2)

# Area under the ROC curve of each column against the test-set classes
colAUC(model_preds2, testing$Class)

rpart,glm,ensemble
0.2089552,1.0,0.4639712
0.2089552,2.220446e-16,0.2469954
0.8873239,1.0,0.7609905
0.8873239,2.220446e-16,0.5468078
0.2089552,2.220446e-16,0.2469954
0.2089552,2.220446e-16,0.2469954


Unnamed: 0,rpart,glm,ensemble
M vs. R,0.7307099,0.75,0.7932099
