### Imbalanced Classification: Improving on Undersampling

In [4]:
library(caret)
library(ranger)

Loading required package: lattice
Loading required package: ggplot2


Reading in the data

In [9]:
# Load the HMEQ data and treat the BAD flag as a categorical outcome.
hmeq <- read.csv("hmeq.csv")
hmeq$BAD <- as.factor(hmeq$BAD)

# Recode empty strings as NA so that complete.cases() drops those rows too.
hmeq[hmeq == ""] <- NA
hmeq_complete <- hmeq[complete.cases(hmeq), ]

# Rename the outcome by name rather than by position, so the right column is
# renamed even if BAD is not the first column of the CSV.
names(hmeq_complete)[names(hmeq_complete) == "BAD"] <- "target"

# 70/30 train/test split on the outcome; the seed makes the partition (and the
# random draws that follow it) reproducible.
set.seed(2018)
splitIndex <- createDataPartition(
  hmeq_complete$target,
  p = 0.70,
  list = FALSE,
  times = 1
)
train <- hmeq_complete[splitIndex, ]
test <- hmeq_complete[-splitIndex, ]

Training a random forest using undersampling

In [10]:
# Training rows of class "1", their count, and a printed class table as a
# sanity check.
train1 <- train[train$target == "1", ]
n1 <- nrow(train1)
table(train1$target)

# Same for the training rows of class "0".
train0 <- train[train$target == "0", ]
n0 <- nrow(train0)
table(train0$target)

# Undersample: keep n1 randomly chosen class-"0" rows (without replacement)
# and stack them on top of all class-"1" rows, giving equal class counts.
train00 <- train0[sample(1:n0, n1), ]
train_under <- rbind(train00, train1)

# Fit a random forest on the balanced sample and score it on the untouched
# test set, reporting balanced accuracy with "1" as the positive class.
rf1 <- ranger(target ~ ., data = train_under)
pred_under <- predict(rf1, data = test)$predictions
cm_under <- confusionMatrix(pred_under, test$target, positive = "1")
print(cm_under$byClass["Balanced Accuracy"])


  0   1 
  0 210 


   0    1 
2145    0 

Balanced Accuracy 
         0.826466 


Training multiple forests on different undersamples of the data and combining their predictions by majority vote to get an improved balanced accuracy

In [13]:
# Ensemble of random forests: each forest is trained on a fresh undersample of
# the class-"0" training rows combined with all class-"1" training rows, and
# the test-set predictions are combined by majority vote.
n_forests <- 100

# Preallocate one column of 0/1 predictions per forest instead of growing the
# matrix with cbind() on every iteration.
mc <- matrix(
  NA_integer_,
  nrow = nrow(test),
  ncol = n_forests,
  dimnames = list(NULL, rep("pred", n_forests))
)
for (i in seq_len(n_forests)) {
  train00 <- train0[sample(seq_len(n0), n1), ]
  train_d <- rbind(train00, train1)

  model <- ranger(target ~ ., data = train_d)
  pred <- predict(model, data = test)$predictions
  # Encode the predicted class explicitly rather than relying on the integer
  # codes a factor is coerced to.
  mc[, i] <- as.integer(pred == "1")
}

# Majority vote: predict "1" only when strictly more than half of the forests
# do, so an exact tie goes to "0".
vote <- as.integer(rowMeans(mc) > 0.5)
mc <- cbind(mc, vote = vote)

# Fix the factor levels so both classes are present even if every vote lands
# on the same class; confusionMatrix() needs matching levels.
cm <- confusionMatrix(
  factor(vote, levels = c(0, 1)),
  test$target,
  positive = "1"
)
print(cm$byClass["Balanced Accuracy"])


Balanced Accuracy 
        0.8371176 
