added getEps (#39)

* added getEps * added getEps to clusterTask * integrated getEps fun
tuanle618 · Mar 26, 2018 · 263da25 · 263da25
1 parent 49da9b4
commit 263da25
Show file tree

Hide file tree

Showing 5 changed files with 49 additions and 12 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -24,7 +24,8 @@ Imports: checkmate,
     vcd,
     plyr,
     RColorBrewer,
-    ggpubr
+    ggpubr,
+    pracma
 Suggests: testthat,
   lintr (>= 1.0.0.9001),
   MASS,

diff --git a/NAMESPACE b/NAMESPACE
@@ -63,6 +63,7 @@ importFrom(cluster,agnes)
 importFrom(cluster,diana)
 importFrom(cluster,pam)
 importFrom(dbscan,dbscan)
+importFrom(dbscan,kNNdist)
 importFrom(factoextra,eclust)
 importFrom(factoextra,fviz_cluster)
 importFrom(factoextra,fviz_dend)
@@ -82,6 +83,7 @@ importFrom(kernlab,kkmeans)
 importFrom(mclust,Mclust)
 importFrom(mclust,mclustBIC)
 importFrom(plyr,alply)
+importFrom(pracma,gradient)
 importFrom(stats,cor)
 importFrom(stats,hclust)
 importFrom(stats,kmeans)

diff --git a/R/getClusterAnalysis.R b/R/getClusterAnalysis.R
@@ -194,7 +194,7 @@ getClusterAnalysis = function(data, num.features, method, par.vals, random.seed,
     db.cluster = do.call(dbscan, args = append(list(x = num.data), par.vals))
     #plot results
     db.plot = fviz_cluster(db.cluster, data = num.data, stand = FALSE,
-      ellipse = TRUE, show.clust.cent = TRUE, ellipse.type = "norm",
+      ellipse = FALSE, show.clust.cent = FALSE,
       geom = "point", ggtheme = theme_classic(), main = "DBScan Cluster Plot")
     #mostly no db-cluster because if dim(X) > 2, apply PCA.. No Structure
     #save results
@@ -205,11 +205,13 @@ getClusterAnalysis = function(data, num.features, method, par.vals, random.seed,
     comb.cluster.list = apply(combinations, 2, function(x) {
       #print(x)
       cols = colnames(num.data)[x]
+      # set suitable eps value
+      par.vals$eps = getEps(num.data[, x])
       #apply db scan algorithm
       db.cluster = do.call(dbscan, args = append(list(x = num.data[, x]), par.vals))
       #plot results
       db.plot = fviz_cluster(db.cluster, data = num.data[, x], stand = FALSE,
-        ellipse = TRUE, show.clust.cent = TRUE, ellipse.type = "norm",
+        ellipse = FALSE, show.clust.cent = FALSE,
         geom = "point", ggtheme = theme_classic(), main = "DBScan Cluster Plot")
       #save results
       list(cluster.cols = cols,

diff --git a/R/getDbscanEps.R b/R/getDbscanEps.R
@@ -0,0 +1,30 @@
+#' @title Computes a suitable eps value for DBScan
+#'
+#' @description
+#'  An elbow criterion on the sorted kNN distances (k = 5) is used.
+#'  Since this is a subjective criterion, the calculation is only a heuristic.
+#'
+#' @param data [\code{data.frame}]\cr
+#'   A data frame of numeric variables; it is passed as-is to \code{kNNdist}.
+#' @return [\code{numeric(1)}]\cr
+#'   An eps value for dbscan: the sorted kNN distance at the detected elbow.
+#' @import checkmate
+#' @importFrom pracma gradient
+#' @importFrom dbscan kNNdist
+getEps = function(data) {
+  dists = kNNdist(data, k = 5)
+  y = sort(dists)
+  x = seq(1, length(y))
+  f = gradient(y, x)  # numerical gradient of the sorted distances over index
+  # make "later" values bigger by weighting with the decreasing kNN distances;
+  # the mean is added to the divisor to guard against dividing by 0
+  wf = f / (sort(dists, decreasing = TRUE) + mean(dists))
+  # keep above-mean weighted gradients, then drop those above the 85% quantile
+  big.wf = wf[wf > mean(wf)]
+  big.x3 = x[wf > mean(wf)]
+  trim.wf = big.wf[big.wf <= quantile(big.wf, 0.85)]
+  trim.x = big.x3[big.wf <= quantile(big.wf, 0.85)]
+  eps.ind = trim.x[trim.wf == max(trim.wf)]  # index/indices of the elbow
+  # average the distances in case the maximum is attained at several indices
+  mean(y[eps.ind])
+}
diff --git a/R/makeClusterTask.R b/R/makeClusterTask.R
@@ -122,28 +122,30 @@ makeClusterTask = function(id, data, target, cluster.cols = NULL, method = "clus
     }
   }
 
+
+  ####################
+  # Encapsulate Data and Data Types into new env
+  env = new.env(parent = emptyenv())
+  env$data = data
+  env$datatypes = getDataType(data, target)
+
   ##add option for Eps in dbscan and args in kkmeans
   if (length(par.vals) == 0) {
     if (method == "cluster.dbscan") {
-      par.vals = list(eps = 0.15)
+      num.features = c(env$datatypes$num, env$datatypes$int)
+      par.vals = list(eps = getEps(data[, num.features]))
     } else if (method == "cluster.kkmeans") {
       par.vals = list(centers = 2L)
     }
   } else if (length(par.vals) >= 1) {
     if (method == "cluster.dbscan" & !is.element(names(par.vals), "eps")) {
-      par.vals = append(par.vals, list(eps = 0.15))
+      num.features = c(env$datatypes$num, env$datatypes$int)
+      par.vals = append(par.vals, list(eps = getEps(data[, num.features])))
     } else if (method == "cluster.kkmeans" & !is.element(names(par.vals), "centers")) {
       par.vals = append(par.vals, list(centers = 2L))
     }
   }
 
-
-  ####################
-  # Encapsulate Data and Data Types into new env
-  env = new.env(parent = emptyenv())
-  env$data = data
-  env$datatypes = getDataType(data, target)
-
   makeS3Obj("ClusterTask",
     id = id,
     type = "ClusterSummary",