Skip to content

Commit

Permalink
added getEps (#39)
Browse files Browse the repository at this point in the history
* added getEps

* added getEps to clusterTask

* integrated getEps fun
  • Loading branch information
MiGraber committed Mar 26, 2018
1 parent 49da9b4 commit 263da25
Show file tree
Hide file tree
Showing 5 changed files with 49 additions and 12 deletions.
3 changes: 2 additions & 1 deletion DESCRIPTION
Expand Up @@ -24,7 +24,8 @@ Imports: checkmate,
vcd,
plyr,
RColorBrewer,
ggpubr
ggpubr,
pracma
Suggests: testthat,
lintr (>= 1.0.0.9001),
MASS,
Expand Down
2 changes: 2 additions & 0 deletions NAMESPACE
Expand Up @@ -63,6 +63,7 @@ importFrom(cluster,agnes)
importFrom(cluster,diana)
importFrom(cluster,pam)
importFrom(dbscan,dbscan)
importFrom(dbscan,kNNdist)
importFrom(factoextra,eclust)
importFrom(factoextra,fviz_cluster)
importFrom(factoextra,fviz_dend)
Expand All @@ -82,6 +83,7 @@ importFrom(kernlab,kkmeans)
importFrom(mclust,Mclust)
importFrom(mclust,mclustBIC)
importFrom(plyr,alply)
importFrom(pracma,gradient)
importFrom(stats,cor)
importFrom(stats,hclust)
importFrom(stats,kmeans)
Expand Down
6 changes: 4 additions & 2 deletions R/getClusterAnalysis.R
Expand Up @@ -194,7 +194,7 @@ getClusterAnalysis = function(data, num.features, method, par.vals, random.seed,
db.cluster = do.call(dbscan, args = append(list(x = num.data), par.vals))
#plot results
db.plot = fviz_cluster(db.cluster, data = num.data, stand = FALSE,
ellipse = TRUE, show.clust.cent = TRUE, ellipse.type = "norm",
ellipse = FALSE, show.clust.cent = FALSE,
geom = "point", ggtheme = theme_classic(), main = "DBScan Cluster Plot")
#mostly no db-cluster because if dim(X) > 2, apply PCA.. No Structure
#save results
Expand All @@ -205,11 +205,13 @@ getClusterAnalysis = function(data, num.features, method, par.vals, random.seed,
comb.cluster.list = apply(combinations, 2, function(x) {
#print(x)
cols = colnames(num.data)[x]
# set suitable eps value
par.vals$eps = getEps(num.data[, x])
#apply db scan algorithm
db.cluster = do.call(dbscan, args = append(list(x = num.data[, x]), par.vals))
#plot results
db.plot = fviz_cluster(db.cluster, data = num.data[, x], stand = FALSE,
ellipse = TRUE, show.clust.cent = TRUE, ellipse.type = "norm",
ellipse = FALSE, show.clust.cent = FALSE,
geom = "point", ggtheme = theme_classic(), main = "DBScan Cluster Plot")
#save results
list(cluster.cols = cols,
Expand Down
30 changes: 30 additions & 0 deletions R/getDbscanEps.R
@@ -0,0 +1,30 @@
#' @title Computes a suitable eps value for DBScan
#'
#' @description
#' The criterion used is an elbow criterion for kNN distances:
#' the sorted 5-nearest-neighbour distances are differentiated numerically,
#' the gradient is weighted so that later (larger) distances count more, and
#' the distance at the largest weighted gradient (after trimming the top 15\%
#' of candidates) is returned.
#' Since this is a subjective criterion the calculation is just heuristic.
#'
#' @param data [\code{data.frame}]\cr
#'   A Dataframe with different numeric variables.
#' @return [\code{numeric(1)}]
#'   An eps value for dbscan. If no elbow candidate can be identified
#'   (e.g. all kNN distances are identical), the mean kNN distance is returned.
#' @import checkmate
#' @importFrom pracma gradient
#' @importFrom dbscan kNNdist
getEps = function(data) {
  dists = kNNdist(data, k = 5)
  y = sort(dists)
  x = seq_along(y)
  f = gradient(y, x)
  # make "later" values bigger by weighting with the decreasing kNN distances
  # (rev(y) is the distances sorted in decreasing order);
  # add the mean to guard against dividing by 0
  wf = f / (rev(y) + mean(dists))
  # remove small values; which() also drops NA/NaN weights, which occur when
  # every distance is 0
  big.ind = which(wf > mean(wf))
  if (length(big.ind) == 0L) {
    # degenerate distance curve (no gradient above average): there is no
    # elbow, so fall back to the mean distance instead of returning NaN
    return(mean(y))
  }
  big.wf = wf[big.ind]
  big.x = x[big.ind]
  # trim the largest 15% of the candidates, which are treated as outliers
  keep = big.wf <= stats::quantile(big.wf, 0.85)
  trim.wf = big.wf[keep]
  trim.x = big.x[keep]
  eps.ind = trim.x[trim.wf == max(trim.wf)]
  # in case multiple max are found
  mean(y[eps.ind])
}
20 changes: 11 additions & 9 deletions R/makeClusterTask.R
Expand Up @@ -122,28 +122,30 @@ makeClusterTask = function(id, data, target, cluster.cols = NULL, method = "clus
}
}


####################
# Encapsulate Data and Data Types into new env
env = new.env(parent = emptyenv())
env$data = data
env$datatypes = getDataType(data, target)

##add option for Eps in dbscan and args in kkmeans
if (length(par.vals) == 0) {
if (method == "cluster.dbscan") {
par.vals = list(eps = 0.15)
num.features = c(env$datatypes$num, env$datatypes$int)
par.vals = list(eps = getEps(data[, num.features]))
} else if (method == "cluster.kkmeans") {
par.vals = list(centers = 2L)
}
} else if (length(par.vals) >= 1) {
if (method == "cluster.dbscan" & !is.element(names(par.vals), "eps")) {
par.vals = append(par.vals, list(eps = 0.15))
num.features = c(env$datatypes$num, env$datatypes$int)
par.vals = append(par.vals, list(eps = getEps(data[, num.features])))
} else if (method == "cluster.kkmeans" & !is.element(names(par.vals), "centers")) {
par.vals = append(par.vals, list(centers = 2L))
}
}


####################
# Encapsulate Data and Data Types into new env
env = new.env(parent = emptyenv())
env$data = data
env$datatypes = getDataType(data, target)

makeS3Obj("ClusterTask",
id = id,
type = "ClusterSummary",
Expand Down

0 comments on commit 263da25

Please sign in to comment.