Skip to content

Commit

Permalink
Remove hamming to address #1442
Browse files Browse the repository at this point in the history
  • Loading branch information
koheiw committed Oct 4, 2018
1 parent e2d8005 commit 5a1cc5d
Show file tree
Hide file tree
Showing 3 changed files with 68 additions and 71 deletions.
66 changes: 32 additions & 34 deletions R/textstat_dist.R
Expand Up @@ -2,7 +2,7 @@
#' @export
#' @param p The power of the Minkowski distance.
#' @details \code{textstat_dist} options are: \code{"euclidean"} (default),
#' \code{"chisquared"}, \code{"chisquared2"}, \code{"hamming"},
#' \code{"chisquared"}, \code{"chisquared2"},
#' \code{"kullback"}. \code{"manhattan"}, \code{"maximum"}, \code{"canberra"},
#' and \code{"minkowski"}.
#' @references The \code{"chisquared"} metric is from Legendre, P., & Gallagher,
Expand All @@ -17,8 +17,6 @@
#' Quadratic-Chi Histogram Distance Family}". In \emph{Computer Vision – ECCV
#' 2010} (Vol. 6312, pp. 749–762). Berlin, Heidelberg: Springer, Berlin,
#' Heidelberg. doi.org/10.1007/978-3-642-15552-9_54.
#'
#' \code{"hamming"} is \eqn{\sum{x \neq y)}}.
#'
#' \code{"kullback"} is the Kullback-Leibler distance, which assumes that
#' \eqn{P(x_i) = 0} implies \eqn{P(y_i)=0}, and in case both \eqn{P(x_i)} and
Expand Down Expand Up @@ -77,7 +75,7 @@ textstat_dist.dfm <- function(x, selection = NULL,
}

m <- if (margin == "documents") 1 else 2
methods1 <- c("euclidean", "hamming", "chisquared", "chisquared2", "kullback", "manhattan", "maximum", "canberra")
methods1 <- c("euclidean", "chisquared", "chisquared2", "kullback", "manhattan", "maximum", "canberra")
methods2 <- c("jaccard", "binary", "ejaccard", "simple matching")

if (method %in% methods1) {
Expand Down Expand Up @@ -275,36 +273,36 @@ euclidean_dist <- function(x, y = NULL, margin = 1){

# Hamming distance
# formula: hamming = sum(x .!= y)
hamming_dist <- function(x, y = NULL, margin = 1) {

if (!(margin %in% 1:2)) stop("margin can only be 1 (rows) or 2 (columns)")

# convert to binary matrix
x <- dfm_weight(x, "boolean")
x0 <- 1 - x
func_cp <- if (margin == 2) Matrix::crossprod else Matrix::tcrossprod
func_sum <- if (margin == 2) nrow else ncol
func_name <- if (margin == 2) colnames else rownames
# union
an <- func_sum(x)
if (!is.null(y)) {
y <- dfm_weight(y, "boolean")
y0 <- 1 - y
a <- func_cp(x, y)
a0 <- func_cp(x0, y0)
colname <- func_name(y)
} else {
a <- func_cp(x)
a0 <- func_cp(x0)
colname <- func_name(x)
}
rowname <- func_name(x)
# common values
a <- a + a0
hammat <- an - a
dimnames(hammat) <- list(rowname, colname)
hammat
}
# hamming_dist <- function(x, y = NULL, margin = 1) {
#
# if (!(margin %in% 1:2)) stop("margin can only be 1 (rows) or 2 (columns)")
#
# # convert to binary matrix
# x <- dfm_weight(x, "boolean")
# x0 <- 1 - x
# func_cp <- if (margin == 2) Matrix::crossprod else Matrix::tcrossprod
# func_sum <- if (margin == 2) nrow else ncol
# func_name <- if (margin == 2) colnames else rownames
# # union
# an <- func_sum(x)
# if (!is.null(y)) {
# y <- dfm_weight(y, "boolean")
# y0 <- 1 - y
# a <- func_cp(x, y)
# a0 <- func_cp(x0, y0)
# colname <- func_name(y)
# } else {
# a <- func_cp(x)
# a0 <- func_cp(x0)
# colname <- func_name(x)
# }
# rowname <- func_name(x)
# # common values
# a <- a + a0
# hammat <- an - a
# dimnames(hammat) <- list(rowname, colname)
# hammat
# }

# Chi-squared distance:divide by row sums and square root of column sums,
# and adjust for square root of matrix total (Legendre & Gallagher 2001,
Expand Down
4 changes: 1 addition & 3 deletions man/textstat_simil.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

69 changes: 35 additions & 34 deletions tests/testthat/test-textstat_dist.R
Expand Up @@ -182,38 +182,38 @@ test_that("test textstat_dist method = \"Canberra\" against proxy dist() : featu
expect_equal(minkQuanteda, minkProxy)
})

# Hamming distance
test_that("test textstat_dist method = \"hamming\" against e1071::hamming.distance: documents", {
presDfm <- dfm(corpus_subset(data_corpus_inaugural, Year > 1980 & Year < 2018), remove = stopwords("english"),
stem = TRUE, verbose = FALSE)

hammingQuanteda <- sort(as.matrix(textstat_dist(presDfm, "1981-Reagan", method = "hamming", margin = "documents", upper = TRUE))[,"1981-Reagan"], decreasing = FALSE)
hammingQuanteda <- hammingQuanteda[-which(names(hammingQuanteda) == "1981-Reagan")]

if (requireNamespace("e1071", quietly = TRUE)) {
hammingE1071 <- sort(e1071::hamming.distance(as.matrix(dfm_weight(presDfm, "boolean")))[, "1981-Reagan"], decreasing = FALSE)
if("1981-Reagan" %in% names(hammingE1071)) hammingE1071 <- hammingE1071[-which(names(hammingE1071) == "1981-Reagan")]
} else {
hammingE1071 <- c(712, 723, 746, 769, 774, 781, 784, 812, 857)
}
expect_equivalent(hammingQuanteda, hammingE1071)
})

test_that("test textstat_dist method = \"hamming\" against e1071::hamming.distance: features", {
skip_if_not_installed("e1071")
presDfm <- dfm(corpus_subset(data_corpus_inaugural, Year > 1980), remove = stopwords("english"),
stem = TRUE, verbose = FALSE)

hammingQuanteda <- textstat_dist(presDfm, "soviet", method = "hamming", margin = "features")[,"soviet"]
hammingQuanteda <- hammingQuanteda[order(names(hammingQuanteda))]
hammingQuanteda <- hammingQuanteda[-which(names(hammingQuanteda) == "soviet")]

presM <- t(as.matrix(dfm_weight(presDfm, "boolean")))
hammingE1071 <- e1071::hamming.distance(presM)[, "soviet"]
hammingE1071 <- hammingE1071[order(names(hammingE1071))]
if("soviet" %in% names(hammingE1071)) hammingE1071 <- hammingE1071[-which(names(hammingE1071) == "soviet")]
expect_equal(hammingQuanteda, hammingE1071)
})
# # Hamming distance
# test_that("test textstat_dist method = \"hamming\" against e1071::hamming.distance: documents", {
# presDfm <- dfm(corpus_subset(data_corpus_inaugural, Year > 1980 & Year < 2018), remove = stopwords("english"),
# stem = TRUE, verbose = FALSE)
#
# hammingQuanteda <- sort(as.matrix(textstat_dist(presDfm, "1981-Reagan", method = "hamming", margin = "documents", upper = TRUE))[,"1981-Reagan"], decreasing = FALSE)
# hammingQuanteda <- hammingQuanteda[-which(names(hammingQuanteda) == "1981-Reagan")]
#
# if (requireNamespace("e1071", quietly = TRUE)) {
# hammingE1071 <- sort(e1071::hamming.distance(as.matrix(dfm_weight(presDfm, "boolean")))[, "1981-Reagan"], decreasing = FALSE)
# if("1981-Reagan" %in% names(hammingE1071)) hammingE1071 <- hammingE1071[-which(names(hammingE1071) == "1981-Reagan")]
# } else {
# hammingE1071 <- c(712, 723, 746, 769, 774, 781, 784, 812, 857)
# }
# expect_equivalent(hammingQuanteda, hammingE1071)
# })
#
# test_that("test textstat_dist method = \"hamming\" against e1071::hamming.distance: features", {
# skip_if_not_installed("e1071")
# presDfm <- dfm(corpus_subset(data_corpus_inaugural, Year > 1980), remove = stopwords("english"),
# stem = TRUE, verbose = FALSE)
#
# hammingQuanteda <- textstat_dist(presDfm, "soviet", method = "hamming", margin = "features")[,"soviet"]
# hammingQuanteda <- hammingQuanteda[order(names(hammingQuanteda))]
# hammingQuanteda <- hammingQuanteda[-which(names(hammingQuanteda) == "soviet")]
#
# presM <- t(as.matrix(dfm_weight(presDfm, "boolean")))
# hammingE1071 <- e1071::hamming.distance(presM)[, "soviet"]
# hammingE1071 <- hammingE1071[order(names(hammingE1071))]
# if("soviet" %in% names(hammingE1071)) hammingE1071 <- hammingE1071[-which(names(hammingE1071) == "soviet")]
# expect_equal(hammingQuanteda, hammingE1071)
# })

test_that("test textstat_dist method = \"binary\" against proxy::simil(): features", {
skip_if_not_installed("proxy")
Expand All @@ -234,10 +234,11 @@ test_that("test textstat_dist method = \"binary\" against proxy::simil(): featur
test_that("as.list.dist works as expected",{
presDfm <- dfm(corpus_subset(data_corpus_inaugural, Year > 1980), remove = stopwords("english"),
stem = TRUE, verbose = FALSE)
ddist <- textstat_dist(presDfm, method = "hamming")
ddist <- textstat_dist(presDfm, method = "euclidean")
expect_equal(
as.list(ddist)$`1981-Reagan`[1:3],
c("2009-Obama" = 857, "2013-Obama" = 812, "1997-Clinton" = 784)
c("2017-Trump" = 108.74282, "2013-Obama" = 103.31989, "2001-Bush" = 94.67312),
tolerance = 0.001
)
})

Expand Down

0 comments on commit 5a1cc5d

Please sign in to comment.