Merge pull request #72 from quanteda/fix-threads
Fix parallel code for quanteda 4.0
kbenoit committed Apr 7, 2024
2 parents 7df74c3 + d00b299 commit 69e3d4e
Showing 14 changed files with 106 additions and 99 deletions.
18 changes: 18 additions & 0 deletions .github/workflows/R-CMD-check.yaml
@@ -29,6 +29,16 @@ jobs:
R_KEEP_PKG_SOURCE: yes

steps:

- if: matrix.config.os == 'ubuntu-latest'
run: |
sudo apt update
sudo apt install libtbb-dev
- if: matrix.config.os == 'macos-latest'
run: |
brew update
brew install tbb
- uses: actions/checkout@v3

- uses: r-lib/actions/setup-pandoc@v2
@@ -43,6 +53,14 @@ jobs:
with:
extra-packages: any::rcmdcheck
needs: check

- name: Install remotes
run: |
Rscript -e "install.packages('remotes', repos='https://ftp.belnet.be/mirror/CRAN')"
- name: Install quanteda from Github
run: |
Rscript -e "remotes::install_github('quanteda/quanteda')"
- uses: r-lib/actions/check-r-package@v2
with:
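The workflow now installs the system TBB library on Linux and macOS and pulls the development quanteda from GitHub before running R CMD check. A quick way to confirm on a runner that the resulting quanteda build actually picked up TBB is the sketch below; quanteda:::cpp_tbb_enabled() is the same unexported helper this commit uses in inst/deftbb.R, so treat it as a debugging aid rather than a public API.

    # sketch: confirm the CI-installed quanteda was compiled with TBB support
    # (unexported helper, shown here only as a debugging aid)
    quanteda:::cpp_tbb_enabled()   # expect TRUE when libtbb-dev / brew tbb was present at build time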
5 changes: 2 additions & 3 deletions DESCRIPTION
@@ -19,15 +19,14 @@ License: GPL-3
Depends:
R (>= 3.5.0)
Imports:
quanteda,
quanteda (>= 4.0.0),
Matrix (>= 1.5-0),
methods,
nsyllable,
proxyC (>= 0.1.4),
Rcpp (>= 0.12.12),
RcppParallel,
stringi
LinkingTo: Rcpp, RcppParallel, RcppArmadillo (>= 0.7.600.1.0), quanteda
LinkingTo: Rcpp, RcppArmadillo (>= 0.7.600.1.0), quanteda
Suggests:
entropy,
ExPosition,
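With threading delegated to quanteda, the package no longer imports or links to RcppParallel and instead requires quanteda >= 4.0.0. A minimal runtime guard along these lines (a sketch, not part of the commit) makes the failure mode explicit when an older quanteda is installed:

    # sketch: fail early if the installed quanteda predates the TBB-based parallel code
    if (packageVersion("quanteda") < "4.0.0")
        stop("quanteda.textstats now requires quanteda >= 4.0.0")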
1 change: 0 additions & 1 deletion NAMESPACE
@@ -53,7 +53,6 @@ exportMethods(show)
import(Matrix)
import(methods)
importFrom(Rcpp,evalCpp)
importFrom(RcppParallel,RcppParallelLibs)
importFrom(nsyllable,nsyllable)
importFrom(quanteda,as.corpus)
importFrom(quanteda,as.dfm)
4 changes: 2 additions & 2 deletions R/RcppExports.R
@@ -1,11 +1,11 @@
# Generated by using Rcpp::compileAttributes() -> do not edit by hand
# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393

cpp_collocations <- function(texts_, types_, words_ignore_, count_min, sizes_, method, smoothing, thread = 1L) {
cpp_collocations <- function(texts_, types_, words_ignore_, count_min, sizes_, method, smoothing, thread = -1L) {
.Call(`_quanteda_textstats_cpp_collocations`, texts_, types_, words_ignore_, count_min, sizes_, method, smoothing, thread)
}

cpp_keyness <- function(mt, measure, correct, thread = 1L) {
cpp_keyness <- function(mt, measure, correct, thread = -1L) {
.Call(`_quanteda_textstats_cpp_keyness`, mt, measure, correct, thread)
}
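The generated wrappers change the default thread argument from 1L to -1L, the sentinel used for "no explicit cap, let the threading layer decide". In practice the exported functions pass quanteda:::get_threads(), so the effective value still comes from the quanteda_threads option; a sketch of that mechanism, assuming quanteda 4.0 reads the same option as the helper removed from R/utils.R below:

    # sketch: how the thread count reaching the C++ code is controlled
    getOption("quanteda_threads", -1L)   # -1L means no user-imposed limit
    options(quanteda_threads = 2L)       # cap parallel sections at two threads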

2 changes: 1 addition & 1 deletion R/textstat_collocations.R
@@ -160,7 +160,7 @@ textstat_collocations.tokens <- function(x, method = "lambda",
if (is.null(id_ignore)) id_ignore <- integer()
result <- cpp_collocations(x, types, id_ignore, min_count, size,
if (method == "lambda1") "lambda1" else "lambda",
smoothing, get_threads())
smoothing, quanteda:::get_threads())

# compute z for lambda methods
result$z <- result$lambda / result$sigma
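Since the thread count is now resolved inside quanteda, setting it through quanteda_options() flows straight through to cpp_collocations(). A short usage sketch (assuming quanteda_options(threads = ) remains the user-facing control in quanteda 4.0):

    # sketch: threads set via quanteda propagate to the collocation C++ routine
    library(quanteda)
    library(quanteda.textstats)
    quanteda_options(threads = 2)
    toks <- tokens(c("capital gains tax", "capital gains tax rate"))
    textstat_collocations(toks, size = 2, min_count = 1)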
2 changes: 1 addition & 1 deletion R/textstat_keyness.R
@@ -148,7 +148,7 @@ textstat_keyness.dfm <- function(x, target = 1L, measure = c("chi2", "exact", "l
warning("correction is always none for pmi")
result <- data.frame(
feature = featnames(temp),
stat = cpp_keyness(temp, measure, correction, get_threads()),
stat = cpp_keyness(temp, measure, correction, quanteda:::get_threads()),
p = NA,
n_target = as.vector(temp[1, ]),
n_reference = as.vector(temp[2, ]),
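The same substitution applies to keyness: the C++ call now takes its thread count from quanteda rather than from the package-local helper. A minimal sketch of the unchanged user-facing behaviour:

    # sketch: keyness usage is unaffected; only the thread plumbing changed
    library(quanteda)
    library(quanteda.textstats)
    dfmat <- dfm(tokens(c(d1 = "a a b c", d2 = "a b b b d")))
    textstat_keyness(dfmat, target = 1L, measure = "chi2")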
1 change: 0 additions & 1 deletion R/textstat_simil.R
@@ -355,7 +355,6 @@ textstat_simil.dfm <- function(x, y = NULL, selection = NULL,
#' @details `textstat_dist` options are: `"euclidean"` (default),
#' `"manhattan"`, `"maximum"`, `"canberra"`,
#' and `"minkowski"`.
#' @importFrom RcppParallel RcppParallelLibs
#' @examples
#'
#' # distances for documents
8 changes: 0 additions & 8 deletions R/utils.R
@@ -21,11 +21,3 @@ check_dots <- function(..., method = NULL) {
warning(arg, " argument is not used.", call. = FALSE)
}
}

get_threads <- function() {
value <- getOption("quanteda_threads", -1L)
if (!is.integer(value) || length(value) != 1L)
stop("Invalid value of threads in quanteda options")
return(value)
}

4 changes: 4 additions & 0 deletions inst/deftbb.R
@@ -0,0 +1,4 @@
#' Print TBB flag in makevars
if (quanteda:::cpp_tbb_enabled()) {
cat("-DTBB")
}
12 changes: 12 additions & 0 deletions inst/libtbb.R
@@ -0,0 +1,12 @@
#' Print TBB flag in makevars
if (Sys.info()[["sysname"]] == "Windows") {
if (getRversion() >= "4.3.0") {
cat("-ltbb12")
} else {
cat("-ltbb_static")
}
} else {
if (quanteda:::cpp_tbb_enabled()) {
cat("-ltbb")
}
}
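These two helper scripts are run by src/Makevars and src/Makevars.win at compile time: deftbb.R emits the -DTBB preprocessor define and libtbb.R the matching linker flag, with the Windows library name depending on the Rtools toolchain (tbb12 from R 4.3.0, the static tbb before that). To preview what they would print on a given machine (a sketch, assuming the working directory is the package source root and quanteda is installed):

    # sketch: preview the flags the build helpers emit on this machine
    system2(file.path(R.home("bin"), "Rscript"), "inst/deftbb.R", stdout = TRUE)
    system2(file.path(R.home("bin"), "Rscript"), "inst/libtbb.R", stdout = TRUE)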
6 changes: 2 additions & 4 deletions src/Makevars
@@ -1,4 +1,2 @@
PKG_LIBS = $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS) `$(R_HOME)/bin/Rscript -e "RcppParallel::RcppParallelLibs()"`
#PKG_CXXFLAGS = -DARMA_64BIT_WORD=1
PKG_CPPFLAGS = -DARMA_DONT_PRINT_OPENMP_WARNING -I../inst/include
#CXX_STD = CXX11
PKG_LIBS = $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS) `$(R_HOME)/bin${R_ARCH_BIN}/Rscript ../inst/libtbb.R`
PKG_CXXFLAGS = -DARMA_64BIT_WORD=1 `$(R_HOME)/bin${R_ARCH_BIN}/Rscript ../inst/deftbb.R`
7 changes: 2 additions & 5 deletions src/Makevars.win
@@ -1,5 +1,2 @@
PKG_LIBS = $(LAPACK_LIBS) $(BLAS_LIBS)
PKG_LIBS += `$(R_HOME)/bin${R_ARCH_BIN}/Rscript.exe -e "RcppParallel::RcppParallelLibs()"`
PKG_CXXFLAGS = -DRCPP_PARALLEL_USE_TBB=1 -DARMA_64BIT_WORD=1
PKG_CPPFLAGS = -I../inst/include -DARMA_DONT_PRINT_OPENMP_WARNING=1
# CXX_STD = CXX11
PKG_LIBS = $(LAPACK_LIBS) $(BLAS_LIBS) `$(R_HOME)/bin${R_ARCH_BIN}/Rscript.exe ../inst/libtbb.R` -fstack-protector
PKG_CXXFLAGS = -DARMA_64BIT_WORD=1 -DTBB
87 changes: 33 additions & 54 deletions src/collocations.cpp
@@ -8,14 +8,13 @@ float GLOBAL_PATTERN_MAX_LOAD_FACTOR = 0.05;
float GLOBAL_NGRAMS_MAX_LOAD_FACTOR = 0.25;
#endif

typedef std::atomic<unsigned int> UintAtomic;
#if QUANTEDA_USE_TBB
typedef tbb::atomic<unsigned int> UintAtomic; // NOTE: changed to std::atomic<unsigned int> for TBB 2021
typedef tbb::concurrent_vector<std::pair<Ngram, UintAtomic>> VecPair;
typedef tbb::concurrent_unordered_map<Ngram, std::pair<UintAtomic, UintAtomic>, hash_ngram, equal_ngram> MapNgramsPair;
#else
typedef std::vector<std::pair<Ngram, unsigned int>> VecPair;
typedef std::unordered_map<Ngram, std::pair<unsigned int, unsigned int>, hash_ngram, equal_ngram> MapNgramsPair;
typedef std::unordered_map<Ngram, std::pair<UintAtomic, UintAtomic>, hash_ngram, equal_ngram> MapNgramsPair;
#endif
typedef std::vector<std::pair<Ngram, unsigned int>> VecNgramsPair;

// return the matching pattern between two words at each position, 0 for matching, 1 for not matching.
// for example, for 3-gram, bit = 000, 001, 010 ... 111 eg. 0-7
@@ -107,7 +106,7 @@ Text mark(Text tokens,
}

void counts2(Text text,
MapNgramsPair &counts_seq,
MapNgramsPair &map_seqs,
const std::vector<unsigned int> &sizes,
const unsigned int &id_ignore){

@@ -140,7 +139,7 @@ void counts2(Text text,
Text text_sub(text.begin() + i, text.begin() + i + size);
//Rcout << "@" << i << " " << nested << ": ";
//dev::print_ngram(text_sub);
auto &count = counts_seq[text_sub];
auto &count = map_seqs[text_sub];
count.first++;
if (!padded) {
if (nested) {
@@ -154,33 +153,9 @@
}
}

void estimates2(std::size_t i,
VecNgrams &seqs, // seqs without padding
MapNgramsPair counts_seq,
DoubleParams &dice,
DoubleParams &pmi,
DoubleParams &logratio,
DoubleParams &chi2,
DoubleParams &gensim,
DoubleParams &lfmd,
const String &method,
const double smoothing) {

std::size_t n = seqs[i].size(); //n=2:5, seqs
if (n == 1) return; // ignore single words
// output counts
std::vector<double> counts_bit(std::pow(2, n), smoothing);
for (auto it = counts_seq.begin(); it != counts_seq.end(); ++it) {
if (it->first.size() != n) continue; // skip different lengths
int bit;
bit = match_bit2(seqs[i], it->first);
counts_bit[bit] += it->second.first;
}
}

void estimates_lambda2(std::size_t i,
const VecNgrams &seqs,
const VecPair &seqs_all,
const VecNgramsPair &seqs_counts,
DoubleParams &sgma,
DoubleParams &lmda,
const String &method,
@@ -190,10 +165,10 @@ void estimates_lambda2(std::size_t i,
if (n == 1) return; // ignore single words

std::vector<double> counts_bit(std::pow(2, n), smoothing);
for (std::size_t j = 0; j < seqs_all.size(); j++) {
if (seqs_all[j].first.size() != n) continue; // skip different lengths
int bit = match_bit2(seqs[i], seqs_all[j].first);
counts_bit[bit] += seqs_all[j].second;
for (std::size_t j = 0; j < seqs_counts.size(); j++) {
if (seqs_counts[j].first.size() != n) continue; // skip different lengths
int bit = match_bit2(seqs[i], seqs_counts[j].first);
counts_bit[bit] += seqs_counts[j].second;
}

//B-J algorithm
@@ -224,7 +199,7 @@ DataFrame cpp_collocations(const List &texts_,
const IntegerVector sizes_,
const String &method,
const double smoothing,
const int thread = 1){
const int thread = -1){

Texts texts = as<Texts>(texts_);
std::vector<unsigned int> sizes = as< std::vector<unsigned int> >(sizes_);
@@ -255,38 +230,42 @@
}
#endif

MapNgramsPair counts_seq;
counts_seq.max_load_factor(GLOBAL_PATTERN_MAX_LOAD_FACTOR);
MapNgramsPair map_seqs;
map_seqs.max_load_factor(GLOBAL_PATTERN_MAX_LOAD_FACTOR);

//dev::Timer timer;
//dev::start_timer("Count", timer);
#if QUANTEDA_USE_TBB
arena.execute([&]{
tbb::parallel_for(tbb::blocked_range<int>(0, H), [&](tbb::blocked_range<int> r) {
for (int h = r.begin(); h < r.end(); ++h) {
counts2(texts[h], counts_seq, sizes, id_ignore);
counts2(texts[h], map_seqs, sizes, id_ignore);
}
});
});
#else
for (std::size_t h = 0; h < H; h++) {
counts2(texts[h], counts_seq, sizes, id_ignore);
counts2(texts[h], map_seqs, sizes, id_ignore);
}
#endif
//dev::stop_timer("Count", timer);

VecNgrams seqs;
VecPair seqs_all;
std::size_t N = map_seqs.size();

// for estimation
VecNgramsPair seqs_count; // all the collocation
seqs_count.reserve(N);
VecNgrams seqs; // only eligible collocation
seqs.reserve(N);

// for output
IntParams counts, counts_nested, lengths;
std::size_t len = counts_seq.size();
seqs.reserve(len);
seqs_all.reserve(len);
counts.reserve(len);
counts_nested.reserve(len);
lengths.reserve(len);
for (auto it = counts_seq.begin(); it != counts_seq.end(); ++it) {
// conver to a vector for faster itteration
seqs_all.push_back(std::make_pair(it->first, it->second.first));
counts.reserve(N);
counts_nested.reserve(N);
lengths.reserve(N);

for (auto it = map_seqs.begin(); it != map_seqs.end(); ++it) {
// convert to a vector for faster iteration
seqs_count.push_back(std::make_pair(it->first, (unsigned int)it->second.first));
if (it->second.first < count_min) continue;
// estimate only sequences without padding
if (std::none_of(it->first.begin(), it->first.end(), [](unsigned int v){ return v == 0; })) {
@@ -307,13 +286,13 @@ DataFrame cpp_collocations(const List &texts_,
arena.execute([&]{
tbb::parallel_for(tbb::blocked_range<int>(0, I), [&](tbb::blocked_range<int> r) {
for (int i = r.begin(); i < r.end(); ++i) {
estimates_lambda2(i, seqs, seqs_all, sgma, lmda, method, smoothing);
estimates_lambda2(i, seqs, seqs_count, sgma, lmda, method, smoothing);
}
});
});
#else
for (std::size_t i = 0; i < I; i++) {
estimates_lambda2(i, seqs, seqs_all, sgma, lmda, method, smoothing);
estimates_lambda2(i, seqs, seqs_count, sgma, lmda, method, smoothing);
}
#endif
//dev::stop_timer("Estimate", timer);
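The refactor uses a single std::atomic-based counter type for both the TBB and serial builds (tbb::atomic was removed in oneTBB 2021, per the note above), renames counts_seq to map_seqs, and copies all counted sequences into the plain vector seqs_count for faster iteration during estimation. A quick consistency check across thread settings (a sketch; assumes quanteda_options(threads = ) is honoured and at least two cores are available):

    # sketch: serial and parallel paths should produce identical statistics
    library(quanteda)
    library(quanteda.textstats)
    toks <- tokens(c("a b c a b d", "a b e a b f"))
    quanteda_options(threads = 1)
    res1 <- textstat_collocations(toks, size = 2, min_count = 1)
    quanteda_options(threads = 2)
    res2 <- textstat_collocations(toks, size = 2, min_count = 1)
    all.equal(res1$lambda, res2$lambda)   # expect TRUE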