diff --git a/R/ODDPub_functions.R b/R/ODDPub_functions.R index 7d67f74..9a256f0 100644 --- a/R/ODDPub_functions.R +++ b/R/ODDPub_functions.R @@ -145,7 +145,7 @@ keyword_list[["not_available"]] <- not_available - field_specific_db <- c("GEO", + field_specific_repo <- c("GEO", "Gene Expression Omnibus", "European Nucleotide Archive", "National Center for Biotechnology Information", @@ -189,7 +189,7 @@ "accession numbers", "accession codes") %>% .format_keyword_vector(end_boundary = TRUE) - keyword_list[["field_specific_db"]] <- field_specific_db + keyword_list[["field_specific_repo"]] <- field_specific_repo @@ -464,7 +464,7 @@ #not all keyword categories are used for the sentence search sentence_search_keywords <- c("available", "was_available", "not_available", - "field_specific_db", "accession_nr", "repositories", + "field_specific_repo", "accession_nr", "repositories", "github", "data", "all_data", "not_data", "source_code", "supplement", "file_formats", "upon_request", "dataset") @@ -504,12 +504,13 @@ #combine columns for the different open data keywords keyword_results_combined <- open_data_categories %>% - map(mutate, com_specific_db = field_specific_db & accession_nr & available & !not_available & !was_available) %>% - map(mutate, com_general_db = repositories & available & !not_available & !was_available) %>% + map(mutate, com_specific_repo = field_specific_repo & accession_nr & available & !not_available & !was_available) %>% + map(mutate, com_general_repo = repositories & available & !not_available & !was_available) %>% map(mutate, com_github_data = data & github & available & !not_available & !was_available) %>% map(mutate, com_code = source_code & available & !not_available & !was_available & !upon_request) %>% map(mutate, com_suppl_code = supplement & source_code) %>% - map(select, publ_sentences, com_specific_db, com_general_db, com_github_data, dataset, com_code, com_suppl_code) + map(select, publ_sentences, com_specific_repo, com_general_repo, + com_github_data, dataset, com_code, com_suppl_code) return(keyword_results_combined) } @@ -547,9 +548,12 @@ map_function <- map_lgl } keyword_results_near_wd <- tibble( - com_file_formats = map_function(PDF_text_full, str_function, pattern = keyword_list[["all_data_file_formats"]]), - com_supplemental_data = map_function(PDF_text_full, str_function, pattern = keyword_list[["supp_table_data"]]), - com_data_availibility = map_function(PDF_text_full, str_function, pattern = keyword_list[["data_availibility_statement"]])) + com_file_formats = map_function(PDF_text_full, str_function, + pattern = keyword_list[["all_data_file_formats"]]), + com_supplemental_data = map_function(PDF_text_full, str_function, + pattern = keyword_list[["supp_table_data"]]), + com_data_availibility = map_function(PDF_text_full, str_function, + pattern = keyword_list[["data_availibility_statement"]])) return(keyword_results_near_wd) } @@ -599,12 +603,14 @@ keyword_results_near_wd <- .keyword_search_near_wd(PDF_text_sentences) data_journal_doi <- .check_journal_doi(PDF_text_sentences) - keyword_results_combined <- cbind(keyword_results_tokenized, keyword_results_near_wd, data_journal_doi) %>% + keyword_results_combined <- cbind(keyword_results_tokenized, + keyword_results_near_wd, + data_journal_doi) %>% as_tibble() #check if any of the combined columns was positive to determine if the publication has Open Data or Open Code open_data_publication <- keyword_results_combined %>% - mutate(is_open_data = com_specific_db | com_general_db | com_file_formats | com_github_data | dataset | com_supplemental_data | com_data_availibility | is_data_journal) %>% + mutate(is_open_data = com_specific_repo | com_general_repo | com_file_formats | com_github_data | dataset | com_supplemental_data | com_data_availibility | is_data_journal) %>% mutate(is_open_code = com_code | com_suppl_code) %>% tibble::add_column(article = names(PDF_text_sentences)) %>% select(article, is_open_data, is_open_code) @@ -624,8 +630,19 @@ open_data_sentences <- cbind(names(keyword_results), open_data_sentences, keyword_results_near_wd) %>% as_tibble() %>% mutate_each(funs(as.character)) - colnames(open_data_sentences) <- c("article", "com_specific_db", "com_general_db", "com_github_data", "dataset", "com_code", "com_suppl_code", + colnames(open_data_sentences) <- c("article", "com_specific_repo", "com_general_repo", + "com_github_data", "dataset", "com_code", "com_suppl_code", "com_file_formats", "com_supplemental_data", "com_data_availibility") + open_data_sentences[is.na(open_data_sentences)] = "" #unify empty fields + + #collapse the found statements into one column for Open Data and one for Open Code + open_data_sentences <- open_data_sentences %>% + mutate(open_data_statements = paste(com_specific_repo, com_general_repo, com_github_data, + dataset, com_file_formats, com_supplemental_data, + com_data_availibility, sep = " ") %>% trimws()) %>% + mutate(open_code_statements = paste(com_code, com_suppl_code, sep = " ") %>% trimws()) %>% + select(article, open_data_statements, open_code_statements) + return(open_data_sentences) } diff --git a/README.md b/README.md index a6ebde3..d96d8b4 100644 --- a/README.md +++ b/README.md @@ -28,7 +28,7 @@ devtools::install_github("quest-bih/oddpub") The algorithm searches for several categories of similar keywords in each sentence. Multiple categories have to match for a single sentence to trigger a detection. Among keyword categories are categories for specific biomedical -databases as well as their corresponding accession numbers (as regular +repositories as well as their corresponding accession numbers (as regular expressions), general-purpose repositories or different file formats typically used to distribute raw data in the supplement. @@ -60,7 +60,7 @@ open_data_results <- oddpub::open_data_search_parallel(PDF_text_sentences) ``` Paralellized version of the algorithm that starts several parallel processes using the foreach and doParallel package to speed up the detection. Number of processes can be set with the parameter ```cluster_num``` (default value: 4). -To validate the algorithm, we manually screened a sample of 792 publications that were randomly selected from PubMed. On this validation dataset, our algorithm detects Open Data publications with a sensitivity of 0.74 and specificity of 0.97. +To validate the algorithm, we manually screened a sample of 792 publications that were randomly selected from PubMed. On this validation dataset, our algorithm detects Open Data publications with a sensitivity of 0.73 and specificity of 0.97. ## Detailed description of the keywords @@ -70,8 +70,8 @@ Those are the combined keyword categories that are searched in the full text. If | Combined Keyword Category | Keywords | |---------------------------|----------| -| Field-specific databases | FIELD_SPECIFIC_DB NEAR ACCESSION_NR NEAR (AVAILABLE NOT (NOT_AVAILABLE OR WAS_AVAILABLE)) | -| General-purpose databases | REPOSITORIES NEAR (AVAILABLE NOT (NOT_AVAILABLE OR WAS_AVAILABLE)) | +| Field-specific repositories | FIELD_SPECIFIC_REPO NEAR ACCESSION_NR NEAR (AVAILABLE NOT (NOT_AVAILABLE OR WAS_AVAILABLE)) | +| General-purpose repositories | REPOSITORIES NEAR (AVAILABLE NOT (NOT_AVAILABLE OR WAS_AVAILABLE)) | | Dataset | (DATASET_NAME OUTER_SYM DATASET_NUMBER) OR SUPPLEMENTAL_DATASET | | Supplemental table or data | SUPPLEMENTAL_TABLE NEAR_WD(10) (FILE_FORMATS OR ALL_DATA) | | Supplementary raw/full data with specific file format | (ALL_DATA NOT NOT_DATA) NEAR_WD(10) FILE_FORMATS | @@ -100,8 +100,8 @@ Individual keyword categories: | UPON_REQUEST | Phrase describing that data are only available upon request | ("upon request" OR "on request" OR "upon reasonable request") | | ALL_DATA | Set of words describing all data or raw data | ("all data" OR "all array data" OR "raw data" OR "full data set" OR "full dataset" OR "crystallographic data" OR "subject-level data") | | NOT_DATA | Set of negations of the data phrases | ("not all data" OR "not all array data" OR "no raw data" OR "no full data set" OR "no full dataset") | -| FIELD_SPECIFIC_DB | Set of names and abbreviations of field specific databases | ("GEO" OR "Gene Expression Omnibus" OR "European Nucleotide Archive" OR "National Center for Biotechnology Information" OR "European Molecular Biology Laboratory" OR "EMBL-EBI" OR "BioProject" OR "Sequence Read Archive" OR "SRA" OR "ENA" OR "MassIVE" OR "ProteomeXchange" OR "Proteome Exchange" OR "ProteomeExchange" OR "MetaboLights" OR "Array-Express" OR "ArrayExpress" OR "Array Express" OR "PRIDE" OR "DNA Data Bank of Japan" OR "DDBJ" OR "Genbank" OR "Protein Databank" OR "Protein Data Bank" OR "PDB" OR "Metagenomics Rapid Annotation using Subsystem Technology" OR "MG-RAST" OR "metabolights" OR "OpenAgrar" OR "Open Agrar" OR "Electron microscopy data bank" OR "emdb" OR "Cambridge Crystallographic Data Centre" OR "CCDC" OR "Treebase" OR "dbSNP" OR "dbGaP" OR "IntAct" OR "ClinVar" OR "accession number" OR "accession code" OR "accession numbers" OR "accession codes") | -| ACCESSION_NR | Set of regular expressions that represent the accession number formats of different (biomedicine-related) databases | ("G(SE\|SM\|DS\|PL)[[:digit:]]{2,}" OR "PRJ(E\|D\|N\|EB\|DB\|NB)[:digit:]+" OR "SAM(E\|D\|N)[A-Z]?[:digit:]+" OR "[A-Z]{1}[:digit:]{5}" OR "[A-Z]{2}[:digit:]{6}" OR "[A-Z]{3}[:digit:]{5}" OR "[A-Z]{4,6}[:digit:]{7,9}" OR "GCA_[:digit:]{9}\\.[:digit:]+" OR "PRJNA[[:digit:]]{3,}" OR "SR(P\|R\|X\|S\|Z)[[:digit:]]{3,}" OR "E-[A-Z]{4}-[:digit:]{1,}" OR "[:digit:]{1}[A-Z]{1}[[:alnum:]]{2}" OR "MTBLS[[:digit:]]{2,}" OR "10.17590" OR "10.5073" OR "EMD-[[:digit:]]{4,}" OR "[[:digit:]]{7}" OR "[A-Z]{2}_[:digit:]{6,}" OR "[A-Z]{2}-[:digit:]{4,}") | +| FIELD_SPECIFIC_REPO | Set of names and abbreviations of field-specific repositories | ("GEO" OR "Gene Expression Omnibus" OR "European Nucleotide Archive" OR "National Center for Biotechnology Information" OR "European Molecular Biology Laboratory" OR "EMBL-EBI" OR "BioProject" OR "Sequence Read Archive" OR "SRA" OR "ENA" OR "MassIVE" OR "ProteomeXchange" OR "Proteome Exchange" OR "ProteomeExchange" OR "MetaboLights" OR "Array-Express" OR "ArrayExpress" OR "Array Express" OR "PRIDE" OR "DNA Data Bank of Japan" OR "DDBJ" OR "Genbank" OR "Protein Databank" OR "Protein Data Bank" OR "PDB" OR "Metagenomics Rapid Annotation using Subsystem Technology" OR "MG-RAST" OR "metabolights" OR "OpenAgrar" OR "Open Agrar" OR "Electron microscopy data bank" OR "emdb" OR "Cambridge Crystallographic Data Centre" OR "CCDC" OR "Treebase" OR "dbSNP" OR "dbGaP" OR "IntAct" OR "ClinVar" OR "accession number" OR "accession code" OR "accession numbers" OR "accession codes") | +| ACCESSION_NR | Set of regular expressions that represent the accession number formats of different (biomedicine-related) repositories | ("G(SE\|SM\|DS\|PL)[[:digit:]]{2,}" OR "PRJ(E\|D\|N\|EB\|DB\|NB)[:digit:]+" OR "SAM(E\|D\|N)[A-Z]?[:digit:]+" OR "[A-Z]{1}[:digit:]{5}" OR "[A-Z]{2}[:digit:]{6}" OR "[A-Z]{3}[:digit:]{5}" OR "[A-Z]{4,6}[:digit:]{7,9}" OR "GCA_[:digit:]{9}\\.[:digit:]+" OR "PRJNA[[:digit:]]{3,}" OR "SR(P\|R\|X\|S\|Z)[[:digit:]]{3,}" OR "E-[A-Z]{4}-[:digit:]{1,}" OR "[:digit:]{1}[A-Z]{1}[[:alnum:]]{2}" OR "MTBLS[[:digit:]]{2,}" OR "10.17590" OR "10.5073" OR "EMD-[[:digit:]]{4,}" OR "[[:digit:]]{7}" OR "[A-Z]{2}_[:digit:]{6,}" OR "[A-Z]{2}-[:digit:]{4,}") | | REPOSITORIES | Set of names of general-purpose repositories | ("figshare" OR "dryad" OR "zenodo" OR "dataverse" OR "DataverseNL" OR "osf" OR "open science framework" OR "mendeley data" OR "GIGADB" OR "GigaScience database" OR "OpenNeuro") | | FILE_FORMATS | Set of file formats | ("csv" OR "zip" OR "xls" OR "xlsx" OR "sav" OR "cif" OR "fasta") | | GITHUB | Github for data has to be treated differently, as we need additional information that data and not only code was shared on Github | (“github”) | diff --git a/tests/testthat/test_keywords.R b/tests/testthat/test_keywords.R index 9035f49..c9b7afe 100644 --- a/tests/testthat/test_keywords.R +++ b/tests/testthat/test_keywords.R @@ -37,13 +37,13 @@ test_that("not_available", expect_false(.detect_keywords("The data are provided", "not_available")) }) -test_that("field_specific_db", +test_that("field_specific_repo", { - expect_true(.detect_keywords("data were deposited in the geo repository ", "field_specific_db")) - expect_true(.detect_keywords("data were deposited in the sequence read archive", "field_specific_db")) - expect_true(.detect_keywords("data were deposited in the dbGaP database ", "field_specific_db")) - expect_true(.detect_keywords("data were deposited in the treebase database ", "field_specific_db")) - expect_false(.detect_keywords("data were deposited in the database ", "field_specific_db")) + expect_true(.detect_keywords("data were deposited in the geo repository ", "field_specific_repo")) + expect_true(.detect_keywords("data were deposited in the sequence read archive", "field_specific_repo")) + expect_true(.detect_keywords("data were deposited in the dbGaP database ", "field_specific_repo")) + expect_true(.detect_keywords("data were deposited in the treebase database ", "field_specific_repo")) + expect_false(.detect_keywords("data were deposited in the database ", "field_specific_repo")) }) test_that("accession_nr", @@ -257,32 +257,32 @@ context("combined keywords") test_that("field_specific_databases", { - expect_true(.keyword_search_tokenized_2("deposited in geo with accession number gse77534")[["com_specific_db"]]) - expect_true(.keyword_search_tokenized_2(tolower("The crystallographic data and structure were deposited in the Protein Data Bank under ID: 5AHK."))[["com_specific_db"]]) - expect_true(.keyword_search_tokenized_2("CIF file for the Na12[Co5POM] is deposited with the Cambridge Crystallographic Data Centre (CCDC no. 1558372)" %>% tolower())[["com_specific_db"]]) - expect_true(.keyword_search_tokenized_2("data described herein is available at european nucleotide archive under the project identifiers prjeb402 prjeb6610 and prjeb7988 pangaea48–50" %>% tolower())[["com_specific_db"]]) - expect_true(.keyword_search_tokenized_2("Microarray-based methylome data are available at Array-Express (E-MTAB-5797)." %>% tolower())[["com_specific_db"]]) - expect_true(.keyword_search_tokenized_2("genome and chromatin immunoprecipitation sequencing data have been deposited in the ncbi sequence read archive and gene expression omnibus databases (bioproject 320056 and data set gse81160 respectively)." %>% tolower())[["com_specific_db"]]) - expect_true(.keyword_search_tokenized_2("the rna-seq data for human cd4 þ central memory t cells referenced in this study are available in the ‘european nucleotide archive’ with the accession codes erp004883 (ref." %>% tolower())[["com_specific_db"]]) - expect_true(.keyword_search_tokenized_2("the new 18s genotypes reported in this paper are available in the genbank under the accession number(s): kj170100.1 to kj170108.1 5." %>% tolower())[["com_specific_db"]]) - expect_true(.keyword_search_tokenized_2("the data reported in this article have been deposited in the proteomexchange (accession number pxd004606)." %>% tolower())[["com_specific_db"]]) - expect_true(.keyword_search_tokenized_2("Metabolome data have been submitted to the Metabolights database75 (http://www.ebi.ac.uk/metabolights/) with the accession number: MTBLS168." %>% tolower())[["com_specific_db"]]) - expect_true(.keyword_search_tokenized_2("are available in the OpenAgrar repository: https://doi.org/10.17590/20171025-153520 and https://doi.org/10.17590/20171025-154025" %>% tolower())[["com_specific_db"]]) - expect_true(.keyword_search_tokenized_2("Microarray data were uploaded in Gene Expression Omnibus (GEO) repository, accession number: GSE94381." %>% tolower())[["com_specific_db"]]) - expect_true(.keyword_search_tokenized_2("were deposited in the Electron Microscopy Data Bank (European Molecular Biology Laboratory-European Bioinformatics Institute, Cambridge, UK) with the accession numbers EMD-3221 (classical), EMD-3223" %>% tolower())[["com_specific_db"]]) - expect_false(.keyword_search_tokenized_2("Raw sequencing data are available via the European Genome–phenome Archive (accession EGAS00001002213)." %>% tolower())[["com_specific_db"]]) + expect_true(.keyword_search_tokenized_2("deposited in geo with accession number gse77534")[["com_specific_repo"]]) + expect_true(.keyword_search_tokenized_2(tolower("The crystallographic data and structure were deposited in the Protein Data Bank under ID: 5AHK."))[["com_specific_repo"]]) + expect_true(.keyword_search_tokenized_2("CIF file for the Na12[Co5POM] is deposited with the Cambridge Crystallographic Data Centre (CCDC no. 1558372)" %>% tolower())[["com_specific_repo"]]) + expect_true(.keyword_search_tokenized_2("data described herein is available at european nucleotide archive under the project identifiers prjeb402 prjeb6610 and prjeb7988 pangaea48–50" %>% tolower())[["com_specific_repo"]]) + expect_true(.keyword_search_tokenized_2("Microarray-based methylome data are available at Array-Express (E-MTAB-5797)." %>% tolower())[["com_specific_repo"]]) + expect_true(.keyword_search_tokenized_2("genome and chromatin immunoprecipitation sequencing data have been deposited in the ncbi sequence read archive and gene expression omnibus databases (bioproject 320056 and data set gse81160 respectively)." %>% tolower())[["com_specific_repo"]]) + expect_true(.keyword_search_tokenized_2("the rna-seq data for human cd4 þ central memory t cells referenced in this study are available in the ‘european nucleotide archive’ with the accession codes erp004883 (ref." %>% tolower())[["com_specific_repo"]]) + expect_true(.keyword_search_tokenized_2("the new 18s genotypes reported in this paper are available in the genbank under the accession number(s): kj170100.1 to kj170108.1 5." %>% tolower())[["com_specific_repo"]]) + expect_true(.keyword_search_tokenized_2("the data reported in this article have been deposited in the proteomexchange (accession number pxd004606)." %>% tolower())[["com_specific_repo"]]) + expect_true(.keyword_search_tokenized_2("Metabolome data have been submitted to the Metabolights database75 (http://www.ebi.ac.uk/metabolights/) with the accession number: MTBLS168." %>% tolower())[["com_specific_repo"]]) + expect_true(.keyword_search_tokenized_2("are available in the OpenAgrar repository: https://doi.org/10.17590/20171025-153520 and https://doi.org/10.17590/20171025-154025" %>% tolower())[["com_specific_repo"]]) + expect_true(.keyword_search_tokenized_2("Microarray data were uploaded in Gene Expression Omnibus (GEO) repository, accession number: GSE94381." %>% tolower())[["com_specific_repo"]]) + expect_true(.keyword_search_tokenized_2("were deposited in the Electron Microscopy Data Bank (European Molecular Biology Laboratory-European Bioinformatics Institute, Cambridge, UK) with the accession numbers EMD-3221 (classical), EMD-3223" %>% tolower())[["com_specific_repo"]]) + expect_false(.keyword_search_tokenized_2("Raw sequencing data are available via the European Genome–phenome Archive (accession EGAS00001002213)." %>% tolower())[["com_specific_repo"]]) }) test_that("general_purpose_databases", { - expect_true(.keyword_search_tokenized_2("openly available via http://dx.doi. org/10.5061/dryad.p4s57.")[["com_general_db"]]) - expect_true(.keyword_search_tokenized_2("data availability statement: all the relevant data for this study is available from the http://figshare.com database (figshare.com/s/ 205e6d4a552511e4856c06ec4bbcf141 http://dx.doi. org/10.6084/m9.figshare.1206313.")[["com_general_db"]]) - expect_true(.keyword_search_tokenized_2("data was uploaded as online supporting information (file s2) and deposited in zenodo (doi 10.5281/zenodo.17098).")[["com_general_db"]]) - expect_true(.keyword_search_tokenized_2("All files are available from the Open Science Framework database" %>% tolower())[["com_general_db"]]) - expect_true(.keyword_search_tokenized_2("the raw quantification data files have been deposited in the Mendeley Data repositroy (http://dx.doi.org/10.17632/vgpmnzdz55.1)" %>% tolower())[["com_general_db"]]) - expect_true(.keyword_search_tokenized_2("Data Availability Statement: All relevant data are available from Dryad (doi:10.5061/dryad.g8143).")[["com_general_db"]]) + expect_true(.keyword_search_tokenized_2("openly available via http://dx.doi. org/10.5061/dryad.p4s57.")[["com_general_repo"]]) + expect_true(.keyword_search_tokenized_2("data availability statement: all the relevant data for this study is available from the http://figshare.com database (figshare.com/s/ 205e6d4a552511e4856c06ec4bbcf141 http://dx.doi. org/10.6084/m9.figshare.1206313.")[["com_general_repo"]]) + expect_true(.keyword_search_tokenized_2("data was uploaded as online supporting information (file s2) and deposited in zenodo (doi 10.5281/zenodo.17098).")[["com_general_repo"]]) + expect_true(.keyword_search_tokenized_2("All files are available from the Open Science Framework database" %>% tolower())[["com_general_repo"]]) + expect_true(.keyword_search_tokenized_2("the raw quantification data files have been deposited in the Mendeley Data repositroy (http://dx.doi.org/10.17632/vgpmnzdz55.1)" %>% tolower())[["com_general_repo"]]) + expect_true(.keyword_search_tokenized_2("Data Availability Statement: All relevant data are available from Dryad (doi:10.5061/dryad.g8143).")[["com_general_repo"]]) }) test_that("data_on_github", @@ -327,6 +327,6 @@ test_that("open_data_search", test_that("open_data_sentences", { - expect_equivalent(open_data_search(example_text)$com_specific_db, + expect_equivalent(open_data_search(example_text)$open_data_statements, c("deposited in geo with accession number gse77534", "")) }) \ No newline at end of file