Skip to content

Commit

Permalink
Merge pull request #4 from quest-bih/ConsolidateStatements
Browse files Browse the repository at this point in the history
Consolidate statements
  • Loading branch information
NicoRiedel committed Apr 20, 2020
2 parents c39d474 + 2404311 commit b308783
Show file tree
Hide file tree
Showing 3 changed files with 62 additions and 45 deletions.
41 changes: 29 additions & 12 deletions R/ODDPub_functions.R
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,7 @@
keyword_list[["not_available"]] <- not_available


field_specific_db <- c("GEO",
field_specific_repo <- c("GEO",
"Gene Expression Omnibus",
"European Nucleotide Archive",
"National Center for Biotechnology Information",
Expand Down Expand Up @@ -189,7 +189,7 @@
"accession numbers",
"accession codes") %>%
.format_keyword_vector(end_boundary = TRUE)
keyword_list[["field_specific_db"]] <- field_specific_db
keyword_list[["field_specific_repo"]] <- field_specific_repo



Expand Down Expand Up @@ -464,7 +464,7 @@

#not all keyword categories are used for the sentence search
sentence_search_keywords <- c("available", "was_available", "not_available",
"field_specific_db", "accession_nr", "repositories",
"field_specific_repo", "accession_nr", "repositories",
"github", "data", "all_data",
"not_data", "source_code", "supplement",
"file_formats", "upon_request", "dataset")
Expand Down Expand Up @@ -504,12 +504,13 @@

#combine columns for the different open data keywords
keyword_results_combined <- open_data_categories %>%
map(mutate, com_specific_db = field_specific_db & accession_nr & available & !not_available & !was_available) %>%
map(mutate, com_general_db = repositories & available & !not_available & !was_available) %>%
map(mutate, com_specific_repo = field_specific_repo & accession_nr & available & !not_available & !was_available) %>%
map(mutate, com_general_repo = repositories & available & !not_available & !was_available) %>%
map(mutate, com_github_data = data & github & available & !not_available & !was_available) %>%
map(mutate, com_code = source_code & available & !not_available & !was_available & !upon_request) %>%
map(mutate, com_suppl_code = supplement & source_code) %>%
map(select, publ_sentences, com_specific_db, com_general_db, com_github_data, dataset, com_code, com_suppl_code)
map(select, publ_sentences, com_specific_repo, com_general_repo,
com_github_data, dataset, com_code, com_suppl_code)

return(keyword_results_combined)
}
Expand Down Expand Up @@ -547,9 +548,12 @@
map_function <- map_lgl
}
keyword_results_near_wd <- tibble(
com_file_formats = map_function(PDF_text_full, str_function, pattern = keyword_list[["all_data_file_formats"]]),
com_supplemental_data = map_function(PDF_text_full, str_function, pattern = keyword_list[["supp_table_data"]]),
com_data_availibility = map_function(PDF_text_full, str_function, pattern = keyword_list[["data_availibility_statement"]]))
com_file_formats = map_function(PDF_text_full, str_function,
pattern = keyword_list[["all_data_file_formats"]]),
com_supplemental_data = map_function(PDF_text_full, str_function,
pattern = keyword_list[["supp_table_data"]]),
com_data_availibility = map_function(PDF_text_full, str_function,
pattern = keyword_list[["data_availibility_statement"]]))

return(keyword_results_near_wd)
}
Expand Down Expand Up @@ -599,12 +603,14 @@
keyword_results_near_wd <- .keyword_search_near_wd(PDF_text_sentences)
data_journal_doi <- .check_journal_doi(PDF_text_sentences)

keyword_results_combined <- cbind(keyword_results_tokenized, keyword_results_near_wd, data_journal_doi) %>%
keyword_results_combined <- cbind(keyword_results_tokenized,
keyword_results_near_wd,
data_journal_doi) %>%
as_tibble()

#check if any of the combined columns was positive to determine if the publication has Open Data or Open Code
open_data_publication <- keyword_results_combined %>%
mutate(is_open_data = com_specific_db | com_general_db | com_file_formats | com_github_data | dataset | com_supplemental_data | com_data_availibility | is_data_journal) %>%
mutate(is_open_data = com_specific_repo | com_general_repo | com_file_formats | com_github_data | dataset | com_supplemental_data | com_data_availibility | is_data_journal) %>%
mutate(is_open_code = com_code | com_suppl_code) %>%
tibble::add_column(article = names(PDF_text_sentences)) %>%
select(article, is_open_data, is_open_code)
Expand All @@ -624,8 +630,19 @@
open_data_sentences <- cbind(names(keyword_results), open_data_sentences, keyword_results_near_wd) %>%
as_tibble() %>%
mutate_each(funs(as.character))
colnames(open_data_sentences) <- c("article", "com_specific_db", "com_general_db", "com_github_data", "dataset", "com_code", "com_suppl_code",
colnames(open_data_sentences) <- c("article", "com_specific_repo", "com_general_repo",
"com_github_data", "dataset", "com_code", "com_suppl_code",
"com_file_formats", "com_supplemental_data", "com_data_availibility")
open_data_sentences[is.na(open_data_sentences)] = "" #unify empty fields

#collapse the found statements into one column for Open Data and one for Open Code
open_data_sentences <- open_data_sentences %>%
mutate(open_data_statements = paste(com_specific_repo, com_general_repo, com_github_data,
dataset, com_file_formats, com_supplemental_data,
com_data_availibility, sep = " ") %>% trimws()) %>%
mutate(open_code_statements = paste(com_code, com_suppl_code, sep = " ") %>% trimws()) %>%
select(article, open_data_statements, open_code_statements)


return(open_data_sentences)
}
Expand Down
12 changes: 6 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ devtools::install_github("quest-bih/oddpub")
The algorithm searches for several categories of similar keywords in each
sentence. Multiple categories have to match for a single sentence to trigger a
detection. Among keyword categories are categories for specific biomedical
databases as well as their corresponding accession numbers (as regular
repositories as well as their corresponding accession numbers (as regular
expressions), general-purpose repositories or different file formats typically
used to distribute raw data in the supplement.

Expand Down Expand Up @@ -60,7 +60,7 @@ open_data_results <- oddpub::open_data_search_parallel(PDF_text_sentences)
```
Parallelized version of the algorithm that starts several parallel processes using the foreach and doParallel packages to speed up the detection. The number of processes can be set with the parameter ```cluster_num``` (default value: 4).

To validate the algorithm, we manually screened a sample of 792 publications that were randomly selected from PubMed. On this validation dataset, our algorithm detects Open Data publications with a sensitivity of 0.74 and specificity of 0.97.
To validate the algorithm, we manually screened a sample of 792 publications that were randomly selected from PubMed. On this validation dataset, our algorithm detects Open Data publications with a sensitivity of 0.73 and specificity of 0.97.

## Detailed description of the keywords

Expand All @@ -70,8 +70,8 @@ Those are the combined keyword categories that are searched in the full text. If

| Combined Keyword Category | Keywords |
|---------------------------|----------|
| Field-specific databases | FIELD_SPECIFIC_DB NEAR ACCESSION_NR NEAR (AVAILABLE NOT (NOT_AVAILABLE OR WAS_AVAILABLE)) |
| General-purpose databases | REPOSITORIES NEAR (AVAILABLE NOT (NOT_AVAILABLE OR WAS_AVAILABLE)) |
| Field-specific repositories | FIELD_SPECIFIC_REPO NEAR ACCESSION_NR NEAR (AVAILABLE NOT (NOT_AVAILABLE OR WAS_AVAILABLE)) |
| General-purpose repositories | REPOSITORIES NEAR (AVAILABLE NOT (NOT_AVAILABLE OR WAS_AVAILABLE)) |
| Dataset | (DATASET_NAME OUTER_SYM DATASET_NUMBER) OR SUPPLEMENTAL_DATASET |
| Supplemental table or data | SUPPLEMENTAL_TABLE NEAR_WD(10) (FILE_FORMATS OR ALL_DATA) |
| Supplementary raw/full data with specific file format | (ALL_DATA NOT NOT_DATA) NEAR_WD(10) FILE_FORMATS |
Expand Down Expand Up @@ -100,8 +100,8 @@ Individual keyword categories:
| UPON_REQUEST | Phrase describing that data are only available upon request | ("upon request" OR "on request" OR "upon reasonable request") |
| ALL_DATA | Set of words describing all data or raw data | ("all data" OR "all array data" OR "raw data" OR "full data set" OR "full dataset" OR "crystallographic data" OR "subject-level data") |
| NOT_DATA | Set of negations of the data phrases | ("not all data" OR "not all array data" OR "no raw data" OR "no full data set" OR "no full dataset") |
| FIELD_SPECIFIC_DB | Set of names and abbreviations of field specific databases | ("GEO" OR "Gene Expression Omnibus" OR "European Nucleotide Archive" OR "National Center for Biotechnology Information" OR "European Molecular Biology Laboratory" OR "EMBL-EBI" OR "BioProject" OR "Sequence Read Archive" OR "SRA" OR "ENA" OR "MassIVE" OR "ProteomeXchange" OR "Proteome Exchange" OR "ProteomeExchange" OR "MetaboLights" OR "Array-Express" OR "ArrayExpress" OR "Array Express" OR "PRIDE" OR "DNA Data Bank of Japan" OR "DDBJ" OR "Genbank" OR "Protein Databank" OR "Protein Data Bank" OR "PDB" OR "Metagenomics Rapid Annotation using Subsystem Technology" OR "MG-RAST" OR "metabolights" OR "OpenAgrar" OR "Open Agrar" OR "Electron microscopy data bank" OR "emdb" OR "Cambridge Crystallographic Data Centre" OR "CCDC" OR "Treebase" OR "dbSNP" OR "dbGaP" OR "IntAct" OR "ClinVar" OR "accession number" OR "accession code" OR "accession numbers" OR "accession codes") |
| ACCESSION_NR | Set of regular expressions that represent the accession number formats of different (biomedicine-related) databases | ("G(SE\|SM\|DS\|PL)[[:digit:]]{2,}" OR "PRJ(E\|D\|N\|EB\|DB\|NB)[:digit:]+" OR "SAM(E\|D\|N)[A-Z]?[:digit:]+" OR "[A-Z]{1}[:digit:]{5}" OR "[A-Z]{2}[:digit:]{6}" OR "[A-Z]{3}[:digit:]{5}" OR "[A-Z]{4,6}[:digit:]{7,9}" OR "GCA_[:digit:]{9}\\.[:digit:]+" OR "PRJNA[[:digit:]]{3,}" OR "SR(P\|R\|X\|S\|Z)[[:digit:]]{3,}" OR "E-[A-Z]{4}-[:digit:]{1,}" OR "[:digit:]{1}[A-Z]{1}[[:alnum:]]{2}" OR "MTBLS[[:digit:]]{2,}" OR "10.17590" OR "10.5073" OR "EMD-[[:digit:]]{4,}" OR "[[:digit:]]{7}" OR "[A-Z]{2}_[:digit:]{6,}" OR "[A-Z]{2}-[:digit:]{4,}") |
| FIELD_SPECIFIC_REPO | Set of names and abbreviations of field-specific repositories | ("GEO" OR "Gene Expression Omnibus" OR "European Nucleotide Archive" OR "National Center for Biotechnology Information" OR "European Molecular Biology Laboratory" OR "EMBL-EBI" OR "BioProject" OR "Sequence Read Archive" OR "SRA" OR "ENA" OR "MassIVE" OR "ProteomeXchange" OR "Proteome Exchange" OR "ProteomeExchange" OR "MetaboLights" OR "Array-Express" OR "ArrayExpress" OR "Array Express" OR "PRIDE" OR "DNA Data Bank of Japan" OR "DDBJ" OR "Genbank" OR "Protein Databank" OR "Protein Data Bank" OR "PDB" OR "Metagenomics Rapid Annotation using Subsystem Technology" OR "MG-RAST" OR "metabolights" OR "OpenAgrar" OR "Open Agrar" OR "Electron microscopy data bank" OR "emdb" OR "Cambridge Crystallographic Data Centre" OR "CCDC" OR "Treebase" OR "dbSNP" OR "dbGaP" OR "IntAct" OR "ClinVar" OR "accession number" OR "accession code" OR "accession numbers" OR "accession codes") |
| ACCESSION_NR | Set of regular expressions that represent the accession number formats of different (biomedicine-related) repositories | ("G(SE\|SM\|DS\|PL)[[:digit:]]{2,}" OR "PRJ(E\|D\|N\|EB\|DB\|NB)[:digit:]+" OR "SAM(E\|D\|N)[A-Z]?[:digit:]+" OR "[A-Z]{1}[:digit:]{5}" OR "[A-Z]{2}[:digit:]{6}" OR "[A-Z]{3}[:digit:]{5}" OR "[A-Z]{4,6}[:digit:]{7,9}" OR "GCA_[:digit:]{9}\\.[:digit:]+" OR "PRJNA[[:digit:]]{3,}" OR "SR(P\|R\|X\|S\|Z)[[:digit:]]{3,}" OR "E-[A-Z]{4}-[:digit:]{1,}" OR "[:digit:]{1}[A-Z]{1}[[:alnum:]]{2}" OR "MTBLS[[:digit:]]{2,}" OR "10.17590" OR "10.5073" OR "EMD-[[:digit:]]{4,}" OR "[[:digit:]]{7}" OR "[A-Z]{2}_[:digit:]{6,}" OR "[A-Z]{2}-[:digit:]{4,}") |
| REPOSITORIES | Set of names of general-purpose repositories | ("figshare" OR "dryad" OR "zenodo" OR "dataverse" OR "DataverseNL" OR "osf" OR "open science framework" OR "mendeley data" OR "GIGADB" OR "GigaScience database" OR "OpenNeuro") |
| FILE_FORMATS | Set of file formats | ("csv" OR "zip" OR "xls" OR "xlsx" OR "sav" OR "cif" OR "fasta") |
| GITHUB | GitHub for data has to be treated differently, as we need additional information that data and not only code was shared on GitHub | ("github") |
Expand Down
Loading

0 comments on commit b308783

Please sign in to comment.