Skip to content

Commit

Permalink
Merge branch 'Add_Repos'
Browse files (browse the repository at this point in the history)
  • Loading branch information
NicoRiedel committed Oct 7, 2020
2 parents 2945d73 + 357018b commit c86abdb
Show file tree
Hide file tree
Showing 2 changed files with 129 additions and 10 deletions.
99 changes: 91 additions & 8 deletions R/ODDPub_functions.R
Original file line number | Diff line number | Diff line change
Expand Up @@ -184,6 +184,82 @@
"dbGaP",
"IntAct",
"ClinVar",
"European Variation Archive",
"dbVar",
"Mgnify",
"NCBI Trace Archive",
"NCBI Assembly",
"UniProtKB",
"Protein Circular Dichroism Data Bank",
"PCDDB",
"Crystallography Open Database",
"Coherent X-ray Imaging Data Bank",
"CXIDB",
"Biological Magnetic Resonance Data Bank",
"BMRB",
"Worldwide Protein Data Bank",
"wwPDB",
"Structural Biology Data Grid",
"NeuroMorpho",
"G-Node",
"Neuroimaging Informatics Tools and Resources Collaboratory",
"NITRC",
"EBRAINS",
"GenomeRNAi",
"Database of Interacting Proteins",
"IntAct",
"Japanese Genotype-phenotype Archive",
"Biological General Repository for Interaction Datasets",
"PubChem",
"Genomic Expression Archive",
"PeptideAtlas",
"Environmental Data Initiative",
"LTER Network Information System Data Portal",
"Global Biodiversity Information Facility",
"GBIF",
"Integrated Taxonomic Information System",
"ITIS",
"Knowledge Network for Biocomplexity",
"Morphobank",
"Kinetic Models of Biological Systems",
"KiMoSys",
"The Network Data Exchange",
"NDEx",
"FlowRepository",
"ImmPort",
"Image Data Resource",
"Cancer Imaging Archive",
"SICAS Medical Image Repository",
"Coherent X-ray Imaging Data Bank",
"CXIDB",
"Cell Image Library",
"Eukaryotic Pathogen Database Resources",
"EuPathDB",
"Influenza Research Database",
"Mouse Genome Informatics",
"Rat Genome Database",
"VectorBase",
"Xenbase",
"Zebrafish Model Organism Database",
"ZFIN",
"HIV Data Archive Program",
"NAHDAP",
"National Database for Autism Research",
"NDAR",
"PhysioNet",
"National Database for Clinical Trials related to Mental Illness",
"NDCT",
"Research Domain Criteria Database",
"RdoCdb",
"Synapse",
"UK Data Service",
"caNanoLab",
"ChEMBL",
"IoChem-BD",
"Computational Chemistry Datasets",
"STRENDA",
"European Genome–phenome Archive",
"European Genome phenome Archive",
"accession number",
"accession code",
"accession numbers",
Expand All @@ -196,22 +272,29 @@
# Patterns used to recognise dataset accession numbers and repository DOI
# prefixes. The vector is piped through .format_keyword_vector() and the result
# is stored as keyword_list[["accession_nr"]].
#
# NOTE(review): both `[:digit:]` and `[[:digit:]]` spellings occur below;
# whether they behave the same depends on the regex engine used downstream --
# confirm.
# NOTE(review): this listing appears to come from a diff view without +/-
# markers: the closing `) %>%` occurs twice and several near-duplicate patterns
# sit next to each other, so superseded and replacement lines are presumably
# both shown -- confirm against the committed file before relying on it.
accession_nr <- c("G(SE|SM|DS|PL)[[:digit:]]{2,}", #GEO
"PRJ(E|D|N|EB|DB|NB)[:digit:]+",
"SAM(E|D|N)[A-Z]?[:digit:]+",
"[A-Z]{1}[:digit:]{5}", #GenBank
"[A-Z]{1}[:digit:]{4}", #GenBank
"[A-Z]{2}[:digit:]{6}",
"[A-Z]{3}[:digit:]{5}",
"[A-Z]{4,6}[:digit:]{7,9}",
"[A-Z]{4,6}[:digit:]{3,}",
# cf. test input "GCA_000002285.2"
"GCA_[:digit:]{9}\\.[:digit:]+",
"PRJNA[[:digit:]]{3,}",
"SR(P|R|X|S|Z)[[:digit:]]{3,}",
"E-[A-Z]{4}-[:digit:]{1,}",
# cf. test input "P-GEAD-10"
"(E|P)-[A-Z]{4}-[:digit:]{1,}",
"[:digit:]{1}[A-Z]{1}[[:alnum:]]{2}",
"MTBLS[[:digit:]]{2,}",
# DOI prefixes; "10.17590" appears in the OpenAgrar test sentence.
# NOTE(review): the "." in these prefixes is not escaped here; if the strings
# are used as regular expressions it would match any character -- confirm how
# .format_keyword_vector() treats them.
"10.17590",
"10.5073",
# cf. test input "EMD-3221" (Electron Microscopy Data Bank sentence)
"EMD-[[:digit:]]{4,}",
"[[:digit:]]{7}",
"[A-Z]{2}_[:digit:]{6,}",
"[A-Z]{2}-[:digit:]{4,}") %>%
# Further DOI prefixes; cf. test inputs "10.25493/A2KP-FKD",
# "10.6073/pasta/...", "10.15468/aomfnb" and "10.5063/F1J964SR".
"10.25493",
"10.6073",
"10.15468",
"10.5063",
# cf. test input "708451"
"[[:digit:]]{6}",
# cf. test input "NMO_00001"
"[A-Z]{2,3}_[:digit:]{5,}",
# cf. test input "IM-11377-7"
"[A-Z]{2,3}-[:digit:]{4,}",
# cf. test input "GR00339-A-1"
"[A-Z]{2}[:digit:]{5}-[A-Z]{1}",
# cf. test input "DIP:19766N"
"DIP:[:digit:]{3}",
# cf. test input "FR-FCM-ZY68"
"FR-FCM-[[:alnum:]]{4}",
# cf. test input "ICPSR 3049"
"ICPSR [:digit:]{4}",
# cf. test input "SN 8680"
"SN [:digit:]{4}") %>%
.format_keyword_vector()
# Register the formatted patterns under the "accession_nr" keyword category.
keyword_list[["accession_nr"]] <- accession_nr

Expand Down
40 changes: 38 additions & 2 deletions tests/testthat/test_keywords.R
Original file line number | Diff line number | Diff line change
Expand Up @@ -104,6 +104,32 @@ test_that("accession_nr",
expect_true(.detect_keywords("accession nr AB981664", "accession_nr"))
expect_true(.detect_keywords("accession nr SCV000677102,", "accession_nr"))
expect_true(.detect_keywords("accession nr MKYY00000000", "accession_nr"))
expect_true(.detect_keywords("accession nr GCA_000002285.2", "accession_nr"))
expect_true(.detect_keywords("accession nr GCJV01000000", "accession_nr"))
expect_true(.detect_keywords("accession nr KY286086", "accession_nr"))
expect_true(.detect_keywords("accession nr nstd102", "accession_nr"))
expect_true(.detect_keywords("accession nr SRS2651772", "accession_nr"))
expect_true(.detect_keywords("accession nr MGYA00088411", "accession_nr"))
expect_true(.detect_keywords("accession nr ASM1483986v1", "accession_nr"))
expect_true(.detect_keywords("accession nr CD0004002012", "accession_nr"))
expect_true(.detect_keywords("accession nr NMO_00001", "accession_nr"))
expect_true(.detect_keywords("accession nr 10.25493/A2KP-FKD", "accession_nr"))
expect_true(.detect_keywords("accession nr GR00339-A-1", "accession_nr"))
expect_true(.detect_keywords("accession nr DIP:19766N", "accession_nr"))
expect_true(.detect_keywords("accession nr IM-11377-7", "accession_nr"))
expect_true(.detect_keywords("accession nr JGAD00000000192", "accession_nr"))
expect_true(.detect_keywords("accession nr 146156434", "accession_nr"))
expect_true(.detect_keywords("accession nr P-GEAD-10", "accession_nr"))
expect_true(.detect_keywords("accession nr 10.6073/pasta/c174404b0bb5d9a65bc8eccb40db825c", "accession_nr"))
expect_true(.detect_keywords("accession nr 10.15468/aomfnb", "accession_nr"))
expect_true(.detect_keywords("accession nr 10.5063/F1J964SR", "accession_nr"))
expect_true(.detect_keywords("accession nr FR-FCM-ZY68", "accession_nr"))
expect_true(.detect_keywords("accession nr P1110", "accession_nr"))
expect_true(.detect_keywords("accession nr AJ457961", "accession_nr"))
expect_true(.detect_keywords("accession nr 708451", "accession_nr"))
expect_true(.detect_keywords("accession nr ICPSR 3049", "accession_nr"))
expect_true(.detect_keywords("accession nr SN 8680", "accession_nr"))
expect_true(.detect_keywords("accession nr CHEMBL3301451", "accession_nr"))
})

test_that("repositories",
Expand Down Expand Up @@ -257,7 +283,7 @@ context("combined keywords")

test_that("field_specific_databases",
{
expect_true(.keyword_search_tokenized_2("deposited in geo with accession number gse77534")[["com_specific_repo"]])
expect_true(.keyword_search_tokenized_2("deposited in geo with accession number gse77534" %>% tolower())[["com_specific_repo"]])
expect_true(.keyword_search_tokenized_2(tolower("The crystallographic data and structure were deposited in the Protein Data Bank under ID: 5AHK."))[["com_specific_repo"]])
expect_true(.keyword_search_tokenized_2("CIF file for the Na12[Co5POM] is deposited with the Cambridge Crystallographic Data Centre (CCDC no. 1558372)" %>% tolower())[["com_specific_repo"]])
expect_true(.keyword_search_tokenized_2("data described herein is available at european nucleotide archive under the project identifiers prjeb402 prjeb6610 and prjeb7988 pangaea48–50" %>% tolower())[["com_specific_repo"]])
Expand All @@ -270,7 +296,17 @@ test_that("field_specific_databases",
expect_true(.keyword_search_tokenized_2("are available in the OpenAgrar repository: https://doi.org/10.17590/20171025-153520 and https://doi.org/10.17590/20171025-154025" %>% tolower())[["com_specific_repo"]])
expect_true(.keyword_search_tokenized_2("Microarray data were uploaded in Gene Expression Omnibus (GEO) repository, accession number: GSE94381." %>% tolower())[["com_specific_repo"]])
expect_true(.keyword_search_tokenized_2("were deposited in the Electron Microscopy Data Bank (European Molecular Biology Laboratory-European Bioinformatics Institute, Cambridge, UK) with the accession numbers EMD-3221 (classical), EMD-3223" %>% tolower())[["com_specific_repo"]])
expect_false(.keyword_search_tokenized_2("Raw sequencing data are available via the European Genome–phenome Archive (accession EGAS00001002213)." %>% tolower())[["com_specific_repo"]])
expect_true(.keyword_search_tokenized_2("Raw sequencing data are available via the European Genome–phenome Archive (accession EGAS00001002213)." %>% tolower())[["com_specific_repo"]])
expect_true(.keyword_search_tokenized_2("deposited in the European Variation Archive with ID GCA_000002285.2" %>% tolower())[["com_specific_repo"]])
expect_true(.keyword_search_tokenized_2("deposited in dbVar with ID nstd102" %>% tolower())[["com_specific_repo"]])
expect_true(.keyword_search_tokenized_2("deposited in Mgnify with ID MGYA00088411" %>% tolower())[["com_specific_repo"]])
expect_true(.keyword_search_tokenized_2("deposited in NeuroMorpho with ID NMO_00001" %>% tolower())[["com_specific_repo"]])
expect_true(.keyword_search_tokenized_2("deposited in GenomeRNAi with ID GR00175-A" %>% tolower())[["com_specific_repo"]])
expect_true(.keyword_search_tokenized_2("deposited in the Database of Interacting Proteins with ID DIP:310N" %>% tolower())[["com_specific_repo"]])
expect_true(.keyword_search_tokenized_2("deposited in the Global Biodiversity Information Facility with doi 10.15468/aomfnb" %>% tolower())[["com_specific_repo"]])
expect_true(.keyword_search_tokenized_2("deposited in the FlowRepository with ID FR-FCM-ZY68" %>% tolower())[["com_specific_repo"]])
expect_true(.keyword_search_tokenized_2("deposited in NAHDAP with ID ICPSR 33581" %>% tolower())[["com_specific_repo"]])
expect_true(.keyword_search_tokenized_2("deposited in the UK Data Service with ID SN 854233" %>% tolower())[["com_specific_repo"]])
})


Expand Down

0 comments on commit c86abdb

Please sign in to comment.