Skip to content

Commit

Permalink
Merge pull request #1407 from pagreene/get-citations
Browse files: browse the repository at this point in the history
Get article references, either just PMIDs or details
  • Loading branch information
bgyori committed May 2, 2023
2 parents bc087f0 + ac477d1 commit d6b7240
Show file tree
Hide file tree
Showing 2 changed files with 57 additions and 4 deletions.
45 changes: 41 additions & 4 deletions indra/literature/pubmed_client.py
Expand Up @@ -367,7 +367,10 @@ def _find_date(element):

def _parse_author(author_info, include_details=False):
if not include_details:
return author_info.find("LastName").text
last_name = author_info.find("LastName")
if last_name is None:
return None
return last_name.text

parsed_info = {
"last_name": None,
Expand Down Expand Up @@ -397,6 +400,27 @@ def _parse_author(author_info, include_details=False):
return parsed_info


def _get_references(reference_list, only_pmid=True):
    """Return a list of references for an article.

    Parameters
    ----------
    reference_list : Optional[xml.etree.ElementTree.Element]
        The element whose direct ``Reference`` children are read (the caller
        passes the result of ``find('ReferenceList')``), or None if the
        article has no reference list.
    only_pmid : bool
        If True, each entry of the returned list is the text looked up for
        the reference's ``ArticleId`` with ``IdType="pubmed"``. If False, each
        entry is a dict with the keys 'pmid', 'doi', 'pmcid' and 'citation'.
        Default: True

    Returns
    -------
    Optional[list]
        None if `reference_list` is None. Otherwise a list with one entry per
        direct ``Reference`` child, in document order. Fields that cannot be
        found hold whatever `_find_elem_text` returns for a missing element
        (presumably None -- verify against that helper), so the list may
        contain such placeholders rather than being filtered.
    """
    if reference_list is None:
        return None

    references = []
    for reference in reference_list.findall('Reference'):
        pmid = _find_elem_text(reference, '*/ArticleId[@IdType="pubmed"]')
        if only_pmid:
            references.append(pmid)
            continue
        references.append({
            'pmid': pmid,
            'doi': _find_elem_text(reference, '*/ArticleId[@IdType="doi"]'),
            # PubMed XML normally tags PMC identifiers with IdType="pmc".
            # The "pmcid" lookup is kept first so any value it returned
            # before is unchanged; "pmc" only fills in an otherwise empty
            # result.
            'pmcid': (
                _find_elem_text(reference, '*/ArticleId[@IdType="pmcid"]')
                or _find_elem_text(reference, '*/ArticleId[@IdType="pmc"]')
            ),
            'citation': _find_elem_text(reference, 'Citation'),
        })
    return references


def _get_article_info(medline_citation, pubmed_data, detailed_authors=False):
article = medline_citation.find('Article')
pmid = _find_elem_text(medline_citation, './PMID')
Expand Down Expand Up @@ -431,7 +455,8 @@ def _get_article_info(medline_citation, pubmed_data, detailed_authors=False):

def get_metadata_from_xml_tree(tree, get_issns_from_nlm=False,
get_abstracts=False, prepend_title=False,
mesh_annotations=True, detailed_authors=False):
mesh_annotations=True, detailed_authors=False,
references_included=None):
"""Get metadata for an XML tree containing PubmedArticle elements.
Documentation on the XML structure can be found at:
Expand Down Expand Up @@ -459,6 +484,9 @@ def get_metadata_from_xml_tree(tree, get_issns_from_nlm=False,
If True, extract as many of the author details as possible, such as
first name, identifiers, and institutions. If false, only last names
are returned. Default: False
references_included : Optional[str]
If 'detailed', include detailed references in the results. If 'pmid', only include
the PMID of the reference. If None, don't include references. Default: None
Returns
-------
Expand All @@ -483,6 +511,11 @@ def get_metadata_from_xml_tree(tree, get_issns_from_nlm=False,
if mesh_annotations:
context_info = _get_annotations(medline_citation)
result.update(context_info)
if references_included:
references = _get_references(pubmed_data.find('ReferenceList'),
only_pmid=(references_included == 'pmid'))
result['references'] = references

publication_date = _get_pubmed_publication_date(pubmed_data)
result['publication_date'] = publication_date

Expand Down Expand Up @@ -566,7 +599,7 @@ def _major_topic(e):

def get_metadata_for_ids(pmid_list, get_issns_from_nlm=False,
get_abstracts=False, prepend_title=False,
detailed_authors=False):
detailed_authors=False, references_included=None):
"""Get article metadata for up to 200 PMIDs from the Pubmed database.
Parameters
Expand All @@ -586,6 +619,9 @@ def get_metadata_for_ids(pmid_list, get_issns_from_nlm=False,
If True, extract as many of the author details as possible, such as
first name, identifiers, and institutions. If false, only last names
are returned. Default: False
references_included : Optional[str]
If 'detailed', include detailed references in the results. If 'pmid', only include
the PMID of the reference. If None, don't include references. Default: None
Returns
-------
Expand All @@ -604,7 +640,8 @@ def get_metadata_for_ids(pmid_list, get_issns_from_nlm=False,
return None
return get_metadata_from_xml_tree(tree, get_issns_from_nlm, get_abstracts,
prepend_title,
detailed_authors=detailed_authors)
detailed_authors=detailed_authors,
references_included=references_included)


@lru_cache(maxsize=1000)
Expand Down
16 changes: 16 additions & 0 deletions indra/tests/test_pubmed_client.py
Expand Up @@ -82,6 +82,7 @@ def test_get_complex_title():
assert title.lower().startswith('atomic structures')
assert title.lower().endswith('vascular plants.')


@pytest.mark.webservice
def test_expand_pagination():
time.sleep(0.5)
Expand Down Expand Up @@ -144,6 +145,21 @@ def test_get_metadata_for_ids():
metadata2[pmids1[0]]['authors'][0]['affiliations'][0]['name']


@pytest.mark.webservice
def test_get_paper_references():
    """Check that references come back in both 'pmid' and 'detailed' modes."""
    # Short pause before hitting the web service (presumably to stay under
    # the remote rate limit -- the other webservice tests do the same).
    time.sleep(0.5)

    citing_pmid = '27121204'
    query_pmids = ['27123883', citing_pmid, '27115606']
    expected_first_ref = '25439075'

    # PMID-only mode: every reference is a bare PMID string.
    pmid_only = pubmed_client.get_metadata_for_ids(
        query_pmids, references_included='pmid')
    pmid_refs = pmid_only[citing_pmid]['references']
    assert len(pmid_refs) != 0
    assert pmid_refs[0] == expected_first_ref

    # Detailed mode: every reference is a dict that carries the PMID under
    # the 'pmid' key.
    detailed = pubmed_client.get_metadata_for_ids(
        query_pmids, references_included='detailed')
    detailed_refs = detailed[citing_pmid]['references']
    assert len(detailed_refs) != 0
    assert detailed_refs[0]['pmid'] == expected_first_ref


@pytest.mark.webservice
def test_get_pub_date():
time.sleep(0.5)
Expand Down

0 comments on commit d6b7240

Please sign in to comment.