# Title
[]()

In [5]:

import sys
sys.path.append('../src')
sys.path.append(r"/home/silvhua/custom_python")
from silvhua import *

# Initialize

In [6]:
api_key = os.getenv('api_ncbi') # Pubmed API key
result_dict = dict()

# Iteration 1

In [9]:
import re
import requests
def retrieve_citation(article_id, api_key):
    """
    Retrieve article metadata from PubMed database.
    """
    base_url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
    if api_key:
        base_url += f'&api_key={api_key}'
    params = {
        'db': 'pubmed',
        'id': article_id
    }

    response = requests.get(base_url, params=params)
    return response.content

def search_article(
        query, api_key, query_tag=None, publication=None, reldate=None,
        systematic_only=False, review_only=False, verbose=False
        ):
    """
    Search for article title in PubMed database.

    Parameters:
    - title (str): article title
    - api_key (str): NCBI API key
    - reldate (int): the search returns only those items that have a date specified by datetype within the last n days.

    Returns:
    response (str): Article metadata from PubMed database if present. Otherwise, returns list of PMIDs.

    API documentation: https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESearch
    Pubmed User Guide including tags for filtering results: https://pubmed.ncbi.nlm.nih.gov/help/
    """
    base_url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
    data = {}
    if api_key:
        base_url += f'&api_key={api_key}'
    search_term = f'"{re.sub(r"not", "", query)}"' # Remove 'not' since it will be treated as a boolean
    if query_tag:
        search_term += f'{query_tag}'
    if publication:
        search_term = f'AND {publication} [ta]'
    if systematic_only:
        search_term += ' AND systematic[sb]'
    elif review_only:
        search_term += ' AND (systematic[sb] OR review[pt])'
    params = {
        'db': 'pubmed',
        'term': search_term,
        'retmax': 5,
        'retmode': 'json',
        'datetype':'edat',
    }
    if reldate:
        params['reldate'] = reldate
    print(f'Search term: {search_term}')

    response = requests.get(base_url, params=params)
    data = response.json()
    try:
        id_list = data['esearchresult']['idlist']
        if id_list:
            for index in range(len(id_list)):
                result = retrieve_citation(id_list[index], api_key).decode('utf-8')
            return result     
        else:
            print(f'No results found; returning API response object.')
            return data
                
    except Exception as error: 
        print(f'Response: \n{data}')
        exc_type, exc_obj, tb = sys.exc_info()
        file = tb.tb_frame
        lineno = tb.tb_lineno
        filename = file.f_code.co_filename
        print(f'\tAn error occurred on line {lineno} in {filename}: {error}')    
        print('Article not found.')
        return data

iteration = 1
query = 'resistance training'
result_dict[iteration] = search_article(
    query, api_key, reldate=10,
    systematic_only=True,    
    )
result_dict[iteration]

Search term: "resistance training" AND systematic[sb]


'<?xml version="1.0" ?>\n<!DOCTYPE PubmedArticleSet PUBLIC "-//NLM//DTD PubMedArticle, 1st January 2024//EN" "https://dtd.nlm.nih.gov/ncbi/pubmed/out/pubmed_240101.dtd">\n<PubmedArticleSet>\n<PubmedArticle><MedlineCitation Status="Publisher" Owner="NLM" IndexingMethod="Automated"><PMID Version="1">38432212</PMID><DateRevised><Year>2024</Year><Month>03</Month><Day>03</Day></DateRevised><Article PubModel="Print-Electronic"><Journal><ISSN IssnType="Electronic">1423-0399</ISSN><JournalIssue CitedMedium="Internet"><PubDate><Year>2024</Year><Month>Mar</Month><Day>02</Day></PubDate></JournalIssue><Title>Urologia internationalis</Title><ISOAbbreviation>Urol Int</ISOAbbreviation></Journal><ArticleTitle>Comparing efficacies of different exercises on androgen deprivation therapy adverse effects in prostate cancer patients: a network meta-analysis.</ArticleTitle><ELocationID EIdType="doi" ValidYN="Y">10.1159/000538114</ELocationID><Abstract><AbstractText Label="INTRODUCTION" NlmCategory="BACKGROUN

In [10]:
import re
import requests
def retrieve_citation(article_id, api_key):
    """
    Retrieve article metadata from PubMed database.
    """
    base_url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
    if api_key:
        base_url += f'&api_key={api_key}'
    params = {
        'db': 'pubmed',
        'id': article_id
    }

    response = requests.get(base_url, params=params)
    return response.content

def search_article(
        query, api_key, query_tag=None, publication=None, reldate=None,
        systematic_only=False, review_only=False, verbose=False
        ):
    """
    Search for article title in PubMed database.

    Parameters:
    - title (str): article title
    - api_key (str): NCBI API key
    - reldate (int): the search returns only those items that have a date specified by datetype within the last n days.

    Returns:
    response (str): Article metadata from PubMed database if present. Otherwise, returns list of PMIDs.

    API documentation: https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESearch
    Pubmed User Guide including tags for filtering results: https://pubmed.ncbi.nlm.nih.gov/help/
    """
    base_url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
    data = {}
    if api_key:
        base_url += f'&api_key={api_key}'
    search_term = f'"{re.sub(r"not", "", query)}"' # Remove 'not' since it will be treated as a boolean
    if query_tag:
        search_term += f'{query_tag}'
    if publication:
        search_term = f'AND {publication} [ta]'
    if systematic_only:
        search_term += ' AND systematic[sb]'
    elif review_only:
        search_term += ' AND (systematic[sb] OR review[pt])'
    params = {
        'db': 'pubmed',
        'term': search_term,
        'retmax': 5,
        'retmode': 'json',
        'datetype':'edat',
    }
    if reldate:
        params['reldate'] = reldate
    print(f'Search term: {search_term}')

    response = requests.get(base_url, params=params)
    data = response.json()
    try:
        id_list = data['esearchresult']['idlist']
        if id_list:
            for index in range(len(id_list)):
                result = retrieve_citation(id_list[index], api_key).decode('utf-8')
            return result     
        else:
            print(f'No results found; returning API response object.')
            return data
                
    except Exception as error: 
        print(f'Response: \n{data}')
        exc_type, exc_obj, tb = sys.exc_info()
        file = tb.tb_frame
        lineno = tb.tb_lineno
        filename = file.f_code.co_filename
        print(f'\tAn error occurred on line {lineno} in {filename}: {error}')    
        print('Article not found.')
        return data

def extract_pubmed_details(record_string):
    """
    Helper function called by `pubmed_details_by_title` to parse article metadata from PubMed database.
    """
    authors = re.findall(r'<Author ValidYN="Y".*?><LastName>(.*?)</LastName><ForeName>(.*?)</ForeName>', record_string)
    formatted_authors = ', '.join(['{} {}'.format(author[1], author[0]) for author in authors])

    # Extract publication year
    publication_year = re.search(r'<PubDate><Year>(\d{4})</Year>', record_string)
    publication_year = publication_year.group(1) if publication_year else ''
    publication_month = re.search(r'<PubDate>.*?<Month>(Aug)</Month>.*?</PubDate>', record_string)
    publication_month = publication_month.group(1) if publication_month else ''

    # Extract article title
    article_title = re.search(r'<ArticleTitle>(.*?)</ArticleTitle>', record_string)
    article_title = article_title.group(1) if article_title else ''

    # Extract journal title
    journal_title = re.search(r'<Title>(.*?)</Title>', record_string)
    journal_title = journal_title.group(1) if journal_title else ''

    # Extract journal volume
    journal_volume = re.search(r'<Volume>(.*?)</Volume>', record_string)
    journal_volume = journal_volume.group(1) if journal_volume else ''

    # Extract journal issue
    journal_issue = re.search(r'<Issue>(.*?)</Issue>', record_string)
    journal_issue = journal_issue.group(1) if journal_issue else ''

    # Extract start page
    start_page = re.search(r'<StartPage>(.*?)</StartPage>', record_string)
    start_page = start_page.group(1) if start_page else ''

    # Extract end page
    end_page = re.search(r'<EndPage>(.*?)</EndPage>', record_string)
    end_page = end_page.group(1) if end_page else ''

    # Extract ELocationID
    doi = re.search(r'<ELocationID.*?EIdType="doi".*?>(.*?)</ELocationID>', record_string)
    doi = doi.group(1) if doi else ''

    abstract_matches = re.findall(r'(<AbstractText.*?>.*?</AbstractText>)', record_string)
    print(f'Number of abstract sections: {len(abstract_matches)}')
    if len(abstract_matches) > 1:
        cleaned_abstract_sections = []
        for match in abstract_matches:
            clean_match = re.sub(r'<AbstractText.*?((?:Label=".*")?.*?>.*)</AbstractText>', r'\1', match)
            clean_match = re.sub(r'(?: Label="(.*?)")?.*?>(.*)', r'\1: \2', clean_match)
            cleaned_abstract_sections.append(clean_match)
            
        abstract = ''.join([f'{group}<br>' for group in cleaned_abstract_sections])
    else:
        abstract = re.sub(r'<AbstractText.*?>(.*?)</AbstractText>', r'\1', abstract_matches[0])  if abstract_matches else ''
        
    # Extract MeshHeadingList
    MeshHeadingList = re.search(r'<MeshHeadingList>(.*?)</MeshHeadingList>', record_string)
    MeshHeadingList = MeshHeadingList.group(1) if MeshHeadingList else ''
    
    return {
        'pubmed_title': article_title,
        'abstract': abstract,
        'journal': journal_title,
        'authors': formatted_authors,
        'year': publication_year,
        'month': publication_month,
        'pub_volume': journal_volume,
        'pub_issue': journal_issue,
        'start_page': start_page,
        'end_page': end_page,
        'doi': doi,
        'mesh_headings': MeshHeadingList
    }

def pubmed_details_by_title(title, publication, api_key):
    """
    Search for article title in PubMed database and return article details.

    Parameters:
    - title (str): article title
    - api_key (str): NCBI API key

    Returns:
    article_details (dict): Article metadata from PubMed database if present. Otherwise, returns list of PMIDs.
    """
    record_string = search_article(title, publication, api_key)
    # return record_string
    if record_string:
        article_details = extract_pubmed_details(record_string)
        return article_details
    else:
        return None

iteration = 1
# query = 'resistance training'
# result_dict[iteration] = search_article(
#     query, api_key, reldate=10,
#     systematic_only=True,    
#     )
record_string = result_dict[iteration]

extract_pubmed_details(record_string)

Number of abstract sections: 4


{'pubmed_title': 'Comparing efficacies of different exercises on androgen deprivation therapy adverse effects in prostate cancer patients: a network meta-analysis.',
 'abstract': 'INTRODUCTION: Previous studies showed exercise have efficacies for androgen deprivation therapy (ADT) adverse effects. To compare the efficacies of different exercises on ADT adverse effects, we conducted the network meta-analysis (NMA).<br>METHODS: Literature retrieval was performed in PubMed, Embase, Cochrane Central Register of Controlled Trials (CENTRAL). 19 studies (1184 participants) were included. All analyses were performed in R 4.1.2 or RevMan 5.4.1.<br>RESULTS: NMA results showed that compared with the control group, both aerobic + resistance training (ART) (MD = 5.92, 95% CI [0.38; 11.46]) and resistance exercise (RE) (MD = 5.62, 95% CI [2.70; 8.55]) improved quality of life (QoL). ART (P score: 0.72) may have superiority over RE (P score: 0.7). ART (MD = -10.89, 95% CI [-17.67; -4.11]) significant

# Iteration 2

In [11]:
import re
import requests
def retrieve_citation(article_id, api_key):
    """
    Retrieve article metadata from PubMed database.
    """
    base_url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
    if api_key:
        base_url += f'&api_key={api_key}'
    params = {
        'db': 'pubmed',
        'id': article_id
    }

    response = requests.get(base_url, params=params)
    return response.content

def search_article(
        query, api_key, query_tag=None, publication=None, reldate=None, retmax=None,
        systematic_only=False, review_only=False, verbose=False,
        additional_search_params=None
        ):
    """
    Search for article title in PubMed database.

    Parameters:
    - title (str): article title
    - api_key (str): NCBI API key
    - reldate (int): the search returns only those items that have a date specified by datetype within the last n days.

    Returns:
    response (str): Article metadata from PubMed database if present. Otherwise, returns list of PMIDs.

    API documentation: https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESearch
    Pubmed User Guide including tags for filtering results: https://pubmed.ncbi.nlm.nih.gov/help/
    """
    base_url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
    data = {}
    if api_key:
        base_url += f'&api_key={api_key}'
    search_term = f'"{re.sub(r"not", "", query)}"' # Remove 'not' since it will be treated as a boolean
    if query_tag:
        search_term += f'{query_tag}'
    if publication:
        search_term = f'AND {publication} [ta]'
    if systematic_only:
        search_term += ' AND systematic[sb]'
    elif review_only:
        search_term += ' AND (systematic[sb] OR review[pt])'
    params = {
        'db': 'pubmed',
        'term': search_term,
        'retmax': 5,
        'retmode': 'json',
        'datetype':'edat',
    }
    if reldate:
        params['reldate'] = reldate
    if retmax:
        params['retmax'] = retmax
    if additional_search_params:
        params.update(additional_search_params)
    print(f'Search term: {search_term}')

    response = requests.get(base_url, params=params)
    data = response.json()
    return data
    try:
        id_list = data['esearchresult']['idlist']
        if id_list:
            for index in range(len(id_list)):
                result = retrieve_citation(id_list[index], api_key).decode('utf-8')
            return result     
        else:
            print(f'No results found; returning API response object.')
            return data
                
    except Exception as error: 
        print(f'Response: \n{data}')
        exc_type, exc_obj, tb = sys.exc_info()
        file = tb.tb_frame
        lineno = tb.tb_lineno
        filename = file.f_code.co_filename
        print(f'\tAn error occurred on line {lineno} in {filename}: {error}')    
        print('Article not found.')
        return data

def extract_pubmed_details(record_string):
    """
    Helper function called by `pubmed_details_by_title` to parse article metadata from PubMed database.
    """
    authors = re.findall(r'<Author ValidYN="Y".*?><LastName>(.*?)</LastName><ForeName>(.*?)</ForeName>', record_string)
    formatted_authors = ', '.join(['{} {}'.format(author[1], author[0]) for author in authors])

    # Extract publication year
    publication_year = re.search(r'<PubDate><Year>(\d{4})</Year>', record_string)
    publication_year = publication_year.group(1) if publication_year else ''
    publication_month = re.search(r'<PubDate>.*?<Month>(Aug)</Month>.*?</PubDate>', record_string)
    publication_month = publication_month.group(1) if publication_month else ''

    # Extract article title
    article_title = re.search(r'<ArticleTitle>(.*?)</ArticleTitle>', record_string)
    article_title = article_title.group(1) if article_title else ''

    # Extract journal title
    journal_title = re.search(r'<Title>(.*?)</Title>', record_string)
    journal_title = journal_title.group(1) if journal_title else ''

    # Extract journal volume
    journal_volume = re.search(r'<Volume>(.*?)</Volume>', record_string)
    journal_volume = journal_volume.group(1) if journal_volume else ''

    # Extract journal issue
    journal_issue = re.search(r'<Issue>(.*?)</Issue>', record_string)
    journal_issue = journal_issue.group(1) if journal_issue else ''

    # Extract start page
    start_page = re.search(r'<StartPage>(.*?)</StartPage>', record_string)
    start_page = start_page.group(1) if start_page else ''

    # Extract end page
    end_page = re.search(r'<EndPage>(.*?)</EndPage>', record_string)
    end_page = end_page.group(1) if end_page else ''

    # Extract ELocationID
    doi = re.search(r'<ELocationID.*?EIdType="doi".*?>(.*?)</ELocationID>', record_string)
    doi = doi.group(1) if doi else ''

    abstract_matches = re.findall(r'(<AbstractText.*?>.*?</AbstractText>)', record_string)
    print(f'Number of abstract sections: {len(abstract_matches)}')
    if len(abstract_matches) > 1:
        cleaned_abstract_sections = []
        for match in abstract_matches:
            clean_match = re.sub(r'<AbstractText.*?((?:Label=".*")?.*?>.*)</AbstractText>', r'\1', match)
            clean_match = re.sub(r'(?: Label="(.*?)")?.*?>(.*)', r'\1: \2', clean_match)
            cleaned_abstract_sections.append(clean_match)
            
        abstract = ''.join([f'{group}<br>' for group in cleaned_abstract_sections])
    else:
        abstract = re.sub(r'<AbstractText.*?>(.*?)</AbstractText>', r'\1', abstract_matches[0])  if abstract_matches else ''
        
    # Extract MeshHeadingList
    MeshHeadingList = re.search(r'<MeshHeadingList>(.*?)</MeshHeadingList>', record_string)
    MeshHeadingList = MeshHeadingList.group(1) if MeshHeadingList else ''
    
    return {
        'pubmed_title': article_title,
        'abstract': abstract,
        'journal': journal_title,
        'authors': formatted_authors,
        'year': publication_year,
        'month': publication_month,
        'pub_volume': journal_volume,
        'pub_issue': journal_issue,
        'start_page': start_page,
        'end_page': end_page,
        'doi': doi,
        'mesh_headings': MeshHeadingList
    }

def pubmed_details_by_title(title, publication, api_key):
    """
    Search for article title in PubMed database and return article details.

    Parameters:
    - title (str): article title
    - api_key (str): NCBI API key

    Returns:
    article_details (dict): Article metadata from PubMed database if present. Otherwise, returns list of PMIDs.
    """
    record_string = search_article(title, publication, api_key)
    # return record_string
    if record_string:
        article_details = extract_pubmed_details(record_string)
        return article_details
    else:
        return None

iteration = 2
query = 'resistance training'
result_dict[iteration] = search_article(
    query, api_key, reldate=10,
    systematic_only=True,    
    )
result_dict[iteration]['esearchresult']['idlist']
# record_string = result_dict[iteration]

# extract_pubmed_details(record_string)

Search term: "resistance training" AND systematic[sb]


['38482104', '38440785', '38432828', '38432212']

## 2.1

In [19]:
import re
import requests
def retrieve_citation(article_id, api_key):
    """
    Retrieve article metadata from PubMed database.
    """
    base_url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
    if api_key:
        base_url += f'&api_key={api_key}'
    params = {
        'db': 'pubmed',
        'id': article_id
    }

    response = requests.get(base_url, params=params)
    return response

def search_article(
        query, api_key, query_tag=None, publication=None, reldate=None, retmax=None,
        systematic_only=False, review_only=False, verbose=False,
        additional_search_params=None
        ):
    """
    Search for article title in PubMed database.

    Parameters:
    - title (str): article title
    - api_key (str): NCBI API key
    - reldate (int): the search returns only those items that have a date specified by datetype within the last n days.

    Returns:
    response (str): Article metadata from PubMed database if present. Otherwise, returns list of PMIDs.

    API documentation: https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESearch
    Pubmed User Guide including tags for filtering results: https://pubmed.ncbi.nlm.nih.gov/help/
    """
    base_url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
    data = {}
    if api_key:
        base_url += f'&api_key={api_key}'
    search_term = f'"{re.sub(r"not", "", query)}"' # Remove 'not' since it will be treated as a boolean
    if query_tag:
        search_term += f'{query_tag}'
    if publication:
        search_term = f'AND {publication} [ta]'
    if systematic_only:
        search_term += ' AND systematic[sb]'
    elif review_only:
        search_term += ' AND (systematic[sb] OR review[pt])'
    params = {
        'db': 'pubmed',
        'term': search_term,
        'retmax': 5,
        'retmode': 'json',
        'datetype':'edat',
    }
    if reldate:
        params['reldate'] = reldate
    if retmax:
        params['retmax'] = retmax
    if additional_search_params:
        params.update(additional_search_params)
    print(f'Search term: {search_term}')

    response = requests.get(base_url, params=params)
    data = response.json()
    return data

def batch_retrieve_citation(data):
    result_dict = {}
    try:
        id_list = data['esearchresult']['idlist']
        if id_list:
            print(f'Extracting these {len(id_list)} PMIDs: {id_list}')
            for index in range(len(id_list)):
                result_dict[index] = retrieve_citation(id_list[index], api_key).decode('utf-8')
        else:
            print(f'No results found.')
                
    except Exception as error: 
        print(f'Response: \n{data}')
        exc_type, exc_obj, tb = sys.exc_info()
        file = tb.tb_frame
        lineno = tb.tb_lineno
        filename = file.f_code.co_filename
        print(f'\tAn error occurred on line {lineno} in {filename}: {error}')    
        print('Article not found.')
    result_dict

def extract_pubmed_details(record_string):
    """
    Helper function called by `pubmed_details_by_title` to parse article metadata from PubMed database.
    """
    authors = re.findall(r'<Author ValidYN="Y".*?><LastName>(.*?)</LastName><ForeName>(.*?)</ForeName>', record_string)
    formatted_authors = ', '.join(['{} {}'.format(author[1], author[0]) for author in authors])

    # Extract publication year
    publication_year = re.search(r'<PubDate><Year>(\d{4})</Year>', record_string)
    publication_year = publication_year.group(1) if publication_year else ''
    publication_month = re.search(r'<PubDate>.*?<Month>(Aug)</Month>.*?</PubDate>', record_string)
    publication_month = publication_month.group(1) if publication_month else ''

    # Extract article title
    article_title = re.search(r'<ArticleTitle>(.*?)</ArticleTitle>', record_string)
    article_title = article_title.group(1) if article_title else ''

    # Extract journal title
    journal_title = re.search(r'<Title>(.*?)</Title>', record_string)
    journal_title = journal_title.group(1) if journal_title else ''

    # Extract journal volume
    journal_volume = re.search(r'<Volume>(.*?)</Volume>', record_string)
    journal_volume = journal_volume.group(1) if journal_volume else ''

    # Extract journal issue
    journal_issue = re.search(r'<Issue>(.*?)</Issue>', record_string)
    journal_issue = journal_issue.group(1) if journal_issue else ''

    # Extract start page
    start_page = re.search(r'<StartPage>(.*?)</StartPage>', record_string)
    start_page = start_page.group(1) if start_page else ''

    # Extract end page
    end_page = re.search(r'<EndPage>(.*?)</EndPage>', record_string)
    end_page = end_page.group(1) if end_page else ''

    # Extract ELocationID
    doi = re.search(r'<ELocationID.*?EIdType="doi".*?>(.*?)</ELocationID>', record_string)
    doi = doi.group(1) if doi else ''

    abstract_matches = re.findall(r'(<AbstractText.*?>.*?</AbstractText>)', record_string)
    print(f'Number of abstract sections: {len(abstract_matches)}')
    if len(abstract_matches) > 1:
        cleaned_abstract_sections = []
        for match in abstract_matches:
            clean_match = re.sub(r'<AbstractText.*?((?:Label=".*")?.*?>.*)</AbstractText>', r'\1', match)
            clean_match = re.sub(r'(?: Label="(.*?)")?.*?>(.*)', r'\1: \2', clean_match)
            cleaned_abstract_sections.append(clean_match)
            
        abstract = ''.join([f'{group}<br>' for group in cleaned_abstract_sections])
    else:
        abstract = re.sub(r'<AbstractText.*?>(.*?)</AbstractText>', r'\1', abstract_matches[0])  if abstract_matches else ''
        
    # Extract MeshHeadingList
    MeshHeadingList = re.search(r'<MeshHeadingList>(.*?)</MeshHeadingList>', record_string)
    MeshHeadingList = MeshHeadingList.group(1) if MeshHeadingList else ''
    
    return {
        'pubmed_title': article_title,
        'abstract': abstract,
        'journal': journal_title,
        'authors': formatted_authors,
        'year': publication_year,
        'month': publication_month,
        'pub_volume': journal_volume,
        'pub_issue': journal_issue,
        'start_page': start_page,
        'end_page': end_page,
        'doi': doi,
        'mesh_headings': MeshHeadingList
    }

def pubmed_details_by_title(title, publication, api_key):
    """
    Search for article title in PubMed database and return article details.

    Parameters:
    - title (str): article title
    - api_key (str): NCBI API key

    Returns:
    article_details (dict): Article metadata from PubMed database if present. Otherwise, returns list of PMIDs.
    """
    record_string = search_article(title, publication, api_key)
    # return record_string
    if record_string:
        article_details = extract_pubmed_details(record_string)
        return article_details
    else:
        return None

query = 'resistance training'
# result_dict[iteration] = search_article(
#     query, api_key, reldate=10,
#     systematic_only=True,    
#     )
data = result_dict[2]

iteration = 2.1
result_dict[iteration] = batch_retrieve_citation(data)
result_dict[iteration]
# record_string = result_dict[iteration]

# extract_pubmed_details(record_string)

Extracting these 4 PMIDs: ['38482104', '38440785', '38432828', '38432212']


Response: 
{'header': {'type': 'esearch', 'version': '0.3'}, 'esearchresult': {'count': '4', 'retmax': '4', 'retstart': '0', 'idlist': ['38482104', '38440785', '38432828', '38432212'], 'translationset': [], 'querytranslation': '"resistance training"[All Fields] AND "systematic"[Filter] AND 2024/03/04:2024/03/14[Date - Entry]'}}
	An error occurred on line 76 in /tmp/ipykernel_29336/826443412.py: 'Response' object has no attribute 'decode'
Article not found.


## 2.2

In [21]:
import re
import requests
def retrieve_citation(article_id, api_key):
    """
    Retrieve article metadata from PubMed database.
    """
    base_url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
    if api_key:
        base_url += f'&api_key={api_key}'
    params = {
        'db': 'pubmed',
        'id': article_id
    }

    response = requests.get(base_url, params=params)
    return response.content

def search_article(
        query, api_key, query_tag=None, publication=None, reldate=None, retmax=None,
        systematic_only=False, review_only=False, verbose=False,
        additional_search_params=None
        ):
    """
    Search for article title in PubMed database.

    Parameters:
    - title (str): article title
    - api_key (str): NCBI API key
    - reldate (int): the search returns only those items that have a date specified by datetype within the last n days.

    Returns:
    response (str): Article metadata from PubMed database if present. Otherwise, returns list of PMIDs.

    API documentation: https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESearch
    Pubmed User Guide including tags for filtering results: https://pubmed.ncbi.nlm.nih.gov/help/
    """
    base_url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
    data = {}
    if api_key:
        base_url += f'&api_key={api_key}'
    search_term = f'"{re.sub(r"not", "", query)}"' # Remove 'not' since it will be treated as a boolean
    if query_tag:
        search_term += f'{query_tag}'
    if publication:
        search_term = f'AND {publication} [ta]'
    if systematic_only:
        search_term += ' AND systematic[sb]'
    elif review_only:
        search_term += ' AND (systematic[sb] OR review[pt])'
    params = {
        'db': 'pubmed',
        'term': search_term,
        'retmax': 5,
        'retmode': 'json',
        'datetype':'edat',
    }
    if reldate:
        params['reldate'] = reldate
    if retmax:
        params['retmax'] = retmax
    if additional_search_params:
        params.update(additional_search_params)
    print(f'Search term: {search_term}')

    response = requests.get(base_url, params=params)
    data = response.json()
    return data

def batch_retrieve_citation(data):
    result_dict = {}
    try:
        id_list = data['esearchresult']['idlist']
        if id_list:
            print(f'Extracting these {len(id_list)} PMIDs: {id_list}')
            for index in range(len(id_list)):
                result_dict[index] = retrieve_citation(id_list[index], api_key).decode('utf-8')
        else:
            print(f'No results found.')
                
    except Exception as error: 
        print(f'Response: \n{data}')
        exc_type, exc_obj, tb = sys.exc_info()
        file = tb.tb_frame
        lineno = tb.tb_lineno
        filename = file.f_code.co_filename
        print(f'\tAn error occurred on line {lineno} in {filename}: {error}')    
        print('Article not found.')
    return result_dict

def extract_pubmed_details(record_string):
    """
    Helper function called by `pubmed_details_by_title` to parse article metadata from PubMed database.
    """
    authors = re.findall(r'<Author ValidYN="Y".*?><LastName>(.*?)</LastName><ForeName>(.*?)</ForeName>', record_string)
    formatted_authors = ', '.join(['{} {}'.format(author[1], author[0]) for author in authors])

    # Extract publication year
    publication_year = re.search(r'<PubDate><Year>(\d{4})</Year>', record_string)
    publication_year = publication_year.group(1) if publication_year else ''
    publication_month = re.search(r'<PubDate>.*?<Month>(Aug)</Month>.*?</PubDate>', record_string)
    publication_month = publication_month.group(1) if publication_month else ''

    # Extract article title
    article_title = re.search(r'<ArticleTitle>(.*?)</ArticleTitle>', record_string)
    article_title = article_title.group(1) if article_title else ''

    # Extract journal title
    journal_title = re.search(r'<Title>(.*?)</Title>', record_string)
    journal_title = journal_title.group(1) if journal_title else ''

    # Extract journal volume
    journal_volume = re.search(r'<Volume>(.*?)</Volume>', record_string)
    journal_volume = journal_volume.group(1) if journal_volume else ''

    # Extract journal issue
    journal_issue = re.search(r'<Issue>(.*?)</Issue>', record_string)
    journal_issue = journal_issue.group(1) if journal_issue else ''

    # Extract start page
    start_page = re.search(r'<StartPage>(.*?)</StartPage>', record_string)
    start_page = start_page.group(1) if start_page else ''

    # Extract end page
    end_page = re.search(r'<EndPage>(.*?)</EndPage>', record_string)
    end_page = end_page.group(1) if end_page else ''

    # Extract ELocationID
    doi = re.search(r'<ELocationID.*?EIdType="doi".*?>(.*?)</ELocationID>', record_string)
    doi = doi.group(1) if doi else ''

    abstract_matches = re.findall(r'(<AbstractText.*?>.*?</AbstractText>)', record_string)
    print(f'Number of abstract sections: {len(abstract_matches)}')
    if len(abstract_matches) > 1:
        cleaned_abstract_sections = []
        for match in abstract_matches:
            clean_match = re.sub(r'<AbstractText.*?((?:Label=".*")?.*?>.*)</AbstractText>', r'\1', match)
            clean_match = re.sub(r'(?: Label="(.*?)")?.*?>(.*)', r'\1: \2', clean_match)
            cleaned_abstract_sections.append(clean_match)
            
        abstract = ''.join([f'{group}<br>' for group in cleaned_abstract_sections])
    else:
        abstract = re.sub(r'<AbstractText.*?>(.*?)</AbstractText>', r'\1', abstract_matches[0])  if abstract_matches else ''
        
    # Extract MeshHeadingList
    MeshHeadingList = re.search(r'<MeshHeadingList>(.*?)</MeshHeadingList>', record_string)
    MeshHeadingList = MeshHeadingList.group(1) if MeshHeadingList else ''
    
    return {
        'pubmed_title': article_title,
        'abstract': abstract,
        'journal': journal_title,
        'authors': formatted_authors,
        'year': publication_year,
        'month': publication_month,
        'pub_volume': journal_volume,
        'pub_issue': journal_issue,
        'start_page': start_page,
        'end_page': end_page,
        'doi': doi,
        'mesh_headings': MeshHeadingList
    }

def pubmed_details_by_title(title, publication, api_key):
    """
    Search for article title in PubMed database and return article details.

    Parameters:
    - title (str): article title
    - api_key (str): NCBI API key

    Returns:
    article_details (dict): Article metadata from PubMed database if present. Otherwise, returns list of PMIDs.
    """
    record_string = search_article(title, publication, api_key)
    # return record_string
    if record_string:
        article_details = extract_pubmed_details(record_string)
        return article_details
    else:
        return None

query = 'resistance training'
# result_dict[iteration] = search_article(
#     query, api_key, reldate=10,
#     systematic_only=True,    
#     )
data = result_dict[2]

iteration = 2.1
result_dict[iteration] = batch_retrieve_citation(data)
result_dict[iteration]
# record_string = result_dict[iteration]

# extract_pubmed_details(record_string)

Extracting these 4 PMIDs: ['38482104', '38440785', '38432828', '38432212']


{0: '<?xml version="1.0" ?>\n<!DOCTYPE PubmedArticleSet PUBLIC "-//NLM//DTD PubMedArticle, 1st January 2024//EN" "https://dtd.nlm.nih.gov/ncbi/pubmed/out/pubmed_240101.dtd">\n<PubmedArticleSet>\n<PubmedArticle><MedlineCitation Status="PubMed-not-MEDLINE" Owner="NLM" IndexingMethod="Automated"><PMID Version="1">38482104</PMID><DateRevised><Year>2024</Year><Month>03</Month><Day>14</Day></DateRevised><Article PubModel="Electronic-eCollection"><Journal><ISSN IssnType="Electronic">2590-1095</ISSN><JournalIssue CitedMedium="Internet"><Volume>6</Volume><Issue>1</Issue><PubDate><Year>2024</Year><Month>Mar</Month></PubDate></JournalIssue><Title>Archives of rehabilitation research and clinical translation</Title><ISOAbbreviation>Arch Rehabil Res Clin Transl</ISOAbbreviation></Journal><ArticleTitle>Exercise and Musculoskeletal Health in Men With Low Bone Mineral Density: A Systematic Review.</ArticleTitle><Pagination><StartPage>100313</StartPage><MedlinePgn>100313</MedlinePgn></Pagination><ELocat

In [22]:
result_dict[iteration]

{0: '<?xml version="1.0" ?>\n<!DOCTYPE PubmedArticleSet PUBLIC "-//NLM//DTD PubMedArticle, 1st January 2024//EN" "https://dtd.nlm.nih.gov/ncbi/pubmed/out/pubmed_240101.dtd">\n<PubmedArticleSet>\n<PubmedArticle><MedlineCitation Status="PubMed-not-MEDLINE" Owner="NLM" IndexingMethod="Automated"><PMID Version="1">38482104</PMID><DateRevised><Year>2024</Year><Month>03</Month><Day>14</Day></DateRevised><Article PubModel="Electronic-eCollection"><Journal><ISSN IssnType="Electronic">2590-1095</ISSN><JournalIssue CitedMedium="Internet"><Volume>6</Volume><Issue>1</Issue><PubDate><Year>2024</Year><Month>Mar</Month></PubDate></JournalIssue><Title>Archives of rehabilitation research and clinical translation</Title><ISOAbbreviation>Arch Rehabil Res Clin Transl</ISOAbbreviation></Journal><ArticleTitle>Exercise and Musculoskeletal Health in Men With Low Bone Mineral Density: A Systematic Review.</ArticleTitle><Pagination><StartPage>100313</StartPage><MedlinePgn>100313</MedlinePgn></Pagination><ELocat

In [16]:
result_dict[2]['esearchresult']['idlist']

['38482104', '38440785', '38432828', '38432212']

## 2.3

In [26]:
import re
import requests
def retrieve_citation(article_id, api_key):
    """
    Retrieve article metadata from PubMed database.
    """
    base_url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
    if api_key:
        base_url += f'&api_key={api_key}'
    params = {
        'db': 'pubmed',
        'id': article_id
    }

    response = requests.get(base_url, params=params)
    return response.content

def search_article(
        query, api_key, query_tag=None, publication=None, reldate=None, retmax=None,
        systematic_only=False, review_only=False, verbose=False,
        additional_search_params=None
        ):
    """
    Search for article title in PubMed database.

    Parameters:
    - title (str): article title
    - api_key (str): NCBI API key
    - reldate (int): the search returns only those items that have a date specified by datetype within the last n days.

    Returns:
    response (str): Article metadata from PubMed database if present. Otherwise, returns list of PMIDs.

    API documentation: https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESearch
    Pubmed User Guide including tags for filtering results: https://pubmed.ncbi.nlm.nih.gov/help/
    """
    base_url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
    data = {}
    if api_key:
        base_url += f'&api_key={api_key}'
    search_term = f'"{re.sub(r"not", "", query)}"' # Remove 'not' since it will be treated as a boolean
    if query_tag:
        search_term += f'{query_tag}'
    if publication:
        search_term = f'AND {publication} [ta]'
    if systematic_only:
        search_term += ' AND systematic[sb]'
    elif review_only:
        search_term += ' AND (systematic[sb] OR review[pt])'
    params = {
        'db': 'pubmed',
        'term': search_term,
        'retmax': 5,
        'retmode': 'json',
        'datetype':'edat',
    }
    if reldate:
        params['reldate'] = reldate
    if retmax:
        params['retmax'] = retmax
    if additional_search_params:
        params.update(additional_search_params)
    print(f'Search term: {search_term}')

    response = requests.get(base_url, params=params)
    data = response.json()
    return data

def batch_retrieve_citation(data):
    result_list = []
    try:
        id_list = data['esearchresult']['idlist']
        if id_list:
            print(f'Extracting these {len(id_list)} PMIDs: {id_list}')
            for index, id in enumerate(id_list):
                result_list.append(retrieve_citation(id, api_key).decode('utf-8'))
                current_index, current_id = index+1, id
        else:
            print(f'No results found.')
                
    except Exception as error: 
        print(f'Response: \n{data}')
        exc_type, exc_obj, tb = sys.exc_info()
        file = tb.tb_frame
        lineno = tb.tb_lineno
        filename = file.f_code.co_filename
        print(f'\tAn error occurred on line {lineno} in {filename}: {error}')    
        print('Article {current_index} [{current_id}] not found.')
    return result_list

def extract_pubmed_details(record_string):
    """
    Helper function called by `pubmed_details_by_title` to parse article metadata from PubMed database.
    """
    authors = re.findall(r'<Author ValidYN="Y".*?><LastName>(.*?)</LastName><ForeName>(.*?)</ForeName>', record_string)
    formatted_authors = ', '.join(['{} {}'.format(author[1], author[0]) for author in authors])

    # Extract publication year
    publication_year = re.search(r'<PubDate><Year>(\d{4})</Year>', record_string)
    publication_year = publication_year.group(1) if publication_year else ''
    publication_month = re.search(r'<PubDate>.*?<Month>(Aug)</Month>.*?</PubDate>', record_string)
    publication_month = publication_month.group(1) if publication_month else ''

    # Extract article title
    article_title = re.search(r'<ArticleTitle>(.*?)</ArticleTitle>', record_string)
    article_title = article_title.group(1) if article_title else ''

    # Extract journal title
    journal_title = re.search(r'<Title>(.*?)</Title>', record_string)
    journal_title = journal_title.group(1) if journal_title else ''

    # Extract journal volume
    journal_volume = re.search(r'<Volume>(.*?)</Volume>', record_string)
    journal_volume = journal_volume.group(1) if journal_volume else ''

    # Extract journal issue
    journal_issue = re.search(r'<Issue>(.*?)</Issue>', record_string)
    journal_issue = journal_issue.group(1) if journal_issue else ''

    # Extract start page
    start_page = re.search(r'<StartPage>(.*?)</StartPage>', record_string)
    start_page = start_page.group(1) if start_page else ''

    # Extract end page
    end_page = re.search(r'<EndPage>(.*?)</EndPage>', record_string)
    end_page = end_page.group(1) if end_page else ''

    # Extract ELocationID
    doi = re.search(r'<ELocationID.*?EIdType="doi".*?>(.*?)</ELocationID>', record_string)
    doi = doi.group(1) if doi else ''

    abstract_matches = re.findall(r'(<AbstractText.*?>.*?</AbstractText>)', record_string)
    print(f'Number of abstract sections: {len(abstract_matches)}')
    if len(abstract_matches) > 1:
        cleaned_abstract_sections = []
        for match in abstract_matches:
            clean_match = re.sub(r'<AbstractText.*?((?:Label=".*")?.*?>.*)</AbstractText>', r'\1', match)
            clean_match = re.sub(r'(?: Label="(.*?)")?.*?>(.*)', r'\1: \2', clean_match)
            cleaned_abstract_sections.append(clean_match)
            
        abstract = ''.join([f'{group}<br>' for group in cleaned_abstract_sections])
    else:
        abstract = re.sub(r'<AbstractText.*?>(.*?)</AbstractText>', r'\1', abstract_matches[0])  if abstract_matches else ''
        
    # Extract MeshHeadingList
    MeshHeadingList = re.search(r'<MeshHeadingList>(.*?)</MeshHeadingList>', record_string)
    MeshHeadingList = MeshHeadingList.group(1) if MeshHeadingList else ''
    
    return {
        'pubmed_title': article_title,
        'abstract': abstract,
        'journal': journal_title,
        'authors': formatted_authors,
        'year': publication_year,
        'month': publication_month,
        'pub_volume': journal_volume,
        'pub_issue': journal_issue,
        'start_page': start_page,
        'end_page': end_page,
        'doi': doi,
        'mesh_headings': MeshHeadingList
    }

def pubmed_details_by_title(api_response={}, record_strings_list=[], **kwargs):
    """
    Search for article title in PubMed database and return article details.

    Parameters:
    - api_response (dict)
    - record_strings_list (list): List of record strings from `retrieve_citation()`.
    - **kwargs: Parameters to pass to the `search_article()` function.

    Returns:
    article_details (dict): Article metadata from PubMed database if present. 
    """
    result = api_response
    try:
        if api_response==None:
            api_response = search_article(**kwargs)
            result = api_response
        
        result_dict = {}
        if len(record_strings_list) == 0:
            record_strings_list = batch_retrieve_citation(api_response)
            result = record_strings_list
        for index, record_string in enumerate(record_strings_list):
            result_dict[index] = extract_pubmed_details(record_string)
        result = result_dict

    except Exception as error: 
        print(f'Response: \n{api_response}')
        exc_type, exc_obj, tb = sys.exc_info()
        file = tb.tb_frame
        lineno = tb.tb_lineno
        filename = file.f_code.co_filename
        message = f'\tAn error occurred on line {lineno} in {filename}: {error}'
        print(message) 
    return result



query = 'resistance training'
# result_dict[iteration] = search_article(
#     query, api_key, reldate=10,
#     systematic_only=True,    
#     )
data = result_dict[2]

iteration = 2.3
# result_dict[iteration] = batch_retrieve_citation(data)
api_response = result_dict[2]
result_dict[iteration] = pubmed_details_by_title(api_response)

result_dict[iteration]
# extract_pubmed_details(record_string)

Extracting these 4 PMIDs: ['38482104', '38440785', '38432828', '38432212']
Number of abstract sections: 6
Number of abstract sections: 5
Number of abstract sections: 5
Number of abstract sections: 4


{0: {'pubmed_title': 'Exercise and Musculoskeletal Health in Men With Low Bone Mineral Density: A Systematic Review.',
  'abstract': 'OBJECTIVE: This systematic review aims to determine the effects of exercise on bone and muscle health in men with low bone density.<br>DATA SOURCES: An electronic search in the following databases was performed: Medline, AMED, Embase, Scopus, and SPORTDiscus between January 1940 and September 2021.<br>STUDY SELECTION: Randomized or non-randomized trials involving any form of exercise in adult men with a densitometric diagnosis of osteoporosis or osteopenia and reported outcomes relating to bone or muscle health. Two independent reviewers screened 12,018 records, resulting in 13 eligible articles.<br>DATA EXTRACTION: One reviewer extracted data into a pre-formed table, including characteristics of the exercise intervention, population examined, and primary and secondary outcomes. Study quality was assessed by 2 independent reviewers using the Tool for ass

In [28]:
pd.DataFrame(result_dict[iteration]).transpose()

Unnamed: 0,pubmed_title,abstract,journal,authors,year,month,pub_volume,pub_issue,start_page,end_page,doi,mesh_headings
0,Exercise and Musculoskeletal Health in Men Wit...,OBJECTIVE: This systematic review aims to dete...,Archives of rehabilitation research and clinic...,"Katherine Hu, Maree Cassimatis, Christian Girgis",2024,,6.0,1.0,100313.0,,10.1016/j.arrct.2023.100313,
1,Exercise training modalities in prediabetes: a...,BACKGROUND: Lifestyle modification based on ex...,Frontiers in endocrinology,"Hang Zhang, Yuting Guo, Guangshun Hua, Chenyan...",2024,,15.0,,1308959.0,,10.3389/fendo.2024.1308959,"<MeshHeading><DescriptorName UI=""D006801"" Majo..."
2,Does non-invasive brain stimulation improve sp...,BACKGROUND: Multiple sclerosis (MS) is a chron...,Journal of bodywork and movement therapies,"Bruno Henrique de Souza Fonseca, Pedro Henriqu...",2024,,37.0,,350.0,359.0,10.1016/j.jbmt.2023.11.043,"<MeshHeading><DescriptorName UI=""D006801"" Majo..."
3,Comparing efficacies of different exercises on...,INTRODUCTION: Previous studies showed exercise...,Urologia internationalis,"Siping Hu, Xingyu Xiong, Shi Qiu, Jiakun Li, H...",2024,,,,,,10.1159/000538114,


In [23]:
result_dict[2]

{'header': {'type': 'esearch', 'version': '0.3'},
 'esearchresult': {'count': '4',
  'retmax': '4',
  'retstart': '0',
  'idlist': ['38482104', '38440785', '38432828', '38432212'],
  'translationset': [],
  'querytranslation': '"resistance training"[All Fields] AND "systematic"[Filter] AND 2024/03/04:2024/03/14[Date - Entry]'}}

# *End of Page*