# Title
[]()

In [1]:

import sys
sys.path.append('../src')
sys.path.append(r"/home/silvhua/custom_python")
from silvhua import *

# Initialize

In [2]:
api_key = os.getenv('api_ncbi') # Pubmed API key
result_dict = dict()

# Iteration 1

In [10]:
import re
import requests
def retrieve_citation(article_id, api_key):
    """
    Retrieve article metadata from PubMed database.
    """
    base_url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
    if api_key:
        base_url += f'&api_key={api_key}'
    params = {
        'db': 'pubmed',
        'id': article_id
    }

    response = requests.get(base_url, params=params)
    return response.content

def search_article(
        query, api_key, query_tag=None, publication=None, reldate=None, retmax=None,
        systematic_only=False, review_only=False, verbose=False,
        additional_search_params=None
        ):
    """
    Search for article title in PubMed database.

    Parameters:
    - title (str): article title
    - api_key (str): NCBI API key
    - reldate (int): the search returns only those items that have a date specified by datetype within the last n days.

    Returns:
    response (str): Article metadata from PubMed database if present. Otherwise, returns list of PMIDs.

    API documentation: https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESearch
    Pubmed User Guide including tags for filtering results: https://pubmed.ncbi.nlm.nih.gov/help/
    """
    base_url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
    data = {}
    if api_key:
        base_url += f'&api_key={api_key}'
    search_term = f'"{re.sub(r"not", "", query)}"' # Remove 'not' since it will be treated as a boolean
    if query_tag:
        search_term += f'{query_tag}'
    if publication:
        search_term = f'AND {publication} [ta]'
    if systematic_only:
        search_term += ' AND systematic[sb]'
    elif review_only:
        search_term += ' AND (systematic[sb] OR review[pt])'
    params = {
        'db': 'pubmed',
        'term': search_term,
        'retmax': 5,
        'retmode': 'json',
        'datetype':'edat',
    }
    if reldate:
        params['reldate'] = reldate
    if retmax:
        params['retmax'] = retmax
    if additional_search_params:
        params.update(additional_search_params)
    print(f'Search term: {search_term}')

    response = requests.get(base_url, params=params)
    data = response.json()
    return data

def batch_retrieve_citation(data):
    result_list = []
    try:
        id_list = data['esearchresult']['idlist']
        if id_list:
            print(f'Extracting these {len(id_list)} PMIDs: {id_list}')
            for index, id in enumerate(id_list):
                result_list.append(retrieve_citation(id, api_key).decode('utf-8'))
                current_index, current_id = index+1, id
        else:
            print(f'No results found.')
                
    except Exception as error: 
        print(f'Response: \n{data}')
        exc_type, exc_obj, tb = sys.exc_info()
        file = tb.tb_frame
        lineno = tb.tb_lineno
        filename = file.f_code.co_filename
        print(f'\tAn error occurred on line {lineno} in {filename}: {error}')    
        print('Article {current_index} [{current_id}] not found.')
    return result_list

def extract_pubmed_details(record_string):
    """
    Helper function called by `pubmed_details_by_title` to parse article metadata from PubMed database.
    """
    authors = re.findall(r'<Author ValidYN="Y".*?><LastName>(.*?)</LastName><ForeName>(.*?)</ForeName>', record_string)
    formatted_authors = ', '.join(['{} {}'.format(author[1], author[0]) for author in authors])

    # Extract publication year
    publication_year = re.search(r'<PubDate><Year>(\d{4})</Year>', record_string)
    publication_year = publication_year.group(1) if publication_year else ''
    publication_month = re.search(r'<PubDate>.*?<Month>(Aug)</Month>.*?</PubDate>', record_string)
    publication_month = publication_month.group(1) if publication_month else ''

    # Extract article title
    article_title = re.search(r'<ArticleTitle>(.*?)</ArticleTitle>', record_string)
    article_title = article_title.group(1) if article_title else ''

    # Extract journal title
    journal_title = re.search(r'<Title>(.*?)</Title>', record_string)
    journal_title = journal_title.group(1) if journal_title else ''

    # Extract journal volume
    journal_volume = re.search(r'<Volume>(.*?)</Volume>', record_string)
    journal_volume = journal_volume.group(1) if journal_volume else ''

    # Extract journal issue
    journal_issue = re.search(r'<Issue>(.*?)</Issue>', record_string)
    journal_issue = journal_issue.group(1) if journal_issue else ''

    # Extract start page
    start_page = re.search(r'<StartPage>(.*?)</StartPage>', record_string)
    start_page = start_page.group(1) if start_page else ''

    # Extract end page
    end_page = re.search(r'<EndPage>(.*?)</EndPage>', record_string)
    end_page = end_page.group(1) if end_page else ''

    # Extract ELocationID
    doi = re.search(r'<ELocationID.*?EIdType="doi".*?>(.*?)</ELocationID>', record_string)
    doi = doi.group(1) if doi else ''

    abstract_matches = re.findall(r'(<AbstractText.*?>.*?</AbstractText>)', record_string)
    print(f'Number of abstract sections: {len(abstract_matches)}')
    if len(abstract_matches) > 1:
        cleaned_abstract_sections = []
        for match in abstract_matches:
            clean_match = re.sub(r'<AbstractText.*?((?:Label=".*")?.*?>.*)</AbstractText>', r'\1', match)
            clean_match = re.sub(r'(?: Label="(.*?)")?.*?>(.*)', r'\1: \2', clean_match)
            cleaned_abstract_sections.append(clean_match)
            
        abstract = ''.join([f'{group}<br>' for group in cleaned_abstract_sections])
    else:
        abstract = re.sub(r'<AbstractText.*?>(.*?)</AbstractText>', r'\1', abstract_matches[0])  if abstract_matches else ''
        
    # Extract MeshHeadingList
    MeshHeadingList = re.search(r'<MeshHeadingList>(.*?)</MeshHeadingList>', record_string)
    MeshHeadingList = MeshHeadingList.group(1) if MeshHeadingList else ''
    
    return {
        'pubmed_title': article_title,
        'abstract': abstract,
        'journal': journal_title,
        'authors': formatted_authors,
        'year': publication_year,
        'month': publication_month,
        'pub_volume': journal_volume,
        'pub_issue': journal_issue,
        'start_page': start_page,
        'end_page': end_page,
        'doi': doi,
        'mesh_headings': MeshHeadingList
    }

def pubmed_details_by_title(api_response={}, record_strings_list=[], **kwargs):
    """
    Search for article title in PubMed database and return article details.

    Parameters:
    - api_response (dict)
    - record_strings_list (list): List of record strings from `retrieve_citation()`.
    - **kwargs: Parameters to pass to the `search_article()` function.

    Returns:
    article_details (dict): Article metadata from PubMed database if present. 
    """
    result = api_response
    try:
        if api_response==None:
            api_response = search_article(**kwargs)
            result = api_response
        
        result_dict = {}
        if len(record_strings_list) == 0:
            record_strings_list = batch_retrieve_citation(api_response)
            result = record_strings_list
        for index, record_string in enumerate(record_strings_list):
            result_dict[index] = extract_pubmed_details(record_string)
        result = result_dict

    except Exception as error: 
        print(f'Response: \n{api_response}')
        exc_type, exc_obj, tb = sys.exc_info()
        file = tb.tb_frame
        lineno = tb.tb_lineno
        filename = file.f_code.co_filename
        message = f'\tAn error occurred on line {lineno} in {filename}: {error}'
        print(message) 
    return result



query = 'resistance training'
iteration = 1
result_dict[iteration] = search_article(
    query, api_key, reldate=10, #retmax=2,
    systematic_only=True,    
    )
data = result_dict[iteration]
api_response = result_dict[iteration ]

result_dict[iteration +0.02] = pubmed_details_by_title(api_response)
result_dict[iteration +0.02]

Search term: "resistance training" AND systematic[sb]
Extracting these 5 PMIDs: ['38563037', '38561438', '38559546', '38549168', '38541735']
Number of abstract sections: 4
Number of abstract sections: 5
Number of abstract sections: 1
Number of abstract sections: 3
Number of abstract sections: 1


{0: {'pubmed_title': 'Differences in the Impact of Various Types of Exercise on Irisin Levels: A Systematic Review and Meta-Analysis.',
  'abstract': "BACKGROUND: Irisin, a myokine that is responsive to exercise, induces significant changes in subcutaneous adipose tissue. By promoting the browning of white fat tissue, it enhances energy expenditure, thereby addressing overweight and obesity. This systematic review and meta-analysis aimed to compare the effects of different types of physical exercises on irisin levels in overweight and obese adults.<br>METHODS: Specifically, the review focused on studies involving obese or overweight individuals who participated in exercise training for a minimum of 8 weeks, with measured and reported changes in serum irisin levels compared to a control group. Data were collected from four databases (Google Scholar, ISI Web of Science Core Collection, PubMed, and Scopus). The risk of bias was assessed using the Begg and Egger tests, and the results were

In [11]:
pd.DataFrame(result_dict[iteration +0.02]).transpose()

Unnamed: 0,pubmed_title,abstract,journal,authors,year,month,pub_volume,pub_issue,start_page,end_page,doi,mesh_headings
0,Differences in the Impact of Various Types of ...,"BACKGROUND: Irisin, a myokine that is responsi...",International journal of preventive medicine,"Atefe Torabi, Jalil Reisi, Mehdi Kargarfard, M...",2024,,15,,11,,10.4103/ijpvm.ijpvm_76_23,
1,Differences in Biomechanical Determinants of A...,BACKGROUND: Change of direction (COD) movement...,Sports medicine - open,"Thomas A Donelon, Jamie Edwards, Mathew Brown,...",2024,,10,1.0,29,,10.1186/s40798-024-00701-z,
2,The Effectiveness and Optimal Dose of Resistan...,A subgroup of patients with low back pain (LBP...,Cureus,"Valerio Barbari, Maria M Carbone, Lorenzo Stor...",2024,,16,3.0,e57278,,10.7759/cureus.57278,
3,Sex differences in the physiological responses...,BACKGROUND: Heart disease is one of the leadin...,"BMC sports science, medicine &amp; rehabilitation","J Bouakkar, T J Pereira, H Johnston, M Pakosh,...",2024,,16,1.0,74,,10.1186/s13102-024-00867-9,
4,Effects of Low-Load Blood Flow Restriction Tra...,The aim of this meta-analysis was to determine...,"Life (Basel, Switzerland)","Ra&#xfa;l Fabero-Garrido, Miguel Gragera-Vela,...",2024,,14,3.0,,,10.3390/life14030411,


# Iteration 2

In [12]:
import re
import requests
def retrieve_citation(article_id, api_key):
    """
    Retrieve article metadata from PubMed database.
    """
    base_url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
    if api_key:
        base_url += f'&api_key={api_key}'
    params = {
        'db': 'pubmed',
        'id': article_id
    }

    response = requests.get(base_url, params=params)
    return response.content

def search_article(
        query, api_key, query_tag=None, publication=None, reldate=None, retmax=None,
        systematic_only=False, review_only=False, verbose=False,
        additional_search_params=None
        ):
    """
    Search for article title in PubMed database.

    Parameters:
    - title (str): article title
    - api_key (str): NCBI API key
    - reldate (int): the search returns only those items that have a date specified by datetype within the last n days.

    Returns:
    response (str): Article metadata from PubMed database if present. Otherwise, returns list of PMIDs.

    API documentation: https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESearch
    Pubmed User Guide including tags for filtering results: https://pubmed.ncbi.nlm.nih.gov/help/
    """
    base_url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
    data = {}
    if api_key:
        base_url += f'&api_key={api_key}'
    search_term = f'"{re.sub(r"not", "", query)}"' # Remove 'not' since it will be treated as a boolean
    if query_tag:
        search_term += f'{query_tag}'
    if publication:
        search_term = f'AND {publication} [ta]'
    if systematic_only:
        search_term += ' AND systematic[sb]'
    elif review_only:
        search_term += ' AND (systematic[sb] OR review[pt])'
    params = {
        'db': 'pubmed',
        'term': search_term,
        'retmax': 5,
        'retmode': 'json',
        'datetype':'edat',
    }
    if reldate:
        params['reldate'] = reldate
    if retmax:
        params['retmax'] = retmax
    if additional_search_params:
        params.update(additional_search_params)
    print(f'Search term: {search_term}')

    response = requests.get(base_url, params=params)
    data = response.json()
    return data

def batch_retrieve_citation(data):
    result_list = []
    try:
        id_list = data['esearchresult']['idlist']
        if id_list:
            print(f'Extracting these {len(id_list)} PMIDs: {id_list}')
            for index, id in enumerate(id_list):
                result_list.append(retrieve_citation(id, api_key).decode('utf-8'))
                current_index, current_id = index+1, id
        else:
            print(f'No results found.')
                
    except Exception as error: 
        print(f'Response: \n{data}')
        exc_type, exc_obj, tb = sys.exc_info()
        file = tb.tb_frame
        lineno = tb.tb_lineno
        filename = file.f_code.co_filename
        print(f'\tAn error occurred on line {lineno} in {filename}: {error}')    
        print('Article {current_index} [{current_id}] not found.')
    return result_list

def extract_pubmed_details(record_string):
    """
    Helper function called by `pubmed_details_by_title` to parse article metadata from PubMed database.
    """
    authors = re.findall(r'<Author ValidYN="Y".*?><LastName>(.*?)</LastName><ForeName>(.*?)</ForeName>', record_string)
    formatted_authors = ', '.join(['{} {}'.format(author[1], author[0]) for author in authors])

    # Extract publication year
    publication_year = re.search(r'<PubDate><Year>(\d{4})</Year>', record_string)
    publication_year = publication_year.group(1) if publication_year else ''
    publication_month = re.search(r'<PubDate>.*?<Month>(Aug)</Month>.*?</PubDate>', record_string)
    publication_month = publication_month.group(1) if publication_month else ''

    # Extract article title
    article_title = re.search(r'<ArticleTitle>(.*?)</ArticleTitle>', record_string)
    article_title = article_title.group(1) if article_title else ''

    # Extract journal title
    journal_title = re.search(r'<Title>(.*?)</Title>', record_string)
    journal_title = journal_title.group(1) if journal_title else ''

    # Extract journal volume
    journal_volume = re.search(r'<Volume>(.*?)</Volume>', record_string)
    journal_volume = journal_volume.group(1) if journal_volume else ''

    # Extract journal issue
    journal_issue = re.search(r'<Issue>(.*?)</Issue>', record_string)
    journal_issue = journal_issue.group(1) if journal_issue else ''

    # Extract start page
    start_page = re.search(r'<StartPage>(.*?)</StartPage>', record_string)
    start_page = start_page.group(1) if start_page else ''

    # Extract end page
    end_page = re.search(r'<EndPage>(.*?)</EndPage>', record_string)
    end_page = end_page.group(1) if end_page else ''

    # Extract ELocationID
    doi = re.search(r'<ELocationID.*?EIdType="doi".*?>(.*?)</ELocationID>', record_string)
    doi = doi.group(1) if doi else ''

    abstract_matches = re.findall(r'(<AbstractText.*?>.*?</AbstractText>)', record_string)
    print(f'Number of abstract sections: {len(abstract_matches)}')
    if len(abstract_matches) > 1:
        cleaned_abstract_sections = []
        for match in abstract_matches:
            clean_match = re.sub(r'<AbstractText.*?((?:Label=".*")?.*?>.*)</AbstractText>', r'\1', match)
            clean_match = re.sub(r'(?: Label="(.*?)")?.*?>(.*)', r'\1: \2', clean_match)
            cleaned_abstract_sections.append(clean_match)
            
        abstract = ''.join([f'{group}<br>' for group in cleaned_abstract_sections])
    else:
        abstract = re.sub(r'<AbstractText.*?>(.*?)</AbstractText>', r'\1', abstract_matches[0])  if abstract_matches else ''
        
    # Extract MeshHeadingList
    MeshHeadingList = re.search(r'<MeshHeadingList>(.*?)</MeshHeadingList>', record_string)
    MeshHeadingList = MeshHeadingList.group(1) if MeshHeadingList else ''
    
    return {
        'pubmed_title': article_title,
        'abstract': abstract,
        'journal': journal_title,
        'authors': formatted_authors,
        'year': publication_year,
        'month': publication_month,
        'pub_volume': journal_volume,
        'pub_issue': journal_issue,
        'start_page': start_page,
        'end_page': end_page,
        'doi': doi,
        'mesh_headings': MeshHeadingList
    }

def pubmed_details_by_title(api_response={}, record_strings_list=[], **kwargs):
    """
    Search for article title in PubMed database and return article details.

    Parameters:
    - api_response (dict)
    - record_strings_list (list): List of record strings from `retrieve_citation()`.
    - **kwargs: Parameters to pass to the `search_article()` function.

    Returns:
    article_details (dict): Article metadata from PubMed database if present. 
    """
    result = api_response
    try:
        if api_response==None:
            api_response = search_article(**kwargs)
            result = api_response
        
        result_dict = {}
        if len(record_strings_list) == 0:
            record_strings_list = batch_retrieve_citation(api_response)
            result = record_strings_list
        for index, record_string in enumerate(record_strings_list):
            result_dict[index] = extract_pubmed_details(record_string)
        result = result_dict

    except Exception as error: 
        print(f'Response: \n{api_response}')
        exc_type, exc_obj, tb = sys.exc_info()
        file = tb.tb_frame
        lineno = tb.tb_lineno
        filename = file.f_code.co_filename
        message = f'\tAn error occurred on line {lineno} in {filename}: {error}'
        print(message) 
    return result



query = 'resistance training'
iteration = 1
result_dict[iteration] = search_article(
    query, api_key, reldate=10, retmax=2,
    systematic_only=True,    
    ) # returns PMIDs as a list
data = result_dict[iteration]
api_response = result_dict[iteration ]

# Retrieve the details of the articles in the PMID list
result_dict[iteration +0.02] = pubmed_details_by_title(api_response)
result_dict[iteration +0.02]

Search term: "resistance training" AND systematic[sb]
Extracting these 2 PMIDs: ['38563037', '38561438']
Number of abstract sections: 4
Number of abstract sections: 5


{0: {'pubmed_title': 'Differences in the Impact of Various Types of Exercise on Irisin Levels: A Systematic Review and Meta-Analysis.',
  'abstract': "BACKGROUND: Irisin, a myokine that is responsive to exercise, induces significant changes in subcutaneous adipose tissue. By promoting the browning of white fat tissue, it enhances energy expenditure, thereby addressing overweight and obesity. This systematic review and meta-analysis aimed to compare the effects of different types of physical exercises on irisin levels in overweight and obese adults.<br>METHODS: Specifically, the review focused on studies involving obese or overweight individuals who participated in exercise training for a minimum of 8 weeks, with measured and reported changes in serum irisin levels compared to a control group. Data were collected from four databases (Google Scholar, ISI Web of Science Core Collection, PubMed, and Scopus). The risk of bias was assessed using the Begg and Egger tests, and the results were

# Iteration 3

In [3]:
sys.path.append(r"/home/silvhua/custom_python")
from Custom_Logger import *

class Pubmed_API:
    def __init__(self, api_key=os.getenv('api_ncbi'), logger=None, logging_level=logging.INFO):
        self.api_key = api_key
        self.logger = create_function_logger('Pubmed_API', logger, level=logging_level)
        self.iteration = 1
        self.responses_dict = {}
        self.results_dict = {}

    def search_article(self, query, query_tag=None, publication=None, reldate=None, retmax=None,
        systematic_only=False, review_only=False, verbose=False, additional_search_params=None
        ):
        base_url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
        if self.api_key:
            base_url += f'&api_key={self.api_key}'
        response = {}
        search_term = f'"{re.sub(r"not", "", query)}"'  # Remove 'not' since it will be treated as a boolean
        if query_tag:
            search_term += f'{query_tag}'
        if publication:
            search_term = f'AND {publication} [ta]'
        if systematic_only:
            search_term += ' AND systematic[sb]'
        elif review_only:
            search_term += ' AND (systematic[sb] OR review[pt])'
        params = {
            'db': 'pubmed',
            'term': search_term,
            'retmax': 5,
            'retmode': 'json',
            'datetype': 'edat',
        }
        if reldate:
            params['reldate'] = reldate
        if retmax:
            params['retmax'] = retmax
        if additional_search_params:
            params.update(additional_search_params)
        self.logger.info(f'Search term: {search_term}')
        try:
            response = requests.get(self.base_url, params=params)
            response_dict = response.json()
            self.responses_dict[self.iteration] = response_dict
            result_dict = self.get_article_data_by_title(response_dict)
            self.result_dict[self.iteration] = result_dict
            self.iteration += 1
        except Exception as error:
            error_messages = []
            exc_type, exc_obj, tb = sys.exc_info()
            file = tb.tb_frame
            lineno = tb.tb_lineno
            filename = file.f_code.co_filename
            message = f'\tAn error occurred on line {lineno} in {filename}: {error}'
            error_messages.append(message)
            self.logger.debug('\n'.join(error_messages))
        
        return pd.DataFrame(result_dict).transpose()

    def get_article_data_by_title(self):
        try:
            result_dict = {}
            record_strings_list = self.batch_retrieve_citation(self.responses_dict[self.iteration])
            for index, record_string in enumerate(record_strings_list):
                result_dict[index] = self.extract_pubmed_details(record_string)

        except Exception as error:
            error_messages = []
            error_messages.append(f'Response: \n{self.PMIDs_dict[self.iteration]}')
            exc_type, exc_obj, tb = sys.exc_info()
            file = tb.tb_frame
            lineno = tb.tb_lineno
            filename = file.f_code.co_filename
            message = f'\tAn error occurred on line {lineno} in {filename}: {error}'
            error_messages.append(message)
            self.logger.debug('\n'.join(error_messages))
        return result_dict

    def batch_retrieve_citation(self, response_dict):
        result_list = []
        messages = []
        try:
            id_list = response_dict['esearchresult']['idlist']
            if id_list:
                self.logger.info(f'Extracting these {len(id_list)} PMIDs: {id_list}')
                for index, id in enumerate(id_list):
                    result_list.append(self.retrieve_citation(id).decode('utf-8'))
                    current_index, current_id = index+1, id
            else:
                self.logger.warning(f'No results found.')

        except Exception as error:
            messages.append(f'Response: \n{response_dict}')
            exc_type, exc_obj, tb = sys.exc_info()
            file = tb.tb_frame
            lineno = tb.tb_lineno
            filename = file.f_code.co_filename
            messages.append(f'\tAn error occurred on line {lineno} in {filename}: {error}')
            messages.append(f'Article {current_index} [{current_id}] not found.')
        return result_list

    def retrieve_citation(self, article_id):
        base_url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
        if self.api_key:
            base_url += f'&api_key={self.api_key}'
        params = {
            'db': 'pubmed',
            'id': article_id
        }

        response = requests.get(base_url, params=params)
        return response.content

    def extract_pubmed_details(self, record_string):
        """
        Helper function called by `pubmed_details_by_title` to parse article metadata from PubMed database.
        """
        authors = re.findall(r'<Author ValidYN="Y".*?><LastName>(.*?)</LastName><ForeName>(.*?)</ForeName>', record_string)
        formatted_authors = ', '.join(['{} {}'.format(author[1], author[0]) for author in authors])

        # Extract publication year
        publication_year = re.search(r'<PubDate><Year>(\d{4})</Year>', record_string)
        publication_year = publication_year.group(1) if publication_year else ''
        publication_month = re.search(r'<PubDate>.*?<Month>(Aug)</Month>.*?</PubDate>', record_string)
        publication_month = publication_month.group(1) if publication_month else ''

        # Extract article title
        article_title = re.search(r'<ArticleTitle>(.*?)</ArticleTitle>', record_string)
        article_title = article_title.group(1) if article_title else ''

        # Extract journal title
        journal_title = re.search(r'<Title>(.*?)</Title>', record_string)
        journal_title = journal_title.group(1) if journal_title else ''

        # Extract journal volume
        journal_volume = re.search(r'<Volume>(.*?)</Volume>', record_string)
        journal_volume = journal_volume.group(1) if journal_volume else ''

        # Extract journal issue
        journal_issue = re.search(r'<Issue>(.*?)</Issue>', record_string)
        journal_issue = journal_issue.group(1) if journal_issue else ''

        # Extract start page
        start_page = re.search(r'<StartPage>(.*?)</StartPage>', record_string)
        start_page = start_page.group(1) if start_page else ''

        # Extract end page
        end_page = re.search(r'<EndPage>(.*?)</EndPage>', record_string)
        end_page = end_page.group(1) if end_page else ''

        # Extract ELocationID
        doi = re.search(r'<ELocationID.*?EIdType="doi".*?>(.*?)</ELocationID>', record_string)
        doi = doi.group(1) if doi else ''

        abstract_matches = re.findall(r'(<AbstractText.*?>.*?</AbstractText>)', record_string)
        print(f'Number of abstract sections: {len(abstract_matches)}')
        if len(abstract_matches) > 1:
            cleaned_abstract_sections = []
            for match in abstract_matches:
                clean_match = re.sub(r'<AbstractText.*?((?:Label=".*")?.*?>.*)</AbstractText>', r'\1', match)
                clean_match = re.sub(r'(?: Label="(.*?)")?.*?>(.*)', r'\1: \2', clean_match)
                cleaned_abstract_sections.append(clean_match)
                
            abstract = ''.join([f'{group}<br>' for group in cleaned_abstract_sections])
        else:
            abstract = re.sub(r'<AbstractText.*?>(.*?)</AbstractText>', r'\1', abstract_matches[0])  if abstract_matches else ''
            
        # Extract MeshHeadingList
        MeshHeadingList = re.search(r'<MeshHeadingList>(.*?)</MeshHeadingList>', record_string)
        MeshHeadingList = MeshHeadingList.group(1) if MeshHeadingList else ''
        
        return {
            'pubmed_title': article_title,
            'abstract': abstract,
            'journal': journal_title,
            'authors': formatted_authors,
            'year': publication_year,
            'month': publication_month,
            'pub_volume': journal_volume,
            'pub_issue': journal_issue,
            'start_page': start_page,
            'end_page': end_page,
            'doi': doi,
            'mesh_headings': MeshHeadingList
        }
    
iteration = 3
query = 'resistance training'
result_dict[iteration] = Pubmed_API()


In [6]:
import re
query = 'human AND lifespan AND predictor'
result_dict[iteration].search_article(query, retmax=3)

2024-04-02 18:51:45,011 - Pubmed_API - INFO:
Search term: "human AND lifespan AND predictor"



UnboundLocalError: local variable 'result_dict' referenced before assignment

# Iteration 4

In [23]:
import sys
sys.path.append(r"/home/silvhua/custom_python")
import os
import pandas as pd
import string
import re
import requests
# from article_processing import create_text_dict_from_folder
# from orm_summarize import *
api_key = os.getenv('api_ncbi') # Pubmed API key

import sys
import os
import requests
from Custom_Logger import *

class Pubmed_API:
    def __init__(self, api_key=os.getenv('api_ncbi'), logger=None, logging_level=logging.INFO):
        self.api_key = api_key
        self.logger = create_function_logger('Pubmed_API', logger, level=logging_level)
        self.iteration = 1
        self.responses_dict = {}
        self.results_dict = {}
        self.PMIDs_dict = {}

    def search_article(self, query, query_tag=None, publication=None, reldate=None, retmax=None,
        systematic_only=False, review_only=False, verbose=False, additional_search_params=None
        ):
        base_url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
        if self.api_key:
            base_url += f'&api_key={self.api_key}'
        response = {}
        results = pd.DataFrame()
        search_term = f'"{re.sub(r"not", "", query)}"'  # Remove 'not' since it will be treated as a boolean
        if query_tag:
            search_term += f'{query_tag}'
        if publication:
            search_term = f'AND {publication} [ta]'
        if systematic_only:
            search_term += ' AND systematic[sb]'
        elif review_only:
            search_term += ' AND (systematic[sb] OR review[pt])'
        params = {
            'db': 'pubmed',
            'term': search_term,
            'retmax': 5,
            'retmode': 'json',
            'datetype': 'edat',
        }
        if reldate:
            params['reldate'] = reldate
        if retmax:
            params['retmax'] = retmax
        if additional_search_params:
            params.update(additional_search_params)
        self.logger.info(f'Search term: {search_term}')
        try:
            response = requests.get(base_url, params=params)
            response_dict = response.json()
            self.responses_dict[self.iteration] = response_dict
            result_dict = self.get_article_data_by_title()
            self.results_dict[self.iteration] = result_dict
            self.iteration += 1
            results = pd.DataFrame(result_dict).transpose()
        except Exception as error:
            error_messages = []
            exc_type, exc_obj, tb = sys.exc_info()
            file = tb.tb_frame
            lineno = tb.tb_lineno
            filename = file.f_code.co_filename
            message = f'\tAn error occurred on line {lineno} in {filename}: {error}'
            error_messages.append(message)
            self.logger.error('\n'.join(error_messages))
        
        return results

    def get_article_data_by_title(self):
        try:
            result_dict = {}
            record_strings_list = self.batch_retrieve_citation(self.responses_dict[self.iteration])
            for index, record_string in enumerate(record_strings_list):
                result_dict[index] = self.extract_pubmed_details(record_string)

        except Exception as error:
            error_messages = []
            error_messages.append(f'Response: \n{self.PMIDs_dict[self.iteration]}')
            exc_type, exc_obj, tb = sys.exc_info()
            file = tb.tb_frame
            lineno = tb.tb_lineno
            filename = file.f_code.co_filename
            message = f'\tAn error occurred on line {lineno} in {filename}: {error}'
            error_messages.append(message)
            self.logger.error('\n'.join(error_messages))
        return result_dict

    def batch_retrieve_citation(self, response_dict):
        result_list = []
        messages = []
        try:
            id_list = response_dict['esearchresult']['idlist']
            self.PMIDs_dict[self.iteration] = id_list
            if id_list:
                self.logger.info(f'Extracting these {len(id_list)} PMIDs: {id_list}')
                for index, id in enumerate(id_list):
                    result_list.append(self.retrieve_citation(id).decode('utf-8'))
                    current_index, current_id = index+1, id
            else:
                self.logger.warning(f'No results found.')
        except Exception as error:
            messages.append(f'Response: \n{response_dict}')
            exc_type, exc_obj, tb = sys.exc_info()
            file = tb.tb_frame
            lineno = tb.tb_lineno
            filename = file.f_code.co_filename
            messages.append(f'\tAn error occurred on line {lineno} in {filename}: {error}')
            messages.append(f'Article {current_index} [{current_id}] not found.')
        return result_list

    def retrieve_citation(self, article_id):
        base_url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
        if self.api_key:
            base_url += f'&api_key={self.api_key}'
        params = {
            'db': 'pubmed',
            'id': article_id
        }

        response = requests.get(base_url, params=params)
        return response.content

    def extract_pubmed_details(self, record_string):
        """
        Helper function called by `pubmed_details_by_title` to parse article metadata from PubMed database.
        """
        authors = re.findall(r'<Author ValidYN="Y".*?><LastName>(.*?)</LastName><ForeName>(.*?)</ForeName>', record_string)
        formatted_authors = ', '.join(['{} {}'.format(author[1], author[0]) for author in authors])

        # Extract publication year
        publication_year = re.search(r'<PubDate><Year>(\d{4})</Year>', record_string)
        publication_year = publication_year.group(1) if publication_year else ''
        publication_month = re.search(r'<PubDate>.*?<Month>(Aug)</Month>.*?</PubDate>', record_string)
        publication_month = publication_month.group(1) if publication_month else ''

        # Extract article title
        article_title = re.search(r'<ArticleTitle>(.*?)</ArticleTitle>', record_string)
        article_title = article_title.group(1) if article_title else ''

        # Extract journal title
        journal_title = re.search(r'<Title>(.*?)</Title>', record_string)
        journal_title = journal_title.group(1) if journal_title else ''

        # Extract journal volume
        journal_volume = re.search(r'<Volume>(.*?)</Volume>', record_string)
        journal_volume = journal_volume.group(1) if journal_volume else ''

        # Extract journal issue
        journal_issue = re.search(r'<Issue>(.*?)</Issue>', record_string)
        journal_issue = journal_issue.group(1) if journal_issue else ''

        # Extract start page
        start_page = re.search(r'<StartPage>(.*?)</StartPage>', record_string)
        start_page = start_page.group(1) if start_page else ''

        # Extract end page
        end_page = re.search(r'<EndPage>(.*?)</EndPage>', record_string)
        end_page = end_page.group(1) if end_page else ''

        # Extract ELocationID
        doi = re.search(r'<ELocationID.*?EIdType="doi".*?>(.*?)</ELocationID>', record_string)
        doi = doi.group(1) if doi else ''

        abstract_matches = re.findall(r'(<AbstractText.*?>.*?</AbstractText>)', record_string)
        print(f'Number of abstract sections: {len(abstract_matches)}')
        if len(abstract_matches) > 1:
            cleaned_abstract_sections = []
            for match in abstract_matches:
                clean_match = re.sub(r'<AbstractText.*?((?:Label=".*")?.*?>.*)</AbstractText>', r'\1', match)
                clean_match = re.sub(r'(?: Label="(.*?)")?.*?>(.*)', r'\1: \2', clean_match)
                cleaned_abstract_sections.append(clean_match)
                
            abstract = ''.join([f'{group}<br>' for group in cleaned_abstract_sections])
        else:
            abstract = re.sub(r'<AbstractText.*?>(.*?)</AbstractText>', r'\1', abstract_matches[0])  if abstract_matches else ''
            
        # Extract MeshHeadingList
        MeshHeadingList = re.search(r'<MeshHeadingList>(.*?)</MeshHeadingList>', record_string)
        MeshHeadingList = MeshHeadingList.group(1) if MeshHeadingList else ''
        
        return {
            'pubmed_title': article_title,
            'abstract': abstract,
            'journal': journal_title,
            'authors': formatted_authors,
            'year': publication_year,
            'month': publication_month,
            'pub_volume': journal_volume,
            'pub_issue': journal_issue,
            'start_page': start_page,
            'end_page': end_page,
            'doi': doi,
            'mesh_headings': MeshHeadingList
        }
    
iteration = 3
query = 'resistance training'
result_dict[iteration] = Pubmed_API()
result_dict[iteration].search_article(query, retmax=3)

2024-04-02 19:04:43,822 - Pubmed_API - INFO:
Search term: "resistance training"



2024-04-02 19:04:44,443 - Pubmed_API - INFO:
Extracting these 3 PMIDs: ['38563729', '38563578', '38563037']



Number of abstract sections: 1
Number of abstract sections: 4
Number of abstract sections: 4


Unnamed: 0,pubmed_title,abstract,journal,authors,year,month,pub_volume,pub_issue,start_page,end_page,doi,mesh_headings
0,Feasibility and Usefulness of Repetitions-In-R...,The intensity of resistance training (RT) exer...,Perceptual and motor skills,"Vasco Bastos, S&#xe9;rgio Machado, Diogo S Tei...",2024,,,,315125241241785.0,,10.1177/00315125241241785,
1,Prevalence of depression among subjects practi...,BACKGROUND: As already proven in the literatur...,The Journal of sports medicine and physical fi...,"Faisal J Alghamdi, Ali S Alrawdhan, Afraa A Al...",2024,,,,,,10.23736/S0022-4707.24.15480-1,
2,Differences in the Impact of Various Types of ...,"BACKGROUND: Irisin, a myokine that is responsi...",International journal of preventive medicine,"Atefe Torabi, Jalil Reisi, Mehdi Kargarfard, M...",2024,,15.0,,11.0,,10.4103/ijpvm.ijpvm_76_23,


In [18]:
result_dict[iteration].responses_dict

{}

# *End of Page*