# Title
[]()

In [1]:

import sys
sys.path.append('../src')
sys.path.append(r"/home/silvhua/custom_python")
from silvhua import *
from Pubmed_API import *

In [2]:
# set the option to wrap text within cells
pd.set_option('display.max_colwidth', 100)
# pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 20)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

# Initialize

In [3]:
api_key = os.getenv('api_ncbi') # Pubmed API key
result_dict = dict()

# Iteration 1

In [9]:
import numpy as np

class Pubmed_API:
    def __init__(self, api_key=os.getenv('api_ncbi'), logger=None, logging_level=logging.INFO):
        """
        Parameters:
        - api_key (str): NCBI API key
        ---
        # Example usage

        result_dict = dict()
        iteration = 1
        query = 'query string'
        result_dict[iteration] = Pubmed_API()

        ## Option 1

        2 steps: Get list of PMIDs first, then get the article data.

        ids_list = result_dict[iteration].search_article(query, retmax=5, ids_only=True)
        df = result_dict[iteration].get_article_data_by_title()

        ## Option 2

        Get the PMIDs and then the article data in one step.

        df = result_dict[iteration].search_article(query, retmax=5, ids_only=False)

        """
        self.api_key = api_key
        self.logger = create_function_logger('Pubmed_API', logger, level=logging_level)
        self.iteration = 0
        self.responses_dict = {}
        self.results_dict = {}
        self.PMIDs_dict = {}
        self.record_strings_dict = {}

    def search_article(self, query, query_tag=None, publication=None, reldate=None, retmax=None,
        systematic_only=False, review_only=False, period_filter=None,
        additional_search_params=None, ids_only=False, 
        verbose=True
        ):
        """
        Search for article title in PubMed database.

        Parameters:
        - query (str): Pubmed search query.
        - reldate (int, optional): The search returns only those items that have a date specified by datetype within the last n days.
        - query_tag (str, optional): Query tag to append to the search query.
        - publication (str, optional): Publication name.
        - retmax (int, optional): Maximum number of results to return. 
            If None, default is 20. API returns a maximum of 9999 results. To get more results for Pubmed,
            need to use the command line: https://www.ncbi.nlm.nih.gov/books/NBK179288/
        - systematic_only (bool, optional): If True, filter for only systematic review articles.
        - review_only (bool, optional): If True, filter for only systematic review or review articles.
        - additional_search_params (dict, optional): Additional search parameters to pass to the esearch API.
        - period_filter (1, 5, or 10, optional): Filter for articles published in the past 1, 5, or 10 years.
            Note: To filter by other periods, use `reldate` parameter as that is how the API works.
            
        Returns:


        API documentation: https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESearch
        Pubmed User Guide including tags for filtering results: https://pubmed.ncbi.nlm.nih.gov/help/
        """
        base_url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
        if self.api_key:
            base_url += f'&api_key={self.api_key}'
        response = {}
        results = pd.DataFrame()
        search_term = f'{re.sub(r"not", "", query)}'  # Remove 'not' since it will be treated as a boolean
        if query_tag:
            search_term += f'{query_tag}'
        if publication:
            search_term = f'AND {publication} [ta]'
        if systematic_only:
            search_term += ' AND systematic[sb]'
        elif review_only:
            search_term += ' AND (systematic[sb] OR review[pt])'
        if period_filter:
            search_term += f' AND y_{period_filter}[Filter]'
        params = {
            'db': 'pubmed',
            'term': search_term,
            'retmode': 'json',
            'datetype': 'edat',
        }
        if reldate:
            params['reldate'] = reldate
        if retmax:
            params['retmax'] = retmax
        if additional_search_params:
            params.update(additional_search_params)
        self.logger.info(f'Search term: {search_term}')
        messages = []
        try:
            self.iteration += 1
            response = requests.get(base_url, params=params)
            response_dict = response.json()
            id_list = response_dict['esearchresult']['idlist']
            messages.append(f'{len(id_list)} PMIDs found.')
            if verbose==True:
                messages.append(f'{id_list}')
            self.PMIDs_dict[self.iteration] = id_list
            self.responses_dict[self.iteration] = response_dict
            if ids_only==False:
                results = self.get_article_data_by_title()
            else:
                results = id_list
            self.logger.info('\n'.join(messages))
        except Exception as error:
            error_messages = []
            exc_type, exc_obj, tb = sys.exc_info()
            file = tb.tb_frame
            lineno = tb.tb_lineno
            filename = file.f_code.co_filename
            message = f'\tAn error occurred on line {lineno} in {filename}: {error}'
            error_messages.append(message)
            self.logger.error('\n'.join(error_messages))
        
        return results

    def get_article_data_by_title(self, iteration=None):
        result_df = pd.DataFrame()
        try:
            result_dict = {}
            iteration = self.iteration if iteration == None else iteration
            record_strings_list = self.batch_retrieve_citation(iteration)
            self.record_strings_dict[iteration] = record_strings_list
            for index, record_string in enumerate(record_strings_list):
                result_dict[index] = self.extract_pubmed_details(record_string)
            self.results_dict[iteration] = result_dict
            result_df = pd.DataFrame(result_dict).transpose()
        except Exception as error:
            error_messages = []
            error_messages.append(f'Response: \n{self.PMIDs_dict.get(iteration)}')
            exc_type, exc_obj, tb = sys.exc_info()
            file = tb.tb_frame
            lineno = tb.tb_lineno
            filename = file.f_code.co_filename
            message = f'\tAn error occurred on line {lineno} in {filename}: {error}'
            error_messages.append(message)
            self.logger.error('\n'.join(error_messages))
        return result_df

    def batch_retrieve_citation(self, iteration):
        result_list = []
        messages = []
        try:
            id_list = self.PMIDs_dict.get(iteration)
            if id_list:
                self.logger.info(f'Extracting these {len(id_list)} PMIDs: {id_list}')
                for index, id in enumerate(id_list):
                    result_list.append(self.retrieve_citation(id).decode('utf-8'))
                    current_index, current_id = index+1, id
                    # Show progress 
                    print('.', end='\n' if current_index%2==0 else '')
                self.logger.info("Processing complete.")
            else:
                self.logger.warning(f'No results found.')
        except Exception as error:
            messages.append(f'Response: \n{self.responses_dict.get(iteration)}')
            exc_type, exc_obj, tb = sys.exc_info()
            file = tb.tb_frame
            lineno = tb.tb_lineno
            filename = file.f_code.co_filename
            messages.append(f'\tAn error occurred on line {lineno} in {filename}: {error}')
            messages.append(f'Article {current_index} [{current_id}] not found.')
        return result_list

    def retrieve_citation(self, article_id):
        base_url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
        if self.api_key:
            base_url += f'&api_key={self.api_key}'
        params = {
            'db': 'pubmed',
            'id': article_id
        }
        response = requests.get(base_url, params=params)
        return response.content

    def extract_pubmed_details_df(self, iteration=None):
        """
        Extract the Pubmed article details for the given list of record strings for the given iteration.

        Returns:
        DataFrame of the Pubmed article details.
        """
        df = pd.DataFrame()
        record_strings = pd.Series(self.record_strings_dict.get(iteration if iteration else self.iteration))
        regex_dict = {
            'article_title': r'<ArticleTitle>(.*?)</ArticleTitle>',
            'pmid': r'<PMID.*?>(.*?)</PMID>',
            'journal': r'<Title>(.*?)</Title>',
            'volume': r'<Volume>(.*?)</Volume>',
            'issue': r'<Issue>(.*?)</Issue>',
            'year': r'<PubDate><Year>(\d{4})</Year>',
            'month': r'<PubDate>.*?<Month>(Aug)</Month>.*?</PubDate>',
            'start_page': r'<StartPage>(.*?)</StartPage>',
            'end_page': r'<EndPage>(.*?)</EndPage>',
            'doi': r'<ELocationID.*?EIdType="doi".*?>(.*?)</ELocationID>',
        }
        for column, regex in regex_dict.items():
            df[column] = record_strings.str.extract(regex)
        df['abstract'] = self.df_extractall(
            record_strings, parent_regex=r'<Abstract>(.*?)</Abstract>',
            regex = r'<AbstractText.*?(?: Label="(.*?)")?.*?>(.*?)</AbstractText>',
            logger=self.logger, sep=': ', join_strings=' '
        )
        df['mesh_headings'] = self.df_extractall(
            record_strings, 
            parent_regex=r'<MeshHeadingList>(.*?)</MeshHeadingList>',
            regex=r'<MeshHeading><DescriptorName.*?>(.*?)</DescriptorName>(<QualifierName.*?>.*?</QualifierName>)?</MeshHeading>',
            nested_regex=r'<QualifierName.*?>(.*?)</QualifierName>', logger=self.logger
        )
        df['authors'] = self.df_extractall(
            record_strings, sep=' ',
            regex=r'<Author ValidYN="Y".*?><LastName>(.*?)</LastName><ForeName>(.*?)</ForeName>',
            logger=self.logger 
        )
        df['keywords'] = self.df_extractall(
            record_strings, parent_regex=r'<KeywordList.*?>(.*?)</KeywordList>',
            regex=r'<Keyword.*?>(.*?)</Keyword>', 
            logger=self.logger
        )
        df['major_topics'] = self.df_extractall(
            record_strings, 
            regex=r'<[^>]*MajorTopicYN="Y"[^>]*>([^<]+)<\/[^>]+>', 
            logger=self.logger
        )
        df['publication_type'] = self.df_extractall(
            record_strings, parent_regex=r'<PublicationTypeList.*?>(.*?)</PublicationTypeList>',
            regex=r'<PublicationType.*?>(.*?)</PublicationType>', 
            logger=self.logger
        )
        columns = [
            'article_title',
            'abstract',
            'mesh_headings',
            'keywords',
            'major_topics',
            'pmid',
            'doi',
            'journal',
            'volume',
            'issue',
            'year',
            'month',
            'start_page',
            'end_page',
            'authors',
            'publication_type'
        ]
        return df[columns]

    def df_extractall(self, 
            series, regex, parent_regex=None, nested_regex=None, sep=[' ', ' / '], 
            join_strings=False, logger=None
            ):
        """
        Helper function called by `.search_article()` and `.get_article_data_by_title()` to parse 
        article metadata from PubMed database.

        Parameters:
        - series: pd.Series
        - regex: Regular expression to extract from the series.
        - parent_regex (optional): Regular expression from which to extract the `regex`.
            If None, `regex` will be extracted from the series.
        - nested_regex (optional): Regular expression that is nested within `regex` to extract.
        - sep (str or list; optional): String or list of strings used to separate multiple capture groups.
            If it is a list, then the first value is used to separate the main capture groups. 
            The second value is used to separate the nested capture groups. If the nested regex 
            has multiple capture groups, then the last value is used to separate them.
        - join_strings (optional): Boolean indicating whether to join the extracted values.
        - logger (optional): Instance of Custom_Logger class.

        Returns:
        - pd.Series with the extracted values.
        """
        logger = create_function_logger('df_extractall', logger)
        messages = []
        messages.append(f'***Running `df_extractall` with regex {regex}***')
        if parent_regex:
            messages.append(f'\tparent_regex: {parent_regex}')
        if nested_regex:
            messages.append(f'\tnested_regex: {nested_regex}')
        if parent_regex:
            extracted = series.str.extract(parent_regex, expand=False)
            series = extracted
        extracted = series.str.extractall(regex).replace({np.nan: ''})
        if extracted.shape[1] >= 1:
            joined_values = extracted[0]
        else:
            messages.warning('No matches found.')
            return series
        if extracted.shape[1] > 1:
            extracted.index.names = [f'{name if name else "index"}{index if index !=0 else ""}' for index, name in enumerate(extracted.index.names)]
            for i in range(1, extracted.shape[1]):
                if nested_regex:
                    matches = extracted[i].str.extractall(nested_regex)#.replace({np.nan: ''})
                    messages.append(f'Number of nested capture groups: {matches.shape[1]}')
                    matches.columns = [f'nested_text{column}' for column in matches.columns]
                    regex_df = extracted.merge(
                        matches, how='left', left_index=True, right_index=True
                    ).replace({np.nan: ''})
                    nested_separator = sep if type(sep) == str else sep[1]
                    if i == 1:
                        root_column = 0 
                        capture_group_separator = nested_separator
                    else:
                        root_column = 'Text'
                        capture_group_separator = sep if type(sep) == str else sep[-1]
                    regex_df = concat_columns(
                        regex_df, [root_column, 'nested_text0'], 'Text', 
                        sep=capture_group_separator
                    )
                    joined_values = regex_df['Text']

                else:
                    separator = sep if type(sep) == str else sep[0]
                    joined_values = joined_values + separator + extracted[i]
        new_series = joined_values.groupby(level=0).apply(lambda groupby: [match for match in groupby])
        if (type(join_strings) == str) | (join_strings == True):
            new_series = new_series.apply(lambda x: f'{join_strings if type(join_strings) == str else " "}'.join(x))
        logger.debug('\n'.join(messages))
        return new_series

    def extract_pubmed_details(self, record_string):
        """
        [Archived: Use `extract_pubmed_details_df` instead to perform regex operations on the entire dataframe.]
        Helper function called by `pubmed_details_by_title` to parse article metadata from PubMed database.
        """
        authors = re.findall(r'<Author ValidYN="Y".*?><LastName>(.*?)</LastName><ForeName>(.*?)</ForeName>', record_string)
        formatted_authors = ', '.join(['{} {}'.format(author[1], author[0]) for author in authors])

        # Extract publication year
        publication_year = re.search(r'<PubDate><Year>(\d{4})</Year>', record_string)
        publication_year = publication_year.group(1) if publication_year else ''
        publication_month = re.search(r'<PubDate>.*?<Month>(Aug)</Month>.*?</PubDate>', record_string)
        publication_month = publication_month.group(1) if publication_month else ''

        # Extract article title
        article_title = re.search(r'<ArticleTitle>(.*?)</ArticleTitle>', record_string)
        article_title = article_title.group(1) if article_title else ''

        # Extract journal title
        journal_title = re.search(r'<Title>(.*?)</Title>', record_string)
        journal_title = journal_title.group(1) if journal_title else ''

        # Extract journal volume
        journal_volume = re.search(r'<Volume>(.*?)</Volume>', record_string)
        journal_volume = journal_volume.group(1) if journal_volume else ''

        # Extract journal issue
        journal_issue = re.search(r'<Issue>(.*?)</Issue>', record_string)
        journal_issue = journal_issue.group(1) if journal_issue else ''

        # Extract start page
        start_page = re.search(r'<StartPage>(.*?)</StartPage>', record_string)
        start_page = start_page.group(1) if start_page else ''

        # Extract end page
        end_page = re.search(r'<EndPage>(.*?)</EndPage>', record_string)
        end_page = end_page.group(1) if end_page else ''

        # Extract ELocationID
        doi = re.search(r'<ELocationID.*?EIdType="doi".*?>(.*?)</ELocationID>', record_string)
        doi = doi.group(1) if doi else ''

        # Extract PMID
        pmid = re.search(r'<PMID.*?>(.*?)</PMID>', record_string)
        pmid = pmid.group(1) if pmid else ''

        abstract_matches = re.findall(r'(<AbstractText.*?>.*?</AbstractText>)', record_string)
        # self.logger.debug(f'Number of abstract sections: {len(abstract_matches)}')
        if len(abstract_matches) > 1:
            cleaned_abstract_sections = []
            for match in abstract_matches:
                clean_match = re.sub(r'<AbstractText.*?((?:Label=".*")?.*?>.*)</AbstractText>', r'\1', match)
                clean_match = re.sub(r'(?: Label="(.*?)")?.*?>(.*)', r'\1: \2', clean_match)
                cleaned_abstract_sections.append(clean_match)
                
            abstract = ''.join([f'{group}<br>' for group in cleaned_abstract_sections])
        else:
            abstract = re.sub(r'<AbstractText.*?>(.*?)</AbstractText>', r'\1', abstract_matches[0])  if abstract_matches else ''
            
        # Extract MeshHeadingList
        MeshHeadingList = re.search(r'<MeshHeadingList>(.*?)</MeshHeadingList>', record_string)
        MeshHeadingList = MeshHeadingList.group(1) if MeshHeadingList else ''

        # Estract MeshHeading text and any QualifierName
        mesh_headings = []
        pattern = r'<MeshHeading><DescriptorName.*?>(.*?)</DescriptorName>(<QualifierName.*?>.*?</QualifierName>)?</MeshHeading>'
        matches = re.findall(pattern, MeshHeadingList)
        for match in matches:
            heading = match[0]
            if match[1]: # Estract Mesh QualifierName                
                MeshQualifiers = re.findall(
                    r'<QualifierName.*?>(.*?)</QualifierName>', match[1]
                    )
                print(f'mesh qualifiers: {MeshQualifiers}')
                for qualifier in MeshQualifiers:
                    heading = f"{match[0]} / {qualifier}"
                    mesh_headings.append(heading)
            else:
                mesh_headings.append(heading)

        # Extract keyword
        Keyword_List = re.search(r'<KeywordList.*?>(.*?)</KeywordList>', record_string)
        Keyword_List = Keyword_List.group(1) if Keyword_List else ''
        Keywords = re.findall(
            r'<Keyword.*?>(.*?)</Keyword>', Keyword_List
            )
        # Extract MajorTopic text
        MajorTopics = re.findall(
            r'<[^>]*MajorTopicYN="Y"[^>]*>([^<]+)<\/[^>]+>', record_string
            )
        # Extract Publication Type
        PublicationTypeList = re.search(r'<PublicationTypeList.*?>(.*?)</PublicationTypeList>', record_string)
        PublicationTypeList = PublicationTypeList.group(1) if PublicationTypeList else ''
        PublicationType = re.findall(
            r'<PublicationType.*?>(.*?)</PublicationType>', PublicationTypeList
            )
        return {
            'pubmed_title': article_title,
            'abstract': abstract,
            'journal': journal_title,
            'authors': formatted_authors,
            'year': publication_year,
            'month': publication_month,
            'pub_volume': journal_volume,
            'pub_issue': journal_issue,
            'start_page': start_page,
            'end_page': end_page,
            'doi': doi,
            'pmid': pmid,
            'mesh_headings': mesh_headings,
            'keywords': Keywords,
            'major_topics': MajorTopics,
            'publication_type': PublicationType
        }


iteration = 1
query = 'exercise'
retmax = 5
result_dict[iteration] = Pubmed_API()
df = result_dict[iteration].search_article(query, retmax=retmax, ids_only=False)


2024-04-06 20:39:43,597 - Pubmed_API - INFO:
Search term: exercise



2024-04-06 20:39:49,243 - Pubmed_API - INFO:
Extracting these 5 PMIDs: ['38581603', '38581560', '38581554', '38581479', '38581449']



..
..


2024-04-06 20:40:17,053 - Pubmed_API - INFO:
Processing complete.

2024-04-06 20:40:17,079 - Pubmed_API - INFO:
5 PMIDs found.
['38581603', '38581560', '38581554', '38581479', '38581449']



.

In [13]:
result_dict[iteration].results_dict[1]

{0: {'pubmed_title': "Exploring home rehabilitation therapists' experiences of supporting older persons to physical exercise after acute hospitalization: a qualitative interview study.",
  'abstract': "PURPOSE: After hospitalization, older persons may face a decline in physical function and daily independence. In-hospital exercise interventions can mitigate this decline, and continued support from primary healthcare post-discharge may enhance sustainability. This study aimed to explore home rehabilitation therapists' experiences of supporting physical exercise after acute hospitalization, including exercise programs initiated during hospital stay.<br>METHODS: This qualitative study was conducted alongside a randomized-controlled trial to investigate prerequisites for a transitional care intervention. Twelve interviews were conducted with physiotherapists, occupational therapists, and managers across seven rehabilitation therapy services in Stockholm, Sweden. Data were analyzed using re

In [14]:
filename = 'test_data'
path = '../data'
save_csv(
    df, filename, path, append_version=True
)

File saved:  ../data/test_data_2024-04-06_2144.csv
	Time completed: 2024-04-06 21:44:41.103610


In [11]:
time_per_id = 33.5/5
num_hours = 9999*time_per_id/3600
print(f'{num_hours} hours')

18.60925 hours


# 1.1

In [11]:
import numpy as np

class Pubmed_API:
    def __init__(self, api_key=os.getenv('api_ncbi'), logger=None, logging_level=logging.INFO):
        """
        Parameters:
        - api_key (str): NCBI API key
        ---
        # Example usage

        result_dict = dict()
        iteration = 1
        query = 'query string'
        result_dict[iteration] = Pubmed_API()

        ## Option 1

        2 steps: Get list of PMIDs first, then get the article data.

        ids_list = result_dict[iteration].search_article(query, retmax=5, ids_only=True)
        df = result_dict[iteration].get_article_data_by_title()

        ## Option 2

        Get the PMIDs and then the article data in one step.

        df = result_dict[iteration].search_article(query, retmax=5, ids_only=False)

        """
        self.api_key = api_key
        self.logger = create_function_logger('Pubmed_API', logger, level=logging_level)
        self.iteration = 0
        self.responses_dict = {}
        self.results_dict = {}
        self.PMIDs_dict = {}
        self.record_strings_dict = {}

    def search_article(self, query, query_tag=None, publication=None, reldate=None, retmax=None,
        systematic_only=False, review_only=False, period_filter=None,
        additional_search_params=None, ids_only=False, 
        verbose=True
        ):
        """
        Search for article title in PubMed database.

        Parameters:
        - query (str): Pubmed search query.
        - reldate (int, optional): The search returns only those items that have a date specified by datetype within the last n days.
        - query_tag (str, optional): Query tag to append to the search query.
        - publication (str, optional): Publication name.
        - retmax (int, optional): Maximum number of results to return. 
            If None, default is 20. API returns a maximum of 9999 results. To get more results for Pubmed,
            need to use the command line: https://www.ncbi.nlm.nih.gov/books/NBK179288/
        - systematic_only (bool, optional): If True, filter for only systematic review articles.
        - review_only (bool, optional): If True, filter for only systematic review or review articles.
        - additional_search_params (dict, optional): Additional search parameters to pass to the esearch API.
        - period_filter (1, 5, or 10, optional): Filter for articles published in the past 1, 5, or 10 years.
            Note: To filter by other periods, use `reldate` parameter as that is how the API works.
            
        Returns:


        API documentation: https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESearch
        Pubmed User Guide including tags for filtering results: https://pubmed.ncbi.nlm.nih.gov/help/
        """
        base_url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
        if self.api_key:
            base_url += f'&api_key={self.api_key}'
        response = {}
        results = pd.DataFrame()
        search_term = f'{re.sub(r"not", "", query)}'  # Remove 'not' since it will be treated as a boolean
        if query_tag:
            search_term += f'{query_tag}'
        if publication:
            search_term = f'AND {publication} [ta]'
        if systematic_only:
            search_term += ' AND systematic[sb]'
        elif review_only:
            search_term += ' AND (systematic[sb] OR review[pt])'
        if period_filter:
            search_term += f' AND y_{period_filter}[Filter]'
        params = {
            'db': 'pubmed',
            'term': search_term,
            'retmode': 'json',
            'datetype': 'edat',
        }
        if reldate:
            params['reldate'] = reldate
        if retmax:
            params['retmax'] = retmax
        if additional_search_params:
            params.update(additional_search_params)
        self.logger.info(f'Search term: {search_term}')
        messages = []
        try:
            self.iteration += 1
            response = requests.get(base_url, params=params)
            response_dict = response.json()
            id_list = response_dict['esearchresult']['idlist']
            messages.append(f'{len(id_list)} PMIDs found.')
            if verbose==True:
                messages.append(f'{id_list}')
            self.PMIDs_dict[self.iteration] = id_list
            self.responses_dict[self.iteration] = response_dict
            if ids_only==False:
                results = self.get_article_data_by_title()
            else:
                results = id_list
            self.logger.info('\n'.join(messages))
        except Exception as error:
            error_messages = []
            exc_type, exc_obj, tb = sys.exc_info()
            file = tb.tb_frame
            lineno = tb.tb_lineno
            filename = file.f_code.co_filename
            message = f'\tAn error occurred on line {lineno} in {filename}: {error}'
            error_messages.append(message)
            self.logger.error('\n'.join(error_messages))
        
        return results

    def get_article_data_by_title(self, iteration=None):
        result_df = pd.DataFrame()
        try:
            result_dict = {}
            iteration = self.iteration if iteration == None else iteration
            record_strings_list = self.batch_retrieve_citation(iteration)
            self.record_strings_dict[iteration] = record_strings_list
            for index, record_string in enumerate(record_strings_list):
                result_dict[index] = self.extract_pubmed_details(record_string)
            self.results_dict[iteration] = result_dict
            result_df = pd.DataFrame(result_dict).transpose()
        except Exception as error:
            error_messages = []
            error_messages.append(f'Response: \n{self.PMIDs_dict.get(iteration)}')
            exc_type, exc_obj, tb = sys.exc_info()
            file = tb.tb_frame
            lineno = tb.tb_lineno
            filename = file.f_code.co_filename
            message = f'\tAn error occurred on line {lineno} in {filename}: {error}'
            error_messages.append(message)
            self.logger.error('\n'.join(error_messages))
        return result_df

    def batch_retrieve_citation(self, iteration):
        result_list = []
        messages = []
        try:
            id_list = self.PMIDs_dict.get(iteration)
            if id_list:
                self.logger.info(f'Extracting these {len(id_list)} PMIDs: {id_list}')
                for index, id in enumerate(id_list):
                    result_list.append(self.retrieve_citation(id).decode('utf-8'))
                    current_index, current_id = index+1, id
                    # Show progress 
                    print('.', end='\n' if current_index%2==0 else '')
                self.logger.info("Processing complete.")
            else:
                self.logger.warning(f'No results found.')
        except Exception as error:
            messages.append(f'Response: \n{self.responses_dict.get(iteration)}')
            exc_type, exc_obj, tb = sys.exc_info()
            file = tb.tb_frame
            lineno = tb.tb_lineno
            filename = file.f_code.co_filename
            messages.append(f'\tAn error occurred on line {lineno} in {filename}: {error}')
            messages.append(f'Article {current_index} [{current_id}] not found.')
        return result_list

    def retrieve_citation(self, article_id):
        base_url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
        if self.api_key:
            base_url += f'&api_key={self.api_key}'
        params = {
            'db': 'pubmed',
            'id': article_id
        }
        response = requests.get(base_url, params=params)
        return response.content

    def extract_pubmed_details_df(self, iteration=None):
        """
        Extract the Pubmed article details for the given list of record strings for the given iteration.

        Returns:
        DataFrame of the Pubmed article details.
        """
        df = pd.DataFrame()
        record_strings = pd.Series(self.record_strings_dict.get(iteration if iteration else self.iteration))
        regex_dict = {
            'article_title': r'<ArticleTitle>(.*?)</ArticleTitle>',
            'pmid': r'<PMID.*?>(.*?)</PMID>',
            'journal': r'<Title>(.*?)</Title>',
            'volume': r'<Volume>(.*?)</Volume>',
            'issue': r'<Issue>(.*?)</Issue>',
            'year': r'<PubDate><Year>(\d{4})</Year>',
            'month': r'<PubDate>.*?<Month>(Aug)</Month>.*?</PubDate>',
            'start_page': r'<StartPage>(.*?)</StartPage>',
            'end_page': r'<EndPage>(.*?)</EndPage>',
            'doi': r'<ELocationID.*?EIdType="doi".*?>(.*?)</ELocationID>',
        }
        for column, regex in regex_dict.items():
            df[column] = record_strings.str.extract(regex)
        df['abstract'] = self.df_extractall(
            record_strings, parent_regex=r'<Abstract>(.*?)</Abstract>',
            regex = r'<AbstractText.*?(?: Label="(.*?)")?.*?>(.*?)</AbstractText>',
            logger=self.logger, sep=': ', join_strings=' '
        )
        df['mesh_headings'] = self.df_extractall(
            record_strings, 
            parent_regex=r'<MeshHeadingList>(.*?)</MeshHeadingList>',
            regex=r'<MeshHeading><DescriptorName.*?>(.*?)</DescriptorName>(<QualifierName.*?>.*?</QualifierName>)?</MeshHeading>',
            nested_regex=r'<QualifierName.*?>(.*?)</QualifierName>', logger=self.logger
        )
        df['authors'] = self.df_extractall(
            record_strings, sep=' ',
            regex=r'<Author ValidYN="Y".*?><LastName>(.*?)</LastName><ForeName>(.*?)</ForeName>',
            logger=self.logger 
        )
        df['keywords'] = self.df_extractall(
            record_strings, parent_regex=r'<KeywordList.*?>(.*?)</KeywordList>',
            regex=r'<Keyword.*?>(.*?)</Keyword>', 
            logger=self.logger
        )
        df['major_topics'] = self.df_extractall(
            record_strings, 
            regex=r'<[^>]*MajorTopicYN="Y"[^>]*>([^<]+)<\/[^>]+>', 
            logger=self.logger
        )
        df['publication_type'] = self.df_extractall(
            record_strings, parent_regex=r'<PublicationTypeList.*?>(.*?)</PublicationTypeList>',
            regex=r'<PublicationType.*?>(.*?)</PublicationType>', 
            logger=self.logger
        )
        columns = [
            'article_title',
            'abstract',
            'mesh_headings',
            'keywords',
            'major_topics',
            'pmid',
            'doi',
            'journal',
            'volume',
            'issue',
            'year',
            'month',
            'start_page',
            'end_page',
            'authors',
            'publication_type'
        ]
        return df[columns]

    def df_extractall(self, 
            series, regex, parent_regex=None, nested_regex=None, sep=[' ', ' / '], 
            join_strings=False, logger=None
            ):
        """
        Helper function called by `.search_article()` and `.get_article_data_by_title()` to parse 
        article metadata from PubMed database.

        Parameters:
        - series: pd.Series
        - regex: Regular expression to extract from the series.
        - parent_regex (optional): Regular expression from which to extract the `regex`.
            If None, `regex` will be extracted from the series.
        - nested_regex (optional): Regular expression that is nested within `regex` to extract.
        - sep (str or list; optional): String or list of strings used to separate multiple capture groups.
            If it is a list, then the first value is used to separate the main capture groups. 
            The second value is used to separate the nested capture groups. If the nested regex 
            has multiple capture groups, then the last value is used to separate them.
        - join_strings (optional): Boolean indicating whether to join the extracted values.
        - logger (optional): Instance of Custom_Logger class.

        Returns:
        - pd.Series with the extracted values.
        """
        logger = create_function_logger('df_extractall', logger)
        messages = []
        messages.append(f'***Running `df_extractall` with regex {regex}***')
        if parent_regex:
            messages.append(f'\tparent_regex: {parent_regex}')
        if nested_regex:
            messages.append(f'\tnested_regex: {nested_regex}')
        if parent_regex:
            extracted = series.str.extract(parent_regex, expand=False)
            series = extracted
        extracted = series.str.extractall(regex).replace({np.nan: ''})
        if extracted.shape[1] >= 1:
            joined_values = extracted[0]
        else:
            messages.warning('No matches found.')
            return series
        if extracted.shape[1] > 1:
            extracted.index.names = [f'{name if name else "index"}{index if index !=0 else ""}' for index, name in enumerate(extracted.index.names)]
            for i in range(1, extracted.shape[1]):
                if nested_regex:
                    matches = extracted[i].str.extractall(nested_regex)#.replace({np.nan: ''})
                    messages.append(f'Number of nested capture groups: {matches.shape[1]}')
                    matches.columns = [f'nested_text{column}' for column in matches.columns]
                    regex_df = extracted.merge(
                        matches, how='left', left_index=True, right_index=True
                    ).replace({np.nan: ''})
                    nested_separator = sep if type(sep) == str else sep[1]
                    if i == 1:
                        root_column = 0 
                        capture_group_separator = nested_separator
                    else:
                        root_column = 'Text'
                        capture_group_separator = sep if type(sep) == str else sep[-1]
                    regex_df = concat_columns(
                        regex_df, [root_column, 'nested_text0'], 'Text', 
                        sep=capture_group_separator
                    )
                    joined_values = regex_df['Text']

                else:
                    separator = sep if type(sep) == str else sep[0]
                    joined_values = joined_values + separator + extracted[i]
        new_series = joined_values.groupby(level=0).apply(lambda groupby: [match for match in groupby])
        if (type(join_strings) == str) | (join_strings == True):
            new_series = new_series.apply(lambda x: f'{join_strings if type(join_strings) == str else " "}'.join(x))
        logger.debug('\n'.join(messages))
        return new_series

    def extract_pubmed_details(self, record_string):
        """
        [Archived: Use `extract_pubmed_details_df` instead to perform regex operations on the entire dataframe.]
        Helper function called by `pubmed_details_by_title` to parse article metadata from PubMed database.
        """
        authors = re.findall(r'<Author ValidYN="Y".*?><LastName>(.*?)</LastName><ForeName>(.*?)</ForeName>', record_string)
        formatted_authors = ', '.join(['{} {}'.format(author[1], author[0]) for author in authors])

        # Extract publication year
        publication_year = re.search(r'<PubDate><Year>(\d{4})</Year>', record_string)
        publication_year = publication_year.group(1) if publication_year else ''
        publication_month = re.search(r'<PubDate>.*?<Month>(Aug)</Month>.*?</PubDate>', record_string)
        publication_month = publication_month.group(1) if publication_month else ''

        # Extract article title
        article_title = re.search(r'<ArticleTitle>(.*?)</ArticleTitle>', record_string)
        article_title = article_title.group(1) if article_title else ''

        # Extract journal title
        journal_title = re.search(r'<Title>(.*?)</Title>', record_string)
        journal_title = journal_title.group(1) if journal_title else ''

        # Extract journal volume
        journal_volume = re.search(r'<Volume>(.*?)</Volume>', record_string)
        journal_volume = journal_volume.group(1) if journal_volume else ''

        # Extract journal issue
        journal_issue = re.search(r'<Issue>(.*?)</Issue>', record_string)
        journal_issue = journal_issue.group(1) if journal_issue else ''

        # Extract start page
        start_page = re.search(r'<StartPage>(.*?)</StartPage>', record_string)
        start_page = start_page.group(1) if start_page else ''

        # Extract end page
        end_page = re.search(r'<EndPage>(.*?)</EndPage>', record_string)
        end_page = end_page.group(1) if end_page else ''

        # Extract ELocationID
        doi = re.search(r'<ELocationID.*?EIdType="doi".*?>(.*?)</ELocationID>', record_string)
        doi = doi.group(1) if doi else ''

        # Extract PMID
        pmid = re.search(r'<PMID.*?>(.*?)</PMID>', record_string)
        pmid = pmid.group(1) if pmid else ''

        abstract_matches = re.findall(r'(<AbstractText.*?>.*?</AbstractText>)', record_string)
        # self.logger.debug(f'Number of abstract sections: {len(abstract_matches)}')
        if len(abstract_matches) > 1:
            cleaned_abstract_sections = []
            for match in abstract_matches:
                clean_match = re.sub(r'<AbstractText.*?((?:Label=".*")?.*?>.*)</AbstractText>', r'\1', match)
                clean_match = re.sub(r'(?: Label="(.*?)")?.*?>(.*)', r'\1: \2', clean_match)
                cleaned_abstract_sections.append(clean_match)
                
            abstract = ''.join([f'{group}<br>' for group in cleaned_abstract_sections])
        else:
            abstract = re.sub(r'<AbstractText.*?>(.*?)</AbstractText>', r'\1', abstract_matches[0])  if abstract_matches else ''
            
        # Extract MeshHeadingList
        MeshHeadingList = re.search(r'<MeshHeadingList>(.*?)</MeshHeadingList>', record_string)
        MeshHeadingList = MeshHeadingList.group(1) if MeshHeadingList else ''

        # Estract MeshHeading text and any QualifierName
        mesh_headings = []
        pattern = r'<MeshHeading><DescriptorName.*?>(.*?)</DescriptorName>(<QualifierName.*?>.*?</QualifierName>)?</MeshHeading>'
        matches = re.findall(pattern, MeshHeadingList)
        for match in matches:
            heading = match[0]
            if match[1]: # Estract Mesh QualifierName                
                MeshQualifiers = re.findall(
                    r'<QualifierName.*?>(.*?)</QualifierName>', match[1]
                    )
                print(f'mesh qualifiers: {MeshQualifiers}')
                for qualifier in MeshQualifiers:
                    heading = f"{match[0]} / {qualifier}"
                    mesh_headings.append(heading)
            else:
                mesh_headings.append(heading)

        # Extract keyword
        Keyword_List = re.search(r'<KeywordList.*?>(.*?)</KeywordList>', record_string)
        Keyword_List = Keyword_List.group(1) if Keyword_List else ''
        Keywords = re.findall(
            r'<Keyword.*?>(.*?)</Keyword>', Keyword_List
            )
        # Extract MajorTopic text
        MajorTopics = re.findall(
            r'<[^>]*MajorTopicYN="Y"[^>]*>([^<]+)<\/[^>]+>', record_string
            )
        # Extract Publication Type
        PublicationTypeList = re.search(r'<PublicationTypeList.*?>(.*?)</PublicationTypeList>', record_string)
        PublicationTypeList = PublicationTypeList.group(1) if PublicationTypeList else ''
        PublicationType = re.findall(
            r'<PublicationType.*?>(.*?)</PublicationType>', PublicationTypeList
            )
        return {
            'pubmed_title': article_title,
            'abstract': abstract,
            'journal': journal_title,
            'authors': formatted_authors,
            'year': publication_year,
            'month': publication_month,
            'pub_volume': journal_volume,
            'pub_issue': journal_issue,
            'start_page': start_page,
            'end_page': end_page,
            'doi': doi,
            'pmid': pmid,
            'mesh_headings': mesh_headings,
            'keywords': Keywords,
            'major_topics': MajorTopics,
            'publication_type': PublicationType
        }


iteration = 1.1
query = 'exercise'
retmax = 15
result_dict[iteration] = Pubmed_API()
df = result_dict[iteration].search_article(query, retmax=retmax, ids_only=False)


2024-04-06 22:31:26,476 - Pubmed_API - INFO:
Search term: exercise



2024-04-06 22:31:32,203 - Pubmed_API - INFO:
Extracting these 15 PMIDs: ['38581603', '38581560', '38581554', '38581479', '38581449', '38581398', '38581338', '38581216', '38581070', '38581046', '38580947', '38580913', '38580840', '38580835', '38580668']



..
..
..
..
..
..
..


2024-04-06 22:32:56,678 - Pubmed_API - INFO:
Processing complete.

2024-04-06 22:32:56,711 - Pubmed_API - INFO:
15 PMIDs found.
['38581603', '38581560', '38581554', '38581479', '38581449', '38581398', '38581338', '38581216', '38581070', '38581046', '38580947', '38580913', '38580840', '38580835', '38580668']



.

# iteration 2: fix `get_article_data_by_title` to use `extract_pubmed_details_df`

In [7]:
class Pubmed_API:
    def __init__(self, api_key=os.getenv('api_ncbi'), logger=None, logging_level=logging.INFO):
        """
        Parameters:
        - api_key (str): NCBI API key
        ---
        # Example usage

        result_dict = dict()
        iteration = 1
        query = 'query string'
        result_dict[iteration] = Pubmed_API()

        ## Option 1

        2 steps: Get list of PMIDs first, then get the article data.

        ids_list = result_dict[iteration].search_article(query, retmax=5, ids_only=True)
        df = result_dict[iteration].get_article_data_by_title()

        ## Option 2

        Get the PMIDs and then the article data in one step.

        df = result_dict[iteration].search_article(query, retmax=5, ids_only=False)

        """
        self.api_key = api_key
        self.logger = create_function_logger('Pubmed_API', logger, level=logging_level)
        self.iteration = 0
        self.responses_dict = {}
        self.results_dict = {}
        self.PMIDs_dict = {}
        self.record_strings_dict = {}

    def search_article(self, query, query_tag=None, publication=None, reldate=None, retmax=None,
        systematic_only=False, review_only=False, period_filter=None,
        additional_search_params=None, ids_only=False, 
        verbose=True
        ):
        """
        Search for article title in PubMed database.

        Parameters:
        - query (str): Pubmed search query.
        - reldate (int, optional): The search returns only those items that have a date specified by datetype within the last n days.
        - query_tag (str, optional): Query tag to append to the search query.
        - publication (str, optional): Publication name.
        - retmax (int, optional): Maximum number of results to return. 
            If None, default is 20. API returns a maximum of 9999 results. To get more results for Pubmed,
            need to use the command line: https://www.ncbi.nlm.nih.gov/books/NBK179288/
        - systematic_only (bool, optional): If True, filter for only systematic review articles.
        - review_only (bool, optional): If True, filter for only systematic review or review articles.
        - additional_search_params (dict, optional): Additional search parameters to pass to the esearch API.
        - period_filter (1, 5, or 10, optional): Filter for articles published in the past 1, 5, or 10 years.
            Note: To filter by other periods, use `reldate` parameter as that is how the API works.
            
        Returns:


        API documentation: https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESearch
        Pubmed User Guide including tags for filtering results: https://pubmed.ncbi.nlm.nih.gov/help/
        """
        base_url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
        if self.api_key:
            base_url += f'&api_key={self.api_key}'
        response = {}
        results = pd.DataFrame()
        search_term = f'{re.sub(r"not", "", query)}'  # Remove 'not' since it will be treated as a boolean
        if query_tag:
            search_term += f'{query_tag}'
        if publication:
            search_term = f'AND {publication} [ta]'
        if systematic_only:
            search_term += ' AND systematic[sb]'
        elif review_only:
            search_term += ' AND (systematic[sb] OR review[pt])'
        params = {
            'db': 'pubmed',
            'term': search_term,
            'retmax': 5,
            'retmode': 'json',
            'datetype': 'edat',
        }
        if reldate:
            params['reldate'] = reldate
        if retmax:
            params['retmax'] = retmax
        if additional_search_params:
            params.update(additional_search_params)
        self.logger.info(f'Search term: {search_term}')
        messages = []
        try:
            self.iteration += 1
            response = requests.get(base_url, params=params)
            response_dict = response.json()
            id_list = response_dict['esearchresult']['idlist']
            messages.append(f'{len(id_list)} PMIDs found.')
            if verbose==True:
                messages.append(f'{id_list}')
            self.PMIDs_dict[self.iteration] = id_list
            self.responses_dict[self.iteration] = response_dict
            if ids_only==False:
                results = self.get_article_data_by_title()
            else:
                results = id_list
            self.logger.info('\n'.join(messages))
        except Exception as error:
            error_messages = []
            exc_type, exc_obj, tb = sys.exc_info()
            file = tb.tb_frame
            lineno = tb.tb_lineno
            filename = file.f_code.co_filename
            message = f'\tAn error occurred on line {lineno} in {filename}: {error}'
            error_messages.append(message)
            self.logger.error('\n'.join(error_messages))
        
        return results

    def get_article_data_by_title(self, iteration=None):
        result_df = pd.DataFrame()
        try:
            result_dict = {}
            iteration = self.iteration if iteration == None else iteration
            record_strings_list = self.batch_retrieve_citation(iteration)
            self.record_strings_dict[iteration] = record_strings_list
            for index, record_string in enumerate(record_strings_list):
                result_dict[index] = self.extract_pubmed_details(record_string)
            self.results_dict[iteration] = result_dict
            result_df = pd.DataFrame(result_dict).transpose()
        except Exception as error:
            error_messages = []
            error_messages.append(f'Response: \n{self.PMIDs_dict.get(iteration)}')
            exc_type, exc_obj, tb = sys.exc_info()
            file = tb.tb_frame
            lineno = tb.tb_lineno
            filename = file.f_code.co_filename
            message = f'\tAn error occurred on line {lineno} in {filename}: {error}'
            error_messages.append(message)
            self.logger.error('\n'.join(error_messages))
        return result_df

    def batch_retrieve_citation(self, iteration):
        result_list = []
        messages = []
        try:
            id_list = self.PMIDs_dict.get(iteration)
            if id_list:
                self.logger.info(f'Extracting these {len(id_list)} PMIDs: {id_list}')
                for index, id in enumerate(id_list):
                    result_list.append(self.retrieve_citation(id).decode('utf-8'))
                    current_index, current_id = index+1, id
                    # Show progress 
                    indicator = '.'
                    if current_index % 10 == 0:
                        indicator+='|'
                    print(indicator, end='\n' if current_index%2==100 else '')
                self.logger.info("Processing complete.")
            else:
                self.logger.warning(f'No results found.')
        except Exception as error:
            messages.append(f'Response: \n{self.responses_dict.get(iteration)}')
            exc_type, exc_obj, tb = sys.exc_info()
            file = tb.tb_frame
            lineno = tb.tb_lineno
            filename = file.f_code.co_filename
            messages.append(f'\tAn error occurred on line {lineno} in {filename}: {error}')
            messages.append(f'Article {current_index} [{current_id}] not found.')
        return result_list

    def retrieve_citation(self, article_id):
        base_url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
        if self.api_key:
            base_url += f'&api_key={self.api_key}'
        params = {
            'db': 'pubmed',
            'id': article_id
        }
        response = requests.get(base_url, params=params)
        return response.content

    def extract_pubmed_details_df(self, iteration=None):
        """
        Extract the Pubmed article details for the given list of record strings for the given iteration.

        Returns:
        DataFrame of the Pubmed article details.
        """
        df = pd.DataFrame()
        record_strings = pd.Series(self.record_strings_dict.get(iteration if iteration else self.iteration))
        regex_dict = {
            'article_title': r'<ArticleTitle>(.*?)</ArticleTitle>',
            'pmid': r'<PMID.*?>(.*?)</PMID>',
            'journal': r'<Title>(.*?)</Title>',
            'volume': r'<Volume>(.*?)</Volume>',
            'issue': r'<Issue>(.*?)</Issue>',
            'year': r'<PubDate><Year>(\d{4})</Year>',
            'month': r'<PubDate>.*?<Month>(Aug)</Month>.*?</PubDate>',
            'start_page': r'<StartPage>(.*?)</StartPage>',
            'end_page': r'<EndPage>(.*?)</EndPage>',
            'doi': r'<ELocationID.*?EIdType="doi".*?>(.*?)</ELocationID>',
        }
        for column, regex in regex_dict.items():
            df[column] = record_strings.str.extract(regex)
        df['abstract'] = self.df_extractall(
            record_strings, parent_regex=r'<Abstract>(.*?)</Abstract>',
            regex = r'<AbstractText.*?(?: Label="(.*?)")?.*?>(.*?)</AbstractText>',
            logger=self.logger, sep=': ', join_strings=' '
        )
        df['mesh_headings'] = self.df_extractall(
            record_strings, 
            parent_regex=r'<MeshHeadingList>(.*?)</MeshHeadingList>',
            regex=r'<MeshHeading><DescriptorName.*?>(.*?)</DescriptorName>(<QualifierName.*?>.*?</QualifierName>)?</MeshHeading>',
            nested_regex=r'<QualifierName.*?>(.*?)</QualifierName>', logger=self.logger
        )
        df['authors'] = self.df_extractall(
            record_strings, sep=' ',
            regex=r'<Author ValidYN="Y".*?><LastName>(.*?)</LastName><ForeName>(.*?)</ForeName>',
            logger=self.logger 
        )
        df['keywords'] = self.df_extractall(
            record_strings, parent_regex=r'<KeywordList.*?>(.*?)</KeywordList>',
            regex=r'<Keyword.*?>(.*?)</Keyword>', 
            logger=self.logger
        )
        df['major_topics'] = self.df_extractall(
            record_strings, 
            regex=r'<[^>]*MajorTopicYN="Y"[^>]*>([^<]+)<\/[^>]+>', 
            logger=self.logger
        )
        df['publication_type'] = self.df_extractall(
            record_strings, parent_regex=r'<PublicationTypeList.*?>(.*?)</PublicationTypeList>',
            regex=r'<PublicationType.*?>(.*?)</PublicationType>', 
            logger=self.logger
        )
        columns = [
            'article_title',
            'abstract',
            'mesh_headings',
            'keywords',
            'major_topics',
            'pmid',
            'doi',
            'journal',
            'volume',
            'issue',
            'year',
            'month',
            'start_page',
            'end_page',
            'authors',
            'publication_type'
        ]
        return df[columns]

    def df_extractall(self, 
            series, regex, parent_regex=None, nested_regex=None, sep=[' ', ' / '], 
            join_strings=False, logger=None
            ):
        """
        Helper function called by `.search_article()` and `.get_article_data_by_title()` to parse 
        article metadata from PubMed database.

        Parameters:
        - series: pd.Series
        - regex: Regular expression to extract from the series.
        - parent_regex (optional): Regular expression from which to extract the `regex`.
            If None, `regex` will be extracted from the series.
        - nested_regex (optional): Regular expression that is nested within `regex` to extract.
        - sep (str or list; optional): String or list of strings used to separate multiple capture groups.
            If it is a list, then the first value is used to separate the main capture groups. 
            The second value is used to separate the nested capture groups. If the nested regex 
            has multiple capture groups, then the last value is used to separate them.
        - join_strings (optional): Boolean indicating whether to join the extracted values.
        - logger (optional): Instance of Custom_Logger class.

        Returns:
        - pd.Series with the extracted values.
        """
        logger = create_function_logger('df_extractall', logger)
        messages = []
        messages.append(f'***Running `df_extractall` with regex {regex}***')
        if parent_regex:
            messages.append(f'\tparent_regex: {parent_regex}')
        if nested_regex:
            messages.append(f'\tnested_regex: {nested_regex}')
        if parent_regex:
            extracted = series.str.extract(parent_regex, expand=False)
            series = extracted
        extracted = series.str.extractall(regex).replace({np.nan: ''})
        if extracted.shape[1] >= 1:
            joined_values = extracted[0]
        else:
            messages.warning('No matches found.')
            return series
        if extracted.shape[1] > 1:
            extracted.index.names = [f'{name if name else "index"}{index if index !=0 else ""}' for index, name in enumerate(extracted.index.names)]
            for i in range(1, extracted.shape[1]):
                if nested_regex:
                    matches = extracted[i].str.extractall(nested_regex)#.replace({np.nan: ''})
                    messages.append(f'Number of nested capture groups: {matches.shape[1]}')
                    matches.columns = [f'nested_text{column}' for column in matches.columns]
                    regex_df = extracted.merge(
                        matches, how='left', left_index=True, right_index=True
                    ).replace({np.nan: ''})
                    nested_separator = sep if type(sep) == str else sep[1]
                    if i == 1:
                        root_column = 0 
                        capture_group_separator = nested_separator
                    else:
                        root_column = 'Text'
                        capture_group_separator = sep if type(sep) == str else sep[-1]
                    regex_df = concat_columns(
                        regex_df, [root_column, 'nested_text0'], 'Text', 
                        sep=capture_group_separator
                    )
                    joined_values = regex_df['Text']

                else:
                    separator = sep if type(sep) == str else sep[0]
                    joined_values = joined_values + separator + extracted[i]
        new_series = joined_values.groupby(level=0).apply(lambda groupby: [match for match in groupby])
        if (type(join_strings) == str) | (join_strings == True):
            new_series = new_series.apply(lambda x: f'{join_strings if type(join_strings) == str else " "}'.join(x))
        logger.debug('\n'.join(messages))
        return new_series

    def extract_pubmed_details(self, record_string):
        """
        [Archived: Use `extract_pubmed_details_df` instead to perform regex operations on the entire dataframe.]
        Helper function called by `pubmed_details_by_title` to parse article metadata from PubMed database.
        """
        authors = re.findall(r'<Author ValidYN="Y".*?><LastName>(.*?)</LastName><ForeName>(.*?)</ForeName>', record_string)
        formatted_authors = ', '.join(['{} {}'.format(author[1], author[0]) for author in authors])

        # Extract publication year
        publication_year = re.search(r'<PubDate><Year>(\d{4})</Year>', record_string)
        publication_year = publication_year.group(1) if publication_year else ''
        publication_month = re.search(r'<PubDate>.*?<Month>(Aug)</Month>.*?</PubDate>', record_string)
        publication_month = publication_month.group(1) if publication_month else ''

        # Extract article title
        article_title = re.search(r'<ArticleTitle>(.*?)</ArticleTitle>', record_string)
        article_title = article_title.group(1) if article_title else ''

        # Extract journal title
        journal_title = re.search(r'<Title>(.*?)</Title>', record_string)
        journal_title = journal_title.group(1) if journal_title else ''

        # Extract journal volume
        journal_volume = re.search(r'<Volume>(.*?)</Volume>', record_string)
        journal_volume = journal_volume.group(1) if journal_volume else ''

        # Extract journal issue
        journal_issue = re.search(r'<Issue>(.*?)</Issue>', record_string)
        journal_issue = journal_issue.group(1) if journal_issue else ''

        # Extract start page
        start_page = re.search(r'<StartPage>(.*?)</StartPage>', record_string)
        start_page = start_page.group(1) if start_page else ''

        # Extract end page
        end_page = re.search(r'<EndPage>(.*?)</EndPage>', record_string)
        end_page = end_page.group(1) if end_page else ''

        # Extract ELocationID
        doi = re.search(r'<ELocationID.*?EIdType="doi".*?>(.*?)</ELocationID>', record_string)
        doi = doi.group(1) if doi else ''

        # Extract PMID
        pmid = re.search(r'<PMID.*?>(.*?)</PMID>', record_string)
        pmid = pmid.group(1) if pmid else ''

        abstract_matches = re.findall(r'(<AbstractText.*?>.*?</AbstractText>)', record_string)
        # self.logger.debug(f'Number of abstract sections: {len(abstract_matches)}')
        if len(abstract_matches) > 1:
            cleaned_abstract_sections = []
            for match in abstract_matches:
                clean_match = re.sub(r'<AbstractText.*?((?:Label=".*")?.*?>.*)</AbstractText>', r'\1', match)
                clean_match = re.sub(r'(?: Label="(.*?)")?.*?>(.*)', r'\1: \2', clean_match)
                cleaned_abstract_sections.append(clean_match)
                
            abstract = ''.join([f'{group}<br>' for group in cleaned_abstract_sections])
        else:
            abstract = re.sub(r'<AbstractText.*?>(.*?)</AbstractText>', r'\1', abstract_matches[0])  if abstract_matches else ''
            
        # Extract MeshHeadingList
        MeshHeadingList = re.search(r'<MeshHeadingList>(.*?)</MeshHeadingList>', record_string)
        MeshHeadingList = MeshHeadingList.group(1) if MeshHeadingList else ''

        # Estract MeshHeading text and any QualifierName
        mesh_headings = []
        pattern = r'<MeshHeading><DescriptorName.*?>(.*?)</DescriptorName>(<QualifierName.*?>.*?</QualifierName>)?</MeshHeading>'
        matches = re.findall(pattern, MeshHeadingList)
        for match in matches:
            heading = match[0]
            if match[1]: # Estract Mesh QualifierName                
                MeshQualifiers = re.findall(
                    r'<QualifierName.*?>(.*?)</QualifierName>', match[1]
                    )
                print(f'mesh qualifiers: {MeshQualifiers}')
                for qualifier in MeshQualifiers:
                    heading = f"{match[0]} / {qualifier}"
                    mesh_headings.append(heading)
            else:
                mesh_headings.append(heading)

        # Extract keyword
        Keyword_List = re.search(r'<KeywordList.*?>(.*?)</KeywordList>', record_string)
        Keyword_List = Keyword_List.group(1) if Keyword_List else ''
        Keywords = re.findall(
            r'<Keyword.*?>(.*?)</Keyword>', Keyword_List
            )
        # Extract MajorTopic text
        MajorTopics = re.findall(
            r'<[^>]*MajorTopicYN="Y"[^>]*>([^<]+)<\/[^>]+>', record_string
            )
        # Extract Publication Type
        PublicationTypeList = re.search(r'<PublicationTypeList.*?>(.*?)</PublicationTypeList>', record_string)
        PublicationTypeList = PublicationTypeList.group(1) if PublicationTypeList else ''
        PublicationType = re.findall(
            r'<PublicationType.*?>(.*?)</PublicationType>', PublicationTypeList
            )
        return {
            'pubmed_title': article_title,
            'abstract': abstract,
            'journal': journal_title,
            'authors': formatted_authors,
            'year': publication_year,
            'month': publication_month,
            'pub_volume': journal_volume,
            'pub_issue': journal_issue,
            'start_page': start_page,
            'end_page': end_page,
            'doi': doi,
            'pmid': pmid,
            'mesh_headings': mesh_headings,
            'keywords': Keywords,
            'major_topics': MajorTopics,
            'publication_type': PublicationType
        }


iteration = 2
query = 'exercise'
retmax = 2
result_dict[iteration] = Pubmed_API()
df = result_dict[iteration].search_article(query, retmax=retmax, ids_only=False)


2024-04-06 22:25:37,791 - Pubmed_API - INFO:
Search term: exercise



2024-04-06 22:25:43,507 - Pubmed_API - INFO:
Extracting these 2 PMIDs: ['38581603', '38581560']



.

2024-04-06 22:25:54,593 - Pubmed_API - INFO:
Processing complete.

2024-04-06 22:25:54,609 - Pubmed_API - INFO:
2 PMIDs found.
['38581603', '38581560']



.

In [8]:
df

Unnamed: 0,pubmed_title,abstract,journal,authors,year,month,pub_volume,pub_issue,start_page,end_page,doi,pmid,mesh_headings,keywords,major_topics,publication_type
0,Exploring home rehabilitation therapists' experiences of supporting older persons to physical ex...,"PURPOSE: After hospitalization, older persons may face a decline in physical function and daily ...",European geriatric medicine,"Christina Sandlund, Linda Sandberg, Sebastian Lindblom, Nathalie Frisendahl, Anne-Marie Bostr&#x...",2024,,,,,,10.1007/s41999-024-00972-5,38581603,[],"[Exercise, Older adults, Primary care, Qualitative study, Rehabilitation therapy, Transitional c...",[],[Journal Article]
1,Current awareness and use of transthoracic echocardiography in evaluation of valvular heart dise...,BACKGROUND: There are few reports on transthoracic echocardiography (TTE) for the evaluation of ...,Journal of echocardiography,"Hiroki Usuku, Eiichiro Yamamoto, Fumi Oike, Kenichi Yoshida, Yuji Ogata, Saori Kato, Syota Fukus...",2024,,,,,,10.1007/s12574-024-00648-w,38581560,[],"[Aortic stenosis, Transthoracic echocardiography, Valvular heart disease]",[],[Journal Article]


# 2.1

In [9]:
class Pubmed_API:
    def __init__(self, api_key=os.getenv('api_ncbi'), logger=None, logging_level=logging.INFO):
        """
        Parameters:
        - api_key (str): NCBI API key
        ---
        # Example usage

        result_dict = dict()
        iteration = 1
        query = 'query string'
        result_dict[iteration] = Pubmed_API()

        ## Option 1

        2 steps: Get list of PMIDs first, then get the article data.

        ids_list = result_dict[iteration].search_article(query, retmax=5, ids_only=True)
        df = result_dict[iteration].get_article_data_by_title()

        ## Option 2

        Get the PMIDs and then the article data in one step.

        df = result_dict[iteration].search_article(query, retmax=5, ids_only=False)

        """
        self.api_key = api_key
        self.logger = create_function_logger('Pubmed_API', logger, level=logging_level)
        self.iteration = 0
        self.responses_dict = {}
        self.results_dict = {}
        self.PMIDs_dict = {}
        self.record_strings_dict = {}

    def search_article(self, query, query_tag=None, publication=None, reldate=None, retmax=None,
        systematic_only=False, review_only=False, period_filter=None,
        additional_search_params=None, ids_only=False, 
        verbose=True
        ):
        """
        Search for article title in PubMed database.

        Parameters:
        - query (str): Pubmed search query.
        - reldate (int, optional): The search returns only those items that have a date specified by datetype within the last n days.
        - query_tag (str, optional): Query tag to append to the search query.
        - publication (str, optional): Publication name.
        - retmax (int, optional): Maximum number of results to return. 
            If None, default is 20. API returns a maximum of 9999 results. To get more results for Pubmed,
            need to use the command line: https://www.ncbi.nlm.nih.gov/books/NBK179288/
        - systematic_only (bool, optional): If True, filter for only systematic review articles.
        - review_only (bool, optional): If True, filter for only systematic review or review articles.
        - additional_search_params (dict, optional): Additional search parameters to pass to the esearch API.
        - period_filter (1, 5, or 10, optional): Filter for articles published in the past 1, 5, or 10 years.
            Note: To filter by other periods, use `reldate` parameter as that is how the API works.
            
        Returns:


        API documentation: https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESearch
        Pubmed User Guide including tags for filtering results: https://pubmed.ncbi.nlm.nih.gov/help/
        """
        base_url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
        if self.api_key:
            base_url += f'&api_key={self.api_key}'
        response = {}
        results = pd.DataFrame()
        search_term = f'{re.sub(r"not", "", query)}'  # Remove 'not' since it will be treated as a boolean
        if query_tag:
            search_term += f'{query_tag}'
        if publication:
            search_term = f'AND {publication} [ta]'
        if systematic_only:
            search_term += ' AND systematic[sb]'
        elif review_only:
            search_term += ' AND (systematic[sb] OR review[pt])'
        if period_filter:
            search_term += f' AND y_{period_filter}[Filter]'
        params = {
            'db': 'pubmed',
            'term': search_term,
            'retmax': 5,
            'retmode': 'json',
            'datetype': 'edat',
        }
        if reldate:
            params['reldate'] = reldate
        if retmax:
            params['retmax'] = retmax
        if additional_search_params:
            params.update(additional_search_params)
        self.logger.info(f'Search term: {search_term}')
        messages = []
        try:
            self.iteration += 1
            response = requests.get(base_url, params=params)
            response_dict = response.json()
            id_list = response_dict['esearchresult']['idlist']
            messages.append(f'{len(id_list)} PMIDs found.')
            if verbose==True:
                messages.append(f'{id_list}')
            self.PMIDs_dict[self.iteration] = id_list
            self.responses_dict[self.iteration] = response_dict
            if ids_only==False:
                results = self.get_article_data_by_title()
            else:
                results = id_list
            self.logger.info('\n'.join(messages))
        except Exception as error:
            error_messages = []
            exc_type, exc_obj, tb = sys.exc_info()
            file = tb.tb_frame
            lineno = tb.tb_lineno
            filename = file.f_code.co_filename
            message = f'\tAn error occurred on line {lineno} in {filename}: {error}'
            error_messages.append(message)
            self.logger.error('\n'.join(error_messages))
        
        return results

    def get_article_data_by_title(self, iteration=None):
        result_df = pd.DataFrame()
        try:
            result_dict = {}
            iteration = self.iteration if iteration == None else iteration
            record_strings_list = self.batch_retrieve_citation(iteration)
            self.record_strings_dict[iteration] = record_strings_list
            for index, record_string in enumerate(record_strings_list):
                result_dict[index] = self.extract_pubmed_details(record_string)
            self.results_dict[iteration] = result_dict
            result_df = pd.DataFrame(result_dict).transpose()
        except Exception as error:
            error_messages = []
            error_messages.append(f'Response: \n{self.PMIDs_dict.get(iteration)}')
            exc_type, exc_obj, tb = sys.exc_info()
            file = tb.tb_frame
            lineno = tb.tb_lineno
            filename = file.f_code.co_filename
            message = f'\tAn error occurred on line {lineno} in {filename}: {error}'
            error_messages.append(message)
            self.logger.error('\n'.join(error_messages))
        return result_df

    def batch_retrieve_citation(self, iteration):
        result_list = []
        messages = []
        try:
            id_list = self.PMIDs_dict.get(iteration)
            if id_list:
                self.logger.info(f'Extracting these {len(id_list)} PMIDs: {id_list}')
                for index, id in enumerate(id_list):
                    result_list.append(self.retrieve_citation(id).decode('utf-8'))
                    current_index, current_id = index+1, id
                    # Show progress 
                    indicator = '.'
                    if current_index % 10 == 0:
                        indicator+='|'
                    print(indicator, end='\n' if current_index%2==100 else '')
                self.logger.info("Processing complete.")
            else:
                self.logger.warning(f'No results found.')
        except Exception as error:
            messages.append(f'Response: \n{self.responses_dict.get(iteration)}')
            exc_type, exc_obj, tb = sys.exc_info()
            file = tb.tb_frame
            lineno = tb.tb_lineno
            filename = file.f_code.co_filename
            messages.append(f'\tAn error occurred on line {lineno} in {filename}: {error}')
            messages.append(f'Article {current_index} [{current_id}] not found.')
        return result_list

    def retrieve_citation(self, article_id):
        base_url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
        if self.api_key:
            base_url += f'&api_key={self.api_key}'
        params = {
            'db': 'pubmed',
            'id': article_id
        }
        response = requests.get(base_url, params=params)
        return response.content

    def extract_pubmed_details_df(self, iteration=None):
        """
        Extract the Pubmed article details for the given list of record strings for the given iteration.

        Returns:
        DataFrame of the Pubmed article details.
        """
        df = pd.DataFrame()
        record_strings = pd.Series(self.record_strings_dict.get(iteration if iteration else self.iteration))
        regex_dict = {
            'article_title': r'<ArticleTitle>(.*?)</ArticleTitle>',
            'pmid': r'<PMID.*?>(.*?)</PMID>',
            'journal': r'<Title>(.*?)</Title>',
            'volume': r'<Volume>(.*?)</Volume>',
            'issue': r'<Issue>(.*?)</Issue>',
            'year': r'<PubDate><Year>(\d{4})</Year>',
            'month': r'<PubDate>.*?<Month>(Aug)</Month>.*?</PubDate>',
            'start_page': r'<StartPage>(.*?)</StartPage>',
            'end_page': r'<EndPage>(.*?)</EndPage>',
            'doi': r'<ELocationID.*?EIdType="doi".*?>(.*?)</ELocationID>',
        }
        for column, regex in regex_dict.items():
            df[column] = record_strings.str.extract(regex)
        df['abstract'] = self.df_extractall(
            record_strings, parent_regex=r'<Abstract>(.*?)</Abstract>',
            regex = r'<AbstractText.*?(?: Label="(.*?)")?.*?>(.*?)</AbstractText>',
            logger=self.logger, sep=': ', join_strings=' '
        )
        df['mesh_headings'] = self.df_extractall(
            record_strings, 
            parent_regex=r'<MeshHeadingList>(.*?)</MeshHeadingList>',
            regex=r'<MeshHeading><DescriptorName.*?>(.*?)</DescriptorName>(<QualifierName.*?>.*?</QualifierName>)?</MeshHeading>',
            nested_regex=r'<QualifierName.*?>(.*?)</QualifierName>', logger=self.logger
        )
        df['authors'] = self.df_extractall(
            record_strings, sep=' ',
            regex=r'<Author ValidYN="Y".*?><LastName>(.*?)</LastName><ForeName>(.*?)</ForeName>',
            logger=self.logger 
        )
        df['keywords'] = self.df_extractall(
            record_strings, parent_regex=r'<KeywordList.*?>(.*?)</KeywordList>',
            regex=r'<Keyword.*?>(.*?)</Keyword>', 
            logger=self.logger
        )
        df['major_topics'] = self.df_extractall(
            record_strings, 
            regex=r'<[^>]*MajorTopicYN="Y"[^>]*>([^<]+)<\/[^>]+>', 
            logger=self.logger
        )
        df['publication_type'] = self.df_extractall(
            record_strings, parent_regex=r'<PublicationTypeList.*?>(.*?)</PublicationTypeList>',
            regex=r'<PublicationType.*?>(.*?)</PublicationType>', 
            logger=self.logger
        )
        columns = [
            'article_title',
            'abstract',
            'mesh_headings',
            'keywords',
            'major_topics',
            'pmid',
            'doi',
            'journal',
            'volume',
            'issue',
            'year',
            'month',
            'start_page',
            'end_page',
            'authors',
            'publication_type'
        ]
        return df[columns]

    def df_extractall(self, 
            series, regex, parent_regex=None, nested_regex=None, sep=[' ', ' / '], 
            join_strings=False, logger=None
            ):
        """
        Helper function called by `.search_article()` and `.get_article_data_by_title()` to parse 
        article metadata from PubMed database.

        Parameters:
        - series: pd.Series
        - regex: Regular expression to extract from the series.
        - parent_regex (optional): Regular expression from which to extract the `regex`.
            If None, `regex` will be extracted from the series.
        - nested_regex (optional): Regular expression that is nested within `regex` to extract.
        - sep (str or list; optional): String or list of strings used to separate multiple capture groups.
            If it is a list, then the first value is used to separate the main capture groups. 
            The second value is used to separate the nested capture groups. If the nested regex 
            has multiple capture groups, then the last value is used to separate them.
        - join_strings (optional): Boolean indicating whether to join the extracted values.
        - logger (optional): Instance of Custom_Logger class.

        Returns:
        - pd.Series with the extracted values.
        """
        logger = create_function_logger('df_extractall', logger)
        messages = []
        messages.append(f'***Running `df_extractall` with regex {regex}***')
        if parent_regex:
            messages.append(f'\tparent_regex: {parent_regex}')
        if nested_regex:
            messages.append(f'\tnested_regex: {nested_regex}')
        if parent_regex:
            extracted = series.str.extract(parent_regex, expand=False)
            series = extracted
        extracted = series.str.extractall(regex).replace({np.nan: ''})
        if extracted.shape[1] >= 1:
            joined_values = extracted[0]
        else:
            messages.warning('No matches found.')
            return series
        if extracted.shape[1] > 1:
            extracted.index.names = [f'{name if name else "index"}{index if index !=0 else ""}' for index, name in enumerate(extracted.index.names)]
            for i in range(1, extracted.shape[1]):
                if nested_regex:
                    matches = extracted[i].str.extractall(nested_regex)#.replace({np.nan: ''})
                    messages.append(f'Number of nested capture groups: {matches.shape[1]}')
                    matches.columns = [f'nested_text{column}' for column in matches.columns]
                    regex_df = extracted.merge(
                        matches, how='left', left_index=True, right_index=True
                    ).replace({np.nan: ''})
                    nested_separator = sep if type(sep) == str else sep[1]
                    if i == 1:
                        root_column = 0 
                        capture_group_separator = nested_separator
                    else:
                        root_column = 'Text'
                        capture_group_separator = sep if type(sep) == str else sep[-1]
                    regex_df = concat_columns(
                        regex_df, [root_column, 'nested_text0'], 'Text', 
                        sep=capture_group_separator
                    )
                    joined_values = regex_df['Text']

                else:
                    separator = sep if type(sep) == str else sep[0]
                    joined_values = joined_values + separator + extracted[i]
        new_series = joined_values.groupby(level=0).apply(lambda groupby: [match for match in groupby])
        if (type(join_strings) == str) | (join_strings == True):
            new_series = new_series.apply(lambda x: f'{join_strings if type(join_strings) == str else " "}'.join(x))
        logger.debug('\n'.join(messages))
        return new_series

    def extract_pubmed_details(self, record_string):
        """
        [Archived: Use `extract_pubmed_details_df` instead to perform regex operations on the entire dataframe.]
        Helper function called by `pubmed_details_by_title` to parse article metadata from PubMed database.
        """
        authors = re.findall(r'<Author ValidYN="Y".*?><LastName>(.*?)</LastName><ForeName>(.*?)</ForeName>', record_string)
        formatted_authors = ', '.join(['{} {}'.format(author[1], author[0]) for author in authors])

        # Extract publication year
        publication_year = re.search(r'<PubDate><Year>(\d{4})</Year>', record_string)
        publication_year = publication_year.group(1) if publication_year else ''
        publication_month = re.search(r'<PubDate>.*?<Month>(Aug)</Month>.*?</PubDate>', record_string)
        publication_month = publication_month.group(1) if publication_month else ''

        # Extract article title
        article_title = re.search(r'<ArticleTitle>(.*?)</ArticleTitle>', record_string)
        article_title = article_title.group(1) if article_title else ''

        # Extract journal title
        journal_title = re.search(r'<Title>(.*?)</Title>', record_string)
        journal_title = journal_title.group(1) if journal_title else ''

        # Extract journal volume
        journal_volume = re.search(r'<Volume>(.*?)</Volume>', record_string)
        journal_volume = journal_volume.group(1) if journal_volume else ''

        # Extract journal issue
        journal_issue = re.search(r'<Issue>(.*?)</Issue>', record_string)
        journal_issue = journal_issue.group(1) if journal_issue else ''

        # Extract start page
        start_page = re.search(r'<StartPage>(.*?)</StartPage>', record_string)
        start_page = start_page.group(1) if start_page else ''

        # Extract end page
        end_page = re.search(r'<EndPage>(.*?)</EndPage>', record_string)
        end_page = end_page.group(1) if end_page else ''

        # Extract ELocationID
        doi = re.search(r'<ELocationID.*?EIdType="doi".*?>(.*?)</ELocationID>', record_string)
        doi = doi.group(1) if doi else ''

        # Extract PMID
        pmid = re.search(r'<PMID.*?>(.*?)</PMID>', record_string)
        pmid = pmid.group(1) if pmid else ''

        abstract_matches = re.findall(r'(<AbstractText.*?>.*?</AbstractText>)', record_string)
        # self.logger.debug(f'Number of abstract sections: {len(abstract_matches)}')
        if len(abstract_matches) > 1:
            cleaned_abstract_sections = []
            for match in abstract_matches:
                clean_match = re.sub(r'<AbstractText.*?((?:Label=".*")?.*?>.*)</AbstractText>', r'\1', match)
                clean_match = re.sub(r'(?: Label="(.*?)")?.*?>(.*)', r'\1: \2', clean_match)
                cleaned_abstract_sections.append(clean_match)
                
            abstract = ''.join([f'{group}<br>' for group in cleaned_abstract_sections])
        else:
            abstract = re.sub(r'<AbstractText.*?>(.*?)</AbstractText>', r'\1', abstract_matches[0])  if abstract_matches else ''
            
        # Extract MeshHeadingList
        MeshHeadingList = re.search(r'<MeshHeadingList>(.*?)</MeshHeadingList>', record_string)
        MeshHeadingList = MeshHeadingList.group(1) if MeshHeadingList else ''

        # Estract MeshHeading text and any QualifierName
        mesh_headings = []
        pattern = r'<MeshHeading><DescriptorName.*?>(.*?)</DescriptorName>(<QualifierName.*?>.*?</QualifierName>)?</MeshHeading>'
        matches = re.findall(pattern, MeshHeadingList)
        for match in matches:
            heading = match[0]
            if match[1]: # Estract Mesh QualifierName                
                MeshQualifiers = re.findall(
                    r'<QualifierName.*?>(.*?)</QualifierName>', match[1]
                    )
                print(f'mesh qualifiers: {MeshQualifiers}')
                for qualifier in MeshQualifiers:
                    heading = f"{match[0]} / {qualifier}"
                    mesh_headings.append(heading)
            else:
                mesh_headings.append(heading)

        # Extract keyword
        Keyword_List = re.search(r'<KeywordList.*?>(.*?)</KeywordList>', record_string)
        Keyword_List = Keyword_List.group(1) if Keyword_List else ''
        Keywords = re.findall(
            r'<Keyword.*?>(.*?)</Keyword>', Keyword_List
            )
        # Extract MajorTopic text
        MajorTopics = re.findall(
            r'<[^>]*MajorTopicYN="Y"[^>]*>([^<]+)<\/[^>]+>', record_string
            )
        # Extract Publication Type
        PublicationTypeList = re.search(r'<PublicationTypeList.*?>(.*?)</PublicationTypeList>', record_string)
        PublicationTypeList = PublicationTypeList.group(1) if PublicationTypeList else ''
        PublicationType = re.findall(
            r'<PublicationType.*?>(.*?)</PublicationType>', PublicationTypeList
            )
        return {
            'pubmed_title': article_title,
            'abstract': abstract,
            'journal': journal_title,
            'authors': formatted_authors,
            'year': publication_year,
            'month': publication_month,
            'pub_volume': journal_volume,
            'pub_issue': journal_issue,
            'start_page': start_page,
            'end_page': end_page,
            'doi': doi,
            'pmid': pmid,
            'mesh_headings': mesh_headings,
            'keywords': Keywords,
            'major_topics': MajorTopics,
            'publication_type': PublicationType
        }


iteration = 2.1
query = 'exercise'
retmax = 5
result_dict[iteration] = Pubmed_API()
df = result_dict[iteration].search_article(query, retmax=retmax, ids_only=False)


2024-04-06 22:29:32,720 - Pubmed_API - INFO:
Search term: exercise



2024-04-06 22:29:38,538 - Pubmed_API - INFO:
Extracting these 5 PMIDs: ['38581603', '38581560', '38581554', '38581479', '38581449']



....

2024-04-06 22:30:07,126 - Pubmed_API - INFO:
Processing complete.

2024-04-06 22:30:07,219 - Pubmed_API - INFO:
5 PMIDs found.
['38581603', '38581560', '38581554', '38581479', '38581449']



.

In [10]:
df

Unnamed: 0,pubmed_title,abstract,journal,authors,year,month,pub_volume,pub_issue,start_page,end_page,doi,pmid,mesh_headings,keywords,major_topics,publication_type
0,Exploring home rehabilitation therapists' experiences of supporting older persons to physical ex...,"PURPOSE: After hospitalization, older persons may face a decline in physical function and daily ...",European geriatric medicine,"Christina Sandlund, Linda Sandberg, Sebastian Lindblom, Nathalie Frisendahl, Anne-Marie Bostr&#x...",2024,,,,,,10.1007/s41999-024-00972-5,38581603,[],"[Exercise, Older adults, Primary care, Qualitative study, Rehabilitation therapy, Transitional c...",[],[Journal Article]
1,Current awareness and use of transthoracic echocardiography in evaluation of valvular heart dise...,BACKGROUND: There are few reports on transthoracic echocardiography (TTE) for the evaluation of ...,Journal of echocardiography,"Hiroki Usuku, Eiichiro Yamamoto, Fumi Oike, Kenichi Yoshida, Yuji Ogata, Saori Kato, Syota Fukus...",2024,,,,,,10.1007/s12574-024-00648-w,38581560,[],"[Aortic stenosis, Transthoracic echocardiography, Valvular heart disease]",[],[Journal Article]
2,Clinical and ultrasonographic evaluation of uninjured dominant shoulder in amateur rugby players...,BACKGROUND: Rugby is a sport involving a great number of shoulder collisions. Traumatic stress o...,Journal of ultrasound,"Giovanni Monteleone, Alfonso Tramontana, Roberto Sorge",2024,,,,,,10.1007/s40477-024-00897-6,38581554,[],"[Clinical exam of the shoulder, Rotator cuff injuries, Rugby, Shoulder injuries, Tendon calcific...",[],[Journal Article]
3,"A Letter to the Editor on ""Effect of Visual Biofeedback Obtained Using the Iowa Oral Performance...",The author has provided an alternative biofeedback method to help maintain the effortful swallow...,Dysphagia,&#xd6;zg&#xfc;n Uysal,2024,,,,,,10.1007/s00455-024-10698-y,38581479,[],"[Attention, Effortful swallow, Electromyography]",[],[Letter]
4,Effects of sporadic inclusion body myositis on skeletal muscle fibre type specific morphology an...,Sporadic inclusion body myositis (sIBM) is a subgroup of idiopathic inflammatory myopathies char...,Rheumatology international,"Kasper Yde Jensen, Jakob Lindberg Nielsen, Per Aagaard, Mikkel Jacobsen, Anders N&#xf8;rk&#xe6;r...",2024,,,,,,10.1007/s00296-024-05567-8,38581449,[],"[Immunology, Macrophages, Myonuclei, Myopathies, Satellite cells]",[],[Journal Article]


In [12]:
iteration = 2.11
query = 'exercise'
retmax = 15
result_dict[iteration] = Pubmed_API()
df = result_dict[iteration].search_article(query, retmax=retmax, ids_only=False)

2024-04-06 22:33:08,579 - Pubmed_API - INFO:
Search term: exercise

2024-04-06 22:33:14,324 - Pubmed_API - INFO:
Extracting these 15 PMIDs: ['38581603', '38581560', '38581554', '38581479', '38581449', '38581398', '38581338', '38581216', '38581070', '38581046', '38580947', '38580913', '38580840', '38580835', '38580668']



..
..
..
..
..
..
..


2024-04-06 22:34:37,933 - Pubmed_API - INFO:
Processing complete.

2024-04-06 22:34:37,949 - Pubmed_API - INFO:
15 PMIDs found.
['38581603', '38581560', '38581554', '38581479', '38581449', '38581398', '38581338', '38581216', '38581070', '38581046', '38580947', '38580913', '38580840', '38580835', '38580668']



.

# 2.2

In [13]:
class Pubmed_API:
    def __init__(self, api_key=os.getenv('api_ncbi'), logger=None, logging_level=logging.INFO):
        """
        Parameters:
        - api_key (str): NCBI API key
        ---
        # Example usage

        result_dict = dict()
        iteration = 1
        query = 'query string'
        result_dict[iteration] = Pubmed_API()

        ## Option 1

        2 steps: Get list of PMIDs first, then get the article data.

        ids_list = result_dict[iteration].search_article(query, retmax=5, ids_only=True)
        df = result_dict[iteration].get_article_data_by_title()

        ## Option 2

        Get the PMIDs and then the article data in one step.

        df = result_dict[iteration].search_article(query, retmax=5, ids_only=False)

        """
        self.api_key = api_key
        self.logger = create_function_logger('Pubmed_API', logger, level=logging_level)
        self.iteration = 0
        self.responses_dict = {}
        self.results_dict = {}
        self.PMIDs_dict = {}
        self.record_strings_dict = {}

    def search_article(self, query, query_tag=None, publication=None, reldate=None, retmax=None,
        systematic_only=False, review_only=False, period_filter=None,
        additional_search_params=None, ids_only=False, 
        verbose=True
        ):
        """
        Search for article title in PubMed database.

        Parameters:
        - query (str): Pubmed search query.
        - reldate (int, optional): The search returns only those items that have a date specified by datetype within the last n days.
        - query_tag (str, optional): Query tag to append to the search query.
        - publication (str, optional): Publication name.
        - retmax (int, optional): Maximum number of results to return. 
            If None, default is 20. API returns a maximum of 9999 results. To get more results for Pubmed,
            need to use the command line: https://www.ncbi.nlm.nih.gov/books/NBK179288/
        - systematic_only (bool, optional): If True, filter for only systematic review articles.
        - review_only (bool, optional): If True, filter for only systematic review or review articles.
        - additional_search_params (dict, optional): Additional search parameters to pass to the esearch API.
        - period_filter (1, 5, or 10, optional): Filter for articles published in the past 1, 5, or 10 years.
            Note: To filter by other periods, use `reldate` parameter as that is how the API works.
            
        Returns:


        API documentation: https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESearch
        Pubmed User Guide including tags for filtering results: https://pubmed.ncbi.nlm.nih.gov/help/
        """
        base_url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
        if self.api_key:
            base_url += f'&api_key={self.api_key}'
        response = {}
        results = pd.DataFrame()
        search_term = f'{re.sub(r"not", "", query)}'  # Remove 'not' since it will be treated as a boolean
        if query_tag:
            search_term += f'{query_tag}'
        if publication:
            search_term = f'AND {publication} [ta]'
        if systematic_only:
            search_term += ' AND systematic[sb]'
        elif review_only:
            search_term += ' AND (systematic[sb] OR review[pt])'
        if period_filter:
            search_term += f' AND y_{period_filter}[Filter]'
        params = {
            'db': 'pubmed',
            'term': search_term,
            'retmax': 5,
            'retmode': 'json',
            'datetype': 'edat',
        }
        if reldate:
            params['reldate'] = reldate
        if retmax:
            params['retmax'] = retmax
        if additional_search_params:
            params.update(additional_search_params)
        self.logger.info(f'Search term: {search_term}')
        messages = []
        try:
            self.iteration += 1
            response = requests.get(base_url, params=params)
            response_dict = response.json()
            id_list = response_dict['esearchresult']['idlist']
            messages.append(f'{len(id_list)} PMIDs found.')
            if verbose==True:
                messages.append(f'{id_list}')
            self.PMIDs_dict[self.iteration] = id_list
            self.responses_dict[self.iteration] = response_dict
            if ids_only==False:
                results = self.get_article_data_by_title()
            else:
                results = id_list
            self.logger.info('\n'.join(messages))
        except Exception as error:
            error_messages = []
            exc_type, exc_obj, tb = sys.exc_info()
            file = tb.tb_frame
            lineno = tb.tb_lineno
            filename = file.f_code.co_filename
            message = f'\tAn error occurred on line {lineno} in {filename}: {error}'
            error_messages.append(message)
            self.logger.error('\n'.join(error_messages))
        
        return results

    def get_article_data_by_title(self, iteration=None):
        result_df = pd.DataFrame()
        try:
            result_dict = {}
            iteration = self.iteration if iteration == None else iteration
            record_strings_list = self.batch_retrieve_citation(iteration)
            self.record_strings_dict[iteration] = record_strings_list
            for index, record_string in enumerate(record_strings_list):
                result_dict[index] = self.extract_pubmed_details(record_string)
            self.results_dict[iteration] = result_dict
            result_df = pd.DataFrame(result_dict).transpose()
        except Exception as error:
            error_messages = []
            error_messages.append(f'Response: \n{self.PMIDs_dict.get(iteration)}')
            exc_type, exc_obj, tb = sys.exc_info()
            file = tb.tb_frame
            lineno = tb.tb_lineno
            filename = file.f_code.co_filename
            message = f'\tAn error occurred on line {lineno} in {filename}: {error}'
            error_messages.append(message)
            self.logger.error('\n'.join(error_messages))
        return result_df

    def batch_retrieve_citation(self, iteration):
        result_list = []
        messages = []
        try:
            id_list = self.PMIDs_dict.get(iteration)
            if id_list:
                self.logger.info(f'Extracting these {len(id_list)} PMIDs: {id_list}')
                for index, id in enumerate(id_list):
                    result_list.append(self.retrieve_citation(id).decode('utf-8'))
                    current_index, current_id = index+1, id
                    # Show progress 
                    indicator = '.'
                    if current_index % 10 == 0:
                        indicator+='|'
                    print(indicator, end='\n' if current_index%2==100 else '')
                self.logger.info("Processing complete.")
            else:
                self.logger.warning(f'No results found.')
        except Exception as error:
            messages.append(f'Response: \n{self.responses_dict.get(iteration)}')
            exc_type, exc_obj, tb = sys.exc_info()
            file = tb.tb_frame
            lineno = tb.tb_lineno
            filename = file.f_code.co_filename
            messages.append(f'\tAn error occurred on line {lineno} in {filename}: {error}')
            messages.append(f'Article {current_index} [{current_id}] not found.')
        return result_list

    def retrieve_citation(self, article_id):
        base_url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
        if self.api_key:
            base_url += f'&api_key={self.api_key}'
        params = {
            'db': 'pubmed',
            'id': article_id
        }
        response = requests.get(base_url, params=params)
        return response.content

    def extract_pubmed_details_df(self, iteration=None):
        """
        Extract the Pubmed article details for the given list of record strings for the given iteration.

        Returns:
        DataFrame of the Pubmed article details.
        """
        df = pd.DataFrame()
        self.logger.debug('***Running `.extract_pubmed_details_df`***')
        record_strings = pd.Series(self.record_strings_dict.get(iteration if iteration else self.iteration))
        regex_dict = {
            'article_title': r'<ArticleTitle>(.*?)</ArticleTitle>',
            'pmid': r'<PMID.*?>(.*?)</PMID>',
            'journal': r'<Title>(.*?)</Title>',
            'volume': r'<Volume>(.*?)</Volume>',
            'issue': r'<Issue>(.*?)</Issue>',
            'year': r'<PubDate><Year>(\d{4})</Year>',
            'month': r'<PubDate>.*?<Month>(Aug)</Month>.*?</PubDate>',
            'start_page': r'<StartPage>(.*?)</StartPage>',
            'end_page': r'<EndPage>(.*?)</EndPage>',
            'doi': r'<ELocationID.*?EIdType="doi".*?>(.*?)</ELocationID>',
        }
        for column, regex in regex_dict.items():
            df[column] = record_strings.str.extract(regex)
        df['abstract'] = self.df_extractall(
            record_strings, parent_regex=r'<Abstract>(.*?)</Abstract>',
            regex = r'<AbstractText.*?(?: Label="(.*?)")?.*?>(.*?)</AbstractText>',
            logger=self.logger, sep=': ', join_strings=' '
        )
        df['mesh_headings'] = self.df_extractall(
            record_strings, 
            parent_regex=r'<MeshHeadingList>(.*?)</MeshHeadingList>',
            regex=r'<MeshHeading><DescriptorName.*?>(.*?)</DescriptorName>(<QualifierName.*?>.*?</QualifierName>)?</MeshHeading>',
            nested_regex=r'<QualifierName.*?>(.*?)</QualifierName>', logger=self.logger
        )
        df['authors'] = self.df_extractall(
            record_strings, sep=' ',
            regex=r'<Author ValidYN="Y".*?><LastName>(.*?)</LastName><ForeName>(.*?)</ForeName>',
            logger=self.logger 
        )
        df['keywords'] = self.df_extractall(
            record_strings, parent_regex=r'<KeywordList.*?>(.*?)</KeywordList>',
            regex=r'<Keyword.*?>(.*?)</Keyword>', 
            logger=self.logger
        )
        df['major_topics'] = self.df_extractall(
            record_strings, 
            regex=r'<[^>]*MajorTopicYN="Y"[^>]*>([^<]+)<\/[^>]+>', 
            logger=self.logger
        )
        df['publication_type'] = self.df_extractall(
            record_strings, parent_regex=r'<PublicationTypeList.*?>(.*?)</PublicationTypeList>',
            regex=r'<PublicationType.*?>(.*?)</PublicationType>', 
            logger=self.logger
        )
        columns = [
            'article_title',
            'abstract',
            'mesh_headings',
            'keywords',
            'major_topics',
            'pmid',
            'doi',
            'journal',
            'volume',
            'issue',
            'year',
            'month',
            'start_page',
            'end_page',
            'authors',
            'publication_type'
        ]
        return df[columns]

    def df_extractall(self, 
            series, regex, parent_regex=None, nested_regex=None, sep=[' ', ' / '], 
            join_strings=False, logger=None
            ):
        """
        Helper function called by `.search_article()` and `.get_article_data_by_title()` to parse 
        article metadata from PubMed database.

        Parameters:
        - series: pd.Series
        - regex: Regular expression to extract from the series.
        - parent_regex (optional): Regular expression from which to extract the `regex`.
            If None, `regex` will be extracted from the series.
        - nested_regex (optional): Regular expression that is nested within `regex` to extract.
        - sep (str or list; optional): String or list of strings used to separate multiple capture groups.
            If it is a list, then the first value is used to separate the main capture groups. 
            The second value is used to separate the nested capture groups. If the nested regex 
            has multiple capture groups, then the last value is used to separate them.
        - join_strings (optional): Boolean indicating whether to join the extracted values.
        - logger (optional): Instance of Custom_Logger class.

        Returns:
        - pd.Series with the extracted values.
        """
        logger = create_function_logger('df_extractall', logger)
        messages = []
        messages.append(f'***Running `df_extractall` with regex {regex}***')
        if parent_regex:
            messages.append(f'\tparent_regex: {parent_regex}')
        if nested_regex:
            messages.append(f'\tnested_regex: {nested_regex}')
        if parent_regex:
            extracted = series.str.extract(parent_regex, expand=False)
            series = extracted
        extracted = series.str.extractall(regex).replace({np.nan: ''})
        if extracted.shape[1] >= 1:
            joined_values = extracted[0]
        else:
            messages.warning('No matches found.')
            return series
        if extracted.shape[1] > 1:
            extracted.index.names = [f'{name if name else "index"}{index if index !=0 else ""}' for index, name in enumerate(extracted.index.names)]
            for i in range(1, extracted.shape[1]):
                if nested_regex:
                    matches = extracted[i].str.extractall(nested_regex)#.replace({np.nan: ''})
                    messages.append(f'Number of nested capture groups: {matches.shape[1]}')
                    matches.columns = [f'nested_text{column}' for column in matches.columns]
                    regex_df = extracted.merge(
                        matches, how='left', left_index=True, right_index=True
                    ).replace({np.nan: ''})
                    nested_separator = sep if type(sep) == str else sep[1]
                    if i == 1:
                        root_column = 0 
                        capture_group_separator = nested_separator
                    else:
                        root_column = 'Text'
                        capture_group_separator = sep if type(sep) == str else sep[-1]
                    regex_df = concat_columns(
                        regex_df, [root_column, 'nested_text0'], 'Text', 
                        sep=capture_group_separator
                    )
                    joined_values = regex_df['Text']

                else:
                    separator = sep if type(sep) == str else sep[0]
                    joined_values = joined_values + separator + extracted[i]
        new_series = joined_values.groupby(level=0).apply(lambda groupby: [match for match in groupby])
        if (type(join_strings) == str) | (join_strings == True):
            new_series = new_series.apply(lambda x: f'{join_strings if type(join_strings) == str else " "}'.join(x))
        logger.debug('\n'.join(messages))
        return new_series

    def extract_pubmed_details(self, record_string):
        """
        [Archived: Use `extract_pubmed_details_df` instead to perform regex operations on the entire dataframe.]
        Helper function called by `pubmed_details_by_title` to parse article metadata from PubMed database.
        """
        authors = re.findall(r'<Author ValidYN="Y".*?><LastName>(.*?)</LastName><ForeName>(.*?)</ForeName>', record_string)
        formatted_authors = ', '.join(['{} {}'.format(author[1], author[0]) for author in authors])

        # Extract publication year
        publication_year = re.search(r'<PubDate><Year>(\d{4})</Year>', record_string)
        publication_year = publication_year.group(1) if publication_year else ''
        publication_month = re.search(r'<PubDate>.*?<Month>(Aug)</Month>.*?</PubDate>', record_string)
        publication_month = publication_month.group(1) if publication_month else ''

        # Extract article title
        article_title = re.search(r'<ArticleTitle>(.*?)</ArticleTitle>', record_string)
        article_title = article_title.group(1) if article_title else ''

        # Extract journal title
        journal_title = re.search(r'<Title>(.*?)</Title>', record_string)
        journal_title = journal_title.group(1) if journal_title else ''

        # Extract journal volume
        journal_volume = re.search(r'<Volume>(.*?)</Volume>', record_string)
        journal_volume = journal_volume.group(1) if journal_volume else ''

        # Extract journal issue
        journal_issue = re.search(r'<Issue>(.*?)</Issue>', record_string)
        journal_issue = journal_issue.group(1) if journal_issue else ''

        # Extract start page
        start_page = re.search(r'<StartPage>(.*?)</StartPage>', record_string)
        start_page = start_page.group(1) if start_page else ''

        # Extract end page
        end_page = re.search(r'<EndPage>(.*?)</EndPage>', record_string)
        end_page = end_page.group(1) if end_page else ''

        # Extract ELocationID
        doi = re.search(r'<ELocationID.*?EIdType="doi".*?>(.*?)</ELocationID>', record_string)
        doi = doi.group(1) if doi else ''

        # Extract PMID
        pmid = re.search(r'<PMID.*?>(.*?)</PMID>', record_string)
        pmid = pmid.group(1) if pmid else ''

        abstract_matches = re.findall(r'(<AbstractText.*?>.*?</AbstractText>)', record_string)
        # self.logger.debug(f'Number of abstract sections: {len(abstract_matches)}')
        if len(abstract_matches) > 1:
            cleaned_abstract_sections = []
            for match in abstract_matches:
                clean_match = re.sub(r'<AbstractText.*?((?:Label=".*")?.*?>.*)</AbstractText>', r'\1', match)
                clean_match = re.sub(r'(?: Label="(.*?)")?.*?>(.*)', r'\1: \2', clean_match)
                cleaned_abstract_sections.append(clean_match)
                
            abstract = ''.join([f'{group}<br>' for group in cleaned_abstract_sections])
        else:
            abstract = re.sub(r'<AbstractText.*?>(.*?)</AbstractText>', r'\1', abstract_matches[0])  if abstract_matches else ''
            
        # Extract MeshHeadingList
        MeshHeadingList = re.search(r'<MeshHeadingList>(.*?)</MeshHeadingList>', record_string)
        MeshHeadingList = MeshHeadingList.group(1) if MeshHeadingList else ''

        # Estract MeshHeading text and any QualifierName
        mesh_headings = []
        pattern = r'<MeshHeading><DescriptorName.*?>(.*?)</DescriptorName>(<QualifierName.*?>.*?</QualifierName>)?</MeshHeading>'
        matches = re.findall(pattern, MeshHeadingList)
        for match in matches:
            heading = match[0]
            if match[1]: # Estract Mesh QualifierName                
                MeshQualifiers = re.findall(
                    r'<QualifierName.*?>(.*?)</QualifierName>', match[1]
                    )
                print(f'mesh qualifiers: {MeshQualifiers}')
                for qualifier in MeshQualifiers:
                    heading = f"{match[0]} / {qualifier}"
                    mesh_headings.append(heading)
            else:
                mesh_headings.append(heading)

        # Extract keyword
        Keyword_List = re.search(r'<KeywordList.*?>(.*?)</KeywordList>', record_string)
        Keyword_List = Keyword_List.group(1) if Keyword_List else ''
        Keywords = re.findall(
            r'<Keyword.*?>(.*?)</Keyword>', Keyword_List
            )
        # Extract MajorTopic text
        MajorTopics = re.findall(
            r'<[^>]*MajorTopicYN="Y"[^>]*>([^<]+)<\/[^>]+>', record_string
            )
        # Extract Publication Type
        PublicationTypeList = re.search(r'<PublicationTypeList.*?>(.*?)</PublicationTypeList>', record_string)
        PublicationTypeList = PublicationTypeList.group(1) if PublicationTypeList else ''
        PublicationType = re.findall(
            r'<PublicationType.*?>(.*?)</PublicationType>', PublicationTypeList
            )
        return {
            'pubmed_title': article_title,
            'abstract': abstract,
            'journal': journal_title,
            'authors': formatted_authors,
            'year': publication_year,
            'month': publication_month,
            'pub_volume': journal_volume,
            'pub_issue': journal_issue,
            'start_page': start_page,
            'end_page': end_page,
            'doi': doi,
            'pmid': pmid,
            'mesh_headings': mesh_headings,
            'keywords': Keywords,
            'major_topics': MajorTopics,
            'publication_type': PublicationType
        }


iteration = 2.2
query = 'exercise'
retmax = 15
result_dict[iteration] = Pubmed_API()
df = result_dict[iteration].search_article(query, retmax=retmax, ids_only=False)


2024-04-06 22:39:04,454 - Pubmed_API - INFO:
Search term: exercise



2024-04-06 22:39:10,458 - Pubmed_API - INFO:
Extracting these 15 PMIDs: ['38581603', '38581560', '38581554', '38581479', '38581449', '38581398', '38581338', '38581216', '38581070', '38581046', '38580947', '38580913', '38580840', '38580835', '38580668']



..........|....

2024-04-06 22:40:34,889 - Pubmed_API - INFO:
Processing complete.

2024-04-06 22:40:34,953 - Pubmed_API - INFO:
15 PMIDs found.
['38581603', '38581560', '38581554', '38581479', '38581449', '38581398', '38581338', '38581216', '38581070', '38581046', '38580947', '38580913', '38580840', '38580835', '38580668']



.

# 2.4

In [20]:
class Pubmed_API:
    def __init__(self, api_key=os.getenv('api_ncbi'), logger=None, logging_level=logging.INFO):
        """
        Parameters:
        - api_key (str): NCBI API key
        ---
        # Example usage

        result_dict = dict()
        iteration = 1
        query = 'query string'
        result_dict[iteration] = Pubmed_API()

        ## Option 1

        2 steps: Get list of PMIDs first, then get the article data.

        ids_list = result_dict[iteration].search_article(query, retmax=5, ids_only=True)
        df = result_dict[iteration].get_article_data_by_title()

        ## Option 2

        Get the PMIDs and then the article data in one step.

        df = result_dict[iteration].search_article(query, retmax=5, ids_only=False)

        """
        self.api_key = api_key
        self.logger = create_function_logger('Pubmed_API', logger, level=logging_level)
        self.iteration = 0
        self.responses_dict = {}
        self.results_dict = {}
        self.PMIDs_dict = {}
        self.record_strings_dict = {}

    def search_article(self, query, query_tag=None, publication=None, reldate=None, retmax=None,
        systematic_only=False, review_only=False, period_filter=None,
        additional_search_params=None, ids_only=False, 
        verbose=True
        ):
        """
        Search for article title in PubMed database.

        Parameters:
        - query (str): Pubmed search query.
        - reldate (int, optional): The search returns only those items that have a date specified by datetype within the last n days.
        - query_tag (str, optional): Query tag to append to the search query.
        - publication (str, optional): Publication name.
        - retmax (int, optional): Maximum number of results to return. 
            If None, default is 20. API returns a maximum of 9999 results. To get more results for Pubmed,
            need to use the command line: https://www.ncbi.nlm.nih.gov/books/NBK179288/
        - systematic_only (bool, optional): If True, filter for only systematic review articles.
        - review_only (bool, optional): If True, filter for only systematic review or review articles.
        - additional_search_params (dict, optional): Additional search parameters to pass to the esearch API.
        - period_filter (1, 5, or 10, optional): Filter for articles published in the past 1, 5, or 10 years.
            Note: To filter by other periods, use `reldate` parameter as that is how the API works.
            
        Returns:


        API documentation: https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESearch
        Pubmed User Guide including tags for filtering results: https://pubmed.ncbi.nlm.nih.gov/help/
        """
        base_url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
        if self.api_key:
            base_url += f'&api_key={self.api_key}'
        response = {}
        results = pd.DataFrame()
        search_term = f'{re.sub(r"not", "", query)}'  # Remove 'not' since it will be treated as a boolean
        if query_tag:
            search_term += f'{query_tag}'
        if publication:
            search_term = f'AND {publication} [ta]'
        if systematic_only:
            search_term += ' AND systematic[sb]'
        elif review_only:
            search_term += ' AND (systematic[sb] OR review[pt])'
        if period_filter:
            search_term += f' AND y_{period_filter}[Filter]'
        params = {
            'db': 'pubmed',
            'term': search_term,
            'retmax': 5,
            'retmode': 'json',
            'datetype': 'edat',
        }
        if reldate:
            params['reldate'] = reldate
        if retmax:
            params['retmax'] = retmax
        if additional_search_params:
            params.update(additional_search_params)
        self.logger.info(f'Search term: {search_term}')
        messages = []
        try:
            self.iteration += 1
            response = requests.get(base_url, params=params)
            response_dict = response.json()
            id_list = response_dict['esearchresult']['idlist']
            messages.append(f'{len(id_list)} PMIDs found.')
            if verbose==True:
                messages.append(f'{id_list}')
            self.PMIDs_dict[self.iteration] = id_list
            self.responses_dict[self.iteration] = response_dict
            if ids_only==False:
                results = self.get_article_data_by_title()
            else:
                results = id_list
            self.logger.info('\n'.join(messages))
        except Exception as error:
            error_messages = []
            exc_type, exc_obj, tb = sys.exc_info()
            file = tb.tb_frame
            lineno = tb.tb_lineno
            filename = file.f_code.co_filename
            message = f'\tAn error occurred on line {lineno} in {filename}: {error}'
            error_messages.append(message)
            self.logger.error('\n'.join(error_messages))
        
        return results

    def get_article_data_by_title(self, iteration=None):
        result_df = pd.DataFrame()
        try:
            iteration = self.iteration if iteration == None else iteration
            record_strings_list = self.batch_retrieve_citation(iteration)
            self.record_strings_dict[iteration] = record_strings_list
            result_df = self.extract_pubmed_details_df(iteration)
            self.results_dict[iteration] = result_df
        except Exception as error:
            error_messages = []
            error_messages.append(f'Response: \n{self.PMIDs_dict.get(iteration)}')
            exc_type, exc_obj, tb = sys.exc_info()
            file = tb.tb_frame
            lineno = tb.tb_lineno
            filename = file.f_code.co_filename
            message = f'\tAn error occurred on line {lineno} in {filename}: {error}'
            error_messages.append(message)
            self.logger.error('\n'.join(error_messages))
        return result_df

    def batch_retrieve_citation(self, iteration):
        result_list = []
        messages = []
        try:
            id_list = self.PMIDs_dict.get(iteration)
            if id_list:
                self.logger.debug(f'Extracting these {len(id_list)} PMIDs: {id_list}')
                for index, id in enumerate(id_list):
                    result_list.append(self.retrieve_citation(id).decode('utf-8'))
                    current_index, current_id = index+1, id
                    # Show progress 
                    indicator = '.'
                    if current_index % 10 == 0:
                        indicator+='|'
                    print(indicator, end='\n' if current_index%2==100 else '')
                self.logger.info("Processing complete.")
            else:
                self.logger.warning(f'No results found.')
        except Exception as error:
            messages.append(f'Response: \n{self.responses_dict.get(iteration)}')
            exc_type, exc_obj, tb = sys.exc_info()
            file = tb.tb_frame
            lineno = tb.tb_lineno
            filename = file.f_code.co_filename
            messages.append(f'\tAn error occurred on line {lineno} in {filename}: {error}')
            messages.append(f'Article {current_index} [{current_id}] not found.')
        return result_list

    def retrieve_citation(self, article_id):
        base_url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
        if self.api_key:
            base_url += f'&api_key={self.api_key}'
        params = {
            'db': 'pubmed',
            'id': article_id
        }
        response = requests.get(base_url, params=params)
        return response.content

    def extract_pubmed_details_df(self, iteration=None):
        """
        Extract the Pubmed article details for the given list of record strings for the given iteration.

        Returns:
        DataFrame of the Pubmed article details.
        """
        df = pd.DataFrame()
        self.logger.info('***Running `.extract_pubmed_details_df`***')
        record_strings = pd.Series(self.record_strings_dict.get(iteration if iteration else self.iteration))
        regex_dict = {
            'article_title': r'<ArticleTitle>(.*?)</ArticleTitle>',
            'pmid': r'<PMID.*?>(.*?)</PMID>',
            'journal': r'<Title>(.*?)</Title>',
            'volume': r'<Volume>(.*?)</Volume>',
            'issue': r'<Issue>(.*?)</Issue>',
            'year': r'<PubDate><Year>(\d{4})</Year>',
            'month': r'<PubDate>.*?<Month>(Aug)</Month>.*?</PubDate>',
            'start_page': r'<StartPage>(.*?)</StartPage>',
            'end_page': r'<EndPage>(.*?)</EndPage>',
            'doi': r'<ELocationID.*?EIdType="doi".*?>(.*?)</ELocationID>',
        }
        for column, regex in regex_dict.items():
            df[column] = record_strings.str.extract(regex)
        df['abstract'] = self.df_extractall(
            record_strings, parent_regex=r'<Abstract>(.*?)</Abstract>',
            regex = r'<AbstractText.*?(?: Label="(.*?)")?.*?>(.*?)</AbstractText>',
            logger=self.logger, sep=': ', join_strings=' '
        )
        df['mesh_headings'] = self.df_extractall(
            record_strings, 
            parent_regex=r'<MeshHeadingList>(.*?)</MeshHeadingList>',
            regex=r'<MeshHeading><DescriptorName.*?>(.*?)</DescriptorName>(<QualifierName.*?>.*?</QualifierName>)?</MeshHeading>',
            nested_regex=r'<QualifierName.*?>(.*?)</QualifierName>', logger=self.logger
        )
        df['authors'] = self.df_extractall(
            record_strings, sep=' ',
            regex=r'<Author ValidYN="Y".*?><LastName>(.*?)</LastName><ForeName>(.*?)</ForeName>',
            logger=self.logger 
        )
        df['keywords'] = self.df_extractall(
            record_strings, parent_regex=r'<KeywordList.*?>(.*?)</KeywordList>',
            regex=r'<Keyword.*?>(.*?)</Keyword>', 
            logger=self.logger
        )
        df['major_topics'] = self.df_extractall(
            record_strings, 
            regex=r'<[^>]*MajorTopicYN="Y"[^>]*>([^<]+)<\/[^>]+>', 
            logger=self.logger
        )
        df['publication_type'] = self.df_extractall(
            record_strings, parent_regex=r'<PublicationTypeList.*?>(.*?)</PublicationTypeList>',
            regex=r'<PublicationType.*?>(.*?)</PublicationType>', 
            logger=self.logger
        )
        columns = [
            'article_title',
            'abstract',
            'mesh_headings',
            'keywords',
            'major_topics',
            'pmid',
            'doi',
            'journal',
            'volume',
            'issue',
            'year',
            'month',
            'start_page',
            'end_page',
            'authors',
            'publication_type'
        ]
        return df[columns]

    def df_extractall(self, 
            series, regex, parent_regex=None, nested_regex=None, sep=[' ', ' / '], 
            join_strings=False, logger=None
            ):
        """
        Helper function called by `.search_article()` and `.get_article_data_by_title()` to parse 
        article metadata from PubMed database.

        Parameters:
        - series: pd.Series
        - regex: Regular expression to extract from the series.
        - parent_regex (optional): Regular expression from which to extract the `regex`.
            If None, `regex` will be extracted from the series.
        - nested_regex (optional): Regular expression that is nested within `regex` to extract.
        - sep (str or list; optional): String or list of strings used to separate multiple capture groups.
            If it is a list, then the first value is used to separate the main capture groups. 
            The second value is used to separate the nested capture groups. If the nested regex 
            has multiple capture groups, then the last value is used to separate them.
        - join_strings (optional): Boolean indicating whether to join the extracted values.
        - logger (optional): Instance of Custom_Logger class.

        Returns:
        - pd.Series with the extracted values.
        """
        logger = create_function_logger('df_extractall', logger)
        messages = []
        messages.append(f'***Running `df_extractall` with regex {regex}***')
        if parent_regex:
            messages.append(f'\tparent_regex: {parent_regex}')
        if nested_regex:
            messages.append(f'\tnested_regex: {nested_regex}')
        if parent_regex:
            extracted = series.str.extract(parent_regex, expand=False)
            series = extracted
        extracted = series.str.extractall(regex).replace({np.nan: ''})
        if extracted.shape[1] >= 1:
            joined_values = extracted[0]
        else:
            messages.warning('No matches found.')
            return series
        if extracted.shape[1] > 1:
            extracted.index.names = [f'{name if name else "index"}{index if index !=0 else ""}' for index, name in enumerate(extracted.index.names)]
            for i in range(1, extracted.shape[1]):
                if nested_regex:
                    matches = extracted[i].str.extractall(nested_regex)#.replace({np.nan: ''})
                    messages.append(f'Number of nested capture groups: {matches.shape[1]}')
                    matches.columns = [f'nested_text{column}' for column in matches.columns]
                    regex_df = extracted.merge(
                        matches, how='left', left_index=True, right_index=True
                    ).replace({np.nan: ''})
                    nested_separator = sep if type(sep) == str else sep[1]
                    if i == 1:
                        root_column = 0 
                        capture_group_separator = nested_separator
                    else:
                        root_column = 'Text'
                        capture_group_separator = sep if type(sep) == str else sep[-1]
                    regex_df = concat_columns(
                        regex_df, [root_column, 'nested_text0'], 'Text', 
                        sep=capture_group_separator
                    )
                    joined_values = regex_df['Text']

                else:
                    separator = sep if type(sep) == str else sep[0]
                    joined_values = joined_values + separator + extracted[i]
        new_series = joined_values.groupby(level=0).apply(lambda groupby: [match for match in groupby])
        if (type(join_strings) == str) | (join_strings == True):
            new_series = new_series.apply(lambda x: f'{join_strings if type(join_strings) == str else " "}'.join(x))
        logger.debug('\n'.join(messages))
        return new_series

    def extract_pubmed_details(self, record_string):
        """
        [Archived: Use `extract_pubmed_details_df` instead to perform regex operations on the entire dataframe.]
        Helper function called by `pubmed_details_by_title` to parse article metadata from PubMed database.
        """
        authors = re.findall(r'<Author ValidYN="Y".*?><LastName>(.*?)</LastName><ForeName>(.*?)</ForeName>', record_string)
        formatted_authors = ', '.join(['{} {}'.format(author[1], author[0]) for author in authors])

        # Extract publication year
        publication_year = re.search(r'<PubDate><Year>(\d{4})</Year>', record_string)
        publication_year = publication_year.group(1) if publication_year else ''
        publication_month = re.search(r'<PubDate>.*?<Month>(Aug)</Month>.*?</PubDate>', record_string)
        publication_month = publication_month.group(1) if publication_month else ''

        # Extract article title
        article_title = re.search(r'<ArticleTitle>(.*?)</ArticleTitle>', record_string)
        article_title = article_title.group(1) if article_title else ''

        # Extract journal title
        journal_title = re.search(r'<Title>(.*?)</Title>', record_string)
        journal_title = journal_title.group(1) if journal_title else ''

        # Extract journal volume
        journal_volume = re.search(r'<Volume>(.*?)</Volume>', record_string)
        journal_volume = journal_volume.group(1) if journal_volume else ''

        # Extract journal issue
        journal_issue = re.search(r'<Issue>(.*?)</Issue>', record_string)
        journal_issue = journal_issue.group(1) if journal_issue else ''

        # Extract start page
        start_page = re.search(r'<StartPage>(.*?)</StartPage>', record_string)
        start_page = start_page.group(1) if start_page else ''

        # Extract end page
        end_page = re.search(r'<EndPage>(.*?)</EndPage>', record_string)
        end_page = end_page.group(1) if end_page else ''

        # Extract ELocationID
        doi = re.search(r'<ELocationID.*?EIdType="doi".*?>(.*?)</ELocationID>', record_string)
        doi = doi.group(1) if doi else ''

        # Extract PMID
        pmid = re.search(r'<PMID.*?>(.*?)</PMID>', record_string)
        pmid = pmid.group(1) if pmid else ''

        abstract_matches = re.findall(r'(<AbstractText.*?>.*?</AbstractText>)', record_string)
        # self.logger.debug(f'Number of abstract sections: {len(abstract_matches)}')
        if len(abstract_matches) > 1:
            cleaned_abstract_sections = []
            for match in abstract_matches:
                clean_match = re.sub(r'<AbstractText.*?((?:Label=".*")?.*?>.*)</AbstractText>', r'\1', match)
                clean_match = re.sub(r'(?: Label="(.*?)")?.*?>(.*)', r'\1: \2', clean_match)
                cleaned_abstract_sections.append(clean_match)
                
            abstract = ''.join([f'{group}<br>' for group in cleaned_abstract_sections])
        else:
            abstract = re.sub(r'<AbstractText.*?>(.*?)</AbstractText>', r'\1', abstract_matches[0])  if abstract_matches else ''
            
        # Extract MeshHeadingList
        MeshHeadingList = re.search(r'<MeshHeadingList>(.*?)</MeshHeadingList>', record_string)
        MeshHeadingList = MeshHeadingList.group(1) if MeshHeadingList else ''

        # Estract MeshHeading text and any QualifierName
        mesh_headings = []
        pattern = r'<MeshHeading><DescriptorName.*?>(.*?)</DescriptorName>(<QualifierName.*?>.*?</QualifierName>)?</MeshHeading>'
        matches = re.findall(pattern, MeshHeadingList)
        for match in matches:
            heading = match[0]
            if match[1]: # Estract Mesh QualifierName                
                MeshQualifiers = re.findall(
                    r'<QualifierName.*?>(.*?)</QualifierName>', match[1]
                    )
                print(f'mesh qualifiers: {MeshQualifiers}')
                for qualifier in MeshQualifiers:
                    heading = f"{match[0]} / {qualifier}"
                    mesh_headings.append(heading)
            else:
                mesh_headings.append(heading)

        # Extract keyword
        Keyword_List = re.search(r'<KeywordList.*?>(.*?)</KeywordList>', record_string)
        Keyword_List = Keyword_List.group(1) if Keyword_List else ''
        Keywords = re.findall(
            r'<Keyword.*?>(.*?)</Keyword>', Keyword_List
            )
        # Extract MajorTopic text
        MajorTopics = re.findall(
            r'<[^>]*MajorTopicYN="Y"[^>]*>([^<]+)<\/[^>]+>', record_string
            )
        # Extract Publication Type
        PublicationTypeList = re.search(r'<PublicationTypeList.*?>(.*?)</PublicationTypeList>', record_string)
        PublicationTypeList = PublicationTypeList.group(1) if PublicationTypeList else ''
        PublicationType = re.findall(
            r'<PublicationType.*?>(.*?)</PublicationType>', PublicationTypeList
            )
        return {
            'pubmed_title': article_title,
            'abstract': abstract,
            'journal': journal_title,
            'authors': formatted_authors,
            'year': publication_year,
            'month': publication_month,
            'pub_volume': journal_volume,
            'pub_issue': journal_issue,
            'start_page': start_page,
            'end_page': end_page,
            'doi': doi,
            'pmid': pmid,
            'mesh_headings': mesh_headings,
            'keywords': Keywords,
            'major_topics': MajorTopics,
            'publication_type': PublicationType
        }


iteration = 2.4
query = 'exercise'
retmax = 5
result_dict[iteration] = Pubmed_API()
df = result_dict[iteration].search_article(query, retmax=retmax, ids_only=False)


2024-04-06 22:54:35,505 - Pubmed_API - INFO:
Search term: exercise



2024-04-06 22:54:41,122 - Pubmed_API - INFO:
Extracting these 5 PMIDs: ['38581603', '38581560', '38581554', '38581479', '38581449']



....

2024-04-06 22:55:08,885 - Pubmed_API - INFO:
Processing complete.

2024-04-06 22:55:08,887 - Pubmed_API - INFO:
***Running `.extract_pubmed_details_df`***

2024-04-06 22:55:08,951 - Pubmed_API - INFO:
5 PMIDs found.
['38581603', '38581560', '38581554', '38581479', '38581449']



.

In [23]:
vars(result_dict[iteration]).keys()

dict_keys(['api_key', 'logger', 'iteration', 'responses_dict', 'results_dict', 'PMIDs_dict', 'record_strings_dict'])

In [24]:
result_dict[iteration].record_strings_dict

{1: ['<?xml version="1.0" ?>\n<!DOCTYPE PubmedArticleSet PUBLIC "-//NLM//DTD PubMedArticle, 1st January 2024//EN" "https://dtd.nlm.nih.gov/ncbi/pubmed/out/pubmed_240101.dtd">\n<PubmedArticleSet>\n<PubmedArticle><MedlineCitation Status="Publisher" Owner="NLM" IndexingMethod="Automated"><PMID Version="1">38581603</PMID><DateRevised><Year>2024</Year><Month>04</Month><Day>06</Day></DateRevised><Article PubModel="Print-Electronic"><Journal><ISSN IssnType="Print">1878-7649</ISSN><JournalIssue CitedMedium="Print"><PubDate><Year>2024</Year><Month>Apr</Month><Day>06</Day></PubDate></JournalIssue><Title>European geriatric medicine</Title><ISOAbbreviation>Eur Geriatr Med</ISOAbbreviation></Journal><ArticleTitle>Exploring home rehabilitation therapists\' experiences of supporting older persons to physical exercise after acute hospitalization: a qualitative interview study.</ArticleTitle><ELocationID EIdType="doi" ValidYN="Y">10.1007/s41999-024-00972-5</ELocationID><Abstract><AbstractText Label="PU

# 2.5

In [32]:
class Pubmed_API:
    def __init__(self, api_key=os.getenv('api_ncbi'), logger=None, logging_level=logging.INFO):
        """
        Parameters:
        - api_key (str): NCBI API key
        ---
        # Example usage

        result_dict = dict()
        iteration = 1
        query = 'query string'
        result_dict[iteration] = Pubmed_API()

        ## Option 1

        2 steps: Get list of PMIDs first, then get the article data.

        ids_list = result_dict[iteration].search_article(query, retmax=5, ids_only=True)
        df = result_dict[iteration].get_article_data_by_title()

        ## Option 2

        Get the PMIDs and then the article data in one step.

        df = result_dict[iteration].search_article(query, retmax=5, ids_only=False)

        """
        self.api_key = api_key
        self.logger = create_function_logger('Pubmed_API', logger, level=logging_level)
        self.iteration = 0
        self.responses_dict = {}
        self.results_dict = {}
        self.PMIDs_dict = {}
        self.record_strings_dict = {}

    def search_article(self, query, query_tag=None, publication=None, reldate=None, retmax=None,
        systematic_only=False, review_only=False, period_filter=None,
        additional_search_params=None, ids_only=False, 
        verbose=True
        ):
        """
        Search for article title in PubMed database.

        Parameters:
        - query (str): Pubmed search query.
        - reldate (int, optional): The search returns only those items that have a date specified by datetype within the last n days.
        - query_tag (str, optional): Query tag to append to the search query.
        - publication (str, optional): Publication name.
        - retmax (int, optional): Maximum number of results to return. 
            If None, default is 20. API returns a maximum of 9999 results. To get more results for Pubmed,
            need to use the command line: https://www.ncbi.nlm.nih.gov/books/NBK179288/
        - systematic_only (bool, optional): If True, filter for only systematic review articles.
        - review_only (bool, optional): If True, filter for only systematic review or review articles.
        - additional_search_params (dict, optional): Additional search parameters to pass to the esearch API.
        - period_filter (1, 5, or 10, optional): Filter for articles published in the past 1, 5, or 10 years.
            Note: To filter by other periods, use `reldate` parameter as that is how the API works.
            
        Returns:


        API documentation: https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESearch
        Pubmed User Guide including tags for filtering results: https://pubmed.ncbi.nlm.nih.gov/help/
        """
        base_url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
        if self.api_key:
            base_url += f'&api_key={self.api_key}'
        response = {}
        results = pd.DataFrame()
        search_term = f'{re.sub(r"not", "", query)}'  # Remove 'not' since it will be treated as a boolean
        if query_tag:
            search_term += f'{query_tag}'
        if publication:
            search_term = f'AND {publication} [ta]'
        if systematic_only:
            search_term += ' AND systematic[sb]'
        elif review_only:
            search_term += ' AND (systematic[sb] OR review[pt])'
        if period_filter:
            search_term += f' AND y_{period_filter}[Filter]'
        params = {
            'db': 'pubmed',
            'term': search_term,
            'retmode': 'json',
            'datetype': 'edat',
        }
        if reldate:
            params['reldate'] = reldate
        if retmax:
            params['retmax'] = retmax
        if additional_search_params:
            params.update(additional_search_params)
        self.logger.info(f'Search term: {search_term}')
        messages = []
        try:
            self.iteration += 1
            response = requests.get(base_url, params=params)
            response_dict = response.json()
            id_list = response_dict['esearchresult']['idlist']
            messages.append(f'{len(id_list)} PMIDs found.')
            if verbose==True:
                messages.append(f'{id_list}')
            self.PMIDs_dict[self.iteration] = id_list
            self.responses_dict[self.iteration] = response_dict
            if ids_only==False:
                results = self.get_article_data_by_title()
            else:
                results = id_list
            self.logger.info('\n'.join(messages))
        except Exception as error:
            error_messages = []
            exc_type, exc_obj, tb = sys.exc_info()
            file = tb.tb_frame
            lineno = tb.tb_lineno
            filename = file.f_code.co_filename
            message = f'\tAn error occurred on line {lineno} in {filename}: {error}'
            error_messages.append(message)
            self.logger.error('\n'.join(error_messages))
        
        return results

    def get_article_data_by_title(self, iteration=None):
        result_df = pd.DataFrame()
        try:
            iteration = self.iteration if iteration == None else iteration
            record_strings_list = self.batch_retrieve_citation(iteration)
            self.record_strings_dict[iteration] = record_strings_list
            result_df = self.extract_pubmed_details_df(iteration)
            self.results_dict[iteration] = result_df
        except Exception as error:
            error_messages = []
            error_messages.append(f'Response: \n{self.PMIDs_dict.get(iteration)}')
            exc_type, exc_obj, tb = sys.exc_info()
            file = tb.tb_frame
            lineno = tb.tb_lineno
            filename = file.f_code.co_filename
            message = f'\tAn error occurred on line {lineno} in {filename}: {error}'
            error_messages.append(message)
            self.logger.error('\n'.join(error_messages))
        return result_df

    def batch_retrieve_citation(self, iteration):
        result_list = []
        messages = []
        messages.append(f'***Running `batch_retrieve_citation` with iteration {iteration}***')
        try:
            id_list = self.PMIDs_dict.get(iteration)
            if id_list:
                messages.append(f'Extracting {len(id_list)} PMIDs from iteration {iteration}.')
                for index, id in enumerate(id_list):
                    result_list.append(self.retrieve_citation(id).decode('utf-8'))
                    current_index, current_id = index+1, id
                    # Show progress 
                    indicator = '.'
                    if current_index % 10 == 0:
                        indicator+='|'
                    print(indicator, end='\n' if current_index%2==100 else '')
                messages.append("Processing complete.")
            else:
                self.logger.warning(f'No results found.')
        except Exception as error:
            messages.append(f'Response: \n{self.responses_dict.get(iteration)}')
            exc_type, exc_obj, tb = sys.exc_info()
            file = tb.tb_frame
            lineno = tb.tb_lineno
            filename = file.f_code.co_filename
            messages.append(f'\tAn error occurred on line {lineno} in {filename}: {error}')
            messages.append(f'Article {current_index} [{current_id}] not found.')
        self.logger.info('\n'.join(messages))
        return result_list

    def retrieve_citation(self, article_id):
        base_url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
        if self.api_key:
            base_url += f'&api_key={self.api_key}'
        params = {
            'db': 'pubmed',
            'id': article_id
        }
        response = requests.get(base_url, params=params)
        return response.content

    def extract_pubmed_details_df(self, iteration=None):
        """
        Extract the Pubmed article details for the given list of record strings for the given iteration.

        Returns:
        DataFrame of the Pubmed article details.
        """
        df = pd.DataFrame()
        self.logger.info('***Running `.extract_pubmed_details_df`***')
        record_strings = pd.Series(self.record_strings_dict.get(iteration if iteration else self.iteration))
        regex_dict = {
            'article_title': r'<ArticleTitle>(.*?)</ArticleTitle>',
            'pmid': r'<PMID.*?>(.*?)</PMID>',
            'journal': r'<Title>(.*?)</Title>',
            'volume': r'<Volume>(.*?)</Volume>',
            'issue': r'<Issue>(.*?)</Issue>',
            'year': r'<PubDate><Year>(\d{4})</Year>',
            'month': r'<PubDate>.*?<Month>(Aug)</Month>.*?</PubDate>',
            'start_page': r'<StartPage>(.*?)</StartPage>',
            'end_page': r'<EndPage>(.*?)</EndPage>',
            'doi': r'<ELocationID.*?EIdType="doi".*?>(.*?)</ELocationID>',
        }
        for column, regex in regex_dict.items():
            df[column] = record_strings.str.extract(regex)
        df['abstract'] = self.df_extractall(
            record_strings, parent_regex=r'<Abstract>(.*?)</Abstract>',
            regex = r'<AbstractText.*?(?: Label="(.*?)")?.*?>(.*?)</AbstractText>',
            logger=self.logger, sep=': ', join_strings=' '
        )
        df['mesh_headings'] = self.df_extractall(
            record_strings, 
            parent_regex=r'<MeshHeadingList>(.*?)</MeshHeadingList>',
            regex=r'<MeshHeading><DescriptorName.*?>(.*?)</DescriptorName>(<QualifierName.*?>.*?</QualifierName>)?</MeshHeading>',
            nested_regex=r'<QualifierName.*?>(.*?)</QualifierName>', logger=self.logger
        )
        df['authors'] = self.df_extractall(
            record_strings, sep=' ',
            regex=r'<Author ValidYN="Y".*?><LastName>(.*?)</LastName><ForeName>(.*?)</ForeName>',
            logger=self.logger 
        )
        df['keywords'] = self.df_extractall(
            record_strings, parent_regex=r'<KeywordList.*?>(.*?)</KeywordList>',
            regex=r'<Keyword.*?>(.*?)</Keyword>', 
            logger=self.logger
        )
        df['major_topics'] = self.df_extractall(
            record_strings, 
            regex=r'<[^>]*MajorTopicYN="Y"[^>]*>([^<]+)<\/[^>]+>', 
            logger=self.logger
        )
        df['publication_type'] = self.df_extractall(
            record_strings, parent_regex=r'<PublicationTypeList.*?>(.*?)</PublicationTypeList>',
            regex=r'<PublicationType.*?>(.*?)</PublicationType>', 
            logger=self.logger
        )
        columns = [
            'article_title',
            'abstract',
            'mesh_headings',
            'keywords',
            'major_topics',
            'pmid',
            'doi',
            'journal',
            'volume',
            'issue',
            'year',
            'month',
            'start_page',
            'end_page',
            'authors',
            'publication_type'
        ]
        return df[columns]

    def df_extractall(self, 
            series, regex, parent_regex=None, nested_regex=None, sep=[' ', ' / '], 
            join_strings=False, logger=None
            ):
        """
        Helper function called by `.search_article()` and `.get_article_data_by_title()` to parse 
        article metadata from PubMed database.

        Parameters:
        - series: pd.Series
        - regex: Regular expression to extract from the series.
        - parent_regex (optional): Regular expression from which to extract the `regex`.
            If None, `regex` will be extracted from the series.
        - nested_regex (optional): Regular expression that is nested within `regex` to extract.
        - sep (str or list; optional): String or list of strings used to separate multiple capture groups.
            If it is a list, then the first value is used to separate the main capture groups. 
            The second value is used to separate the nested capture groups. If the nested regex 
            has multiple capture groups, then the last value is used to separate them.
        - join_strings (optional): Boolean indicating whether to join the extracted values.
        - logger (optional): Instance of Custom_Logger class.

        Returns:
        - pd.Series with the extracted values.
        """
        messages = []
        messages.append(f'***Running `df_extractall` with regex {regex}***')
        if parent_regex:
            messages.append(f'\tparent_regex: {parent_regex}')
        if nested_regex:
            messages.append(f'\tnested_regex: {nested_regex}')
        if parent_regex:
            extracted = series.str.extract(parent_regex, expand=False)
            series = extracted
        extracted = series.str.extractall(regex).replace({np.nan: ''})
        if extracted.shape[1] >= 1:
            joined_values = extracted[0]
        else:
            messages.warning('No matches found.')
            return series
        if extracted.shape[1] > 1:
            extracted.index.names = [f'{name if name else "index"}{index if index !=0 else ""}' for index, name in enumerate(extracted.index.names)]
            for i in range(1, extracted.shape[1]):
                if nested_regex:
                    matches = extracted[i].str.extractall(nested_regex)#.replace({np.nan: ''})
                    messages.append(f'Number of nested capture groups: {matches.shape[1]}')
                    matches.columns = [f'nested_text{column}' for column in matches.columns]
                    regex_df = extracted.merge(
                        matches, how='left', left_index=True, right_index=True
                    ).replace({np.nan: ''})
                    nested_separator = sep if type(sep) == str else sep[1]
                    if i == 1:
                        root_column = 0 
                        capture_group_separator = nested_separator
                    else:
                        root_column = 'Text'
                        capture_group_separator = sep if type(sep) == str else sep[-1]
                    regex_df = concat_columns(
                        regex_df, [root_column, 'nested_text0'], 'Text', 
                        sep=capture_group_separator
                    )
                    joined_values = regex_df['Text']

                else:
                    separator = sep if type(sep) == str else sep[0]
                    joined_values = joined_values + separator + extracted[i]
        new_series = joined_values.groupby(level=0).apply(lambda groupby: [match for match in groupby])
        if (type(join_strings) == str) | (join_strings == True):
            new_series = new_series.apply(lambda x: f'{join_strings if type(join_strings) == str else " "}'.join(x))
        self.logger.debug('\n'.join(messages))
        return new_series

    def extract_pubmed_details(self, record_string):
        """
        [Archived: Use `extract_pubmed_details_df` instead to perform regex operations on the entire dataframe.]
        Helper function called by `pubmed_details_by_title` to parse article metadata from PubMed database.
        """
        authors = re.findall(r'<Author ValidYN="Y".*?><LastName>(.*?)</LastName><ForeName>(.*?)</ForeName>', record_string)
        formatted_authors = ', '.join(['{} {}'.format(author[1], author[0]) for author in authors])

        # Extract publication year
        publication_year = re.search(r'<PubDate><Year>(\d{4})</Year>', record_string)
        publication_year = publication_year.group(1) if publication_year else ''
        publication_month = re.search(r'<PubDate>.*?<Month>(Aug)</Month>.*?</PubDate>', record_string)
        publication_month = publication_month.group(1) if publication_month else ''

        # Extract article title
        article_title = re.search(r'<ArticleTitle>(.*?)</ArticleTitle>', record_string)
        article_title = article_title.group(1) if article_title else ''

        # Extract journal title
        journal_title = re.search(r'<Title>(.*?)</Title>', record_string)
        journal_title = journal_title.group(1) if journal_title else ''

        # Extract journal volume
        journal_volume = re.search(r'<Volume>(.*?)</Volume>', record_string)
        journal_volume = journal_volume.group(1) if journal_volume else ''

        # Extract journal issue
        journal_issue = re.search(r'<Issue>(.*?)</Issue>', record_string)
        journal_issue = journal_issue.group(1) if journal_issue else ''

        # Extract start page
        start_page = re.search(r'<StartPage>(.*?)</StartPage>', record_string)
        start_page = start_page.group(1) if start_page else ''

        # Extract end page
        end_page = re.search(r'<EndPage>(.*?)</EndPage>', record_string)
        end_page = end_page.group(1) if end_page else ''

        # Extract ELocationID
        doi = re.search(r'<ELocationID.*?EIdType="doi".*?>(.*?)</ELocationID>', record_string)
        doi = doi.group(1) if doi else ''

        # Extract PMID
        pmid = re.search(r'<PMID.*?>(.*?)</PMID>', record_string)
        pmid = pmid.group(1) if pmid else ''

        abstract_matches = re.findall(r'(<AbstractText.*?>.*?</AbstractText>)', record_string)
        # self.logger.debug(f'Number of abstract sections: {len(abstract_matches)}')
        if len(abstract_matches) > 1:
            cleaned_abstract_sections = []
            for match in abstract_matches:
                clean_match = re.sub(r'<AbstractText.*?((?:Label=".*")?.*?>.*)</AbstractText>', r'\1', match)
                clean_match = re.sub(r'(?: Label="(.*?)")?.*?>(.*)', r'\1: \2', clean_match)
                cleaned_abstract_sections.append(clean_match)
                
            abstract = ''.join([f'{group}<br>' for group in cleaned_abstract_sections])
        else:
            abstract = re.sub(r'<AbstractText.*?>(.*?)</AbstractText>', r'\1', abstract_matches[0])  if abstract_matches else ''
            
        # Extract MeshHeadingList
        MeshHeadingList = re.search(r'<MeshHeadingList>(.*?)</MeshHeadingList>', record_string)
        MeshHeadingList = MeshHeadingList.group(1) if MeshHeadingList else ''

        # Estract MeshHeading text and any QualifierName
        mesh_headings = []
        pattern = r'<MeshHeading><DescriptorName.*?>(.*?)</DescriptorName>(<QualifierName.*?>.*?</QualifierName>)?</MeshHeading>'
        matches = re.findall(pattern, MeshHeadingList)
        for match in matches:
            heading = match[0]
            if match[1]: # Estract Mesh QualifierName                
                MeshQualifiers = re.findall(
                    r'<QualifierName.*?>(.*?)</QualifierName>', match[1]
                    )
                print(f'mesh qualifiers: {MeshQualifiers}')
                for qualifier in MeshQualifiers:
                    heading = f"{match[0]} / {qualifier}"
                    mesh_headings.append(heading)
            else:
                mesh_headings.append(heading)

        # Extract keyword
        Keyword_List = re.search(r'<KeywordList.*?>(.*?)</KeywordList>', record_string)
        Keyword_List = Keyword_List.group(1) if Keyword_List else ''
        Keywords = re.findall(
            r'<Keyword.*?>(.*?)</Keyword>', Keyword_List
            )
        # Extract MajorTopic text
        MajorTopics = re.findall(
            r'<[^>]*MajorTopicYN="Y"[^>]*>([^<]+)<\/[^>]+>', record_string
            )
        # Extract Publication Type
        PublicationTypeList = re.search(r'<PublicationTypeList.*?>(.*?)</PublicationTypeList>', record_string)
        PublicationTypeList = PublicationTypeList.group(1) if PublicationTypeList else ''
        PublicationType = re.findall(
            r'<PublicationType.*?>(.*?)</PublicationType>', PublicationTypeList
            )
        return {
            'pubmed_title': article_title,
            'abstract': abstract,
            'journal': journal_title,
            'authors': formatted_authors,
            'year': publication_year,
            'month': publication_month,
            'pub_volume': journal_volume,
            'pub_issue': journal_issue,
            'start_page': start_page,
            'end_page': end_page,
            'doi': doi,
            'pmid': pmid,
            'mesh_headings': mesh_headings,
            'keywords': Keywords,
            'major_topics': MajorTopics,
            'publication_type': PublicationType
        }


iteration = 2.5
query = 'exercise'
retmax = 15
result_dict[iteration] = Pubmed_API()
df = result_dict[iteration].search_article(query, retmax=retmax, ids_only=False)


2024-04-06 23:20:32,262 - Pubmed_API - INFO:
Search term: exercise



..........|....

2024-04-06 23:22:01,738 - Pubmed_API - INFO:
***Running `batch_retrieve_citation` with iteration 1***
Extracting 15 PMIDs from iteration 1.
Processing complete.

2024-04-06 23:22:01,741 - Pubmed_API - INFO:
***Running `.extract_pubmed_details_df`***

2024-04-06 23:22:01,873 - Pubmed_API - INFO:
15 PMIDs found.
['38581603', '38581560', '38581554', '38581479', '38581449', '38581398', '38581338', '38581216', '38581070', '38581046', '38580947', '38580913', '38580840', '38580835', '38580668']



.

In [35]:
result_dict[iteration].results_dict[1].replace({np.nan: None}).to_dict(orient='records')

[{'article_title': "Exploring home rehabilitation therapists' experiences of supporting older persons to physical exercise after acute hospitalization: a qualitative interview study.",
  'abstract': "PURPOSE: After hospitalization, older persons may face a decline in physical function and daily independence. In-hospital exercise interventions can mitigate this decline, and continued support from primary healthcare post-discharge may enhance sustainability. This study aimed to explore home rehabilitation therapists' experiences of supporting physical exercise after acute hospitalization, including exercise programs initiated during hospital stay. METHODS: This qualitative study was conducted alongside a randomized-controlled trial to investigate prerequisites for a transitional care intervention. Twelve interviews were conducted with physiotherapists, occupational therapists, and managers across seven rehabilitation therapy services in Stockholm, Sweden. Data were analyzed using reflexi

# 2.6

In [47]:
class Pubmed_API:
    def __init__(self, api_key=os.getenv('api_ncbi'), logger=None, logging_level=logging.INFO):
        """
        Parameters:
        - api_key (str): NCBI API key
        ---
        # Example usage

        result_dict = dict()
        iteration = 1
        query = 'query string'
        result_dict[iteration] = Pubmed_API()

        ## Option 1

        2 steps: Get list of PMIDs first, then get the article data.

        ids_list = result_dict[iteration].search_article(query, retmax=5, ids_only=True)
        df = result_dict[iteration].get_article_data_by_title()

        ## Option 2

        Get the PMIDs and then the article data in one step.

        df = result_dict[iteration].search_article(query, retmax=5, ids_only=False)

        """
        self.api_key = api_key
        self.logger = create_function_logger('Pubmed_API', logger, level=logging_level)
        self.iteration = 0
        self.responses_dict = {}
        self.results_dict = {}
        self.PMIDs_dict = {}
        self.record_strings_dict = {}

    def search_article(self, query, query_tag=None, publication=None, reldate=None, retmax=None,
        systematic_only=False, review_only=False, period_filter=None,
        additional_search_params=None, ids_only=False, 
        verbose=True
        ):
        """
        Search for article title in PubMed database.

        Parameters:
        - query (str): Pubmed search query.
        - reldate (int, optional): The search returns only those items that have a date specified by datetype within the last n days.
        - query_tag (str, optional): Query tag to append to the search query.
        - publication (str, optional): Publication name.
        - retmax (int, optional): Maximum number of results to return. 
            If None, default is 20. API returns a maximum of 9999 results. To get more results for Pubmed,
            need to use the command line: https://www.ncbi.nlm.nih.gov/books/NBK179288/
        - systematic_only (bool, optional): If True, filter for only systematic review articles.
        - review_only (bool, optional): If True, filter for only systematic review or review articles.
        - additional_search_params (dict, optional): Additional search parameters to pass to the esearch API.
        - period_filter (1, 5, or 10, optional): Filter for articles published in the past 1, 5, or 10 years.
            Note: To filter by other periods, use `reldate` parameter as that is how the API works.
            
        Returns:

        API documentation: https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESearch
        Pubmed User Guide including tags for filtering results: https://pubmed.ncbi.nlm.nih.gov/help/
        """
        base_url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
        if self.api_key:
            base_url += f'&api_key={self.api_key}'
        response = {}
        results = pd.DataFrame()
        search_term = f'{re.sub(r"not", "", query)}'  # Remove 'not' since it will be treated as a boolean
        if query_tag:
            search_term += f'{query_tag}'
        if publication:
            search_term = f'AND {publication} [ta]'
        if systematic_only:
            search_term += ' AND systematic[sb]'
        elif review_only:
            search_term += ' AND (systematic[sb] OR review[pt])'
        if period_filter:
            search_term += f' AND y_{period_filter}[Filter]'
        params = {
            'db': 'pubmed',
            'term': search_term,
            'retmode': 'json',
            'datetype': 'edat',
        }
        if reldate:
            params['reldate'] = reldate
        if retmax:
            params['retmax'] = retmax
        if additional_search_params:
            params.update(additional_search_params)
        self.logger.info(f'Search term: {search_term}')
        messages = []
        try:
            self.iteration += 1
            response = requests.get(base_url, params=params)
            response_dict = response.json()
            id_list = response_dict['esearchresult']['idlist']
            messages.append(f'{len(id_list)} PMIDs found.')
            if verbose==True:
                messages.append(f'{id_list}')
            self.PMIDs_dict[self.iteration] = id_list
            self.responses_dict[self.iteration] = response_dict
            if ids_only==False:
                results = self.get_article_data_by_title()
            else:
                results = id_list
            self.logger.info('\n'.join(messages))
        except Exception as error:
            error_messages = []
            exc_type, exc_obj, tb = sys.exc_info()
            file = tb.tb_frame
            lineno = tb.tb_lineno
            filename = file.f_code.co_filename
            message = f'\tAn error occurred on line {lineno} in {filename}: {error}'
            error_messages.append(message)
            self.logger.error('\n'.join(error_messages))
        return results

    def get_article_data_by_title(self, iteration=None, orient='records'):
        result_df = pd.DataFrame()
        try:
            iteration = self.iteration if iteration == None else iteration
            record_strings_list = self.batch_retrieve_citation(iteration)
            self.record_strings_dict[iteration] = record_strings_list
            result_df = self.extract_pubmed_details_df(iteration)
            self.results_dict[iteration] = result_df.to_dict(orient=orient)
        except Exception as error:
            error_messages = []
            error_messages.append(f'Response: \n{self.PMIDs_dict.get(iteration)}')
            exc_type, exc_obj, tb = sys.exc_info()
            file = tb.tb_frame
            lineno = tb.tb_lineno
            filename = file.f_code.co_filename
            message = f'\tAn error occurred on line {lineno} in {filename}: {error}'
            error_messages.append(message)
            self.logger.error('\n'.join(error_messages))
        return result_df

    def batch_retrieve_citation(self, iteration):
        result_list = []
        messages = []
        messages.append(f'***Running `batch_retrieve_citation` with iteration {iteration}***')
        try:
            id_list = self.PMIDs_dict.get(iteration)
            if id_list:
                messages.append(f'Extracting {len(id_list)} PMIDs from iteration {iteration}.')
                for index, id in enumerate(id_list):
                    result_list.append(self.retrieve_citation(id).decode('utf-8'))
                    current_index, current_id = index+1, id
                    # Show progress 
                    indicator = '.'
                    if current_index % 10 == 0:
                        indicator+='|'
                    print(indicator, end='\n' if current_index%2==100 else '')
                messages.append("Processing complete.")
            else:
                self.logger.warning(f'No results found.')
        except Exception as error:
            messages.append(f'Response: \n{self.responses_dict.get(iteration)}')
            exc_type, exc_obj, tb = sys.exc_info()
            file = tb.tb_frame
            lineno = tb.tb_lineno
            filename = file.f_code.co_filename
            messages.append(f'\tAn error occurred on line {lineno} in {filename}: {error}')
            messages.append(f'Article {current_index} [{current_id}] not found.')
        self.logger.info('\n'.join(messages))
        return result_list

    def retrieve_citation(self, article_id):
        base_url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
        if self.api_key:
            base_url += f'&api_key={self.api_key}'
        params = {
            'db': 'pubmed',
            'id': article_id
        }
        response = requests.get(base_url, params=params)
        return response.content

    def extract_pubmed_details_df(self, iteration=None):
        """
        Extract the Pubmed article details for the given list of record strings for the given iteration.

        Returns:
        DataFrame of the Pubmed article details.
        """
        df = pd.DataFrame()
        self.logger.info('***Running `.extract_pubmed_details_df`***')
        record_strings = pd.Series(self.record_strings_dict.get(iteration if iteration else self.iteration))
        regex_dict = {
            'article_title': r'<ArticleTitle>(.*?)</ArticleTitle>',
            'pmid': r'<PMID.*?>(.*?)</PMID>',
            'journal': r'<Title>(.*?)</Title>',
            'volume': r'<Volume>(.*?)</Volume>',
            'issue': r'<Issue>(.*?)</Issue>',
            'year': r'<PubDate><Year>(\d{4})</Year>',
            'month': r'<PubDate>.*?<Month>(Aug)</Month>.*?</PubDate>',
            'start_page': r'<StartPage>(.*?)</StartPage>',
            'end_page': r'<EndPage>(.*?)</EndPage>',
            'doi': r'<ELocationID.*?EIdType="doi".*?>(.*?)</ELocationID>',
        }
        for column, regex in regex_dict.items():
            df[column] = record_strings.str.extract(regex)
        df['abstract'] = self.df_extractall(
            record_strings, parent_regex=r'<Abstract>(.*?)</Abstract>',
            regex = r'<AbstractText.*?(?: Label="(.*?)")?.*?>(.*?)</AbstractText>',
            logger=self.logger, sep=': ', join_strings=' '
        )
        df['mesh_headings'] = self.df_extractall(
            record_strings, 
            parent_regex=r'<MeshHeadingList>(.*?)</MeshHeadingList>',
            regex=r'<MeshHeading><DescriptorName.*?>(.*?)</DescriptorName>(<QualifierName.*?>.*?</QualifierName>)?</MeshHeading>',
            nested_regex=r'<QualifierName.*?>(.*?)</QualifierName>', logger=self.logger
        )
        df['authors'] = self.df_extractall(
            record_strings, sep=' ',
            regex=r'<Author ValidYN="Y".*?><LastName>(.*?)</LastName><ForeName>(.*?)</ForeName>',
            logger=self.logger 
        )
        df['keywords'] = self.df_extractall(
            record_strings, parent_regex=r'<KeywordList.*?>(.*?)</KeywordList>',
            regex=r'<Keyword.*?>(.*?)</Keyword>', 
            logger=self.logger
        )
        df['major_topics'] = self.df_extractall(
            record_strings, 
            regex=r'<[^>]*MajorTopicYN="Y"[^>]*>([^<]+)<\/[^>]+>', 
            logger=self.logger
        )
        df['publication_type'] = self.df_extractall(
            record_strings, parent_regex=r'<PublicationTypeList.*?>(.*?)</PublicationTypeList>',
            regex=r'<PublicationType.*?>(.*?)</PublicationType>', 
            logger=self.logger
        )
        columns = [
            'article_title',
            'abstract',
            'mesh_headings',
            'keywords',
            'major_topics',
            'pmid',
            'doi',
            'journal',
            'volume',
            'issue',
            'year',
            'month',
            'start_page',
            'end_page',
            'authors',
            'publication_type'
        ]
        df = df[columns].replace({np.nan: None})
        return df

    def df_extractall(self, 
            series, regex, parent_regex=None, nested_regex=None, sep=[' ', ' / '], 
            join_strings=False, logger=None
            ):
        """
        Helper function called by `.search_article()` and `.get_article_data_by_title()` to parse 
        article metadata from PubMed database.

        Parameters:
        - series: pd.Series
        - regex: Regular expression to extract from the series.
        - parent_regex (optional): Regular expression from which to extract the `regex`.
            If None, `regex` will be extracted from the series.
        - nested_regex (optional): Regular expression that is nested within `regex` to extract.
        - sep (str or list; optional): String or list of strings used to separate multiple capture groups.
            If it is a list, then the first value is used to separate the main capture groups. 
            The second value is used to separate the nested capture groups. If the nested regex 
            has multiple capture groups, then the last value is used to separate them.
        - join_strings (optional): Boolean indicating whether to join the extracted values.
        - logger (optional): Instance of Custom_Logger class.

        Returns:
        - pd.Series with the extracted values.
        """
        messages = []
        messages.append(f'***Running `df_extractall` with regex {regex}***')
        if parent_regex:
            messages.append(f'\tparent_regex: {parent_regex}')
        if nested_regex:
            messages.append(f'\tnested_regex: {nested_regex}')
        if parent_regex:
            extracted = series.str.extract(parent_regex, expand=False)
            series = extracted
        extracted = series.str.extractall(regex).replace({np.nan: ''})
        if extracted.shape[1] >= 1:
            joined_values = extracted[0]
        else:
            messages.warning('No matches found.')
            return series
        if extracted.shape[1] > 1:
            extracted.index.names = [f'{name if name else "index"}{index if index !=0 else ""}' for index, name in enumerate(extracted.index.names)]
            for i in range(1, extracted.shape[1]):
                if nested_regex:
                    matches = extracted[i].str.extractall(nested_regex)#.replace({np.nan: ''})
                    messages.append(f'Number of nested capture groups: {matches.shape[1]}')
                    matches.columns = [f'nested_text{column}' for column in matches.columns]
                    regex_df = extracted.merge(
                        matches, how='left', left_index=True, right_index=True
                    ).replace({np.nan: ''})
                    nested_separator = sep if type(sep) == str else sep[1]
                    if i == 1:
                        root_column = 0 
                        capture_group_separator = nested_separator
                    else:
                        root_column = 'Text'
                        capture_group_separator = sep if type(sep) == str else sep[-1]
                    regex_df = concat_columns(
                        regex_df, [root_column, 'nested_text0'], 'Text', 
                        sep=capture_group_separator
                    )
                    joined_values = regex_df['Text']

                else:
                    separator = sep if type(sep) == str else sep[0]
                    joined_values = joined_values + separator + extracted[i]
        new_series = joined_values.groupby(level=0).apply(lambda groupby: [match for match in groupby])
        if (type(join_strings) == str) | (join_strings == True):
            new_series = new_series.apply(lambda x: f'{join_strings if type(join_strings) == str else " "}'.join(x))
        self.logger.debug('\n'.join(messages))
        return new_series

    def extract_pubmed_details(self, record_string):
        """
        [Archived: Use `extract_pubmed_details_df` instead to perform regex operations on the entire dataframe.]
        Helper function called by `pubmed_details_by_title` to parse article metadata from PubMed database.
        """
        authors = re.findall(r'<Author ValidYN="Y".*?><LastName>(.*?)</LastName><ForeName>(.*?)</ForeName>', record_string)
        formatted_authors = ', '.join(['{} {}'.format(author[1], author[0]) for author in authors])

        # Extract publication year
        publication_year = re.search(r'<PubDate><Year>(\d{4})</Year>', record_string)
        publication_year = publication_year.group(1) if publication_year else ''
        publication_month = re.search(r'<PubDate>.*?<Month>(Aug)</Month>.*?</PubDate>', record_string)
        publication_month = publication_month.group(1) if publication_month else ''

        # Extract article title
        article_title = re.search(r'<ArticleTitle>(.*?)</ArticleTitle>', record_string)
        article_title = article_title.group(1) if article_title else ''

        # Extract journal title
        journal_title = re.search(r'<Title>(.*?)</Title>', record_string)
        journal_title = journal_title.group(1) if journal_title else ''

        # Extract journal volume
        journal_volume = re.search(r'<Volume>(.*?)</Volume>', record_string)
        journal_volume = journal_volume.group(1) if journal_volume else ''

        # Extract journal issue
        journal_issue = re.search(r'<Issue>(.*?)</Issue>', record_string)
        journal_issue = journal_issue.group(1) if journal_issue else ''

        # Extract start page
        start_page = re.search(r'<StartPage>(.*?)</StartPage>', record_string)
        start_page = start_page.group(1) if start_page else ''

        # Extract end page
        end_page = re.search(r'<EndPage>(.*?)</EndPage>', record_string)
        end_page = end_page.group(1) if end_page else ''

        # Extract ELocationID
        doi = re.search(r'<ELocationID.*?EIdType="doi".*?>(.*?)</ELocationID>', record_string)
        doi = doi.group(1) if doi else ''

        # Extract PMID
        pmid = re.search(r'<PMID.*?>(.*?)</PMID>', record_string)
        pmid = pmid.group(1) if pmid else ''

        abstract_matches = re.findall(r'(<AbstractText.*?>.*?</AbstractText>)', record_string)
        # self.logger.debug(f'Number of abstract sections: {len(abstract_matches)}')
        if len(abstract_matches) > 1:
            cleaned_abstract_sections = []
            for match in abstract_matches:
                clean_match = re.sub(r'<AbstractText.*?((?:Label=".*")?.*?>.*)</AbstractText>', r'\1', match)
                clean_match = re.sub(r'(?: Label="(.*?)")?.*?>(.*)', r'\1: \2', clean_match)
                cleaned_abstract_sections.append(clean_match)
                
            abstract = ''.join([f'{group}<br>' for group in cleaned_abstract_sections])
        else:
            abstract = re.sub(r'<AbstractText.*?>(.*?)</AbstractText>', r'\1', abstract_matches[0])  if abstract_matches else ''
            
        # Extract MeshHeadingList
        MeshHeadingList = re.search(r'<MeshHeadingList>(.*?)</MeshHeadingList>', record_string)
        MeshHeadingList = MeshHeadingList.group(1) if MeshHeadingList else ''

        # Estract MeshHeading text and any QualifierName
        mesh_headings = []
        pattern = r'<MeshHeading><DescriptorName.*?>(.*?)</DescriptorName>(<QualifierName.*?>.*?</QualifierName>)?</MeshHeading>'
        matches = re.findall(pattern, MeshHeadingList)
        for match in matches:
            heading = match[0]
            if match[1]: # Estract Mesh QualifierName                
                MeshQualifiers = re.findall(
                    r'<QualifierName.*?>(.*?)</QualifierName>', match[1]
                    )
                print(f'mesh qualifiers: {MeshQualifiers}')
                for qualifier in MeshQualifiers:
                    heading = f"{match[0]} / {qualifier}"
                    mesh_headings.append(heading)
            else:
                mesh_headings.append(heading)

        # Extract keyword
        Keyword_List = re.search(r'<KeywordList.*?>(.*?)</KeywordList>', record_string)
        Keyword_List = Keyword_List.group(1) if Keyword_List else ''
        Keywords = re.findall(
            r'<Keyword.*?>(.*?)</Keyword>', Keyword_List
            )
        # Extract MajorTopic text
        MajorTopics = re.findall(
            r'<[^>]*MajorTopicYN="Y"[^>]*>([^<]+)<\/[^>]+>', record_string
            )
        # Extract Publication Type
        PublicationTypeList = re.search(r'<PublicationTypeList.*?>(.*?)</PublicationTypeList>', record_string)
        PublicationTypeList = PublicationTypeList.group(1) if PublicationTypeList else ''
        PublicationType = re.findall(
            r'<PublicationType.*?>(.*?)</PublicationType>', PublicationTypeList
            )
        return {
            'pubmed_title': article_title,
            'abstract': abstract,
            'journal': journal_title,
            'authors': formatted_authors,
            'year': publication_year,
            'month': publication_month,
            'pub_volume': journal_volume,
            'pub_issue': journal_issue,
            'start_page': start_page,
            'end_page': end_page,
            'doi': doi,
            'pmid': pmid,
            'mesh_headings': mesh_headings,
            'keywords': Keywords,
            'major_topics': MajorTopics,
            'publication_type': PublicationType
        }


iteration = 2.6
query = 'exercise'
retmax = 15
result_dict[iteration] = Pubmed_API()
df = result_dict[iteration].search_article(query, retmax=retmax, ids_only=False)


2024-04-06 23:50:59,076 - Pubmed_API - INFO:
Search term: exercise



..........|....

2024-04-06 23:52:28,902 - Pubmed_API - INFO:
***Running `batch_retrieve_citation` with iteration 1***
Extracting 15 PMIDs from iteration 1.
Processing complete.

2024-04-06 23:52:28,904 - Pubmed_API - INFO:
***Running `.extract_pubmed_details_df`***

2024-04-06 23:52:29,042 - Pubmed_API - INFO:
15 PMIDs found.
['38581603', '38581560', '38581554', '38581479', '38581449', '38581398', '38581338', '38581216', '38581070', '38581046', '38580947', '38580913', '38580840', '38580835', '38580668']



.

In [48]:
result_dict[iteration].results_dict[1]

[{'article_title': "Exploring home rehabilitation therapists' experiences of supporting older persons to physical exercise after acute hospitalization: a qualitative interview study.",
  'abstract': "PURPOSE: After hospitalization, older persons may face a decline in physical function and daily independence. In-hospital exercise interventions can mitigate this decline, and continued support from primary healthcare post-discharge may enhance sustainability. This study aimed to explore home rehabilitation therapists' experiences of supporting physical exercise after acute hospitalization, including exercise programs initiated during hospital stay. METHODS: This qualitative study was conducted alongside a randomized-controlled trial to investigate prerequisites for a transitional care intervention. Twelve interviews were conducted with physiotherapists, occupational therapists, and managers across seven rehabilitation therapy services in Stockholm, Sweden. Data were analyzed using reflexi

### save to json

In [56]:
filename = 'pubmed_results'
path = '../data'
save_to_json(
    result_dict[2.6].results_dict[1], filename=filename, description=None, path=path, append_version=True
)

Object saved as JSON: ../data/pubmed_results_2024-04-06_235718.json


In [57]:
def save_to_json(obj, filename=None, append_version=False,
    path=r'C:\Users\silvh\OneDrive\lighthouse\Ginkgo coding\content-summarization\output\json'
    ):
    """
    Save Python object as a JSON file.
    Parameters:
    - obj: Python object to be saved.
    - filename: Root of the filename.
    - path (raw string): Use the format r'<path>'. If None, file is saved in same directory as script.
    - append_version (bool): If true, append date and time to end of filename.
    """
    if filename == None:
        filename = f'outputs_{datetime.now().strftime("%Y-%m-%d_%H%M")}'
        append_version = False
    if path:
        path = f'{path}/'.replace('\\','/')
    if append_version:
        filename += f'_{datetime.now().strftime("%Y-%m-%d_%H%M%S")}'
    filename += '.json'
    with open(path+filename, 'w') as f:
        json.dump(obj, f, indent=4)
    print(f'Object saved as JSON: {path}{filename}')

save_to_json(
    result_dict[2.6].results_dict[1], filename=filename, path=path, append_version=True
)

Object saved as JSON: ../data/pubmed_results_2024-04-07_001115.json


In [50]:
iteration = 2.61
query = 'exercise'
retmax = 2
result_dict[iteration] = Pubmed_API()
ids_list = result_dict[iteration].search_article(
    query, retmax=retmax, 
    ids_only=True
    )

2024-04-06 23:52:56,918 - Pubmed_API - INFO:
Search term: exercise

2024-04-06 23:53:02,705 - Pubmed_API - INFO:
2 PMIDs found.
['38581603', '38581560']



In [51]:
df = result_dict[iteration].get_article_data_by_title()
df

.

2024-04-06 23:53:17,258 - Pubmed_API - INFO:
***Running `batch_retrieve_citation` with iteration 1***
Extracting 2 PMIDs from iteration 1.
Processing complete.

2024-04-06 23:53:17,262 - Pubmed_API - INFO:
***Running `.extract_pubmed_details_df`***



.

Unnamed: 0,article_title,abstract,mesh_headings,keywords,major_topics,pmid,doi,journal,volume,issue,year,month,start_page,end_page,authors,publication_type
0,Exploring home rehabilitation therapists' experiences of supporting older persons to physical ex...,"PURPOSE: After hospitalization, older persons may face a decline in physical function and daily ...",,"[Exercise, Older adults, Primary care, Qualitative study, Rehabilitation therapy, Transitional c...",,38581603,10.1007/s41999-024-00972-5,European geriatric medicine,,,2024,,,,"[Sandlund Christina, Sandberg Linda, Lindblom Sebastian, Frisendahl Nathalie, Bostr&#xf6;m Anne-...",[Journal Article]
1,Current awareness and use of transthoracic echocardiography in evaluation of valvular heart dise...,BACKGROUND: There are few reports on transthoracic echocardiography (TTE) for the evaluation of ...,,"[Aortic stenosis, Transthoracic echocardiography, Valvular heart disease]",,38581560,10.1007/s12574-024-00648-w,Journal of echocardiography,,,2024,,,,"[Usuku Hiroki, Yamamoto Eiichiro, Oike Fumi, Yoshida Kenichi, Ogata Yuji, Kato Saori, Fukushige ...",[Journal Article]


# Haystack
https://haystack.deepset.ai/tutorials/39_embedding_metadata_for_improved_retrieval
https://colab.research.google.com/github/deepset-ai/haystack-tutorials/blob/main/tutorials/39_Embedding_Metadata_for_Improved_Retrieval.ipynb#scrollTo=J6GCkMgP6j6K

In [None]:
from haystack import Pipeline
from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
from haystack.components.writers import DocumentWriter
from haystack.document_stores.types import DuplicatePolicy
from haystack.utils import ComponentDevice

from haystack import Document
from haystack.document_stores.in_memory import InMemoryDocumentStore


def create_indexing_pipeline(document_store, metadata_fields_to_embed=None):
    document_cleaner = DocumentCleaner()
    document_splitter = DocumentSplitter(split_by="sentence", split_length=2)
    document_embedder = SentenceTransformersDocumentEmbedder(
        model="thenlper/gte-large", meta_fields_to_embed=metadata_fields_to_embed
    )
    document_writer = DocumentWriter(document_store=document_store, policy=DuplicatePolicy.OVERWRITE)

    indexing_pipeline = Pipeline()
    indexing_pipeline.add_component("cleaner", document_cleaner)
    indexing_pipeline.add_component("splitter", document_splitter)
    indexing_pipeline.add_component("embedder", document_embedder)
    indexing_pipeline.add_component("writer", document_writer)

    indexing_pipeline.connect("cleaner", "splitter")
    indexing_pipeline.connect("splitter", "embedder")
    indexing_pipeline.connect("embedder", "writer")

    return indexing_pipeline

# *End of Page*