# Title
[]()

In [1]:

import sys
sys.path.append('../src')
sys.path.append(r"/home/silvhua/custom_python")
from silvhua import *

In [27]:
# set the option to wrap text within cells
pd.set_option('display.max_colwidth', 500)
# pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 20)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

# Initialize

In [3]:
api_key = os.getenv('api_ncbi') # Pubmed API key
result_dict = dict()

# Iteration 1

In [4]:
import sys
sys.path.append(r"/home/silvhua/custom_python")
import os
import pandas as pd
import string
import re
import requests
# from article_processing import create_text_dict_from_folder
# from orm_summarize import *
api_key = os.getenv('api_ncbi') # Pubmed API key

import sys
import os
import requests
from Custom_Logger import *

class Pubmed_API:
    def __init__(self, api_key=os.getenv('api_ncbi'), logger=None, logging_level=logging.INFO):
        self.api_key = api_key
        self.logger = create_function_logger('Pubmed_API', logger, level=logging_level)
        self.iteration = 0
        self.responses_dict = {}
        self.results_dict = {}
        self.PMIDs_dict = {}
        self.record_strings_dict = {}

    def search_article(self, query, query_tag=None, publication=None, reldate=None, retmax=None,
        systematic_only=False, review_only=False, additional_search_params=None, ids_only=False, 
        verbose=True
        ):
        base_url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
        if self.api_key:
            base_url += f'&api_key={self.api_key}'
        response = {}
        results = pd.DataFrame()
        search_term = f'{re.sub(r"not", "", query)}'  # Remove 'not' since it will be treated as a boolean
        if query_tag:
            search_term += f'{query_tag}'
        if publication:
            search_term = f'AND {publication} [ta]'
        if systematic_only:
            search_term += ' AND systematic[sb]'
        elif review_only:
            search_term += ' AND (systematic[sb] OR review[pt])'
        params = {
            'db': 'pubmed',
            'term': search_term,
            'retmax': 5,
            'retmode': 'json',
            'datetype': 'edat',
        }
        if reldate:
            params['reldate'] = reldate
        if retmax:
            params['retmax'] = retmax
        if additional_search_params:
            params.update(additional_search_params)
        self.logger.info(f'Search term: {search_term}')
        messages = []
        try:
            self.iteration += 1
            response = requests.get(base_url, params=params)
            response_dict = response.json()
            id_list = response_dict['esearchresult']['idlist']
            messages.append(f'{len(id_list)} PMIDs found.')
            if verbose==True:
                messages.append(f'{id_list}')
            self.PMIDs_dict[self.iteration] = id_list
            self.responses_dict[self.iteration] = response_dict
            if ids_only==False:
                results = self.get_article_data_by_title()
            else:
                results = id_list
            self.logger.info('\n'.join(messages))
        except Exception as error:
            error_messages = []
            exc_type, exc_obj, tb = sys.exc_info()
            file = tb.tb_frame
            lineno = tb.tb_lineno
            filename = file.f_code.co_filename
            message = f'\tAn error occurred on line {lineno} in {filename}: {error}'
            error_messages.append(message)
            self.logger.error('\n'.join(error_messages))
        
        return results

    def get_article_data_by_title(self, iteration=None):
        result_df = pd.DataFrame()
        try:
            result_dict = {}
            iteration = self.iteration if iteration == None else iteration
            record_strings_list = self.batch_retrieve_citation(iteration)
            self.record_strings_dict[iteration] = record_strings_list
            for index, record_string in enumerate(record_strings_list):
                result_dict[index] = self.extract_pubmed_details(record_string)
            self.results_dict[iteration] = result_dict
            result_df = pd.DataFrame(result_dict).transpose()
        except Exception as error:
            error_messages = []
            error_messages.append(f'Response: \n{self.PMIDs_dict.get(iteration)}')
            exc_type, exc_obj, tb = sys.exc_info()
            file = tb.tb_frame
            lineno = tb.tb_lineno
            filename = file.f_code.co_filename
            message = f'\tAn error occurred on line {lineno} in {filename}: {error}'
            error_messages.append(message)
            self.logger.error('\n'.join(error_messages))
        return result_df

    def batch_retrieve_citation(self, iteration):
        result_list = []
        messages = []
        try:
            id_list = self.PMIDs_dict.get(iteration)
            if id_list:
                self.logger.info(f'Extracting these {len(id_list)} PMIDs: {id_list}')
                for index, id in enumerate(id_list):
                    result_list.append(self.retrieve_citation(id).decode('utf-8'))
                    current_index, current_id = index+1, id
            else:
                self.logger.warning(f'No results found.')
        except Exception as error:
            messages.append(f'Response: \n{self.responses_dict.get(iteration)}')
            exc_type, exc_obj, tb = sys.exc_info()
            file = tb.tb_frame
            lineno = tb.tb_lineno
            filename = file.f_code.co_filename
            messages.append(f'\tAn error occurred on line {lineno} in {filename}: {error}')
            messages.append(f'Article {current_index} [{current_id}] not found.')
        return result_list

    def retrieve_citation(self, article_id):
        base_url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
        if self.api_key:
            base_url += f'&api_key={self.api_key}'
        params = {
            'db': 'pubmed',
            'id': article_id
        }
        response = requests.get(base_url, params=params)
        return response.content

    def extract_pubmed_details(self, record_string):
        """
        Helper function called by `pubmed_details_by_title` to parse article metadata from PubMed database.
        """
        authors = re.findall(r'<Author ValidYN="Y".*?><LastName>(.*?)</LastName><ForeName>(.*?)</ForeName>', record_string)
        formatted_authors = ', '.join(['{} {}'.format(author[1], author[0]) for author in authors])

        # Extract publication year
        publication_year = re.search(r'<PubDate><Year>(\d{4})</Year>', record_string)
        publication_year = publication_year.group(1) if publication_year else ''
        publication_month = re.search(r'<PubDate>.*?<Month>(Aug)</Month>.*?</PubDate>', record_string)
        publication_month = publication_month.group(1) if publication_month else ''

        # Extract article title
        article_title = re.search(r'<ArticleTitle>(.*?)</ArticleTitle>', record_string)
        article_title = article_title.group(1) if article_title else ''

        # Extract journal title
        journal_title = re.search(r'<Title>(.*?)</Title>', record_string)
        journal_title = journal_title.group(1) if journal_title else ''

        # Extract journal volume
        journal_volume = re.search(r'<Volume>(.*?)</Volume>', record_string)
        journal_volume = journal_volume.group(1) if journal_volume else ''

        # Extract journal issue
        journal_issue = re.search(r'<Issue>(.*?)</Issue>', record_string)
        journal_issue = journal_issue.group(1) if journal_issue else ''

        # Extract start page
        start_page = re.search(r'<StartPage>(.*?)</StartPage>', record_string)
        start_page = start_page.group(1) if start_page else ''

        # Extract end page
        end_page = re.search(r'<EndPage>(.*?)</EndPage>', record_string)
        end_page = end_page.group(1) if end_page else ''

        # Extract ELocationID
        doi = re.search(r'<ELocationID.*?EIdType="doi".*?>(.*?)</ELocationID>', record_string)
        doi = doi.group(1) if doi else ''

        # Extract PMID
        pmid = re.search(r'<PMID.*?>(.*?)</PMID>', record_string)
        pmid = pmid.group(1) if pmid else ''

        abstract_matches = re.findall(r'(<AbstractText.*?>.*?</AbstractText>)', record_string)
        # self.logger.debug(f'Number of abstract sections: {len(abstract_matches)}')
        if len(abstract_matches) > 1:
            cleaned_abstract_sections = []
            for match in abstract_matches:
                clean_match = re.sub(r'<AbstractText.*?((?:Label=".*")?.*?>.*)</AbstractText>', r'\1', match)
                clean_match = re.sub(r'(?: Label="(.*?)")?.*?>(.*)', r'\1: \2', clean_match)
                cleaned_abstract_sections.append(clean_match)
                
            abstract = ''.join([f'{group}<br>' for group in cleaned_abstract_sections])
        else:
            abstract = re.sub(r'<AbstractText.*?>(.*?)</AbstractText>', r'\1', abstract_matches[0])  if abstract_matches else ''
            
        # Extract MeshHeadingList
        MeshHeadingList = re.search(r'<MeshHeadingList>(.*?)</MeshHeadingList>', record_string)
        MeshHeadingList = MeshHeadingList.group(1) if MeshHeadingList else ''

        # Estract MeshHeading text and any QualifierName
        mesh_headings = []
        pattern = r'<MeshHeading><DescriptorName.*?>(.*?)</DescriptorName>(<QualifierName.*?>.*?</QualifierName>)?</MeshHeading>'
        matches = re.findall(pattern, MeshHeadingList)
        for match in matches:
            heading = match[0]
            if match[1]: # Estract Mesh QualifierName                
                MeshQualifiers = re.findall(
                    r'<QualifierName.*?>(.*?)</QualifierName>', match[1]
                    )
                print(f'mesh qualifiers: {MeshQualifiers}')
                for qualifier in MeshQualifiers:
                    heading = f"{match[0]} / {qualifier}"
                    mesh_headings.append(heading)
            else:
                mesh_headings.append(heading)

        # Extract keyword
        Keyword_List = re.search(r'<KeywordList.*?>(.*?)</KeywordList>', record_string)
        Keyword_List = Keyword_List.group(1) if Keyword_List else ''
        Keywords = re.findall(
            r'<Keyword.*?>(.*?)</Keyword>', Keyword_List
            )
        # Extract MajorTopic text
        MajorTopics = re.findall(
            r'<[^>]*MajorTopicYN="Y"[^>]*>([^<]+)<\/[^>]+>', record_string
            )
        # Extract Publication Type
        PublicationTypeList = re.search(r'<PublicationTypeList.*?>(.*?)</PublicationTypeList>', record_string)
        PublicationTypeList = PublicationTypeList.group(1) if PublicationTypeList else ''
        PublicationType = re.findall(
            r'<PublicationType.*?>(.*?)</PublicationType>', PublicationTypeList
            )
        return {
            'pubmed_title': article_title,
            'abstract': abstract,
            'journal': journal_title,
            'authors': formatted_authors,
            'year': publication_year,
            'month': publication_month,
            'pub_volume': journal_volume,
            'pub_issue': journal_issue,
            'start_page': start_page,
            'end_page': end_page,
            'doi': doi,
            'pmid': pmid,
            'mesh_headings': mesh_headings,
            'keywords': Keywords,
            'major_topics': MajorTopics,
            'publication_type': PublicationType
        }
    
iteration = 4
# query = '("resistance train*"[All Fields]) AND ((y_10[Filter]) AND (meta-analysis[Filter] OR review[Filter] OR systematicreview[Filter]))'
# query = 'Factors associated with different types of hip fractures among elderly patients a tertiary hospital in Pahang: A retrospective cross-sectional study'
query = '38555895 OR 38568267'
result_dict[iteration] = Pubmed_API()
ids_list = result_dict[iteration].search_article(query, retmax=5, ids_only=True)
ids_list
# df = result_dict[iteration].get_article_data_by_title()
# df

2024-04-04 21:28:00,418 - Pubmed_API - INFO:
Search term: 38555895 OR 38568267

2024-04-04 21:28:01,364 - Pubmed_API - INFO:
2 PMIDs found.
['38568267', '38555895']



['38568267', '38555895']

In [5]:
df = result_dict[iteration].get_article_data_by_title()
df

2024-04-04 21:28:22,749 - Pubmed_API - INFO:
Extracting these 2 PMIDs: ['38568267', '38555895']



mesh qualifiers: ['epidemiology', 'complications']


Unnamed: 0,pubmed_title,abstract,journal,authors,year,month,pub_volume,pub_issue,start_page,end_page,doi,pmid,mesh_headings,keywords,major_topics,publication_type
0,Inspiratory Training for Improving Respiratory...,PURPOSE: To investigate the effects of inspira...,Pediatric physical therapy : the official publ...,"K&#xea;nia K P Menezes, Patrick R Avelino, Mar...",2024,,36,2,207,215,10.1097/PEP.0000000000001092,38568267,"[Child, Humans, Cerebral Palsy, Walking, Muscl...",[],"[Cerebral Palsy, Resistance Training]","[Systematic Review, Meta-Analysis, Journal Art..."
1,Factors associated with different types of hip...,"INTRODUCTION: Hip fractures, predominantly due...",The Medical journal of Malaysia,"R Mohd Yusoff, Z A Mulud, M Mohammadnezhad",2024,,79,Suppl 1,117,121,,38555895,"[Aged, Humans, Female, Retrospective Studies, ...",[],[Hip Fractures],[Journal Article]


In [37]:
import numpy as np
from table_mapping import concat_columns
def df_extractall(
        series, regex, parent_regex=None, nested_regex=None, sep=[' ', ' / '], 
        join_strings=False, logger=None
        ):
    logger = create_function_logger('df_extractall', logger)
    messages = []
    messages.append(f'***Running `df_extractall` with regex {regex}***')
    if parent_regex:
        messages.append(f'\tparent_regex: {parent_regex}')
    if nested_regex:
        messages.append(f'\tnested_regex: {nested_regex}')
    logger.info('\n'.join(messages))
    if parent_regex:
        extracted = series.str.extract(parent_regex, expand=False)
        series = extracted
    extracted = series.str.extractall(regex).replace({np.nan: ''})
    if extracted.shape[1] >= 1:
        joined_values = extracted[0]
    else:
        messages.warning('No matches found.')
        return series
    debug_messages = []
    debug_messages.append(f'Number of capture groups: {extracted.shape[1]}')
    if extracted.shape[1] > 1:
        extracted.index.names = [f'{name if name else "index"}{index if index !=0 else ""}' for index, name in enumerate(extracted.index.names)]
        for i in range(1, extracted.shape[1]):
            if nested_regex:
                matches = extracted[i].str.extractall(nested_regex)#.replace({np.nan: ''})
                debug_messages.append(f'Number of nested capture groups: {matches.shape[1]}')
                matches.columns = [f'nested_text{column}' for column in matches.columns]
                regex_df = extracted.merge(
                    matches, how='left', left_index=True, right_index=True
                ).replace({np.nan: ''})
                nested_separator = sep if type(sep) == str else sep[1]
                if i == 1:
                    root_column = 0 
                    capture_group_separator = nested_separator
                else:
                    root_column = 'Text'
                    capture_group_separator = sep if type(sep) == str else sep[-1]
                regex_df = concat_columns(
                    regex_df, [root_column, 'nested_text0'], 'Text', 
                    sep=capture_group_separator
                )
                joined_values = regex_df['Text']

            else:
                separator = sep if type(sep) == str else sep[0]
                joined_values = joined_values + separator + extracted[i]
    new_series = joined_values.groupby(level=0).apply(lambda groupby: [match for match in groupby])
    if (type(join_strings) == str) | (join_strings == True):
        new_series = new_series.apply(lambda x: f'{join_strings if type(join_strings) == str else " "}'.join(x))
    logger.debug('\n'.join(debug_messages))
    return new_series

def extract_pubmed_details_df(record_strings_list, logger=None, logging_level=10):
    df = pd.DataFrame()
    logger = create_function_logger('extract_pubmed_details_df', logger, level=logging_level)
    record_strings = pd.Series(record_strings_list)

    regex_dict = {
        'article_title': r'<ArticleTitle>(.*?)</ArticleTitle>',
        # 'pmid': r'<PMID.*?>(.*?)</PMID>',
        # 'journal': r'<Title>(.*?)</Title>',
        # 'volume': r'<Volume>(.*?)</Volume>',
        # 'issue': r'<Issue>(.*?)</Issue>',
        # 'year': r'<PubDate><Year>(\d{4})</Year>',
        # 'month': r'<PubDate>.*?<Month>(Aug)</Month>.*?</PubDate>',
        # 'start_page': r'<StartPage>(.*?)</StartPage>',
        # 'end_page': r'<EndPage>(.*?)</EndPage>',
        # 'doi': r'<ELocationID.*?EIdType="doi".*?>(.*?)</ELocationID>',

    }
    for column, regex in regex_dict.items():
        df[column] = record_strings.str.extract(regex)
    # df['mesh_headings'] = df_extractall(
    #     record_strings, 
    #     parent_regex=r'<MeshHeadingList>(.*?)</MeshHeadingList>',
    #     regex=r'<MeshHeading><DescriptorName.*?>(.*?)</DescriptorName>(<QualifierName.*?>.*?</QualifierName>)?</MeshHeading>',
    #     nested_regex=r'<QualifierName.*?>(.*?)</QualifierName>', logger=logger
    # )

    # df['authors'] = df_extractall(
    #     record_strings, sep=' ',
    #     regex=r'<Author ValidYN="Y".*?><LastName>(.*?)</LastName><ForeName>(.*?)</ForeName>',
    #     logger=logger 
    # )
    # df['abstract'] = df_extractall(
    #     record_strings,
    #     regex = r'(<AbstractText.*?>.*?</AbstractText>)',
    #     nested_regex=r'(?: Label="(.*?)")?.*?>(.*)', logger=logger, sep=['\n', ' ']
    # )
    df['abstract'] = df_extractall(
        record_strings, parent_regex=r'<Abstract>(.*?)</Abstract>',
        regex = r'<AbstractText.*?(?: Label="(.*?)")?.*?>(.*?)</AbstractText>',
        # nested_regex=r'(?: Label="(.*?)")?.*?>(.*)', 
        logger=logger, sep=': ', join_strings=' '
    )
    
    return df

record_strings_list = result_dict[iteration].record_strings_dict[1]
df = extract_pubmed_details_df(record_strings_list)
# df.loc[1]
df

2024-04-04 22:13:57,402 - extract_pubmed_details_df - DEBUG:
Found existing handlers: [<StreamHandler stderr (DEBUG)>]. Found existing console handler: <StreamHandler stderr (DEBUG)>. Setting console handler level to: 10. 

2024-04-04 22:13:57,405 - extract_pubmed_details_df - INFO:
***Running `df_extractall` with regex <AbstractText.*?(?: Label="(.*?)")?.*?>(.*?)</AbstractText>***
	parent_regex: <Abstract>(.*?)</Abstract>

2024-04-04 22:13:57,415 - extract_pubmed_details_df - DEBUG:
Number of capture groups: 2



Unnamed: 0,article_title,abstract
0,"Inspiratory Training for Improving Respiratory Strength, Pulmonary Function, and Walking in Cerebral Palsy: A Meta-Analysis.","PURPOSE: To investigate the effects of inspiratory strength training on respiratory muscle strength, pulmonary function, and walking capacity in children with cerebral palsy, with Gross Motor Function Classification System I to III. METHODS: Searches were conducted in CINAHL, LILACS, MEDLINE, and Physiotherapy Evidence Database (PEDro) databases. The outcomes of interest were respiratory muscle strength, pulmonary function, and walking capacity. The quality was assessed by PEDro Scale. The G..."
1,Factors associated with different types of hip fractures among elderly patients a tertiary hospital in Pahang: A retrospective cross-sectional study.,"INTRODUCTION: Hip fractures, predominantly due to decreased bone density and falls, significantly impact elderly health, disproportionately affecting women and placing a strain on healthcare resources. This study aims to conduct an indepth epidemiological analysis of hip fracture incidence among the elderly in Pahang, Malaysia, to inform better healthcare strategies. MATERIALS AND METHODS: In this retrospective study, medical records of patients admitted with hip fractures between 2019 and 2..."


# iteration 4

In [172]:
import sys
sys.path.append(r"/home/silvhua/custom_python")
import os
import pandas as pd
import string
import re
import requests
# from article_processing import create_text_dict_from_folder
# from orm_summarize import *
api_key = os.getenv('api_ncbi') # Pubmed API key

import sys
import os
import requests
from Custom_Logger import *

class Pubmed_API:
    def __init__(self, api_key=os.getenv('api_ncbi'), logger=None, logging_level=logging.INFO):
        self.api_key = api_key
        self.logger = create_function_logger('Pubmed_API', logger, level=logging_level)
        self.iteration = 0
        self.responses_dict = {}
        self.results_dict = {}
        self.PMIDs_dict = {}
        self.record_strings_dict = {}

    def search_article(self, query, query_tag=None, publication=None, reldate=None, retmax=None,
        systematic_only=False, review_only=False, additional_search_params=None, ids_only=False, 
        verbose=True
        ):
        base_url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
        if self.api_key:
            base_url += f'&api_key={self.api_key}'
        response = {}
        results = pd.DataFrame()
        search_term = f'{re.sub(r"not", "", query)}'  # Remove 'not' since it will be treated as a boolean
        if query_tag:
            search_term += f'{query_tag}'
        if publication:
            search_term = f'AND {publication} [ta]'
        if systematic_only:
            search_term += ' AND systematic[sb]'
        elif review_only:
            search_term += ' AND (systematic[sb] OR review[pt])'
        params = {
            'db': 'pubmed',
            'term': search_term,
            'retmax': 5,
            'retmode': 'json',
            'datetype': 'edat',
        }
        if reldate:
            params['reldate'] = reldate
        if retmax:
            params['retmax'] = retmax
        if additional_search_params:
            params.update(additional_search_params)
        self.logger.info(f'Search term: {search_term}')
        messages = []
        try:
            self.iteration += 1
            response = requests.get(base_url, params=params)
            response_dict = response.json()
            id_list = response_dict['esearchresult']['idlist']
            messages.append(f'{len(id_list)} PMIDs found.')
            if verbose==True:
                messages.append(f'{id_list}')
            self.PMIDs_dict[self.iteration] = id_list
            self.responses_dict[self.iteration] = response_dict
            if ids_only==False:
                results = self.get_article_data_by_title()
            else:
                results = id_list
            self.logger.info('\n'.join(messages))
        except Exception as error:
            error_messages = []
            exc_type, exc_obj, tb = sys.exc_info()
            file = tb.tb_frame
            lineno = tb.tb_lineno
            filename = file.f_code.co_filename
            message = f'\tAn error occurred on line {lineno} in {filename}: {error}'
            error_messages.append(message)
            self.logger.error('\n'.join(error_messages))
        
        return results

    def get_article_data_by_title(self, iteration=None):
        result_df = pd.DataFrame()
        try:
            result_dict = {}
            iteration = self.iteration if iteration == None else iteration
            record_strings_list = self.batch_retrieve_citation(iteration)
            self.record_strings_dict[iteration] = record_strings_list
            for index, record_string in enumerate(record_strings_list):
                result_dict[index] = self.extract_pubmed_details(record_string)
            self.results_dict[iteration] = result_dict
            result_df = pd.DataFrame(result_dict).transpose()
        except Exception as error:
            error_messages = []
            error_messages.append(f'Response: \n{self.PMIDs_dict.get(iteration)}')
            exc_type, exc_obj, tb = sys.exc_info()
            file = tb.tb_frame
            lineno = tb.tb_lineno
            filename = file.f_code.co_filename
            message = f'\tAn error occurred on line {lineno} in {filename}: {error}'
            error_messages.append(message)
            self.logger.error('\n'.join(error_messages))
        return result_df

    def batch_retrieve_citation(self, iteration):
        result_list = []
        messages = []
        try:
            id_list = self.PMIDs_dict.get(iteration)
            if id_list:
                self.logger.info(f'Extracting these {len(id_list)} PMIDs: {id_list}')
                for index, id in enumerate(id_list):
                    result_list.append(self.retrieve_citation(id).decode('utf-8'))
                    current_index, current_id = index+1, id
            else:
                self.logger.warning(f'No results found.')
        except Exception as error:
            messages.append(f'Response: \n{self.responses_dict.get(iteration)}')
            exc_type, exc_obj, tb = sys.exc_info()
            file = tb.tb_frame
            lineno = tb.tb_lineno
            filename = file.f_code.co_filename
            messages.append(f'\tAn error occurred on line {lineno} in {filename}: {error}')
            messages.append(f'Article {current_index} [{current_id}] not found.')
        return result_list

    def retrieve_citation(self, article_id):
        base_url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
        if self.api_key:
            base_url += f'&api_key={self.api_key}'
        params = {
            'db': 'pubmed',
            'id': article_id
        }
        response = requests.get(base_url, params=params)
        return response.content

    def extract_pubmed_details(self, record_string):
        """
        Helper function called by `pubmed_details_by_title` to parse article metadata from PubMed database.
        """
        authors = re.findall(r'<Author ValidYN="Y".*?><LastName>(.*?)</LastName><ForeName>(.*?)</ForeName>', record_string)
        formatted_authors = ', '.join(['{} {}'.format(author[1], author[0]) for author in authors])

        # Extract publication year
        publication_year = re.search(r'<PubDate><Year>(\d{4})</Year>', record_string)
        publication_year = publication_year.group(1) if publication_year else ''
        publication_month = re.search(r'<PubDate>.*?<Month>(Aug)</Month>.*?</PubDate>', record_string)
        publication_month = publication_month.group(1) if publication_month else ''

        # Extract article title
        article_title = re.search(r'<ArticleTitle>(.*?)</ArticleTitle>', record_string)
        article_title = article_title.group(1) if article_title else ''

        # Extract journal title
        journal_title = re.search(r'<Title>(.*?)</Title>', record_string)
        journal_title = journal_title.group(1) if journal_title else ''

        # Extract journal volume
        journal_volume = re.search(r'<Volume>(.*?)</Volume>', record_string)
        journal_volume = journal_volume.group(1) if journal_volume else ''

        # Extract journal issue
        journal_issue = re.search(r'<Issue>(.*?)</Issue>', record_string)
        journal_issue = journal_issue.group(1) if journal_issue else ''

        # Extract start page
        start_page = re.search(r'<StartPage>(.*?)</StartPage>', record_string)
        start_page = start_page.group(1) if start_page else ''

        # Extract end page
        end_page = re.search(r'<EndPage>(.*?)</EndPage>', record_string)
        end_page = end_page.group(1) if end_page else ''

        # Extract ELocationID
        doi = re.search(r'<ELocationID.*?EIdType="doi".*?>(.*?)</ELocationID>', record_string)
        doi = doi.group(1) if doi else ''

        # Extract PMID
        pmid = re.search(r'<PMID.*?>(.*?)</PMID>', record_string)
        pmid = pmid.group(1) if pmid else ''

        abstract_matches = re.findall(r'(<AbstractText.*?>.*?</AbstractText>)', record_string)
        # self.logger.debug(f'Number of abstract sections: {len(abstract_matches)}')
        if len(abstract_matches) > 1:
            cleaned_abstract_sections = []
            for match in abstract_matches:
                clean_match = re.sub(r'<AbstractText.*?((?:Label=".*")?.*?>.*)</AbstractText>', r'\1', match)
                clean_match = re.sub(r'(?: Label="(.*?)")?.*?>(.*)', r'\1: \2', clean_match)
                cleaned_abstract_sections.append(clean_match)
                
            abstract = ''.join([f'{group}<br>' for group in cleaned_abstract_sections])
        else:
            abstract = re.sub(r'<AbstractText.*?>(.*?)</AbstractText>', r'\1', abstract_matches[0])  if abstract_matches else ''
            
        # Extract MeshHeadingList
        MeshHeadingList = re.search(r'<MeshHeadingList>(.*?)</MeshHeadingList>', record_string)
        MeshHeadingList = MeshHeadingList.group(1) if MeshHeadingList else ''

        # Estract MeshHeading text and any QualifierName
        mesh_headings = []
        pattern = r'<MeshHeading><DescriptorName.*?>(.*?)</DescriptorName>(<QualifierName.*?>.*?</QualifierName>)?</MeshHeading>'
        matches = re.findall(pattern, MeshHeadingList)
        for match in matches:
            heading = match[0]
            if match[1]: # Estract Mesh QualifierName                
                MeshQualifiers = re.findall(
                    r'<QualifierName.*?>(.*?)</QualifierName>', match[1]
                    )
                print(f'mesh qualifiers: {MeshQualifiers}')
                for qualifier in MeshQualifiers:
                    heading = f"{match[0]} / {qualifier}"
                    mesh_headings.append(heading)
            else:
                mesh_headings.append(heading)

        # Extract keyword
        Keyword_List = re.search(r'<KeywordList.*?>(.*?)</KeywordList>', record_string)
        Keyword_List = Keyword_List.group(1) if Keyword_List else ''
        Keywords = re.findall(
            r'<Keyword.*?>(.*?)</Keyword>', Keyword_List
            )
        # Extract MajorTopic text
        MajorTopics = re.findall(
            r'<[^>]*MajorTopicYN="Y"[^>]*>([^<]+)<\/[^>]+>', record_string
            )
        # Extract Publication Type
        PublicationTypeList = re.search(r'<PublicationTypeList.*?>(.*?)</PublicationTypeList>', record_string)
        PublicationTypeList = PublicationTypeList.group(1) if PublicationTypeList else ''
        PublicationType = re.findall(
            r'<PublicationType.*?>(.*?)</PublicationType>', PublicationTypeList
            )
        return {
            'pubmed_title': article_title,
            'abstract': abstract,
            'journal': journal_title,
            'authors': formatted_authors,
            'year': publication_year,
            'month': publication_month,
            'pub_volume': journal_volume,
            'pub_issue': journal_issue,
            'start_page': start_page,
            'end_page': end_page,
            'doi': doi,
            'pmid': pmid,
            'mesh_headings': mesh_headings,
            'keywords': Keywords,
            'major_topics': MajorTopics,
            'publication_type': PublicationType
        }
    
iteration = 4
# query = '("resistance train*"[All Fields]) AND ((y_10[Filter]) AND (meta-analysis[Filter] OR review[Filter] OR systematicreview[Filter]))'
# query = 'Factors associated with different types of hip fractures among elderly patients a tertiary hospital in Pahang: A retrospective cross-sectional study'
query = '38555895 OR 38568267'
result_dict[iteration] = Pubmed_API()
ids_list = result_dict[iteration].search_article(query, retmax=5, ids_only=True)
df = result_dict[iteration].get_article_data_by_title()
df

2024-04-04 08:56:20,361 - Pubmed_API - INFO:
Search term: 38555895 OR 38568267



2024-04-04 08:56:21,906 - Pubmed_API - INFO:
2 PMIDs found.
['38568267', '38555895']

2024-04-04 08:56:21,933 - Pubmed_API - INFO:
Extracting these 2 PMIDs: ['38568267', '38555895']



mesh qualifiers: ['epidemiology', 'complications']


Unnamed: 0,pubmed_title,abstract,journal,authors,year,month,pub_volume,pub_issue,start_page,end_page,doi,pmid,mesh_headings,keywords,major_topics,publication_type
0,Inspiratory Training for Improving Respiratory...,PURPOSE: To investigate the effects of inspira...,Pediatric physical therapy : the official publ...,"K&#xea;nia K P Menezes, Patrick R Avelino, Mar...",2024,,36,2,207,215,10.1097/PEP.0000000000001092,38568267,"[Child, Humans, Cerebral Palsy, Walking, Muscl...",[],"[Cerebral Palsy, Resistance Training]","[Systematic Review, Meta-Analysis, Journal Art..."
1,Factors associated with different types of hip...,"INTRODUCTION: Hip fractures, predominantly due...",The Medical journal of Malaysia,"R Mohd Yusoff, Z A Mulud, M Mohammadnezhad",2024,,79,Suppl 1,117,121,,38555895,"[Aged, Humans, Female, Retrospective Studies, ...",[],[Hip Fractures],[Journal Article]


# *End of Page*