<a href="https://colab.research.google.com/github/sagihuja/projects/blob/main/SH_pythonproj.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import requests
import re
from bs4 import BeautifulSoup as bs

In [136]:

def get_article_url(article_id, page=1):
    """
    Build the PubMed "cited by" URL for an article.

    Parameters:
        article_id: the PubMed id (PMID) of the article; it is interpolated
            into the URL as-is, so an int or a str both work.
        page: 1-based results page to request. Defaults to 1, which is the
            page the original implementation always returned.

    Returns:
        The URL of the requested "cited by" results page as a string.
    """
    server = 'https://pubmed.ncbi.nlm.nih.gov/?linkname=pubmed_pubmed_citedin&from_uid='
    return f'{server}{article_id}&page={page}'

def get_article_html(article_id, timeout=30):
    """
    Fetch the HTML of an article's PubMed "cited by" page.

    Parameters:
        article_id: the PubMed id (PMID) of the article.
        timeout: seconds to wait for the server before giving up. Without a
            timeout, requests waits indefinitely on a stalled connection.

    Returns:
        The response body as text.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.Timeout: if no response arrives within `timeout` seconds.
    """
    url = get_article_url(article_id)
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()
    return r.text

def get_cited_amount(article_id):
    """
    Print and return the number of times an article has been cited.

    The value is read from the first <span class="value"> element of the
    article's PubMed "cited by" page.

    Parameters:
        article_id: the PubMed id (PMID) of the article.

    Returns:
        The citation count as the text shown on the page (a str). It is not
        converted to int, so whatever formatting the page uses is preserved.

    Raises:
        AttributeError: if the page contains no <span class="value"> element.
    """
    soup = bs(get_article_html(article_id), 'html5lib')
    find_amount = soup.find('span', {"class": "value"}).get_text()
    print('Number of citation found for this article is: ', find_amount)
    # Return the value as well, so callers that use the result do not get None.
    return find_amount


def get_number_citation_pages(article_id):
    """
    Print and return how many result pages list the citations of an article.

    The count is read from the <label class="of-total-pages"> element of the
    article's PubMed "cited by" page, so that callers can walk the pagination
    and gather all the citing articles.

    Parameters:
        article_id: the PubMed id (PMID) of the article.

    Returns:
        The number of citation pages as an int.

    Raises:
        AttributeError: if the page contains no such label.
        IndexError: if the label text has fewer than two tokens.
        ValueError: if the second token is not an integer.
    """
    soup = bs(get_article_html(article_id), 'html5lib')
    label_text = soup.find('label', {"class": "of-total-pages"}).get_text()
    # The count is the second whitespace-separated token of the label
    # (presumably text of the form "of N" -- confirm against the live page).
    number_citating_pages = label_text.split()[1]
    print(f'Number of citiation pages for this article on "Pubmed" web site is: {number_citating_pages}')
    # The original computed this int and then discarded it; return it instead.
    return int(number_citating_pages)
        

def collect_article_info(article_id):
    """
    Print and return basic information on every article listed on the first
    PubMed "cited by" page of an article.

    Each <div class="docsum-content"> entry on the page yields one record
    with the keys "PMID", "Title: " (first 30 characters of the title) and
    "Journal name: " (the short journal citation text).

    Parameters:
        article_id: the PubMed id (PMID) of the article.

    Returns:
        A list with one dict per entry, in page order. The list is empty
        when the page has no entries.

    Raises:
        AttributeError: if an entry lacks its title link or journal citation.
    """
    soup = bs(get_article_html(article_id), 'html5lib')
    entries = soup.find_all('div', attrs={'class': 'docsum-content'})
    collected = []
    for entry in entries:
        # Look the title anchor up once; it carries both the PMID and the title.
        title_link = entry.find("a", attrs={'class': 'docsum-title'})
        journal_span = entry.find("span", attrs={'class': 'docsum-journal-citation short-journal-citation'})
        # A fresh dict per entry, so the returned records stay independent
        # instead of all pointing at one dict that is overwritten each time.
        articles = {
            "PMID": title_link.get('data-article-id'),
            "Title: ": title_link.text.strip()[:30],
            "Journal name: ": journal_span.text.strip(),
        }
        print("Articles information: ", articles)
        collected.append(articles)
    return collected

from dataclasses import dataclass
@dataclass
class citing_article:
    """A citing record: the journal it was published in and that journal's impact factor."""

    # NOTE(review): get_journals_impact_factor_urls passes a list of tokens
    # here rather than a single string -- confirm which one is intended.
    journal_name: str
    # Impact factor of the journal, as a float.
    impact_factor: float

def get_journals_impact_factor_urls(article_id, timeout=30):
    """
    Look up the impact factor of every journal listed on the first PubMed
    "cited by" page of an article, and print their mean as the article score.

    For each short journal citation on the page, the matching scijournal.org
    page is fetched and the text of its <div class="num"> element is parsed
    as a float. Journals whose page lacks that element, or whose value is
    not a number, are skipped and do not count towards the mean.

    Parameters:
        article_id: the PubMed id (PMID) of the article.
        timeout: seconds to wait for each scijournal.org response.

    Returns:
        A list of citing_article records, one per journal whose impact
        factor was found. The list is empty when none was found, in which
        case no score is printed.

    Raises:
        requests.RequestException: if a scijournal.org request fails or
            times out.
    """
    soup = bs(get_article_html(article_id), 'html5lib')
    journal_spans = soup.find_all('span', attrs={'class': 'docsum-journal-citation short-journal-citation'})
    citers = []
    for span in journal_spans:
        # NOTE(review): journal_name stays a list of tokens, as in the
        # original, although the dataclass field is annotated as str.
        journal_name = span.text.split()
        # Drop the last token (the year in e.g. "Infect Immun. 2021."). The
        # remaining tokens end with ".", which is why "shtml" is appended
        # without a dot of its own.
        journal_name_short = '-'.join(journal_name[0:-1])
        journal_impact_factor_url = f'https://www.scijournal.org/impact-factor-of-{journal_name_short}shtml'

        response = requests.get(journal_impact_factor_url, timeout=timeout)
        impact_page = bs(response.text, 'html5lib')
        try:
            impact_factor = float(impact_page.find('div', class_='num').text)
        except (AttributeError, ValueError):
            # AttributeError: the page has no <div class="num"> element.
            # ValueError: the element's text is not a number.
            continue
        citers.append(citing_article(journal_name, impact_factor))

    if not citers:
        # Avoid dividing by zero when no impact factor could be found.
        print(f'No impact factor found for PMID({article_id}); score is unavailable.')
        return citers

    mean_impact_factor = sum(c.impact_factor for c in citers) / len(citers)
    print(f'Score for PMID({article_id})is: {mean_impact_factor}')
    return citers

In [105]:
#art = citing_article( 'genbio', 100)
#art.impact_factor
#art2 = citing_article( 'molbio', 50)
#art2.journal_name
#citers = [art, art2]
#citers, ([c.impact_factor for c in citers])
#sum([c.impact_factor for c in citers])


150

In [138]:
# A PMID is accepted only when the whole answer is 1 to 8 digits.
# The pattern is compiled once, before the prompt loop starts.
valid_answer = re.compile(r"\d{1,8}")
while True:
    answer = input('Please provide the "Pubmed" id (PMID) for the desired article? ')

    # An empty answer ends the session without looking anything up.
    if not answer:
        break

    # Anything that is not a valid PMID is reported, then the prompt repeats.
    if valid_answer.fullmatch(answer) is None:
        print(f"'{answer}' is NOT a valid PMID!") 
        continue

    article_id = int(answer)
    # The calls run in this order; each one prints its own report, and the
    # return values are then printed together on a single line.
    results = (
        get_article_url(article_id),
        get_cited_amount(article_id),
        get_number_citation_pages(article_id),
        collect_article_info(article_id),
        get_journals_impact_factor_urls(article_id),
    )
    print(*results)
    break

Please provide the "Pubmed" id (PMID) for the desired article? 25118243
Number of citation found for this article is:  11
Number of citiation pages for this article on "Pubmed" web site is: 2
Articles information:  {'PMID': '25118243', 'Title: ': 'Fur is the master regulator of', 'Journal name: ': 'mBio. 2014.'}
Articles information:  {'PMID': '34181459', 'Title: ': 'Surviving Serum: the Escherich', 'Journal name: ': 'Infect Immun. 2021.'}
Articles information:  {'PMID': '34038483', 'Title: ': 'Transcriptome analysis unveils', 'Journal name: ': 'PLoS One. 2021.'}
Articles information:  {'PMID': '33139383', 'Title: ': '(p)ppGpp-Dependent Regulation ', 'Journal name: ': 'Infect Immun. 2021.'}
Articles information:  {'PMID': '33083114', 'Title: ': 'Master regulator genes and the', 'Journal name: ': 'PeerJ. 2020.'}
Articles information:  {'PMID': '32443590', 'Title: ': 'Streptococcus suis Uptakes Car', 'Journal name: ': 'Pathogens. 2020.'}
Articles information:  {'PMID': '31346171', 'Title

In [137]:
get_journals_impact_factor_urls(25118243)

Score for PMID(25118243)is: 4.9472499999999995


[citing_article(journal_name=['mBio.', '2014.'], impact_factor=6.784),
 citing_article(journal_name=['Infect', 'Immun.', '2021.'], impact_factor=3.201),
 citing_article(journal_name=['PLoS', 'One.', '2021.'], impact_factor=2.74),
 citing_article(journal_name=['Infect', 'Immun.', '2021.'], impact_factor=3.201),
 citing_article(journal_name=['PeerJ.', '2020.'], impact_factor=2.379),
 citing_article(journal_name=['Nat', 'Commun.', '2019.'], impact_factor=12.121),
 citing_article(journal_name=['mSphere.', '2018.'], impact_factor=4.282),
 citing_article(journal_name=['Mol', 'Cell', 'Proteomics.', '2016.'], impact_factor=4.87)]