<a href="https://colab.research.google.com/github/sagihuja/projects/blob/main/python_proj_naya.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import requests
import re
from bs4 import BeautifulSoup as bs

In [24]:

def get_article_url(article_id):
    """
    get_article_url is a function that receives the article id and returns its
    coresponding Pubmed cited by URL address.
    """
    server = 'https://pubmed.ncbi.nlm.nih.gov/?linkname=pubmed_pubmed_citedin&from_uid='
    url = f'{server}{article_id}&page=1'
    #print('Link to the article on Pubmed:',url,'\n')
    return url

def get_article_html(article_id): 
    """
    get_article_html is a function that receives the article id and in return
    fetches its coresponding Pubmed HTML file.
    """     
    url = get_article_url(article_id) 
    r = requests.get(url)
    r.raise_for_status()
    return r.text

def get_cited_amount(article_id):
    """
    This function received the article id and retreives the number of times
    it was citated back.
    """
    soup = bs(get_article_html(article_id), 'html5lib')
    find_amount = soup.find('span', {"class":"value"}).get_text()
    print('Number of citation found for this article is: ',find_amount)


def get_number_citation_pages(article_id):
    """
    This functions retreives internaly from the URL the number of pages
    containing citations in order to deal with the pagination, and gather all
    the articles.
    """
    soup = bs(get_article_html(article_id), 'html5lib')
    find_number_citation_pages = soup.find('label', {"class":"of-total-pages"}).get_text()
    #print(find_number_citation_pages)
    number_citating_pages = (find_number_citation_pages.split()[1])
    print(f'Number of citiation pages for this article on "Pubmed" web site is: {number_citating_pages}')
    value_citating_pages = int(number_citating_pages)
        

def collect_article_info(article_id):
    articles = {}
    soup = bs(get_article_html(article_id), 'html5lib')
    collect_relevant_article_info = soup.find_all('div', attrs={'class': 'docsum-content'})
    #print(collect_relevant_article_info[0].text) #Prints the original article's title, autors and publication information
    for line in collect_relevant_article_info:
      articles["PMID"] = line.find("a", attrs={'class': 'docsum-title'}).get('data-article-id') 
      articles["Title: "] = line.find("a", attrs={'class': 'docsum-title'}).text.strip()[:30]
      articles["Journal name: "] = line.find("span", attrs={'class': 'docsum-journal-citation short-journal-citation'}).text.strip()
      print("Articles information: ", articles)

from dataclasses import dataclass
@dataclass
class Item:
    journal_name:str
    journal_name_short:str
    impact:float

def get_journals_impact_factor_urls(article_id):
    soup = bs(get_article_html(article_id), 'html5lib')
    find_journal_name = soup.find_all('span', attrs={'class': 'docsum-journal-citation short-journal-citation'})
    #print(find_journal_name[0].text) #Prints the original article journal title as an header
    citers = []
    for line in find_journal_name:
        journal_name = line.text.split()
        journal_name_short = '-'.join(journal_name[0:-1])
        journal_impact_factor_url = (f'https://www.scijournal.org/impact-factor-of-{journal_name_short}shtml')
        print(journal_impact_factor_url)

        try:
            html_text_impact_factor_page = requests.get(journal_impact_factor_url).text
            soup = bs(html_text_impact_factor_page, 'html5lib')
            impact_factor = soup.find('div', class_='num').text
            impact_factor = float(impact_factor)
            print(impact_factor)
            citers.append(Item(journal_name, journal_name_short, impact_factor))
        except AttributeError:
            pass
    return citers

In [25]:
while True:
    valid_answer = re.compile(r"\d{1,8}")
    answer = input('Please provide the "Pubmed" id (PMID) for the desired article? ')

    if re.fullmatch(valid_answer, answer):
        article_id = int(answer)
        print(get_article_url(article_id), 
              get_cited_amount(article_id), 
              get_number_citation_pages(article_id), 
              collect_article_info(article_id), 
              get_journals_impact_factor_urls(article_id)
               )
        break
    elif not answer:
        break
    else:
        print(f"'{answer}' is NOT a valid PMID!") 

Please provide the "Pubmed" id (PMID) for the desired article? 25118243
Number of citation found for this article is:  11
Number of citiation pages for this article on "Pubmed" web site is: 2
Articles information:  {'PMID': '25118243', 'Title: ': 'Fur is the master regulator of', 'Journal name: ': 'mBio. 2014.'}
Articles information:  {'PMID': '34181459', 'Title: ': 'Surviving Serum: the Escherich', 'Journal name: ': 'Infect Immun. 2021.'}
Articles information:  {'PMID': '34038483', 'Title: ': 'Transcriptome analysis unveils', 'Journal name: ': 'PLoS One. 2021.'}
Articles information:  {'PMID': '33139383', 'Title: ': '(p)ppGpp-Dependent Regulation ', 'Journal name: ': 'Infect Immun. 2021.'}
Articles information:  {'PMID': '33083114', 'Title: ': 'Master regulator genes and the', 'Journal name: ': 'PeerJ. 2020.'}
Articles information:  {'PMID': '32443590', 'Title: ': 'Streptococcus suis Uptakes Car', 'Journal name: ': 'Pathogens. 2020.'}
Articles information:  {'PMID': '31346171', 'Title

In [197]:
collect_article_info(25118243)

Link to the article on Pubmed: https://pubmed.ncbi.nlm.nih.gov/?linkname=pubmed_pubmed_citedin&from_uid=25118243&page=1 

Articles information:  {'PMID': '25118243', 'Title: ': 'Fur is the master regulator of', 'Journal name: ': 'mBio. 2014.'}
Articles information:  {'PMID': '34181459', 'Title: ': 'Surviving Serum: the Escherich', 'Journal name: ': 'Infect Immun. 2021.'}
Articles information:  {'PMID': '34038483', 'Title: ': 'Transcriptome analysis unveils', 'Journal name: ': 'PLoS One. 2021.'}
Articles information:  {'PMID': '33139383', 'Title: ': '(p)ppGpp-Dependent Regulation ', 'Journal name: ': 'Infect Immun. 2021.'}
Articles information:  {'PMID': '33083114', 'Title: ': 'Master regulator genes and the', 'Journal name: ': 'PeerJ. 2020.'}
Articles information:  {'PMID': '32443590', 'Title: ': 'Streptococcus suis Uptakes Car', 'Journal name: ': 'Pathogens. 2020.'}
Articles information:  {'PMID': '31346171', 'Title: ': 'Glycine, serine and threonine ', 'Journal name: ': 'Nat Commun. 

In [203]:
get_journal_impact_factor_url(25118243)

Link to the article on Pubmed: https://pubmed.ncbi.nlm.nih.gov/?linkname=pubmed_pubmed_citedin&from_uid=25118243&page=1 

mBio. 2014.
https://www.scijournal.org/impact-factor-of-mBio.shtml

                  6.784
                
https://www.scijournal.org/impact-factor-of-Infect-Immun.shtml

                  3.201
                
https://www.scijournal.org/impact-factor-of-PLoS-One.shtml

                  2.74
                
https://www.scijournal.org/impact-factor-of-Infect-Immun.shtml

                  3.201
                
https://www.scijournal.org/impact-factor-of-PeerJ.shtml

                  2.379
                
https://www.scijournal.org/impact-factor-of-Pathogens.shtml
https://www.scijournal.org/impact-factor-of-Nat-Commun.shtml

                  12.121
                
https://www.scijournal.org/impact-factor-of-mSphere.shtml

                  4.282
                
https://www.scijournal.org/impact-factor-of-Mol-Cell-Proteomics.shtml

                  4.87
   

In [182]:
get_journal_impact_factor_url(25118243)


                  6.784
                

                  3.201
                

                  2.74
                

                  3.201
                

                  2.379
                

                  12.121
                

                  4.282
                

                  4.87
                
