<a href="https://colab.research.google.com/github/sagihuja/projects/blob/main/SH_python_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**_PubMed_ Article Assistant Tool**

This tool receives a PMID (PubMed ID) and returns useful information about the corresponding article:

*   A link to the Pubmed article.
*   Number of citations it received.
*   A list of all the articles citing the article, with their PMID, title, journal name and year of publication.
*   A specific link to the SciJournal website page showing each citing journal's impact factor.

https://pubmed.ncbi.nlm.nih.gov/

https://www.scijournal.org/

Impact Factor (IF) is a measure reflecting the average number of citations received by articles published in science and social-science journals. It is frequently used as a proxy for the relative importance of a journal within its field: journals with higher impact factors are deemed more important than those with lower ones (source: https://www.scijournal.org/).

In [24]:
import requests
import re
from bs4 import BeautifulSoup as bs

In [25]:
def get_article_url(article_id, page=1):
    """
    Build the Pubmed "cited by" search URL for an article.

    article_id: the PMID of the article whose citing articles are wanted.
    page: the result page to point at (default is the first page).

    Returns the URL as a string; no network access happens here.
    """
    cited_by_prefix = 'https://pubmed.ncbi.nlm.nih.gov/?linkname=pubmed_pubmed_citedin&from_uid='
    return f'{cited_by_prefix}{article_id}&page={page}'

def get_article_html(article_id, page=1, timeout=30):
    """
    Fetch the HTML of one Pubmed "cited by" result page for an article.

    article_id: the PMID of the article whose citing articles are wanted.
    page: the result page to download (default page is "1").
    timeout: seconds to wait for the server before giving up.  Without a
        timeout a stalled connection would block the caller indefinitely.

    Returns the response body as text.
    Raises requests.HTTPError for a 4xx/5xx response and other
    requests.RequestException subclasses (e.g. Timeout) on network failure.
    """
    url = get_article_url(article_id, page=page)
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an error status instead of parsing an error page.
    response.raise_for_status()
    return response.text

def get_cited_amount(soup):
    """
    Print how many times the article was cited.

    soup: the parsed "cited by" page; the figure is taken from the text of
    the first <span class="value"> element found in it.

    Only prints; nothing is returned.
    """
    amount_tag = soup.find('span', {"class": "value"})
    citation_total = amount_tag.get_text()
    print('Total number of citations found for this article is: ', citation_total)

def get_number_citation_pages(soup):
    """
    Return the number of "cited by" result pages for the article.

    soup: the parsed first "cited by" page.  The count is read from the
    <label class="of-total-pages"> element: the second whitespace-separated
    token of its text is taken as the page count.

    The count is printed as found on the page and returned as an int.
    If the label is absent the listing is treated as a single page (the one
    already parsed) instead of failing with AttributeError.
    """
    total_pages_label = soup.find('label', {"class": "of-total-pages"})
    if total_pages_label is None:
        # No pagination control on the page: only the page we already have.
        number_citating_pages = '1'
    else:
        number_citating_pages = total_pages_label.get_text().split()[1]
    print(f'Number of citiation pages for this article on "Pubmed" web site is: {number_citating_pages}')
    # Drop thousands separators so a count rendered like "1,234" still parses.
    return int(number_citating_pages.replace(',', ''))
        
def collect_article_info(soup):
    """
    Print the PMID, title and journal citation of each citing article.

    soup: the parsed "cited by" page.  Every <div class="docsum-content">
    is treated as one article entry; within it the PMID comes from the
    'data-article-id' attribute of the title link, the title is the link
    text cut to its first 30 characters, and the journal information is the
    text of the short journal-citation span.

    One line is printed per entry; nothing is returned.
    """
    title_selector = {'class': 'docsum-title'}
    journal_selector = {'class': 'docsum-journal-citation short-journal-citation'}

    for entry in soup.find_all('div', attrs={'class': 'docsum-content'}):
        title_link = entry.find("a", attrs=title_selector)
        record = {
            "PMID": title_link.get('data-article-id'),
            # Titles are truncated to keep each printed line short.
            "Title: ": title_link.text.strip()[:30],
            "Journal name: ": entry.find("span", attrs=journal_selector).text.strip(),
        }
        print("Articles information: ", record)

from dataclasses import dataclass


@dataclass
class CitingArticle:
    """
    Record pairing a citing article's journal with that journal's
    impact factor.
    """

    # Name of the journal the citing article was published in.
    journal_name: str
    # Impact factor value found for that journal.
    impact_factor: float

def get_journals_impact_factor_urls(soup, article_id=None):
    """
    Look up the impact factor of each citing journal listed on the page.

    For every short journal-citation span in ``soup`` a scijournal.org URL
    is built and printed, the page is downloaded, and the number inside its
    <div class="num"> element is read as the journal's impact factor.
    Journals whose page cannot be fetched or has no numeric impact factor
    are skipped.  The mean impact factor of the journals that were found is
    printed as the article's score.

    soup: the parsed Pubmed "cited by" page.
    article_id: PMID shown in the score line.  When omitted, a module-level
        ``article_id`` variable is used if one exists (the way this function
        was originally called).

    Returns the list of CitingArticle records collected (possibly empty).
    """
    if article_id is None:
        # Backward compatibility with callers that only pass ``soup`` and
        # rely on a notebook-level ``article_id`` global.
        article_id = globals().get('article_id')

    journal_spans = soup.find_all('span', attrs={'class': 'docsum-journal-citation short-journal-citation'})
    citers = []
    for span in journal_spans:
        words = span.text.split()
        # The last token is dropped and the rest joined with "-".
        # NOTE(review): presumably the citation reads like "Journal Name. 2020",
        # so the journal's trailing "." supplies the dot before "shtml" - verify.
        journal_name_short = '-'.join(words[:-1])
        journal_impact_factor_url = f'https://www.scijournal.org/impact-factor-of-{journal_name_short}shtml'
        print(journal_impact_factor_url)

        try:
            response = requests.get(journal_impact_factor_url, timeout=30)
            # A separate name keeps the ``soup`` argument from being shadowed.
            impact_page = bs(response.text, 'html5lib')
            impact_factor = float(impact_page.find('div', class_='num').text)
        except requests.RequestException as error:
            # Best effort: one unreachable journal page must not abort the rest.
            print(f'Could not fetch {journal_impact_factor_url}: {error}')
            continue
        except (AttributeError, ValueError):
            # No <div class="num"> on the page (journal not in the site's
            # database) or its text is not a number: skip this journal.
            continue

        citers.append(CitingArticle(' '.join(words[:-1]), impact_factor))

    if citers:
        mean_impact_factor = sum(c.impact_factor for c in citers) / len(citers)
        print(f'Score for PMID({article_id})is: {mean_impact_factor}')
    else:
        # Avoid dividing by zero when no journal yielded an impact factor.
        print(f'No impact factors found for PMID({article_id}) on this page')
    return citers

In [None]:
def _report_page(article_id, page, soup):
    """Print the "cited by" URL and all per-page reports for one result page."""
    print(get_article_url(article_id, page=page))
    # These helpers print their own findings; their return values are not
    # printed again (doing so used to emit stray "None" output).
    get_cited_amount(soup)
    collect_article_info(soup)
    get_journals_impact_factor_urls(soup)


# A PMID is accepted as 1 to 8 digits; compiled once, outside the loop.
valid_answer = re.compile(r"\d{1,8}")

# Keep asking until a valid PMID has been processed or the user submits an
# empty answer to quit.
while True:
    answer = input('Please provide the "Pubmed" id (PMID) for the desired article? ')

    if not answer:
        break
    if not valid_answer.fullmatch(answer):
        print(f"'{answer}' is NOT a valid PMID!")
        continue

    # Kept at module level: get_journals_impact_factor_urls reads this name.
    article_id = int(answer)
    try:
        soup = bs(get_article_html(article_id), 'html5lib')
        pages = get_number_citation_pages(soup)
        _report_page(article_id, 1, soup)
        # The first page is already handled; walk through the remaining ones.
        for page in range(2, pages + 1):
            soup = bs(get_article_html(article_id, page=page), 'html5lib')
            _report_page(article_id, page, soup)
    except requests.RequestException as error:
        # Report network/HTTP failures and ask again instead of a traceback.
        print(f'Could not retrieve data for PMID {article_id}: {error}')
        continue
    break