<a href="https://colab.research.google.com/github/sagihuja/projects/blob/main/python_proj_naya.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import requests
import re
from bs4 import BeautifulSoup as bs

In [157]:

def get_article_url(article_id):
    """
    Build the PubMed "cited by" search URL for the given article id.

    The returned address always points at the first result page
    (``page=1``) of the articles citing ``article_id``.
    """
    cited_by_prefix = 'https://pubmed.ncbi.nlm.nih.gov/?linkname=pubmed_pubmed_citedin&from_uid='
    # '{}' formats exactly like an f-string placeholder without a format spec.
    return '{}{}&page=1'.format(cited_by_prefix, article_id)

def get_article_html(article_id, timeout=30):
    """
    Fetch the HTML of the PubMed "cited by" page for the given article id.

    Parameters:
        article_id: identifier passed on to get_article_url.
        timeout: seconds to wait for the server before giving up. Without
            a timeout, requests waits indefinitely on a stalled connection.

    Returns:
        The response body as text.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.Timeout: if the server does not respond within ``timeout``.
    """
    url = get_article_url(article_id)
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()
    return r.text

def get_cited_amount(article_id):
    """
    Retrieve the number of times the given article has been cited.

    The value is read from the first ``<span class="value">`` element of
    the "cited by" page, printed, and returned as the text found on the
    page (it is not converted to a number).

    Raises:
        AttributeError: if the page contains no such element, because
            ``soup.find`` then returns None.
    """
    soup = bs(get_article_html(article_id), 'html5lib')
    find_amount = soup.find('span', {"class": "value"}).get_text()
    print('Number of citation found for this article is: ', find_amount)
    # Previously the value was only printed; return it so callers can use it.
    return find_amount


def get_number_citation_pages(article_id):
    """
    Retrieve the number of result pages that list citing articles.

    The count is taken from the second whitespace-separated token of the
    ``<label class="of-total-pages">`` element, so that callers can walk
    through the pagination and gather all the articles.

    Returns:
        The number of pages as an int.

    Raises:
        AttributeError: if the page has no such label, because
            ``soup.find`` then returns None.
        IndexError: if the label text has fewer than two tokens.
        ValueError: if the token is not a number.
    """
    soup = bs(get_article_html(article_id), 'html5lib')
    find_number_citation_pages = soup.find('label', {"class": "of-total-pages"}).get_text()
    number_citating_pages = find_number_citation_pages.split()[1]
    print(number_citating_pages)
    # NOTE(review): large counts may be rendered with thousands separators
    # (e.g. "1,234"), which int() rejects - strip them before converting.
    value_citating_pages = int(number_citating_pages.replace(',', ''))
    # The value used to be computed and then discarded; return it.
    return value_citating_pages
        

def collect_article_info(article_id):
    """
    Collect PMID, title and journal citation for each listed article.

    Every ``<div class="docsum-content">`` entry on the first "cited by"
    page of ``article_id`` is turned into a dictionary with the keys
    ``"PMID"``, ``"Title: "`` (first 30 characters of the title) and
    ``"Journal name: "``. Each dictionary is printed as it is built.

    Returns:
        A list with one dictionary per entry, in page order.

    Raises:
        AttributeError: if an entry lacks the title link or the journal
            citation span, because ``find`` then returns None.
    """
    soup = bs(get_article_html(article_id), 'html5lib')
    entries = soup.find_all('div', attrs={'class': 'docsum-content'})
    collected = []
    for entry in entries:
        # Look the title link up once; it carries both the PMID and the title.
        title_link = entry.find("a", attrs={'class': 'docsum-title'})
        journal_span = entry.find("span", attrs={'class': 'docsum-journal-citation short-journal-citation'})
        # Build a fresh dict per article so earlier results are not overwritten.
        article = {
            "PMID": title_link.get('data-article-id'),
            "Title: ": title_link.text.strip()[:30],
            "Journal name: ": journal_span.text.strip(),
        }
        print("Articles information: ", article)
        collected.append(article)
    return collected


def get_journal_impact_factor_url(article_id):
    """
    Look up the impact factor page of every journal on the "cited by" page.

    For each journal citation span found on the first "cited by" page of
    ``article_id``, a scijournal.org URL is built from the citation text,
    printed, fetched once, and the text of the first
    ``<div class="num">`` element of that page is printed.

    Returns:
        A list of ``(url, impact_factor_text)`` tuples, in page order.

    Raises:
        requests.HTTPError: if a journal page answers with a 4xx/5xx status.
        AttributeError: if a journal page has no ``<div class="num">``,
            because ``find`` then returns None.
    """
    soup = bs(get_article_html(article_id), 'html5lib')
    journal_spans = soup.find_all('span', attrs={'class': 'docsum-journal-citation short-journal-citation'})
    results = []
    for span in journal_spans:
        # Drop the last whitespace-separated token (presumably the year -
        # TODO confirm) and hyphenate the remaining words.
        journal_name_short = '-'.join(span.text.split()[0:-1])
        # NOTE(review): there is no '.' before 'shtml'; this presumably relies
        # on the abbreviated journal name ending with a period - confirm.
        journal_impact_factor_url = f'https://www.scijournal.org/impact-factor-of-{journal_name_short}shtml'
        print(journal_impact_factor_url)

        # Fetch each URL exactly once. The previous code looped over the
        # characters of the URL string and recursed into this function
        # instead of parsing the response it had just downloaded.
        r = requests.get(journal_impact_factor_url, timeout=30)
        r.raise_for_status()
        journal_soup = bs(r.text, 'html5lib')
        impact_factor = journal_soup.find('div', {"class": "num"}).get_text()
        print(impact_factor)
        results.append((journal_impact_factor_url, impact_factor))
    return results


