In [74]:
import os
import time
from datetime import datetime

import pickle

from tqdm.auto import tqdm

from src.constants import *

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [7]:
def _pget(url, stream=False):
    """
    Acounts for network errors in   getting a request (Pubmed often appears offline, but not for long periods of time)
    Retries every 2 seconds for 60 seconds and then gives up
    """
    downloaded = False
    count = 0

    while not downloaded and count < 60:
        try:
            page = requests.get(url, stream=stream)
            downloaded = True
        except:
            print(url + f" - Network error, retrying... ({count + 1})")
            time.sleep(2)
            count += 1
                
    if page != None:
        return page
    else:
        raise ValueError

In [8]:
def _get_search_matches(search_terms, max_page_num=5):
    """
    Gets all of the ids of articles matching a given list of search terms
    Optionnaly, set a max number of pages to search through (10 articles per page)
    Selenium not used here
    """
    # Filtering out every non English match
    search_url = "https://pubmed.ncbi.nlm.nih.gov/?term=" + '+'.join(search_terms) + '&filter=lang.english'
    full_search_ids = []

    # Grabs every page
    page_num = 0
    while (max_page_num and page_num < max_page_num) or not max_page_num:
        page_num += 1
        if page_num != 1:
            page_url = search_url + "&page=" + str(page_num)
        else:
            page_url = search_url
        try:
            page = _pget(page_url)
            page_soup = BeautifulSoup(page.text, features="lxml")
            page_ids = page_soup.find("div", {"class": "search-results-chunk results-chunk"}
                                     ).get("data-chunk-ids").split(",")
            full_search_ids += page_ids
        except AttributeError:
            break

    # Saving the results
    return full_search_ids

In [4]:
# Ids of the articles matching the "Anastomotic Leaks" PubMed search
# (limited to the default number of result pages).
search_matches_ids = _get_search_matches(["Anastomotic", "Leaks"])

In [12]:
# For every matching article, scrape its publication date and the ids of the
# articles listed in its "cited by" section.
articles_list = []

for article_id in tqdm(search_matches_ids):

    url = "https://pubmed.ncbi.nlm.nih.gov/" + str(article_id)

    # Go through the retrying helper, like the other cells, so that a transient
    # PubMed outage does not abort the whole crawl.
    soup = BeautifulSoup(_pget(url).text, "html.parser")

    # The id of each citing article is read from the "data-ga-action" attribute of its title link
    citedby = soup.find("div", {"class": "citedby-articles"})
    if citedby is not None:
        citations_ids = [a["data-ga-action"]
                         for a in citedby.find_all("a", {"class": "docsum-title"})]
    else:
        citations_ids = []

    try:
        # Keep the first two words of the citation text before the first ";"
        # and parse them as "<year> <abbreviated month>"
        date_text = soup.find("div", {"class": "article-source"}).find("span", {"class": "cit"}).text.split(";")[0]
        date_text = " ".join(date_text.split(" ")[:2])
        date = datetime.strptime(date_text, "%Y %b")
    except (AttributeError, ValueError):
        # AttributeError: the citation markup is missing; ValueError: the text
        # is not in the expected format. Other errors are no longer hidden.
        date = "Undef"

    articles_list.append({"ID": article_id,
                          "Publication Date": date,
                          "Citations ID": citations_ids})

  0%|          | 0/50 [00:00<?, ?it/s]

In [13]:
# One row per scraped article, indexed by the article id.
citations_df = pd.DataFrame(articles_list).set_index("ID")

In [19]:
# Try the author scraping on a single, hard-coded article.
article_id = 21213108
url = "https://pubmed.ncbi.nlm.nih.gov/" + str(article_id)

pubmed_page = _pget(url)
pubmed_soup = BeautifulSoup(pubmed_page.text, features="lxml")

# Every author is a <span class="authors-list-item"> inside the "inline-authors" block
authors_block = pubmed_soup.find("div", {"class": "inline-authors"})
authors_soup_list = authors_block.find_all("span", {"class": "authors-list-item"})

# The name is taken from the "data-ga-label" attribute of the author's "full-name" link
author_names = [
    author_item.find("a", {"class": "full-name"})["data-ga-label"]
    for author_item in authors_soup_list
]
print(author_names)

['Marie Černá', 'Martin Köcher', 'Vlastimil Válek', 'René Aujeský', 'Čestmír Neoral', 'Tomáš Andrašina', 'Jiří Pánek', 'Shankari Mahathmakanthi']


In [18]:
# Raw markup of the first author entry, to see which attributes it carries.
authors_soup_list[0]

<span class="authors-list-item"><a class="full-name" data-ga-action="author_link" data-ga-category="search" data-ga-label="Marie Černá" href="/?term=%C4%8Cern%C3%A1+M&amp;cauthor_id=21213108" ref="linksrc=author_name_link">Marie Černá</a><sup class="affiliation-links"><span class="author-sup-separator"> </span><a class="affiliation-link" href="#affiliation-1" ref="linksrc=author_aff" title="Department of Radiology, University Hospital, I. P. Pavlova 6, 775 20 Olomouc, Czech Republic.">
                1
              </a></sup><span class="comma">, </span></span>

In [85]:
def reformat_authors(list_str):
    """
    Turn the text form of a list of author names back into a list.

    Parameters
    ----------
    list_str : str
        A Python list literal of strings, e.g. "['Jane Doe', 'John Smith']".

    Returns
    -------
    list of str
        The author names; an empty list for "[]" or an empty string.

    Raises
    ------
    ValueError, SyntaxError
        If `list_str` is not a valid Python literal.
    """
    import ast

    stripped = list_str.strip()
    if stripped in ("", "[]"):
        return []
    # literal_eval understands the quoting, so names containing ", " or escaped
    # quotes survive; splitting on ", " and trimming one character per side did not.
    return [str(name) for name in ast.literal_eval(stripped)]
    
def reformat_date(date_str):
    """
    Parse a stored date string into a datetime.

    The placeholder "Undef" is returned unchanged; any other value is parsed
    with the "%Y-%m-%d %H:%M:%S" format.
    """
    if date_str == "Undef":
        return date_str
    return datetime.strptime(date_str, "%Y-%m-%d %H:%M:%S")

def reformat_citations(list_str):
    """
    Turn the text form of a list of citation ids back into a list of ints.

    Parameters
    ----------
    list_str : str
        A Python list literal whose items are ids, either quoted
        ("['35764831', '35015134']") or plain integers ("[35764831, 35015134]").

    Returns
    -------
    list of int
        The citation ids; an empty list for "[]" or an empty string.

    Raises
    ------
    ValueError, SyntaxError
        If `list_str` is not a valid Python literal or an item is not an integer.
    """
    import ast

    stripped = list_str.strip()
    if stripped in ("", "[]"):
        return []
    # Parsing the literal avoids trimming one character per side of every item,
    # which silently dropped the first and last digit of unquoted ids.
    return [int(citation_id) for citation_id in ast.literal_eval(stripped)]

In [86]:
# Load the saved citations table ("|"-separated) and turn every column, which
# comes back from the CSV as text, into Python objects again.
citations_path = os.path.join(DATA_PATH, "citations_full.csv")
with open(citations_path, "r") as csv_file:
    citations_df = pd.read_csv(csv_file, sep='|')

# Column name -> function applied to each of its values, in this order
column_parsers = {
    "ID": int,
    "Authors": reformat_authors,
    "Date": reformat_date,
    "Citations": reformat_citations,
}
for column_name, parser in column_parsers.items():
    citations_df[column_name] = citations_df[column_name].apply(parser)

citations_df = citations_df.set_index("ID")

In [87]:
# Inspect the parsed citations table.
citations_df

Unnamed: 0_level_0,Authors,Date,Citations
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
34100248,"[John C Alverdy, Hans Martin Schardey]",2021-11-01 00:00:00,[35764831]
32139117,"[Caterina Foppa, Siew Chien Ng, Marco Montorsi...",2020-06-01 00:00:00,"[35955993, 35915290, 35733511, 35535032, 35421..."
32671825,"[B D Stephensen, F Reid, S Shaikh, R Carroll, ...",2020-12-01 00:00:00,"[35986864, 35657137, 35151339, 35024926, 34988..."
33565590,"[Antonio Barbaro, Thomas A Eldredge, Jonathan ...",2021-02-01 00:00:00,[35015134]
32012414,"[K Stormark, P-M Krarup, A Sjövall, K Søreide,...",2020-09-01 00:00:00,"[35955993, 34063108, 32795348]"
...,...,...,...
33089382,"[Roberto Cirocchi, Georgi Popivanov, Marina Ko...",2021-05-01 00:00:00,"[35056340, 34750614, 34557938, 34150837, 33727..."
31352419,"[Matthew James Lee, Adele E Sayers, Thomas M D...",2019-07-01 00:00:00,"[35928981, 35585034, 35493630, 35238117, 34922..."
31559948,"[Jane Nixon, Sarah Brown, Isabelle L Smith, El...",2019-09-01 00:00:00,"[34097765, 34097764, 33999463, 33969911, 33969..."
28913968,[2015 European Society of Coloproctology colla...,2017-09-01 00:00:00,"[35913516, 35592128, 34518869, 32215559, 32172..."
