In [1]:
from bs4 import BeautifulSoup
import csv
import time
import requests
import pandas as pd
from tqdm.notebook import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
sns.set()

In [2]:
journal_selection = "all"
max_pages = 50
extra_info = "keyword_depression"

## Scraping

In [3]:
def get_soup(url):
    """
    Extract page as html using BeautifulSoup.
    """
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    return soup

In [4]:
def make_url(page):
    # url = f"https://link.springer.com/search/page/{page}?query=depression&facet-discipline=%22Psychology%22&facet-start-year=2018&facet-language=%22En%22&sortOrder=newestFirst&query=depression&facet-end-year=2024&facet-content-type=%22Article%22"
    # url = f"https://link.springer.com/search/page/{page}?date-facet-mode=between&facet-discipline=%22Psychology%22&facet-start-year=2018&facet-language=%22En%22&sortOrder=newestFirst&facet-end-year=2023&facet-content-type=%22Article%22"
    url = f"https://link.springer.com/search/page/{page}?facet-end-year=2023&date-facet-mode=between&previous-end-year=2023&sortOrder=newestFirst&query=depression&previous-start-year=1832&facet-start-year=2019&facet-content-type=%22Article%22"
    return url

In [5]:
allowed_journals = ["European Archives of Psychiatry and Clinical Neuroscience"]

In [6]:
# store results for multiple pages
hrefs_results = []

for page in tqdm(range(0, max_pages)):

    # get page results
    url = make_url(page)
    soup = get_soup(url)
    # print(soup)

    # get all citations and extract links to article
    journals = soup.find_all('a', {'class': "publication-title"})
    for journal in journals:
        # if journal["title"] in allowed_journals:
        #     print("JA")
        citations = soup.find_all('a', {'class': 'title'}, href=True)
        hrefs = [citation["href"] for citation in citations]
        hrefs_results += hrefs

    # extract text and remove tag
    # citations = [citation.get_text() for citation in citations]
    
    # keep only journals of interest
    # citations = [citation for citation in citations if citation.split('.')[0] in journals]
    # results += citations
    

  0%|          | 0/50 [00:00<?, ?it/s]

In [7]:
# hrefs_results

In [8]:
springer_url = "https://link.springer.com"

all_dates = {"Received": [], "Accepted": [], "Published": []}
print()

# extract date info for all article links obtained in previous cell
for href in tqdm(hrefs_results):
    url = springer_url + href
    soup = get_soup(url)

    # dates are stored in list
    dates = soup.find_all('ul', {'class': 'c-bibliographic-information__list'})
    for date in dates:

        # check each element in the list of dates
        subsoup = date.findAll('p')
        for i, soup in enumerate(subsoup):

            # store item if it is of interest and has time set
            for date_type in ["Received", "Published", "Accepted"]:
                if date_type in soup.get_text():
                    day = soup.find('time')
                    day = day["datetime"]
                    all_dates[date_type].append(day)




  0%|          | 0/20000 [00:00<?, ?it/s]

## Data formatting

In [9]:
# convert received list to dataframe
df_received = pd.DataFrame(all_dates["Received"], columns=["Received"])
df_received["Received"] = pd.to_datetime(df_received["Received"])
df_received.head()

Unnamed: 0,Received
0,2022-11-19
1,2022-10-07
2,2022-12-03
3,2022-04-16
4,2022-04-12


In [10]:
# convert accepted list to dataframe (separate since list lengths differ)
df_accepted = pd.DataFrame(all_dates["Accepted"], columns=["Accepted"])
df_accepted["Accepted"] = pd.to_datetime(df_accepted["Accepted"])
df_accepted.head()

Unnamed: 0,Accepted
0,2021-03-03
1,2022-08-29
2,2022-12-09
3,2022-12-07
4,2022-12-07


In [11]:
df_received.shape, df_accepted.shape

((14400, 1), (19560, 1))

In [12]:
# store dfs as csv
df_received.to_csv(f"data/df_received_{max_pages}_{journal_selection}_{extra_info}.csv", index=False)
df_accepted.to_csv(f"data/df_accepted_{max_pages}_{journal_selection}_{extra_info}.csv", index=False)