# Scraping Springer Article Dates

In [20]:
from bs4 import BeautifulSoup
import csv
import time
import requests
import pandas as pd
from tqdm.notebook import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
sns.set()

In [21]:
journal_selection = "all"
max_pages = 16000
start_page = 0
extra_info = "keyword_depression"

## Scraping

In [22]:
def get_soup(url):
    """
    Extract page as html using BeautifulSoup.
    """
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    return soup

In [23]:
def make_url(page):
    # url = f"https://link.springer.com/search/page/{page}?query=depression&facet-discipline=%22Psychology%22&facet-start-year=2018&facet-language=%22En%22&sortOrder=newestFirst&query=depression&facet-end-year=2024&facet-content-type=%22Article%22"
    # url = f"https://link.springer.com/search/page/{page}?date-facet-mode=between&facet-discipline=%22Psychology%22&facet-start-year=2018&facet-language=%22En%22&sortOrder=newestFirst&facet-end-year=2023&facet-content-type=%22Article%22"
    # url = f"https://link.springer.com/search/page/{page}?facet-end-year=2023&date-facet-mode=between&previous-end-year=2023&sortOrder=newestFirst&query=depression&previous-start-year=1832&facet-start-year=2019&facet-content-type=%22Article%22"
    url = f"https://link.springer.com/search/page/{page}?query=depression&sortOrder=newestFirst&facet-content-type=%22Article%22&date-facet-mode=between&facet-start-year=2000&previous-start-year=2019&facet-end-year=2023&previous-end-year=2023"

    return url

### Step 1: find all articles in search query
Loop over pages and extract links if article appears in journal of interest.

In [24]:
allowed_journals = ["Psychological Research",
                    "Current Psychology",
                    "Cognitive, Affective, & Behavioral Neuroscience",
                    "International Journal of Cognitive Therapy",
                    "Current Treatment Options in Psychiatry",
                    "European Archives of Psychiatry and Clinical Neuroscience",
                    "BMC Psychiatry",
                    "Academic Psychiatry",
                    "Social Psychiatry and Psychiatric Epidemiology"
                    "Discover Mental Health",
                    "International Journal of Mental Health and Addiction",
                    "Annals of General Psychiatry",
                    ]

In [25]:
# store results for multiple pages
results = {"journal": [], "title": [], "href": []}

# loop over pages
for page in tqdm(range(start_page, max_pages)):

    # get page results
    url = make_url(page)
    soup = get_soup(url)

    articles_list = soup.find_all('li', {'class': "no-access"})
    for article in articles_list:
        
        # get all citations and extract links to article for journals of interest
        journal = article.find('a', {'class': "publication-title"})
        if journal["title"] in allowed_journals:
            details = soup.find('a', {'class': 'title'}, href=True)
            results["href"].append(details["href"])
            results["title"].append(details.get_text())
            results["journal"].append(journal["title"]) 

  0%|          | 0/16000 [00:00<?, ?it/s]

### Step 2: for each article extract dates

In [39]:
len(results["href"])

663

In [28]:
springer_url = "https://link.springer.com"

all_dates = {"Received": [], "Accepted": [], "Published": []}
dates = []
print()

# extract date info for all article links obtained in previous cell
for i, href in tqdm(enumerate(results["href"])):
    # print(href)
    url = springer_url + href
    soup = get_soup(url)

    # dates are stored in list
    article_dates = soup.find_all('ul', {'class': 'c-bibliographic-information__list'})
    for date in article_dates:

        # check each element in the list of dates
        subsoup = date.findAll('p')
        accepted_received = []
        for j, soup in enumerate(subsoup):
            
            # store accepted date
            if "Accepted" in soup:
                day = soup.find('time')
                day = day["datetime"]
                accepted_received.append(day)

            # store received date 
            if "Received" in soup:

                # check if accepted is already in list
                if accepted_received != []:
                    print("only received date?")
                    print(accepted_received)
                    print("---")

                day = soup.find('time')
                day = day["datetime"]
                accepted_received.append(day)

        if accepted_received != []:
            # print(accepted_received)
            
            if len(accepted_received) == 2:
                dates.append([i] + accepted_received)
            else:
                if len(accepted_received) == 1:
                    print(f"no received date for iter {i}")
                else:
                    print(len(accepted_received))
                # store item if it is of interest and has time set
                # for date_type in ["Received", "Published", "Accepted"]:
                #     if date_type in soup.get_text():
                        
                        # all_dates[date_type].append(day)




0it [00:00, ?it/s]

no received date for iter 2
no received date for iter 8
no received date for iter 10
no received date for iter 11
no received date for iter 16
no received date for iter 22
no received date for iter 30
no received date for iter 31
no received date for iter 32
no received date for iter 33
no received date for iter 49
no received date for iter 52
no received date for iter 57
no received date for iter 58
no received date for iter 65
no received date for iter 66
no received date for iter 67
no received date for iter 68
no received date for iter 69
no received date for iter 70
no received date for iter 71
no received date for iter 72
no received date for iter 77
no received date for iter 78
no received date for iter 79
no received date for iter 80
no received date for iter 81
no received date for iter 82
no received date for iter 90
no received date for iter 93
no received date for iter 100
no received date for iter 101
no received date for iter 102
no received date for iter 103
no received 

In [29]:
results["href"]

['/article/10.1007/s10964-022-01726-x',
 '/article/10.1007/s40120-023-00437-0',
 '/article/10.1007/s12144-023-04528-x',
 '/article/10.1007/s11033-023-08263-1',
 '/article/10.1007/s40615-022-01267-w',
 '/article/10.1007/s10571-022-01256-x',
 '/article/10.1007/s00213-023-06319-5',
 '/article/10.1007/s00213-023-06319-5',
 '/article/10.3758/s13415-023-01061-z',
 '/article/10.1007/s10586-022-03626-y',
 '/article/10.3758/s13415-023-01066-8',
 '/article/10.3758/s13415-023-01066-8',
 '/article/10.1007/s40122-023-00475-4',
 '/article/10.1007/s12098-022-04352-4',
 '/article/10.1007/s10522-022-10006-x',
 '/article/10.1007/s12012-023-09786-6',
 '/article/10.3758/s13415-023-01070-y',
 '/article/10.1007/s13369-022-07455-4',
 '/article/10.1007/s10147-023-02299-w',
 '/article/10.1186/s40479-023-00216-1',
 '/article/10.1007/s43032-023-01222-y',
 '/article/10.1007/s43032-023-01222-y',
 '/article/10.1007/s11019-023-10146-y',
 '/article/10.1186/s13041-023-01020-2',
 '/article/10.1007/s10508-023-02576-9',


In [30]:
dates_df = pd.DataFrame(dates)
dates_df.set_index(0, drop=True, inplace=True)
dates_df

Unnamed: 0_level_0,1,2
0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,2022-10-22,2022-12-16
1,2022-11-30,2023-01-09
3,2022-08-11,2023-01-10
4,2021-11-09,2022-02-14
5,2022-03-15,2022-07-07
...,...,...
652,2022-05-04,2022-08-09
653,2021-04-23,2021-12-26
655,2022-03-26,2022-07-04
656,2022-01-15,2022-09-09


In [31]:
dates_df[1]

0
0      2022-10-22
1      2022-11-30
3      2022-08-11
4      2021-11-09
5      2022-03-15
          ...    
652    2022-05-04
653    2021-04-23
655    2022-03-26
656    2022-01-15
658    2020-12-14
Name: 1, Length: 426, dtype: object

## Data formatting

In [32]:
journals = pd.DataFrame(results["journal"])
titles = pd.DataFrame(results["title"])

In [33]:
dates_df["journal"] = journals[journals.index.isin(dates_df.index)]
dates_df["title"] = titles[titles.index.isin(dates_df.index)]
dates_df

Unnamed: 0_level_0,1,2,journal,title
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,2022-10-22,2022-12-16,"Cognitive, Affective, & Behavioral Neuroscience",The Bridge between Cybervictimization and Suic...
1,2022-11-30,2023-01-09,International Journal of Mental Health and Add...,Proposed Recommendations for the Management of...
3,2022-08-11,2023-01-10,International Journal of Mental Health and Add...,"Zebrafish, a biological model for pharmaceutic..."
4,2021-11-09,2022-02-14,International Journal of Mental Health and Add...,Centering Asian American Women’s Health: Preva...
5,2022-03-15,2022-07-07,"Cognitive, Affective, & Behavioral Neuroscience",Daytime Light Deficiency Leads to Sex- and Bra...
...,...,...,...,...
652,2022-05-04,2022-08-09,International Journal of Mental Health and Add...,Minimally invasive versus open transforaminal ...
653,2021-04-23,2021-12-26,Current Psychology,Emergency Department Use by Children and Youth...
655,2022-03-26,2022-07-04,Academic Psychiatry,Effect of shift work on frontline doctors’ moo...
656,2022-01-15,2022-09-09,Academic Psychiatry,Climate Change and Older Adults: an Important ...


In [34]:
dates_df.to_csv(f"data/dates_df_{max_pages}_{journal_selection}_{extra_info}.csv", index=False)

In [35]:
# # convert received list to dataframe
# df_received = pd.DataFrame(all_dates["Received"], columns=["Received"])
# df_received["Received"] = pd.to_datetime(df_received["Received"])
# df_received.head()

In [36]:
# # convert accepted list to dataframe (separate since list lengths differ)
# df_accepted = pd.DataFrame(all_dates["Accepted"], columns=["Accepted"])
# df_accepted["Accepted"] = pd.to_datetime(df_accepted["Accepted"])
# df_accepted.head()

In [37]:
# df_received.shape, df_accepted.shape

In [38]:
# # store dfs as csv
# df_received.to_csv(f"data/df_received_{max_pages}_{journal_selection}_{extra_info}.csv", index=False)
# df_accepted.to_csv(f"data/df_accepted_{max_pages}_{journal_selection}_{extra_info}.csv", index=False)