# Scraping Springer Article Dates

In [None]:
from bs4 import BeautifulSoup
import csv
import time
import requests
import pandas as pd
from tqdm.notebook import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
sns.set()

In [None]:
# Run configuration.
# Free-text tag for the journal selection; it is interpolated into the output file name.
journal_selection = ""
# Exclusive upper bound of the search result pages requested by the scraping loop.
max_pages = 60000
# First search result page requested by the scraping loop.
start_page = 0
# Free-text description of the run; it is interpolated into the output file name.
extra_info = "all_journals_no_keywords"

## Scraping

In [None]:
def get_soup(url, timeout=30):
    """
    Download a page and parse it as html using BeautifulSoup.

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float or tuple, optional
        Seconds to wait for the server, passed on to ``requests.get``.
        Without a timeout a stalled connection would block the scraping
        loop indefinitely.

    Returns
    -------
    BeautifulSoup
        The response body parsed with the built-in 'html.parser'.

    Raises
    ------
    requests.exceptions.RequestException
        If the connection fails or the timeout expires.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')

    return soup

In [None]:
def make_url(page):
    """
    Build the Springer Link search url for one page of results.

    The active query lists content of type "Article" between 2018 and 2023,
    newest first, without any keyword. `page` is inserted into the url path.
    """
    # Queries that were tried earlier, kept for reference:
    # url = f"https://link.springer.com/search/page/{page}?query=depression&facet-discipline=%22Psychology%22&facet-start-year=2018&facet-language=%22En%22&sortOrder=newestFirst&query=depression&facet-end-year=2024&facet-content-type=%22Article%22"
    # url = f"https://link.springer.com/search/page/{page}?date-facet-mode=between&facet-discipline=%22Psychology%22&facet-start-year=2018&facet-language=%22En%22&sortOrder=newestFirst&facet-end-year=2023&facet-content-type=%22Article%22"
    # url = f"https://link.springer.com/search/page/{page}?facet-end-year=2023&date-facet-mode=between&previous-end-year=2023&sortOrder=newestFirst&query=depression&previous-start-year=1832&facet-start-year=2019&facet-content-type=%22Article%22"
    # url = f"https://link.springer.com/search/page/{page}?query=depression&sortOrder=newestFirst&facet-content-type=%22Article%22&date-facet-mode=between&facet-start-year=2000&previous-start-year=2019&facet-end-year=2023&previous-end-year=2023"
    # url = f"https://link.springer.com/search/page/{page}?facet-end-year=2023&date-facet-mode=between&previous-end-year=2023&sortOrder=newestFirst&previous-start-year=1832&facet-start-year=2018&facet-content-type=%22Article%22"
    # url = f"https://link.springer.com/search/page/{page}?facet-end-year=2023&date-facet-mode=between&facet-start-year=2018&facet-content-type=%22Article%22" # 45 results

    # WORKS (keyword variant of the active query):
    # url = f"https://link.springer.com/search/page/{page}?facet-end-year=2023&date-facet-mode=between&previous-end-year=2023&facet-start-year=2018&previous-start-year=2000&sortOrder=newestFirst&query=%22depressive+disorder%22+%2B+%22+depression%22&facet-content-type=%22Article%22"

    # Active query, one parameter per line; the order is part of the url.
    query = "&".join((
        "facet-end-year=2023",
        "date-facet-mode=between",
        "previous-end-year=2023",
        "facet-start-year=2018",
        "previous-start-year=2000",
        "sortOrder=newestFirst",
        "facet-content-type=%22Article%22",
    ))
    return f"https://link.springer.com/search/page/{page}?{query}"
    

### Step 1: find all articles in search query
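Loop over the search result pages and extract the link, title and journal of each listed article. The filter on the journals of interest is currently commented out, so articles from all journals are kept.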

In [None]:
# Journals of interest. Every entry needs its own trailing comma: two adjacent
# string literals without one are silently joined into a single string.
allowed_journals = ["Psychological Research",
                    "Current Psychology",
                    "Cognitive, Affective, & Behavioral Neuroscience",
                    "International Journal of Cognitive Therapy",
                    "Current Treatment Options in Psychiatry",
                    "European Archives of Psychiatry and Clinical Neuroscience",
                    "BMC Psychiatry",
                    "Academic Psychiatry",
                    "Social Psychiatry and Psychiatric Epidemiology",
                    "Discover Mental Health",
                    "International Journal of Mental Health and Addiction",
                    "Annals of General Psychiatry",
                    ]

In [None]:
# Collect journal name, article title and article link for every search hit.
# The three lists are filled in lockstep, so position k in each list refers
# to the same article.
results = {"journal": [], "title": [], "href": []}

# loop over the search result pages
for page in tqdm(range(start_page, max_pages, 1)):

    # download and parse one page of search results
    url = make_url(page)
    soup = get_soup(url)

    # only list items carrying the "no-access" class are considered
    articles_list = soup.find_all('li', {'class': "no-access"})
    for article in articles_list:

        # Look inside this list item only. Searching the whole page (`soup`)
        # would return the first title link of the page for every article.
        journal = article.find('a', {'class': "publication-title"})
        details = article.find('a', {'class': 'title'}, href=True)

        # Skip hits that lack one of the pieces so the lists stay aligned.
        if journal is None or details is None or not journal.has_attr("title"):
            continue

        # The journal filter is currently disabled:
        # if journal["title"] in allowed_journals:
        results["href"].append(details["href"])
        results["title"].append(details.get_text())
        results["journal"].append(journal["title"])

  0%|          | 0/150000 [00:00<?, ?it/s]

ConnectionError: HTTPSConnectionPool(host='idp.springer.com', port=443): Max retries exceeded with url: /authorize?redirect_uri=https://link.springer.com/search/page/33635?facet-end-year%3D2023%26date-facet-mode%3Dbetween%26previous-end-year%3D2023%26facet-start-year%3D2018%26previous-start-year%3D2000%26sortOrder%3DnewestFirst%26facet-content-type%3D%2522Article%2522&client_id=springerlink&response_type=cookie (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x000001F078F98730>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))

### Step 2: for each article extract dates

In [None]:
# number of article links collected in step 1
len(results["href"])

11656

In [None]:
springer_url = "https://link.springer.com"

# NOTE: all_dates is not filled by this cell; it is only referenced by the
# commented-out cells at the end of the notebook.
all_dates = {"Received": [], "Accepted": [], "Published": []}

# One row per date list that has both dates: [article position, received, accepted].
dates = []

# Labels looked up on every article page, in the column order of a `dates` row.
date_labels = ("Received", "Accepted")

# Extract date info for all article links obtained in the previous cell.
# tqdm wraps the list itself (not the enumerate object) so it knows the total.
for i, href in enumerate(tqdm(results["href"])):
    url = springer_url + href
    page_soup = get_soup(url)

    # the dates are stored in a list on the article page
    date_lists = page_soup.find_all('ul', {'class': 'c-bibliographic-information__list'})
    for date_list in date_lists:

        # label -> value of the "datetime" attribute, first occurrence wins
        found = {}
        for entry in date_list.find_all('p'):
            for label in date_labels:
                # Membership on a tag compares against its direct children,
                # so this matches a <p> holding a text node equal to the label.
                if label not in entry:
                    continue

                # a labelled entry without a usable <time> tag carries no date
                time_tag = entry.find('time')
                if time_tag is None or not time_tag.has_attr("datetime"):
                    continue
                found.setdefault(label, time_tag["datetime"])

        # lists without any of the two dates are ignored silently
        if not found:
            continue

        if len(found) == len(date_labels):
            # Store by label rather than by order of appearance on the page.
            dates.append([i] + [found[label] for label in date_labels])
        else:
            # report which of the two dates is missing for this article
            for label in date_labels:
                if label not in found:
                    print(f"no {label.lower()} date for iter {i}")




NameError: name 'tqdm' is not defined

In [None]:
# number of rows stored in `dates` by the previous cell
len(dates)

9422

In [None]:
# Tabulate the scraped rows; column 0 (the position of the article in
# results["href"]) becomes the index, the date columns keep the labels 1 and 2.
dates_df = pd.DataFrame(dates).set_index(0, drop=True)
dates_df

Unnamed: 0_level_0,1,2
0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,2022-04-19,2022-10-09
1,2022-04-19,2022-10-09
2,2022-04-19,2022-10-09
3,2022-04-19,2022-10-09
4,2022-04-19,2022-10-09
...,...,...
11651,2022-10-13,2023-02-22
11652,2022-10-13,2023-02-22
11653,2022-10-13,2023-02-22
11654,2022-10-13,2023-02-22


In [None]:
# show column 1, i.e. the first date stored in each row of `dates`
dates_df[1]

0
0        2022-04-19
1        2022-04-19
2        2022-04-19
3        2022-04-19
4        2022-04-19
            ...    
11651    2022-10-13
11652    2022-10-13
11653    2022-10-13
11654    2022-10-13
11655    2022-10-13
Name: 1, Length: 9422, dtype: object

## Data formatting

In [None]:
# One single-column frame per field; the default integer index is the
# position of the article in the `results` lists.
journals, titles = (pd.DataFrame(results[field]) for field in ("journal", "title"))

In [None]:
# Attach journal and title to the date table, keeping only the articles whose
# position occurs in the index of dates_df.
dates_df["journal"] = journals.loc[journals.index.isin(dates_df.index)]
dates_df["title"] = titles.loc[titles.index.isin(dates_df.index)]
dates_df

Unnamed: 0_level_0,1,2,journal,title
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,2022-04-19,2022-10-09,Marine Systems & Ocean Technology,Experimental study of an oscillating water col...
1,2022-04-19,2022-10-09,Marine Systems & Ocean Technology,Experimental study of an oscillating water col...
2,2022-04-19,2022-10-09,Marine Systems & Ocean Technology,Experimental study of an oscillating water col...
3,2022-04-19,2022-10-09,Marine Systems & Ocean Technology,Experimental study of an oscillating water col...
4,2022-04-19,2022-10-09,Molecular Imaging and Biology,Experimental study of an oscillating water col...
...,...,...,...,...
11651,2022-10-13,2023-02-22,Journal of Cryptology,Estimating Time-To-Compromise for Industrial C...
11652,2022-10-13,2023-02-22,World Journal of Urology,Estimating Time-To-Compromise for Industrial C...
11653,2022-10-13,2023-02-22,Journal of Bone and Mineral Metabolism,Estimating Time-To-Compromise for Industrial C...
11654,2022-10-13,2023-02-22,Therapeutic Innovation & Regulatory Science,Estimating Time-To-Compromise for Industrial C...


In [None]:
# Save the table. The output folder is created first so the write cannot fail
# on a missing directory after a long scrape. index=False leaves the index
# (the article position) out of the file.
from pathlib import Path

Path("data").mkdir(parents=True, exist_ok=True)
dates_df.to_csv(f"data/dates_df_{max_pages}_{journal_selection}_{extra_info}.csv", index=False)

In [None]:
# # convert received list to dataframe
# df_received = pd.DataFrame(all_dates["Received"], columns=["Received"])
# df_received["Received"] = pd.to_datetime(df_received["Received"])
# df_received.head()

In [None]:
# # convert accepted list to dataframe (separate since list lengths differ)
# df_accepted = pd.DataFrame(all_dates["Accepted"], columns=["Accepted"])
# df_accepted["Accepted"] = pd.to_datetime(df_accepted["Accepted"])
# df_accepted.head()

In [None]:
# df_received.shape, df_accepted.shape

In [None]:
# # store dfs as csv
# df_received.to_csv(f"data/df_received_{max_pages}_{journal_selection}_{extra_info}.csv", index=False)
# df_accepted.to_csv(f"data/df_accepted_{max_pages}_{journal_selection}_{extra_info}.csv", index=False)