In [None]:
%pip install --upgrade beautifulsoup4

In [None]:
%pip install --upgrade tinydb

In [None]:
%pip install --upgrade ipywidgets

Define a function that retrieves one page of the filtered article listing.

In [None]:
import requests

# Headers passed to the `requests` calls in this notebook. The User-Agent is a
# desktop Firefox (Ubuntu) identification string.
# NOTE(review): presumably used because the site treats the default
# python-requests agent differently from a browser — confirm.
HTTP_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:133.0) Gecko/20100101 Firefox/133.0'
}

def retrieve_page(page_number, start_date='2024-10-01', timeout=30):
    """Fetch one page of the filtered paper listing from midasnetwork.us.

    Posts the ``filter_papers`` action to the site's ``admin-ajax.php``
    endpoint and returns the raw response.

    Args:
        page_number: Index of the listing page to fetch (the notebook requests
            the first page as ``1``).
        start_date: Value sent as the ``startdate`` filter, formatted as
            ``YYYY-MM-DD``. Defaults to the date the notebook originally
            hard-coded.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The ``requests.Response`` for the listing page; ``.text`` holds the
        HTML fragment.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    url = 'https://midasnetwork.us/wp-admin/admin-ajax.php'

    form = {
        'action': 'filter_papers',
        'paged': str(page_number),
        'journal': '',
        'author': '',
        'title': '',
        'startdate': start_date,
        'enddate': '',
        'displaydefault': '',
    }

    response = requests.post(url, data=form, headers=HTTP_HEADERS, timeout=timeout)
    # Fail loudly here instead of letting an error page be parsed as a listing.
    response.raise_for_status()
    return response

Retrieve the first page of the paginated list of filtered articles. Determine the number of pages to process.

In [None]:
from bs4 import BeautifulSoup

# Fetch the first listing page; its pagination links reveal how many pages exist.
page = retrieve_page(1)

soup = BeautifulSoup(page.text, 'html.parser')

# Filter on the CSS class explicitly via `class_` (an attrs argument must be a
# dict such as {'class': ...}; a set literal only matches by accident).
pages = soup.find_all('a', class_='page-numbers')

# Keep only the links whose label is a number, so navigation links such as
# "next" are ignored regardless of where they appear in the list.
page_labels = (anchor.get_text(strip=True).replace(',', '') for anchor in pages)
page_numbers = [int(label) for label in page_labels if label.isdigit()]

# If no numbered links are present (e.g. all results fit on one page), there is
# exactly one page to process.
last_page = max(page_numbers, default=1)

print(f'pages to process: {last_page}')

Create a list of links to article detail pages by iterating over the pages of filtered articles.

In [None]:
# Collect the detail-page URL of every article across all listing pages.
# Every page (including the first) is fetched here so that the cell does not
# depend on a `soup` variable left behind by another cell; this makes it safe
# to re-run at the cost of one extra request.
article_links = []

for page_number in range(1, last_page + 1):
    print(f'retrieving page {page_number}')
    listing_page = retrieve_page(page_number)
    listing_soup = BeautifulSoup(listing_page.text, 'html.parser')

    print(f'processing page {page_number} of {last_page}')

    for entry in listing_soup.find_all('article'):
        # The first anchor inside each <article> is taken as the detail link.
        anchor = entry.find('a')
        href = anchor.get('href') if anchor is not None else None

        # Skip entries without a usable link instead of storing None.
        if href:
            article_links.append(href)

Load the details of each article by fetching and parsing every page in the list of article detail links.

In [None]:
import json

from ipywidgets import IntProgress
from IPython.display import display

def _parse_article(html):
    """Build an article record from the HTML of an article detail page.

    The first four ``<p class="elementor-heading-title">`` elements are read,
    in order, as title, abstract, journal and reference; the second anchor
    inside the reference paragraph supplies the article link.

    Raises:
        ValueError: If the page does not contain the expected elements.
    """
    detail_soup = BeautifulSoup(html, 'html.parser')
    fields = detail_soup.find_all('p', class_='elementor-heading-title')

    if len(fields) < 4:
        raise ValueError(f'expected 4 heading paragraphs, found {len(fields)}')

    title, abstract, journal, reference = fields[:4]

    anchors = reference.find_all('a')
    if len(anchors) < 2:
        raise ValueError(f'expected 2 reference links, found {len(anchors)}')

    return {
        'title': title.text,
        'abstract': abstract.text,
        'journal': journal.text,
        'reference': reference.text,
        'link': anchors[1].get('href'),
    }


progress_bar = IntProgress(min=0, max=len(article_links)) # instantiate a progress bar
display(progress_bar)

articles = []
failed_links = []  # detail pages that could not be fetched or parsed
print(f'procesing {len(article_links)} articles')

for link in article_links:
    try:
        resp = requests.get(link, headers=HTTP_HEADERS, timeout=30)
        resp.raise_for_status()
        articles.append(_parse_article(resp.text))
    except (requests.RequestException, ValueError) as error:
        # One broken page should not discard everything gathered so far.
        failed_links.append(link)
        print(f'skipping {link}: {error}')
    finally:
        # Advance for failures too, so the bar always reaches its maximum.
        progress_bar.value += 1

if failed_links:
    print(f'{len(failed_links)} of {len(article_links)} articles could not be loaded')

Store the article details in a local NoSQL (JSON) database for later re-use.

In [None]:
from tinydb import TinyDB, Query

# Persist the scraped articles in the 'articles' table of a local TinyDB file.
# `db` and `table` are left open at module level so later cells can query them.
db = TinyDB('db.json')
table = db.table('articles')

# Links already present in the table; used to keep the cell idempotent so that
# re-running it does not store the same article twice.
stored_links = {doc.get('link') for doc in table.all()}

new_articles = []
for article in articles:
    link = article.get('link')
    if link and link in stored_links:
        continue
    if link:
        stored_links.add(link)
    new_articles.append(article)

# A single bulk insert writes the JSON file once instead of once per article.
if new_articles:
    table.insert_multiple(new_articles)

print(f'{len(table)} articles stored (total)')