In [1]:
import re
import ssl
import json
from urllib.request import urlopen
from bs4 import BeautifulSoup

# Ignore SSL certificate errors.
# NOTE: with these settings neither the server's hostname nor its certificate
# is validated for any request that is made with `ctx`.
ctx = ssl.create_default_context()
# Order matters: hostname checking has to be switched off first, otherwise
# setting verify_mode to CERT_NONE raises ValueError.
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

### ЭЛЕМЕНТЫ

In [2]:
# Collect the listing-page URL of every author.
#
# Result: author_urls, a list of absolute URLs built from the hrefs found
# on the news index page.

link = 'https://elementy.ru/novosti_nauki'

html = urlopen(link, context=ctx).read()
soup = BeautifulSoup(html, 'html.parser')

# The second 'sublink toggle_body minimize' block is the one whose links are
# used as author pages (presumably the authors menu; verify if the site
# layout changes).
author_menu = soup('div', class_='sublink toggle_body minimize')[1]
tags = author_menu.find_all('a')

# hrefs are site-relative, so prefix them with the host
author_urls = ['https://elementy.ru' + tag['href'] for tag in tags]

In [3]:
# Get the number of listing pages for each author.
#
# Result: auth_pages, a list of (author_url, page_count) tuples in the same
# order as author_urls.

auth_pages = []

# Compiled once for the whole loop: matches hrefs that contain 'page=',
# i.e. the pagination links.
page_link = re.compile('.+?page=.+')

for url in author_urls:
    html = urlopen(url, context=ctx).read()
    soup = BeautifulSoup(html, 'html.parser')
    pages = soup('a', href=page_link)
    try:
        # The text of the second-to-last pagination link is taken as the
        # page count (presumably the last link is a "next" control; verify).
        num = int(pages[-2].get_text())
    except (IndexError, ValueError):
        # IndexError: fewer than two pagination links on the page.
        # ValueError: the link text is not a number.
        # In both cases the author is treated as having a single page.
        # Only these two are caught so that real failures (and Ctrl-C)
        # are no longer silently swallowed.
        num = 1
    auth_pages.append((url, num))

In [10]:
# Build the article-URL index for each sufficiently prolific author.
#
# Results:
#   names - author names, in the order they were scraped
#   d     - maps author name -> list of {'Article URL': <absolute url>} records
# The index is also saved to 'elementy_authors.json'.

names = []
d = {}

for url, pages in auth_pages:
    # Keep only authors with at least 3 pages (~40 articles)
    if pages <= 2:
        continue

    html = urlopen(url, context=ctx).read()
    soup = BeautifulSoup(html, 'html.parser')

    # The author's name is the part of the page <title> that precedes ' •'
    title_text = soup.find('title').get_text(' ', strip=True)
    name = re.search(r'(.+?)(?= •)', title_text).group()
    names.append(name)

    arts = []
    # NOTE(review): pages are requested as ?page=0 .. ?page=pages-1; confirm
    # that this matches the site's page numbering.
    for page_no in range(int(pages)):
        html = urlopen(url + f'?page={page_no}', context=ctx).read()
        soup = BeautifulSoup(html, 'html.parser')

        for block in soup('div', class_='clblock newslist'):
            anchors = block('a', class_='nohover', href=re.compile('/novosti_nauki/[0-9]+/.+'))
            # hrefs are site-relative, so prefix them with the host
            arts.extend({'Article URL': 'https://elementy.ru' + a['href']} for a in anchors)

    d[name] = arts

# Persist the index as JSON
with open('elementy_authors.json', 'w') as f:
    json.dump(d, f, indent=4)

In [21]:
# Retrieve article texts and metadata for chosen authors.
#
# Reads the author -> article-URL index from 'elementy_authors.json',
# downloads every listed article and collects one record per article with
# non-empty text. Results:
#   auth - maps author name -> list of article records
# The collected data is also saved to 'elementy_texts.json'.

auth = dict()

# Load the URL index once; it does not change between authors.
with open('elementy_authors.json', 'r') as f:
    js = json.load(f)

# Bold lead-ins of paragraphs that are skipped as source materials.
# Written as a raw string so `\s` reaches the regex engine without an
# invalid-escape warning; the pattern text itself is unchanged.
pattern = re.compile(r'Источник(и?)|См. также(:)?(\s)?')

for name in names:
    data = list()

    for entry in js[name]:
        url = entry['Article URL']
        html = urlopen(url, context=ctx).read()
        soup = BeautifulSoup(html, 'html.parser')

        meta = soup.find('div', class_='mb itemhead newslist')
        body = soup.find('div', class_='memo')
        if meta is None or body is None:
            # find() returns None when the element is missing; skip such
            # pages instead of aborting the whole run with AttributeError.
            print(f'Skipping {url}: expected article blocks not found')
            continue

        # Metadata
        title = meta.contents[0].get_text()
        date = meta.find('span', class_='date').get_text()
        # Every link in the header except the first and the last one
        topics = ', '.join(topic.get_text() for topic in meta('a')[1:-1])

        # Full text
        parts = []
        for tag in body.contents[:-2]:  # ignore author's signature
            # ignore illustrations, tables, etc.
            if tag.name not in ('p', 'ul'):
                continue
            # ignore source materials
            classes = tag.get('class')
            if classes and classes[0] == 'small':
                continue
            bold = tag.find('b')
            if bold and pattern.match(bold.get_text()):
                continue
            parts.append(tag.get_text(' ', strip=True) + ' ')
        text = ''.join(parts)

        if text:
            # Named `record` so the `d` index built earlier is not clobbered.
            record = {'Author': name, 'Title': title, 'Publication date': date,
                      'Topics': topics, 'Article URL': url, 'Full text': text}
            data.append(record)

    auth[name] = data

# Write the data to a JSON file
with open('elementy_texts.json', 'w') as f:
    json.dump(auth, f, indent=4)