In [7]:
# Root address of the Ukrainian poetry website being scraped
url_base = 'https://onlyart.org.ua'

# Paths of the three main categories we intend to scrape (appended to url_base)
url_classics = '/ukrainian-poets/'
url_modern = '/modern-ukrainian-poets/'
url_live = '/live/'

In [68]:
"""
The general strategy is to scrape all three sections (the 'classics' and 
'modern' sections look nearly identical, the 'live' section will require a slightly
different approach with pagination support, different author parsing, etc.)

Resulting data is stored as JSON like so:

{
    poet_name: {
        poem_title: poem_text,
        ...
    }
    ...
}

"""

import requests
from bs4 import BeautifulSoup as bs
import re
import json

from pprint import pprint as pp


def run_request(url, timeout=30):
    """Fetch `url` and return its HTML parsed into a BeautifulSoup object.

    Every call performs a live HTTP GET against the target site, so use
    sparingly.

    Args:
        url: Absolute URL of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout `requests` waits indefinitely, so one stalled connection
            would hang an entire scraping run.

    Returns:
        A BeautifulSoup tree built from the response text with the built-in
        'html.parser'.

    Raises:
        requests.exceptions.RequestException: if the connection fails or the
            timeout expires.
    """

    # Present ourselves as a regular browser; per the original author's note
    # this helps avoid HTTP 424 responses from the site.
    headers = {
        'User-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0.3 Safari/605.1.15',
        'Referer': 'https://onlyart.org.ua/',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        # 'br' is deliberately not advertised: Brotli responses are only
        # decoded when an optional brotli package is installed, otherwise
        # r.text would contain undecoded data.
        'Accept-Encoding': 'gzip, deflate',
        'Host': 'onlyart.org.ua',
        'Connection': 'keep-alive'
    }

    # Run request
    r = requests.get(url, headers=headers, timeout=timeout)
    soup = bs(r.text, 'html.parser')

    # Return a BS4 object
    return soup


def get_poet_urls(url):
    """Scraping step 1: collect the page URL of every poet in a section.

    base_url -> {poet_name: poet_url}

    Args:
        url: URL of a section index page.

    Returns:
        dict mapping each poet's name (the text of the entry's first link)
        to that link's href. Empty when the page has no element with
        id 'posts'. Entries without a link are skipped.
    """

    # Parse and target 'posts' section
    soup = run_request(url)
    posts = soup.find(id='posts')

    poet_urls = {}

    if posts is None:
        # No listing on this page: nothing to collect.
        return poet_urls

    # Walk direct child *tags* only; bare strings between them (whitespace,
    # newlines) carry no links and have no `.a` attribute.
    for post in posts.find_all(True, recursive=False):
        anchor = post.a
        if anchor is None:
            # Entry without a link. Skip it instead of falling through and
            # re-recording the previous entry's name/link.
            continue

        link = anchor.get('href')
        if not link:
            continue

        # get_text() merges nested tags, so the name is never None
        # (`.string` is None when the anchor has more than one child).
        poet_urls[anchor.get_text()] = link

    # Return populated dict
    return poet_urls


def get_poem_urls(url):
    """Scraping step 2: collect poem page URLs from a poet's page.

    poet_url -> {poem_name: poem_url}

    Args:
        url: URL of a poet's page.

    Returns:
        dict mapping the text of every link found inside elements with
        class 'entry' to that link's href. Links sharing the same text
        overwrite each other (the last one wins). Anchors without an href
        are skipped.
    """

    poems = {}

    soup = run_request(url)

    for entry in soup.find_all(class_='entry'):
        for tag in entry.find_all('a'):
            link = tag.get('href')
            if link is None:
                # An anchor with no href cannot point at a poem page;
                # indexing tag['href'] here would raise KeyError.
                continue

            # .text concatenates every string inside the anchor, so titles
            # wrapped in nested tags are still captured in full.
            poems[tag.text] = link

    return poems
    

def get_poem_text(url):
    """Scraping step 3: download a poem page and return its cleaned text.

    TODO: Separate titles and subtitles

    Args:
        url: URL of a poem page.

    Returns:
        The text of the page's <article> element with the ad snippet and
        every pair of consecutive newlines removed.

    Raises:
        AttributeError: if the page has no <article> element.
    """

    # A tuple, not a set: the two replacements interact (removing the ad
    # snippet can leave two newlines adjacent), and set iteration order for
    # strings varies between interpreter runs. The ad snippet is stripped
    # first so the newline pass also cleans up what it leaves behind.
    stop = (
        '(adsbygoogle=window.adsbygoogle||[]).push({});',
        '\n\n',
    )

    # Fetch text
    soup = run_request(url)
    poem_text = soup.article.text

    # Remove stopwords, in the fixed order above
    for word in stop:
        poem_text = poem_text.replace(word, '')

    return poem_text
    
    
def scrape_section(url):
    """Scrape an entire section: every poet, and every poem of each poet.

    WARNING: Takes a very long time and puts strain on target server.
    Use with caution.

    Args:
        url: URL of a section index page.

    Returns:
        dict of the form {poet_name: {poem_title: poem_text}}. A poem whose
        page could not be fetched or parsed gets the placeholder
        "#ERROR at:<poem_url>" as its text, so failures are visible in the
        result instead of silently leaving the raw URL in place.
    """

    section = {}

    for poet, poet_url in get_poet_urls(url).items():
        poems = {}

        for title, poem_url in get_poem_urls(poet_url).items():
            # A small number of poems will fail to fetch; record a marker
            # and carry on. `except Exception` (rather than a bare except)
            # keeps Ctrl-C / kernel interrupt working during this long run.
            try:
                poems[title] = get_poem_text(poem_url)
            except Exception:
                poems[title] = f"#ERROR at:{poem_url}"

        section[poet] = poems

    return section


In [63]:
# Scrape the 'classics' section.
# Note: scrape_section visits every poet and poem page, so this cell is slow.
url = f"{url_base}{url_classics}"
poets = scrape_section(url)

In [67]:
# Persist the scraped data: serialize it to a JSON string, then write
# that string to poets.json in the working directory.
s = json.dumps(poets)

with open('poets.json', mode='w+') as f:
    f.write(s)

[1]


In [6]:
import pyphen 

# Build a Pyphen hyphenator for the 'uk_UA' language code
dic = pyphen.Pyphen(lang='uk_UA')
# Cell output: the word with '-' inserted at each hyphenation point pyphen finds
dic.inserted('збереження')

'збе-ре-же-н-ня'