In [1]:
import datetime
import functools
import operator
import random
import re
from itertools import zip_longest

import orchest
import pandas as pd
from bs4 import BeautifulSoup

from utils import fetch_urls_in_parallel

In [2]:
df = pd.DataFrame([], columns=["Time", "Count"])
p_count = 0
log_path = "/data/log-scrape-pipeline.csv"


def log_count(count):
    global df
    df = df.append({"Time": datetime.datetime.now(), "Count": count}, ignore_index=True)
    df.to_csv(log_path)
    
log_count(0)

### Scraping

In [3]:
domains = orchest.get_inputs()["domains"]

In [4]:
def contains_skip_chars(string):
    return any(elem in string for elem in ["›"])

In [5]:
def clean_text(text):
    return re.sub(
        " +", " ", text.replace("\r\n", " ").replace("\t", " ").replace("\n", " ")
    ).strip()

In [6]:
def extract_paragraphs(html):
    soup = BeautifulSoup(html, features="html.parser")
    return [
        clean_text(p.text)
        for p in soup.find_all("p")
        if len(p.text) > 25 and not contains_skip_chars(p.text)
    ]

In [7]:
def grouper(n, iterable, fillvalue=None):
    args = [iter(iterable)] * n
    return zip_longest(fillvalue=fillvalue, *args)

In [8]:
def get_paragraphs(urls):
    global p_count

    # Fetch all URLs

    # Parse urls
    ps = []

    for _urls in grouper(10, urls):
        _urls = [i for i in list(_urls) if i]

        resps = fetch_urls_in_parallel(_urls, 10)

        _ps = [extract_paragraphs(resp.content) for resp in resps if resp is not None]
        _ps = functools.reduce(operator.iconcat, _ps, [])

        ps += _ps
    
        p_count += len(_ps)
        log_count(p_count)

    return ps

In [9]:
ps_nl = get_paragraphs(domains["nl"])

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


In [10]:
ps_nl[:1]

['Staat er op Wikipedia een artikel zonder portretfoto? En heeft u een foto die bij een artikel zou passen? Stel dan uw foto hier ter beschikking en de vrijwilligers van Wikipedia doen de rest.']

In [11]:
ps_de = get_paragraphs(domains["de"])

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


In [12]:
ps_de[:1]

['Ausschnitt TK 25 NRWWanderkarte Bergisches Land 1: 25.000, Karte 5: Süden; Das Bergische 2015. ISBN 9783936405880']

In [13]:
print("Found %d Dutch sentences." % len(ps_nl))
print("Found %d German sentences." % len(ps_de))

Found 811 Dutch sentences.
Found 2881 German sentences.


In [14]:
min_size = min(len(ps_nl), len(ps_de))

In [15]:
print("Equal sizing datasets: %d " % min_size)

Equal sizing datasets: 811 


In [16]:
orchest.output(
    {
        "nl": random.sample(ps_nl, min_size),
        "de": random.sample(ps_de, min_size),
    },
    name="sentence_data",
)