In [11]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [12]:
# default_exp runners

# Process

In [13]:
# export
from __future__ import annotations
from dataclasses import dataclass
from typing import Callable



In [14]:
# export
import logging
from tqdm.auto import tqdm

@dataclass
class ProcessMemory:
    """Lazy in-memory pipeline: query -> filter -> fetch -> transform.

    ``run()`` wires the stages together and returns a generator, so nothing is
    executed until the result is iterated.
    """
    # Objects exposing a ``query()`` method that returns an iterable of records.
    queries: list[Callable]
    # Callables invoked as ``step(content, record)``, in order; the return value
    # of one step is the ``content`` passed to the next.
    steps: list[Callable]
    # Callable that receives the iterable of queried records and returns the
    # records to keep.
    filter: Callable
    # When False, every tqdm bar is created with ``disable=True``.
    progress_bar: bool = True

    def _progress(self, iterable, desc):
        """Wrap ``iterable`` in a tqdm bar labelled ``desc`` (honours ``progress_bar``)."""
        return tqdm(iterable, desc=desc, disable=not self.progress_bar)

    def query(self):
        """Yield every record produced by each configured query, in order."""
        for query in self._progress(self.queries, 'query'):
            yield from query.query()

    def prepare(self, records):
        """Return whatever ``self.filter`` returns for the given records."""
        return self.filter(self._progress(records, 'filter'))

    def fetch(self, records):
        """Yield ``(record.content, record)`` pairs for the given records."""
        for record in self._progress(records, 'fetch'):
            yield (record.content, record)

    def transform(self, content_records):
        """Apply every step to each ``(content, record)`` pair and yield the results.

        If a step raises, the error is logged, the remaining steps are skipped
        and nothing is yielded for that record; processing then continues with
        the next record.
        """
        for content, record in self._progress(content_records, 'transform'):
            for step in self.steps:
                try:
                    content = step(content, record)
                except Exception as e:
                    # Not every callable has ``__name__`` (functools.partial,
                    # callable instances); fall back to repr so that logging the
                    # failure can never itself raise and abort the whole run.
                    step_name = getattr(step, '__name__', None) or repr(step)
                    logging.error('Error processing %s at step %s: %s', record, step_name, e)
                    break
            else:
                # Only reached when no step broke out of the loop.
                yield content

    def run(self):
        """Chain all stages and return the (lazy) generator of transformed content."""
        records = self.prepare(self.query())
        content_records = self.fetch(records)
        return self.transform(content_records)

## Simple test process

In practice we'd use something like parsel, BeautifulSoup or selectolax.
However, for a simple demo we use the standard library's `html.parser`, which has no external dependencies.

In [15]:
from html.parser import HTMLParser

class SkeptricHTMLParser(HTMLParser):
    """Pull the title and date text out of a page while it is being parsed.

    After ``feed()``, ``extract`` maps ``'title'`` / ``'date'`` to the text seen
    inside elements carrying the corresponding CSS class. If several text
    chunks arrive for the same field, the last one wins.
    """
    # CSS class token on an element -> key used in ``extract`` for its text.
    # Checked in this order, so a later entry wins if one element has both.
    FIELD_BY_CLASS = {'post-full-title': 'title', 'byline-meta-date': 'date'}

    def __init__(self):
        super().__init__()
        self.extract = {}
        # Field the next text chunk belongs to, or None when outside a target.
        self.field = None

    def handle_starttag(self, tag, attrs):
        # ``class`` is a whitespace separated token list and may be valueless
        # (None); match on tokens so ``class="post-full-title big"`` also counts.
        class_tokens = (dict(attrs).get('class') or '').split()
        for css_class, field in self.FIELD_BY_CLASS.items():
            if css_class in class_tokens:
                self.field = field

    def handle_endtag(self, tag):
        # Any closing tag ends the current capture.
        self.field = None

    def handle_data(self, data):
        if self.field is not None:
            self.extract[self.field] = data

def skeptric_filter(records):
    """Return a list of the records with ``mime == 'text/html'`` and ``status == 200``."""
    kept = []
    for candidate in records:
        # ``status`` is only inspected for HTML records, as in a short-circuit ``and``.
        if candidate.mime == 'text/html':
            if candidate.status == 200:
                kept.append(candidate)
    return kept
            
def skeptric_extract(content, metadata):
    """Parse UTF-8 encoded HTML bytes and return the extracted fields as a dict.

    The dict is the parser's ``extract`` mapping with ``'url'`` and
    ``'timestamp'`` added from ``metadata``.
    """
    extractor = SkeptricHTMLParser()
    extractor.feed(content.decode('utf-8'))
    fields = extractor.extract
    fields['url'] = metadata.url
    fields['timestamp'] = metadata.timestamp
    return fields

def skeptric_verify_extract(content, metadata):
    """Return ``content`` unchanged, or raise ``ValueError`` if title/date is missing or empty.

    ``title`` is checked before ``date``; ``metadata`` is not used.
    """
    required = (
        ('title', 'Missing title'),
        ('date', 'Missing date'),
    )
    for key, message in required:
        if not content.get(key):
            raise ValueError(message)
    return content

from datetime import datetime
def skeptric_normalise(content, metadata):
    """Return a shallow copy of ``content`` with ``'date'`` parsed into a datetime.

    The date string must match ``'%d %B %Y'`` (e.g. ``'23 November 2021'``);
    ``strptime`` raises ``ValueError`` otherwise. The input dict is not
    modified and ``metadata`` is not used.
    """
    # Imported locally under a private alias: the notebook-global name
    # ``datetime`` is not stable, because a later cell runs ``import datetime``,
    # which rebinds it from the class to the module and would break ``strptime``.
    from datetime import datetime as _datetime

    normalised = content.copy()
    normalised['date'] = _datetime.strptime(content['date'], '%d %B %Y')
    return normalised

from webrefine.query import WarcFileQuery
# Path to the WARC file used as test input. It is relative, so it resolves
# against the notebook's working directory.
test_data = '../resources/test/skeptric.warc.gz'

# Query over that single file; presumably yields one record per capture in the
# archive (see webrefine.query.WarcFileQuery to confirm).
skeptric_query = WarcFileQuery(test_data)

In [16]:
# Pipeline for the demo: keep matching records, then extract -> verify -> normalise.
skeptric_process = ProcessMemory(
    queries=[skeptric_query],
    steps=[skeptric_extract, skeptric_verify_extract, skeptric_normalise],
    filter=skeptric_filter,
)

In [17]:
# run() returns a lazy generator, so list() is what actually drives the pipeline.
# Records whose steps raise are logged as errors and left out of the result.
list(skeptric_process.run())

filter: 0it [00:00, ?it/s]

query:   0%|          | 0/1 [00:00<?, ?it/s]

transform: 0it [00:00, ?it/s]

fetch:   0%|          | 0/8 [00:00<?, ?it/s]

ERROR:root:Error processing WarcFileRecord(url='https://skeptric.com/', timestamp=datetime.datetime(2021, 11, 26, 11, 28, 36), mime='text/html', status=200, path=PosixPath('../resources/test/skeptric.warc.gz'), offset=17122, digest='JJVB3MQERHRZJCHOJNKS5VDOODXPZAV2') at step skeptric_verify_extract: Missing title
ERROR:root:Error processing WarcFileRecord(url='https://skeptric.com/tags/data/', timestamp=datetime.datetime(2021, 11, 26, 11, 28, 38), mime='text/html', status=200, path=PosixPath('../resources/test/skeptric.warc.gz'), offset=130269, digest='R7CLAACFU5L7T5LKI5G53RZSMCNUNV6F') at step skeptric_verify_extract: Missing title


[{'title': "Pagination in Internet Archive's Wayback Machine with CDX",
  'date': datetime.datetime(2021, 11, 23, 0, 0),
  'url': 'https://skeptric.com/pagination-wayback-cdx/',
  'timestamp': datetime.datetime(2021, 11, 26, 11, 28, 34)},
 {'title': 'About Skeptric',
  'date': datetime.datetime(2021, 10, 18, 0, 0),
  'url': 'https://skeptric.com/about/',
  'timestamp': datetime.datetime(2021, 11, 26, 11, 28, 37)},
 {'title': 'Searching 100 Billion Webpages Pages With Capture Index',
  'date': datetime.datetime(2020, 6, 11, 0, 0),
  'url': 'https://skeptric.com/searching-100b-pages-cdx/',
  'timestamp': datetime.datetime(2021, 11, 26, 11, 28, 39)},
 {'title': 'Fast Web Dataset Extraction Worfklow',
  'date': datetime.datetime(2021, 11, 21, 0, 0),
  'url': 'https://skeptric.com/fast-web-data-workflow/',
  'timestamp': datetime.datetime(2021, 11, 26, 11, 28, 39)},
 {'title': 'Unique Key for Web Captures',
  'date': datetime.datetime(2021, 11, 19, 0, 0),
  'url': 'https://skeptric.com/key-

In [18]:
# progress_bar is read each time a stage creates its tqdm bar, so changing it on
# the existing instance disables the bars for subsequent runs.
skeptric_process.progress_bar = False

In [19]:
# Run the pipeline again with the progress bars disabled; only the logged errors
# and the results are shown.
list(skeptric_process.run())

ERROR:root:Error processing WarcFileRecord(url='https://skeptric.com/', timestamp=datetime.datetime(2021, 11, 26, 11, 28, 36), mime='text/html', status=200, path=PosixPath('../resources/test/skeptric.warc.gz'), offset=17122, digest='JJVB3MQERHRZJCHOJNKS5VDOODXPZAV2') at step skeptric_verify_extract: Missing title
ERROR:root:Error processing WarcFileRecord(url='https://skeptric.com/tags/data/', timestamp=datetime.datetime(2021, 11, 26, 11, 28, 38), mime='text/html', status=200, path=PosixPath('../resources/test/skeptric.warc.gz'), offset=130269, digest='R7CLAACFU5L7T5LKI5G53RZSMCNUNV6F') at step skeptric_verify_extract: Missing title


[{'title': "Pagination in Internet Archive's Wayback Machine with CDX",
  'date': datetime.datetime(2021, 11, 23, 0, 0),
  'url': 'https://skeptric.com/pagination-wayback-cdx/',
  'timestamp': datetime.datetime(2021, 11, 26, 11, 28, 34)},
 {'title': 'About Skeptric',
  'date': datetime.datetime(2021, 10, 18, 0, 0),
  'url': 'https://skeptric.com/about/',
  'timestamp': datetime.datetime(2021, 11, 26, 11, 28, 37)},
 {'title': 'Searching 100 Billion Webpages Pages With Capture Index',
  'date': datetime.datetime(2020, 6, 11, 0, 0),
  'url': 'https://skeptric.com/searching-100b-pages-cdx/',
  'timestamp': datetime.datetime(2021, 11, 26, 11, 28, 39)},
 {'title': 'Fast Web Dataset Extraction Worfklow',
  'date': datetime.datetime(2021, 11, 21, 0, 0),
  'url': 'https://skeptric.com/fast-web-data-workflow/',
  'timestamp': datetime.datetime(2021, 11, 26, 11, 28, 39)},
 {'title': 'Unique Key for Web Captures',
  'date': datetime.datetime(2021, 11, 19, 0, 0),
  'url': 'https://skeptric.com/key-

We can always look up the record behind an error by evaluating the repr printed in the log message.

It would be nicer if every field in the repr were a plain string, so that we didn't have to handle the imports...

In [20]:
# Imports required to evaluate the record repr copied from the error log.
from webrefine.query import WarcFileRecord
# NOTE: this rebinds the notebook-global name `datetime` to the module; an
# earlier cell bound it to the class via `from datetime import datetime`.
import datetime
from pathlib import PosixPath
# The repr pasted verbatim from the first logged error.
WarcFileRecord(url='https://skeptric.com/', timestamp=datetime.datetime(2021, 11, 26, 11, 28, 36), mime='text/html', status=200, path=PosixPath('../resources/test/skeptric.warc.gz'), offset=17122, digest='JJVB3MQERHRZJCHOJNKS5VDOODXPZAV2')

WarcFileRecord(url='https://skeptric.com/', timestamp=datetime.datetime(2021, 11, 26, 11, 28, 36), mime='text/html', status=200, path=PosixPath('../resources/test/skeptric.warc.gz'), offset=17122, digest='JJVB3MQERHRZJCHOJNKS5VDOODXPZAV2')