# Utility Functions

In [1]:
# default_exp util

## SHA-1 Digest

Calculate the Base32-encoded SHA-1 digest that is commonly used in WARC files and CDX indexes.

In [2]:
#export
from hashlib import sha1
from base64 import b32encode

def sha1_digest(content: bytes) -> str:
    """Return the Base32-encoded SHA-1 digest of ``content``.

    Args:
        content: Raw bytes to hash.

    Returns:
        The SHA-1 digest of ``content``, Base32-encoded and decoded to an
        ASCII ``str``.
    """
    raw_hash = sha1(content).digest()
    encoded = b32encode(raw_hash)
    return str(encoded, 'ascii')

In [3]:
sha1_digest(b'12345')

'RSZCG7IGPHFIRW3EMTVMMDNJMNCVCOLE'

## Making URLs Pretty

Sometimes I want to return a value that renders as a clickable link in Jupyter but still works as a plain URL string in other environments. Adapted from [here](https://github.com/jupyterlab/jupyterlab/issues/7393).

In [4]:
#export
from dataclasses import dataclass

@dataclass(frozen=True)
class URL:
    """Wrapper around a URL string to provide nice display in IPython environments.

    Instances are immutable (``frozen=True``). ``str(instance)`` returns the
    wrapped string unchanged, while ``_repr_html_`` supplies an HTML anchor
    for front ends that render rich output.
    """

    # The raw URL string being wrapped.
    url: str

    def _repr_html_(self):
        """HTML link to this URL.

        The URL is HTML-escaped before being placed in the ``href`` attribute
        and the link text, so characters such as ``&``, ``"``, ``<`` and ``>``
        cannot break or inject markup.
        """
        # Local import keeps this class self-contained within its cell.
        from html import escape

        safe_url = escape(self.url, quote=True)
        return f'<a href="{safe_url}">{safe_url}</a>'

    def __str__(self):
        """Return the underlying string."""
        return self.url

It displays nicely

In [5]:
url = URL('https://commoncrawl.org/')
url

The repr is usable

In [6]:
repr(url)

"URL(url='https://commoncrawl.org/')"

The string form is what we need

In [7]:
str(url)

'https://commoncrawl.org/'

Or we can extract it

In [8]:
url.url

'https://commoncrawl.org/'

## Session Helpers

Make a session that can run multiple concurrent requests and retry for intermittent failures.

In [9]:
#export
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

def make_session(pool_maxsize, *, total_retries=5, backoff_factor=1, status_forcelist=(500, 504)):
    """Build a ``requests.Session`` with connection pooling and automatic retries.

    The same ``HTTPAdapter`` is mounted for both ``http://`` and ``https://``
    so every request made through the session shares the retry policy.

    Args:
        pool_maxsize: Passed to ``HTTPAdapter`` as ``pool_maxsize``. The
            adapter is created with ``pool_block=True``.
        total_retries: Passed to ``Retry`` as ``total``. Defaults to 5.
        backoff_factor: Passed to ``Retry`` as ``backoff_factor``. Defaults
            to 1.
        status_forcelist: Iterable of HTTP status codes passed to ``Retry``
            as ``status_forcelist``. Defaults to 500 and 504.

    Returns:
        A configured ``requests.Session``.
    """
    retry_strategy = Retry(
        total=total_retries,
        backoff_factor=backoff_factor,
        # Materialise as a set so any iterable of codes is accepted.
        status_forcelist=set(status_forcelist),
    )
    adapter = HTTPAdapter(max_retries=retry_strategy, pool_maxsize=pool_maxsize, pool_block=True)
    session = requests.Session()
    for scheme_prefix in ('http://', 'https://'):
        session.mount(scheme_prefix, adapter)
    return session

## Joblib Helpers

Force a function memoized with `joblib.Memory` to be re-executed instead of returning its cached result.

In [10]:
def _forced(f, force):
    """Forced version of memoized function with Memory"""
    assert hasattr(f, 'call')
    if not force:
        return f
    def result(*args, **kwargs):
        # Force returns a tuple of result,metadata
        return f.call(*args, **kwargs)[0]
    return result