In [None]:
# default_exp cleaner

In [None]:
# hide
from nbdev.showdoc import *

# Cleaner

> The cleaners are the last, but not least, blocks of web2dataset. Their goal is to purge and clean the dataset.

Examples of cleaners (not yet implemented):

* delete duplicates (based on hash)
* delete images with low resolution
* ethics-based purger (how to?)

There are two kinds of cleaners: those that work on metadata and are called before downloading the images, and those that work on the images and are called after the download.


In [None]:
# export
from functools import wraps
from typing import List

from web2dataset.document import Document

# MetaDataCleaner

In [None]:
# export
class MetaDataCleanerError(ValueError):
    """Signal that a metadata cleaner violated its contract.

    It is a ``ValueError`` subclass, so existing ``except ValueError`` handlers
    also catch it.
    """

A cleaner should delete docs, not create them, so this wrapper verifies that no new docs were created.

In [None]:
# export
def check_no_docs_creation(f):
    """Guard a cleaner's ``clean`` method against returning extra documents.

    Args:
        f: The method to wrap; it is invoked as ``f(self, docs)``.

    Returns:
        A wrapper with the same call signature. It forwards to ``f`` and hands
        back its result untouched when the size check passes.

    Raises:
        MetaDataCleanerError: Raised by the wrapper when the result of ``f``
            is longer than the ``docs`` it was given.
    """

    @wraps(f)
    def wrapper(self, docs: List[Document]) -> List[Document]:
        cleaned = f(self, docs)
        n_after = len(cleaned)
        n_before = len(docs)
        # Same size or fewer docs is the expected outcome: return right away.
        if n_after <= n_before:
            return cleaned
        raise MetaDataCleanerError(
            f"the cleaner should not create more docs than originaly. There were before {n_before} docs and there are now {n_after} docs"
        )

    return wrapper

Here is the abstract class for the metadata cleaner. It only operates on documents, not on images.

In [None]:
# export
class MetaDataCleaner:
    """Base class for cleaners that operate on documents only, not on images.

    Subclasses override :meth:`clean` and return the documents to keep.
    """

    @check_no_docs_creation
    def clean(self, docs: List[Document]) -> List[Document]:
        """Return the documents to keep; subclasses must override this.

        Args:
            docs: Documents to filter.

        Returns:
            The retained documents (never more than were passed in).

        Raises:
            NotImplementedError: Always, when called on the base class.
        """
        # Raise explicitly: an empty body would return None, and the
        # check_no_docs_creation wrapper would then fail with an opaque
        # TypeError from len(None).
        raise NotImplementedError(f"{type(self).__name__} must implement clean()")

Here is a basic cleaner that is mainly used for testing.

In [None]:
# export
class IdentityCleaner(MetaDataCleaner):
    """Cleaner that keeps every document; mainly useful for testing.

    It derives from ``MetaDataCleaner`` so that it can be used wherever a
    metadata cleaner is expected, like the other cleaners in this module.
    """

    @check_no_docs_creation
    def clean(self, docs: List[Document]) -> List[Document]:
        """Return ``docs`` unchanged (the very same list object).

        Args:
            docs: Documents to "clean".

        Returns:
            The input list itself, without copying or filtering.
        """
        return docs

In [None]:
# Sample documents: the first two share an image URL, the third one differs.
docs = [
    Document(origin="", image_url=image_url)
    for image_url in (
        "https://image/bike",
        "https://image/bike",
        "https://image/bmx",
    )
]

In [None]:
# Run the identity cleaner: the docs come back unchanged.
cleaner = IdentityCleaner()
docs = cleaner.clean(docs)

## Duplicate cleaner

This metadata-level `DuplicateCleaner` deletes any duplicate, i.e. documents with the same image source, to avoid downloading the same image twice.
It is different from the ImageDuplicateCleaner, which will delete identical images after the download; these images could come from two different sources but still be the same.

In [None]:
# export
class DuplicateCleaner(MetaDataCleaner):
    """Metadata cleaner that keeps a single document per ``image_url``."""

    @check_no_docs_creation
    def clean(self, docs: List[Document]) -> List[Document]:
        """Collapse documents that share an ``image_url`` into one.

        Args:
            docs: Documents to de-duplicate.

        Returns:
            A new list with one document per distinct ``image_url``. When
            several documents share a URL, the last of them is the one
            returned, placed where that URL first appeared.
        """
        unique_by_url = {}
        for doc in docs:
            # Re-assigning an existing key keeps its position in the dict but
            # swaps in the later document.
            unique_by_url[doc.image_url] = doc
        return [*unique_by_url.values()]

In [None]:
# Rebuild the sample documents: three docs, but only two distinct image URLs.
docs = [
    Document(origin="", image_url=image_url)
    for image_url in (
        "https://image/bike",
        "https://image/bike",
        "https://image/bmx",
    )
]

In [None]:
# Collect the image URLs, then compare the total count with the distinct count.
url = [doc.image_url for doc in docs]
# Bare expression: the notebook displays this tuple as the cell output.
len(url), len(set(url))

(3, 2)

As we can see, this list of docs contains 3 URLs, but only two of them are different. Let's fix it.

In [None]:
# Instantiate the cleaner that de-duplicates docs by image URL.
cleaner = DuplicateCleaner()

In [None]:
# Clean into a new name so the original `docs` list stays available.
docs2 = cleaner.clean(docs)
url = [doc.image_url for doc in docs2]

# After cleaning, every remaining image URL must be unique.
assert len(url) == len(set(url))

# ImageCleaner

In [None]:
# export
class ImageCleaner:
    """Placeholder for cleaners that work on images rather than on metadata.

    Only the constructor does anything so far; ``clean`` is a no-op.
    """

    def __init__(self, path: str):
        """Store ``path`` on the instance.

        Args:
            path: Kept as-is in ``self.path``.
        """
        # NOTE(review): presumably the location of the downloaded images — confirm.
        self.path = path

    def clean(self):
        """Do nothing and return ``None`` (cleaning is not implemented yet)."""
        return None