diff --git a/README.md b/README.md index 275a3bc..39fd062 100644 --- a/README.md +++ b/README.md @@ -25,6 +25,7 @@ Example of datasets to download with example commands are available in the [data * [laion5B](dataset_examples/laion5B.md) 5B image/text pairs that can be downloaded in 7 days using 10 nodes * [laion-aesthetic](dataset_examples/laion-aesthetic.md) Laion aesthetic is a 120M laion5B subset with aesthetic > 7 pwatermark < 0.8 punsafe < 0.5 * [laion-art](dataset_examples/laion-art.md) Laion aesthetic is a 8M laion5B subset with aesthetic > 8 pwatermark < 0.8 punsafe < 0.5 +* [laion-art-noai](dataset_examples/laion-art-noai.md) As above, but respecting requests made by artists to exclude their work from generative AI training sets. * [laion-high-resolution](dataset_examples/laion-high-resolution.md) Laion high resolution is a 170M resolution >= 1024x1024 subset of laion5B * [laion-face](dataset_examples/laion-face.md) Laion face is the human face subset of LAION-400M for large-scale face pretraining. It has 50M image-text pairs. @@ -146,6 +147,8 @@ This module exposes a single function `download` which takes the same arguments * **max_aspect_ratio** maximum aspect ratio of the image to download (default *inf*) * **incremental_mode** Can be "incremental" or "overwrite". For "incremental", img2dataset will download all the shards that were not downloaded, for "overwrite" img2dataset will delete recursively the output folder then start from zero (default *incremental*) * **max_shard_retry** Number of time to retry failed shards at the end (default *1*) +* **user_agent_token** Additional identifying token that will be added to the User-Agent header sent with HTTP requests to download images; for example: "img2downloader". 
(default *None*) +* **disallowed_header_directives** List of X-Robots-Tag header directives that, if present in HTTP response when downloading an image, will cause the image to be excluded from the output dataset; for example: '["noai", "noindex"]' (default *None*) ## Incremental mode @@ -191,6 +194,13 @@ If needed, you can use: When filtering data, it is recommended to pre-shuffle your dataset to limit the impact on shard size distribution. +## Respecting opt-out directives + +Copyright holders can communicate image usage restrictions by sending `X-Robots-Tag: noai` or `X-Robots-Tag: noindex` HTTP header directives when images are downloaded. + +To respect such directives, you can use: +* --disallowed_header_directives '["noai", "noindex"]' : to filter out images with these directives + ## How to tweak the options The default values should be good enough for small sized dataset. For larger ones, these tips may help you get the best performance: diff --git a/dataset_examples/laion-art-noai.md b/dataset_examples/laion-art-noai.md new file mode 100644 index 0000000..c6b29b9 --- /dev/null +++ b/dataset_examples/laion-art-noai.md @@ -0,0 +1,31 @@ +## Laion-art-noai + +Laion art is an 8M samples laion5B subset with aesthetic > 8 pwatermark < 0.8 punsafe < 0.5 +See [full description](https://github.com/LAION-AI/laion-datasets/blob/main/laion-aesthetic.md) + +It is available at https://huggingface.co/datasets/laion/laion-art + +A good use case is to train an image generation model. However, concerns have been raised about how to ethically source training data at scale for such purposes, especially where creator consent may not have been expressed in advance of dataset construction. One solution is to allow artists to opt their images out of such usage by sending HTTP header directives when image content is downloaded. img2dataset can be configured to automatically respect such directives. 
+ +### Download the metadata + +Download from +[https://huggingface.co/datasets/laion/laion-art](https://huggingface.co/datasets/laion/laion-art) + + +``` +wget https://huggingface.co/datasets/laion/laion-art/resolve/main/laion-art.parquet +``` + +### Download the images with img2dataset, respecting noai and noindex directives + +``` +img2dataset --url_list laion-art --input_format "parquet"\ + --url_col "URL" --caption_col "TEXT" --output_format webdataset\ + --output_folder laion-art --processes_count 16 --thread_count 64 --image_size 384\ + --resize_only_if_bigger=True --resize_mode="keep_ratio" --skip_reencode=True \ + --save_additional_columns '["similarity","hash","punsafe","pwatermark","aesthetic","LANGUAGE"]' --enable_wandb True \ + --user_agent_token img2dataset --disallowed_header_directives '["noai", "noindex"]' +``` + +### Benchmark diff --git a/img2dataset/downloader.py b/img2dataset/downloader.py index 7595aa9..49fe8df 100644 --- a/img2dataset/downloader.py +++ b/img2dataset/downloader.py @@ -17,17 +17,39 @@ from .logger import write_stats -def download_image(row, timeout): +def is_disallowed(headers, user_agent_token, disallowed_header_directives): + """Check if HTTP headers contain an X-Robots-Tag directive disallowing usage""" + for values in headers.get_all("X-Robots-Tag", []): + try: + uatoken_directives = values.split(":", 1) + directives = [x.strip().lower() for x in uatoken_directives[-1].split(",")] + ua_token = uatoken_directives[0].lower() if len(uatoken_directives) == 2 else None + if (ua_token is None or ua_token == user_agent_token) and any( + x in disallowed_header_directives for x in directives + ): + return True + except Exception as err: # pylint: disable=broad-except + traceback.print_exc() + print(f"Failed to parse X-Robots-Tag: {values}: {err}") + return False + + +def 
download_image(row, timeout, user_agent_token, disallowed_header_directives): """Download an image with urllib""" key, url = row img_stream = None + user_agent_string = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:72.0) Gecko/20100101 Firefox/72.0" + if user_agent_token: + user_agent_string += f" (compatible; {user_agent_token}; +https://github.com/rom1504/img2dataset)" try: - request = urllib.request.Request( - url, - data=None, - headers={"User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:72.0) Gecko/20100101 Firefox/72.0"}, - ) + request = urllib.request.Request(url, data=None, headers={"User-Agent": user_agent_string}) with urllib.request.urlopen(request, timeout=timeout) as r: + if disallowed_header_directives and is_disallowed( + r.headers, + user_agent_token, + disallowed_header_directives, + ): + return key, None, "Use of image disallowed by X-Robots-Tag directive" img_stream = io.BytesIO(r.read()) return key, img_stream, None except Exception as err: # pylint: disable=broad-except @@ -36,9 +58,9 @@ def download_image(row, timeout): return key, None, str(err) -def download_image_with_retry(row, timeout, retries): +def download_image_with_retry(row, timeout, retries, user_agent_token, disallowed_header_directives): for _ in range(retries + 1): - key, img_stream, err = download_image(row, timeout) + key, img_stream, err = download_image(row, timeout, user_agent_token, disallowed_header_directives) if img_stream is not None: return key, img_stream, err return key, None, err @@ -71,6 +93,8 @@ def __init__( compute_md5, encode_format, retries, + user_agent_token, + disallowed_header_directives, ) -> None: self.sample_writer_class = sample_writer_class self.resizer = resizer @@ -85,6 +109,12 @@ def __init__( self.compute_md5 = compute_md5 self.encode_format = encode_format self.retries = retries + self.user_agent_token = None if user_agent_token is None else user_agent_token.strip().lower() + self.disallowed_header_directives = ( + None + if 
disallowed_header_directives is None + else {directive.strip().lower() for directive in disallowed_header_directives} + ) def __call__( self, @@ -164,7 +194,13 @@ def data_generator(): oom_sample_per_shard = math.ceil(math.log10(self.number_sample_per_shard)) with ThreadPool(self.thread_count) as thread_pool: for key, img_stream, error_message in thread_pool.imap_unordered( - lambda x: download_image_with_retry(x, timeout=self.timeout, retries=self.retries), + lambda x: download_image_with_retry( + x, + timeout=self.timeout, + retries=self.retries, + user_agent_token=self.user_agent_token, + disallowed_header_directives=self.disallowed_header_directives, + ), loader, ): try: diff --git a/img2dataset/main.py b/img2dataset/main.py index 406e6f4..a40e40f 100644 --- a/img2dataset/main.py +++ b/img2dataset/main.py @@ -56,6 +56,8 @@ def download( max_aspect_ratio: float = float("inf"), incremental_mode: str = "incremental", max_shard_retry: int = 1, + user_agent_token: Optional[str] = None, + disallowed_header_directives: Optional[List[str]] = None, ): """Download is the main entry point of img2dataset, it uses multiple processes and download multiple files""" config_parameters = dict(locals()) @@ -166,6 +168,8 @@ def signal_handler(signal_arg, frame): # pylint: disable=unused-argument compute_md5=compute_md5, encode_format=encode_format, retries=retries, + user_agent_token=user_agent_token, + disallowed_header_directives=disallowed_header_directives, ) print("Starting the downloading of this file") diff --git a/tests/fixtures.py b/tests/fixtures.py index 9199904..ef9163c 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -6,7 +6,7 @@ import sys -def setup_fixtures(count=5): +def setup_fixtures(count=5, disallowed=0): test_list = [] current_folder = os.path.dirname(__file__) test_folder = current_folder + "/" + "resize_test_image" @@ -15,10 +15,23 @@ def setup_fixtures(count=5): for i in range(count): item = random.randint(0, len(image_paths) - 1) 
test_list.append( - (f"caption {i}" if i != 0 else "", image_paths[item].replace(test_folder, f"http://localhost:{port}")) + ( + f"caption {i}" if i != 0 else "", + image_paths[item].replace(test_folder, f"http://localhost:{port}/allowed"), + ) ) test_list = test_list[:count] + for i in range(disallowed): + item = random.randint(0, len(image_paths) - 1) + test_list.append( + ( + f"caption {i}" if i != 0 else "", + image_paths[item].replace(test_folder, f"http://localhost:{port}/disallowed"), + ) + ) + test_list = test_list[: count + disallowed] + return test_list diff --git a/tests/http_server.py b/tests/http_server.py index d725f79..b3eb6d0 100644 --- a/tests/http_server.py +++ b/tests/http_server.py @@ -1,9 +1,16 @@ import os -from fastapi import FastAPI +from fastapi import FastAPI, Response from fastapi.staticfiles import StaticFiles +class StaticFilesXRobotsTagHeader(StaticFiles): + async def get_response(self, *args, **kwargs) -> Response: + response = await super().get_response(*args, **kwargs) + response.headers["X-Robots-Tag"] = "noai, noimageai, noindex, noimageindex, nofollow" + return response + + app = FastAPI() current_folder = os.path.dirname(__file__) @@ -15,4 +22,5 @@ async def get(): return "hi" -app.mount("/", StaticFiles(directory=test_folder), name="static") +app.mount("/allowed", StaticFiles(directory=test_folder), name="static_allowed") +app.mount("/disallowed", StaticFilesXRobotsTagHeader(directory=test_folder), name="static_disallowed") diff --git a/tests/test_downloader.py b/tests/test_downloader.py index 180f2c8..9c4a1b4 100644 --- a/tests/test_downloader.py +++ b/tests/test_downloader.py @@ -36,6 +36,8 @@ def test_unique_md5(tmp_path): compute_md5=True, encode_format="jpg", retries=0, + user_agent_token="img2dataset", + disallowed_header_directives=["noai", "noindex"], ) tmp_file = os.path.join(test_folder, "test_list.feather") @@ -57,7 +59,12 @@ def test_unique_md5(tmp_path): def test_downloader(tmp_path): test_folder = str(tmp_path) - 
test_list = setup_fixtures(count=5) + n_allowed = 5 + n_disallowed = 5 + test_list = setup_fixtures(count=n_allowed, disallowed=n_disallowed) + + assert len(test_list) == n_allowed + n_disallowed + image_folder_name = os.path.join(test_folder, "images") os.mkdir(image_folder_name) @@ -79,6 +86,8 @@ def test_downloader(tmp_path): compute_md5=True, encode_format="jpg", retries=0, + user_agent_token="img2dataset", + disallowed_header_directives=["noai", "noindex"], ) tmp_file = os.path.join(test_folder, "test_list.feather") @@ -87,4 +96,4 @@ def test_downloader(tmp_path): downloader((0, tmp_file)) - assert len(os.listdir(image_folder_name + "/00000")) == 3 * len(test_list) + assert len(os.listdir(image_folder_name + "/00000")) == 3 * n_allowed