Adding ray as a distributor #272

Merged
merged 15 commits on Aug 20, 2023
2 changes: 2 additions & 0 deletions .github/workflows/ci.yml
@@ -63,5 +63,7 @@ jobs:
      - name: Unit tests
        run: |
          source .env/bin/activate
          ray start --head --disable-usage-stats
          ray start --address='127.0.0.1:6379'
          make test
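For local debugging, roughly the same single-node setup as this workflow can be reproduced by hand (a sketch; `ray stop` is the standard command to shut the local cluster down again):

```
source .env/bin/activate
ray start --head --disable-usage-stats    # bring up a local single-node ray cluster
ray start --address='127.0.0.1:6379'      # optionally attach a second local node to it
make test                                 # run the test suite, including the ray distributor test
ray stop                                  # tear the local cluster down afterwards
```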

67 changes: 67 additions & 0 deletions examples/ray_example/cluster_minimal.yaml
@@ -0,0 +1,67 @@
# A unique identifier for the head node and workers of this cluster.
cluster_name: minimal
min_workers: 0
max_workers: 10
upscaling_speed: 1.0
available_node_types:
  ray.head.default:
    resources: {}
    node_config:
      ImageId: ami-0ea1c7db66fee3098
      InstanceType: m5.24xlarge
      # if you have an IamInstanceProfile fill it out here...
      #IamInstanceProfile:
      #  Arn: <instance_profile_arn>
  ray.worker.default:
    min_workers: 0
    max_workers: 500
    node_config:
      ImageId: ami-0ea1c7db66fee3098
      InstanceType: m5.24xlarge
      InstanceMarketOptions:
        MarketType: spot
      # if you have an IamInstanceProfile fill it out here...
      #IamInstanceProfile:
      #  Arn: <instance_profile_arn>

# Cloud-provider specific configuration.
provider:
  type: aws
  region: us-east-1

# install knot-resolver so each node runs a local caching DNS resolver
# (the downloader generates a lot of DNS traffic, so keeping resolution on-node helps)
initialization_commands:
  - wget https://secure.nic.cz/files/knot-resolver/knot-resolver-release.deb
  - sudo dpkg -i knot-resolver-release.deb
  - sudo apt update
  - sudo apt install -y knot-resolver
  - sudo sh -c 'echo `hostname -I` `hostname` >> /etc/hosts'
  - sudo sh -c 'echo nameserver 127.0.0.1 > /etc/resolv.conf'
  - sudo systemctl stop systemd-resolved
  - sudo systemctl start kresd@1.service
  - sudo systemctl start kresd@2.service
  - sudo systemctl start kresd@3.service
  - sudo systemctl start kresd@4.service
  - sudo systemctl start kresd@5.service
  - sudo systemctl start kresd@6.service
  - sudo systemctl start kresd@7.service
  - sudo systemctl start kresd@8.service
  - sudo apt-get install ffmpeg libsm6 libxext6 -y

setup_commands:
  - wget https://repo.anaconda.com/miniconda/Miniconda3-py39_22.11.1-1-Linux-x86_64.sh -O miniconda.sh
  - bash ~/miniconda.sh -f -b -p miniconda3/
  - echo 'export PATH="$HOME/miniconda3/bin/:$PATH"' >> ~/.bashrc
  # if you have AWS CREDS fill them out here
  #- echo 'export AWS_ACCESS_KEY_ID=<AWS_KEY>' >> ~/.bashrc
  #- echo 'export AWS_SECRET_ACCESS_KEY=<AWS_SECRET_KEY>' >> ~/.bashrc
  - pip install --upgrade pip setuptools wheel
  - pip install ray
  - pip uninstall -y img2dataset
  - pip install git+https://github.com/vaishaal/img2dataset.git@7aadba42f8008106bd38475e06e78e79dfe4bbeb
  - pip install opencv-python --upgrade
  # log in with your own wandb API key (the example enables wandb reporting)
  - wandb login <WANDB_API_KEY>
  - pip install s3fs==2022.11.0
  - pip install botocore==1.27.59

head_setup_commands: []

49 changes: 49 additions & 0 deletions examples/ray_example/ray_example.py
@@ -0,0 +1,49 @@
import sys
import time
from collections import Counter

import ray
from img2dataset import download

import argparse


@ray.remote
def main(args):
    download(
        processes_count=1,
        thread_count=32,
        retries=0,
        timeout=10,
        url_list=args.url_list,
        image_size=512,
        resize_only_if_bigger=True,
        resize_mode="keep_ratio_largest",
        skip_reencode=True,
        output_folder=args.out_folder,
        output_format="webdataset",
        input_format="parquet",
        url_col="url",
        caption_col="alt",
        enable_wandb=True,
        subjob_size=48 * 120 * 2,
        number_sample_per_shard=10000,
        distributor="ray",
        oom_shard_count=8,
        compute_hash="sha256",
        save_additional_columns=["uid"],
    )


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--url_list")
    parser.add_argument("--out_folder")
    args = parser.parse_args()
    # connect to the running ray cluster started by `ray up`
    ray.init(address="localhost:6379")
    # main is decorated with @ray.remote, so it must be launched with .remote()
    ray.get(main.remote(args))

19 changes: 19 additions & 0 deletions examples/ray_example/readme.md
@@ -0,0 +1,19 @@
# Parallelizing Img2Dataset using Ray
If you do not want to set up a PySpark cluster, you can set up a Ray cluster instead. Functionally the two are
close to the same, but Ray handles a larger number of tasks better and does not have the "staged" nature of
Spark, which is great if you have a large queue of tasks and don't want to be vulnerable to the stragglers in each batch.
The tooling for setting up a Ray cluster on AWS is also slightly better at the time of writing this document (Jan 2023).
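Concretely, switching between the two is just a matter of the `distributor` argument passed to `img2dataset.download`; a minimal sketch (the bucket paths are placeholders):

```python
from img2dataset import download

download(
    url_list="s3://my-bucket/url-shards/",    # placeholder: parquet files with "url" and "alt" columns
    input_format="parquet",
    url_col="url",
    caption_col="alt",
    output_folder="s3://my-bucket/images/",   # placeholder output location
    output_format="webdataset",
    distributor="ray",                        # "multiprocessing" and "pyspark" are the other options
)
```

See `ray_example.py` in this folder for the full set of options used in the benchmark below.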

## Instructions for running a large img2dataset job on a ray cluster on AWS
First install ray:
``` pip install ray ```

If you are on AWS you can spin up a ray cluster this way:

``` ray up cluster_minimal.yaml ```

Then you can run your job:
```ray submit cluster_minimal.yaml ray_example.py -- --url_list <url_list> --out_folder <out_folder>```
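
While the job runs you can follow the autoscaler logs, and when you are finished tear the cluster down (these are standard Ray cluster-launcher commands, not specific to this example):

``` ray monitor cluster_minimal.yaml ```

``` ray down cluster_minimal.yaml ```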

Using the above code I was able to achieve a maximum download rate of 220,000 images/second on a cluster of 100 m5.24xlarge instances (9,600 cores).
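That works out to roughly 220,000 / 9,600 ≈ 23 images per second per core.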

27 changes: 25 additions & 2 deletions img2dataset/distributor.py
@@ -28,7 +28,7 @@ def multiprocessing_distributor(processes_count, downloader, reader, _, max_shard_retry):

    def run(gen):
        failed_shards = []
        for (status, row) in tqdm(process_pool.imap_unordered(downloader, gen)):
        for status, row in tqdm(process_pool.imap_unordered(downloader, gen)):
            if status is False:
                failed_shards.append(row)
        return failed_shards
@@ -56,7 +56,7 @@ def run(gen):
        failed_shards = []
        for batch in batcher(gen, subjob_size):
            rdd = spark.sparkContext.parallelize(batch, len(batch))
            for (status, row) in rdd.map(downloader).collect():
            for status, row in rdd.map(downloader).collect():
                if status is False:
                    failed_shards.append(row)
        return failed_shards
@@ -66,6 +66,29 @@ def run(gen):
    retrier(run, failed_shards, max_shard_retry)


try:
    import ray  # pylint: disable=import-outside-toplevel

    @ray.remote
    def ray_download(downloader, shards):
        status, row = downloader(shards)
        return status, row

    def ray_distributor(processes_count, downloader, reader, _, max_shard_retry):  # type: ignore
        # pylint: disable=unused-argument
        # submit one ray task per shard, then wait for all of them to finish
        ret = []
        for task in reader:
            ret.append(ray_download.remote(downloader, task))
        ray.get(ret)

except ModuleNotFoundError:

    def ray_distributor(processes_count, downloader, reader, subjob_size, max_shard_retry):  # type: ignore # pylint: disable=unused-argument
        # ray is not installed, so the "ray" distributor is not available
        return None
@contextmanager
def _spark_session(processes_count: int):
"""Create and close a spark session if none exist"""
8 changes: 7 additions & 1 deletion img2dataset/main.py
@@ -15,7 +15,11 @@
)
from .reader import Reader
from .downloader import Downloader
from .distributor import multiprocessing_distributor, pyspark_distributor
from .distributor import (
    multiprocessing_distributor,
    pyspark_distributor,
    ray_distributor,
)
import fsspec
import sys
import signal
@@ -244,6 +248,8 @@ def signal_handler(signal_arg, frame): # pylint: disable=unused-argument
        distributor_fn = multiprocessing_distributor
    elif distributor == "pyspark":
        distributor_fn = pyspark_distributor
    elif distributor == "ray":
        distributor_fn = ray_distributor
    else:
        raise ValueError(f"Distributor {distributor} not supported")

4 changes: 2 additions & 2 deletions img2dataset/writer.py
@@ -18,8 +18,8 @@ def __init__(self, output_file, schema, buffer_size=100):
        self.schema = schema
        self._initiatlize_buffer()
        fs, output_path = fsspec.core.url_to_fs(output_file)

        self.output_fd = fs.open(output_path, "wb")
        # larger block size for writing big shards to S3
        self.output_fd = fs.open(output_path, "wb", block_size=200000000)
        self.parquet_writer = pq.ParquetWriter(self.output_fd, schema)

    def _initiatlize_buffer(self):
1 change: 1 addition & 0 deletions requirements-test.txt
@@ -12,3 +12,4 @@ tensorflow
tensorflow_io
types-requests
types-pkg_resources
ray
1 change: 1 addition & 0 deletions tests/test_main.py
@@ -365,6 +365,7 @@ def test_relative_path(tmp_path):
    [
        "multiprocessing",
        "pyspark",
        "ray",
    ],
)
def test_distributors(distributor, tmp_path):