Change Dataset's repr to use angled brackets #31947

Closed
wants to merge 7 commits into from
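
All of the changes below are mechanical: every documented repr of the form Dataset(num_blocks=..., num_rows=..., schema=...) becomes <Dataset num_blocks=..., num_rows=..., schema=...>. The repr implementation itself is not part of this docs-only diff; as a rough illustration, a __repr__ producing the new format could look like the minimal sketch below (a simplified stand-in class, not Ray's actual Dataset):

# Hypothetical sketch, not the PR's implementation: a minimal __repr__
# that emits the new angle-bracket format.
class Dataset:
    def __init__(self, num_blocks, num_rows, schema):
        self._num_blocks = num_blocks
        self._num_rows = num_rows
        self._schema = schema

    def __repr__(self) -> str:
        # Old format: Dataset(num_blocks=10, num_rows=3, schema={})
        # New format: <Dataset num_blocks=10, num_rows=3, schema={}>
        return (
            f"<Dataset num_blocks={self._num_blocks}, "
            f"num_rows={self._num_rows}, schema={self._schema}>"
        )

print(Dataset(10, 3, {}))
# -> <Dataset num_blocks=10, num_rows=3, schema={}>
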
6 changes: 3 additions & 3 deletions doc/source/data/dataset-ml-preprocessing.rst
@@ -93,7 +93,7 @@ Other preprocessing operations require global operations, such as groupbys and g
# -> Sort Sample: 100%|███████████████████████████████████████| 10/10 [00:01<00:00, 9.04it/s]
# -> GroupBy Map: 100%|███████████████████████████████████████| 10/10 [00:00<00:00, 23.66it/s]
# -> GroupBy Reduce: 100%|████████████████████████████████████| 10/10 [00:00<00:00, 937.21it/s]
-# -> Dataset(num_blocks=10, num_rows=3, schema={})
+# -> <Dataset num_blocks=10, num_rows=3, schema={}>
agg_ds.to_pandas()
# ->
#        A  mean(B)  mean(C)
@@ -136,7 +136,7 @@ These aggregations can be combined with batch mapping to transform a dataset usi

ds = ds.map_batches(impute_b, batch_format="pandas")
# -> Map Progress: 100%|██████████████████████████████████████| 10/10 [00:00<00:00, 132.66it/s]
-# -> Dataset(num_blocks=10, num_rows=10, schema={A: int64, B: int64, C: int64})
+# -> <Dataset num_blocks=10, num_rows=10, schema={A: int64, B: int64, C: int64}>

# Standard scaling of all feature columns.
stats = ds.aggregate(Mean("B"), Std("B"), Mean("C"), Std("C"))
@@ -156,7 +156,7 @@ These aggregations can be combined with batch mapping to transform a dataset usi

ds = ds.map_batches(batch_standard_scaler, batch_format="pandas")
# -> Map Progress: 100%|██████████████████████████████████████| 10/10 [00:00<00:00, 144.79it/s]
-# -> Dataset(num_blocks=10, num_rows=10, schema={A: int64, B: double, C: double})
+# -> <Dataset num_blocks=10, num_rows=10, schema={A: int64, B: double, C: double}>

Random shuffle
==============
6 changes: 3 additions & 3 deletions doc/source/data/doc_code/consuming_datasets.py
@@ -102,11 +102,11 @@ def train(self, shard: ray.data.Dataset[int]) -> int:
# -> [Actor(Worker, ...), Actor(Worker, ...), ...]

ds = ray.data.range(10000)
-# -> Dataset(num_blocks=200, num_rows=10000, schema=<class 'int'>)
+# -> <Dataset num_blocks=200, num_rows=10000, schema=<class 'int'>>

shards = ds.split(n=4, locality_hints=workers)
-# -> [Dataset(num_blocks=13, num_rows=2500, schema=<class 'int'>),
-#     Dataset(num_blocks=13, num_rows=2500, schema=<class 'int'>), ...]
+# -> [<Dataset num_blocks=13, num_rows=2500, schema=<class 'int'>>,
+#     <Dataset num_blocks=13, num_rows=2500, schema=<class 'int'>>, ...]

ray.get([w.train.remote(s) for w, s in zip(workers, shards)])
# -> [2500, 2500, 2500, 2500]
32 changes: 16 additions & 16 deletions doc/source/data/doc_code/creating_datasets.py
@@ -10,7 +10,7 @@
# __gen_synth_int_range_begin__
# Create a Dataset of Python objects.
ds = ray.data.range(10000)
-# -> Dataset(num_blocks=200, num_rows=10000, schema=<class 'int'>)
+# -> <Dataset num_blocks=200, num_rows=10000, schema=<class 'int'>>

ds.take(5)
# -> [0, 1, 2, 3, 4]
@@ -21,7 +21,7 @@
# __gen_synth_tabular_range_begin__
# Create a Dataset of Arrow records.
ds = ray.data.range_table(10000)
-# -> Dataset(num_blocks=200, num_rows=10000, schema={value: int64})
+# -> <Dataset num_blocks=200, num_rows=10000, schema={value: int64}>

ds.take(5)
# -> [{'value': 0}, {'value': 1}, {'value': 2}, {'value': 3}, {'value': 4}]
@@ -60,7 +60,7 @@
# __from_items_begin__
# Create a Dataset of tabular (Arrow) records.
ds = ray.data.from_items([{"col1": i, "col2": str(i)} for i in range(10000)])
-# -> Dataset(num_blocks=200, num_rows=10000, schema={col1: int64, col2: string})
+# -> <Dataset num_blocks=200, num_rows=10000, schema={col1: int64, col2: string}>

ds.show(3)
# -> {'col1': 0, 'col2': '0'}
@@ -76,7 +76,7 @@
# Create a tabular Dataset from a Pandas DataFrame.
df = pd.DataFrame({"col1": list(range(10000)), "col2": list(map(str, range(10000)))})
ds = ray.data.from_pandas(df)
-# -> Dataset(num_blocks=1, num_rows=10000, schema={col1: int64, col2: object})
+# -> <Dataset num_blocks=1, num_rows=10000, schema={col1: int64, col2: object}>

ds.show(3)
# -> {'col1': 0, 'col2': '0'}
@@ -99,7 +99,7 @@
]
# Create a tabular Dataset from multiple Pandas DataFrames.
ds = ray.data.from_pandas(dfs)
-# -> Dataset(num_blocks=10, num_rows=10000, schema={col1: int64, col2: object})
+# -> <Dataset num_blocks=10, num_rows=10000, schema={col1: int64, col2: object}>

ds.show(3)
# -> {'col1': 0, 'col2': '0'}
@@ -152,8 +152,8 @@
# fmt: off
# __read_images_begin__
ds = ray.data.read_images("example://image-datasets/simple")
-# -> Dataset(num_blocks=3, num_rows=3,
-#            schema={image: ArrowTensorType(shape=(32, 32, 3), dtype=uint8)})
+# -> <Dataset num_blocks=3, num_rows=3,
+#             schema={image: ArrowTensorType(shape=(32, 32, 3), dtype=uint8)}>

ds.take(1)
# -> [array([[[ 88, 70, 68],
@@ -199,7 +199,7 @@
# Create a tabular Dataset from an Arrow Table.
t = pa.table({"col1": list(range(10000)), "col2": list(map(str, range(10000)))})
ds = ray.data.from_arrow(t)
-# -> Dataset(num_blocks=1, num_rows=10000, schema={col1: int64, col2: string})
+# -> <Dataset num_blocks=1, num_rows=10000, schema={col1: int64, col2: string}>

ds.show(3)
# -> {'col1': 0, 'col2': '0'}
@@ -222,7 +222,7 @@
]
# Create a tabular Dataset from multiple Arrow Tables.
ds = ray.data.from_arrow(ts)
-# -> Dataset(num_blocks=10, num_rows=10000, schema={col1: int64, col2: string})
+# -> <Dataset num_blocks=10, num_rows=10000, schema={col1: int64, col2: string}>

ds.show(3)
# -> {'col1': 0, 'col2': '0'}
@@ -240,7 +240,7 @@
ddf = dd.from_pandas(df, npartitions=4)
# Create a tabular Dataset from a Dask DataFrame.
ds = ray.data.from_dask(ddf)
-# -> Dataset(num_blocks=10, num_rows=10000, schema={col1: int64, col2: object})
+# -> <Dataset num_blocks=10, num_rows=10000, schema={col1: int64, col2: object}>

ds.show(3)
# -> {'col1': 0, 'col2': '0'}
@@ -260,7 +260,7 @@
df = spark.createDataFrame([(i, str(i)) for i in range(10000)], ["col1", "col2"])
# Create a tabular Dataset from a Spark DataFrame.
ds = ray.data.from_spark(df)
-# -> Dataset(num_blocks=10, num_rows=10000, schema={col1: int64, col2: string})
+# -> <Dataset num_blocks=10, num_rows=10000, schema={col1: int64, col2: string}>

ds.show(3)
# -> {'col1': 0, 'col2': '0'}
@@ -277,7 +277,7 @@
mdf = md.DataFrame(df)
# Create a tabular Dataset from a Modin DataFrame.
ds = ray.data.from_modin(mdf)
-# -> Dataset(num_blocks=8, num_rows=10000, schema={col1: int64, col2: object})
+# -> <Dataset num_blocks=8, num_rows=10000, schema={col1: int64, col2: object}>

ds.show(3)
# -> {'col1': 0, 'col2': '0'}
@@ -297,7 +297,7 @@
mdf = md.DataFrame(df, num_partitions=8)
# Create a tabular Dataset from a Mars DataFrame.
ds = ray.data.from_mars(mdf)
-# -> Dataset(num_blocks=8, num_rows=10000, schema={col1: int64, col2: object})
+# -> <Dataset num_blocks=8, num_rows=10000, schema={col1: int64, col2: object}>

ds.show(3)
# -> {'col1': 0, 'col2': '0'}
@@ -351,7 +351,7 @@
columns=["sepal.length", "variety"],
filter=pa.dataset.field("sepal.length") > 5.0,
).fully_executed() # Force a full read of the file.
-# -> Dataset(num_blocks=1, num_rows=118, schema={sepal.length: double, variety: string})
+# -> <Dataset num_blocks=1, num_rows=118, schema={sepal.length: double, variety: string}>

ds.show(2)
# -> {'sepal.length': 5.1, 'variety': 'Setosa'}
@@ -446,7 +446,7 @@
# __read_text_begin__
# Create a tabular Dataset by reading a text file.
ds = ray.data.read_text("example://sms_spam_collection_subset.txt")
-# -> Dataset(num_blocks=1, num_rows=10, schema=<class 'str'>)
+# -> <Dataset num_blocks=1, num_rows=10, schema=<class 'str'>>

ds.show(3)
# -> ham Go until jurong point, crazy.. Available only in bugis n great world la e
@@ -465,7 +465,7 @@

# Create a tabular Dataset by reading a binary file.
ds = ray.data.read_binary_files("example://mnist_subset_partitioned/0/1.png")
-# -> Dataset(num_blocks=1, num_rows=1, schema=<class 'bytes'>)
+# -> <Dataset num_blocks=1, num_rows=1, schema=<class 'bytes'>>

ds = ds.map(lambda bytes_: np.asarray(PIL.Image.open(BytesIO(bytes_)).convert("L")))
# -> Dataset(
56 changes: 28 additions & 28 deletions doc/source/data/doc_code/quick_start.py
@@ -6,7 +6,7 @@

# Create a Dataset of Python objects.
ds = ray.data.range(10000)
-# -> Dataset(num_blocks=200, num_rows=10000, schema=<class 'int'>)
+# -> <Dataset num_blocks=200, num_rows=10000, schema=<class 'int'>>

ds.take(5)
# -> [0, 1, 2, 3, 4]
@@ -23,9 +23,9 @@
{"sepal.length": 4.7, "sepal.width": 3.2,
"petal.length": 1.3, "petal.width": 0.2, "variety": "Setosa"},
])
-# Dataset(num_blocks=3, num_rows=3,
-#         schema={sepal.length: float64, sepal.width: float64,
-#                 petal.length: float64, petal.width: float64, variety: object})
+# <Dataset num_blocks=3, num_rows=3,
+#          schema={sepal.length: float64, sepal.width: float64,
+#                  petal.length: float64, petal.width: float64, variety: object}>

ds.show()
# -> {'sepal.length': 5.1, 'sepal.width': 3.5,
@@ -50,15 +50,15 @@
# Tip: "example://" is a convenient protocol to access the
# python/ray/data/examples/data directory.
ds = ray.data.read_csv("example://iris.csv")
-# Dataset(num_blocks=1, num_rows=150,
-#         schema={sepal.length: float64, sepal.width: float64,
-#                 petal.length: float64, petal.width: float64, variety: object})
+# <Dataset num_blocks=1, num_rows=150,
+#          schema={sepal.length: float64, sepal.width: float64,
+#                  petal.length: float64, petal.width: float64, variety: object}>

# Create from Parquet.
ds = ray.data.read_parquet("example://iris.parquet")
-# Dataset(num_blocks=1, num_rows=150,
-#         schema={sepal.length: float64, sepal.width: float64,
-#                 petal.length: float64, petal.width: float64, variety: object})
+# <Dataset num_blocks=1, num_rows=150,
+#          schema={sepal.length: float64, sepal.width: float64,
+#                  petal.length: float64, petal.width: float64, variety: object}>

# __create_from_files_end__
# fmt: on
@@ -82,18 +82,18 @@

# Create 10 blocks for parallelism.
ds = ds.repartition(10)
-# Dataset(num_blocks=10, num_rows=150,
-#         schema={sepal.length: float64, sepal.width: float64,
-#                 petal.length: float64, petal.width: float64, variety: object})
+# <Dataset num_blocks=10, num_rows=150,
+#          schema={sepal.length: float64, sepal.width: float64,
+#                  petal.length: float64, petal.width: float64, variety: object}>

# Find rows with sepal.length < 5.5 and petal.length > 3.5.
def transform_batch(df: pandas.DataFrame) -> pandas.DataFrame:
    return df[(df["sepal.length"] < 5.5) & (df["petal.length"] > 3.5)]

transformed_ds = ds.map_batches(transform_batch)
-# Dataset(num_blocks=10, num_rows=3,
-#         schema={sepal.length: float64, sepal.width: float64,
-#                 petal.length: float64, petal.width: float64, variety: object})
+# <Dataset num_blocks=10, num_rows=3,
+#          schema={sepal.length: float64, sepal.width: float64,
+#                  petal.length: float64, petal.width: float64, variety: object}>

transformed_ds.show()
# -> {'sepal.length': 5.2, 'sepal.width': 2.7,
@@ -136,18 +136,18 @@ def train(self, shard) -> int:

shards = ds.split(n=4, locality_hints=workers)
# -> [
-#     Dataset(num_blocks=3, num_rows=45,
-#             schema={sepal.length: double, sepal.width: double,
-#                     petal.length: double, petal.width: double, variety: string}),
-#     Dataset(num_blocks=3, num_rows=45,
-#             schema={sepal.length: double, sepal.width: double,
-#                     petal.length: double, petal.width: double, variety: string}),
-#     Dataset(num_blocks=2, num_rows=30,
-#             schema={sepal.length: double, sepal.width: double,
-#                     petal.length: double, petal.width: double, variety: string}),
-#     Dataset(num_blocks=2, num_rows=30,
-#             schema={sepal.length: double, sepal.width: double,
-#                     petal.length: double, petal.width: double, variety: string}),
+#     <Dataset num_blocks=3, num_rows=45,
+#              schema={sepal.length: double, sepal.width: double,
+#                      petal.length: double, petal.width: double, variety: string}>,
+#     <Dataset num_blocks=3, num_rows=45,
+#              schema={sepal.length: double, sepal.width: double,
+#                      petal.length: double, petal.width: double, variety: string}>,
+#     <Dataset num_blocks=2, num_rows=30,
+#              schema={sepal.length: double, sepal.width: double,
+#                      petal.length: double, petal.width: double, variety: string}>,
+#     <Dataset num_blocks=2, num_rows=30,
+#              schema={sepal.length: double, sepal.width: double,
+#                      petal.length: double, petal.width: double, variety: string}>,
# ]

ray.get([w.train.remote(s) for w, s in zip(workers, shards)])
6 changes: 3 additions & 3 deletions doc/source/data/doc_code/saving_datasets.py
@@ -5,7 +5,7 @@
import ray

ds = ray.data.range(1000)
-# -> Dataset(num_blocks=200, num_rows=1000, schema=<class 'int'>)
+# -> <Dataset num_blocks=200, num_rows=1000, schema=<class 'int'>>
ds.take(5)
# -> [0, 1, 2, 3, 4]

@@ -26,7 +26,7 @@
import ray

ds = ray.data.range(1000)
-# -> Dataset(num_blocks=200, num_rows=1000, schema=<class 'int'>)
+# -> <Dataset num_blocks=200, num_rows=1000, schema=<class 'int'>>
ds.take(5)
# -> [0, 1, 2, 3, 4]

@@ -47,7 +47,7 @@
import ray

ds = ray.data.range(1000)
-# -> Dataset(num_blocks=200, num_rows=1000, schema=<class 'int'>)
+# -> <Dataset num_blocks=200, num_rows=1000, schema=<class 'int'>>
ds.take(5)
# -> [0, 1, 2, 3, 4]

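For reference, the new format can be checked interactively against a Ray build that includes this change (num_blocks=200 is the default parallelism shown for ray.data.range in the examples above):

import ray

ds = ray.data.range(10000)
print(repr(ds))
# -> <Dataset num_blocks=200, num_rows=10000, schema=<class 'int'>>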