Change Dataset's repr to use angled brackets #31947

Closed
wants to merge 7 commits into from
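
All of the changes below are mechanical: every documented repr of the form Dataset(num_blocks=..., num_rows=..., schema=...) becomes <Dataset num_blocks=..., num_rows=..., schema=...>. The repr implementation itself is not part of this docs-only diff; as a rough illustration, a __repr__ producing the new format could look like the minimal sketch below (a simplified stand-in class, not Ray's actual Dataset):

# Hypothetical sketch, not the PR's implementation: a minimal __repr__
# that emits the new angle-bracket format.
class Dataset:
    def __init__(self, num_blocks, num_rows, schema):
        self._num_blocks = num_blocks
        self._num_rows = num_rows
        self._schema = schema

    def __repr__(self) -> str:
        # Old format: Dataset(num_blocks=10, num_rows=3, schema={})
        # New format: <Dataset num_blocks=10, num_rows=3, schema={}>
        return (
            f"<Dataset num_blocks={self._num_blocks}, "
            f"num_rows={self._num_rows}, schema={self._schema}>"
        )

print(Dataset(10, 3, {}))
# -> <Dataset num_blocks=10, num_rows=3, schema={}>
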
6 changes: 3 additions & 3 deletions doc/source/data/dataset-ml-preprocessing.rst
@@ -93,7 +93,7 @@ Other preprocessing operations require global operations, such as groupbys and g
# -> Sort Sample: 100%|███████████████████████████████████████| 10/10 [00:01<00:00, 9.04it/s]
# -> GroupBy Map: 100%|███████████████████████████████████████| 10/10 [00:00<00:00, 23.66it/s]
# -> GroupBy Reduce: 100%|████████████████████████████████████| 10/10 [00:00<00:00, 937.21it/s]
-# -> Dataset(num_blocks=10, num_rows=3, schema={})
+# -> <Dataset num_blocks=10, num_rows=3, schema={}>
agg_ds.to_pandas()
# ->
#        A  mean(B)  mean(C)
@@ -136,7 +136,7 @@ These aggregations can be combined with batch mapping to transform a dataset usi

ds = ds.map_batches(impute_b, batch_format="pandas")
# -> Map Progress: 100%|██████████████████████████████████████| 10/10 [00:00<00:00, 132.66it/s]
-# -> Dataset(num_blocks=10, num_rows=10, schema={A: int64, B: int64, C: int64})
+# -> <Dataset num_blocks=10, num_rows=10, schema={A: int64, B: int64, C: int64}>

# Standard scaling of all feature columns.
stats = ds.aggregate(Mean("B"), Std("B"), Mean("C"), Std("C"))
@@ -156,7 +156,7 @@ These aggregations can be combined with batch mapping to transform a dataset usi

ds = ds.map_batches(batch_standard_scaler, batch_format="pandas")
# -> Map Progress: 100%|██████████████████████████████████████| 10/10 [00:00<00:00, 144.79it/s]
-# -> Dataset(num_blocks=10, num_rows=10, schema={A: int64, B: double, C: double})
+# -> <Dataset num_blocks=10, num_rows=10, schema={A: int64, B: double, C: double}>

Random shuffle
==============
6 changes: 3 additions & 3 deletions doc/source/data/doc_code/consuming_datasets.py
@@ -102,11 +102,11 @@ def train(self, shard: ray.data.Dataset[int]) -> int:
# -> [Actor(Worker, ...), Actor(Worker, ...), ...]

ds = ray.data.range(10000)
-# -> Dataset(num_blocks=200, num_rows=10000, schema=<class 'int'>)
+# -> <Dataset num_blocks=200, num_rows=10000, schema=<class 'int'>>

shards = ds.split(n=4, locality_hints=workers)
-# -> [Dataset(num_blocks=13, num_rows=2500, schema=<class 'int'>),
-#     Dataset(num_blocks=13, num_rows=2500, schema=<class 'int'>), ...]
+# -> [<Dataset num_blocks=13, num_rows=2500, schema=<class 'int'>>,
+#     <Dataset num_blocks=13, num_rows=2500, schema=<class 'int'>>, ...]

ray.get([w.train.remote(s) for w, s in zip(workers, shards)])
# -> [2500, 2500, 2500, 2500]
32 changes: 16 additions & 16 deletions doc/source/data/doc_code/creating_datasets.py
@@ -10,7 +10,7 @@
# __gen_synth_int_range_begin__
# Create a Dataset of Python objects.
ds = ray.data.range(10000)
-# -> Dataset(num_blocks=200, num_rows=10000, schema=<class 'int'>)
+# -> <Dataset num_blocks=200, num_rows=10000, schema=<class 'int'>>

ds.take(5)
# -> [0, 1, 2, 3, 4]
@@ -21,7 +21,7 @@
# __gen_synth_tabular_range_begin__
# Create a Dataset of Arrow records.
ds = ray.data.range_table(10000)
-# -> Dataset(num_blocks=200, num_rows=10000, schema={value: int64})
+# -> <Dataset num_blocks=200, num_rows=10000, schema={value: int64}>

ds.take(5)
# -> [{'value': 0}, {'value': 1}, {'value': 2}, {'value': 3}, {'value': 4}]
@@ -60,7 +60,7 @@
# __from_items_begin__
# Create a Dataset of tabular (Arrow) records.
ds = ray.data.from_items([{"col1": i, "col2": str(i)} for i in range(10000)])
-# -> Dataset(num_blocks=200, num_rows=10000, schema={col1: int64, col2: string})
+# -> <Dataset num_blocks=200, num_rows=10000, schema={col1: int64, col2: string}>

ds.show(3)
# -> {'col1': 0, 'col2': '0'}
@@ -76,7 +76,7 @@
# Create a tabular Dataset from a Pandas DataFrame.
df = pd.DataFrame({"col1": list(range(10000)), "col2": list(map(str, range(10000)))})
ds = ray.data.from_pandas(df)
-# -> Dataset(num_blocks=1, num_rows=10000, schema={col1: int64, col2: object})
+# -> <Dataset num_blocks=1, num_rows=10000, schema={col1: int64, col2: object}>

ds.show(3)
# -> {'col1': 0, 'col2': '0'}
@@ -99,7 +99,7 @@
]
# Create a tabular Dataset from multiple Pandas DataFrames.
ds = ray.data.from_pandas(dfs)
-# -> Dataset(num_blocks=10, num_rows=10000, schema={col1: int64, col2: object})
+# -> <Dataset num_blocks=10, num_rows=10000, schema={col1: int64, col2: object}>

ds.show(3)
# -> {'col1': 0, 'col2': '0'}
@@ -152,8 +152,8 @@
# fmt: off
# __read_images_begin__
ds = ray.data.read_images("example://image-datasets/simple")
-# -> Dataset(num_blocks=3, num_rows=3,
-#            schema={image: ArrowTensorType(shape=(32, 32, 3), dtype=uint8)})
+# -> <Dataset num_blocks=3, num_rows=3,
+#             schema={image: ArrowTensorType(shape=(32, 32, 3), dtype=uint8)}>

ds.take(1)
# -> [array([[[ 88, 70, 68],
@@ -199,7 +199,7 @@
# Create a tabular Dataset from an Arrow Table.
t = pa.table({"col1": list(range(10000)), "col2": list(map(str, range(10000)))})
ds = ray.data.from_arrow(t)
-# -> Dataset(num_blocks=1, num_rows=10000, schema={col1: int64, col2: string})
+# -> <Dataset num_blocks=1, num_rows=10000, schema={col1: int64, col2: string}>

ds.show(3)
# -> {'col1': 0, 'col2': '0'}
@@ -222,7 +222,7 @@
]
# Create a tabular Dataset from multiple Arrow Tables.
ds = ray.data.from_arrow(ts)
-# -> Dataset(num_blocks=10, num_rows=10000, schema={col1: int64, col2: string})
+# -> <Dataset num_blocks=10, num_rows=10000, schema={col1: int64, col2: string}>

ds.show(3)
# -> {'col1': 0, 'col2': '0'}
@@ -240,7 +240,7 @@
ddf = dd.from_pandas(df, npartitions=4)
# Create a tabular Dataset from a Dask DataFrame.
ds = ray.data.from_dask(ddf)
-# -> Dataset(num_blocks=10, num_rows=10000, schema={col1: int64, col2: object})
+# -> <Dataset num_blocks=10, num_rows=10000, schema={col1: int64, col2: object}>

ds.show(3)
# -> {'col1': 0, 'col2': '0'}
@@ -260,7 +260,7 @@
df = spark.createDataFrame([(i, str(i)) for i in range(10000)], ["col1", "col2"])
# Create a tabular Dataset from a Spark DataFrame.
ds = ray.data.from_spark(df)
-# -> Dataset(num_blocks=10, num_rows=10000, schema={col1: int64, col2: string})
+# -> <Dataset num_blocks=10, num_rows=10000, schema={col1: int64, col2: string}>

ds.show(3)
# -> {'col1': 0, 'col2': '0'}
@@ -277,7 +277,7 @@
mdf = md.DataFrame(df)
# Create a tabular Dataset from a Modin DataFrame.
ds = ray.data.from_modin(mdf)
-# -> Dataset(num_blocks=8, num_rows=10000, schema={col1: int64, col2: object})
+# -> <Dataset num_blocks=8, num_rows=10000, schema={col1: int64, col2: object}>

ds.show(3)
# -> {'col1': 0, 'col2': '0'}
@@ -297,7 +297,7 @@
mdf = md.DataFrame(df, num_partitions=8)
# Create a tabular Dataset from a Mars DataFrame.
ds = ray.data.from_mars(mdf)
-# -> Dataset(num_blocks=8, num_rows=10000, schema={col1: int64, col2: object})
+# -> <Dataset num_blocks=8, num_rows=10000, schema={col1: int64, col2: object}>

ds.show(3)
# -> {'col1': 0, 'col2': '0'}
@@ -351,7 +351,7 @@
columns=["sepal.length", "variety"],
filter=pa.dataset.field("sepal.length") > 5.0,
).fully_executed() # Force a full read of the file.
-# -> Dataset(num_blocks=1, num_rows=118, schema={sepal.length: double, variety: string})
+# -> <Dataset num_blocks=1, num_rows=118, schema={sepal.length: double, variety: string}>

ds.show(2)
# -> {'sepal.length': 5.1, 'variety': 'Setosa'}
@@ -446,7 +446,7 @@
# __read_text_begin__
# Create a tabular Dataset by reading a text file.
ds = ray.data.read_text("example://sms_spam_collection_subset.txt")
-# -> Dataset(num_blocks=1, num_rows=10, schema=<class 'str'>)
+# -> <Dataset num_blocks=1, num_rows=10, schema=<class 'str'>>

ds.show(3)
# -> ham Go until jurong point, crazy.. Available only in bugis n great world la e
@@ -465,7 +465,7 @@

# Create a tabular Dataset by reading a binary file.
ds = ray.data.read_binary_files("example://mnist_subset_partitioned/0/1.png")
-# -> Dataset(num_blocks=1, num_rows=1, schema=<class 'bytes'>)
+# -> <Dataset num_blocks=1, num_rows=1, schema=<class 'bytes'>>

ds = ds.map(lambda bytes_: np.asarray(PIL.Image.open(BytesIO(bytes_)).convert("L")))
# -> Dataset(
56 changes: 28 additions & 28 deletions doc/source/data/doc_code/quick_start.py
@@ -6,7 +6,7 @@

# Create a Dataset of Python objects.
ds = ray.data.range(10000)
-# -> Dataset(num_blocks=200, num_rows=10000, schema=<class 'int'>)
+# -> <Dataset num_blocks=200, num_rows=10000, schema=<class 'int'>>

ds.take(5)
# -> [0, 1, 2, 3, 4]
@@ -23,9 +23,9 @@
{"sepal.length": 4.7, "sepal.width": 3.2,
"petal.length": 1.3, "petal.width": 0.2, "variety": "Setosa"},
])
-# Dataset(num_blocks=3, num_rows=3,
-#         schema={sepal.length: float64, sepal.width: float64,
-#                 petal.length: float64, petal.width: float64, variety: object})
+# <Dataset num_blocks=3, num_rows=3,
+#          schema={sepal.length: float64, sepal.width: float64,
+#                  petal.length: float64, petal.width: float64, variety: object}>

ds.show()
# -> {'sepal.length': 5.1, 'sepal.width': 3.5,
@@ -50,15 +50,15 @@
# Tip: "example://" is a convenient protocol to access the
# python/ray/data/examples/data directory.
ds = ray.data.read_csv("example://iris.csv")
-# Dataset(num_blocks=1, num_rows=150,
-#         schema={sepal.length: float64, sepal.width: float64,
-#                 petal.length: float64, petal.width: float64, variety: object})
+# <Dataset num_blocks=1, num_rows=150,
+#          schema={sepal.length: float64, sepal.width: float64,
+#                  petal.length: float64, petal.width: float64, variety: object}>

# Create from Parquet.
ds = ray.data.read_parquet("example://iris.parquet")
-# Dataset(num_blocks=1, num_rows=150,
-#         schema={sepal.length: float64, sepal.width: float64,
-#                 petal.length: float64, petal.width: float64, variety: object})
+# <Dataset num_blocks=1, num_rows=150,
+#          schema={sepal.length: float64, sepal.width: float64,
+#                  petal.length: float64, petal.width: float64, variety: object}>

# __create_from_files_end__
# fmt: on
@@ -82,18 +82,18 @@

# Create 10 blocks for parallelism.
ds = ds.repartition(10)
-# Dataset(num_blocks=10, num_rows=150,
-#         schema={sepal.length: float64, sepal.width: float64,
-#                 petal.length: float64, petal.width: float64, variety: object})
+# <Dataset num_blocks=10, num_rows=150,
+#          schema={sepal.length: float64, sepal.width: float64,
+#                  petal.length: float64, petal.width: float64, variety: object}>

# Find rows with sepal.length < 5.5 and petal.length > 3.5.
def transform_batch(df: pandas.DataFrame) -> pandas.DataFrame:
    return df[(df["sepal.length"] < 5.5) & (df["petal.length"] > 3.5)]

transformed_ds = ds.map_batches(transform_batch)
-# Dataset(num_blocks=10, num_rows=3,
-#         schema={sepal.length: float64, sepal.width: float64,
-#                 petal.length: float64, petal.width: float64, variety: object})
+# <Dataset num_blocks=10, num_rows=3,
+#          schema={sepal.length: float64, sepal.width: float64,
+#                  petal.length: float64, petal.width: float64, variety: object}>

transformed_ds.show()
# -> {'sepal.length': 5.2, 'sepal.width': 2.7,
@@ -136,18 +136,18 @@ def train(self, shard) -> int:

shards = ds.split(n=4, locality_hints=workers)
# -> [
-#     Dataset(num_blocks=3, num_rows=45,
-#             schema={sepal.length: double, sepal.width: double,
-#                     petal.length: double, petal.width: double, variety: string}),
-#     Dataset(num_blocks=3, num_rows=45,
-#             schema={sepal.length: double, sepal.width: double,
-#                     petal.length: double, petal.width: double, variety: string}),
-#     Dataset(num_blocks=2, num_rows=30,
-#             schema={sepal.length: double, sepal.width: double,
-#                     petal.length: double, petal.width: double, variety: string}),
-#     Dataset(num_blocks=2, num_rows=30,
-#             schema={sepal.length: double, sepal.width: double,
-#                     petal.length: double, petal.width: double, variety: string}),
+#     <Dataset num_blocks=3, num_rows=45,
+#              schema={sepal.length: double, sepal.width: double,
+#                      petal.length: double, petal.width: double, variety: string}>,
+#     <Dataset num_blocks=3, num_rows=45,
+#              schema={sepal.length: double, sepal.width: double,
+#                      petal.length: double, petal.width: double, variety: string}>,
+#     <Dataset num_blocks=2, num_rows=30,
+#              schema={sepal.length: double, sepal.width: double,
+#                      petal.length: double, petal.width: double, variety: string}>,
+#     <Dataset num_blocks=2, num_rows=30,
+#              schema={sepal.length: double, sepal.width: double,
+#                      petal.length: double, petal.width: double, variety: string}>,
# ]

ray.get([w.train.remote(s) for w, s in zip(workers, shards)])
6 changes: 3 additions & 3 deletions doc/source/data/doc_code/saving_datasets.py
@@ -5,7 +5,7 @@
import ray

ds = ray.data.range(1000)
-# -> Dataset(num_blocks=200, num_rows=1000, schema=<class 'int'>)
+# -> <Dataset num_blocks=200, num_rows=1000, schema=<class 'int'>>
ds.take(5)
# -> [0, 1, 2, 3, 4]

@@ -26,7 +26,7 @@
import ray

ds = ray.data.range(1000)
-# -> Dataset(num_blocks=200, num_rows=1000, schema=<class 'int'>)
+# -> <Dataset num_blocks=200, num_rows=1000, schema=<class 'int'>>
ds.take(5)
# -> [0, 1, 2, 3, 4]

@@ -47,7 +47,7 @@
import ray

ds = ray.data.range(1000)
-# -> Dataset(num_blocks=200, num_rows=1000, schema=<class 'int'>)
+# -> <Dataset num_blocks=200, num_rows=1000, schema=<class 'int'>>
ds.take(5)
# -> [0, 1, 2, 3, 4]

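For reference, the new format can be checked interactively against a Ray build that includes this change (num_blocks=200 is the default parallelism shown for ray.data.range in the examples above):

import ray

ds = ray.data.range(10000)
print(repr(ds))
# -> <Dataset num_blocks=200, num_rows=10000, schema=<class 'int'>>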