-
Notifications
You must be signed in to change notification settings - Fork 5.5k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[Data] Fix throughput time calculations #44138
Merged
Merged
Changes from all commits
Commits
Show all changes
13 commits
Select commit
Hold shift + click to select a range
7c95ee4
fix throughput time calculations
omatthew98 327169f
update the throughput calculation further, add comments to document
omatthew98 363d4ff
fix bugs
omatthew98 aa29bdd
refactor to use collect dataset stats summaries fn
omatthew98 e2a21c8
fix existing test
omatthew98 9530448
adding in test for total time all blocks
omatthew98 b3201fa
fixing logic for runtime metrics (do not use sum)
omatthew98 55c1e1c
fix bug, simplify method
omatthew98 9587fde
adding in sanity check test for throughput
omatthew98 39f1b85
adding in test for runtime
omatthew98 436cda3
fix bug for test_sort
omatthew98 38804a5
respond to pr feedback
omatthew98 e1e1678
fix lint for robocop
omatthew98 File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -212,11 +212,15 @@ def dummy_map_batches(x): | |||||
return x | ||||||
|
||||||
|
||||||
def map_batches_sleep(x, n): | ||||||
"""Dummy function used in calls to map_batches below, which | ||||||
simply sleeps for `n` seconds before returning the input batch.""" | ||||||
time.sleep(n) | ||||||
return x | ||||||
def dummy_map_batches_sleep(n): | ||||||
"""Function used to create a function that sleeps for n seconds | ||||||
to be used in map_batches below.""" | ||||||
|
||||||
def f(x): | ||||||
time.sleep(n) | ||||||
return x | ||||||
|
||||||
return f | ||||||
|
||||||
|
||||||
@contextmanager | ||||||
|
@@ -1022,7 +1026,7 @@ def test_summarize_blocks(ray_start_regular_shared, op_two_block): | |||||
def test_get_total_stats(ray_start_regular_shared, op_two_block): | ||||||
"""Tests a set of similar getter methods which pull aggregated | ||||||
statistics values after calculating operator-level stats: | ||||||
`DatasetStats.get_max_wall_time()`, | ||||||
`DatasetStats.get_total_wall_time()`, | ||||||
`DatasetStats.get_total_cpu_time()`, | ||||||
`DatasetStats.get_max_heap_memory()`.""" | ||||||
block_params, block_meta_list = op_two_block | ||||||
|
@@ -1033,8 +1037,18 @@ def test_get_total_stats(ray_start_regular_shared, op_two_block): | |||||
|
||||||
dataset_stats_summary = stats.to_summary() | ||||||
op_stats = dataset_stats_summary.operators_stats[0] | ||||||
wall_time_stats = op_stats.wall_time | ||||||
assert dataset_stats_summary.get_total_wall_time() == wall_time_stats.get("max") | ||||||
|
||||||
# simple case with only one block / summary, result should match difference between | ||||||
# the start and end time | ||||||
assert ( | ||||||
dataset_stats_summary.get_total_wall_time() | ||||||
== op_stats.latest_end_time - op_stats.earliest_start_time | ||||||
) | ||||||
|
||||||
# total time across all blocks is sum of wall times of blocks | ||||||
assert dataset_stats_summary.get_total_time_all_blocks() == sum( | ||||||
block_params["wall_time"] | ||||||
) | ||||||
|
||||||
cpu_time_stats = op_stats.cpu_time | ||||||
assert dataset_stats_summary.get_total_cpu_time() == cpu_time_stats.get("sum") | ||||||
|
@@ -1185,10 +1199,84 @@ def f(x): | |||||
assert ds._plan.stats().extra_metrics["task_submission_backpressure_time"] > 0 | ||||||
|
||||||
|
||||||
def test_runtime_metrics(ray_start_regular_shared): | ||||||
from math import isclose | ||||||
|
||||||
def time_to_seconds(time_str): | ||||||
if time_str.endswith("us"): | ||||||
# Convert microseconds to seconds | ||||||
return float(time_str[:-2]) / (1000 * 1000) | ||||||
elif time_str.endswith("ms"): | ||||||
# Convert milliseconds to seconds | ||||||
return float(time_str[:-2]) / 1000 | ||||||
elif time_str.endswith("s"): | ||||||
# Already in seconds, just remove the 's' and convert to float | ||||||
return float(time_str[:-1]) | ||||||
|
||||||
f = dummy_map_batches_sleep(0.01) | ||||||
ds = ray.data.range(100).map(f).materialize().map(f).materialize() | ||||||
metrics_str = ds._plan.stats().runtime_metrics() | ||||||
|
||||||
# Dictionary to store the metrics for testing | ||||||
metrics_dict = {} | ||||||
|
||||||
# Regular expression to match the pattern of each metric line | ||||||
pattern = re.compile(r"\* (.+?): ([\d\.]+(?:ms|s)) \(([\d\.]+)%\)") | ||||||
|
||||||
# Split the input string into lines and iterate over them | ||||||
for line in metrics_str.split("\n"): | ||||||
match = pattern.match(line) | ||||||
if match: | ||||||
# Extracting the operator name, time, and percentage | ||||||
operator_name, time_str, percent_str = match.groups() | ||||||
# Converting percentage to float and keeping time as string | ||||||
metrics_dict[operator_name] = ( | ||||||
time_to_seconds(time_str), | ||||||
float(percent_str), | ||||||
) | ||||||
|
||||||
total_time, total_percent = metrics_dict.pop("Total") | ||||||
assert total_percent == 100 | ||||||
|
||||||
for time_s, percent in metrics_dict.values(): | ||||||
assert time_s < total_time | ||||||
# Check percentage, this is done with some expected loss of precision | ||||||
# due to rounding in the intital output. | ||||||
assert isclose(percent, time_s / total_time * 100, rel_tol=0.01) | ||||||
|
||||||
|
||||||
# NOTE: All tests above share a Ray cluster, while the tests below do not. These | ||||||
# tests should only be carefully reordered to retain this invariant! | ||||||
|
||||||
|
||||||
def test_dataset_throughput(): | ||||||
ray.shutdown() | ||||||
ray.init(num_cpus=2) | ||||||
|
||||||
f = dummy_map_batches_sleep(0.01) | ||||||
ds = ray.data.range(100).map(f).materialize().map(f).materialize() | ||||||
|
||||||
# Pattern to match operator throughput | ||||||
operator_pattern = re.compile( | ||||||
r"Operator (\d+).*?Ray Data throughput: (\d+\.\d+) rows/s.*?Estimated single node throughput: (\d+\.\d+) rows/s", # noqa: E501 | ||||||
re.DOTALL, | ||||||
) | ||||||
|
||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. just adding a clarifying comment
Suggested change
|
||||||
# Ray data throughput should always be better than single node throughput for | ||||||
# multi-cpu case. | ||||||
for match in operator_pattern.findall(ds.stats()): | ||||||
assert float(match[1]) >= float(match[2]) | ||||||
|
||||||
# Pattern to match dataset throughput | ||||||
dataset_pattern = re.compile( | ||||||
r"Dataset throughput:.*?Ray Data throughput: (\d+\.\d+) rows/s.*?Estimated single node throughput: (\d+\.\d+) rows/s", # noqa: E501 | ||||||
re.DOTALL, | ||||||
) | ||||||
|
||||||
dataset_match = dataset_pattern.search(ds.stats()) | ||||||
assert float(dataset_match[1]) >= float(dataset_match[2]) | ||||||
|
||||||
|
||||||
def test_stats_actor_cap_num_stats(ray_start_cluster): | ||||||
actor = _StatsActor.remote(3) | ||||||
metadatas = [] | ||||||
|
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
i think this comment is kind of copied from above? or should they be in both places?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
One is for the dataset throughput and the other is for operator throughput, they are similar but have some differences. Either way, I wanted to have some information about the throughput in the two places that we are calculating it, but if that feels redundant / there is a common place to put the shared info that makes sense too.