In [1]:
import ray
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
import time
import psutil

# Initialize Ray for this session.
# ignore_reinit_error=True is passed so that re-running this cell does not
# fail when Ray is already initialized (presumably the reason for the flag --
# confirm against the Ray docs); dashboard_port=8265 requests the dashboard
# on port 8265.
ray.init(ignore_reinit_error=True, dashboard_port=8265)

# Load California Housing Dataset and increase the size
def load_data(repeats=6):
    """Load the California Housing data, enlarge it by repetition, and split it.

    Args:
        repeats: How many copies of the dataset are stacked on top of each
            other before splitting. Defaults to 6, the factor this script
            has always used.

    Returns:
        The ``(X_train, X_test, y_train, y_test)`` sequence produced by
        ``train_test_split`` with a 20% test share and ``random_state=42``.

    Raises:
        ValueError: If ``repeats`` is smaller than 1.
    """
    if repeats < 1:
        raise ValueError(f"repeats must be >= 1, got {repeats}")

    data = fetch_california_housing()
    X, y = data.data, data.target

    # Enlarge the workload by stacking `repeats` copies of the data.
    # NOTE: the copies are made *before* the split, so identical rows end up
    # in both the train and the test partition. That is fine for a throughput
    # benchmark but makes the test partition useless for judging generalization.
    X = np.tile(X, (repeats, 1))  # Repeat the feature rows `repeats` times
    y = np.tile(y, repeats)       # Repeat the targets the same number of times

    return train_test_split(X, y, test_size=0.2, random_state=42)

# Linear regression training function
@ray.remote
def train_worker(X, y):
    """Fit a linear regression on one data shard and report quality and timing.

    Declared as a Ray remote function, so callers invoke it through
    ``train_worker.remote(...)`` and obtain the result with ``ray.get``.

    Args:
        X: Feature matrix of the shard, passed to ``LinearRegression.fit``.
        y: Target values aligned with ``X``.

    Returns:
        A ``(score, training_time)`` tuple. ``score`` is the R^2 of the fitted
        model evaluated on the very shard it was trained on (a training score,
        not a held-out score). ``training_time`` is the elapsed time in
        seconds covering model construction, fitting and scoring.
    """
    # perf_counter is a monotonic, high-resolution clock; time.time() can be
    # too coarse for fits that take only a few milliseconds.
    start_time = time.perf_counter()
    model = LinearRegression()
    model.fit(X, y)
    score = model.score(X, y)  # R^2 score
    training_time = time.perf_counter() - start_time
    return score, training_time

# Fault Tolerance Test with Average Calculation
def measure_fault_tolerance(num_workers, num_failures, X_train, y_train, num_trials=5):
    """Run repeated sharded-training trials while cancelling some tasks.

    In every trial the training data is split into ``num_workers`` shards,
    one ``train_worker`` task is launched per shard, and up to
    ``num_failures`` randomly chosen tasks are force-cancelled. The R^2
    scores of the remaining tasks are averaged per trial; a trial with no
    surviving result contributes a score of 0.

    Args:
        num_workers: Number of shards / remote tasks per trial.
        num_failures: Number of tasks to cancel per trial (never more than
            the number of launched tasks).
        X_train: Training features, split along the first axis.
        y_train: Training targets, split the same way.
        num_trials: How many independent trials to run.

    Returns:
        The mean of the per-trial average scores.
    """
    print(f"\nFault Tolerance Test: {num_workers} Workers with {num_failures} Failures (over {num_trials} trials)")

    per_trial_scores = []

    for trial in range(num_trials):
        x_chunks = np.array_split(X_train, num_workers)
        y_chunks = np.array_split(y_train, num_workers)

        # One remote training task per shard.
        pending = [
            train_worker.remote(x_chunk, y_chunk)
            for x_chunk, y_chunk in zip(x_chunks, y_chunks)
        ]

        # Cancel randomly picked tasks; stop early once none are left.
        for _ in range(num_failures):
            if not pending:
                break
            victim = pending.pop(np.random.randint(len(pending)))
            ray.cancel(victim, force=True)
            print(f"Simulated failure for task: {victim}")

        # Gather whatever the non-cancelled tasks produced.
        scores, durations = [], []
        for ref in pending:
            try:
                score, duration = ray.get(ref)
            except Exception as e:
                print(f"Task {ref} failed with error: {e}")
            else:
                scores.append(score)
                durations.append(duration)

        if not scores:
            print(f"Trial {trial + 1}: No surviving workers completed their tasks.")
            per_trial_scores.append(0)
            continue

        avg_score = np.mean(scores)
        avg_training_time = np.mean(durations)
        per_trial_scores.append(avg_score)
        print(f"Trial {trial + 1}: Average R^2 Score after failures: {avg_score:.4f}, Average Training Time: {avg_training_time:.2f} seconds")

    # Aggregate across trials.
    overall_avg_score = np.mean(per_trial_scores)
    print(f"Overall Average R^2 Score after failures: {overall_avg_score:.4f}\n")
    return overall_avg_score

# Resource monitoring function
def monitor_resource_usage():
    """Print a short CPU and memory report for the current machine.

    The total core count and the memory figures come from ``psutil``; the
    "available" core count is the ``"CPU"`` entry of
    ``ray.available_resources()`` (0 if that entry is absent), i.e. what Ray
    reports as unclaimed, not an OS-level measurement.

    Returns:
        None. The report is written to standard output.
    """
    bytes_per_gb = 1024 ** 3

    total_cores = psutil.cpu_count(logical=True)
    available_cores = ray.available_resources().get("CPU", 0)

    # Take one snapshot so that total and available describe the same instant.
    memory = psutil.virtual_memory()
    total_memory = memory.total / bytes_per_gb  # GB
    used_memory = (memory.total - memory.available) / bytes_per_gb  # GB

    print(f"Total CPU Cores: {total_cores}")
    print(f"Available CPU Cores: {available_cores}")
    print(f"Total Memory: {total_memory:.2f} GB")
    print(f"Used Memory: {used_memory:.2f} GB")

# Measure communication overhead
def measure_communication_overhead(num_workers, X_train, y_train):
    """Time one full round of sharded training across ``num_workers`` tasks.

    The timed region covers splitting the data, submitting one
    ``train_worker`` task per shard, and blocking until every result has
    been fetched. The reported figure is therefore end-to-end wall time
    (including the remote training itself), which is an upper bound on the
    pure communication cost rather than an isolated measurement of it.

    Args:
        num_workers: Number of shards / remote tasks to launch.
        X_train: Training features, split along the first axis.
        y_train: Training targets, split the same way.

    Returns:
        The elapsed time in seconds, which is also printed.
    """
    # perf_counter is monotonic and high-resolution, unlike time.time().
    start_time = time.perf_counter()

    split_X = np.array_split(X_train, num_workers)
    split_y = np.array_split(y_train, num_workers)

    # Launch workers and collect results
    workers = [
        train_worker.remote(x_part, y_part)
        for x_part, y_part in zip(split_X, split_y)
    ]
    ray.get(workers)

    communication_time = time.perf_counter() - start_time
    print(f"Communication Overhead for {num_workers} workers: {communication_time:.2f} seconds")
    return communication_time

# Main execution
if __name__ == "__main__":
    # Script entry point: run the fault-tolerance and timing benchmarks for
    # several worker counts, print a resource report, and always release Ray.
    try:
        X_train, X_test, y_train, y_test = load_data()

        # Test fault tolerance with different configurations (2, 4, 8, and 10 workers)
        for num_workers in [2, 4, 8, 10]:
            measure_fault_tolerance(num_workers=num_workers, num_failures=2, X_train=X_train, y_train=y_train, num_trials=3)

            # Measure communication overhead
            measure_communication_overhead(num_workers=num_workers, X_train=X_train, y_train=y_train)

        # Monitor resource usage
        print("\nResource Usage:")
        monitor_resource_usage()
    finally:
        # Shut Ray down even when one of the benchmarks raises, so a failed
        # run does not leave the session connected.
        ray.shutdown()


2024-12-07 01:11:00,713	INFO util.py:154 -- Outdated packages:
  ipywidgets==7.8.1 found, needs ipywidgets>=8
Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.

A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.1.3 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "C:\Users\PMLS\anaconda3\Lib\site-packages\ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "C:\Users\PMLS\anaconda3\Lib\site-packages\traitlets\config\application.py", line 1075, i

ImportError: 
A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.1.3 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.




A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.1.3 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "C:\Users\PMLS\anaconda3\Lib\site-packages\ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "C:\Users\PMLS\anaconda3\Lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance
    app.start()
  File "C:\Users\PMLS\anaconda3\Lib\site-packages\ipykernel\kernelapp.py", line 701, in start
    self.io_loop.start()
  File "C:\Users\PMLS\anaconda3\Lib\site-packages

ImportError: 
A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.1.3 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.



2024-12-07 01:11:02,956	INFO worker.py:1634 -- Connecting to existing Ray cluster at address: 127.0.0.1:6379...
2024-12-07 01:11:02,964	INFO worker.py:1819 -- Connected to Ray cluster.
2024-12-07 01:11:03,106	ERROR worker.py:422 -- Unhandled error (suppress with 'RAY_IGNORE_UNHANDLED_ERRORS=1'): The worker died unexpectedly while executing this task. Check python-core-worker-*.log files for more information.



Fault Tolerance Test: 2 Workers with 2 Failures (over 3 trials)
Simulated failure for task: ObjectRef(fa31b89f94899135ffffffffffffffffffffffff0300000001000000)
Simulated failure for task: ObjectRef(44bfa4a5859b21acffffffffffffffffffffffff0300000001000000)
Trial 1: No surviving workers completed their tasks.
Simulated failure for task: ObjectRef(0ab01f2d6283d719ffffffffffffffffffffffff0300000001000000)
Simulated failure for task: ObjectRef(62ffec03f52574bdffffffffffffffffffffffff0300000001000000)
Trial 2: No surviving workers completed their tasks.
[33m(raylet)[0m A worker died or was killed while executing a task by an unexpected system error. To troubleshoot the problem, check the logs for the dead worker. RayTask ID: fa890c0b84666e06a67b9af31b50d715cdd372af03000000 Worker ID: 62393ea2e4891110db9dcb0d249b06a7b1c56b496655e72dead53605 Node ID: ddf46cc29f954199e1fdd9d4ca119f09663a730b83b3f48dcd7b7dc7 Worker IP address: 127.0.0.1 Worker port: 10017 Worker PID: 93208 Worker exit type: S

[36m(pid=92908)[0m 
[36m(pid=92908)[0m A module that was compiled using NumPy 1.x cannot be run in
[36m(pid=92908)[0m NumPy 2.1.3 as it may crash. To support both 1.x and 2.x
[36m(pid=92908)[0m versions of NumPy, modules must be compiled with NumPy 2.0.
[36m(pid=92908)[0m Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.
[36m(pid=92908)[0m 
[36m(pid=92908)[0m If you are a user of the module, the easiest solution will be to
[36m(pid=92908)[0m downgrade to 'numpy<2' or try to upgrade the affected module.
[36m(pid=92908)[0m We expect that some modules will need time to support NumPy 2.
[36m(pid=92908)[0m 
[36m(pid=92908)[0m Traceback (most recent call last):  File "C:\Users\PMLS\anaconda3\Lib\site-packages\ray\_private\workers\default_worker.py", line 289, in <module>
[36m(pid=92908)[0m     worker.main_loop()
[36m(pid=92908)[0m   File "C:\Users\PMLS\anaconda3\Lib\site-packages\ray\_private\worker.py", line 920, in main_loop
[36m(pid=92908)[0m 

Communication Overhead for 2 workers: 3.11 seconds

Fault Tolerance Test: 4 Workers with 2 Failures (over 3 trials)
Simulated failure for task: ObjectRef(bba958ebc147821cffffffffffffffffffffffff0300000001000000)
Simulated failure for task: ObjectRef(d1f8fccc5d0997adffffffffffffffffffffffff0300000001000000)
Trial 1: Average R^2 Score after failures: 0.6088, Average Training Time: 0.02 seconds
Simulated failure for task: ObjectRef(b5ee7196461fabb2ffffffffffffffffffffffff0300000001000000)
Simulated failure for task: ObjectRef(92ba3d58fa71ff60ffffffffffffffffffffffff0300000001000000)


[36m(pid=78016)[0m 
[36m(pid=78016)[0m 
[36m(pid=78016)[0m 
[36m(pid=78016)[0m 
[36m(pid=78016)[0m 
[36m(pid=78016)[0m 
[36m(pid=78016)[0m 
[36m(pid=78016)[0m 
[36m(pid=78016)[0m 
[36m(pid=78016)[0m 
[36m(pid=78016)[0m 
[36m(pid=78016)[0m 


Trial 2: Average R^2 Score after failures: 0.6058, Average Training Time: 0.01 seconds
Simulated failure for task: ObjectRef(e014f98d69f9c6dcffffffffffffffffffffffff0300000001000000)
Simulated failure for task: ObjectRef(8d630eb8890317ceffffffffffffffffffffffff0300000001000000)


[36m(pid=93272)[0m 
[36m(pid=93272)[0m 
[36m(pid=93272)[0m 
[36m(pid=93272)[0m 
[36m(pid=93272)[0m 
[36m(pid=93272)[0m 
[36m(pid=93272)[0m 
[36m(pid=93272)[0m 
[36m(pid=93272)[0m 
[36m(pid=93272)[0m 
[36m(pid=93272)[0m 
[36m(pid=93272)[0m 


Trial 3: Average R^2 Score after failures: 0.6079, Average Training Time: 0.01 seconds
Overall Average R^2 Score after failures: 0.6075



[36m(pid=92948)[0m 
[36m(pid=92948)[0m 
[36m(pid=92948)[0m 
[36m(pid=92948)[0m 
[36m(pid=92948)[0m 
[36m(pid=92948)[0m 
[36m(pid=92948)[0m 
[36m(pid=92948)[0m 
[36m(pid=92948)[0m 
[36m(pid=92948)[0m 
[36m(pid=92948)[0m 
[36m(pid=92948)[0m 


Communication Overhead for 4 workers: 0.95 seconds

Fault Tolerance Test: 8 Workers with 2 Failures (over 3 trials)
Simulated failure for task: ObjectRef(18749ad48ef6860cffffffffffffffffffffffff0300000001000000)
Simulated failure for task: ObjectRef(f1f684a908232704ffffffffffffffffffffffff0300000001000000)


[36m(pid=93592)[0m 
[36m(pid=93592)[0m 
[36m(pid=93592)[0m 
[36m(pid=93592)[0m 
[36m(pid=93592)[0m 
[36m(pid=93592)[0m 
[36m(pid=82840)[0m 
[36m(pid=82840)[0m 
[36m(pid=82840)[0m 
[36m(pid=82840)[0m 
[36m(pid=82840)[0m 
[36m(pid=82840)[0m 
[36m(pid=93592)[0m 
[36m(pid=93592)[0m 
[36m(pid=93592)[0m 
[36m(pid=93592)[0m 
[36m(pid=93592)[0m 
[36m(pid=93592)[0m 
[36m(pid=82840)[0m 
[36m(pid=82840)[0m 
[36m(pid=82840)[0m 
[36m(pid=82840)[0m 
[36m(pid=82840)[0m 
[36m(pid=82840)[0m 
2024-12-07 01:11:10,356	ERROR worker.py:422 -- Unhandled error (suppress with 'RAY_IGNORE_UNHANDLED_ERRORS=1'): The worker died unexpectedly while executing this task. Check python-core-worker-*.log files for more information.


Trial 1: Average R^2 Score after failures: 0.6087, Average Training Time: 0.01 seconds
Simulated failure for task: ObjectRef(020ad482d0cf4e15ffffffffffffffffffffffff0300000001000000)
Simulated failure for task: ObjectRef(8f766538ab157fbbffffffffffffffffffffffff0300000001000000)
[33m(raylet)[0m A worker died or was killed while executing a task by an unexpected system error. To troubleshoot the problem, check the logs for the dead worker. RayTask ID: 58903c2d61e0505ab420273264428ec0fb9510e803000000 Worker ID: f6dc8e98d9231c6923060d4b1691edbf0b1fe344bdc74bef4428724d Node ID: ddf46cc29f954199e1fdd9d4ca119f09663a730b83b3f48dcd7b7dc7 Worker IP address: 127.0.0.1 Worker port: 10035 Worker PID: 82840 Worker exit type: SYSTEM_ERROR Worker exit detail: The leased worker has unrecoverable failure. Worker is requested to be destroyed when it is returned. RPC Error message: Connection reset; RPC Error details: 
Trial 2: Average R^2 Score after failures: 0.6087, Average Training Time: 0.01 seconds

2024-12-07 01:11:10,626	ERROR worker.py:422 -- Unhandled error (suppress with 'RAY_IGNORE_UNHANDLED_ERRORS=1'): The worker died unexpectedly while executing this task. Check python-core-worker-*.log files for more information.


Simulated failure for task: ObjectRef(c149fead3afd6354ffffffffffffffffffffffff0300000001000000)
Simulated failure for task: ObjectRef(f3758f9148a9b281ffffffffffffffffffffffff0300000001000000)
[33m(raylet)[0m A worker died or was killed while executing a task by an unexpected system error. To troubleshoot the problem, check the logs for the dead worker. RayTask ID: 10d6766f3b09e195172c57cb366cb5134db0141f03000000 Worker ID: 481b7c1f29a303507b50e185a34c7e2954f2f566047036df467f25d0 Node ID: ddf46cc29f954199e1fdd9d4ca119f09663a730b83b3f48dcd7b7dc7 Worker IP address: 127.0.0.1 Worker port: 10033 Worker PID: 93272 Worker exit type: SYSTEM_ERROR Worker exit detail: Worker unexpectedly exits with a connection error code 10054. An existing connection was forcibly closed by the remote host. There are some potential root causes. (1) The process is killed by SIGKILL by OOM killer due to high memory usage. (2) ray stop --force is called. (3) The worker is crashed unexpectedly due to SIGSEGV or ot

2024-12-07 01:11:10,759	ERROR worker.py:422 -- Unhandled error (suppress with 'RAY_IGNORE_UNHANDLED_ERRORS=1'): The worker died unexpectedly while executing this task. Check python-core-worker-*.log files for more information.
2024-12-07 01:11:10,808	ERROR worker.py:422 -- Unhandled error (suppress with 'RAY_IGNORE_UNHANDLED_ERRORS=1'): The worker died unexpectedly while executing this task. Check python-core-worker-*.log files for more information.


Trial 3: Average R^2 Score after failures: 0.6123, Average Training Time: 0.01 seconds
Overall Average R^2 Score after failures: 0.6127

Communication Overhead for 10 workers: 0.43 seconds

Resource Usage:
Total CPU Cores: 12
Available CPU Cores: 8.0
Total Memory: 7.73 GB
Used Memory: 7.10 GB


[36m(pid=82840)[0m A module that was compiled using NumPy 1.x cannot be run in[32m [repeated 24x across cluster] (Ray deduplicates logs by default. Set RAY_DEDUP_LOGS=0 to disable log deduplication, or see https://docs.ray.io/en/master/ray-observability/user-guides/configure-logging.html#log-deduplication for more options.)[0m
[36m(pid=82840)[0m NumPy 2.1.3 as it may crash. To support both 1.x and 2.x[32m [repeated 24x across cluster][0m
[36m(pid=82840)[0m versions of NumPy, modules must be compiled with NumPy 2.0.[32m [repeated 24x across cluster][0m
[36m(pid=82840)[0m Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.[32m [repeated 24x across cluster][0m
[36m(pid=82840)[0m If you are a user of the module, the easiest solution will be to[32m [repeated 24x across cluster][0m
[36m(pid=82840)[0m downgrade to 'numpy<2' or try to upgrade the affected module.[32m [repeated 24x across cluster][0m
[36m(pid=82840)[0m We expect that some modules will n