In [2]:
import ray
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
import time
import psutil

# Initialize Ray
# NOTE(review): ignore_reinit_error=True presumably lets this cell be re-run
# in a live kernel without failing on a second init -- confirm against the
# Ray docs. This call runs at module top level (outside the __main__ guard),
# whereas ray.shutdown() is only reached inside that guard.
ray.init(ignore_reinit_error=True, dashboard_port=8265)

# Load California Housing Dataset
def load_data():
    """Fetch the California Housing data and split it 80/20 into train/test.

    Returns:
        The result of ``train_test_split`` on the feature matrix and target
        vector, using ``test_size=0.2`` and a fixed ``random_state`` of 42 so
        the split is the same on every call.
    """
    dataset = fetch_california_housing()
    features = dataset.data
    targets = dataset.target
    return train_test_split(features, targets, test_size=0.2, random_state=42)

# Linear regression training function
@ray.remote
def train_worker(X, y):
    """Fit a linear regression on ``(X, y)`` and return its R^2 score.

    The score is computed on the same data the model was fitted on, so it is
    a training-set R^2, not a held-out estimate.
    """
    fitted = LinearRegression().fit(X, y)
    return fitted.score(X, y)

# Fault Tolerance Test with Average Calculation
def measure_fault_tolerance(num_workers, num_failures, X_train, y_train, num_trials=5):
    """Run repeated trials of sharded training with simulated task failures.

    In each trial the training data is split into ``num_workers`` shards, one
    ``train_worker`` task is launched per shard, and up to ``num_failures``
    randomly chosen tasks are force-cancelled. The R^2 scores of the tasks
    that were not cancelled are averaged to give the trial score.

    Args:
        num_workers: Number of shards / remote tasks per trial. Must be >= 1.
        num_failures: Number of tasks to cancel per trial. Values larger than
            ``num_workers`` cancel every task.
        X_train: Feature array, split along its first axis.
        y_train: Target array, split along its first axis.
        num_trials: Number of independent trials to run.

    Returns:
        The mean of the per-trial average scores. A trial in which no task
        produced a result contributes 0. If ``num_trials`` is 0 (or negative)
        no trials run and NaN is returned.

    Raises:
        ValueError: If ``num_workers`` is less than 1.
    """
    # Fail early with a clear message instead of deep inside np.array_split.
    if num_workers < 1:
        raise ValueError(f"num_workers must be at least 1, got {num_workers}")

    print(f"Fault Tolerance Test: {num_workers} Workers with {num_failures} Failures (over {num_trials} trials)")

    trial_results = []

    for trial in range(num_trials):
        split_X = np.array_split(X_train, num_workers)
        split_y = np.array_split(y_train, num_workers)

        # Launch one remote training task per shard.
        pending = [
            train_worker.remote(X_part, y_part)
            for X_part, y_part in zip(split_X, split_y)
        ]

        # Simulate failures by force-cancelling randomly chosen tasks.
        cancelled = []
        for _ in range(min(num_failures, len(pending))):
            victim = pending.pop(np.random.randint(len(pending)))
            ray.cancel(victim, force=True)
            cancelled.append(victim)
            print(f"Simulated failure for task: {victim}")

        # Collect results from surviving workers.
        results = []
        for ref in pending:
            try:
                results.append(ray.get(ref))
            except ray.exceptions.RayError as e:
                print(f"Task {ref} failed with error: {e}")

        # Retrieve the cancelled refs as well. Their failure is the expected
        # outcome, so it is deliberately discarded; fetching them is intended
        # to stop Ray from reporting them later as "Unhandled error" when the
        # references are released.
        for ref in cancelled:
            try:
                ray.get(ref)
            except ray.exceptions.RayError:
                pass

        if results:
            avg_score = np.mean(results)
            trial_results.append(avg_score)
            print(f"Trial {trial + 1}: Average R^2 Score after failures: {avg_score:.4f}")
        else:
            print(f"Trial {trial + 1}: No surviving workers completed their tasks.")
            trial_results.append(0)

    # Calculate overall average fault tolerance score. With zero trials there
    # is nothing to average; return NaN directly rather than letting
    # np.mean([]) emit a RuntimeWarning.
    if trial_results:
        overall_avg_score = np.mean(trial_results)
    else:
        overall_avg_score = np.float64(np.nan)
    print(f"Overall Average R^2 Score after failures: {overall_avg_score:.4f}\n")
    return overall_avg_score

# Resource monitoring function
def monitor_resource_usage():
    """Print CPU core counts and host memory figures.

    Memory values are reported in GB (bytes divided by ``1024 ** 3``). "Used"
    memory is total memory minus the available memory reported by psutil.
    """
    bytes_per_gb = 1024 ** 3

    logical_cpus = psutil.cpu_count(logical=True)
    # Falls back to 0 when Ray reports no "CPU" entry.
    free_cpus = ray.available_resources().get("CPU", 0)
    mem_total_gb = psutil.virtual_memory().total / bytes_per_gb
    mem_used_gb = mem_total_gb - psutil.virtual_memory().available / bytes_per_gb

    report = (
        f"Total CPU Cores: {logical_cpus}",
        f"Available CPU Cores: {free_cpus}",
        f"Total Memory: {mem_total_gb:.2f} GB",
        f"Used Memory: {mem_used_gb:.2f} GB",
    )
    for line in report:
        print(line)

# Main execution
if __name__ == "__main__":
    # try/finally ensures the Ray connection is released even if data loading
    # or one of the fault-tolerance runs raises.
    try:
        # The test split is not used below; the names are kept because they
        # are bound at module level.
        X_train, X_test, y_train, y_test = load_data()

        # Test fault tolerance with different configurations
        measure_fault_tolerance(num_workers=4, num_failures=1, X_train=X_train, y_train=y_train, num_trials=3)
        measure_fault_tolerance(num_workers=8, num_failures=2, X_train=X_train, y_train=y_train, num_trials=3)

        # Monitor resource usage
        print("\nResource Usage:")
        monitor_resource_usage()
    finally:
        ray.shutdown()


2024-12-04 00:32:08,710	INFO util.py:154 -- Outdated packages:
  ipywidgets==7.8.1 found, needs ipywidgets>=8
Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.
2024-12-04 00:32:15,612	INFO worker.py:1634 -- Connecting to existing Ray cluster at address: 127.0.0.1:6379...
2024-12-04 00:32:15,673	INFO worker.py:1819 -- Connected to Ray cluster.


Fault Tolerance Test: 4 Workers with 1 Failures (over 3 trials)
Simulated failure for task: ObjectRef(4482c0d3e15a41a8ffffffffffffffffffffffff0500000001000000)
Trial 1: Average R^2 Score after failures: 0.6145
Simulated failure for task: ObjectRef(b9a5010f4f40611cffffffffffffffffffffffff0500000001000000)


2024-12-04 00:32:30,854	ERROR worker.py:422 -- Unhandled error (suppress with 'RAY_IGNORE_UNHANDLED_ERRORS=1'): The worker died unexpectedly while executing this task. Check python-core-worker-*.log files for more information.


Trial 2: Average R^2 Score after failures: 0.6129
Simulated failure for task: ObjectRef(bbb11aeaabc120eaffffffffffffffffffffffff0500000001000000)


2024-12-04 00:32:36,006	ERROR worker.py:422 -- Unhandled error (suppress with 'RAY_IGNORE_UNHANDLED_ERRORS=1'): The worker died unexpectedly while executing this task. Check python-core-worker-*.log files for more information.


Trial 3: Average R^2 Score after failures: 0.6145
Overall Average R^2 Score after failures: 0.6140

Fault Tolerance Test: 8 Workers with 2 Failures (over 3 trials)
Simulated failure for task: ObjectRef(81a3509f4f3351b2ffffffffffffffffffffffff0500000001000000)
Simulated failure for task: ObjectRef(98d7dbefad4deaeaffffffffffffffffffffffff0500000001000000)
Trial 1: Average R^2 Score after failures: 0.6386
Simulated failure for task: ObjectRef(4aa2f0cdc5f0bbceffffffffffffffffffffffff0500000001000000)
Simulated failure for task: ObjectRef(9af1a5dde5f59af2ffffffffffffffffffffffff0500000001000000)
Trial 2: Average R^2 Score after failures: 0.6275


2024-12-04 00:32:37,443	ERROR worker.py:422 -- Unhandled error (suppress with 'RAY_IGNORE_UNHANDLED_ERRORS=1'): The worker died unexpectedly while executing this task. Check python-core-worker-*.log files for more information.


Simulated failure for task: ObjectRef(cd38da7afea09ef5ffffffffffffffffffffffff0500000001000000)
Simulated failure for task: ObjectRef(f9b2a1b2309c55c2ffffffffffffffffffffffff0500000001000000)
Trial 3: Average R^2 Score after failures: 0.6389
Overall Average R^2 Score after failures: 0.6350


Resource Usage:
Total CPU Cores: 12
Available CPU Cores: 5.0
Total Memory: 7.73 GB
Used Memory: 6.63 GB
