In [1]:
import tensorflow as tf
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
import time
import psutil

# Load California Housing Dataset and increase the size
def load_data(repeat=2):
    """Load California Housing, enlarge it, and return a train/test split.

    The raw data is split 80/20 first (``random_state=42``) and each split
    is then stacked on itself ``repeat`` times with ``np.tile``. Splitting
    before repeating keeps copies of the same row from ending up in both
    the training and the test set, which would leak test samples into
    training.

    Args:
        repeat: How many times each split is repeated. ``1`` keeps the
            original size; the default ``2`` doubles it.

    Returns:
        ``[X_train, X_test, y_train, y_test]`` as NumPy arrays.

    Raises:
        ValueError: If ``repeat`` is smaller than 1.
    """
    if repeat < 1:
        raise ValueError(f"repeat must be >= 1, got {repeat}")

    data = fetch_california_housing()
    X_train, X_test, y_train, y_test = train_test_split(
        data.data, data.target, test_size=0.2, random_state=42
    )

    # Features are 2-D (repeat rows only); targets are 1-D.
    return [
        np.tile(X_train, (repeat, 1)),
        np.tile(X_test, (repeat, 1)),
        np.tile(y_train, repeat),
        np.tile(y_test, repeat),
    ]

# TensorFlow distributed linear regression training function with MirroredStrategy
def train_worker(X, y):
    """Train a one-unit linear model under MirroredStrategy and score it.

    Args:
        X: 2-D feature array; ``X.shape[1]`` sets the model's input width.
        y: Target values, one per row of ``X``. Flattened to 1-D before
            the R^2 computation so a column vector cannot broadcast
            against the flattened predictions.

    Returns:
        ``(r2_score, training_time)`` where ``r2_score`` is R^2 measured on
        the same data the model was trained on, and ``training_time`` is
        the wall-clock seconds spent inside ``model.fit`` only.
        If ``y`` is constant the R^2 denominator is zero and the result
        follows NumPy's division-by-zero semantics.
    """
    # Create a MirroredStrategy for distributed training
    strategy = tf.distribute.MirroredStrategy()

    print(f"Number of devices: {strategy.num_replicas_in_sync}")

    # Only model construction and compilation are placed inside the
    # strategy scope; training and evaluation run outside it.
    with strategy.scope():
        # Simple linear regression: explicit Input layer instead of the
        # deprecated ``input_dim`` argument on Dense.
        model = tf.keras.Sequential([
            tf.keras.Input(shape=(X.shape[1],)),
            tf.keras.layers.Dense(1),  # 1 output (regression)
        ])
        model.compile(optimizer='adam', loss='mse')

    # Time the fit call alone, with a monotonic clock, so evaluation cost
    # is not counted as training time.
    start_time = time.perf_counter()
    model.fit(X, y, epochs=10, batch_size=32, verbose=0)
    training_time = time.perf_counter() - start_time

    # Evaluate the model and get R^2 score
    predictions = model.predict(X).flatten()
    targets = np.asarray(y).ravel()
    ss_total = np.sum((targets - np.mean(targets)) ** 2)
    ss_residual = np.sum((targets - predictions) ** 2)
    r2_score = 1 - (ss_residual / ss_total)

    # Resource snapshot taken after training and evaluation have finished.
    monitor_core_usage()

    return r2_score, training_time

# Function to monitor core usage during training
def monitor_core_usage():
    """Print a snapshot of CPU and memory usage.

    The CPU percentage is sampled over a blocking one-second window that
    starts when this function is called, so it describes the system at
    call time rather than during any earlier work. Returns nothing; all
    results are printed.
    """
    # logical=True counts logical cores; logical=False counts physical ones.
    logical_cores = psutil.cpu_count(logical=True)
    physical_cores = psutil.cpu_count(logical=False)

    # Blocks for one second while utilisation is measured.
    core_usage = psutil.cpu_percent(interval=1)

    print(f"Total CPU Cores: {logical_cores}")
    print(f"Available CPU Cores (Physical): {physical_cores}")
    print(f"CPU Usage during training: {core_usage}%")

    # Read memory statistics once so total and used come from the same
    # snapshot.
    memory = psutil.virtual_memory()
    bytes_per_gb = 1024 ** 3
    total_memory = memory.total / bytes_per_gb
    used_memory = (memory.total - memory.available) / bytes_per_gb

    print(f"Total Memory: {total_memory:.2f} GB")
    print(f"Used Memory: {used_memory:.2f} GB")

# Fault Tolerance Test with Average Calculation
def measure_fault_tolerance(num_workers, num_failures, X_train, y_train, num_trials=5):
    """Average the surviving workers' R^2 over several simulated-failure trials.

    In each trial ``num_workers`` training runs are executed one after
    another, then up to ``num_failures`` randomly chosen results are
    discarded. The trial score is the mean R^2 of what is left, or 0 when
    nothing is left.

    Args:
        num_workers: Training runs per trial.
        num_failures: Results to discard per trial.
        X_train: Features passed to every training run.
        y_train: Targets passed to every training run.
        num_trials: Number of trials to average over.

    Returns:
        The mean of the per-trial scores.
    """
    print(f"\nFault Tolerance Test: {num_workers} Workers with {num_failures} Failures (over {num_trials} trials)")

    per_trial_scores = []

    for trial_no in range(1, num_trials + 1):
        # Run every worker to completion and keep its score and duration.
        outcomes = [train_worker(X_train, y_train) for _ in range(num_workers)]
        scores = [score for score, _ in outcomes]
        durations = [elapsed for _, elapsed in outcomes]

        # Drop one random result per simulated failure; stop once none remain.
        for _ in range(min(num_failures, len(scores))):
            victim = np.random.randint(len(scores))
            del scores[victim]
            del durations[victim]
            print(f"Simulated failure for task: {victim}")

        if not scores:
            print(f"Trial {trial_no}: No surviving workers completed their tasks.")
            per_trial_scores.append(0)
            continue

        avg_score = np.mean(scores)
        avg_training_time = np.mean(durations)
        per_trial_scores.append(avg_score)
        print(f"Trial {trial_no}: Average R^2 Score after failures: {avg_score:.4f}, Average Training Time: {avg_training_time:.2f} seconds")

    # Mean across all trials, including trials recorded as 0.
    overall_avg_score = np.mean(per_trial_scores)
    print(f"Overall Average R^2 Score after failures: {overall_avg_score:.4f}\n")
    return overall_avg_score

# Measure communication overhead
def measure_communication_overhead(num_workers, X_train, y_train):
    """Time ``num_workers`` sequential training runs and report the total.

    The reported figure is the total wall-clock time of the runs executed
    back to back (training, evaluation and resource monitoring included);
    it is not an isolated measurement of inter-worker communication.

    Args:
        num_workers: Number of training runs to execute.
        X_train: Features passed to every training run.
        y_train: Targets passed to every training run.

    Returns:
        Elapsed wall-clock time in seconds.
    """
    # perf_counter is monotonic, so the interval cannot be distorted by
    # system clock adjustments.
    start_time = time.perf_counter()

    for _ in range(num_workers):
        train_worker(X_train, y_train)

    communication_time = time.perf_counter() - start_time
    print(f"Communication Overhead for {num_workers} workers: {communication_time:.2f} seconds")
    return communication_time

# Main execution
if __name__ == "__main__":
    # Only the training split is used below; the test split is unpacked
    # but not consumed by this script.
    X_train, X_test, y_train, y_test = load_data()

    # For each worker count, run the fault-tolerance trials (2 simulated
    # failures, 3 trials) followed by the timing measurement.
    worker_counts = (2, 4, 8, 10)
    for n_workers in worker_counts:
        measure_fault_tolerance(n_workers, 2, X_train, y_train, num_trials=3)
        measure_communication_overhead(n_workers, X_train, y_train)

    # Final resource snapshot once every configuration has run.
    print("\nResource Usage:")
    monitor_core_usage()



Fault Tolerance Test: 2 Workers with 2 Failures (over 3 trials)
INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:CPU:0',)
Number of devices: 1


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)



[1m3096/3096[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 576us/step
Total CPU Cores: 12
Available CPU Cores (Physical): 10
CPU Usage during training: 4.1%
Total Memory: 7.73 GB
Used Memory: 7.25 GB
INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:CPU:0',)
Number of devices: 1
[1m3096/3096[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 654us/step
Total CPU Cores: 12
Available CPU Cores (Physical): 10
CPU Usage during training: 0.1%
Total Memory: 7.73 GB
Used Memory: 7.20 GB
Simulated failure for task: 0
Simulated failure for task: 0
Trial 1: No surviving workers completed their tasks.
INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:CPU:0',)
Number of devices: 1
[1m3096/3096[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 621us/step
Total CPU Cores: 12
Available CPU Cores (Physical): 10
CPU Usage during training: 6.4%
Total Memory: 7.73 GB
Used Memory: 7.23 GB
INFO:tensorfl