# RayDP - Distributed Spark MLLib based Model Training on Snowpark Container Services

This notebook demonstrates how to use RayDP to perform distributed Spark MLLIb based model training on Ray cluster in Snowpark Container Services.

## Setup and Imports

In [1]:
import ray
import raydp
import pprint
import warnings
import logging    
import time
import os
import numpy as np
import socket
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, rand, when, round as spark_round
from pyspark.sql.types import StructType, StructField, DoubleType, IntegerType
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.classification import RandomForestClassifier, LogisticRegression
from pyspark.ml.regression import RandomForestRegressor, LinearRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator, RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml import Pipeline
import snowflake.connector
from snowflake.snowpark import Session
from snowflake.ml.data.data_connector import DataConnector
from snowflake.ml.ray.datasink import SnowflakeTableDatasink
print(f"Ray version: {ray.__version__}")
print(f"RayDP version: {raydp.__version__}")

  import pkg_resources


Ray version: 2.46.0
RayDP version: 1.6.2


## Initialize Snowpark Session

In [2]:
def connection() -> snowflake.connector.SnowflakeConnection:
    if os.path.isfile("/snowflake/session/token"):
        creds = {
            'host': os.getenv('SNOWFLAKE_HOST'),
            'port': os.getenv('SNOWFLAKE_PORT'),
            'protocol': "https",
            'account': os.getenv('SNOWFLAKE_ACCOUNT'),
            'authenticator': "oauth",
            'token': open('/snowflake/session/token', 'r').read(),
            'warehouse': "LARGE_WH",
            'database': os.getenv('SNOWFLAKE_DATABASE'),
            'schema': os.getenv('SNOWFLAKE_SCHEMA'),
            'client_session_keep_alive': True
        }
    else:
        creds = {
            'account': os.getenv('SNOWFLAKE_ACCOUNT'),
            'user': os.getenv('SNOWFLAKE_USER'),
            'password': os.getenv('SNOWFLAKE_PASSWORD'),
            'warehouse': snowflake_warehouse,
            'database': os.getenv('SNOWFLAKE_DATABASE'),
            'schema': os.getenv('SNOWFLAKE_SCHEMA'),
            'client_session_keep_alive': True
        }

    connection = snowflake.connector.connect(**creds)
    return connection

def get_session() -> Session:
    return Session.builder.configs({"connection": connection()}).create()

In [3]:
session = get_session()

In [4]:
session.get_current_database()

'"RAYDP_SIS_DB"'

In [5]:
cli = ray.init(address="raydpheadservice:6379", ignore_reinit_error=True, log_to_driver=False)

2025-07-08 07:13:19,069	INFO worker.py:1694 -- Connecting to existing Ray cluster at address: raydpheadservice:6379...
2025-07-08 07:13:19,179	INFO worker.py:1879 -- Connected to Ray cluster. View the dashboard at [1m[32m10.244.28.9:8265 [39m[22m
[2025-07-08 07:13:19,234 I 19218 19218] logging.cc:297: Set ray log level from environment variable RAY_BACKEND_LOG_LEVEL to 1


In [6]:
cluster_resources = ray.cluster_resources()
nodes = ray.nodes()

In [7]:
print("  Cluster Information:")
for i, node in enumerate(nodes):
    node_resources = node.get('Resources', {})
    print(f"   Node {i+1}: {node_resources}")
print()

  Cluster Information:
   Node 1: {'memory': 245448057651.0, 'node:10.244.28.9': 1.0, 'node:__internal_head__': 1.0, 'object_store_memory': 11220602060.0, 'node_type': 1.0, 'CPU': 28.0}
   Node 2: {'memory': 246363530035.0, 'object_store_memory': 11220602060.0, 'CPU': 28.0, 'node_type': 2.0, 'node:10.244.28.137': 1.0}
   Node 3: {'node_type': 2.0, 'node:10.244.28.201': 1.0, 'CPU': 28.0, 'memory': 246363697971.0, 'object_store_memory': 11220602060.0}
   Node 4: {'node_type': 2.0, 'node:10.244.28.73': 1.0, 'CPU': 28.0, 'memory': 246363603763.0, 'object_store_memory': 11220602060.0}



## See data

In [8]:
raw_data_snowdf = session.table("SPARK_MLLIB_SAMPLE_DATASET")
raw_data_snowdf.limit(1).to_pandas()

Unnamed: 0,ID,FEATURE_0,FEATURE_1,FEATURE_2,FEATURE_3,FEATURE_4,FEATURE_5,FEATURE_6,FEATURE_7,FEATURE_8,...,FEATURE_11,FEATURE_12,FEATURE_13,FEATURE_14,FEATURE_15,FEATURE_16,FEATURE_17,FEATURE_18,FEATURE_19,TARGET
0,0,1.764052,-0.394469,0.973217,2.240893,1.867558,-0.977278,0.950088,-0.151357,-0.103219,...,1.454274,0.761038,0.121675,0.443863,0.333674,1.494079,-0.205158,0.313068,-0.854096,1


In [9]:
train_snowdf, test_snowdf = raw_data_snowdf.random_split(weights=[0.70, 0.30], seed=0)

In [10]:
train_snowdf.write.mode("overwrite").save_as_table("TRAIN_SPARK_MLLIB_DATASET")
test_snowdf.write.mode("overwrite").save_as_table("TEST_SPARK_MLLIB_DATASET")

In [11]:
train_snowdf = session.table("TRAIN_SPARK_MLLIB_DATASET")
test_snowdf = session.table("TEST_SPARK_MLLIB_DATASET")

### Get optimal spark config

In [12]:
def configure_logging():
    logging.getLogger("py4j").setLevel(logging.ERROR)
    logging.getLogger("pyspark").setLevel(logging.ERROR)
    warnings.filterwarnings("ignore", category=FutureWarning)
    warnings.filterwarnings("ignore", category=UserWarning)
    print("Logging configured to suppress common Spark warnings")

def get_default_spark_configs():
    return {
        # Core Spark optimizations
        "spark.sql.adaptive.enabled": "true",
        "spark.sql.adaptive.coalescePartitions.enabled": "true",
        "spark.sql.adaptive.skewJoin.enabled": "true",
        "spark.sql.adaptive.localShuffleReader.enabled": "true",
        "spark.sql.adaptive.advisoryPartitionSizeInBytes": "128MB",
        "spark.sql.adaptive.maxRecordsPerPartition": "40000",
        
        # Serialization and compression optimizations
        "spark.serializer": "org.apache.spark.serializer.KryoSerializer",
        "spark.kryo.registrationRequired": "false",
        "spark.kryo.unsafe": "true",
        "spark.rdd.compress": "true",
        "spark.io.compression.codec": "lz4",
        "spark.broadcast.compress": "true",
        "spark.shuffle.compress": "true",
        "spark.shuffle.spill.compress": "true",
        
        # Memory optimizations for MLlib
        "spark.executor.memoryFraction": "0.8",
        "spark.storage.memoryFraction": "0.3",
        "spark.shuffle.memoryFraction": "0.5",
        "spark.executor.extraJavaOptions": "-XX:+UseG1GC -XX:G1HeapRegionSize=16m -XX:+UseStringDeduplication",
        
        # MLlib specific optimizations
        "spark.ml.tree.maxMemoryInMB": "4096",
        "spark.ml.tree.maxDepth": "15",
        "spark.mllib.tree.maxMemoryInMB": "4096",
        "spark.sql.execution.arrow.pyspark.enabled": "true",
        "spark.sql.execution.arrow.maxRecordsPerBatch": "10000",
        
        # Network and shuffle optimizations
        "spark.shuffle.service.enabled": "false",
        "spark.shuffle.file.buffer": "1m",
        "spark.reducer.maxSizeInFlight": "96m",
        "spark.shuffle.io.maxRetries": "6",
        "spark.shuffle.io.retryWait": "30s",
        "spark.network.timeout": "600s",
        "spark.executor.heartbeatInterval": "30s",
        
        # Broadcast optimizations
        "spark.sql.autoBroadcastJoinThreshold": "200MB",
        "spark.broadcast.blockSize": "16m",
        
        # Parallelism optimizations
        "spark.task.cpus": "1",
        "spark.task.maxFailures": "3",
        "spark.stage.maxConsecutiveAttempts": "8",
        
        # Dynamic allocation disabled for consistent performance
        "spark.dynamicAllocation.enabled": "false",
        
        # Speculation for fault tolerance
        "spark.speculation": "true",
        "spark.speculation.interval": "5s",
        "spark.speculation.multiplier": "2.0",
        "spark.speculation.quantile": "0.9"
    }

In [13]:
if 'spark' in locals():
    spark.stop()
    print("Stopped previous Spark session to apply optimized configuration")
    
cli = ray.init(address="raydpheadservice:6379", ignore_reinit_error=True, log_to_driver=False)
print("Restarted Ray connection")

2025-07-08 07:13:27,592	INFO worker.py:1694 -- Connecting to existing Ray cluster at address: raydpheadservice:6379...
2025-07-08 07:13:27,592	INFO worker.py:1718 -- Calling ray.init() again after it has already been called.


Restarted Ray connection


In [14]:
def get_ray_cluster_analysis():
    cluster_resources = ray.cluster_resources()
    nodes = ray.nodes()
    total_cpus = int(cluster_resources.get('CPU', 0))
    total_memory_bytes = cluster_resources.get('memory', 0)
    total_memory_gb = total_memory_bytes / (1024**3)
    head_nodes = [n for n in nodes if n.get('Resources', {}).get('node_type', 0) == 1]
    worker_nodes = [n for n in nodes if n.get('Resources', {}).get('node_type', 0) == 2]
    cpus_per_node = total_cpus // len(nodes)
    memory_per_node_gb = total_memory_gb / len(nodes)
    analysis = {
        'total_nodes': len(nodes),
        'head_nodes': len(head_nodes),
        'worker_nodes': len(worker_nodes),
        'total_cpus': total_cpus,
        'total_memory_gb': total_memory_gb,
        'cpus_per_node': cpus_per_node,
        'memory_per_node_gb': memory_per_node_gb,
        'nodes': nodes
    }
    print(f"Ray Cluster Analysis:")
    print(f"   Total Nodes: {analysis['total_nodes']} (Head: {analysis['head_nodes']}, Workers: {analysis['worker_nodes']})")
    print(f"   Total CPUs: {analysis['total_cpus']}")
    print(f"   Total Memory: {analysis['total_memory_gb']:.1f} GB")
    print(f"   Per Node: {analysis['cpus_per_node']} CPUs, {analysis['memory_per_node_gb']:.1f} GB")
    
    return analysis

def get_dynamic_spark_configs(cluster_analysis):
    # Strategy: Use more executors with fewer cores each for better MLlib parallelism
    # MLlib (especially Random Forest) performs better with more parallelism
    total_cpus = cluster_analysis['total_cpus']
    total_memory_gb = cluster_analysis['total_memory_gb']
    total_nodes = cluster_analysis['total_nodes']
    # Optimal executor strategy for MLlib:
    # - Target 2 executors per node for uniform distribution
    # - Use fewer cores per executor (5-8 cores optimal for MLlib)
    # - Maximize memory per executor for feature vectors
    num_executors = total_nodes * 2  # 2 executors per node
    # Reserve CPUs for driver and OS (1 per node)
    available_cpus = total_cpus - total_nodes
    executor_cores = max(5, min(8, available_cpus // num_executors))
    # Aggressive memory allocation (90% utilization)
    # Reserve only 2GB per node for OS/Ray overhead
    available_memory_gb = total_memory_gb - (total_nodes * 2)
    executor_memory_gb = int(available_memory_gb * 0.85 / num_executors)
    driver_memory_gb = int(available_memory_gb * 0.15)
    # Ensure minimum viable sizes
    executor_memory_gb = max(8, executor_memory_gb)
    driver_memory_gb = max(4, min(16, driver_memory_gb))
    # Calculate optimal partitions for MLlib
    # Rule: 2-4 partitions per core for MLlib workloads
    optimal_partitions = (num_executors * executor_cores) * 3
    config = {
        'num_executors': num_executors,
        'executor_cores': executor_cores,
        'executor_memory': f"{executor_memory_gb}g",
        'driver_memory': f"{driver_memory_gb}g",
        'optimal_partitions': optimal_partitions,
        'total_executor_cores': num_executors * executor_cores,
        'total_executor_memory_gb': num_executors * executor_memory_gb,
        'cpu_utilization_pct': (num_executors * executor_cores) / total_cpus * 100,
        'memory_utilization_pct': (num_executors * executor_memory_gb + driver_memory_gb) / total_memory_gb * 100
    }
    print(f"Optimal Spark Configuration:")
    print(f"   Strategy: High parallelism with optimal executor sizing for MLlib")
    print(f"   Executors: {config['num_executors']} ({config['num_executors']//total_nodes} per node)")
    print(f"   Executor cores: {config['executor_cores']} (total: {config['total_executor_cores']})")
    print(f"   Executor memory: {config['executor_memory']} (total: {config['total_executor_memory_gb']}GB)")
    print(f"   Driver memory: {config['driver_memory']}")
    print(f"   Optimal partitions: {config['optimal_partitions']}")
    print(f"   CPU utilization: {config['cpu_utilization_pct']:.1f}%")
    print(f"   Memory utilization: {config['memory_utilization_pct']:.1f}%")
    
    return config

In [15]:
# Apply optimized configuration
configure_logging()
cluster_analysis = get_ray_cluster_analysis()
dynamic_spark_config = get_dynamic_spark_configs(cluster_analysis)
spark_configs = get_default_spark_configs()
spark_configs["spark.driver.memory"] = dynamic_spark_config['driver_memory']

# Add extra speed optimizations
spark_configs.update({
    "spark.sql.shuffle.partitions": str(dynamic_spark_config['optimal_partitions']),
    "spark.default.parallelism": str(dynamic_spark_config['total_executor_cores'] * 2),
    "spark.ml.tree.maxMemoryInMB": "8192",  # More memory for faster tree building
    "spark.executor.cores": str(dynamic_spark_config['executor_cores']),
    "spark.task.cpus": "1"
})

print(f"Ready to restart with dynamic spark configuration:")
print(f"   {dynamic_spark_config['num_executors']} executors × {dynamic_spark_config['executor_cores']} cores = {dynamic_spark_config['total_executor_cores']} total cores")
print(f"   Memory: {dynamic_spark_config['executor_memory']} per executor")
print(f"   Partitions: {dynamic_spark_config['optimal_partitions']}")
print("=" * 70)    

Ray Cluster Analysis:
   Total Nodes: 4 (Head: 1, Workers: 3)
   Total CPUs: 112
   Total Memory: 916.9 GB
   Per Node: 28 CPUs, 229.2 GB
Optimal Spark Configuration:
   Strategy: High parallelism with optimal executor sizing for MLlib
   Executors: 8 (2 per node)
   Executor cores: 8 (total: 64)
   Executor memory: 96g (total: 768GB)
   Driver memory: 16g
   Optimal partitions: 192
   CPU utilization: 57.1%
   Memory utilization: 85.5%
Ready to restart with dynamic spark configuration:
   8 executors × 8 cores = 64 total cores
   Memory: 96g per executor
   Partitions: 192


## Initialize Ray DP with optimal Spark config

In [16]:
print("Initializing RayDP with optimal spark configuration...")
spark = raydp.init_spark(
    app_name="RayDP_MLLib_Training",
    num_executors=dynamic_spark_config['num_executors'],
    executor_cores=dynamic_spark_config['executor_cores'],
    executor_memory=dynamic_spark_config['executor_memory'],
    configs=spark_configs
)
print(f"Spark started with {dynamic_spark_config['num_executors']} executors × {dynamic_spark_config['executor_cores']} cores")
print(f"   Total cores: {dynamic_spark_config['total_executor_cores']}")
print(f"   Memory per executor: {dynamic_spark_config['executor_memory']}")

Initializing RayDP with optimal spark configuration...


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/07/08 07:15:04 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Spark started with 8 executors × 8 cores
   Total cores: 64
   Memory per executor: 96g


In [17]:
def set_spark_log_level(spark_session):
    try:
        spark_context = spark_session.sparkContext
        spark_context.setLogLevel("ERROR")
        print("Spark log level set to ERROR (warnings suppressed)")
    except Exception as e:
        print(f"Could not set Spark log level: {e}")
set_spark_log_level(spark)



In [18]:
print(f"Spark session initialized")
print(f"Application ID: {spark.sparkContext.applicationId}")
print(f"Spark Version: {spark.version}")

Spark session initialized
Application ID: spark-application-1751958905166
Spark Version: 3.5.4


In [19]:
spark

In [20]:
optimal_partitions = dynamic_spark_config['optimal_partitions']
print(f"Using optimal partitions for MLlib: {optimal_partitions}")
print(f"   Partitions per core: {optimal_partitions / dynamic_spark_config['total_executor_cores']:.1f}")

Using optimal partitions for MLlib: 192
   Partitions per core: 3.0


### See partitioned spark dataframe

In [21]:
%%time
df = DataConnector.from_dataframe(train_snowdf).to_ray_dataset().to_spark(spark).repartition(int(optimal_partitions))
print(f"Created Spark DataFrame with {df.rdd.getNumPartitions()} partitions")

SnowflakeLoginOptions() is in private preview since 0.2.0. Do not use it in production. 


Info - 2025-07-08 07:16:42.869333 - Loading data from Snowpark Dataframe from query id 01bd8bd4-0205-ed1a-0000-50070cb49a62


2025-07-08 07:16:45,731	INFO logging.py:290 -- Registered dataset logger for dataset dataset_2_0
2025-07-08 07:16:45,743	INFO streaming_executor.py:117 -- Starting execution of Dataset dataset_2_0. Full logs are in /raylogs/ray/session_2025-07-08_06-25-28_011968_23/logs/ray-data
2025-07-08 07:16:45,743	INFO streaming_executor.py:118 -- Execution plan of Dataset dataset_2_0: InputDataBuffer[Input] -> TaskPoolMapOperator[ReadResultSetDataSource]


Info - 2025-07-08 07:16:45.716959 - Finished executing data load query.
Info - 2025-07-08 07:16:45.725757 - Loaded data into ray dataset.


2025-07-08 07:16:49,224	INFO streaming_executor.py:220 -- ✔️  Dataset dataset_2_0 execution finished in 3.48 seconds

Created Spark DataFrame with 192 partitions
CPU times: user 1.26 s, sys: 319 ms, total: 1.58 s
Wall time: 17.2 s


In [22]:
df.show(1)

                                                                                

+------+-----------+----------+-----------+------------+-----------+---------+-----------+-----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+------+
|    ID|  FEATURE_0| FEATURE_1|  FEATURE_2|   FEATURE_3|  FEATURE_4|FEATURE_5|  FEATURE_6|  FEATURE_7| FEATURE_8| FEATURE_9|FEATURE_10|FEATURE_11|FEATURE_12|FEATURE_13|FEATURE_14|FEATURE_15|FEATURE_16|FEATURE_17|FEATURE_18|FEATURE_19|TARGET|
+------+-----------+----------+-----------+------------+-----------+---------+-----------+-----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+------+
|495692|-0.42123833|0.13511305|-0.12561826|-0.068190016|-0.99639165|2.0022051|0.123225786|-0.82437736|0.37117386|0.11159816| 1.3409237|0.26517436|-2.4053752| 1.4127581|0.27067843|-1.2798657|0.49224877|-0.6434579| 0.5071654| 0.9820699|     0|
+------+-----------+----------+-

## Train classification model

In [24]:
def train_classification_model(df, spark_config):
    feature_cols = [f"FEATURE_{i}" for i in range(20)]
    assembler = VectorAssembler(inputCols=feature_cols, outputCol="RAW_FEATURES")
    scaler = StandardScaler(inputCol="RAW_FEATURES", outputCol="FEATURES", withStd=True, withMean=True)
    # Optimize Random Forest parameters based on cluster configuration
    num_executors = spark_config['num_executors']
    executor_cores = spark_config['executor_cores']
    total_cores = spark_config['total_executor_cores']
    
    optimal_num_trees = min(50, max(20, total_cores))
    optimal_max_depth = 6  
    optimal_max_bins = 32
    optimal_min_instances = 20
    
    rf = RandomForestClassifier(
        labelCol="TARGET",
        featuresCol="FEATURES",
        numTrees=optimal_num_trees,
        maxDepth=optimal_max_depth,
        maxBins=optimal_max_bins,
        minInstancesPerNode=optimal_min_instances,
        subsamplingRate=0.8,  # Bagging for better generalization  
        featureSubsetStrategy="sqrt",
        seed=42,
        cacheNodeIds=True,  # Cache for better performance
        checkpointInterval=20  # Less frequent checkpoints for SPEED
    )
    
    pipeline = Pipeline(stages=[assembler, scaler, rf])
    train_data, test_data = df.randomSplit([0.8, 0.2], seed=42)
    
    # Cache datasets for better performance
    train_data.cache()
    test_data.cache()
    
    print(f"Optimal Random Forest Configuration:")
    print(f"   Trees: {optimal_num_trees}")
    print(f"   Max depth: {optimal_max_depth}")
    print(f"   Max bins: {optimal_max_bins}")
    print(f"   Min instances: {optimal_min_instances}")
    print(f"   Feature strategy: sqrt")
    print(f"   Training set: {train_data.count():,} rows")
    print(f"   Evaluation set: {test_data.count():,} rows")
    print("Starting distributed training ...")
    
    start_time = time.time()
    model = pipeline.fit(train_data)
    training_time = time.time() - start_time
    
    print(f"Training completed in {training_time:.2f} seconds")
    print("Evaluating model...")
    
    predictions = model.transform(test_data)
    evaluator_auc = BinaryClassificationEvaluator(labelCol="TARGET", metricName="areaUnderROC")
    evaluator_acc = MulticlassClassificationEvaluator(labelCol="TARGET", predictionCol="prediction", metricName="accuracy")
    auc = evaluator_auc.evaluate(predictions)
    accuracy = evaluator_acc.evaluate(predictions)
    
    print(f"Model Performance:")
    print(f"   AUC: {auc:.4f}")
    print(f"   Accuracy: {accuracy:.4f}")
    print(f"   Training time: {training_time:.2f}s")
    print(f"   Trees/second: {optimal_num_trees/training_time:.2f}")
    
    # Clean up cached data
    train_data.unpersist()
    test_data.unpersist()
    
    return model, {
        'auc': auc, 
        'accuracy': accuracy, 
        'training_time': training_time, 
        'num_trees': optimal_num_trees,
        'trees_per_second': optimal_num_trees/training_time
    }

In [25]:
%%time
model_class, metrics_class = train_classification_model(df, dynamic_spark_config)

Optimal Random Forest Configuration:
   Trees: 50
   Max depth: 6
   Max bins: 32
   Min instances: 20
   Feature strategy: sqrt


                                                                                

   Training set: 559,974 rows


                                                                                

   Evaluation set: 139,670 rows
Starting distributed training ...


                                                                                

Training completed in 17.99 seconds
Evaluating model...




Model Performance:
   AUC: 0.8904
   Accuracy: 0.8085
   Training time: 17.99s
   Trees/second: 2.78
CPU times: user 595 ms, sys: 197 ms, total: 791 ms
Wall time: 38.7 s


                                                                                

In [27]:
def print_performance_summary(spark_config, metrics):
    """
    Print comprehensive performance summary comparing to baseline
    """
    print("=" * 80)
    print("RAYDP PERFORMANCE SUMMARY")
    print("=" * 80)
    
    print(f"Cluster Utilization:")
    print(f"   CPU utilization: {spark_config['cpu_utilization_pct']:.1f}%")
    print(f"   Memory utilization: {spark_config['memory_utilization_pct']:.1f}%")
    print(f"   Executors: {spark_config['num_executors']} ({spark_config['num_executors']//4} per node)")
    print(f"   Total cores: {spark_config['total_executor_cores']}")
    print(f"   Total memory: {spark_config['total_executor_memory_gb']}GB")
    
    print(f"\nTraining Performance:")
    print(f"   Training time: {metrics['training_time']:.2f}s")
    print(f"   Trees trained: {metrics['num_trees']}")
    print(f"   Trees/second: {metrics['trees_per_second']:.2f}")
    print(f"   Model accuracy: {metrics['accuracy']:.4f}")
    print(f"   Model AUC: {metrics['auc']:.4f}")
    
    print(f"\nKey Optimizations Applied:")
    print(f"High parallelism executor strategy (2 per node)")
    print(f"Optimized executor sizing (6-8 cores each)")
    print(f"Aggressive memory utilization (90%+)")
    print(f"MLlib-specific Spark configurations")
    print(f"Dynamic Random Forest parameter scaling")
    print(f"Advanced compression and serialization")
    print(f"Network and shuffle optimizations")
    print("=" * 80)

# Call the performance summary
print_performance_summary(dynamic_spark_config, metrics_class)


RAYDP PERFORMANCE SUMMARY
Cluster Utilization:
   CPU utilization: 57.1%
   Memory utilization: 85.5%
   Executors: 8 (2 per node)
   Total cores: 64
   Total memory: 768GB

Training Performance:
   Training time: 17.99s
   Trees trained: 50
   Trees/second: 2.78
   Model accuracy: 0.8085
   Model AUC: 0.8904

Key Optimizations Applied:
High parallelism executor strategy (2 per node)
Optimized executor sizing (6-8 cores each)
Aggressive memory utilization (90%+)
MLlib-specific Spark configurations
Dynamic Random Forest parameter scaling
Advanced compression and serialization
Network and shuffle optimizations


### Clean up

In [28]:
spark.stop()

In [29]:
cli.disconnect()

In [30]:
session.close()