# RayDP - Distributed Spark on Ray in Snowpark Container Services

This notebook demonstrates how to use RayDP to run distributed Spark workloads on a Ray cluster in Snowpark Container Services.

Based on GCP Vertex AI documentation: https://cloud.google.com/vertex-ai/docs/open-source/ray-on-vertex-ai/run-spark-on-ray


## Setup and Imports

In [1]:
import ray
import raydp
import pandas as pd
import numpy as np
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, count, avg, sum as spark_sum
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
import os

# Report the Ray and RayDP versions this notebook is running against.
print(
    f"Ray version: {ray.__version__}",
    f"RayDP version: {raydp.__version__}",
    sep="\n",
)

  from .autonotebook import tqdm as notebook_tqdm
2025-06-27 23:19:43,224	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.
2025-06-27 23:19:43,798	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


Ray version: 2.42.0
RayDP version: 1.6.2


In [2]:
# Attach to the Ray cluster at address "auto"; ignore_reinit_error keeps a
# re-run of this cell from failing when Ray is already initialized.
ray.init(ignore_reinit_error=True, address="auto")

print("Ray cluster resources: {}".format(ray.cluster_resources()))

2025-06-27 23:19:43,986	INFO worker.py:1654 -- Connecting to existing Ray cluster at address: 10.244.178.11:6379...
2025-06-27 23:19:43,998	INFO worker.py:1832 -- Connected to Ray cluster. View the dashboard at [1m[32m10.244.178.11:8265 [39m[22m


Ray cluster resources: {'CPU': 24.0, 'node:10.244.181.11': 1.0, 'object_store_memory': 34548841266.0, 'GPU': 4.0, 'memory': 77756481128.0, 'accelerator_type:A10G': 4.0, 'node:10.244.180.11': 1.0, 'node:10.244.178.11': 1.0, 'node:__internal_head__': 1.0, 'node:10.244.179.11': 1.0}


[2025-06-27 23:19:44,000 I 2049 2049] logging.cc:293: Set ray log level from environment variable RAY_BACKEND_LOG_LEVEL to -1
[33m(raylet, ip=10.244.179.11)[0m [2025-06-27 23:19:45,494 I 646 646] logging.cc:293: Set ray log level from environment variable RAY_BACKEND_LOG_LEVEL to -1
[36m(RayDPSparkMaster pid=1430)[0m [2025-06-27 23:19:48,555 I 2123 2151] gcs_client.cc:98: GcsClient has no Cluster ID set, and won't fetch from GCS.
[36m(RayDPSparkMaster pid=1430)[0m [2025-06-27 23:19:48,683 I 2123 2151] gcs_client.cc:98: GcsClient has no Cluster ID set, and won't fetch from GCS.
[36m(SparkExecutor pid=646, ip=10.244.179.11)[0m Setting default log level to "WARN".
[36m(SparkExecutor pid=646, ip=10.244.179.11)[0m To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
[33m(raylet, ip=10.244.180.11)[0m [2025-06-27 23:19:50,253 I 408 412] logging.cc:293: Set ray log level from environment variable RAY_BACKEND_LOG_LEVEL to -1[32m [repeated 2x 

## RayDP with Ray client

In [3]:
@ray.remote
class SparkExecutor:
    """Ray actor that creates and owns a Spark session through RayDP.

    The session is started in ``__init__`` (when the actor is created) and
    released by ``stop_spark``. ``get_data`` runs a small demo query on it.
    """

    # Imported in the class body so the annotation below can be evaluated;
    # this cell only has `SparkSession` (not the `pyspark` module) in scope.
    import pyspark

    # Active Spark session, or None when no session is running.
    spark: pyspark.sql.SparkSession = None

    def __init__(self):
        """Start the Spark session and print its version, application ID and UI URL."""
        # Imported locally so the import is resolved where the actor runs.
        import raydp

        self.spark = raydp.init_spark(
            app_name="RayDP_Distributed_Spark_SPCS",
            num_executors=2,
            executor_cores=2,
            executor_memory="2G",
            configs={
                "spark.driver.memory": "4G",  # Set driver memory here instead
                "spark.sql.adaptive.enabled": "true",
                "spark.sql.adaptive.coalescePartitions.enabled": "true",
                "spark.sql.execution.arrow.pyspark.enabled": "true",
            },
        )
        print(f"Spark version: {self.spark.version}")
        print(f"Spark application ID: {self.spark.sparkContext.applicationId}")
        print(f"Spark UI URL: {self.spark.sparkContext.uiWebUrl}")

    def get_data(self):
        """Build a four-row (first_name, age) DataFrame and return it as JSON strings.

        Returns:
            list[str]: one JSON document per row, collected from the session.

        Raises:
            RuntimeError: if the session has already been released by
                ``stop_spark``.
        """
        if self.spark is None:
            raise RuntimeError(
                "Spark session is not running; it was released by stop_spark()."
            )

        df = self.spark.createDataFrame(
            [
                ("sue", 32),
                ("li", 3),
                ("bob", 75),
                ("heo", 13),
            ],
            ["first_name", "age"],
        )
        return df.toJSON().collect()

    def stop_spark(self):
        """Stop the RayDP Spark session and drop the actor's reference to it."""
        import raydp

        raydp.stop_spark()
        # Do not keep a handle to a stopped session; get_data() checks for None.
        self.spark = None

['{"first_name":"sue","age":32}', '{"first_name":"li","age":3}', '{"first_name":"bob","age":75}', '{"first_name":"heo","age":13}']


In [None]:
# Create the actor (this starts its Spark session), fetch the demo rows and
# print them.
s = SparkExecutor.remote()
try:
    data = ray.get(s.get_data.remote())
    print(data)
finally:
    # Always release the Spark session, even when the query above fails;
    # otherwise the session keeps running after the cell errors out.
    ray.get(s.stop_spark.remote())

## RayDP with Ray Job API

In [4]:
import pyspark
import raydp

def get_data(spark: pyspark.sql.SparkSession):
    """Return a small demo table as a list of JSON strings.

    Creates a four-row DataFrame with columns ``first_name`` and ``age`` on
    the given session, converts it with ``toJSON()`` and collects the result.
    """
    column_names = ["first_name", "age"]
    people = [("sue", 32), ("li", 3), ("bob", 75), ("heo", 13)]

    people_df = spark.createDataFrame(people, column_names)
    json_rows = people_df.toJSON()
    return json_rows.collect()

def stop_spark():
    """Stop the RayDP Spark session by delegating to ``raydp.stop_spark()``."""
    raydp.stop_spark()

# Start a small Spark-on-Ray session (one executor, one core) for this example.
spark = raydp.init_spark(
    app_name="RAYDP JOB EXAMPLE",
    num_executors=1,
    executor_cores=1,
    executor_memory="500M",
)
try:
    print(f"Spark version: {spark.version}")
    print(f"Spark application ID: {spark.sparkContext.applicationId}")
    print(f"Spark UI URL: {spark.sparkContext.uiWebUrl}")
    print(get_data(spark))
finally:
    # Release the session even if the query fails, so it is not left running
    # when a later cell calls raydp.init_spark() again.
    stop_spark()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/06/27 23:21:34 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
                                                                                

['{"first_name":"sue","age":32}', '{"first_name":"li","age":3}', '{"first_name":"bob","age":75}', '{"first_name":"heo","age":13}']


## PySpark Pandas UDF on a Ray cluster in Snowpark Container Services

In [9]:
import pandas as pd
import pyspark
import raydp
from pyspark.sql.functions import pandas_udf
from pyspark.sql.types import StringType

In [10]:
def test_udf(
    spark: pyspark.sql.SparkSession,
    csv_url: str = "https://www.datavis.ca/gallery/guerry/guerry.csv",
):
    """Run the ``func`` pandas UDF over a CSV dataset and collect the result.

    The CSV is read with pandas on the driver, converted to a Spark DataFrame,
    and ``func`` is applied to its ``Lottery``, ``Literacy`` and ``Pop1831``
    columns. ``func`` is looked up at call time, so it must be defined before
    this function is called.

    Args:
        spark: Spark session used to create the DataFrame and run the UDF.
        csv_url: Location of the CSV passed to ``pd.read_csv``. Defaults to
            the Guerry dataset URL used by this example; the file must
            provide the three columns named above.

    Returns:
        The rows returned by ``collect()`` on the selected UDF column.
    """
    guerry_pdf = pd.read_csv(csv_url)
    df = spark.createDataFrame(guerry_pdf)
    return df.select(func('Lottery', 'Literacy', 'Pop1831')).collect()

In [11]:
@pandas_udf(StringType())
def func(s1: pd.Series, s2: pd.Series, s3: pd.Series) -> str:
    """Fit ``Lottery ~ Literacy + log(Pop1831)`` by OLS and return the summary as CSV text.

    NOTE(review): with the scalar ``str`` return hint this appears to act as a
    Series-to-scalar (aggregating) pandas UDF - the recorded output shows a
    single row for the whole dataset. Confirm against the PySpark docs.

    Args:
        s1: Values used as the ``Lottery`` (dependent) variable.
        s2: Values used as the ``Literacy`` regressor.
        s3: Values used as ``Pop1831``; its natural log enters the model.

    Returns:
        The statsmodels regression summary rendered with ``as_csv()``.
    """
    import importlib
    import importlib.util
    import subprocess
    import sys

    # Keep this import local: the formula string below refers to `np.log`,
    # so the name `np` has to be resolvable from this function's scope.
    import numpy as np  # noqa: F401

    # Install statsmodels only when it is missing, instead of spawning pip on
    # every invocation of the UDF.
    # NOTE(review): the package version is not pinned - consider pinning it.
    if importlib.util.find_spec("statsmodels") is None:
        subprocess.check_call([sys.executable, "-m", "pip", "install", "statsmodels"])
        # Make the freshly installed package visible to the import system.
        importlib.invalidate_caches()

    import statsmodels.formula.api as smf

    data = pd.DataFrame(
        {
            'Lottery': s1,
            'Literacy': s2,
            'Pop1831': s3,
        }
    )

    # Fit regression model (using the natural log of one of the regressors)
    results = smf.ols('Lottery ~ Literacy + np.log(Pop1831)', data=data).fit()
    return results.summary().as_csv()

In [12]:
# Start the Spark-on-Ray session used by the pandas UDF example.
spark = raydp.init_spark(
    executor_memory="1500M",
    executor_cores=4,
    num_executors=2,
    app_name="RayDP UDF Example",
)
print(
    f"Spark version: {spark.version}",
    f"Spark application ID: {spark.sparkContext.applicationId}",
    f"Spark UI URL: {spark.sparkContext.uiWebUrl}",
    sep="\n",
)

[Stage 2:>                                                          (0 + 1) / 1]

[Row(func(Lottery, Literacy, Pop1831)='                      OLS Regression Results                     \nDep. Variable:   ,Lottery         ,  R-squared:         ,   0.348\nModel:           ,OLS             ,  Adj. R-squared:    ,   0.333\nMethod:          ,Least Squares   ,  F-statistic:       ,   22.20\nDate:            ,Fri, 27 Jun 2025,  Prob (F-statistic):,1.90e-08\nTime:            ,23:25:14        ,  Log-Likelihood:    , -379.82\nNo. Observations:,    86          ,  AIC:               ,   765.6\nDf Residuals:    ,    83          ,  BIC:               ,   773.0\nDf Model:        ,     2          ,                     ,        \nCovariance Type: ,nonrobust       ,                     ,        \n               ,   coef   , std err ,    t    ,P>|t| ,  [0.025 ,  0.975] \nIntercept      ,  246.4341,   35.233,    6.995, 0.000,  176.358,  316.510\nLiteracy       ,   -0.4889,    0.128,   -3.832, 0.000,   -0.743,   -0.235\nnp.log(Pop1831),  -31.3114,    5.977,   -5.239, 0.000,  -43.199,  

                                                                                

In [None]:
# Run the pandas UDF example on the session created above; as the cell's last
# expression, the collected rows are shown as the cell output.
# NOTE(review): no visible cell stops this Spark session afterwards - consider
# calling raydp.stop_spark() once the result has been inspected.
test_udf(spark)