# Testing Ray Setup for Model Training

In [1]:
from snowflakeray.cluster_init import SnowflakeRay
from snowflake.snowpark import Session
import json
from pprint import pprint

### Setup Snowpark Session

In [2]:
# Read the Snowflake connection settings from a local JSON file so that no
# secrets are embedded in the notebook itself.
with open('creds.json') as f:
    data = json.load(f)

# Unpack each setting into its own name; `warehouse` is reused further down
# as the query warehouse for the Ray cluster.
username = data['username']
password = data['password']
account = data["account"]
warehouse = data["warehouse"]
database = data["database"]
schema = data["schema"]
role = data["role"]

# Options handed to the Snowpark session builder. The session parameter asks
# the Python connector to return query results in JSON format.
CONNECTION_PARAMETERS = dict(
    account=account,
    user=username,
    password=password,
    schema=schema,
    database=database,
    warehouse=warehouse,
    role=role,
    session_parameters={"PYTHON_CONNECTOR_QUERY_RESULT_FORMAT": "json"},
)

In [3]:
# Configure the builder with the connection options, then open the session
# that every later cell uses to talk to Snowflake.
session_builder = Session.builder.configs(CONNECTION_PARAMETERS)
session = session_builder.create()

 * To change owner, run `chown $USER "/Users/plakhanpal/Library/Application Support/snowflake/config.toml"`.
 * To restrict permissions, run `chmod 0600 "/Users/plakhanpal/Library/Application Support/snowflake/config.toml"`.

  warn(f"Bad owner or permissions on {str(filep)}{chmod_message}")


In [4]:
# Sanity check: display the warehouse the session is currently using.
session.get_current_warehouse()

'"RAY_WH"'

In [5]:
# Sanity check: display the role the session is currently using.
session.get_current_role()

'"RAY_ROLE"'

In [8]:
# Project label passed to SnowflakeRay below.
# NOTE(review): the captured logs suggest it is embedded (spaces removed) in the
# generated image repository and service names — confirm against snowflakeray.
project_name = "xgboost model training"

### Bring your own compute pools for XGBoost model training

In [9]:
# Both pools share the same workload suffix; only the HEAD/WORKER part differs.
_POOL_SUFFIX = "XGB"
head_compute_pool_name = f"RAY_HEAD_CP_{_POOL_SUFFIX}"
worker_compute_pool_name = f"RAY_WORKER_CP_{_POOL_SUFFIX}"

In [10]:
# DDL for the head compute pool: a fixed single node (min = max = 1) of the
# GPU_NV_S instance family that auto-resumes and auto-suspends after 3600 seconds.
# `if not exists` lets the statement be re-run without failing on an existing pool.
_HEAD_POOL_DDL = """
    create compute pool if not exists {pool_name}
        min_nodes = 1
        max_nodes = 1
        instance_family = GPU_NV_S
        auto_resume = TRUE
        AUTO_SUSPEND_SECS = 3600;
"""
ray_head_cp_sql = _HEAD_POOL_DDL.format(pool_name=head_compute_pool_name)

In [11]:
# DDL for the worker compute pool: a fixed four nodes (min = max = 4) of the
# GPU_NV_S instance family that auto-resumes and auto-suspends after 3600 seconds.
# `if not exists` lets the statement be re-run without failing on an existing pool.
_WORKER_POOL_DDL = """
    create compute pool if not exists {pool_name}
        min_nodes = 4
        max_nodes = 4
        instance_family = GPU_NV_S
        auto_resume = TRUE
        AUTO_SUSPEND_SECS = 3600;
"""
ray_worker_cp_sql = _WORKER_POOL_DDL.format(pool_name=worker_compute_pool_name)

In [12]:
# Create the head compute pool. Per the returned status message, the pool
# cannot host a service until it reaches the ACTIVE or IDLE state.
session.sql(ray_head_cp_sql).collect()

[Row(status="Compute Pool RAY_HEAD_CP_XGB successfully created. Please run 'DESCRIBE COMPUTE POOL RAY_HEAD_CP_XGB' to check the compute pool state. NOTE the compute pool is not ready to deploy a service or job before reaching ACTIVE or IDLE state.")]

In [13]:
# Create the worker compute pool. Per the returned status message, the pool
# cannot host a service until it reaches the ACTIVE or IDLE state.
session.sql(ray_worker_cp_sql).collect()

[Row(status="Compute Pool RAY_WORKER_CP_XGB successfully created. Please run 'DESCRIBE COMPUTE POOL RAY_WORKER_CP_XGB' to check the compute pool state. NOTE the compute pool is not ready to deploy a service or job before reaching ACTIVE or IDLE state.")]

### Instantiate the SnowflakeRay object

In [14]:
# Bind the Ray deployment helper to this session, project and the two
# compute pools created above.
snowflake_ray = SnowflakeRay(
    session=session,
    project_name=project_name,
    head_compute_pool_name=head_compute_pool_name,
    worker_compute_pool_name=worker_compute_pool_name,
)

##### Note: if your account is enrolled in the Block Storage private preview (PrPr), add the parameter `need_block_storage_for_ray_logs=True` to the `setup_ray_cluster` call below

In [15]:
# All Ray extras install the same `ray` distribution, so they must be pinned to
# one release. The serve extra was previously pinned to 2.9.3 while every other
# extra used 2.10.0, which is a contradictory requirement set. Keep the version
# in one place so the pins cannot drift apart again.
RAY_VERSION = "2.10.0"
RAY_EXTRAS = ["data", "client", "default", "serve", "tune"]

# Build the images and deploy the Ray head/worker services to Snowpark
# Container Services. The returned value is displayed in the next cell as a
# list of {endpoint name: public host} mappings.
endpoints = snowflake_ray.setup_ray_cluster(
    stage_name_for_specs="RAY_SPECS",
    stage_name_for_artifacts="ARTIFACTS",
    external_access_integrations=["ALLOW_ALL_EAI"],
    query_warehouse=warehouse,
    ray_requirements=[f"ray[{extra}]=={RAY_VERSION}" for extra in RAY_EXTRAS],
    pip_requirements=[
        "jupyterlab",
        "py-spy",
        "ipywidgets",
        "virtualenv",
        "pandas==1.5.3",
        "snowflake-snowpark-python[pandas]",
        "scikit-learn",
        "torch==2.1.2",
        "xgboost",
    ],
)

ray_head_precreated_compute_pool: True
ray_worker_precreated_compute_pool: True


INFO:snowflakeray.deploy_client.utils.cluster_init_helper:Created image repo: sfsenorthamerica-demo274-awseast.registry.snowflakecomputing.com/ray_db/ray_schema/spcs_ray_image_repoxgboostmodeltraining
INFO:snowflakeray.deploy_client.utils.cluster_init_helper:Building the Docker image and deploying to Snowpark Container Service. 
INFO:snowflakeray.deploy_client.image_builds.client_image_builder:Client:

INFO:snowflakeray.deploy_client.image_builds.client_image_builder: Version:    25.0.3

INFO:snowflakeray.deploy_client.image_builds.client_image_builder: Context:    desktop-linux

INFO:snowflakeray.deploy_client.image_builds.client_image_builder: Debug Mode: false

INFO:snowflakeray.deploy_client.image_builds.client_image_builder: Plugins:

INFO:snowflakeray.deploy_client.image_builds.client_image_builder:  buildx: Docker Buildx (Docker Inc.)

INFO:snowflakeray.deploy_client.image_builds.client_image_builder:    Version:  v0.12.1-desktop.4

INFO:snowflakeray.deploy_client.image_builds.c

In [16]:
# Display the public endpoints returned by setup_ray_cluster.
endpoints

[{'api': 'nlwne-sfsenorthamerica-demo274-awseast.snowflakecomputing.app'},
 {'notebook': 'nlwni-sfsenorthamerica-demo274-awseast.snowflakecomputing.app'},
 {'ray-client-server-port': 'nlwnm-sfsenorthamerica-demo274-awseast.snowflakecomputing.app'},
 {'prometheus': 'nlwnq-sfsenorthamerica-demo274-awseast.snowflakecomputing.app'},
 {'grafana': 'nlwnu-sfsenorthamerica-demo274-awseast.snowflakecomputing.app'},
 {'ray-dashboard': 'nlwny-sfsenorthamerica-demo274-awseast.snowflakecomputing.app'}]

### Get service endpoints independently of the command above (optional)

In [11]:
# Look up the public endpoints again without re-running the cluster setup.
pprint(snowflake_ray.get_public_endpoints())

[{'api': 'biaoesz-sfsenorthamerica-demo391.snowflakecomputing.app'},
 {'notebook': 'biaoes5-sfsenorthamerica-demo391.snowflakecomputing.app'},
 {'ray-client-server-port': 'biaoetb-sfsenorthamerica-demo391.snowflakecomputing.app'},
 {'prometheus': 'biaoetf-sfsenorthamerica-demo391.snowflakecomputing.app'},
 {'grafana': 'biaoetj-sfsenorthamerica-demo391.snowflakecomputing.app'},
 {'ray-dashboard': 'biaoetn-sfsenorthamerica-demo391.snowflakecomputing.app'}]


### Get ray head service status

In [12]:
# Pretty-print the per-container status of the head service
# (the captured output lists head, prometheus and grafana containers).
pprint(snowflake_ray.get_ray_head_service_status())

[{'containerName': 'head',
  'image': 'sfsenorthamerica-demo391.registry.snowflakecomputing.com/ray_db/ray_schema/spcs_ray_image_repoxgboosthyperparametertuning/ray_head:a353d2b4c20f0377d45d19e405593c91c0d82ad9',
  'instanceId': '0',
  'message': 'Running',
  'restartCount': 0,
  'serviceName': 'SPCSRAYHEADSERVICEXGBOOSTHYPERPARAMETERTUNING',
  'startTime': '2024-05-08T05:56:46Z',
  'status': 'READY'},
 {'containerName': 'prometheus',
  'image': 'sfsenorthamerica-demo391.registry.snowflakecomputing.com/ray_db/ray_schema/spcs_ray_image_repoxgboosthyperparametertuning/ray_prometheus:6d19737014a9d54cd6894cd4f9ac23356dd3f0b1',
  'instanceId': '0',
  'message': 'Running',
  'restartCount': 0,
  'serviceName': 'SPCSRAYHEADSERVICEXGBOOSTHYPERPARAMETERTUNING',
  'startTime': '2024-05-08T05:58:26Z',
  'status': 'READY'},
 {'containerName': 'grafana',
  'image': 'sfsenorthamerica-demo391.registry.snowflakecomputing.com/ray_db/ray_schema/spcs_ray_image_repoxgboosthyperparametertuning/ray_grafana:

### Get ray head service logs

In [13]:
# Pretty-print the head service logs. The output can be very long; consider
# clearing it before sharing the notebook.
pprint(snowflake_ray.get_ray_head_logs())

("[{'SYSTEM$GET_SERVICE_LOGS': '    Traceback (most recent call last):\\n      "
 'File '
 '"/usr/local/lib/python3.8/dist-packages/jupyter_server/services/contents/handlers.py", '
 'line 154, in get\\n        model = await ensure_async(\\n      File '
 '"/usr/local/lib/python3.8/dist-packages/jupyter_core/utils/__init__.py", '
 'line 198, in ensure_async\\n        result = await obj\\n      File '
 '"/usr/local/lib/python3.8/dist-packages/jupyter_server/services/contents/filemanager.py", '
 'line 907, in get\\n        raise web.HTTPError(404, "No such file or '
 'directory: %s" % path)\\n    tornado.web.HTTPError: HTTP 404: Not Found (No '
 'such file or directory: .src/learner.cc)\\n    \\n    During handling of the '
 'above exception, another exception occurred:\\n    \\n    Traceback (most '
 'recent call last):\\n      File '
 '"/usr/local/lib/python3.8/dist-packages/tornado/web.py", line 1790, in '
 '_execute\\n        result = await result\\n      File '
 '"/usr/local/lib/pytho

### Get ray worker service status

In [14]:
# Pretty-print the status of each worker service instance.
pprint(snowflake_ray.get_ray_worker_service_status())

[{'containerName': 'worker',
  'image': 'sfsenorthamerica-demo391.registry.snowflakecomputing.com/ray_db/ray_schema/spcs_ray_image_repoxgboosthyperparametertuning/ray_worker:c0eb692746fa116491547d0e8758d380993242a8',
  'instanceId': '0',
  'message': 'Running',
  'restartCount': 0,
  'serviceName': 'SPCSRAYWORKERSERVICEXGBOOSTHYPERPARAMETERTUNING',
  'startTime': '2024-05-08T05:56:28Z',
  'status': 'READY'},
 {'containerName': 'worker',
  'image': 'sfsenorthamerica-demo391.registry.snowflakecomputing.com/ray_db/ray_schema/spcs_ray_image_repoxgboosthyperparametertuning/ray_worker:c0eb692746fa116491547d0e8758d380993242a8',
  'instanceId': '1',
  'message': 'Running',
  'restartCount': 0,
  'serviceName': 'SPCSRAYWORKERSERVICEXGBOOSTHYPERPARAMETERTUNING',
  'startTime': '2024-05-08T05:56:30Z',
  'status': 'READY'},
 {'containerName': 'worker',
  'image': 'sfsenorthamerica-demo391.registry.snowflakecomputing.com/ray_db/ray_schema/spcs_ray_image_repoxgboosthyperparametertuning/ray_worker:c0

### Get ray worker logs

In [15]:
# Pretty-print the worker service logs. The output can be very long; consider
# clearing it before sharing the notebook.
pprint(snowflake_ray.get_ray_worker_logs())

("[{'SYSTEM$GET_SERVICE_LOGS': '+ WORKLOAD=rayworker\\n++ ifconfig eth0\\n++ "
 "sed -En -e \\'s/.*inet ([0-9.]+).*/\\\\1/p\\'\\n+ eth0Ip=10.244.13.10\\n+ "
 "echo \\'WORKLOAD: rayworker\\'\\nWORKLOAD: rayworker\\n+ \\'[\\' rayworker "
 "== rayhead \\']\\'\\n+ \\'[\\' rayworker == rayworker \\']\\'\\n+ \\'[\\' -z "
 "SPCSRAYHEADSERVICEXGBOOSTHYPERPARAMETERTUNING:6379 \\']\\'\\n+ export "
 'RAY_ENABLE_RECORD_ACTOR_TASK_LOGGING=1\\n+ '
 'RAY_ENABLE_RECORD_ACTOR_TASK_LOGGING=1\\n+ export '
 'RAY_BACKEND_LOG_LEVEL=debug\\n+ RAY_BACKEND_LOG_LEVEL=debug\\n+ export '
 'HOST_IP=10.244.13.10\\n+ HOST_IP=10.244.13.10\\n+ export NCCL_DEBUG=INFO\\n+ '
 'NCCL_DEBUG=INFO\\n+ export NCCL_SOCKET_IFNAME=eth0\\n+ '
 'NCCL_SOCKET_IFNAME=eth0\\n+ ray start --node-ip-address=10.244.13.10 '
 '--disable-usage-stats '
 '--address=SPCSRAYHEADSERVICEXGBOOSTHYPERPARAMETERTUNING:6379 '
 '\\\'--resources={"custom_llm_serving_label": 1}\\\' '
 '--object-manager-port=8076 --node-manager-port=8077 '
 '--runtime-env-a

### Delete all services

In [None]:
# NOTE(review): presumably kept commented out so that "Run All" does not tear
# down the running services by accident — confirm. Uncomment to delete the Ray
# head and worker services.
#snowflake_ray.delete_all_services()

INFO:snowflakeray.deploy_client.utils.cluster_init_helper:Deleted service: SPCSRAYHEADSERVICELLMSERVING
INFO:snowflakeray.deploy_client.utils.cluster_init_helper:Deleted service: SPCSRAYWORKERSERVICELLMSERVING


### Suspend all compute pools

In [24]:
# Suspend the compute pools.
# NOTE(review): the captured log output suggests this call also deletes the
# running head/worker services before suspending the pools — confirm.
snowflake_ray.suspend_all_compute_pools()

INFO:snowflakeray.deploy_client.utils.cluster_init_helper:Deleted service: SPCSRAYHEADSERVICELLMSERVINGFORSUMMIT
INFO:snowflakeray.deploy_client.utils.cluster_init_helper:Deleted service: SPCSRAYWORKERSERVICELLMSERVINGFORSUMMIT
INFO:snowflakeray.deploy_client.utils.cluster_init_helper:Suspended compute pool: RAY_HEAD_CP_LLM_SERVING_TEST
INFO:snowflakeray.deploy_client.utils.cluster_init_helper:Suspended compute pool: RAY_WORKER_CP_LLM_SERVING_TEST


### Delete all compute pools

In [34]:
# Delete the head and worker compute pools. They must be recreated (see the
# compute pool cells above) before the cluster can be set up again.
snowflake_ray.delete_all_compute_pools()

INFO:snowflakeray.deploy_client.utils.cluster_init_helper:Deleted compute pool: RAY_HEAD_CP_XGB
INFO:snowflakeray.deploy_client.utils.cluster_init_helper:Deleted compute pool: RAY_WORKER_CP_XGB


### Close the Snowpark session

In [12]:
# Close the Snowpark session; cells that use `session` will fail after this
# until the session is created again.
session.close()