Commit 43ede2c

Updated instructions for running with IBM Cloud Code Engine.

1 parent ff2abc6, commit 43ede2c

4 files changed (+188, -8 lines)

deploy/ibm_cloud_code_engine/README.md
Lines changed: 2 additions & 1 deletion

````diff
@@ -51,7 +51,8 @@ export NAMESPACE=<namespace from above>
 
 Update With the following command you can download a basic Ray cluster definition and customize it for your namespace:
 ```shell
-sed "s/NAMESPACE/$NAMESPACE/" > ./example-cluster.yaml
+cd ./deploy/ibm_cloud_code_engine/
+sed "s/NAMESPACE/$NAMESPACE/" ./example-cluster.yaml.template > ./example-cluster.yaml
 ```
 
 This reference deployment file will create a Ray cluster with following characteristics:
````
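For context, the render-and-deploy flow implied by this hunk might look like the sketch below. It is illustrative only: the `export NAMESPACE=...` line mirrors the surrounding README (visible in the hunk header), while the final `ray up` step is an assumption about how the generated file is consumed and is not part of this diff.

```shell
# Illustrative sketch, not part of the commit. NAMESPACE must hold the
# Kubernetes namespace of your Code Engine project (see the README above).
export NAMESPACE=<namespace from above>

# Render the template into a concrete cluster definition.
cd ./deploy/ibm_cloud_code_engine/
sed "s/NAMESPACE/$NAMESPACE/" ./example-cluster.yaml.template > ./example-cluster.yaml

# Assumed deploy step (not shown in this diff): start the cluster with the
# Ray cluster launcher.
ray up ./example-cluster.yaml
```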

deploy/ibm_cloud_code_engine/example-cluster.yaml renamed to deploy/ibm_cloud_code_engine/example-cluster.yaml.template
Lines changed: 6 additions & 6 deletions

```diff
@@ -26,7 +26,7 @@ provider:
   use_internal_ips: true
 
   # Namespace to use for all resources created.
-  namespace:
+  namespace: NAMESPACE
 
   services:
     # Service that maps to the head node of the Ray cluster.
@@ -89,7 +89,7 @@ available_node_types:
             resources:
               requests:
                 cpu: 1
-                memory: 2G
+                memory: 1Gi
               limits:
                 # The maximum memory that this pod is allowed to use. The
                 # limit will be detected by ray and split to use 10% for
@@ -99,7 +99,7 @@ available_node_types:
                 # allocate a very large object store in each pod that may
                 # cause problems for other pods.
                 cpu: 1
-                memory: 2G
+                memory: 1Gi
   head_node:
     # The minimum number of worker nodes of this type to launch.
     # This number should be >= 0.
@@ -120,7 +120,7 @@ available_node_types:
       spec:
         # Change this if you altered the autoscaler_service_account above
         # or want to provide your own.
-        serviceAccountName: ap9fjwkf04j-writer
+        serviceAccountName: NAMESPACE-writer
 
         restartPolicy: Never
 
@@ -153,7 +153,7 @@ available_node_types:
            resources:
              requests:
                cpu: 1
-               memory: 2G
+               memory: 1Gi
              limits:
                # The maximum memory that this pod is allowed to use. The
                # limit will be detected by ray and split to use 10% for
@@ -163,7 +163,7 @@ available_node_types:
                # allocate a very large object store in each pod that may
                # cause problems for other pods.
                cpu: 1
-               memory: 2G
+               memory: 1Gi
 
 
 # Command to start ray on the head node. You don't need to change this.
```
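Two things change across these hunks: the hard-coded namespace and service account (`ap9fjwkf04j-writer`) become `NAMESPACE` placeholders for the `sed` substitution above, and the memory requests and limits drop from `2G` (decimal, 10^9-byte units) to `1Gi` (binary, 2^30-byte units). A quick optional sanity check after rendering, purely illustrative and not part of the commit, is to confirm no literal placeholder survives:

```shell
# Optional check (illustrative): the rendered file should contain no literal
# NAMESPACE placeholder once sed has substituted the real namespace.
if grep -n "NAMESPACE" ./example-cluster.yaml; then
  echo "placeholder still present, re-run the sed command" >&2
else
  echo "template fully rendered"
fi
```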

docs/source/getting_started/starting.md
Lines changed: 2 additions & 1 deletion

````diff
@@ -71,7 +71,8 @@ export NAMESPACE=<namespace from above>
 
 Update With the following command you can download a basic Ray cluster definition and customize it for your namespace:
 ```shell
-sed "s/NAMESPACE/$NAMESPACE/" > ./example-cluster.yaml
+cd ./deploy/ibm_cloud_code_engine/
+sed "s/NAMESPACE/$NAMESPACE/" ./example-cluster.yaml.template > ./example-cluster.yaml
 ```
 
 This reference deployment file will create a Ray cluster with following characteristics:
````

example-cluster.yaml.template (new file)
Lines changed: 178 additions & 0 deletions

```yaml
# A unique identifier for the head node and workers of this cluster.
cluster_name: example-cluster

# The maximum number of workers nodes to launch in addition to the head
# node.
max_workers: 10

# The autoscaler will scale up the cluster faster with higher upscaling speed.
# E.g., if the task requires adding more nodes then autoscaler will gradually
# scale up the cluster in chunks of upscaling_speed*currently_running_nodes.
# This number should be > 0.
upscaling_speed: 2

# If a node is idle for this many minutes, it will be removed.
idle_timeout_minutes: 1

# Kubernetes resources that need to be configured for the autoscaler to be
# able to manage the Ray cluster. If any of the provided resources don't
# exist, the autoscaler will attempt to create them. If this fails, you may
# not have the required permissions and will have to request them to be
# created by your cluster administrator.
provider:
  type: kubernetes

  # Exposing external IP addresses for ray pods isn't currently supported.
  use_internal_ips: true

  # Namespace to use for all resources created.
  namespace: NAMESPACE

  services:
    # Service that maps to the head node of the Ray cluster.
    - apiVersion: v1
      kind: Service
      metadata:
        # NOTE: If you're running multiple Ray clusters with services
        # on one Kubernetes cluster, they must have unique service
        # names.
        name: example-cluster-ray-head
      spec:
        # This selector must match the head node pod's selector below.
        selector:
          component: example-cluster-ray-head
        ports:
          - name: client
            protocol: TCP
            port: 10001
            targetPort: 10001
          - name: dashboard
            protocol: TCP
            port: 8265
            targetPort: 8265

# Specify the pod type for the ray head node (as configured below).
head_node_type: head_node
# Specify the allowed pod types for this ray cluster and the resources they provide.
available_node_types:
  worker_node:
    # Minimum number of Ray workers of this Pod type.
    min_workers: 0
    # Maximum number of Ray workers of this Pod type. Takes precedence over min_workers.
    max_workers: 10
    # User-specified custom resources for use by Ray. Object with string keys and integer values.
    # (Ray detects CPU and GPU from pod spec resource requests and limits, so no need to fill those here.)
    node_config:
      apiVersion: v1
      kind: Pod
      metadata:
        # Automatically generates a name for the pod with this prefix.
        generateName: example-cluster-ray-worker-
      spec:
        restartPolicy: Never
        volumes:
          - name: dshm
            emptyDir:
              medium: Memory
        containers:
          - name: ray-node
            imagePullPolicy: Always
            image: rayproject/ray:nightly
            command: ["/bin/bash", "-c", "--"]
            args: ["trap : TERM INT; sleep infinity & wait;"]
            # This volume allocates shared memory for Ray to use for its plasma
            # object store. If you do not provide this, Ray will fall back to
            # /tmp which cause slowdowns if is not a shared memory volume.
            volumeMounts:
              - mountPath: /dev/shm
                name: dshm
            resources:
              requests:
                cpu: 1
                memory: 1Gi
              limits:
                # The maximum memory that this pod is allowed to use. The
                # limit will be detected by ray and split to use 10% for
                # redis, 30% for the shared memory object store, and the
                # rest for application memory. If this limit is not set and
                # the object store size is not set manually, ray will
                # allocate a very large object store in each pod that may
                # cause problems for other pods.
                cpu: 1
                memory: 1Gi
  head_node:
    # The minimum number of worker nodes of this type to launch.
    # This number should be >= 0.
    min_workers: 0
    # The maximum number of worker nodes of this type to launch.
    # This takes precedence over min_workers.
    max_workers: 0
    node_config:
      apiVersion: v1
      kind: Pod
      metadata:
        # Automatically generates a name for the pod with this prefix.
        generateName: example-cluster-ray-head-
        # Must match the head node service selector above if a head node
        # service is required.
        labels:
          component: example-cluster-ray-head
      spec:
        # Change this if you altered the autoscaler_service_account above
        # or want to provide your own.
        serviceAccountName: NAMESPACE-writer

        restartPolicy: Never

        # This volume allocates shared memory for Ray to use for its plasma
        # object store. If you do not provide this, Ray will fall back to
        # /tmp which cause slowdowns if is not a shared memory volume.
        volumes:
          - name: dshm
            emptyDir:
              medium: Memory
        containers:
          - name: ray-node
            imagePullPolicy: Always
            image: rayproject/ray:nightly
            # Do not change this command - it keeps the pod alive until it is
            # explicitly killed.
            command: ["/bin/bash", "-c", "--"]
            args: ['trap : TERM INT; sleep infinity & wait;']
            ports:
              - containerPort: 6379 # Redis port
              - containerPort: 10001 # Used by Ray Client
              - containerPort: 8265 # Used by Ray Dashboard

            # This volume allocates shared memory for Ray to use for its plasma
            # object store. If you do not provide this, Ray will fall back to
            # /tmp which cause slowdowns if is not a shared memory volume.
            volumeMounts:
              - mountPath: /dev/shm
                name: dshm
            resources:
              requests:
                cpu: 1
                memory: 1Gi
              limits:
                # The maximum memory that this pod is allowed to use. The
                # limit will be detected by ray and split to use 10% for
                # redis, 30% for the shared memory object store, and the
                # rest for application memory. If this limit is not set and
                # the object store size is not set manually, ray will
                # allocate a very large object store in each pod that may
                # cause problems for other pods.
                cpu: 1
                memory: 1Gi


# Command to start ray on the head node. You don't need to change this.
# Note dashboard-host is set to 0.0.0.0 so that kubernetes can port forward.
head_start_ray_commands:
  - ray stop
  - ulimit -n 65536; ray start --head --autoscaling-config=~/ray_bootstrap_config.yaml --dashboard-host 0.0.0.0

# Command to start ray on worker nodes. You don't need to change this.
worker_start_ray_commands:
  - ray stop
  - ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379
```
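Because the head service only exposes internal IPs (`use_internal_ips: true`), one common way to reach the client port (10001) and dashboard (8265) defined above is `kubectl port-forward`. The sketch below is not part of this commit: it assumes your `kubectl` context already points at the Code Engine project's namespace and that port-forwarding is the access path you want.

```shell
# Illustrative sketch: forward the Ray Client and dashboard ports of the head
# service defined in example-cluster.yaml.template to localhost. Assumes
# kubectl is already configured for the Code Engine project's namespace.
kubectl -n "$NAMESPACE" port-forward svc/example-cluster-ray-head 10001:10001 8265:8265

# While the forward is running, the dashboard should be reachable at
# http://127.0.0.1:8265, and Ray Client connections can target
# ray://127.0.0.1:10001.
```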
