Commit 43ede2c

Updated instructions for running with IBM Cloud Code Engine.

1 parent ff2abc6, commit 43ede2c

4 files changed (+188, -8 lines)

deploy/ibm_cloud_code_engine/README.md
Lines changed: 2 additions & 1 deletion

````diff
@@ -51,7 +51,8 @@ export NAMESPACE=<namespace from above>
 
 Update With the following command you can download a basic Ray cluster definition and customize it for your namespace:
 ```shell
-sed "s/NAMESPACE/$NAMESPACE/" > ./example-cluster.yaml
+cd ./deploy/ibm_cloud_code_engine/
+sed "s/NAMESPACE/$NAMESPACE/" ./example-cluster.yaml.template > ./example-cluster.yaml
 ```
 
 This reference deployment file will create a Ray cluster with following characteristics:
````
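For context, the render-and-deploy flow implied by this hunk might look like the sketch below. It is illustrative only: the `export NAMESPACE=...` line mirrors the surrounding README (visible in the hunk header), while the final `ray up` step is an assumption about how the generated file is consumed and is not part of this diff.

```shell
# Illustrative sketch, not part of the commit. NAMESPACE must hold the
# Kubernetes namespace of your Code Engine project (see the README above).
export NAMESPACE=<namespace from above>

# Render the template into a concrete cluster definition.
cd ./deploy/ibm_cloud_code_engine/
sed "s/NAMESPACE/$NAMESPACE/" ./example-cluster.yaml.template > ./example-cluster.yaml

# Assumed deploy step (not shown in this diff): start the cluster with the
# Ray cluster launcher.
ray up ./example-cluster.yaml
```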

deploy/ibm_cloud_code_engine/example-cluster.yaml renamed to deploy/ibm_cloud_code_engine/example-cluster.yaml.template
Lines changed: 6 additions & 6 deletions

```diff
@@ -26,7 +26,7 @@ provider:
   use_internal_ips: true
 
   # Namespace to use for all resources created.
-  namespace:
+  namespace: NAMESPACE
 
   services:
     # Service that maps to the head node of the Ray cluster.
@@ -89,7 +89,7 @@ available_node_types:
             resources:
               requests:
                 cpu: 1
-                memory: 2G
+                memory: 1Gi
               limits:
                 # The maximum memory that this pod is allowed to use. The
                 # limit will be detected by ray and split to use 10% for
@@ -99,7 +99,7 @@ available_node_types:
                 # allocate a very large object store in each pod that may
                 # cause problems for other pods.
                 cpu: 1
-                memory: 2G
+                memory: 1Gi
   head_node:
     # The minimum number of worker nodes of this type to launch.
     # This number should be >= 0.
@@ -120,7 +120,7 @@ available_node_types:
       spec:
         # Change this if you altered the autoscaler_service_account above
         # or want to provide your own.
-        serviceAccountName: ap9fjwkf04j-writer
+        serviceAccountName: NAMESPACE-writer
 
         restartPolicy: Never
 
@@ -153,7 +153,7 @@ available_node_types:
            resources:
              requests:
                cpu: 1
-               memory: 2G
+               memory: 1Gi
              limits:
                # The maximum memory that this pod is allowed to use. The
                # limit will be detected by ray and split to use 10% for
@@ -163,7 +163,7 @@ available_node_types:
                # allocate a very large object store in each pod that may
                # cause problems for other pods.
                cpu: 1
-               memory: 2G
+               memory: 1Gi
 
 
 # Command to start ray on the head node. You don't need to change this.
```
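Two things change across these hunks: the hard-coded namespace and service account (`ap9fjwkf04j-writer`) become `NAMESPACE` placeholders for the `sed` substitution above, and the memory requests and limits drop from `2G` (decimal, 10^9-byte units) to `1Gi` (binary, 2^30-byte units). A quick optional sanity check after rendering, purely illustrative and not part of the commit, is to confirm no literal placeholder survives:

```shell
# Optional check (illustrative): the rendered file should contain no literal
# NAMESPACE placeholder once sed has substituted the real namespace.
if grep -n "NAMESPACE" ./example-cluster.yaml; then
  echo "placeholder still present, re-run the sed command" >&2
else
  echo "template fully rendered"
fi
```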

docs/source/getting_started/starting.md
Lines changed: 2 additions & 1 deletion

````diff
@@ -71,7 +71,8 @@ export NAMESPACE=<namespace from above>
 
 Update With the following command you can download a basic Ray cluster definition and customize it for your namespace:
 ```shell
-sed "s/NAMESPACE/$NAMESPACE/" > ./example-cluster.yaml
+cd ./deploy/ibm_cloud_code_engine/
+sed "s/NAMESPACE/$NAMESPACE/" ./example-cluster.yaml.template > ./example-cluster.yaml
 ```
 
 This reference deployment file will create a Ray cluster with following characteristics:
````

example-cluster.yaml.template (new file)
Lines changed: 178 additions & 0 deletions

```yaml
# A unique identifier for the head node and workers of this cluster.
cluster_name: example-cluster

# The maximum number of workers nodes to launch in addition to the head
# node.
max_workers: 10

# The autoscaler will scale up the cluster faster with higher upscaling speed.
# E.g., if the task requires adding more nodes then autoscaler will gradually
# scale up the cluster in chunks of upscaling_speed*currently_running_nodes.
# This number should be > 0.
upscaling_speed: 2

# If a node is idle for this many minutes, it will be removed.
idle_timeout_minutes: 1

# Kubernetes resources that need to be configured for the autoscaler to be
# able to manage the Ray cluster. If any of the provided resources don't
# exist, the autoscaler will attempt to create them. If this fails, you may
# not have the required permissions and will have to request them to be
# created by your cluster administrator.
provider:
  type: kubernetes

  # Exposing external IP addresses for ray pods isn't currently supported.
  use_internal_ips: true

  # Namespace to use for all resources created.
  namespace: NAMESPACE

  services:
    # Service that maps to the head node of the Ray cluster.
    - apiVersion: v1
      kind: Service
      metadata:
        # NOTE: If you're running multiple Ray clusters with services
        # on one Kubernetes cluster, they must have unique service
        # names.
        name: example-cluster-ray-head
      spec:
        # This selector must match the head node pod's selector below.
        selector:
          component: example-cluster-ray-head
        ports:
          - name: client
            protocol: TCP
            port: 10001
            targetPort: 10001
          - name: dashboard
            protocol: TCP
            port: 8265
            targetPort: 8265

# Specify the pod type for the ray head node (as configured below).
head_node_type: head_node
# Specify the allowed pod types for this ray cluster and the resources they provide.
available_node_types:
  worker_node:
    # Minimum number of Ray workers of this Pod type.
    min_workers: 0
    # Maximum number of Ray workers of this Pod type. Takes precedence over min_workers.
    max_workers: 10
    # User-specified custom resources for use by Ray. Object with string keys and integer values.
    # (Ray detects CPU and GPU from pod spec resource requests and limits, so no need to fill those here.)
    node_config:
      apiVersion: v1
      kind: Pod
      metadata:
        # Automatically generates a name for the pod with this prefix.
        generateName: example-cluster-ray-worker-
      spec:
        restartPolicy: Never
        volumes:
          - name: dshm
            emptyDir:
              medium: Memory
        containers:
          - name: ray-node
            imagePullPolicy: Always
            image: rayproject/ray:nightly
            command: ["/bin/bash", "-c", "--"]
            args: ["trap : TERM INT; sleep infinity & wait;"]
            # This volume allocates shared memory for Ray to use for its plasma
            # object store. If you do not provide this, Ray will fall back to
            # /tmp which cause slowdowns if is not a shared memory volume.
            volumeMounts:
              - mountPath: /dev/shm
                name: dshm
            resources:
              requests:
                cpu: 1
                memory: 1Gi
              limits:
                # The maximum memory that this pod is allowed to use. The
                # limit will be detected by ray and split to use 10% for
                # redis, 30% for the shared memory object store, and the
                # rest for application memory. If this limit is not set and
                # the object store size is not set manually, ray will
                # allocate a very large object store in each pod that may
                # cause problems for other pods.
                cpu: 1
                memory: 1Gi
  head_node:
    # The minimum number of worker nodes of this type to launch.
    # This number should be >= 0.
    min_workers: 0
    # The maximum number of worker nodes of this type to launch.
    # This takes precedence over min_workers.
    max_workers: 0
    node_config:
      apiVersion: v1
      kind: Pod
      metadata:
        # Automatically generates a name for the pod with this prefix.
        generateName: example-cluster-ray-head-
        # Must match the head node service selector above if a head node
        # service is required.
        labels:
          component: example-cluster-ray-head
      spec:
        # Change this if you altered the autoscaler_service_account above
        # or want to provide your own.
        serviceAccountName: NAMESPACE-writer

        restartPolicy: Never

        # This volume allocates shared memory for Ray to use for its plasma
        # object store. If you do not provide this, Ray will fall back to
        # /tmp which cause slowdowns if is not a shared memory volume.
        volumes:
          - name: dshm
            emptyDir:
              medium: Memory
        containers:
          - name: ray-node
            imagePullPolicy: Always
            image: rayproject/ray:nightly
            # Do not change this command - it keeps the pod alive until it is
            # explicitly killed.
            command: ["/bin/bash", "-c", "--"]
            args: ['trap : TERM INT; sleep infinity & wait;']
            ports:
              - containerPort: 6379 # Redis port
              - containerPort: 10001 # Used by Ray Client
              - containerPort: 8265 # Used by Ray Dashboard

            # This volume allocates shared memory for Ray to use for its plasma
            # object store. If you do not provide this, Ray will fall back to
            # /tmp which cause slowdowns if is not a shared memory volume.
            volumeMounts:
              - mountPath: /dev/shm
                name: dshm
            resources:
              requests:
                cpu: 1
                memory: 1Gi
              limits:
                # The maximum memory that this pod is allowed to use. The
                # limit will be detected by ray and split to use 10% for
                # redis, 30% for the shared memory object store, and the
                # rest for application memory. If this limit is not set and
                # the object store size is not set manually, ray will
                # allocate a very large object store in each pod that may
                # cause problems for other pods.
                cpu: 1
                memory: 1Gi


# Command to start ray on the head node. You don't need to change this.
# Note dashboard-host is set to 0.0.0.0 so that kubernetes can port forward.
head_start_ray_commands:
  - ray stop
  - ulimit -n 65536; ray start --head --autoscaling-config=~/ray_bootstrap_config.yaml --dashboard-host 0.0.0.0

# Command to start ray on worker nodes. You don't need to change this.
worker_start_ray_commands:
  - ray stop
  - ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379
```
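Because the head service only exposes internal IPs (`use_internal_ips: true`), one common way to reach the client port (10001) and dashboard (8265) defined above is `kubectl port-forward`. The sketch below is not part of this commit: it assumes your `kubectl` context already points at the Code Engine project's namespace and that port-forwarding is the access path you want.

```shell
# Illustrative sketch: forward the Ray Client and dashboard ports of the head
# service defined in example-cluster.yaml.template to localhost. Assumes
# kubectl is already configured for the Code Engine project's namespace.
kubectl -n "$NAMESPACE" port-forward svc/example-cluster-ray-head 10001:10001 8265:8265

# While the forward is running, the dashboard should be reachable at
# http://127.0.0.1:8265, and Ray Client connections can target
# ray://127.0.0.1:10001.
```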
