Skip to content

Commit

Permalink
Support spot for GCP, and add spot info to Azure catalog (#113)
Browse files Browse the repository at this point in the history
* Support spot instance for GCP and Azure

* Fix Azure zones

* Add Spot info in Azure catalog
  • Loading branch information
Michaelvll committed Dec 20, 2021
1 parent 0192eb4 commit f4c858d
Show file tree
Hide file tree
Showing 8 changed files with 3,431 additions and 3,206 deletions.
13 changes: 13 additions & 0 deletions prototype/config/azure-ray.yml.j2
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,13 @@ available_node_types:
imageOffer: ubuntu-1804
imageSku: 1804-gen2
imageVersion: 21.09.13
# optionally set priority to use Spot instances
{%- if use_spot %}
priority: Spot
# set a maximum price for spot instances if desired
# billingProfile:
# maxPrice: -1
{%- endif %}
# TODO: attach disk
{% if num_nodes > 1 %}
ray.worker.default:
Expand All @@ -44,6 +51,12 @@ available_node_types:
imageOffer: ubuntu-1804
imageSku: 1804-gen2
imageVersion: 21.09.13
{%- if use_spot %}
priority: Spot
# set a maximum price for spot instances if desired
# billingProfile:
# maxPrice: -1
{%- endif %}
{%- endif %}

head_node_type: ray.head.default
Expand Down
6 changes: 6 additions & 0 deletions prototype/config/gcp-ray.yml.j2
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,9 @@ available_node_types:
- key: install-nvidia-driver
value: "True"
scheduling:
{%- if use_spot %}
- preemptible: true
{%- endif %}
- onHostMaintenance: TERMINATE # Required for GPU-attached VMs.
{%- endif %}
{% if num_nodes > 1 %}
Expand Down Expand Up @@ -71,6 +74,9 @@ available_node_types:
- key: install-nvidia-driver
value: "True"
scheduling:
{%- if use_spot %}
- preemptible: true
{%- endif %}
- onHostMaintenance: TERMINATE # Required for GPU-attached VMs.
{%- endif %}
{%- endif %}
Expand Down
3 changes: 2 additions & 1 deletion prototype/sky/backends/cloud_vm_ray_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -276,7 +276,8 @@ def _update_blocklist_on_azure_error(self, region, zones, stdout, stderr):
errors = [
s.strip()
for s in stdout_splits + stderr_splits
if 'Exception Details:' in s.strip()
if ('Exception Details:' in s.strip() or
'InvalidTemplateDeployment' in s.strip())
]
if not errors:
logger.info('====== stdout ======')
Expand Down
3 changes: 2 additions & 1 deletion prototype/sky/clouds/azure.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ def get_accelerators_from_instance_type(
def make_deploy_resources_variables(self, task):
r = task.best_resources
assert not r.use_spot, \
'We currently do not support spot instances for Azure.'
'Our subscription offer ID does not support spot instances.'
# r.accelerators is cleared but .instance_type encodes the info.
acc_dict = self.get_accelerators_from_instance_type(r.instance_type)
if acc_dict is not None:
Expand All @@ -95,6 +95,7 @@ def make_deploy_resources_variables(self, task):
return {
'instance_type': r.instance_type,
'custom_resources': custom_resources,
'use_spot': r.use_spot,
}

def get_feasible_launchable_resources(self, resources):
Expand Down
64 changes: 60 additions & 4 deletions prototype/sky/clouds/gcp.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,64 @@ class GCP(clouds.Cloud):
_ON_DEMAND_PRICES.update(_ON_DEMAND_PRICES_GPUS)
_ON_DEMAND_PRICES.update(_ON_DEMAND_PRICES_TPUS)

# Hourly prices for spot (GCP "preemptible") instances. This is the table
# consulted by instance_type_to_hourly_cost() when use_spot is set.
# NOTE(review): values are presumably USD per hour copied from the pricing
# pages linked below; the region they apply to is not stated here -- confirm.
_SPOT_PRICES = {
# VMs: https://cloud.google.com/compute/all-pricing.
# N1 standard
'n1-standard-1': 0.01,
'n1-standard-2': 0.02,
'n1-standard-4': 0.04,
'n1-standard-8': 0.08,
'n1-standard-16': 0.16,
'n1-standard-32': 0.32,
'n1-standard-64': 0.64,
'n1-standard-96': 0.96,
# N1 highmem
'n1-highmem-2': 0.024906,
'n1-highmem-4': 0.049812,
'n1-highmem-8': 0.099624,
'n1-highmem-16': 0.199248,
'n1-highmem-32': 0.398496,
'n1-highmem-64': 0.796992,
'n1-highmem-96': 1.195488,
}
# GPUs: https://cloud.google.com/compute/gpus-pricing.
# Each accelerator is keyed both by its bare name and by '<count>x <name>';
# multi-GPU entries are the single-GPU price multiplied by the count.
_SPOT_PRICES_GPUS = {
# T4
'T4': 0.11,
'1x T4': 0.11,
'2x T4': 0.11 * 2,
'4x T4': 0.11 * 4,
# P4
'P4': 0.216,
'1x P4': 0.216,
'2x P4': 0.216 * 2,
'4x P4': 0.216 * 4,
# V100
'V100': 0.74,
'1x V100': 0.74,
'2x V100': 0.74 * 2,
'4x V100': 0.74 * 4,
'8x V100': 0.74 * 8,
# P100
'P100': 0.43,
'1x P100': 0.43,
'2x P100': 0.43 * 2,
'4x P100': 0.43 * 4,
# K80
'K80': 0.0375,
'1x K80': 0.0375,
'2x K80': 0.0375 * 2,
'4x K80': 0.0375 * 4,
'8x K80': 0.0375 * 8,
}
# TPUs: https://cloud.google.com/tpu/pricing.
_SPOT_PRICES_TPUS = {
'tpu-v2-8': 1.35,
'tpu-v3-8': 2.40,
}
# Fold the GPU and TPU tables into _SPOT_PRICES so that it holds VM, GPU and
# TPU entries together (same pattern as the _ON_DEMAND_PRICES updates above).
_SPOT_PRICES.update(_SPOT_PRICES_GPUS)
_SPOT_PRICES.update(_SPOT_PRICES_TPUS)

#### Regions/Zones ####

@classmethod
Expand Down Expand Up @@ -125,9 +183,8 @@ def region_zones_provision_loop(
#### Normal methods ####

def instance_type_to_hourly_cost(self, instance_type, use_spot):
# TODO: use_spot support
if use_spot:
return clouds.Cloud.UNKNOWN_COST
return GCP._SPOT_PRICES[instance_type]
return GCP._ON_DEMAND_PRICES[instance_type]

def accelerators_to_hourly_cost(self, accelerators):
Expand Down Expand Up @@ -168,15 +225,14 @@ def get_default_region(cls) -> clouds.Region:

def make_deploy_resources_variables(self, task):
r = task.best_resources
assert not r.use_spot, \
'We currently do not support spot instances for GCP'
# Find GPU spec, if any.
resources_vars = {
'instance_type': r.instance_type,
'gpu': None,
'gpu_count': None,
'tpu': None,
'custom_resources': None,
'use_spot': r.use_spot,
}
accelerators = r.get_accelerators()
if accelerators is not None:
Expand Down
5 changes: 3 additions & 2 deletions prototype/sky/clouds/service_catalog/azure_catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,9 @@ def get_hourly_cost(instance_type: str,
region: str = _DEFAULT_REGION,
use_spot: bool = False) -> float:
"""Returns the cost, or the cheapest cost among all zones for spot."""
assert not use_spot, 'not implemented'
return common.get_hourly_cost_impl(_df, instance_type, region, False)
# Ref: https://azure.microsoft.com/en-us/support/legal/offer-details/
assert not use_spot, 'Current Azure subscription does not support spot.'
return common.get_hourly_cost_impl(_df, instance_type, region, use_spot)


def get_accelerators_from_instance_type(instance_type: str
Expand Down

0 comments on commit f4c858d

Please sign in to comment.