Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[vSphere Provider] Fix vc conn timout issue #40516

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 13 additions & 37 deletions python/ray/autoscaler/_private/vsphere/node_provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,15 +89,9 @@ def check_frozen_vm_status(self, frozen_vm_name):
existing and in the frozen state. If the frozen VM is existing and off, this
function will also help to power on the frozen VM and wait until it is frozen.
"""
vm = self.pyvmomi_sdk_provider.get_pyvmomi_obj_by_name(
vm = self.pyvmomi_sdk_provider.get_pyvmomi_obj(
[vim.VirtualMachine], frozen_vm_name
)
if vm is None:
raise ValueError(
"The frozen VM {} doesn't exist on vSphere, please contact the VI "
"admin".format(frozen_vm_name)
)
logger.info(f"Found frozen VM with name: {vm._moId}")

if vm.runtime.powerState == vim.VirtualMachinePowerState.poweredOff:
logger.debug(f"Frozen VM {vm._moId} is off. Powering it ON")
Expand Down Expand Up @@ -188,8 +182,8 @@ def node_tags(self, node_id):
def external_ip(self, node_id):
# Return the external IP of the VM
# Fetch vSphere VM object
vm = self.pyvmomi_sdk_provider.get_pyvmomi_obj_by_moid(
[vim.VirtualMachine], node_id
vm = self.pyvmomi_sdk_provider.get_pyvmomi_obj(
[vim.VirtualMachine], obj_id=node_id
)
# Get the external IP address of this VM
for ipaddr in vm.guest.net[0].ipAddress:
Expand Down Expand Up @@ -339,7 +333,7 @@ def remove_tag_from_vm(self, tag_key_to_remove, vm_id):
break

def get_frozen_vm_obj(self):
vm = self.pyvmomi_sdk_provider.get_pyvmomi_obj_by_name(
vm = self.pyvmomi_sdk_provider.get_pyvmomi_obj(
[vim.VirtualMachine], self.frozen_vm_name
)
return vm
Expand Down Expand Up @@ -375,7 +369,7 @@ def create_instant_clone_node(self, source_vm, vm_name_target, node_config, tags
# If resource pool is not provided in the config yaml, then the resource pool
# of the frozen VM will also be the resource pool of the new VM.
resource_pool = (
self.pyvmomi_sdk_provider.get_pyvmomi_obj_by_name(
self.pyvmomi_sdk_provider.get_pyvmomi_obj(
[vim.ResourcePool], node_config["resource_pool"]
)
if "resource_pool" in node_config and node_config["resource_pool"]
Expand All @@ -384,7 +378,7 @@ def create_instant_clone_node(self, source_vm, vm_name_target, node_config, tags
# If datastore is not provided in the config yaml, then the datastore
# of the frozen VM will also be the resource pool of the new VM.
datastore = (
self.pyvmomi_sdk_provider.get_pyvmomi_obj_by_name(
self.pyvmomi_sdk_provider.get_pyvmomi_obj(
[vim.Datastore], node_config["datastore"]
)
if "datastore" in node_config and node_config["datastore"]
Expand Down Expand Up @@ -413,7 +407,7 @@ def create_instant_clone_node(self, source_vm, vm_name_target, node_config, tags
threading.Thread(target=self.tag_vm, args=(vm_name_target, tags)).start()
WaitForTask(parent_vm.InstantClone_Task(spec=instant_clone_spec))

cloned_vm = self.pyvmomi_sdk_provider.get_pyvmomi_obj_by_name(
cloned_vm = self.pyvmomi_sdk_provider.get_pyvmomi_obj(
[vim.VirtualMachine], vm_name_target
)

Expand Down Expand Up @@ -448,7 +442,7 @@ def create_frozen_vm_on_each_host(self, node_config, name, wait_until_frozen=Fal
exception_happened = False
vm_names = []

res_pool = self.pyvmomi_sdk_provider.get_pyvmomi_obj_by_name(
res_pool = self.pyvmomi_sdk_provider.get_pyvmomi_obj(
[vim.ResourcePool], node_config["frozen_vm"]["resource_pool"]
)
# In vSphere, for any user-created resource pool, the cluster object is the
Expand Down Expand Up @@ -506,13 +500,9 @@ def create_frozen_vm_from_ovf(
"The datastore name must be provided when deploying frozen"
"VM from OVF"
)
datastore_mo = self.pyvmomi_sdk_provider.get_pyvmomi_obj_by_name(
datastore_mo = self.pyvmomi_sdk_provider.get_pyvmomi_obj(
[vim.Datastore], datastore_name
)
if not datastore_mo:
raise ValueError(
f"Cannot find the vSphere datastore by name {datastore_name}"
)
datastore_id = datastore_mo._moId
if node_config.get("frozen_vm").get("resource_pool"):
rp_filter_spec = ResourcePool.FilterSpec(
Expand All @@ -534,13 +524,9 @@ def create_frozen_vm_from_ovf(
"The cluster name must be provided when deploying a single frozen"
" VM from OVF"
)
cluster_mo = self.pyvmomi_sdk_provider.get_pyvmomi_obj_by_name(
cluster_mo = self.pyvmomi_sdk_provider.get_pyvmomi_obj(
[vim.ClusterComputeResource], cluster_name
)
if not cluster_mo:
raise ValueError(
f"Cannot find the vSphere cluster by name {cluster_name}"
)
node_config["host_id"] = cluster_mo.host[0]._moId
resource_pool_id = cluster_mo.resourcePool._moId

Expand Down Expand Up @@ -627,9 +613,7 @@ def create_frozen_vm_from_ovf(

# Get the created vm object
vm = self.get_vm(result.resource_id.id)
vm_mo = self.pyvmomi_sdk_provider.get_pyvmomi_obj_by_name(
[vim.VirtualMachine], vm.name
)
vm_mo = self.pyvmomi_sdk_provider.get_pyvmomi_obj([vim.VirtualMachine], vm.name)
if wait_until_frozen:
self.wait_until_vm_is_frozen(vm_mo)

Expand Down Expand Up @@ -669,17 +653,9 @@ def initialize_frozen_vm_scheduler(self, frozen_vm_config):
if "schedule_policy" in self.vsphere_config
else ""
)
self.frozen_vms_resource_pool = (
self.pyvmomi_sdk_provider.get_pyvmomi_obj_by_name(
[vim.ResourcePool], self.frozen_vm_resource_pool_name
)
self.frozen_vms_resource_pool = self.pyvmomi_sdk_provider.get_pyvmomi_obj(
[vim.ResourcePool], self.frozen_vm_resource_pool_name
)

if self.frozen_vms_resource_pool is None:
raise RuntimeError(
f"Resource Pool {self.frozen_vm_resource_pool_name} could not be found."
)

# Make all frozen vms on resource pool are power on and frozen
self.check_frozen_vms_status(self.frozen_vms_resource_pool)

Expand Down
99 changes: 59 additions & 40 deletions python/ray/autoscaler/_private/vsphere/pyvmomi_sdk_provider.py
Original file line number Diff line number Diff line change
@@ -1,29 +1,51 @@
import atexit
import ssl

from pyVim.connect import Disconnect, SmartConnect
from pyVim.connect import Disconnect, SmartStubAdapter, VimSessionOrientedStub
from pyVmomi import vim

from ray.autoscaler._private.vsphere.utils import Constants


class PyvmomiSdkProvider:
def __init__(self, server, user, password, session_type: Constants.SessionType):
def __init__(
self,
server,
user,
password,
session_type: Constants.SessionType,
port: int = 443,
):
# Instance variables
self.server = server
self.user = user
self.password = password
self.session_type = session_type
self.port = port

# Instance parameters
if self.session_type == Constants.SessionType.UNVERIFIED:
context_obj = ssl._create_unverified_context()
self.timeout = 0

smart_connect_obj = SmartConnect(
# Connect using a session oriented connection
# Ref. https://github.com/vmware/pyvmomi/issues/347
self.pyvmomi_sdk_client = None
credentials = VimSessionOrientedStub.makeUserLoginMethod(user, password)
smart_stub = SmartStubAdapter(
host=server,
user=user,
pwd=password,
port=port,
sslContext=context_obj,
connectionPoolTimeout=self.timeout,
)
atexit.register(Disconnect, smart_connect_obj)
self.pyvmomi_sdk_client = smart_connect_obj.content
self.session_stub = VimSessionOrientedStub(smart_stub, credentials)
self.pyvmomi_sdk_client = vim.ServiceInstance(
"ServiceInstance", self.session_stub
)

if not self.pyvmomi_sdk_client:
raise ValueError("Could not connect to the specified host")
atexit.register(Disconnect, self.pyvmomi_sdk_client)

def get_pyvmomi_obj_by_moid(self, vimtype, moid):
"""
Expand All @@ -35,52 +57,49 @@ def get_pyvmomi_obj_by_moid(self, vimtype, moid):
"""
obj = None
if self.pyvmomi_sdk_client is None:
raise ValueError("Must init pyvmomi_sdk_client first.")
raise RuntimeError("Must init pyvmomi_sdk_client first.")

if not moid:
raise ValueError("Invalid argument for moid")

container = self.pyvmomi_sdk_client.viewManager.CreateContainerView(
self.pyvmomi_sdk_client.rootFolder, vimtype, True
container = self.pyvmomi_sdk_client.content.viewManager.CreateContainerView(
self.pyvmomi_sdk_client.content.rootFolder, vimtype, True
)

for c in container.view:
if moid:
if moid in str(c):
obj = c
break
else:
if moid in str(c):
obj = c
break
if not obj:
raise RuntimeError(
f"Unexpected: cannot find vSphere object {vimtype} with moid: {moid}"
)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  • What's the reason for the changes to this function?
  • Previously get_pyvmomi_obj_by_* could not return None (instead it would raise an exception), but now it can return None. Do the callers need to handle the case where it's None?

Copy link
Contributor Author

@JingChen23 JingChen23 Oct 23, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  • I think the reason is to simplify the function's logic. Actually when I do the cherry pick for the bug fix, I find that this file has been changed in another PR by some other teammate. So I also cherry-picked that commit.
  • Yes, I agree with you that we should never change the behaviour like this when no change to the callers of this function. So let me optimize the function to make sure it will throw an exception when things are not correct.


return obj

def get_pyvmomi_obj_by_name(self, vimtype, name):
def get_pyvmomi_obj(self, vimtype, name=None, obj_id=None):
"""
This function finds the vSphere object by the object name and the object type.
The object type can be "VM", "Host", "Datastore", etc.
The object name is a unique name under the vCenter server.
This function will return the vSphere object.
The argument for `vimtype` can be "vim.VM", "vim.Host", "vim.Datastore", etc.
Then either the name or the object id need to be provided.
To check all such object information, you can go to the managed object board
page of your vCenter Server, such as: https://<your_vc_ip/mob
"""
obj = None
if not name and not obj_id:
# Raise runtime error because this is not user fault
raise RuntimeError("Either name or obj id must be provided")
if self.pyvmomi_sdk_client is None:
raise ValueError("Must init pyvmomi_sdk_client first.")
raise RuntimeError("Must init pyvmomi_sdk_client first")

container = self.pyvmomi_sdk_client.viewManager.CreateContainerView(
self.pyvmomi_sdk_client.rootFolder, vimtype, True
container = self.pyvmomi_sdk_client.content.viewManager.CreateContainerView(
self.pyvmomi_sdk_client.content.rootFolder, vimtype, True
)

for c in container.view:
if name:
# If both name and moid are provided we will prioritize name.
if name:
for c in container.view:
if c.name == name:
obj = c
break
else:
obj = c
break
if not obj:
raise RuntimeError(
f"Unexpected: cannot find vSphere object {vimtype} with name: {name}"
)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I noticed the same code changes needed to be made in two places; I still think it would be a bit better to unify the functions, maybe like def get_myvmomi_obj(self, vimtype, name=None, moid=None), but it's not strictly related to the bug fixed in this PR and doesn't need to block the PR.

return obj
return c
elif obj_id:
for c in container.view:
if str(c) == obj_id:
return c
raise ValueError(
f"Cannot find the object with type {vimtype} on vSphere with"
f"name={name} and obj_id={obj_id}"
)
2 changes: 1 addition & 1 deletion python/ray/tests/vsphere/test_vsphere_node_provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,7 +222,7 @@ def test_create_instant_clone_node(mock_wait_task, mock_ic_spec, mock_relo_spec)
VM.InstantCloneSpec = MagicMock(return_value="Clone Spec")
vnp.vsphere_sdk_client.vcenter.VM.instant_clone.return_value = "test_id_1"
vnp.vsphere_sdk_client.vcenter.vm.Power.stop.return_value = None
vnp.get_pyvmomi_obj_by_name = MagicMock(return_value=MagicMock())
vnp.get_pyvmomi_obj = MagicMock(return_value=MagicMock())
vnp.set_node_tags = MagicMock(return_value=None)
vnp.vsphere_sdk_client.vcenter.VM.list = MagicMock(
return_value=[MagicMock(vm="test VM")]
Expand Down
Loading