Skip to content

Commit

Permalink
Merge remote-tracking branch 'remotes/bonzini/tags/for-upstream' into…
Browse files Browse the repository at this point in the history
… staging

* Nuke hw_compat_4_0_1 and pc_compat_4_0_1 (Greg)
* Static analysis fixes (Igor, Lidong)
* X86 Hyper-V CPUID improvements (Vitaly)
* X86 nested virt migration (Liran)
* New MSR-based features (Xiaoyao)

# gpg: Signature made Fri 21 Jun 2019 12:25:42 BST
# gpg:                using RSA key BFFBD25F78C7AE83
# gpg: Good signature from "Paolo Bonzini <bonzini@gnu.org>" [full]
# gpg:                 aka "Paolo Bonzini <pbonzini@redhat.com>" [full]
# Primary key fingerprint: 46F5 9FBD 57D6 12E7 BFD4  E2F7 7E15 100C CD36 69B1
#      Subkey fingerprint: F133 3857 4B66 2389 866C  7682 BFFB D25F 78C7 AE83

* remotes/bonzini/tags/for-upstream: (25 commits)
  hw: Nuke hw_compat_4_0_1 and pc_compat_4_0_1
  util/main-loop: Fix incorrect assertion
  sd: Fix out-of-bounds assertions
  target/i386: kvm: Add nested migration blocker only when kernel lacks required capabilities
  target/i386: kvm: Add support for KVM_CAP_EXCEPTION_PAYLOAD
  target/i386: kvm: Add support for save and restore nested state
  vmstate: Add support for kernel integer types
  linux-headers: sync with latest KVM headers from Linux 5.2
  target/i386: kvm: Block migration for vCPUs exposed with nested virtualization
  target/i386: kvm: Re-inject #DB to guest with updated DR6
  target/i386: kvm: Use symbolic constant for #DB/#BP exception constants
  KVM: Introduce kvm_arch_destroy_vcpu()
  target/i386: kvm: Delete VMX migration blocker on vCPU init failure
  target/i386: define a new MSR based feature word - FEAT_CORE_CAPABILITY
  i386/kvm: add support for Direct Mode for Hyper-V synthetic timers
  i386/kvm: hv-evmcs requires hv-vapic
  i386/kvm: hv-tlbflush/ipi require hv-vpindex
  i386/kvm: hv-stimer requires hv-time and hv-synic
  i386/kvm: implement 'hv-passthrough' mode
  i386/kvm: document existing Hyper-V enlightenments
  ...

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
  • Loading branch information
pm215 committed Jun 21, 2019
2 parents 33d6099 + 8e8cbed commit 68d7ff0
Show file tree
Hide file tree
Showing 27 changed files with 1,506 additions and 329 deletions.
25 changes: 21 additions & 4 deletions accel/kvm/kvm-all.c
Expand Up @@ -87,6 +87,7 @@ struct KVMState
#ifdef KVM_CAP_SET_GUEST_DEBUG
QTAILQ_HEAD(, kvm_sw_breakpoint) kvm_sw_breakpoints;
#endif
int max_nested_state_len;
int many_ioeventfds;
int intx_set_mask;
bool sync_mmu;
Expand Down Expand Up @@ -291,6 +292,11 @@ int kvm_destroy_vcpu(CPUState *cpu)

DPRINTF("kvm_destroy_vcpu\n");

ret = kvm_arch_destroy_vcpu(cpu);
if (ret < 0) {
goto err;
}

mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
if (mmap_size < 0) {
ret = mmap_size;
Expand Down Expand Up @@ -863,8 +869,8 @@ static void kvm_mem_ioeventfd_add(MemoryListener *listener,
data, true, int128_get64(section->size),
match_data);
if (r < 0) {
fprintf(stderr, "%s: error adding ioeventfd: %s\n",
__func__, strerror(-r));
fprintf(stderr, "%s: error adding ioeventfd: %s (%d)\n",
__func__, strerror(-r), -r);
abort();
}
}
Expand All @@ -881,6 +887,8 @@ static void kvm_mem_ioeventfd_del(MemoryListener *listener,
data, false, int128_get64(section->size),
match_data);
if (r < 0) {
fprintf(stderr, "%s: error deleting ioeventfd: %s (%d)\n",
__func__, strerror(-r), -r);
abort();
}
}
Expand All @@ -897,8 +905,8 @@ static void kvm_io_ioeventfd_add(MemoryListener *listener,
data, true, int128_get64(section->size),
match_data);
if (r < 0) {
fprintf(stderr, "%s: error adding ioeventfd: %s\n",
__func__, strerror(-r));
fprintf(stderr, "%s: error adding ioeventfd: %s (%d)\n",
__func__, strerror(-r), -r);
abort();
}
}
Expand All @@ -916,6 +924,8 @@ static void kvm_io_ioeventfd_del(MemoryListener *listener,
data, false, int128_get64(section->size),
match_data);
if (r < 0) {
fprintf(stderr, "%s: error deleting ioeventfd: %s (%d)\n",
__func__, strerror(-r), -r);
abort();
}
}
Expand Down Expand Up @@ -1672,6 +1682,8 @@ static int kvm_init(MachineState *ms)
s->debugregs = kvm_check_extension(s, KVM_CAP_DEBUGREGS);
#endif

s->max_nested_state_len = kvm_check_extension(s, KVM_CAP_NESTED_STATE);

#ifdef KVM_CAP_IRQ_ROUTING
kvm_direct_msi_allowed = (kvm_check_extension(s, KVM_CAP_SIGNAL_MSI) > 0);
#endif
Expand Down Expand Up @@ -2239,6 +2251,11 @@ int kvm_has_debugregs(void)
return kvm_state->debugregs;
}

/*
 * Return the max_nested_state_len value cached in the global kvm_state.
 *
 * The field is filled in from kvm_check_extension(s, KVM_CAP_NESTED_STATE)
 * (see the kvm_init() hunk), so this accessor performs no ioctl itself.
 * NOTE(review): presumably the result is <= 0 when the kernel lacks
 * KVM_CAP_NESTED_STATE -- confirm against kvm_check_extension().
 */
int kvm_max_nested_state_length(void)
{
return kvm_state->max_nested_state_len;
}

int kvm_has_many_ioeventfds(void)
{
if (!kvm_enabled()) {
Expand Down
1 change: 0 additions & 1 deletion cpus.c
Expand Up @@ -1594,7 +1594,6 @@ static void *qemu_hax_cpu_thread_fn(void *arg)

cpu->thread_id = qemu_get_thread_id();
cpu->created = true;
cpu->halted = 0;
current_cpu = cpu;

hax_init_vcpu(cpu);
Expand Down
201 changes: 201 additions & 0 deletions docs/hyperv.txt
@@ -0,0 +1,201 @@
Hyper-V Enlightenments
======================


1. Description
===============
In some cases when implementing a hardware interface in software is slow, KVM
implements its own paravirtualized interfaces. This works well for Linux as
guest support for such features is added simultaneously with the feature itself.
It may, however, be hard-to-impossible to add support for these interfaces to
proprietary OSes, namely, Microsoft Windows.

KVM on x86 implements Hyper-V Enlightenments for Windows guests. These features
make Windows and Hyper-V guests think they're running on top of a Hyper-V
compatible hypervisor and use Hyper-V specific features.


2. Setup
=========
No Hyper-V enlightenments are enabled by default by either KVM or QEMU. In
QEMU, individual enlightenments can be enabled through CPU flags, e.g.:

qemu-system-x86_64 --enable-kvm --cpu host,hv_relaxed,hv_vpindex,hv_time, ...

Sometimes there are dependencies between enlightenments; QEMU is supposed to
check that the supplied configuration is sane.

When any set of the Hyper-V enlightenments is enabled, QEMU changes hypervisor
identification (CPUID 0x40000000..0x4000000A) to Hyper-V. KVM identification
and features are kept in leaves 0x40000100..0x40000101.


3. Existing enlightenments
===========================

3.1. hv-relaxed
================
This feature tells guest OS to disable watchdog timeouts as it is running on a
hypervisor. It is known that some Windows versions will do this even when they
see 'hypervisor' CPU flag.

3.2. hv-vapic
==============
Provides so-called VP Assist page MSR to guest allowing it to work with APIC
more efficiently. In particular, this enlightenment allows paravirtualized
(exit-less) EOI processing.

3.3. hv-spinlocks=xxx
======================
Enables paravirtualized spinlocks. The parameter indicates how many times
spinlock acquisition should be attempted before indicating the situation to the
hypervisor. A special value 0xffffffff indicates "never notify".

3.4. hv-vpindex
================
Provides HV_X64_MSR_VP_INDEX (0x40000002) MSR to the guest which has Virtual
processor index information. This enlightenment makes sense in conjunction with
hv-synic, hv-stimer and other enlightenments which require the guest to know its
Virtual Processor indices (e.g. when VP index needs to be passed in a
hypercall).

3.5. hv-runtime
================
Provides HV_X64_MSR_VP_RUNTIME (0x40000010) MSR to the guest. The MSR keeps the
virtual processor run time in 100ns units. This gives guest operating system an
idea of how much time was 'stolen' from it (when the virtual CPU was preempted
to perform some other work).

3.6. hv-crash
==============
Provides HV_X64_MSR_CRASH_P0..HV_X64_MSR_CRASH_P4 (0x40000100..0x40000104) and
HV_X64_MSR_CRASH_CTL (0x40000105) MSRs to the guest. These MSRs are written to
by the guest when it crashes; HV_X64_MSR_CRASH_P0..HV_X64_MSR_CRASH_P4 MSRs
contain additional crash information. This information is output in the QEMU
log and through QAPI.
Note: unlike under genuine Hyper-V, a write to HV_X64_MSR_CRASH_CTL causes the
guest to shut down. This effectively blocks crash dump generation by Windows.

3.7. hv-time
=============
Enables two Hyper-V-specific clocksources available to the guest: MSR-based
Hyper-V clocksource (HV_X64_MSR_TIME_REF_COUNT, 0x40000020) and Reference TSC
page (enabled via MSR HV_X64_MSR_REFERENCE_TSC, 0x40000021). Both clocksources
are per-guest; the Reference TSC page clocksource allows for exit-less
timestamp readings. Using this enlightenment leads to significant speedup of
all timestamp-related operations.

3.8. hv-synic
==============
Enables Hyper-V Synthetic interrupt controller - an extension of a local APIC.
When enabled, this enlightenment provides additional communication facilities
to the guest: SynIC messages and Events. This is a pre-requisite for
implementing VMBus devices (not yet in QEMU). Additionally, this enlightenment
is needed to enable Hyper-V synthetic timers. SynIC is controlled through MSRs
HV_X64_MSR_SCONTROL..HV_X64_MSR_EOM (0x40000080..0x40000084) and
HV_X64_MSR_SINT0..HV_X64_MSR_SINT15 (0x40000090..0x4000009F).

Requires: hv-vpindex

3.9. hv-stimer
===============
Enables Hyper-V synthetic timers. There are four synthetic timers per virtual
CPU controlled through HV_X64_MSR_STIMER0_CONFIG..HV_X64_MSR_STIMER3_COUNT
(0x400000B0..0x400000B7) MSRs. These timers can work either in single-shot or
periodic mode. It is known that certain Windows versions revert to using HPET
(or even RTC when HPET is unavailable) extensively when this enlightenment is
not provided; this can lead to significant CPU consumption, even when virtual
CPU is idle.

Requires: hv-vpindex, hv-synic, hv-time

3.10. hv-tlbflush
==================
Enables paravirtualized TLB shoot-down mechanism. On x86 architecture, remote
TLB flush procedure requires sending IPIs and waiting for other CPUs to perform
local TLB flush. In virtualized environment some virtual CPUs may not even be
scheduled at the time of the call and may not require flushing (or, flushing
may be postponed until the virtual CPU is scheduled). hv-tlbflush enlightenment
implements TLB shoot-down through hypervisor enabling the optimization.

Requires: hv-vpindex

3.11. hv-ipi
=============
Enables paravirtualized IPI send mechanism. HvCallSendSyntheticClusterIpi
hypercall may target more than 64 virtual CPUs simultaneously, doing the same
through APIC requires more than one access (and thus exit to the hypervisor).

Requires: hv-vpindex

3.12. hv-vendor-id=xxx
=======================
This changes Hyper-V identification in CPUID 0x40000000.EBX-EDX from the default
"Microsoft Hv". The parameter should be no longer than 12 characters. According
to the specification, guests shouldn't use this information and it is unknown
if there is a Windows version which acts differently.
Note: hv-vendor-id is not an enlightenment and thus doesn't enable Hyper-V
identification when specified without some other enlightenment.

3.13. hv-reset
===============
Provides HV_X64_MSR_RESET (0x40000003) MSR to the guest allowing it to reset
itself by writing to it. Even when this MSR is enabled, it is not a recommended
way for Windows to perform system reboot and thus it may not be used.

3.14. hv-frequencies
=====================
Provides HV_X64_MSR_TSC_FREQUENCY (0x40000022) and HV_X64_MSR_APIC_FREQUENCY
(0x40000023) allowing the guest to get its TSC/APIC frequencies without doing
measurements.

3.15. hv-reenlightenment
=========================
The enlightenment is nested specific, it targets Hyper-V on KVM guests. When
enabled, it provides HV_X64_MSR_REENLIGHTENMENT_CONTROL (0x40000106),
HV_X64_MSR_TSC_EMULATION_CONTROL (0x40000107) and HV_X64_MSR_TSC_EMULATION_STATUS
(0x40000108) MSRs allowing the guest to get notified when TSC frequency changes
(only happens on migration) and keep using old frequency (through emulation in
the hypervisor) until it is ready to switch to the new one. This, in conjunction
with hv-frequencies, allows Hyper-V on KVM to pass stable clocksource (Reference
TSC page) to its own guests.

Recommended: hv-frequencies

3.16. hv-evmcs
===============
The enlightenment is nested specific, it targets Hyper-V on KVM guests. When
enabled, it provides Enlightened VMCS feature to the guest. The feature
implements paravirtualized protocol between L0 (KVM) and L1 (Hyper-V)
hypervisors making L2 exits to the hypervisor faster. The feature is Intel-only.
Note: some virtualization features (e.g. Posted Interrupts) are disabled when
hv-evmcs is enabled. It may make sense to measure your nested workload with and
without the feature to find out if enabling it is beneficial.

Requires: hv-vapic

3.17. hv-stimer-direct
=======================
Hyper-V specification allows synthetic timer operation in two modes: "classic",
when expiration event is delivered as SynIC message and "direct", when the event
is delivered via normal interrupt. It is known that nested Hyper-V can only
use synthetic timers in direct mode and thus 'hv-stimer-direct' needs to be
enabled.

Requires: hv-vpindex, hv-synic, hv-time, hv-stimer


4. Development features
========================
In some cases (e.g. during development) it may make sense to use QEMU in
'pass-through' mode and give Windows guests all enlightenments currently
supported by KVM. This pass-through mode is enabled by "hv-passthrough" CPU
flag.
Note: enabling this flag effectively prevents migration as supported features
may differ between source and destination.


5. Useful links
================
Hyper-V Top Level Functional specification and other information:
https://github.com/MicrosoftDocs/Virtualization-Documentation
5 changes: 1 addition & 4 deletions hw/core/machine.c
Expand Up @@ -24,16 +24,13 @@
#include "hw/pci/pci.h"
#include "hw/mem/nvdimm.h"

GlobalProperty hw_compat_4_0_1[] = {
GlobalProperty hw_compat_4_0[] = {
{ "VGA", "edid", "false" },
{ "secondary-vga", "edid", "false" },
{ "bochs-display", "edid", "false" },
{ "virtio-vga", "edid", "false" },
{ "virtio-gpu-pci", "edid", "false" },
};
const size_t hw_compat_4_0_1_len = G_N_ELEMENTS(hw_compat_4_0_1);

GlobalProperty hw_compat_4_0[] = {};
const size_t hw_compat_4_0_len = G_N_ELEMENTS(hw_compat_4_0);

GlobalProperty hw_compat_3_1[] = {
Expand Down
6 changes: 2 additions & 4 deletions hw/i386/pc.c
Expand Up @@ -111,9 +111,6 @@ struct hpet_fw_config hpet_cfg = {.count = UINT8_MAX};
/* Physical Address of PVH entry point read from kernel ELF NOTE */
static size_t pvh_start_addr;

GlobalProperty pc_compat_4_0_1[] = {};
const size_t pc_compat_4_0_1_len = G_N_ELEMENTS(pc_compat_4_0_1);

GlobalProperty pc_compat_4_0[] = {};
const size_t pc_compat_4_0_len = G_N_ELEMENTS(pc_compat_4_0);

Expand Down Expand Up @@ -2386,7 +2383,8 @@ static void pc_cpu_pre_plug(HotplugHandler *hotplug_dev,
}
cpu->thread_id = topo.smt_id;

if (cpu->hyperv_vpindex && !kvm_hv_vpindex_settable()) {
if (hyperv_feat_enabled(cpu, HYPERV_FEAT_VPINDEX) &&
!kvm_hv_vpindex_settable()) {
error_setg(errp, "kernel doesn't allow setting HyperV VP_INDEX");
return;
}
Expand Down
12 changes: 8 additions & 4 deletions hw/i386/pc_q35.c
Expand Up @@ -378,8 +378,13 @@ static void pc_q35_4_0_1_machine_options(MachineClass *m)
{
pc_q35_4_1_machine_options(m);
m->alias = NULL;
compat_props_add(m->compat_props, hw_compat_4_0_1, hw_compat_4_0_1_len);
compat_props_add(m->compat_props, pc_compat_4_0_1, pc_compat_4_0_1_len);
/*
* This is the default machine for the 4.0-stable branch. It is basically
* a 4.0 that doesn't use split irqchip by default. It MUST hence apply the
* 4.0 compat props.
*/
compat_props_add(m->compat_props, hw_compat_4_0, hw_compat_4_0_len);
compat_props_add(m->compat_props, pc_compat_4_0, pc_compat_4_0_len);
}

DEFINE_Q35_MACHINE(v4_0_1, "pc-q35-4.0.1", NULL,
Expand All @@ -390,8 +395,7 @@ static void pc_q35_4_0_machine_options(MachineClass *m)
pc_q35_4_0_1_machine_options(m);
m->default_kernel_irqchip_split = true;
m->alias = NULL;
compat_props_add(m->compat_props, hw_compat_4_0, hw_compat_4_0_len);
compat_props_add(m->compat_props, pc_compat_4_0, pc_compat_4_0_len);
/* Compat props are applied by the 4.0.1 machine */
}

DEFINE_Q35_MACHINE(v4_0, "pc-q35-4.0", NULL,
Expand Down
4 changes: 2 additions & 2 deletions hw/sd/sd.c
Expand Up @@ -145,7 +145,7 @@ static const char *sd_state_name(enum SDCardStates state)
if (state == sd_inactive_state) {
return "inactive";
}
assert(state <= ARRAY_SIZE(state_name));
assert(state < ARRAY_SIZE(state_name));
return state_name[state];
}

Expand All @@ -166,7 +166,7 @@ static const char *sd_response_name(sd_rsp_type_t rsp)
if (rsp == sd_r1b) {
rsp = sd_r1;
}
assert(rsp <= ARRAY_SIZE(response_name));
assert(rsp < ARRAY_SIZE(response_name));
return response_name[rsp];
}

Expand Down
3 changes: 0 additions & 3 deletions include/hw/boards.h
Expand Up @@ -293,9 +293,6 @@ struct MachineState {
} \
type_init(machine_initfn##_register_types)

extern GlobalProperty hw_compat_4_0_1[];
extern const size_t hw_compat_4_0_1_len;

extern GlobalProperty hw_compat_4_0[];
extern const size_t hw_compat_4_0_len;

Expand Down
3 changes: 0 additions & 3 deletions include/hw/i386/pc.h
Expand Up @@ -293,9 +293,6 @@ int e820_add_entry(uint64_t, uint64_t, uint32_t);
int e820_get_num_entries(void);
bool e820_get_entry(int, uint32_t, uint64_t *, uint64_t *);

extern GlobalProperty pc_compat_4_0_1[];
extern const size_t pc_compat_4_0_1_len;

extern GlobalProperty pc_compat_4_0[];
extern const size_t pc_compat_4_0_len;

Expand Down

0 comments on commit 68d7ff0

Please sign in to comment.