Skip to content

Commit

Permalink
Merge remote-tracking branch 'awilliam/tags/vfio-pci-for-qemu-2013120…
Browse files Browse the repository at this point in the history
…6.0' into staging

vfio-pci updates include:
 - Update linux-headers to include KVM-VFIO device support
 - Enable QEMU support for KVM-VFIO device
 - Additional Nvidia x-vga quirk to ACK MSI interrupts
 - Debug options to disable MSI/X KVM acceleration
 - Fix to cleanup MSI-X vectors on shutdown and avoid IRQ route leaks

The KVM-VFIO device support enables KVM to manage how it handles
coherency instructions in the presence of non-coherent I/O.  Dave
Airlie had noted that the Nvidia MSI ACK support here may just be
scratching the surface, but it's better than what we have now and
it's only enabled via the x-vga option, so I'm willing to add since
it does enable some users.

# gpg: Signature made Fri 06 Dec 2013 12:28:19 PM PST using RSA key ID 3BB08B22
# gpg: Can't check signature: public key not found

# By Alex Williamson
# Via Alex Williamson
* awilliam/tags/vfio-pci-for-qemu-20131206.0:
  vfio-pci: Release all MSI-X vectors when disabled
  vfio-pci: Add debug config options to disable MSI/X KVM support
  vfio-pci: Fix Nvidia MSI ACK through 0x88000 quirk
  vfio-pci: Make use of new KVM-VFIO device
  linux-headers: Update from v3.13-rc3

Message-id: 20131206204715.16731.12627.stgit@bling.home
Signed-off-by: Anthony Liguori <aliguori@amazon.com>
  • Loading branch information
Anthony Liguori committed Dec 7, 2013
2 parents 9353137 + 3e40ba0 commit d2aa90c
Show file tree
Hide file tree
Showing 7 changed files with 247 additions and 15 deletions.
133 changes: 128 additions & 5 deletions hw/misc/vfio.c
Expand Up @@ -52,6 +52,8 @@
/* Extra debugging, trap acceleration paths for more logging */
#define VFIO_ALLOW_MMAP 1
#define VFIO_ALLOW_KVM_INTX 1
#define VFIO_ALLOW_KVM_MSI 1
#define VFIO_ALLOW_KVM_MSIX 1

struct VFIODevice;

Expand Down Expand Up @@ -208,6 +210,17 @@ static QLIST_HEAD(, VFIOContainer)
static QLIST_HEAD(, VFIOGroup)
group_list = QLIST_HEAD_INITIALIZER(group_list);

#ifdef CONFIG_KVM
/*
* We have a single VFIO pseudo device per KVM VM. Once created it lives
* for the life of the VM. Closing the file descriptor only drops our
* reference to it and the device's reference to kvm. Therefore once
* initialized, this file descriptor is only released on QEMU exit and
* we'll re-use it should another vfio device be attached before then.
*/
static int vfio_kvm_device_fd = -1;
#endif

static void vfio_disable_interrupts(VFIODevice *vdev);
static uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len);
static void vfio_pci_write_config(PCIDevice *pdev, uint32_t addr,
Expand Down Expand Up @@ -579,9 +592,21 @@ static void vfio_msi_interrupt(void *opaque)
return;
}

DPRINTF("%s(%04x:%02x:%02x.%x) vector %d\n", __func__,
#ifdef VFIO_DEBUG
MSIMessage msg;

if (vdev->interrupt == VFIO_INT_MSIX) {
msg = msi_get_message(&vdev->pdev, nr);
} else if (vdev->interrupt == VFIO_INT_MSI) {
msg = msix_get_message(&vdev->pdev, nr);
} else {
abort();
}

DPRINTF("%s(%04x:%02x:%02x.%x) vector %d 0x%"PRIx64"/0x%x\n", __func__,
vdev->host.domain, vdev->host.bus, vdev->host.slot,
vdev->host.function, nr);
vdev->host.function, nr, msg.address, msg.data);
#endif

if (vdev->interrupt == VFIO_INT_MSIX) {
msix_notify(&vdev->pdev, nr);
Expand Down Expand Up @@ -649,7 +674,8 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
* Attempt to enable route through KVM irqchip,
* default to userspace handling if unavailable.
*/
vector->virq = msg ? kvm_irqchip_add_msi_route(kvm_state, *msg) : -1;
vector->virq = msg && VFIO_ALLOW_KVM_MSIX ?
kvm_irqchip_add_msi_route(kvm_state, *msg) : -1;
if (vector->virq < 0 ||
kvm_irqchip_add_irqfd_notifier(kvm_state, &vector->interrupt,
NULL, vector->virq) < 0) {
Expand Down Expand Up @@ -816,7 +842,8 @@ static void vfio_enable_msi(VFIODevice *vdev)
* Attempt to enable route through KVM irqchip,
* default to userspace handling if unavailable.
*/
vector->virq = kvm_irqchip_add_msi_route(kvm_state, vector->msg);
vector->virq = VFIO_ALLOW_KVM_MSI ?
kvm_irqchip_add_msi_route(kvm_state, vector->msg) : -1;
if (vector->virq < 0 ||
kvm_irqchip_add_irqfd_notifier(kvm_state, &vector->interrupt,
NULL, vector->virq) < 0) {
Expand Down Expand Up @@ -878,8 +905,20 @@ static void vfio_disable_msi_common(VFIODevice *vdev)

static void vfio_disable_msix(VFIODevice *vdev)
{
int i;

msix_unset_vector_notifiers(&vdev->pdev);

/*
* MSI-X will only release vectors if MSI-X is still enabled on the
* device, check through the rest and release it ourselves if necessary.
*/
for (i = 0; i < vdev->nr_vectors; i++) {
if (vdev->msi_vectors[i].use) {
vfio_msix_vector_release(&vdev->pdev, i);
}
}

if (vdev->nr_vectors) {
vfio_disable_irqindex(vdev, VFIO_PCI_MSIX_IRQ_INDEX);
}
Expand Down Expand Up @@ -1800,6 +1839,34 @@ static void vfio_probe_nvidia_bar5_window_quirk(VFIODevice *vdev, int nr)
vdev->host.function);
}

static void vfio_nvidia_88000_quirk_write(void *opaque, hwaddr addr,
uint64_t data, unsigned size)
{
VFIOQuirk *quirk = opaque;
VFIODevice *vdev = quirk->vdev;
PCIDevice *pdev = &vdev->pdev;
hwaddr base = quirk->data.address_match & TARGET_PAGE_MASK;

vfio_generic_quirk_write(opaque, addr, data, size);

/*
* Nvidia seems to acknowledge MSI interrupts by writing 0xff to the
* MSI capability ID register. Both the ID and next register are
* read-only, so we allow writes covering either of those to real hw.
* NB - only fixed for the 0x88000 MMIO window.
*/
if ((pdev->cap_present & QEMU_PCI_CAP_MSI) &&
vfio_range_contained(addr, size, pdev->msi_cap, PCI_MSI_FLAGS)) {
vfio_bar_write(&vdev->bars[quirk->data.bar], addr + base, data, size);
}
}

static const MemoryRegionOps vfio_nvidia_88000_quirk = {
.read = vfio_generic_quirk_read,
.write = vfio_nvidia_88000_quirk_write,
.endianness = DEVICE_LITTLE_ENDIAN,
};

/*
* Finally, BAR0 itself. We want to redirect any accesses to either
* 0x1800 or 0x88000 through the PCI config space access functions.
Expand All @@ -1826,7 +1893,7 @@ static void vfio_probe_nvidia_bar0_88000_quirk(VFIODevice *vdev, int nr)
quirk->data.address_mask = PCIE_CONFIG_SPACE_SIZE - 1;
quirk->data.bar = nr;

memory_region_init_io(&quirk->mem, OBJECT(vdev), &vfio_generic_quirk,
memory_region_init_io(&quirk->mem, OBJECT(vdev), &vfio_nvidia_88000_quirk,
quirk, "vfio-nvidia-bar0-88000-quirk",
TARGET_PAGE_ALIGN(quirk->data.address_mask + 1));
memory_region_add_subregion_overlap(&vdev->bars[nr].mem,
Expand Down Expand Up @@ -3041,6 +3108,59 @@ static void vfio_pci_reset_handler(void *opaque)
}
}

static void vfio_kvm_device_add_group(VFIOGroup *group)
{
#ifdef CONFIG_KVM
struct kvm_device_attr attr = {
.group = KVM_DEV_VFIO_GROUP,
.attr = KVM_DEV_VFIO_GROUP_ADD,
.addr = (uint64_t)(unsigned long)&group->fd,
};

if (!kvm_enabled()) {
return;
}

if (vfio_kvm_device_fd < 0) {
struct kvm_create_device cd = {
.type = KVM_DEV_TYPE_VFIO,
};

if (kvm_vm_ioctl(kvm_state, KVM_CREATE_DEVICE, &cd)) {
DPRINTF("KVM_CREATE_DEVICE: %m\n");
return;
}

vfio_kvm_device_fd = cd.fd;
}

if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
error_report("Failed to add group %d to KVM VFIO device: %m",
group->groupid);
}
#endif
}

static void vfio_kvm_device_del_group(VFIOGroup *group)
{
#ifdef CONFIG_KVM
struct kvm_device_attr attr = {
.group = KVM_DEV_VFIO_GROUP,
.attr = KVM_DEV_VFIO_GROUP_DEL,
.addr = (uint64_t)(unsigned long)&group->fd,
};

if (vfio_kvm_device_fd < 0) {
return;
}

if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
error_report("Failed to remove group %d to KVM VFIO device: %m",
group->groupid);
}
#endif
}

static int vfio_connect_container(VFIOGroup *group)
{
VFIOContainer *container;
Expand Down Expand Up @@ -3189,6 +3309,8 @@ static VFIOGroup *vfio_get_group(int groupid)

QLIST_INSERT_HEAD(&group_list, group, next);

vfio_kvm_device_add_group(group);

return group;
}

Expand All @@ -3198,6 +3320,7 @@ static void vfio_put_group(VFIOGroup *group)
return;
}

vfio_kvm_device_del_group(group);
vfio_disconnect_container(group);
QLIST_REMOVE(group, next);
DPRINTF("vfio_put_group: close group->fd\n");
Expand Down
3 changes: 2 additions & 1 deletion linux-headers/asm-arm/kvm.h
Expand Up @@ -63,7 +63,8 @@ struct kvm_regs {

/* Supported Processor Types */
#define KVM_ARM_TARGET_CORTEX_A15 0
#define KVM_ARM_NUM_TARGETS 1
#define KVM_ARM_TARGET_CORTEX_A7 1
#define KVM_ARM_NUM_TARGETS 2

/* KVM_ARM_SET_DEVICE_ADDR ioctl id encoding */
#define KVM_ARM_DEVICE_TYPE_SHIFT 0
Expand Down
4 changes: 2 additions & 2 deletions linux-headers/asm-powerpc/epapr_hcalls.h
Expand Up @@ -78,7 +78,7 @@
#define EV_SUCCESS 0
#define EV_EPERM 1 /* Operation not permitted */
#define EV_ENOENT 2 /* Entry Not Found */
#define EV_EIO 3 /* I/O error occurred */
#define EV_EIO 3 /* I/O error occured */
#define EV_EAGAIN 4 /* The operation had insufficient
* resources to complete and should be
* retried
Expand All @@ -89,7 +89,7 @@
#define EV_ENODEV 7 /* No such device */
#define EV_EINVAL 8 /* An argument supplied to the hcall
was out of range or invalid */
#define EV_INTERNAL 9 /* An internal error occurred */
#define EV_INTERNAL 9 /* An internal error occured */
#define EV_CONFIG 10 /* A configuration error was detected */
#define EV_INVALID_STATE 11 /* The object is in an invalid state */
#define EV_UNIMPLEMENTED 12 /* Unimplemented hypercall */
Expand Down
86 changes: 82 additions & 4 deletions linux-headers/asm-powerpc/kvm.h
Expand Up @@ -27,6 +27,7 @@
#define __KVM_HAVE_PPC_SMT
#define __KVM_HAVE_IRQCHIP
#define __KVM_HAVE_IRQ_LINE
#define __KVM_HAVE_GUEST_DEBUG

struct kvm_regs {
__u64 pc;
Expand Down Expand Up @@ -269,7 +270,24 @@ struct kvm_fpu {
__u64 fpr[32];
};

/*
* Defines for h/w breakpoint, watchpoint (read, write or both) and
* software breakpoint.
* These are used as "type" in KVM_SET_GUEST_DEBUG ioctl and "status"
* for KVM_DEBUG_EXIT.
*/
#define KVMPPC_DEBUG_NONE 0x0
#define KVMPPC_DEBUG_BREAKPOINT (1UL << 1)
#define KVMPPC_DEBUG_WATCH_WRITE (1UL << 2)
#define KVMPPC_DEBUG_WATCH_READ (1UL << 3)
struct kvm_debug_exit_arch {
__u64 address;
/*
* exiting to userspace because of h/w breakpoint, watchpoint
* (read, write or both) and software breakpoint.
*/
__u32 status;
__u32 reserved;
};

/* for KVM_SET_GUEST_DEBUG */
Expand All @@ -281,10 +299,6 @@ struct kvm_guest_debug_arch {
* Type denotes h/w breakpoint, read watchpoint, write
* watchpoint or watchpoint (both read and write).
*/
#define KVMPPC_DEBUG_NONE 0x0
#define KVMPPC_DEBUG_BREAKPOINT (1UL << 1)
#define KVMPPC_DEBUG_WATCH_WRITE (1UL << 2)
#define KVMPPC_DEBUG_WATCH_READ (1UL << 3)
__u32 type;
__u32 reserved;
} bp[16];
Expand Down Expand Up @@ -429,6 +443,11 @@ struct kvm_get_htab_header {
#define KVM_REG_PPC_MMCR0 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x10)
#define KVM_REG_PPC_MMCR1 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x11)
#define KVM_REG_PPC_MMCRA (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x12)
#define KVM_REG_PPC_MMCR2 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x13)
#define KVM_REG_PPC_MMCRS (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x14)
#define KVM_REG_PPC_SIAR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x15)
#define KVM_REG_PPC_SDAR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x16)
#define KVM_REG_PPC_SIER (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x17)

#define KVM_REG_PPC_PMC1 (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x18)
#define KVM_REG_PPC_PMC2 (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x19)
Expand Down Expand Up @@ -499,6 +518,65 @@ struct kvm_get_htab_header {
#define KVM_REG_PPC_TLB3PS (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x9a)
#define KVM_REG_PPC_EPTCFG (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x9b)

/* Timebase offset */
#define KVM_REG_PPC_TB_OFFSET (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x9c)

/* POWER8 registers */
#define KVM_REG_PPC_SPMC1 (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x9d)
#define KVM_REG_PPC_SPMC2 (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x9e)
#define KVM_REG_PPC_IAMR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x9f)
#define KVM_REG_PPC_TFHAR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xa0)
#define KVM_REG_PPC_TFIAR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xa1)
#define KVM_REG_PPC_TEXASR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xa2)
#define KVM_REG_PPC_FSCR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xa3)
#define KVM_REG_PPC_PSPB (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0xa4)
#define KVM_REG_PPC_EBBHR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xa5)
#define KVM_REG_PPC_EBBRR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xa6)
#define KVM_REG_PPC_BESCR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xa7)
#define KVM_REG_PPC_TAR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xa8)
#define KVM_REG_PPC_DPDES (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xa9)
#define KVM_REG_PPC_DAWR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xaa)
#define KVM_REG_PPC_DAWRX (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xab)
#define KVM_REG_PPC_CIABR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xac)
#define KVM_REG_PPC_IC (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xad)
#define KVM_REG_PPC_VTB (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xae)
#define KVM_REG_PPC_CSIGR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xaf)
#define KVM_REG_PPC_TACR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xb0)
#define KVM_REG_PPC_TCSCR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xb1)
#define KVM_REG_PPC_PID (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xb2)
#define KVM_REG_PPC_ACOP (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xb3)

#define KVM_REG_PPC_VRSAVE (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0xb4)
#define KVM_REG_PPC_LPCR (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0xb5)
#define KVM_REG_PPC_PPR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xb6)

/* Architecture compatibility level */
#define KVM_REG_PPC_ARCH_COMPAT (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0xb7)

/* Transactional Memory checkpointed state:
* This is all GPRs, all VSX regs and a subset of SPRs
*/
#define KVM_REG_PPC_TM (KVM_REG_PPC | 0x80000000)
/* TM GPRs */
#define KVM_REG_PPC_TM_GPR0 (KVM_REG_PPC_TM | KVM_REG_SIZE_U64 | 0)
#define KVM_REG_PPC_TM_GPR(n) (KVM_REG_PPC_TM_GPR0 + (n))
#define KVM_REG_PPC_TM_GPR31 (KVM_REG_PPC_TM | KVM_REG_SIZE_U64 | 0x1f)
/* TM VSX */
#define KVM_REG_PPC_TM_VSR0 (KVM_REG_PPC_TM | KVM_REG_SIZE_U128 | 0x20)
#define KVM_REG_PPC_TM_VSR(n) (KVM_REG_PPC_TM_VSR0 + (n))
#define KVM_REG_PPC_TM_VSR63 (KVM_REG_PPC_TM | KVM_REG_SIZE_U128 | 0x5f)
/* TM SPRS */
#define KVM_REG_PPC_TM_CR (KVM_REG_PPC_TM | KVM_REG_SIZE_U64 | 0x60)
#define KVM_REG_PPC_TM_LR (KVM_REG_PPC_TM | KVM_REG_SIZE_U64 | 0x61)
#define KVM_REG_PPC_TM_CTR (KVM_REG_PPC_TM | KVM_REG_SIZE_U64 | 0x62)
#define KVM_REG_PPC_TM_FPSCR (KVM_REG_PPC_TM | KVM_REG_SIZE_U64 | 0x63)
#define KVM_REG_PPC_TM_AMR (KVM_REG_PPC_TM | KVM_REG_SIZE_U64 | 0x64)
#define KVM_REG_PPC_TM_PPR (KVM_REG_PPC_TM | KVM_REG_SIZE_U64 | 0x65)
#define KVM_REG_PPC_TM_VRSAVE (KVM_REG_PPC_TM | KVM_REG_SIZE_U64 | 0x66)
#define KVM_REG_PPC_TM_VSCR (KVM_REG_PPC_TM | KVM_REG_SIZE_U32 | 0x67)
#define KVM_REG_PPC_TM_DSCR (KVM_REG_PPC_TM | KVM_REG_SIZE_U64 | 0x68)
#define KVM_REG_PPC_TM_TAR (KVM_REG_PPC_TM | KVM_REG_SIZE_U64 | 0x69)

/* PPC64 eXternal Interrupt Controller Specification */
#define KVM_DEV_XICS_GRP_SOURCES 1 /* 64-bit source attributes */

Expand Down
19 changes: 19 additions & 0 deletions linux-headers/asm-x86/hyperv.h
Expand Up @@ -27,6 +27,19 @@
#define HV_X64_MSR_VP_RUNTIME_AVAILABLE (1 << 0)
/* Partition Reference Counter (HV_X64_MSR_TIME_REF_COUNT) available*/
#define HV_X64_MSR_TIME_REF_COUNT_AVAILABLE (1 << 1)

/*
* There is a single feature flag that signifies the presence of the MSR
* that can be used to retrieve both the local APIC Timer frequency as
* well as the TSC frequency.
*/

/* Local APIC timer frequency MSR (HV_X64_MSR_APIC_FREQUENCY) is available */
#define HV_X64_MSR_APIC_FREQUENCY_AVAILABLE (1 << 11)

/* TSC frequency MSR (HV_X64_MSR_TSC_FREQUENCY) is available */
#define HV_X64_MSR_TSC_FREQUENCY_AVAILABLE (1 << 11)

/*
* Basic SynIC MSRs (HV_X64_MSR_SCONTROL through HV_X64_MSR_EOM
* and HV_X64_MSR_SINT0 through HV_X64_MSR_SINT15) available
Expand Down Expand Up @@ -136,6 +149,12 @@
/* MSR used to read the per-partition time reference counter */
#define HV_X64_MSR_TIME_REF_COUNT 0x40000020

/* MSR used to retrieve the TSC frequency */
#define HV_X64_MSR_TSC_FREQUENCY 0x40000022

/* MSR used to retrieve the local APIC timer frequency */
#define HV_X64_MSR_APIC_FREQUENCY 0x40000023

/* Define the virtual APIC registers */
#define HV_X64_MSR_EOI 0x40000070
#define HV_X64_MSR_ICR 0x40000071
Expand Down

0 comments on commit d2aa90c

Please sign in to comment.