Skip to content

Commit

Permalink
vfio/quirks: ioeventfd quirk acceleration
Browse files Browse the repository at this point in the history
The NVIDIA BAR0 quirks virtualize the PCI config space mirrors found
in device MMIO space.  Normally PCI config space is considered a slow
path and further optimization is unnecessary, however NVIDIA uses a
register here to enable the MSI interrupt to re-trigger.  Exiting to
QEMU for this MSI-ACK handling can therefore rate limit our interrupt
handling.  Fortunately the MSI-ACK write is easily detected since the
quirk MemoryRegion otherwise has very few accesses, so simply looking
for consecutive writes with the same data is sufficient, in this case
10 consecutive writes with the same data and size is arbitrarily
chosen.  We configure the KVM ioeventfd with data match, so there's
no risk of triggering for the wrong data or size, but we do risk that
pathological driver behavior might consume all of QEMU's file
descriptors, so we cap ourselves to 10 ioeventfds for this purpose.

In support of the above, generic ioeventfd infrastructure is added
for vfio quirks.  This automatically initializes an ioeventfd list
per quirk, disables and frees ioeventfds on exit, and allows
ioeventfds marked as dynamic to be dropped on device reset.  The
rationale for this latter feature is that useful ioeventfds may
depend on specific driver behavior and since we necessarily place a
cap on our use of ioeventfds, a machine reset is a reasonable point
at which to assume a new driver and re-profile.

Reviewed-by: Peter Xu <peterx@redhat.com>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
  • Loading branch information
awilliam committed Jun 5, 2018
1 parent 469d02d commit c958c51
Show file tree
Hide file tree
Showing 4 changed files with 183 additions and 2 deletions.
166 changes: 164 additions & 2 deletions hw/vfio/pci-quirks.c
Expand Up @@ -12,6 +12,7 @@

#include "qemu/osdep.h"
#include "qemu/error-report.h"
#include "qemu/main-loop.h"
#include "qemu/range.h"
#include "qapi/error.h"
#include "qapi/visitor.h"
Expand Down Expand Up @@ -202,6 +203,7 @@ typedef struct VFIOConfigMirrorQuirk {
uint32_t offset;
uint8_t bar;
MemoryRegion *mem;
uint8_t data[];
} VFIOConfigMirrorQuirk;

static uint64_t vfio_generic_quirk_mirror_read(void *opaque,
Expand Down Expand Up @@ -278,12 +280,95 @@ static const MemoryRegionOps vfio_ati_3c3_quirk = {
static VFIOQuirk *vfio_quirk_alloc(int nr_mem)
{
VFIOQuirk *quirk = g_new0(VFIOQuirk, 1);
QLIST_INIT(&quirk->ioeventfds);
quirk->mem = g_new0(MemoryRegion, nr_mem);
quirk->nr_mem = nr_mem;

return quirk;
}

static void vfio_ioeventfd_exit(VFIOIOEventFD *ioeventfd)
{
QLIST_REMOVE(ioeventfd, next);
memory_region_del_eventfd(ioeventfd->mr, ioeventfd->addr, ioeventfd->size,
true, ioeventfd->data, &ioeventfd->e);
qemu_set_fd_handler(event_notifier_get_fd(&ioeventfd->e), NULL, NULL, NULL);
event_notifier_cleanup(&ioeventfd->e);
trace_vfio_ioeventfd_exit(memory_region_name(ioeventfd->mr),
(uint64_t)ioeventfd->addr, ioeventfd->size,
ioeventfd->data);
g_free(ioeventfd);
}

static void vfio_drop_dynamic_eventfds(VFIOPCIDevice *vdev, VFIOQuirk *quirk)
{
VFIOIOEventFD *ioeventfd, *tmp;

QLIST_FOREACH_SAFE(ioeventfd, &quirk->ioeventfds, next, tmp) {
if (ioeventfd->dynamic) {
vfio_ioeventfd_exit(ioeventfd);
}
}
}

static void vfio_ioeventfd_handler(void *opaque)
{
VFIOIOEventFD *ioeventfd = opaque;

if (event_notifier_test_and_clear(&ioeventfd->e)) {
vfio_region_write(ioeventfd->region, ioeventfd->region_addr,
ioeventfd->data, ioeventfd->size);
trace_vfio_ioeventfd_handler(memory_region_name(ioeventfd->mr),
(uint64_t)ioeventfd->addr, ioeventfd->size,
ioeventfd->data);
}
}

static VFIOIOEventFD *vfio_ioeventfd_init(VFIOPCIDevice *vdev,
MemoryRegion *mr, hwaddr addr,
unsigned size, uint64_t data,
VFIORegion *region,
hwaddr region_addr, bool dynamic)
{
VFIOIOEventFD *ioeventfd;

if (vdev->no_kvm_ioeventfd) {
return NULL;
}

ioeventfd = g_malloc0(sizeof(*ioeventfd));

if (event_notifier_init(&ioeventfd->e, 0)) {
g_free(ioeventfd);
return NULL;
}

/*
* MemoryRegion and relative offset, plus additional ioeventfd setup
* parameters for configuring and later tearing down KVM ioeventfd.
*/
ioeventfd->mr = mr;
ioeventfd->addr = addr;
ioeventfd->size = size;
ioeventfd->data = data;
ioeventfd->dynamic = dynamic;
/*
* VFIORegion and relative offset for implementing the userspace
* handler. data & size fields shared for both uses.
*/
ioeventfd->region = region;
ioeventfd->region_addr = region_addr;

qemu_set_fd_handler(event_notifier_get_fd(&ioeventfd->e),
vfio_ioeventfd_handler, NULL, ioeventfd);
memory_region_add_eventfd(ioeventfd->mr, ioeventfd->addr, ioeventfd->size,
true, ioeventfd->data, &ioeventfd->e);
trace_vfio_ioeventfd_init(memory_region_name(mr), (uint64_t)addr,
size, data);

return ioeventfd;
}

static void vfio_vga_probe_ati_3c3_quirk(VFIOPCIDevice *vdev)
{
VFIOQuirk *quirk;
Expand Down Expand Up @@ -719,6 +804,18 @@ static void vfio_probe_nvidia_bar5_quirk(VFIOPCIDevice *vdev, int nr)
trace_vfio_quirk_nvidia_bar5_probe(vdev->vbasedev.name);
}

typedef struct LastDataSet {
VFIOQuirk *quirk;
hwaddr addr;
uint64_t data;
unsigned size;
int hits;
int added;
} LastDataSet;

#define MAX_DYN_IOEVENTFD 10
#define HITS_FOR_IOEVENTFD 10

/*
* Finally, BAR0 itself. We want to redirect any accesses to either
* 0x1800 or 0x88000 through the PCI config space access functions.
Expand All @@ -729,6 +826,7 @@ static void vfio_nvidia_quirk_mirror_write(void *opaque, hwaddr addr,
VFIOConfigMirrorQuirk *mirror = opaque;
VFIOPCIDevice *vdev = mirror->vdev;
PCIDevice *pdev = &vdev->pdev;
LastDataSet *last = (LastDataSet *)&mirror->data;

vfio_generic_quirk_mirror_write(opaque, addr, data, size);

Expand All @@ -743,6 +841,49 @@ static void vfio_nvidia_quirk_mirror_write(void *opaque, hwaddr addr,
addr + mirror->offset, data, size);
trace_vfio_quirk_nvidia_bar0_msi_ack(vdev->vbasedev.name);
}

/*
* Automatically add an ioeventfd to handle any repeated write with the
* same data and size above the standard PCI config space header. This is
* primarily expected to accelerate the MSI-ACK behavior, such as noted
* above. Current hardware/drivers should trigger an ioeventfd at config
* offset 0x704 (region offset 0x88704), with data 0x0, size 4.
*
* The criteria of 10 successive hits is arbitrary but reliably adds the
* MSI-ACK region. Note that as some writes are bypassed via the ioeventfd,
* the remaining ones have a greater chance of being seen successively.
* To avoid the pathological case of burning up all of QEMU's open file
* handles, arbitrarily limit this algorithm from adding no more than 10
* ioeventfds, print an error if we would have added an 11th, and then
* stop counting.
*/
if (!vdev->no_kvm_ioeventfd &&
addr >= PCI_STD_HEADER_SIZEOF && last->added <= MAX_DYN_IOEVENTFD) {
if (addr != last->addr || data != last->data || size != last->size) {
last->addr = addr;
last->data = data;
last->size = size;
last->hits = 1;
} else if (++last->hits >= HITS_FOR_IOEVENTFD) {
if (last->added < MAX_DYN_IOEVENTFD) {
VFIOIOEventFD *ioeventfd;
ioeventfd = vfio_ioeventfd_init(vdev, mirror->mem, addr, size,
data, &vdev->bars[mirror->bar].region,
mirror->offset + addr, true);
if (ioeventfd) {
VFIOQuirk *quirk = last->quirk;

QLIST_INSERT_HEAD(&quirk->ioeventfds, ioeventfd, next);
last->added++;
}
} else {
last->added++;
warn_report("NVIDIA ioeventfd queue full for %s, unable to "
"accelerate 0x%"HWADDR_PRIx", data 0x%"PRIx64", "
"size %u", vdev->vbasedev.name, addr, data, size);
}
}
}
}

static const MemoryRegionOps vfio_nvidia_mirror_quirk = {
Expand All @@ -751,10 +892,21 @@ static const MemoryRegionOps vfio_nvidia_mirror_quirk = {
.endianness = DEVICE_LITTLE_ENDIAN,
};

static void vfio_nvidia_bar0_quirk_reset(VFIOPCIDevice *vdev, VFIOQuirk *quirk)
{
VFIOConfigMirrorQuirk *mirror = quirk->data;
LastDataSet *last = (LastDataSet *)&mirror->data;

last->addr = last->data = last->size = last->hits = last->added = 0;

vfio_drop_dynamic_eventfds(vdev, quirk);
}

static void vfio_probe_nvidia_bar0_quirk(VFIOPCIDevice *vdev, int nr)
{
VFIOQuirk *quirk;
VFIOConfigMirrorQuirk *mirror;
LastDataSet *last;

if (vdev->no_geforce_quirks ||
!vfio_pci_is(vdev, PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID) ||
Expand All @@ -763,11 +915,14 @@ static void vfio_probe_nvidia_bar0_quirk(VFIOPCIDevice *vdev, int nr)
}

quirk = vfio_quirk_alloc(1);
mirror = quirk->data = g_malloc0(sizeof(*mirror));
quirk->reset = vfio_nvidia_bar0_quirk_reset;
mirror = quirk->data = g_malloc0(sizeof(*mirror) + sizeof(LastDataSet));
mirror->mem = quirk->mem;
mirror->vdev = vdev;
mirror->offset = 0x88000;
mirror->bar = nr;
last = (LastDataSet *)&mirror->data;
last->quirk = quirk;

memory_region_init_io(mirror->mem, OBJECT(vdev),
&vfio_nvidia_mirror_quirk, mirror,
Expand All @@ -781,11 +936,14 @@ static void vfio_probe_nvidia_bar0_quirk(VFIOPCIDevice *vdev, int nr)
/* The 0x1800 offset mirror only seems to get used by legacy VGA */
if (vdev->vga) {
quirk = vfio_quirk_alloc(1);
mirror = quirk->data = g_malloc0(sizeof(*mirror));
quirk->reset = vfio_nvidia_bar0_quirk_reset;
mirror = quirk->data = g_malloc0(sizeof(*mirror) + sizeof(LastDataSet));
mirror->mem = quirk->mem;
mirror->vdev = vdev;
mirror->offset = 0x1800;
mirror->bar = nr;
last = (LastDataSet *)&mirror->data;
last->quirk = quirk;

memory_region_init_io(mirror->mem, OBJECT(vdev),
&vfio_nvidia_mirror_quirk, mirror,
Expand Down Expand Up @@ -1668,6 +1826,10 @@ void vfio_bar_quirk_exit(VFIOPCIDevice *vdev, int nr)
int i;

QLIST_FOREACH(quirk, &bar->quirks, next) {
while (!QLIST_EMPTY(&quirk->ioeventfds)) {
vfio_ioeventfd_exit(QLIST_FIRST(&quirk->ioeventfds));
}

for (i = 0; i < quirk->nr_mem; i++) {
memory_region_del_subregion(bar->region.mem, &quirk->mem[i]);
}
Expand Down
2 changes: 2 additions & 0 deletions hw/vfio/pci.c
Expand Up @@ -3175,6 +3175,8 @@ static Property vfio_pci_dev_properties[] = {
DEFINE_PROP_BOOL("x-no-kvm-msix", VFIOPCIDevice, no_kvm_msix, false),
DEFINE_PROP_BOOL("x-no-geforce-quirks", VFIOPCIDevice,
no_geforce_quirks, false),
DEFINE_PROP_BOOL("x-no-kvm-ioeventfd", VFIOPCIDevice, no_kvm_ioeventfd,
false),
DEFINE_PROP_UINT32("x-pci-vendor-id", VFIOPCIDevice, vendor_id, PCI_ANY_ID),
DEFINE_PROP_UINT32("x-pci-device-id", VFIOPCIDevice, device_id, PCI_ANY_ID),
DEFINE_PROP_UINT32("x-pci-sub-vendor-id", VFIOPCIDevice,
Expand Down
14 changes: 14 additions & 0 deletions hw/vfio/pci.h
Expand Up @@ -24,9 +24,22 @@

struct VFIOPCIDevice;

typedef struct VFIOIOEventFD {
QLIST_ENTRY(VFIOIOEventFD) next;
MemoryRegion *mr;
hwaddr addr;
unsigned size;
uint64_t data;
EventNotifier e;
VFIORegion *region;
hwaddr region_addr;
bool dynamic; /* Added runtime, removed on device reset */
} VFIOIOEventFD;

typedef struct VFIOQuirk {
QLIST_ENTRY(VFIOQuirk) next;
void *data;
QLIST_HEAD(, VFIOIOEventFD) ioeventfds;
int nr_mem;
MemoryRegion *mem;
void (*reset)(struct VFIOPCIDevice *vdev, struct VFIOQuirk *quirk);
Expand Down Expand Up @@ -149,6 +162,7 @@ typedef struct VFIOPCIDevice {
bool no_kvm_msi;
bool no_kvm_msix;
bool no_geforce_quirks;
bool no_kvm_ioeventfd;
VFIODisplay *dpy;
} VFIOPCIDevice;

Expand Down
3 changes: 3 additions & 0 deletions hw/vfio/trace-events
Expand Up @@ -77,6 +77,9 @@ vfio_quirk_ati_bonaire_reset_no_smc(const char *name) "%s"
vfio_quirk_ati_bonaire_reset_timeout(const char *name) "%s"
vfio_quirk_ati_bonaire_reset_done(const char *name) "%s"
vfio_quirk_ati_bonaire_reset(const char *name) "%s"
vfio_ioeventfd_exit(const char *name, uint64_t addr, unsigned size, uint64_t data) "%s+0x%"PRIx64"[%d]:0x%"PRIx64
vfio_ioeventfd_handler(const char *name, uint64_t addr, unsigned size, uint64_t data) "%s+0x%"PRIx64"[%d] -> 0x%"PRIx64
vfio_ioeventfd_init(const char *name, uint64_t addr, unsigned size, uint64_t data) "%s+0x%"PRIx64"[%d]:0x%"PRIx64
vfio_pci_igd_bar4_write(const char *name, uint32_t index, uint32_t data, uint32_t base) "%s [0x%03x] 0x%08x -> 0x%08x"
vfio_pci_igd_bdsm_enabled(const char *name, int size) "%s %dMB"
vfio_pci_igd_opregion_enabled(const char *name) "%s"
Expand Down

0 comments on commit c958c51

Please sign in to comment.