Browse files

Merge tag 'kvm-3.6-1' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull KVM updates from Avi Kivity:
 "Highlights include
  - full big real mode emulation on pre-Westmere Intel hosts (can be
    disabled with emulate_invalid_guest_state=0)
  - relatively small ppc and s390 updates
  - PCID/INVPCID support in guests
  - EOI avoidance (3.6 guests should perform better on 3.6 hosts on
    interrupt intensive workloads)
  - Lockless write faults during live migration
  - EPT accessed/dirty bits support for new Intel processors"

Fix up conflicts in:
 - Documentation/virtual/kvm/api.txt:

   Stupid subchapter numbering, added next to each other.

 - arch/powerpc/kvm/booke_interrupts.S:

   PPC asm changes clashing with the KVM fixes

 - arch/s390/include/asm/sigp.h, arch/s390/kvm/sigp.c:

   Duplicated commits through the kvm tree and the s390 tree, with
   subsequent edits in the KVM tree.

* tag 'kvm-3.6-1' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (93 commits)
  KVM: fix race with level interrupts
  x86, hyper: fix build with !CONFIG_KVM_GUEST
  Revert "apic: fix kvm build on UP without IOAPIC"
  KVM guest: switch to apic_set_eoi_write, apic_write
  apic: add apic_set_eoi_write for PV use
  KVM: VMX: Implement PCID/INVPCID for guests with EPT
  KVM: Add x86_hyper_kvm to complete detect_hypervisor_platform check
  KVM: PPC: Critical interrupt emulation support
  KVM: PPC: e500mc: Fix tlbilx emulation for 64-bit guests
  KVM: PPC64: booke: Set interrupt computation mode for 64-bit host
  KVM: PPC: bookehv: Add ESR flag to Data Storage Interrupt
  KVM: PPC: bookehv64: Add support for std/ld emulation.
  booke: Added crit/mc exception handler for e500v2
  booke/bookehv: Add host crit-watchdog exception support
  KVM: MMU: document mmu-lock and fast page fault
  KVM: MMU: fix kvm_mmu_pagetable_walk tracepoint
  KVM: MMU: trace fast page fault
  KVM: MMU: fast path of handling guest page fault
  KVM: MMU: introduce SPTE_MMU_WRITEABLE bit
  KVM: MMU: fold tlb flush judgement into mmu_spte_update
  ...
  • Loading branch information...
2 parents 3c4cfad + 1a577b7 commit 5fecc9d8f59e765c2a48379dd7c6f5cf88c7d75a @torvalds torvalds committed Jul 24, 2012
Showing with 1,914 additions and 519 deletions.
  1. +34 −0 Documentation/virtual/kvm/api.txt
  2. +129 −1 Documentation/virtual/kvm/locking.txt
  3. +33 −0 Documentation/virtual/kvm/msr.txt
  4. +0 −2 Documentation/virtual/kvm/ppc-pv.txt
  5. +1 −1 MAINTAINERS
  6. +1 −0 arch/ia64/include/asm/kvm.h
  7. +1 −0 arch/ia64/kvm/Kconfig
  8. +2 −0 arch/powerpc/include/asm/epapr_hcalls.h
  9. +2 −0 arch/powerpc/include/asm/hw_irq.h
  10. +2 −5 arch/powerpc/include/asm/kvm_book3s_64.h
  11. +6 −0 arch/powerpc/include/asm/kvm_host.h
  12. +2 −1 arch/powerpc/include/asm/kvm_ppc.h
  13. +1 −0 arch/powerpc/kernel/Makefile
  14. +25 −0 arch/powerpc/kernel/epapr_hcalls.S
  15. +52 −0 arch/powerpc/kernel/epapr_paravirt.c
  16. +3 −25 arch/powerpc/kernel/kvm.c
  17. +1 −11 arch/powerpc/kernel/kvm_emul.S
  18. +95 −28 arch/powerpc/kvm/book3s_64_mmu_hv.c
  19. +28 −12 arch/powerpc/kvm/book3s_hv.c
  20. +4 −1 arch/powerpc/kvm/book3s_hv_builtin.c
  21. +8 −7 arch/powerpc/kvm/book3s_hv_rm_mmu.c
  22. +26 −0 arch/powerpc/kvm/booke.c
  23. +28 −0 arch/powerpc/kvm/booke_emulate.c
  24. +27 −28 arch/powerpc/kvm/booke_interrupts.S
  25. +1 −1 arch/powerpc/kvm/bookehv_interrupts.S
  26. +3 −0 arch/powerpc/kvm/e500_emulate.c
  27. +6 −2 arch/powerpc/kvm/e500mc.c
  28. +16 −0 arch/powerpc/kvm/emulate.c
  29. +18 −0 arch/powerpc/kvm/powerpc.c
  30. +9 −0 arch/powerpc/platforms/Kconfig
  31. +2 −0 arch/s390/include/asm/sclp.h
  32. +1 −0 arch/s390/include/asm/sigp.h
  33. +9 −3 arch/s390/kernel/setup.c
  34. +1 −0 arch/s390/kvm/kvm-s390.c
  35. +42 −35 arch/s390/kvm/sigp.c
  36. +3 −0 arch/x86/include/asm/apic.h
  37. +7 −0 arch/x86/include/asm/bitops.h
  38. +1 −0 arch/x86/include/asm/hypervisor.h
  39. +1 −0 arch/x86/include/asm/kvm.h
  40. +3 −3 arch/x86/include/asm/kvm_emulate.h
  41. +29 −2 arch/x86/include/asm/kvm_host.h
  42. +7 −0 arch/x86/include/asm/kvm_para.h
  43. +2 −0 arch/x86/include/asm/processor-flags.h
  44. +6 −0 arch/x86/include/asm/vmx.h
  45. +17 −0 arch/x86/kernel/apic/apic.c
  46. +3 −0 arch/x86/kernel/cpu/hypervisor.c
  47. +61 −3 arch/x86/kernel/kvm.c
  48. +26 −20 arch/x86/kvm/cpuid.c
  49. +9 −0 arch/x86/kvm/cpuid.h
  50. +219 −54 arch/x86/kvm/emulate.c
  51. +14 −3 arch/x86/kvm/i8259.c
  52. +188 −6 arch/x86/kvm/lapic.c
  53. +11 −0 arch/x86/kvm/lapic.h
  54. +261 −98 arch/x86/kvm/mmu.c
  55. +41 −4 arch/x86/kvm/mmutrace.h
  56. +1 −2 arch/x86/kvm/paging_tmpl.h
  57. +9 −3 arch/x86/kvm/svm.c
  58. +34 −0 arch/x86/kvm/trace.h
  59. +152 −37 arch/x86/kvm/vmx.c
  60. +67 −56 arch/x86/kvm/x86.c
  61. +0 −10 drivers/s390/char/sclp.c
  62. +10 −0 drivers/s390/char/sclp.h
  63. +38 −0 drivers/s390/char/sclp_cmd.c
  64. +2 −1 drivers/s390/kvm/kvm_virtio.c
  65. +3 −0 include/linux/kvm.h
  66. +18 −9 include/linux/kvm_host.h
  67. +5 −2 include/trace/events/kvm.h
  68. +16 −3 virt/kvm/ioapic.c
  69. +3 −1 virt/kvm/ioapic.h
  70. +4 −27 virt/kvm/irq_comm.c
  71. +24 −12 virt/kvm/kvm_main.c
View
34 Documentation/virtual/kvm/api.txt
@@ -1946,6 +1946,40 @@ the guest using the specified gsi pin. The irqfd is removed using
the KVM_IRQFD_FLAG_DEASSIGN flag, specifying both kvm_irqfd.fd
and kvm_irqfd.gsi.
+4.76 KVM_PPC_ALLOCATE_HTAB
+
+Capability: KVM_CAP_PPC_ALLOC_HTAB
+Architectures: powerpc
+Type: vm ioctl
+Parameters: Pointer to u32 containing hash table order (in/out)
+Returns: 0 on success, -1 on error
+
+This requests the host kernel to allocate an MMU hash table for a
+guest using the PAPR paravirtualization interface. This only does
+anything if the kernel is configured to use the Book 3S HV style of
+virtualization. Otherwise the capability doesn't exist and the ioctl
+returns an ENOTTY error. The rest of this description assumes Book 3S
+HV.
+
+There must be no vcpus running when this ioctl is called; if there
+are, it will do nothing and return an EBUSY error.
+
+The parameter is a pointer to a 32-bit unsigned integer variable
+containing the order (log base 2) of the desired size of the hash
+table, which must be between 18 and 46. On successful return from the
+ioctl, it will have been updated with the order of the hash table that
+was allocated.
+
+If no hash table has been allocated when any vcpu is asked to run
+(with the KVM_RUN ioctl), the host kernel will allocate a
+default-sized hash table (16 MB).
+
+If this ioctl is called when a hash table has already been allocated,
+the kernel will clear out the existing hash table (zero all HPTEs) and
+return the hash table order in the parameter. (If the guest is using
+the virtualized real-mode area (VRMA) facility, the kernel will
+re-create the VRMA HPTEs on the next KVM_RUN of any vcpu.)
+
5. The kvm_run structure
------------------------
View
130 Documentation/virtual/kvm/locking.txt
@@ -6,7 +6,129 @@ KVM Lock Overview
(to be written)
-2. Reference
+2: Exception
+------------
+
+Fast page fault:
+
+Fast page fault is the fast path which fixes the guest page fault outside
+of the mmu-lock on x86. Currently, a page fault can take the fast path only
+if the shadow page table is present and the fault is caused by
+write-protection, which means we just need to change the W bit of the spte.
+
+What we use to avoid all the races are the SPTE_HOST_WRITEABLE bit and
+SPTE_MMU_WRITEABLE bit on the spte:
+- SPTE_HOST_WRITEABLE means the gfn is writable on host.
+- SPTE_MMU_WRITEABLE means the gfn is writable on mmu. The bit is set when
+ the gfn is writable on guest mmu and it is not write-protected by shadow
+ page write-protection.
+
+On the fast page fault path, we use cmpxchg to atomically set the spte W
+bit if spte.SPTE_HOST_WRITEABLE = 1 and spte.SPTE_WRITE_PROTECT = 1. This
+is safe because any change to these bits can be detected by cmpxchg.
+
+But we need to carefully check these cases:
+1): The mapping from gfn to pfn
+The mapping from gfn to pfn may be changed since we can only ensure the pfn
+is not changed during cmpxchg. This is an ABA problem; for example, the case
+below can happen:
+
+At the beginning:
+gpte = gfn1
+gfn1 is mapped to pfn1 on host
+spte is the shadow page table entry corresponding with gpte and
+spte = pfn1
+
+        VCPU 0                           VCPU 1
+on fast page fault path:
+
+   old_spte = *spte;
+                                 pfn1 is swapped out:
+                                    spte = 0;
+
+                                 pfn1 is re-alloced for gfn2.
+
+                                 gpte is changed to point to
+                                 gfn2 by the guest:
+                                    spte = pfn1;
+
+   if (cmpxchg(spte, old_spte, old_spte+W))
+        mark_page_dirty(vcpu->kvm, gfn1)
+             OOPS!!!
+
+We dirty-log gfn1, which means the write to gfn2 is lost in the dirty bitmap.
+
+For direct sp, we can easily avoid it since the spte of direct sp is fixed
+to gfn. For indirect sp, before we do cmpxchg, we call gfn_to_pfn_atomic()
+to pin gfn to pfn, because after gfn_to_pfn_atomic():
+- We have held the refcount of pfn that means the pfn can not be freed and
+ be reused for another gfn.
+- The pfn is writable that means it can not be shared between different gfns
+ by KSM.
+
+Then, we can ensure the dirty bitmap is correctly set for a gfn.
+
+Currently, to keep things simple, we disable fast page fault for
+indirect shadow pages.
+
+2): Dirty bit tracking
+In the original code, the spte can be fast updated (non-atomically) if the
+spte is read-only and the Accessed bit has already been set since the
+Accessed bit and Dirty bit can not be lost.
+
+But it is not true after fast page fault since the spte can be marked
+writable between reading spte and updating spte, as in the case below:
+
+At the beginning:
+spte.W = 0
+spte.Accessed = 1
+
+        VCPU 0                                    VCPU 1
+In mmu_spte_clear_track_bits():
+
+   old_spte = *spte;
+
+   /* 'if' condition is satisfied. */
+   if (old_spte.Accessed == 1 &&
+        old_spte.W == 0)
+      spte = 0ull;
+                                         on fast page fault path:
+                                             spte.W = 1
+                                         memory write on the spte:
+                                             spte.Dirty = 1
+
+
+   else
+      old_spte = xchg(spte, 0ull)
+
+
+   if (old_spte.Accessed == 1)
+      kvm_set_pfn_accessed(spte.pfn);
+   if (old_spte.Dirty == 1)
+      kvm_set_pfn_dirty(spte.pfn);
+      OOPS!!!
+
+The Dirty bit is lost in this case.
+
+In order to avoid this kind of issue, we always treat the spte as "volatile"
+if it can be updated out of mmu-lock (see spte_has_volatile_bits()); this
+means the spte is always atomically updated in this case.
+
+3): flush tlbs due to spte updated
+If the spte is updated from writable to readonly, we should flush all TLBs,
+otherwise rmap_write_protect will find a read-only spte, even though the
+writable spte might be cached on a CPU's TLB.
+
+As mentioned before, the spte can be updated to writable out of mmu-lock on
+the fast page fault path. To make this path easy to audit, we check whether
+TLBs need to be flushed for this reason in mmu_spte_update(), since this is
+the common function used to update an spte (present -> present).
+
+Since the spte is "volatile" if it can be updated out of mmu-lock, we always
+atomically update the spte, so the race caused by fast page fault is avoided.
+See the comments in spte_has_volatile_bits() and mmu_spte_update().
+
+3. Reference
------------
Name: kvm_lock
@@ -23,3 +145,9 @@ Arch: x86
Protects: - kvm_arch::{last_tsc_write,last_tsc_nsec,last_tsc_offset}
- tsc offset in vmcb
Comment: 'raw' because updating the tsc offsets must not be preempted.
+
+Name: kvm->mmu_lock
+Type: spinlock_t
+Arch: any
+Protects: -shadow page/shadow tlb entry
+Comment: it is a spinlock since it is used in mmu notifier.
View
33 Documentation/virtual/kvm/msr.txt
@@ -223,3 +223,36 @@ MSR_KVM_STEAL_TIME: 0x4b564d03
steal: the amount of time in which this vCPU did not run, in
nanoseconds. Time during which the vcpu is idle, will not be
reported as steal time.
+
+MSR_KVM_EOI_EN: 0x4b564d04
+ data: Bit 0 is 1 when PV end of interrupt is enabled on the vcpu; 0
+ when disabled. Bit 1 is reserved and must be zero. When PV end of
+ interrupt is enabled (bit 0 set), bits 63-2 hold a 4-byte aligned
+ physical address of a 4 byte memory area which must be in guest RAM and
+ must be zeroed.
+
+ The first, least significant bit of the 4 byte memory location will be
+ written to by the hypervisor, typically at the time of interrupt
+ injection. Value of 1 means that guest can skip writing EOI to the apic
+ (using MSR or MMIO write); instead, it is sufficient to signal
+ EOI by clearing the bit in guest memory - this location will
+ later be polled by the hypervisor.
+ Value of 0 means that the EOI write is required.
+
+ It is always safe for the guest to ignore the optimization and perform
+ the APIC EOI write anyway.
+
+ Hypervisor is guaranteed to only modify this least
+ significant bit while in the current VCPU context, this means that
+ guest does not need to use either lock prefix or memory ordering
+ primitives to synchronise with the hypervisor.
+
+ However, the hypervisor can set and clear this memory bit at any time.
+ The hypervisor could therefore interrupt the guest and clear the least
+ significant bit in the memory area in the window between the guest
+ testing the bit (to detect whether it can skip the EOI apic write) and
+ the guest clearing it (to signal EOI to the hypervisor). To close this
+ window, the guest must both read the least significant bit in the
+ memory area and clear it using a single CPU instruction, such as test
+ and clear, or compare and exchange.
View
2 Documentation/virtual/kvm/ppc-pv.txt
@@ -109,8 +109,6 @@ The following bits are safe to be set inside the guest:
MSR_EE
MSR_RI
- MSR_CR
- MSR_ME
If any other bit changes in the MSR, please still use mtmsr(d).
View
2 MAINTAINERS
@@ -4002,8 +4002,8 @@ F: arch/ia64/include/asm/kvm*
F: arch/ia64/kvm/
KERNEL VIRTUAL MACHINE for s390 (KVM/s390)
-M: Carsten Otte <cotte@de.ibm.com>
M: Christian Borntraeger <borntraeger@de.ibm.com>
+M: Cornelia Huck <cornelia.huck@de.ibm.com>
M: linux390@de.ibm.com
L: linux-s390@vger.kernel.org
W: http://www.ibm.com/developerworks/linux/linux390/
View
1 arch/ia64/include/asm/kvm.h
@@ -26,6 +26,7 @@
/* Select x86 specific features in <linux/kvm.h> */
#define __KVM_HAVE_IOAPIC
+#define __KVM_HAVE_IRQ_LINE
#define __KVM_HAVE_DEVICE_ASSIGNMENT
/* Architectural interrupt line count. */
View
1 arch/ia64/kvm/Kconfig
@@ -19,6 +19,7 @@ if VIRTUALIZATION
config KVM
tristate "Kernel-based Virtual Machine (KVM) support"
+ depends on BROKEN
depends on HAVE_KVM && MODULES && EXPERIMENTAL
# for device assignment:
depends on PCI
View
2 arch/powerpc/include/asm/epapr_hcalls.h
@@ -153,6 +153,8 @@
#define EV_HCALL_CLOBBERS2 EV_HCALL_CLOBBERS3, "r5"
#define EV_HCALL_CLOBBERS1 EV_HCALL_CLOBBERS2, "r4"
+extern bool epapr_paravirt_enabled;
+extern u32 epapr_hypercall_start[];
/*
* We use "uintptr_t" to define a register because it's guaranteed to be a
View
2 arch/powerpc/include/asm/hw_irq.h
@@ -34,6 +34,8 @@ extern void __replay_interrupt(unsigned int vector);
extern void timer_interrupt(struct pt_regs *);
extern void performance_monitor_exception(struct pt_regs *regs);
+extern void WatchdogException(struct pt_regs *regs);
+extern void unknown_exception(struct pt_regs *regs);
#ifdef CONFIG_PPC64
#include <asm/paca.h>
View
7 arch/powerpc/include/asm/kvm_book3s_64.h
@@ -36,11 +36,8 @@ static inline void svcpu_put(struct kvmppc_book3s_shadow_vcpu *svcpu)
#define SPAPR_TCE_SHIFT 12
#ifdef CONFIG_KVM_BOOK3S_64_HV
-/* For now use fixed-size 16MB page table */
-#define HPT_ORDER 24
-#define HPT_NPTEG (1ul << (HPT_ORDER - 7)) /* 128B per pteg */
-#define HPT_NPTE (HPT_NPTEG << 3) /* 8 PTEs per PTEG */
-#define HPT_HASH_MASK (HPT_NPTEG - 1)
+#define KVM_DEFAULT_HPT_ORDER 24 /* 16MB HPT by default */
+extern int kvm_hpt_order; /* order of preallocated HPTs */
#endif
#define VRMA_VSID 0x1ffffffUL /* 1TB VSID reserved for VRMA */
View
6 arch/powerpc/include/asm/kvm_host.h
@@ -237,6 +237,10 @@ struct kvm_arch {
unsigned long vrma_slb_v;
int rma_setup_done;
int using_mmu_notifiers;
+ u32 hpt_order;
+ atomic_t vcpus_running;
+ unsigned long hpt_npte;
+ unsigned long hpt_mask;
spinlock_t slot_phys_lock;
unsigned long *slot_phys[KVM_MEM_SLOTS_NUM];
int slot_npages[KVM_MEM_SLOTS_NUM];
@@ -414,7 +418,9 @@ struct kvm_vcpu_arch {
ulong mcsrr1;
ulong mcsr;
u32 dec;
+#ifdef CONFIG_BOOKE
u32 decar;
+#endif
u32 tbl;
u32 tbu;
u32 tcr;
View
3 arch/powerpc/include/asm/kvm_ppc.h
@@ -119,7 +119,8 @@ extern void kvmppc_core_destroy_mmu(struct kvm_vcpu *vcpu);
extern int kvmppc_kvm_pv(struct kvm_vcpu *vcpu);
extern void kvmppc_map_magic(struct kvm_vcpu *vcpu);
-extern long kvmppc_alloc_hpt(struct kvm *kvm);
+extern long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp);
+extern long kvmppc_alloc_reset_hpt(struct kvm *kvm, u32 *htab_orderp);
extern void kvmppc_free_hpt(struct kvm *kvm);
extern long kvmppc_prepare_vrma(struct kvm *kvm,
struct kvm_userspace_memory_region *mem);
View
1 arch/powerpc/kernel/Makefile
@@ -128,6 +128,7 @@ ifneq ($(CONFIG_XMON)$(CONFIG_KEXEC),)
obj-y += ppc_save_regs.o
endif
+obj-$(CONFIG_EPAPR_PARAVIRT) += epapr_paravirt.o epapr_hcalls.o
obj-$(CONFIG_KVM_GUEST) += kvm.o kvm_emul.o
# Disable GCOV in odd or sensitive code
View
25 arch/powerpc/kernel/epapr_hcalls.S
@@ -0,0 +1,25 @@
+/*
+ * Copyright (C) 2012 Freescale Semiconductor, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/threads.h>
+#include <asm/reg.h>
+#include <asm/page.h>
+#include <asm/cputable.h>
+#include <asm/thread_info.h>
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+
+/* Hypercall entry point. Will be patched with device tree instructions. */
+.global epapr_hypercall_start
+epapr_hypercall_start:
+ li r3, -1
+ nop
+ nop
+ nop
+ blr
View
52 arch/powerpc/kernel/epapr_paravirt.c
@@ -0,0 +1,52 @@
+/*
+ * ePAPR para-virtualization support.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * Copyright (C) 2012 Freescale Semiconductor, Inc.
+ */
+
+#include <linux/of.h>
+#include <asm/epapr_hcalls.h>
+#include <asm/cacheflush.h>
+#include <asm/code-patching.h>
+
+bool epapr_paravirt_enabled;
+
+static int __init epapr_paravirt_init(void)
+{
+ struct device_node *hyper_node;
+ const u32 *insts;
+ int len, i;
+
+ hyper_node = of_find_node_by_path("/hypervisor");
+ if (!hyper_node)
+ return -ENODEV;
+
+ insts = of_get_property(hyper_node, "hcall-instructions", &len);
+ if (!insts)
+ return -ENODEV;
+
+ if (len % 4 || len > (4 * 4))
+ return -ENODEV;
+
+ for (i = 0; i < (len / 4); i++)
+ patch_instruction(epapr_hypercall_start + i, insts[i]);
+
+ epapr_paravirt_enabled = true;
+
+ return 0;
+}
+
+early_initcall(epapr_paravirt_init);
View
28 arch/powerpc/kernel/kvm.c
@@ -31,6 +31,7 @@
#include <asm/cacheflush.h>
#include <asm/disassemble.h>
#include <asm/ppc-opcode.h>
+#include <asm/epapr_hcalls.h>
#define KVM_MAGIC_PAGE (-4096L)
#define magic_var(x) KVM_MAGIC_PAGE + offsetof(struct kvm_vcpu_arch_shared, x)
@@ -726,7 +727,7 @@ unsigned long kvm_hypercall(unsigned long *in,
unsigned long register r11 asm("r11") = nr;
unsigned long register r12 asm("r12");
- asm volatile("bl kvm_hypercall_start"
+ asm volatile("bl epapr_hypercall_start"
: "=r"(r0), "=r"(r3), "=r"(r4), "=r"(r5), "=r"(r6),
"=r"(r7), "=r"(r8), "=r"(r9), "=r"(r10), "=r"(r11),
"=r"(r12)
@@ -747,29 +748,6 @@ unsigned long kvm_hypercall(unsigned long *in,
}
EXPORT_SYMBOL_GPL(kvm_hypercall);
-static int kvm_para_setup(void)
-{
- extern u32 kvm_hypercall_start;
- struct device_node *hyper_node;
- u32 *insts;
- int len, i;
-
- hyper_node = of_find_node_by_path("/hypervisor");
- if (!hyper_node)
- return -1;
-
- insts = (u32*)of_get_property(hyper_node, "hcall-instructions", &len);
- if (len % 4)
- return -1;
- if (len > (4 * 4))
- return -1;
-
- for (i = 0; i < (len / 4); i++)
- kvm_patch_ins(&(&kvm_hypercall_start)[i], insts[i]);
-
- return 0;
-}
-
static __init void kvm_free_tmp(void)
{
unsigned long start, end;
@@ -791,7 +769,7 @@ static int __init kvm_guest_init(void)
if (!kvm_para_available())
goto free_tmp;
- if (kvm_para_setup())
+ if (!epapr_paravirt_enabled)
goto free_tmp;
if (kvm_para_has_feature(KVM_FEATURE_MAGIC_PAGE))
View
12 arch/powerpc/kernel/kvm_emul.S
@@ -24,16 +24,6 @@
#include <asm/page.h>
#include <asm/asm-offsets.h>
-/* Hypercall entry point. Will be patched with device tree instructions. */
-
-.global kvm_hypercall_start
-kvm_hypercall_start:
- li r3, -1
- nop
- nop
- nop
- blr
-
#define KVM_MAGIC_PAGE (-4096)
#ifdef CONFIG_64BIT
@@ -132,7 +122,7 @@ kvm_emulate_mtmsrd_len:
.long (kvm_emulate_mtmsrd_end - kvm_emulate_mtmsrd) / 4
-#define MSR_SAFE_BITS (MSR_EE | MSR_CE | MSR_ME | MSR_RI)
+#define MSR_SAFE_BITS (MSR_EE | MSR_RI)
#define MSR_CRITICAL_BITS ~MSR_SAFE_BITS
.global kvm_emulate_mtmsr
View
123 arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -37,64 +37,130 @@
/* POWER7 has 10-bit LPIDs, PPC970 has 6-bit LPIDs */
#define MAX_LPID_970 63
-long kvmppc_alloc_hpt(struct kvm *kvm)
+/* The Power architecture requires that the HPT be at least 256kB */
+#define PPC_MIN_HPT_ORDER 18
+
+long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp)
{
unsigned long hpt;
- long lpid;
struct revmap_entry *rev;
struct kvmppc_linear_info *li;
+ long order = kvm_hpt_order;
- /* Allocate guest's hashed page table */
- li = kvm_alloc_hpt();
- if (li) {
- /* using preallocated memory */
- hpt = (ulong)li->base_virt;
- kvm->arch.hpt_li = li;
- } else {
- /* using dynamic memory */
+ if (htab_orderp) {
+ order = *htab_orderp;
+ if (order < PPC_MIN_HPT_ORDER)
+ order = PPC_MIN_HPT_ORDER;
+ }
+
+ /*
+ * If the user wants a different size from default,
+ * try first to allocate it from the kernel page allocator.
+ */
+ hpt = 0;
+ if (order != kvm_hpt_order) {
hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_REPEAT|
- __GFP_NOWARN, HPT_ORDER - PAGE_SHIFT);
+ __GFP_NOWARN, order - PAGE_SHIFT);
+ if (!hpt)
+ --order;
}
+ /* Next try to allocate from the preallocated pool */
if (!hpt) {
- pr_err("kvm_alloc_hpt: Couldn't alloc HPT\n");
- return -ENOMEM;
+ li = kvm_alloc_hpt();
+ if (li) {
+ hpt = (ulong)li->base_virt;
+ kvm->arch.hpt_li = li;
+ order = kvm_hpt_order;
+ }
}
+
+ /* Lastly try successively smaller sizes from the page allocator */
+ while (!hpt && order > PPC_MIN_HPT_ORDER) {
+ hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_REPEAT|
+ __GFP_NOWARN, order - PAGE_SHIFT);
+ if (!hpt)
+ --order;
+ }
+
+ if (!hpt)
+ return -ENOMEM;
+
kvm->arch.hpt_virt = hpt;
+ kvm->arch.hpt_order = order;
+ /* HPTEs are 2**4 bytes long */
+ kvm->arch.hpt_npte = 1ul << (order - 4);
+ /* 128 (2**7) bytes in each HPTEG */
+ kvm->arch.hpt_mask = (1ul << (order - 7)) - 1;
/* Allocate reverse map array */
- rev = vmalloc(sizeof(struct revmap_entry) * HPT_NPTE);
+ rev = vmalloc(sizeof(struct revmap_entry) * kvm->arch.hpt_npte);
if (!rev) {
pr_err("kvmppc_alloc_hpt: Couldn't alloc reverse map array\n");
goto out_freehpt;
}
kvm->arch.revmap = rev;
+ kvm->arch.sdr1 = __pa(hpt) | (order - 18);
- lpid = kvmppc_alloc_lpid();
- if (lpid < 0)
- goto out_freeboth;
+ pr_info("KVM guest htab at %lx (order %ld), LPID %x\n",
+ hpt, order, kvm->arch.lpid);
- kvm->arch.sdr1 = __pa(hpt) | (HPT_ORDER - 18);
- kvm->arch.lpid = lpid;
-
- pr_info("KVM guest htab at %lx, LPID %lx\n", hpt, lpid);
+ if (htab_orderp)
+ *htab_orderp = order;
return 0;
- out_freeboth:
- vfree(rev);
out_freehpt:
- free_pages(hpt, HPT_ORDER - PAGE_SHIFT);
+ if (kvm->arch.hpt_li)
+ kvm_release_hpt(kvm->arch.hpt_li);
+ else
+ free_pages(hpt, order - PAGE_SHIFT);
return -ENOMEM;
}
+long kvmppc_alloc_reset_hpt(struct kvm *kvm, u32 *htab_orderp)
+{
+ long err = -EBUSY;
+ long order;
+
+ mutex_lock(&kvm->lock);
+ if (kvm->arch.rma_setup_done) {
+ kvm->arch.rma_setup_done = 0;
+ /* order rma_setup_done vs. vcpus_running */
+ smp_mb();
+ if (atomic_read(&kvm->arch.vcpus_running)) {
+ kvm->arch.rma_setup_done = 1;
+ goto out;
+ }
+ }
+ if (kvm->arch.hpt_virt) {
+ order = kvm->arch.hpt_order;
+ /* Set the entire HPT to 0, i.e. invalid HPTEs */
+ memset((void *)kvm->arch.hpt_virt, 0, 1ul << order);
+ /*
+ * Set the whole last_vcpu array to an invalid vcpu number.
+ * This ensures that each vcpu will flush its TLB on next entry.
+ */
+ memset(kvm->arch.last_vcpu, 0xff, sizeof(kvm->arch.last_vcpu));
+ *htab_orderp = order;
+ err = 0;
+ } else {
+ err = kvmppc_alloc_hpt(kvm, htab_orderp);
+ order = *htab_orderp;
+ }
+ out:
+ mutex_unlock(&kvm->lock);
+ return err;
+}
+
void kvmppc_free_hpt(struct kvm *kvm)
{
kvmppc_free_lpid(kvm->arch.lpid);
vfree(kvm->arch.revmap);
if (kvm->arch.hpt_li)
kvm_release_hpt(kvm->arch.hpt_li);
else
- free_pages(kvm->arch.hpt_virt, HPT_ORDER - PAGE_SHIFT);
+ free_pages(kvm->arch.hpt_virt,
+ kvm->arch.hpt_order - PAGE_SHIFT);
}
/* Bits in first HPTE dword for pagesize 4k, 64k or 16M */
@@ -119,6 +185,7 @@ void kvmppc_map_vrma(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot,
unsigned long psize;
unsigned long hp0, hp1;
long ret;
+ struct kvm *kvm = vcpu->kvm;
psize = 1ul << porder;
npages = memslot->npages >> (porder - PAGE_SHIFT);
@@ -127,8 +194,8 @@ void kvmppc_map_vrma(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot,
if (npages > 1ul << (40 - porder))
npages = 1ul << (40 - porder);
/* Can't use more than 1 HPTE per HPTEG */
- if (npages > HPT_NPTEG)
- npages = HPT_NPTEG;
+ if (npages > kvm->arch.hpt_mask + 1)
+ npages = kvm->arch.hpt_mask + 1;
hp0 = HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16)) |
HPTE_V_BOLTED | hpte0_pgsize_encoding(psize);
@@ -138,7 +205,7 @@ void kvmppc_map_vrma(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot,
for (i = 0; i < npages; ++i) {
addr = i << porder;
/* can't use hpt_hash since va > 64 bits */
- hash = (i ^ (VRMA_VSID ^ (VRMA_VSID << 25))) & HPT_HASH_MASK;
+ hash = (i ^ (VRMA_VSID ^ (VRMA_VSID << 25))) & kvm->arch.hpt_mask;
/*
* We assume that the hash table is empty and no
* vcpus are using it at this stage. Since we create
View
40 arch/powerpc/kvm/book3s_hv.c
@@ -56,7 +56,7 @@
/* #define EXIT_DEBUG_INT */
static void kvmppc_end_cede(struct kvm_vcpu *vcpu);
-static int kvmppc_hv_setup_rma(struct kvm_vcpu *vcpu);
+static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu);
void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
@@ -1104,11 +1104,15 @@ int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu)
return -EINTR;
}
- /* On the first time here, set up VRMA or RMA */
+ atomic_inc(&vcpu->kvm->arch.vcpus_running);
+ /* Order vcpus_running vs. rma_setup_done, see kvmppc_alloc_reset_hpt */
+ smp_mb();
+
+ /* On the first time here, set up HTAB and VRMA or RMA */
if (!vcpu->kvm->arch.rma_setup_done) {
- r = kvmppc_hv_setup_rma(vcpu);
+ r = kvmppc_hv_setup_htab_rma(vcpu);
if (r)
- return r;
+ goto out;
}
flush_fp_to_thread(current);
@@ -1126,6 +1130,9 @@ int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu)
kvmppc_core_prepare_to_enter(vcpu);
}
} while (r == RESUME_GUEST);
+
+ out:
+ atomic_dec(&vcpu->kvm->arch.vcpus_running);
return r;
}
@@ -1341,7 +1348,7 @@ void kvmppc_core_commit_memory_region(struct kvm *kvm,
{
}
-static int kvmppc_hv_setup_rma(struct kvm_vcpu *vcpu)
+static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
{
int err = 0;
struct kvm *kvm = vcpu->kvm;
@@ -1360,6 +1367,15 @@ static int kvmppc_hv_setup_rma(struct kvm_vcpu *vcpu)
if (kvm->arch.rma_setup_done)
goto out; /* another vcpu beat us to it */
+ /* Allocate hashed page table (if not done already) and reset it */
+ if (!kvm->arch.hpt_virt) {
+ err = kvmppc_alloc_hpt(kvm, NULL);
+ if (err) {
+ pr_err("KVM: Couldn't alloc HPT\n");
+ goto out;
+ }
+ }
+
/* Look up the memslot for guest physical address 0 */
memslot = gfn_to_memslot(kvm, 0);
@@ -1471,13 +1487,14 @@ static int kvmppc_hv_setup_rma(struct kvm_vcpu *vcpu)
int kvmppc_core_init_vm(struct kvm *kvm)
{
- long r;
- unsigned long lpcr;
+ unsigned long lpcr, lpid;
- /* Allocate hashed page table */
- r = kvmppc_alloc_hpt(kvm);
- if (r)
- return r;
+ /* Allocate the guest's logical partition ID */
+
+ lpid = kvmppc_alloc_lpid();
+ if (lpid < 0)
+ return -ENOMEM;
+ kvm->arch.lpid = lpid;
INIT_LIST_HEAD(&kvm->arch.spapr_tce_tables);
@@ -1487,7 +1504,6 @@ int kvmppc_core_init_vm(struct kvm *kvm)
if (cpu_has_feature(CPU_FTR_ARCH_201)) {
/* PPC970; HID4 is effectively the LPCR */
- unsigned long lpid = kvm->arch.lpid;
kvm->arch.host_lpid = 0;
kvm->arch.host_lpcr = lpcr = mfspr(SPRN_HID4);
lpcr &= ~((3 << HID4_LPID1_SH) | (0xful << HID4_LPID5_SH));
View
5 arch/powerpc/kvm/book3s_hv_builtin.c
@@ -25,6 +25,9 @@ static void __init kvm_linear_init_one(ulong size, int count, int type);
static struct kvmppc_linear_info *kvm_alloc_linear(int type);
static void kvm_release_linear(struct kvmppc_linear_info *ri);
+int kvm_hpt_order = KVM_DEFAULT_HPT_ORDER;
+EXPORT_SYMBOL_GPL(kvm_hpt_order);
+
/*************** RMA *************/
/*
@@ -209,7 +212,7 @@ static void kvm_release_linear(struct kvmppc_linear_info *ri)
void __init kvm_linear_init(void)
{
/* HPT */
- kvm_linear_init_one(1 << HPT_ORDER, kvm_hpt_count, KVM_LINEAR_HPT);
+ kvm_linear_init_one(1 << kvm_hpt_order, kvm_hpt_count, KVM_LINEAR_HPT);
/* RMA */
/* Only do this on PPC970 in HV mode */
View
15 arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -237,7 +237,7 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
/* Find and lock the HPTEG slot to use */
do_insert:
- if (pte_index >= HPT_NPTE)
+ if (pte_index >= kvm->arch.hpt_npte)
return H_PARAMETER;
if (likely((flags & H_EXACT) == 0)) {
pte_index &= ~7UL;
@@ -352,7 +352,7 @@ long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long flags,
unsigned long v, r, rb;
struct revmap_entry *rev;
- if (pte_index >= HPT_NPTE)
+ if (pte_index >= kvm->arch.hpt_npte)
return H_PARAMETER;
hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
while (!try_lock_hpte(hpte, HPTE_V_HVLOCK))
@@ -419,7 +419,8 @@ long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu)
i = 4;
break;
}
- if (req != 1 || flags == 3 || pte_index >= HPT_NPTE) {
+ if (req != 1 || flags == 3 ||
+ pte_index >= kvm->arch.hpt_npte) {
/* parameter error */
args[j] = ((0xa0 | flags) << 56) + pte_index;
ret = H_PARAMETER;
@@ -521,7 +522,7 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags,
struct revmap_entry *rev;
unsigned long v, r, rb, mask, bits;
- if (pte_index >= HPT_NPTE)
+ if (pte_index >= kvm->arch.hpt_npte)
return H_PARAMETER;
hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
@@ -583,7 +584,7 @@ long kvmppc_h_read(struct kvm_vcpu *vcpu, unsigned long flags,
int i, n = 1;
struct revmap_entry *rev = NULL;
- if (pte_index >= HPT_NPTE)
+ if (pte_index >= kvm->arch.hpt_npte)
return H_PARAMETER;
if (flags & H_READ_4) {
pte_index &= ~3;
@@ -678,7 +679,7 @@ long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr, unsigned long slb_v,
somask = (1UL << 28) - 1;
vsid = (slb_v & ~SLB_VSID_B) >> SLB_VSID_SHIFT;
}
- hash = (vsid ^ ((eaddr & somask) >> pshift)) & HPT_HASH_MASK;
+ hash = (vsid ^ ((eaddr & somask) >> pshift)) & kvm->arch.hpt_mask;
avpn = slb_v & ~(somask >> 16); /* also includes B */
avpn |= (eaddr & somask) >> 16;
@@ -723,7 +724,7 @@ long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr, unsigned long slb_v,
if (val & HPTE_V_SECONDARY)
break;
val |= HPTE_V_SECONDARY;
- hash = hash ^ HPT_HASH_MASK;
+ hash = hash ^ kvm->arch.hpt_mask;
}
return -1;
}
View
26 arch/powerpc/kvm/booke.c
@@ -612,6 +612,12 @@ static void kvmppc_fill_pt_regs(struct pt_regs *regs)
regs->link = lr;
}
+/*
+ * For interrupts that need to be handled by host interrupt handlers,
+ * the corresponding host handlers are called from here in a similar
+ * (but not identical) way to how they are called from the low-level
+ * handlers (such as those in arch/powerpc/kernel/head_fsl_booke.S).
+ */
static void kvmppc_restart_interrupt(struct kvm_vcpu *vcpu,
unsigned int exit_nr)
{
@@ -639,6 +645,17 @@ static void kvmppc_restart_interrupt(struct kvm_vcpu *vcpu,
kvmppc_fill_pt_regs(&regs);
performance_monitor_exception(&regs);
break;
+ case BOOKE_INTERRUPT_WATCHDOG:
+ kvmppc_fill_pt_regs(&regs);
+#ifdef CONFIG_BOOKE_WDT
+ WatchdogException(&regs);
+#else
+ unknown_exception(&regs);
+#endif
+ break;
+ case BOOKE_INTERRUPT_CRITICAL:
+ unknown_exception(&regs);
+ break;
}
}
@@ -683,6 +700,10 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
r = RESUME_GUEST;
break;
+ case BOOKE_INTERRUPT_WATCHDOG:
+ r = RESUME_GUEST;
+ break;
+
case BOOKE_INTERRUPT_DOORBELL:
kvmppc_account_exit(vcpu, DBELL_EXITS);
r = RESUME_GUEST;
@@ -1267,6 +1288,11 @@ void kvmppc_decrementer_func(unsigned long data)
{
struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data;
+ if (vcpu->arch.tcr & TCR_ARE) {
+ vcpu->arch.dec = vcpu->arch.decar;
+ kvmppc_emulate_dec(vcpu);
+ }
+
kvmppc_set_tsr_bits(vcpu, TSR_DIS);
}
View
28 arch/powerpc/kvm/booke_emulate.c
@@ -24,6 +24,7 @@
#include "booke.h"
#define OP_19_XOP_RFI 50
+#define OP_19_XOP_RFCI 51
#define OP_31_XOP_MFMSR 83
#define OP_31_XOP_WRTEE 131
@@ -36,6 +37,12 @@ static void kvmppc_emul_rfi(struct kvm_vcpu *vcpu)
kvmppc_set_msr(vcpu, vcpu->arch.shared->srr1);
}
+static void kvmppc_emul_rfci(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.pc = vcpu->arch.csrr0;
+ kvmppc_set_msr(vcpu, vcpu->arch.csrr1);
+}
+
int kvmppc_booke_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
unsigned int inst, int *advance)
{
@@ -52,6 +59,12 @@ int kvmppc_booke_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
*advance = 0;
break;
+ case OP_19_XOP_RFCI:
+ kvmppc_emul_rfci(vcpu);
+ kvmppc_set_exit_type(vcpu, EMULATED_RFCI_EXITS);
+ *advance = 0;
+ break;
+
default:
emulated = EMULATE_FAIL;
break;
@@ -113,6 +126,12 @@ int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val)
case SPRN_ESR:
vcpu->arch.shared->esr = spr_val;
break;
+ case SPRN_CSRR0:
+ vcpu->arch.csrr0 = spr_val;
+ break;
+ case SPRN_CSRR1:
+ vcpu->arch.csrr1 = spr_val;
+ break;
case SPRN_DBCR0:
vcpu->arch.dbcr0 = spr_val;
break;
@@ -129,6 +148,9 @@ int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val)
kvmppc_set_tcr(vcpu, spr_val);
break;
+ case SPRN_DECAR:
+ vcpu->arch.decar = spr_val;
+ break;
/*
* Note: SPRG4-7 are user-readable.
* These values are loaded into the real SPRGs when resuming the
@@ -229,6 +251,12 @@ int kvmppc_booke_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val)
case SPRN_ESR:
*spr_val = vcpu->arch.shared->esr;
break;
+ case SPRN_CSRR0:
+ *spr_val = vcpu->arch.csrr0;
+ break;
+ case SPRN_CSRR1:
+ *spr_val = vcpu->arch.csrr1;
+ break;
case SPRN_DBCR0:
*spr_val = vcpu->arch.dbcr0;
break;
View
55 arch/powerpc/kvm/booke_interrupts.S
@@ -52,54 +52,57 @@
(1<<BOOKE_INTERRUPT_PROGRAM) | \
(1<<BOOKE_INTERRUPT_DTLB_MISS))
-.macro KVM_HANDLER ivor_nr
+.macro KVM_HANDLER ivor_nr scratch srr0
_GLOBAL(kvmppc_handler_\ivor_nr)
/* Get pointer to vcpu and record exit number. */
- mtspr SPRN_SPRG_WSCRATCH0, r4
+ mtspr \scratch , r4
mfspr r4, SPRN_SPRG_RVCPU
+ stw r3, VCPU_GPR(R3)(r4)
stw r5, VCPU_GPR(R5)(r4)
stw r6, VCPU_GPR(R6)(r4)
+ mfspr r3, \scratch
mfctr r5
- lis r6, kvmppc_resume_host@h
+ stw r3, VCPU_GPR(R4)(r4)
stw r5, VCPU_CTR(r4)
+ mfspr r3, \srr0
+ lis r6, kvmppc_resume_host@h
+ stw r3, VCPU_PC(r4)
li r5, \ivor_nr
ori r6, r6, kvmppc_resume_host@l
mtctr r6
bctr
.endm
_GLOBAL(kvmppc_handlers_start)
-KVM_HANDLER BOOKE_INTERRUPT_CRITICAL
-KVM_HANDLER BOOKE_INTERRUPT_MACHINE_CHECK
-KVM_HANDLER BOOKE_INTERRUPT_DATA_STORAGE
-KVM_HANDLER BOOKE_INTERRUPT_INST_STORAGE
-KVM_HANDLER BOOKE_INTERRUPT_EXTERNAL
-KVM_HANDLER BOOKE_INTERRUPT_ALIGNMENT
-KVM_HANDLER BOOKE_INTERRUPT_PROGRAM
-KVM_HANDLER BOOKE_INTERRUPT_FP_UNAVAIL
-KVM_HANDLER BOOKE_INTERRUPT_SYSCALL
-KVM_HANDLER BOOKE_INTERRUPT_AP_UNAVAIL
-KVM_HANDLER BOOKE_INTERRUPT_DECREMENTER
-KVM_HANDLER BOOKE_INTERRUPT_FIT
-KVM_HANDLER BOOKE_INTERRUPT_WATCHDOG
-KVM_HANDLER BOOKE_INTERRUPT_DTLB_MISS
-KVM_HANDLER BOOKE_INTERRUPT_ITLB_MISS
-KVM_HANDLER BOOKE_INTERRUPT_DEBUG
-KVM_HANDLER BOOKE_INTERRUPT_SPE_UNAVAIL
-KVM_HANDLER BOOKE_INTERRUPT_SPE_FP_DATA
-KVM_HANDLER BOOKE_INTERRUPT_SPE_FP_ROUND
+KVM_HANDLER BOOKE_INTERRUPT_CRITICAL SPRN_SPRG_RSCRATCH_CRIT SPRN_CSRR0
+KVM_HANDLER BOOKE_INTERRUPT_MACHINE_CHECK SPRN_SPRG_RSCRATCH_MC SPRN_MCSRR0
+KVM_HANDLER BOOKE_INTERRUPT_DATA_STORAGE SPRN_SPRG_RSCRATCH0 SPRN_SRR0
+KVM_HANDLER BOOKE_INTERRUPT_INST_STORAGE SPRN_SPRG_RSCRATCH0 SPRN_SRR0
+KVM_HANDLER BOOKE_INTERRUPT_EXTERNAL SPRN_SPRG_RSCRATCH0 SPRN_SRR0
+KVM_HANDLER BOOKE_INTERRUPT_ALIGNMENT SPRN_SPRG_RSCRATCH0 SPRN_SRR0
+KVM_HANDLER BOOKE_INTERRUPT_PROGRAM SPRN_SPRG_RSCRATCH0 SPRN_SRR0
+KVM_HANDLER BOOKE_INTERRUPT_FP_UNAVAIL SPRN_SPRG_RSCRATCH0 SPRN_SRR0
+KVM_HANDLER BOOKE_INTERRUPT_SYSCALL SPRN_SPRG_RSCRATCH0 SPRN_SRR0
+KVM_HANDLER BOOKE_INTERRUPT_AP_UNAVAIL SPRN_SPRG_RSCRATCH0 SPRN_SRR0
+KVM_HANDLER BOOKE_INTERRUPT_DECREMENTER SPRN_SPRG_RSCRATCH0 SPRN_SRR0
+KVM_HANDLER BOOKE_INTERRUPT_FIT SPRN_SPRG_RSCRATCH0 SPRN_SRR0
+KVM_HANDLER BOOKE_INTERRUPT_WATCHDOG SPRN_SPRG_RSCRATCH_CRIT SPRN_CSRR0
+KVM_HANDLER BOOKE_INTERRUPT_DTLB_MISS SPRN_SPRG_RSCRATCH0 SPRN_SRR0
+KVM_HANDLER BOOKE_INTERRUPT_ITLB_MISS SPRN_SPRG_RSCRATCH0 SPRN_SRR0
+KVM_HANDLER BOOKE_INTERRUPT_DEBUG SPRN_SPRG_RSCRATCH_CRIT SPRN_CSRR0
+KVM_HANDLER BOOKE_INTERRUPT_SPE_UNAVAIL SPRN_SPRG_RSCRATCH0 SPRN_SRR0
+KVM_HANDLER BOOKE_INTERRUPT_SPE_FP_DATA SPRN_SPRG_RSCRATCH0 SPRN_SRR0
+KVM_HANDLER BOOKE_INTERRUPT_SPE_FP_ROUND SPRN_SPRG_RSCRATCH0 SPRN_SRR0
_GLOBAL(kvmppc_handler_len)
.long kvmppc_handler_1 - kvmppc_handler_0
-
/* Registers:
* SPRG_SCRATCH0: guest r4
* r4: vcpu pointer
* r5: KVM exit number
*/
_GLOBAL(kvmppc_resume_host)
- stw r3, VCPU_GPR(R3)(r4)
mfcr r3
stw r3, VCPU_CR(r4)
stw r7, VCPU_GPR(R7)(r4)
@@ -180,10 +183,6 @@ _GLOBAL(kvmppc_resume_host)
stw r3, VCPU_LR(r4)
mfxer r3
stw r3, VCPU_XER(r4)
- mfspr r3, SPRN_SPRG_RSCRATCH0
- stw r3, VCPU_GPR(R4)(r4)
- mfspr r3, SPRN_SRR0
- stw r3, VCPU_PC(r4)
/* Restore host stack pointer and PID before IVPR, since the host
* exception handlers use them. */
View
2 arch/powerpc/kvm/bookehv_interrupts.S
@@ -262,7 +262,7 @@ kvm_lvl_handler BOOKE_INTERRUPT_CRITICAL, \
kvm_lvl_handler BOOKE_INTERRUPT_MACHINE_CHECK, \
SPRN_SPRG_RSCRATCH_MC, SPRN_MCSRR0, SPRN_MCSRR1, 0
kvm_handler BOOKE_INTERRUPT_DATA_STORAGE, \
- SPRN_SRR0, SPRN_SRR1, (NEED_EMU | NEED_DEAR)
+ SPRN_SRR0, SPRN_SRR1, (NEED_EMU | NEED_DEAR | NEED_ESR)
kvm_handler BOOKE_INTERRUPT_INST_STORAGE, SPRN_SRR0, SPRN_SRR1, NEED_ESR
kvm_handler BOOKE_INTERRUPT_EXTERNAL, SPRN_SRR0, SPRN_SRR1, 0
kvm_handler BOOKE_INTERRUPT_ALIGNMENT, \
View
3 arch/powerpc/kvm/e500_emulate.c
@@ -269,6 +269,9 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val)
*spr_val = vcpu->arch.shared->mas7_3 >> 32;
break;
#endif
+ case SPRN_DECAR:
+ *spr_val = vcpu->arch.decar;
+ break;
case SPRN_TLB0CFG:
*spr_val = vcpu->arch.tlbcfg[0];
break;
View
8 arch/powerpc/kvm/e500mc.c
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2010 Freescale Semiconductor, Inc. All rights reserved.
+ * Copyright (C) 2010,2012 Freescale Semiconductor, Inc. All rights reserved.
*
* Author: Varun Sethi, <varun.sethi@freescale.com>
*
@@ -57,7 +57,8 @@ void kvmppc_e500_tlbil_one(struct kvmppc_vcpu_e500 *vcpu_e500,
struct kvm_book3e_206_tlb_entry *gtlbe)
{
unsigned int tid, ts;
- u32 val, eaddr, lpid;
+ gva_t eaddr;
+ u32 val, lpid;
unsigned long flags;
ts = get_tlb_ts(gtlbe);
@@ -183,6 +184,9 @@ int kvmppc_core_vcpu_setup(struct kvm_vcpu *vcpu)
vcpu->arch.shadow_epcr = SPRN_EPCR_DSIGS | SPRN_EPCR_DGTMI | \
SPRN_EPCR_DUVD;
+#ifdef CONFIG_64BIT
+ vcpu->arch.shadow_epcr |= SPRN_EPCR_ICM;
+#endif
vcpu->arch.shadow_msrp = MSRP_UCLEP | MSRP_DEP | MSRP_PMMP;
vcpu->arch.eplc = EPC_EGS | (vcpu->kvm->arch.lpid << EPC_ELPID_SHIFT);
vcpu->arch.epsc = vcpu->arch.eplc;
View
16 arch/powerpc/kvm/emulate.c
@@ -59,11 +59,13 @@
#define OP_31_XOP_STHBRX 918
#define OP_LWZ 32
+#define OP_LD 58
#define OP_LWZU 33
#define OP_LBZ 34
#define OP_LBZU 35
#define OP_STW 36
#define OP_STWU 37
+#define OP_STD 62
#define OP_STB 38
#define OP_STBU 39
#define OP_LHZ 40
@@ -392,6 +394,12 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
emulated = kvmppc_handle_load(run, vcpu, rt, 4, 1);
break;
+ /* TBD: Add support for other 64 bit load variants like ldu, ldux, ldx etc. */
+ case OP_LD:
+ rt = get_rt(inst);
+ emulated = kvmppc_handle_load(run, vcpu, rt, 8, 1);
+ break;
+
case OP_LWZU:
emulated = kvmppc_handle_load(run, vcpu, rt, 4, 1);
kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed);
@@ -412,6 +420,14 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
4, 1);
break;
+ /* TBD: Add support for other 64 bit store variants like stdu, stdux, stdx etc. */
+ case OP_STD:
+ rs = get_rs(inst);
+ emulated = kvmppc_handle_store(run, vcpu,
+ kvmppc_get_gpr(vcpu, rs),
+ 8, 1);
+ break;
+
case OP_STWU:
emulated = kvmppc_handle_store(run, vcpu,
kvmppc_get_gpr(vcpu, rs),
View
18 arch/powerpc/kvm/powerpc.c
@@ -246,6 +246,7 @@ int kvm_dev_ioctl_check_extension(long ext)
#endif
#ifdef CONFIG_PPC_BOOK3S_64
case KVM_CAP_SPAPR_TCE:
+ case KVM_CAP_PPC_ALLOC_HTAB:
r = 1;
break;
#endif /* CONFIG_PPC_BOOK3S_64 */
@@ -802,6 +803,23 @@ long kvm_arch_vm_ioctl(struct file *filp,
r = -EFAULT;
break;
}
+
+ case KVM_PPC_ALLOCATE_HTAB: {
+ struct kvm *kvm = filp->private_data;
+ u32 htab_order;
+
+ r = -EFAULT;
+ if (get_user(htab_order, (u32 __user *)argp))
+ break;
+ r = kvmppc_alloc_reset_hpt(kvm, &htab_order);
+ if (r)
+ break;
+ r = -EFAULT;
+ if (put_user(htab_order, (u32 __user *)argp))
+ break;
+ r = 0;
+ break;
+ }
#endif /* CONFIG_KVM_BOOK3S_64_HV */
#ifdef CONFIG_PPC_BOOK3S_64
View
9 arch/powerpc/platforms/Kconfig
@@ -25,13 +25,22 @@ source "arch/powerpc/platforms/wsp/Kconfig"
config KVM_GUEST
bool "KVM Guest support"
default n
+ select EPAPR_PARAVIRT
---help---
This option enables various optimizations for running under the KVM
hypervisor. Overhead for the kernel when not running inside KVM should
be minimal.
In case of doubt, say Y
+config EPAPR_PARAVIRT
+ bool "ePAPR para-virtualization support"
+ default n
+ help
+ Enables ePAPR para-virtualization support for guests.
+
+ In case of doubt, say Y
+
config PPC_NATIVE
bool
depends on 6xx || PPC64
View
2 arch/s390/include/asm/sclp.h
@@ -53,5 +53,7 @@ int sclp_chp_configure(struct chp_id chpid);
int sclp_chp_deconfigure(struct chp_id chpid);
int sclp_chp_read_info(struct sclp_chp_info *info);
void sclp_get_ipl_info(struct sclp_ipl_info *info);
+bool sclp_has_linemode(void);
+bool sclp_has_vt220(void);
#endif /* _ASM_S390_SCLP_H */
View
1 arch/s390/include/asm/sigp.h
@@ -24,6 +24,7 @@
#define SIGP_STATUS_CHECK_STOP 0x00000010UL
#define SIGP_STATUS_STOPPED 0x00000040UL
+#define SIGP_STATUS_EXT_CALL_PENDING 0x00000080UL
#define SIGP_STATUS_INVALID_PARAMETER 0x00000100UL
#define SIGP_STATUS_INCORRECT_STATE 0x00000200UL
#define SIGP_STATUS_NOT_RUNNING 0x00000400UL
View
12 arch/s390/kernel/setup.c
@@ -61,6 +61,7 @@
#include <asm/kvm_virtio.h>
#include <asm/diag.h>
#include <asm/os_info.h>
+#include <asm/sclp.h>
#include "entry.h"
long psw_kernel_bits = PSW_DEFAULT_KEY | PSW_MASK_BASE | PSW_ASC_PRIMARY |
@@ -136,9 +137,14 @@ __setup("condev=", condev_setup);
static void __init set_preferred_console(void)
{
- if (MACHINE_IS_KVM)
- add_preferred_console("hvc", 0, NULL);
- else if (CONSOLE_IS_3215 || CONSOLE_IS_SCLP)
+ if (MACHINE_IS_KVM) {
+ if (sclp_has_vt220())
+ add_preferred_console("ttyS", 1, NULL);
+ else if (sclp_has_linemode())
+ add_preferred_console("ttyS", 0, NULL);
+ else
+ add_preferred_console("hvc", 0, NULL);
+ } else if (CONSOLE_IS_3215 || CONSOLE_IS_SCLP)
add_preferred_console("ttyS", 0, NULL);
else if (CONSOLE_IS_3270)
add_preferred_console("tty3270", 0, NULL);
View
1 arch/s390/kvm/kvm-s390.c
@@ -347,6 +347,7 @@ static void kvm_s390_vcpu_initial_reset(struct kvm_vcpu *vcpu)
vcpu->arch.guest_fpregs.fpc = 0;
asm volatile("lfpc %0" : : "Q" (vcpu->arch.guest_fpregs.fpc));
vcpu->arch.sie_block->gbea = 1;
+ atomic_set_mask(CPUSTAT_STOPPED, &vcpu->arch.sie_block->cpuflags);
}
int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
View
77 arch/s390/kvm/sigp.c
@@ -26,19 +26,23 @@ static int __sigp_sense(struct kvm_vcpu *vcpu, u16 cpu_addr,
int rc;
if (cpu_addr >= KVM_MAX_VCPUS)
- return 3; /* not operational */
+ return SIGP_CC_NOT_OPERATIONAL;
spin_lock(&fi->lock);
if (fi->local_int[cpu_addr] == NULL)
- rc = 3; /* not operational */
+ rc = SIGP_CC_NOT_OPERATIONAL;
else if (!(atomic_read(fi->local_int[cpu_addr]->cpuflags)
- & CPUSTAT_STOPPED)) {
- *reg &= 0xffffffff00000000UL;
- rc = 1; /* status stored */
- } else {
+ & (CPUSTAT_ECALL_PEND | CPUSTAT_STOPPED)))
+ rc = SIGP_CC_ORDER_CODE_ACCEPTED;
+ else {
*reg &= 0xffffffff00000000UL;
- *reg |= SIGP_STATUS_STOPPED;
- rc = 1; /* status stored */
+ if (atomic_read(fi->local_int[cpu_addr]->cpuflags)
+ & CPUSTAT_ECALL_PEND)
+ *reg |= SIGP_STATUS_EXT_CALL_PENDING;
+ if (atomic_read(fi->local_int[cpu_addr]->cpuflags)
+ & CPUSTAT_STOPPED)
+ *reg |= SIGP_STATUS_STOPPED;
+ rc = SIGP_CC_STATUS_STORED;
}
spin_unlock(&fi->lock);
@@ -54,7 +58,7 @@ static int __sigp_emergency(struct kvm_vcpu *vcpu, u16 cpu_addr)
int rc;
if (cpu_addr >= KVM_MAX_VCPUS)
- return 3; /* not operational */
+ return SIGP_CC_NOT_OPERATIONAL;
inti = kzalloc(sizeof(*inti), GFP_KERNEL);
if (!inti)
@@ -66,7 +70,7 @@ static int __sigp_emergency(struct kvm_vcpu *vcpu, u16 cpu_addr)
spin_lock(&fi->lock);
li = fi->local_int[cpu_addr];
if (li == NULL) {
- rc = 3; /* not operational */
+ rc = SIGP_CC_NOT_OPERATIONAL;
kfree(inti);
goto unlock;
}
@@ -77,7 +81,7 @@ static int __sigp_emergency(struct kvm_vcpu *vcpu, u16 cpu_addr)
if (waitqueue_active(&li->wq))
wake_up_interruptible(&li->wq);
spin_unlock_bh(&li->lock);
- rc = 0; /* order accepted */
+ rc = SIGP_CC_ORDER_CODE_ACCEPTED;
VCPU_EVENT(vcpu, 4, "sent sigp emerg to cpu %x", cpu_addr);
unlock:
spin_unlock(&fi->lock);
@@ -92,7 +96,7 @@ static int __sigp_external_call(struct kvm_vcpu *vcpu, u16 cpu_addr)
int rc;
if (cpu_addr >= KVM_MAX_VCPUS)
- return 3; /* not operational */
+ return SIGP_CC_NOT_OPERATIONAL;
inti = kzalloc(sizeof(*inti), GFP_KERNEL);
if (!inti)
@@ -104,7 +108,7 @@ static int __sigp_external_call(struct kvm_vcpu *vcpu, u16 cpu_addr)
spin_lock(&fi->lock);
li = fi->local_int[cpu_addr];
if (li == NULL) {
- rc = 3; /* not operational */
+ rc = SIGP_CC_NOT_OPERATIONAL;
kfree(inti);
goto unlock;
}
@@ -115,7 +119,7 @@ static int __sigp_external_call(struct kvm_vcpu *vcpu, u16 cpu_addr)
if (waitqueue_active(&li->wq))
wake_up_interruptible(&li->wq);
spin_unlock_bh(&li->lock);
- rc = 0; /* order accepted */
+ rc = SIGP_CC_ORDER_CODE_ACCEPTED;
VCPU_EVENT(vcpu, 4, "sent sigp ext call to cpu %x", cpu_addr);
unlock:
spin_unlock(&fi->lock);
@@ -143,7 +147,7 @@ static int __inject_sigp_stop(struct kvm_s390_local_interrupt *li, int action)
out:
spin_unlock_bh(&li->lock);
- return 0; /* order accepted */
+ return SIGP_CC_ORDER_CODE_ACCEPTED;
}
static int __sigp_stop(struct kvm_vcpu *vcpu, u16 cpu_addr, int action)
@@ -153,12 +157,12 @@ static int __sigp_stop(struct kvm_vcpu *vcpu, u16 cpu_addr, int action)
int rc;
if (cpu_addr >= KVM_MAX_VCPUS)
- return 3; /* not operational */
+ return SIGP_CC_NOT_OPERATIONAL;
spin_lock(&fi->lock);
li = fi->local_int[cpu_addr];
if (li == NULL) {
- rc = 3; /* not operational */
+ rc = SIGP_CC_NOT_OPERATIONAL;
goto unlock;
}
@@ -182,11 +186,11 @@ static int __sigp_set_arch(struct kvm_vcpu *vcpu, u32 parameter)
switch (parameter & 0xff) {
case 0:
- rc = 3; /* not operational */
+ rc = SIGP_CC_NOT_OPERATIONAL;
break;
case 1:
case 2:
- rc = 0; /* order accepted */
+ rc = SIGP_CC_ORDER_CODE_ACCEPTED;
break;
default:
rc = -EOPNOTSUPP;
@@ -207,30 +211,33 @@ static int __sigp_set_prefix(struct kvm_vcpu *vcpu, u16 cpu_addr, u32 address,
address = address & 0x7fffe000u;
if (copy_from_guest_absolute(vcpu, &tmp, address, 1) ||
copy_from_guest_absolute(vcpu, &tmp, address + PAGE_SIZE, 1)) {
+ *reg &= 0xffffffff00000000UL;
*reg |= SIGP_STATUS_INVALID_PARAMETER;
- return 1; /* invalid parameter */
+ return SIGP_CC_STATUS_STORED;
}
inti = kzalloc(sizeof(*inti), GFP_KERNEL);
if (!inti)
- return 2; /* busy */
+ return SIGP_CC_BUSY;
spin_lock(&fi->lock);
if (cpu_addr < KVM_MAX_VCPUS)
li = fi->local_int[cpu_addr];
if (li == NULL) {
- rc = 1; /* incorrect state */
- *reg &= SIGP_STATUS_INCORRECT_STATE;
+ *reg &= 0xffffffff00000000UL;
+ *reg |= SIGP_STATUS_INCORRECT_STATE;
+ rc = SIGP_CC_STATUS_STORED;
kfree(inti);
goto out_fi;
}
spin_lock_bh(&li->lock);
/* cpu must be in stopped state */
if (!(atomic_read(li->cpuflags) & CPUSTAT_STOPPED)) {
- rc = 1; /* incorrect state */
- *reg &= SIGP_STATUS_INCORRECT_STATE;
+ *reg &= 0xffffffff00000000UL;
+ *reg |= SIGP_STATUS_INCORRECT_STATE;
+ rc = SIGP_CC_STATUS_STORED;
kfree(inti);
goto out_li;
}
@@ -242,7 +249,7 @@ static int __sigp_set_prefix(struct kvm_vcpu *vcpu, u16 cpu_addr, u32 address,
atomic_set(&li->active, 1);
if (waitqueue_active(&li->wq))
wake_up_interruptible(&li->wq);
- rc = 0; /* order accepted */
+ rc = SIGP_CC_ORDER_CODE_ACCEPTED;
VCPU_EVENT(vcpu, 4, "set prefix of cpu %02x to %x", cpu_addr, address);
out_li:
@@ -259,21 +266,21 @@ static int __sigp_sense_running(struct kvm_vcpu *vcpu, u16 cpu_addr,
struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int;
if (cpu_addr >= KVM_MAX_VCPUS)
- return 3; /* not operational */
+ return SIGP_CC_NOT_OPERATIONAL;
spin_lock(&fi->lock);
if (fi->local_int[cpu_addr] == NULL)
- rc = 3; /* not operational */
+ rc = SIGP_CC_NOT_OPERATIONAL;
else {
if (atomic_read(fi->local_int[cpu_addr]->cpuflags)
& CPUSTAT_RUNNING) {
/* running */
- rc = 1;
+ rc = SIGP_CC_ORDER_CODE_ACCEPTED;
} else {
/* not running */
*reg &= 0xffffffff00000000UL;
*reg |= SIGP_STATUS_NOT_RUNNING;
- rc = 0;
+ rc = SIGP_CC_STATUS_STORED;
}
}
spin_unlock(&fi->lock);
@@ -286,23 +293,23 @@ static int __sigp_sense_running(struct kvm_vcpu *vcpu, u16 cpu_addr,
static int __sigp_restart(struct kvm_vcpu *vcpu, u16 cpu_addr)
{
- int rc = 0;
struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int;
struct kvm_s390_local_interrupt *li;
+ int rc = SIGP_CC_ORDER_CODE_ACCEPTED;
if (cpu_addr >= KVM_MAX_VCPUS)
- return 3; /* not operational */
+ return SIGP_CC_NOT_OPERATIONAL;
spin_lock(&fi->lock);
li = fi->local_int[cpu_addr];
if (li == NULL) {
- rc = 3; /* not operational */
+ rc = SIGP_CC_NOT_OPERATIONAL;
goto out;
}
spin_lock_bh(&li->lock);
if (li->action_bits & ACTION_STOP_ON_STOP)
- rc = 2; /* busy */
+ rc = SIGP_CC_BUSY;
else
VCPU_EVENT(vcpu, 4, "sigp restart %x to handle userspace",
cpu_addr);
@@ -377,7 +384,7 @@ int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu)
case SIGP_RESTART:
vcpu->stat.instruction_sigp_restart++;
rc = __sigp_restart(vcpu, cpu_addr);
- if (rc == 2) /* busy */
+ if (rc == SIGP_CC_BUSY)
break;
/* user space must know about restart */
default:
View
3 arch/x86/include/asm/apic.h
@@ -465,6 +465,8 @@ static inline u32 safe_apic_wait_icr_idle(void)
return apic->safe_wait_icr_idle();
}
+extern void __init apic_set_eoi_write(void (*eoi_write)(u32 reg, u32 v));
+
#else /* CONFIG_X86_LOCAL_APIC */
static inline u32 apic_read(u32 reg) { return 0; }
@@ -474,6 +476,7 @@ static inline u64 apic_icr_read(void) { return 0; }
static inline void apic_icr_write(u32 low, u32 high) { }
static inline void apic_wait_icr_idle(void) { }
static inline u32 safe_apic_wait_icr_idle(void) { return 0; }
+static inline void apic_set_eoi_write(void (*eoi_write)(u32 reg, u32 v)) {}
#endif /* CONFIG_X86_LOCAL_APIC */
View
7 arch/x86/include/asm/bitops.h
@@ -264,6 +264,13 @@ static inline int test_and_clear_bit(int nr, volatile unsigned long *addr)
* This operation is non-atomic and can be reordered.
* If two examples of this operation race, one can appear to succeed
* but actually fail. You must protect multiple accesses with a lock.
+ *
+ * Note: the operation is performed atomically with respect to
+ * the local CPU, but not other CPUs. Portable code should not
+ * rely on this behaviour.
+ * KVM relies on this behaviour on x86 for modifying memory that is also
+ * accessed from a hypervisor on the same CPU if running in a VM: don't change
+ * this without also updating arch/x86/kernel/kvm.c
*/
static inline int __test_and_clear_bit(int nr, volatile unsigned long *addr)
{
View
1 arch/x86/include/asm/hypervisor.h
@@ -49,6 +49,7 @@ extern const struct hypervisor_x86 *x86_hyper;
extern const struct hypervisor_x86 x86_hyper_vmware;
extern const struct hypervisor_x86 x86_hyper_ms_hyperv;
extern const struct hypervisor_x86 x86_hyper_xen_hvm;
+extern const struct hypervisor_x86 x86_hyper_kvm;
static inline bool hypervisor_x2apic_available(void)
{
View
1 arch/x86/include/asm/kvm.h
@@ -12,6 +12,7 @@
/* Select x86 specific features in <linux/kvm.h> */
#define __KVM_HAVE_PIT
#define __KVM_HAVE_IOAPIC
+#define __KVM_HAVE_IRQ_LINE
#define __KVM_HAVE_DEVICE_ASSIGNMENT
#define __KVM_HAVE_MSI
#define __KVM_HAVE_USER_NMI
View
6 arch/x86/include/asm/kvm_emulate.h
@@ -192,8 +192,8 @@ struct x86_emulate_ops {
struct x86_instruction_info *info,
enum x86_intercept_stage stage);
- bool (*get_cpuid)(struct x86_emulate_ctxt *ctxt,
- u32 *eax, u32 *ebx, u32 *ecx, u32 *edx);
+ void (*get_cpuid)(struct x86_emulate_ctxt *ctxt,
+ u32 *eax, u32 *ebx, u32 *ecx, u32 *edx);
};
typedef u32 __attribute__((vector_size(16))) sse128_t;
@@ -280,9 +280,9 @@ struct x86_emulate_ctxt {
u8 modrm_seg;
bool rip_relative;
unsigned long _eip;
+ struct operand memop;
/* Fields above regs are cleared together. */
unsigned long regs[NR_VCPU_REGS];
- struct operand memop;
struct operand *memopp;
struct fetch_cache fetch;
struct read_cache io_read;
View
31 arch/x86/include/asm/kvm_host.h
@@ -48,12 +48,13 @@
#define CR3_PAE_RESERVED_BITS ((X86_CR3_PWT | X86_CR3_PCD) - 1)
#define CR3_NONPAE_RESERVED_BITS ((PAGE_SIZE-1) & ~(X86_CR3_PWT | X86_CR3_PCD))
+#define CR3_PCID_ENABLED_RESERVED_BITS 0xFFFFFF0000000000ULL
#define CR3_L_MODE_RESERVED_BITS (CR3_NONPAE_RESERVED_BITS | \
0xFFFFFF0000000000ULL)
#define CR4_RESERVED_BITS \
(~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
| X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \
- | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \
+ | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR | X86_CR4_PCIDE \
| X86_CR4_OSXSAVE | X86_CR4_SMEP | X86_CR4_RDWRGSFS \
| X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
@@ -175,6 +176,13 @@ enum {
/* apic attention bits */
#define KVM_APIC_CHECK_VAPIC 0
+/*
+ * The following bit is set with PV-EOI and cleared on EOI.
+ * We detect PV-EOI changes made by the guest by comparing
+ * this bit with PV-EOI in guest memory.
+ * See the implementation in apic_update_pv_eoi.
+ */
+#define KVM_APIC_PV_EOI_PENDING 1
/*
* We don't want allocation failures within the mmu code, so we preallocate
@@ -484,6 +492,11 @@ struct kvm_vcpu_arch {
u64 length;
u64 status;
} osvw;
+
+ struct {
+ u64 msr_val;
+ struct gfn_to_hva_cache data;
+ } pv_eoi;
};
struct kvm_lpage_info {
@@ -661,6 +674,7 @@ struct kvm_x86_ops {
u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
int (*get_lpage_level)(void);
bool (*rdtscp_supported)(void);
+ bool (*invpcid_supported)(void);
void (*adjust_tsc_offset)(struct kvm_vcpu *vcpu, s64 adjustment, bool host);
void (*set_tdp_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3);
@@ -802,7 +816,20 @@ int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault);
bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl);
-int kvm_pic_set_irq(void *opaque, int irq, int level);
+static inline int __kvm_irq_line_state(unsigned long *irq_state,
+ int irq_source_id, int level)
+{
+ /* Logical OR for level trig interrupt */
+ if (level)
+ __set_bit(irq_source_id, irq_state);
+ else
+ __clear_bit(irq_source_id, irq_state);
+
+ return !!(*irq_state);
+}
+
+int kvm_pic_set_irq(struct kvm_pic *pic, int irq, int irq_source_id, int level);
+void kvm_pic_clear_all(struct kvm_pic *pic, int irq_source_id);
void kvm_inject_nmi(struct kvm_vcpu *vcpu);
View
7 arch/x86/include/asm/kvm_para.h
@@ -22,6 +22,7 @@
#define KVM_FEATURE_CLOCKSOURCE2 3
#define KVM_FEATURE_ASYNC_PF 4
#define KVM_FEATURE_STEAL_TIME 5
+#define KVM_FEATURE_PV_EOI 6
/* The last 8 bits are used to indicate how to interpret the flags field
* in pvclock structure. If no bits are set, all flags are ignored.
@@ -37,6 +38,7 @@
#define MSR_KVM_SYSTEM_TIME_NEW 0x4b564d01
#define MSR_KVM_ASYNC_PF_EN 0x4b564d02
#define MSR_KVM_STEAL_TIME 0x4b564d03
+#define MSR_KVM_PV_EOI_EN 0x4b564d04
struct kvm_steal_time {
__u64 steal;
@@ -89,6 +91,11 @@ struct kvm_vcpu_pv_apf_data {
__u32 enabled;
};
+#define KVM_PV_EOI_BIT 0
+#define KVM_PV_EOI_MASK (0x1 << KVM_PV_EOI_BIT)
+#define KVM_PV_EOI_ENABLED KVM_PV_EOI_MASK
+#define KVM_PV_EOI_DISABLED 0x0
+
#ifdef __KERNEL__
#include <asm/processor.h>
View
2 arch/x86/include/asm/processor-flags.h
@@ -44,6 +44,7 @@
*/
#define X86_CR3_PWT 0x00000008 /* Page Write Through */
#define X86_CR3_PCD 0x00000010 /* Page Cache Disable */
+#define X86_CR3_PCID_MASK 0x00000fff /* PCID Mask */
/*
* Intel CPU features in CR4
@@ -61,6 +62,7 @@
#define X86_CR4_OSXMMEXCPT 0x00000400 /* enable unmasked SSE exceptions */
#define X86_CR4_VMXE 0x00002000 /* enable VMX virtualization */
#define X86_CR4_RDWRGSFS 0x00010000 /* enable RDWRGSFS support */
+#define X86_CR4_PCIDE 0x00020000 /* enable PCID support */
#define X86_CR4_OSXSAVE 0x00040000 /* enable xsave and xrestore */
#define X86_CR4_SMEP 0x00100000 /* enable SMEP support */
View
6 arch/x86/include/asm/vmx.h
@@ -60,6 +60,7 @@
#define SECONDARY_EXEC_WBINVD_EXITING 0x00000040
#define SECONDARY_EXEC_UNRESTRICTED_GUEST 0x00000080
#define SECONDARY_EXEC_PAUSE_LOOP_EXITING 0x00000400
+#define SECONDARY_EXEC_ENABLE_INVPCID 0x00001000
#define PIN_BASED_EXT_INTR_MASK 0x00000001
@@ -281,6 +282,7 @@ enum vmcs_field {
#define EXIT_REASON_EPT_MISCONFIG 49
#define EXIT_REASON_WBINVD 54
#define EXIT_REASON_XSETBV 55
+#define EXIT_REASON_INVPCID 58
/*
* Interruption-information format
@@ -404,6 +406,7 @@ enum vmcs_field {
#define VMX_EPTP_WB_BIT (1ull << 14)
#define VMX_EPT_2MB_PAGE_BIT (1ull << 16)
#define VMX_EPT_1GB_PAGE_BIT (1ull << 17)
+#define VMX_EPT_AD_BIT (1ull << 21)
#define VMX_EPT_EXTENT_INDIVIDUAL_BIT (1ull << 24)
#define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25)
#define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26)
@@ -415,11 +418,14 @@ enum vmcs_field {
#define VMX_EPT_MAX_GAW 0x4
#define VMX_EPT_MT_EPTE_SHIFT 3
#define VMX_EPT_GAW_EPTP_SHIFT 3
+#define VMX_EPT_AD_ENABLE_BIT (1ull << 6)
#define VMX_EPT_DEFAULT_MT 0x6ull
#define VMX_EPT_READABLE_MASK 0x1ull
#define VMX_EPT_WRITABLE_MASK 0x2ull
#define VMX_EPT_EXECUTABLE_MASK 0x4ull
#define VMX_EPT_IPAT_BIT (1ull << 6)
+#define VMX_EPT_ACCESS_BIT (1ull << 8)
+#define VMX_EPT_DIRTY_BIT (1ull << 9)
#define VMX_EPT_IDENTITY_PAGETABLE_ADDR 0xfffbc000ul
View
17 arch/x86/kernel/apic/apic.c
@@ -2143,6 +2143,23 @@ int default_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
}
/*
+ * Override the generic EOI implementation with an optimized version.
+ * Only called during early boot when only one CPU is active and with
+ * interrupts disabled, so we know this does not race with actual APIC driver
+ * use.
+ */
+void __init apic_set_eoi_write(void (*eoi_write)(u32 reg, u32 v))
+{
+ struct apic **drv;
+
+ for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
+ /* Should happen once for each apic */
+ WARN_ON((*drv)->eoi_write == eoi_write);
+ (*drv)->eoi_write = eoi_write;
+ }
+}
+
+/*
* Power management
*/
#ifdef CONFIG_PM
View
3 arch/x86/kernel/cpu/hypervisor.c
@@ -37,6 +37,9 @@ static const __initconst struct hypervisor_x86 * const hypervisors[] =
#endif
&x86_hyper_vmware,
&x86_hyper_ms_hyperv,
+#ifdef CONFIG_KVM_GUEST
+ &x86_hyper_kvm,
+#endif
};
const struct hypervisor_x86 *x86_hyper;
View
64 arch/x86/kernel/kvm.c
@@ -39,6 +39,9 @@
#include <asm/desc.h>
#include <asm/tlbflush.h>
#include <asm/idle.h>
+#include <asm/apic.h>
+#include <asm/apicdef.h>
+#include <asm/hypervisor.h>
static int kvmapf = 1;
@@ -283,6 +286,22 @@ static void kvm_register_steal_time(void)
cpu, __pa(st));
}
+static DEFINE_PER_CPU(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED;
+