Skip to content
Browse files

tracing: Do not allocate buffer for trace_marker

When doing intense tracing, the kmalloc inside trace_marker can
introduce side effects to what is being traced.

As trace_marker() is used by userspace to inject data into the
kernel ring buffer, it needs to do so with the least amount
of intrusion to the operations of the kernel or the user space
application.

As the ring buffer is designed to write directly into the buffer
without the need to make a temporary buffer, and userspace already
went through the hassle of knowing how big the write will be,
we can simply pin the userspace pages and write the data directly
into the buffer. This improves the impact of tracing via trace_marker
tremendously!

Thanks to Peter Zijlstra and Thomas Gleixner for pointing out the
use of get_user_pages_fast() and kmap_atomic().

Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Suggested-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
  • Loading branch information...
1 parent e0a413f commit d696b58ca2c3ca76e784ef89a7e0453d9b7ab187 Steven Rostedt committed with Sep 22, 2011
Showing with 83 additions and 28 deletions.
  1. +83 −28 kernel/trace/trace.c
View
111 kernel/trace/trace.c
@@ -3628,51 +3628,106 @@ tracing_free_buffer_release(struct inode *inode, struct file *filp)
return 0;
}
-static int mark_printk(const char *fmt, ...)
-{
- int ret;
- va_list args;
- va_start(args, fmt);
- ret = trace_vprintk(0, fmt, args);
- va_end(args);
- return ret;
-}
-
static ssize_t
tracing_mark_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *fpos)
{
- char *buf;
- size_t written;
+ unsigned long addr = (unsigned long)ubuf;
+ struct ring_buffer_event *event;
+ struct ring_buffer *buffer;
+ struct print_entry *entry;
+ unsigned long irq_flags;
+ struct page *pages[2];
+ int nr_pages = 1;
+ ssize_t written;
+ void *page1;
+ void *page2;
+ int offset;
+ int size;
+ int len;
+ int ret;
if (tracing_disabled)
return -EINVAL;
if (cnt > TRACE_BUF_SIZE)
cnt = TRACE_BUF_SIZE;
- buf = kmalloc(cnt + 2, GFP_KERNEL);
- if (buf == NULL)
- return -ENOMEM;
+ /*
+ * Userspace is injecting traces into the kernel trace buffer.
+ * We want to be as non intrusive as possible.
+ * To do so, we do not want to allocate any special buffers
+ * or take any locks, but instead write the userspace data
+ * straight into the ring buffer.
+ *
+ * First we need to pin the userspace buffer into memory,
+ * which, most likely it is, because it just referenced it.
+ * But there's no guarantee that it is. By using get_user_pages_fast()
+ * and kmap_atomic/kunmap_atomic() we can get access to the
+ * pages directly. We then write the data directly into the
+ * ring buffer.
+ */
+ BUILD_BUG_ON(TRACE_BUF_SIZE >= PAGE_SIZE);
- if (copy_from_user(buf, ubuf, cnt)) {
- kfree(buf);
- return -EFAULT;
+ /* check if we cross pages */
+ if ((addr & PAGE_MASK) != ((addr + cnt) & PAGE_MASK))
+ nr_pages = 2;
+
+ offset = addr & (PAGE_SIZE - 1);
+ addr &= PAGE_MASK;
+
+ ret = get_user_pages_fast(addr, nr_pages, 0, pages);
+ if (ret < nr_pages) {
+ while (--ret >= 0)
+ put_page(pages[ret]);
+ written = -EFAULT;
+ goto out;
}
- if (buf[cnt-1] != '\n') {
- buf[cnt] = '\n';
- buf[cnt+1] = '\0';
+
+ page1 = kmap_atomic(pages[0]);
+ if (nr_pages == 2)
+ page2 = kmap_atomic(pages[1]);
+
+ local_save_flags(irq_flags);
+ size = sizeof(*entry) + cnt + 2; /* possible \n added */
+ buffer = global_trace.buffer;
+ event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
+ irq_flags, preempt_count());
+ if (!event) {
+ /* Ring buffer disabled, return as if not open for write */
+ written = -EBADF;
+ goto out_unlock;
+ }
+
+ entry = ring_buffer_event_data(event);
+ entry->ip = _THIS_IP_;
+
+ if (nr_pages == 2) {
+ len = PAGE_SIZE - offset;
+ memcpy(&entry->buf, page1 + offset, len);
+ memcpy(&entry->buf[len], page2, cnt - len);
} else
- buf[cnt] = '\0';
+ memcpy(&entry->buf, page1 + offset, cnt);
- written = mark_printk("%s", buf);
- kfree(buf);
- *fpos += written;
+ if (entry->buf[cnt - 1] != '\n') {
+ entry->buf[cnt] = '\n';
+ entry->buf[cnt + 1] = '\0';
+ } else
+ entry->buf[cnt] = '\0';
+
+ ring_buffer_unlock_commit(buffer, event);
- /* don't tell userspace we wrote more - it might confuse them */
- if (written > cnt)
- written = cnt;
+ written = cnt;
+ *fpos += written;
+
+ out_unlock:
+ if (nr_pages == 2)
+ kunmap_atomic(page2);
+ kunmap_atomic(page1);
+ while (nr_pages > 0)
+ put_page(pages[--nr_pages]);
+ out:
return written;
}

0 comments on commit d696b58

Please sign in to comment.
Something went wrong with that request. Please try again.