Permalink
Browse files

added introduction

  • Loading branch information...
1 parent 19be34a commit dd51e5da7b03872972443cee0e0579ff42f7d937 Shinpei Kato committed Oct 11, 2012
Showing with 204 additions and 48 deletions.
  1. +27 −0 draft/abstract.tex
  2. +8 −38 draft/draft.tex
  3. +75 −0 draft/introduction.tex
  4. +94 −10 draft/references.bib
View
@@ -0,0 +1,27 @@
+\begin{abstract}
+ Cyber-physical systems (CPS) often control complex physical
+ phenomena.
+ The computational workload of control algorithms, hence, is becoming a
+ core challenge of CPS due to their real-time constraints.
+ By nature, control algorithms of CPS exhibit a high degree of data
+ parallelism, which can be offloaded to parallel compute devices,
+ such as graphics processing units (GPUs).
+ Yet another problem is introduced by the communication between the host
+ processor and the compute device.
+ As a matter of fact, plasma control requires a sampling period on the
+ order of a few microseconds, while today's systems may take several
+ tens of microseconds to copy data between the host and the device
+ memory at the scale of the required data size.
+ In this paper, we present a zero-copy I/O processing scheme that
+ enables sensor and actuator devices to directly transfer data to and
+ from compute devices without using the host processor.
+ The basic idea behind this scheme is to map the I/O address space onto
+ the device memory, removing data-copy operations upon the host memory.
+ The experimental results from the real-world plasma control
+ system demonstrate that a sampling period of plasma control can be
+ reduced by 33\% under the zero-copy I/O scheme.
+ The microbenchmarking results also show that GPU-accelerated matrix
+ computations can be completed in 34\% less time than current methods,
+ while effective data throughput is at least as good as the current best
+ performers.
+\end{abstract}
View
@@ -80,16 +80,16 @@
% e-mail address with \email.
%
\alignauthor Shinpei Kato\\
- \affaddr{Department of Information Engineering}\\
+ \affaddr{Dept. Information Engineering}\\
\affaddr{Nagoya University}
\and
-\alignauthor Jason Aumiller and Scott Brandt\\
- \affaddr{Department of Computer Science}\\
- \affaddr{University of California, Santa Cruz}
-\and
\alignauthor Nikolaus Rath\\
- \affaddr{Department of Applied Physics and Applied Mathematics}\\
+ \affaddr{Dept. Applied Physics and Applied Mathematics}\\
\affaddr{Columbia University}
+\and
+\alignauthor Jason Aumiller and Scott Brandt\\
+ \affaddr{Dept. Computer Science}\\
+ \affaddr{University of California, Santa Cruz}
}
% There's nothing stopping you putting the seventh, eighth, etc.
% author on the opening page (as the 'third row') but we ask,
@@ -109,40 +109,10 @@
% need for camera-ready
\thispagestyle{empty}
-\begin{abstract}
- Cyber-physical systems (CPS) often control complex physical
- phenomenon.
- The computational workload of control algorithms, hence, is becoming a
- core challenge of CPS due to their real-time constraints.
- By nature, CPS control algorithms exhibit a high degree of data
- parallelism, which can be offloaded to parallel compute devices,
- such as graphics processing units (GPUs).
- Yet another problem is introduced by the communication overhead between
- the host processor and the compute device.
- As a matter of fact, plasma fusion requires a sampling period of a few
- microseconds, while today's systems may take several ten microseconds
- to copy data between the host and the device memory at scale of the
- required data size.
- In this paper, we present a zero-copy I/O processing scheme that
- enables sensor and actuator devices to directly communicate with
- compute devices without accessing the host processor.
- This scheme maps the I/O address space to the device memory to remove
- data-copy operations with respect to the host memory.
- The experimental results from Columbia University's ``Tokamak'' fusion
- control system demonstrate that a sampling period of plasma fusion can
- be reduced by 33\% under the zero-copy I/O scheme.
- The microbenchmarking results also show that GPU-accelerated
- tasks can be completed in 34\% less time than current methods, while
- effective data throughput is at least as good as the best performers of
- current methods.
-\end{abstract}
-
+\input{abstract.tex}
\keywords{GPGPU, Zero-Copy I/O, Plasma Fusion}
-\section{Introduction}
-\label{sec:introduction}
-
-\cite{Kato_ATC11}.\cite{Kato_RTAS11}.
+\input{introduction.tex}
\bibliographystyle{abbrv}
\bibliography{references}
@@ -0,0 +1,75 @@
+\section{Introduction}
+\label{sec:introduction}
+
+Cyber-physical systems (CPS) are next generations of networked and
+embedded systems, tightly coupled with computations and physical
+elements to control physical phenomena.
+Control algorithms of CPS, therefore, are becoming more and more
+complex, which makes CPS distinguished from traditional safety-critical
+systems.
+In CPS applications, the real-fast is as important as the real-time,
+while only the real-time is a primary concern in safety-critical systems.
+This double-edged requirement of the real-time and the real-fast,
+however, has posed a core challenge of CPS platforms.
+
+\begin{figure}[tb]
+ \centering
+ \includegraphics[width=0.8\hsize]{eps/tokamak.eps}
+ \caption{The HBT-EP ``Tokamak'' at Columbia University.}
+ \label{fig:tokamak}
+\end{figure}
+
+Plasma control for fusion is an application of energy CPS, where
+complex algorithms must be computed at a very high rate.
+Figure~\ref{fig:tokamak} shows the HBT-EP Tokamak at Columbia
+University~\cite{Maurer_PPCF11,Rath_FED12} that magnetically controls
+the 3-D perturbed equilibrium state of the plasma~\cite{Boozer_PP99}.
+It is required to process 96 inputs and 64 outputs of 16-bit data at a
+sampling period of a few microseconds.
+An initial attempt of the Columbia team employed fast CPUs or FPGAs, but
+even the simplified algorithm failed to run within 20$\mu$s.
+An alternative approach was to parallelize the algorithm for the
+graphics processing unit (GPU) using CUDA~\cite{CUDA}, the most
+successful massively parallel computing technology.
+However, the current system for GPU computing is not designed to
+integrate sensor and actuator devices.
+This is largely attributed to the fact that the GPU computing stack is
+independent of I/O device drivers.
+Since it may take tens of microseconds to transfer hundreds of bytes
+between the CPU and the GPU, the current system does not allow plasma
+control to use the GPU.
+This is a significant problem not only for plasma control but also for
+many applications of CPS that utilize compute devices with I/O devices.
+
+To the best of our knowledge, there is currently no generic support for
+direct communication between the GPU and I/O devices, though a
+specialized proprietary product for InfiniBand networks is
+available~\cite{GPUDirect}.
+There are also pinned memory allocation methods available from current
+programming frameworks to reduce data-copy operations, but it is unclear
+if they are best suited for real-time GPU applications.
+Although GPUs have been increasingly utilized in the domain of
+CPS~\cite{Hirabayashi_REACTION12, Mangharam11, McNaughton_ICRA11,
+Michel_IROS07}, and GPU resource management techniques have been
+invented~\cite{Elliott_RTS12, Elliott_ECRTS12, Kato_RTAS11, Kato_RTSS11,
+Kato_ATC11, Kato_ATC12, Liu_PACT12}, an integration of I/O processing
+and GPUs remains an open problem.
+
+In this paper, we present a zero-copy I/O processing scheme for GPU
+applications.
+This scheme incorporates functions and their application programming
+interface (API) for I/O device drivers to directly transfer data to and
+from GPU memory space, removing additional data-copy operations between
+the CPU and the GPU.
+We also investigate existing approaches, and compare them to the
+presented zero-copy I/O processing scheme.
+Our case study uses Columbia University's Tokamak plasma control
+system to evaluate the reduced sampling period of plasma control.
+In order to evaluate more generic properties of I/O processing schemes,
+we further provide microbenchmarks, and discuss the pros and cons of each
+scheme.
+By clarifying GPU capabilities, we aim to not only improve the overall
+performance but also broaden the scope of CPS that can benefit from the
+use of GPU technology.
+
+
View
@@ -1,10 +1,39 @@
+@article{Boozer_PP99,
+author = {A.H. Boozer},
+title = {{Perturbed plasma equilibria}},
+journal = {Physics of Plasmas},
+volume = {6},
+year = {1999}
+}
+
+@inproceedings{Elliott10,
+author = {G. Elliott and J. Anderson},
+title = {{Real-Time Multiprocessor Systems with GPUs}},
+booktitle = {Proceedings of the International Conference on Real-Time and Network Systems},
+year = {2010}
+}
+
+@inproceedings{Hirabayashi_REACTION12,
+author = {M. Hirabayashi and S. Kato and M. Edahiro and Y. Sugiyama},
+title = {{Toward GPU-accelerated traffic simulation and its real-time challenge}},
+booktitle = {Proc. of the International Workshop on Real-Time and Distributed Computing in Emerging Applications},
+year = {2012 (to appear)}
+}
+
@inproceedings{Kato_ATC11,
author = {S. Kato and K. Lakshmanan and R. Rajkumar and Y. Ishikawa},
title = {{TimeGraph: GPU Scheduling for Real-Time Multi-Tasking Environments}},
booktitle = {Proceedings of the USENIX Annual Technical Conference},
year = {2011}
}
+@inproceedings{Kato_ATC12,
+author = {S. Kato and M. McThrow and C. Maltzahn and S. Brandt},
+title = {{Gdev: First-Class GPU Resource Management in the Operating System}},
+booktitle = {Proc. of the USENIX Annual Technical Conference},
+year = {2012},
+}
+
@inproceedings{Kato_RTAS11,
author = {S. Kato and K. Lakshmanan and Y. Ishikawa and R. Rajkumar},
title = {{Resource Sharing in GPU-accelerated Windowing Systems}},
@@ -28,6 +57,52 @@ @inproceedings{Kim_RTSS12
year = {2012}
}
+@inproceedings{Liu_PACT12,
+author = {C. Liu and J. Li and W. Huang and J. Rubio and E. Speight and X. Lin},
+title = {{Power-Efficient Time-Sensitive Mapping in Heterogeneous Systems}},
+booktitle = {Proc. of the International Conference on Parallel Architectures and Compilation Techniques},
+year = {2012}
+}
+
+@inproceedings{Mangharam11,
+author = {R. Mangharam and A. Saba},
+title = {{Anytime Algorithms for GPU Architectures}},
+booktitle = {Proc. of the IEEE Real-Time Systems Symposium},
+pages = {47--56},
+year = {2011}
+}
+
+@article{Maurer_PPCF11,
+author = {D.A. Maurer and J. Bialek and P.J. Byrne and B. De Bono and J.P. Levesque and B.Q. Li and others},
+title = {{The high beta tokamak-extended pulse magnetohydrodynamic mode control research program}},
+journal = {Plasma Physics and Controlled Fusion},
+volume = {53},
+year = {2011}
+}
+
+@inproceedings{McNaughton_ICRA11,
+author = {M. McNaughton and C. Urmson and J. Dolan and J-W. Lee},
+title = {{Motion Planning for Autonomous Driving with a Conformal Spatiotemporal Lattice}},
+booktitle = {Proc. of the IEEE International Conference on Robotics and Automation},
+pages = {4889--4895},
+year = {2011}
+}
+
+@inproceedings{Michel_IROS07,
+author = {P. Michel and J. Chestnutt and S. Kagami and K. Nishiwaki and J. Kuffner and T. Kanade},
+title = {{GPU-accelerated Real-Time 3D Tracking for Humanoid Locomotion and Stair Climbing}},
+booktitle = {Proc. of the IEEE/RSJ International Conference on Intelligent Robots and Systems},
+pages = {463--469},
+year = {2007}
+}
+
+@article{Rath_FED12,
+author = {N. Rath and J. Bialek and P.J. Byrne and B. DeBono and J.P. Levesque and B. Li and M.E. Mauel and D.A. Maurer and G.A. Navratil and D. Shiraki},
+title = {{High-speed, multi-input, multi-output control using GPU processing in the HBT-EP tokamak}},
+journal = {Fusion Engineering and Design},
+year = {2012}
+}
+
@inproceedings{Rossbach_SOSP11,
author = {C. Rossbach and J. Currey and M. Silberstein and B. Ray and E. Witchel},
title = {{PTask: Operating system abstractions to manage GPUs as compute devices}},
@@ -43,14 +118,12 @@ @misc{Fermi
@misc{CUDA,
author = {NVIDIA},
-title = {{CUDA C Programming Guide}},
-howpublished = {\url{http://developer.nvidia.com/nvidia-gpu-computing-documentation}},
+title = {{CUDA C Programming Guide Version 4.2}},
+year = {2012}
}
-
-
@inproceedings{Al-Kiswany08,
author = {S. Al-Kiswany and A. Gharaibeh and E. Santos-Neto and G. Yuan and M. Ripeanu},
title = {{StoreGPU: Exploiting Graphics Processing Units to Accelerate Distributed Storage Systems}},
@@ -110,11 +183,22 @@ @article{Dowty09
year = {2009}
}
-@inproceedings{Elliott10,
+@inproceedings{Elliott_ECRTS12,
author = {G. Elliott and J. Anderson},
-title = {{Real-Time Multiprocessor Systems with GPUs}},
-booktitle = {Proceedings of the International Confrence on Real-Time and Network Systems},
-year = {2010}
+title = {{Robust Real-Time Multiprocessor Interrupt Handling Motivated by GPUs}},
+booktitle = {Proc. of the Euromicro Conference on Real-Time Systems},
+pages = {267--276},
+year = {2012}
+}
+
+@article{Elliott_RTS12,
+author = {G. Elliott and J. Anderson},
+title = {{Globally Scheduled Real-Time Multiprocessor Systems with GPUs}},
+journal = {Real-Time Systems},
+volume = {48},
+number = {1},
+pages = {34--74},
+year = {2012}
}
@inproceedings{Gharaibeh10,
@@ -221,10 +305,10 @@ @article{Liu73
year = {1973}
}
-@misc{Mellanox,
+@misc{GPUDirect,
author = {Mellanox},
title = {{NVIDIA GPUDirect Technology-- Accelerating GPU-based Systems (Whitepaper)}},
-howpublished = {\url{http://www.mellanox.com/pdf/whitepapers/TB_GPU_Direct.pdf}}
+year = {2010}
}
@misc{Mesa3D,

0 comments on commit dd51e5d

Please sign in to comment.