Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

first commit

  • Loading branch information...
commit b61e3a8e2e9212e985f8fef84312ef4adf29c720 1 parent 5a62dd1
Shinpei Kato authored
Showing with 1,812 additions and 2 deletions.
  1. +0 −2  README.md
  2. BIN  draft/.DS_Store
  3. BIN  draft/.hg/00changelog.i
  4. +1 −0  draft/.hg/branch
  5. +2 −0  draft/.hg/cache/branchheads
  6. +2 −0  draft/.hg/cache/tags
  7. BIN  draft/.hg/dirstate
  8. +2 −0  draft/.hg/hgrc
  9. +4 −0 draft/.hg/requires
  10. +5 −0 draft/.hg/sourcetreeconfig
  11. BIN  draft/.hg/store/00changelog.i
  12. BIN  draft/.hg/store/00manifest.i
  13. BIN  draft/.hg/store/data/20120720-_hot_power-ja-draft.pdf.d
  14. BIN  draft/.hg/store/data/20120720-_hot_power-ja-draft.pdf.i
  15. BIN  draft/.hg/store/data/_hot_power-figure.docx.d
  16. BIN  draft/.hg/store/data/_hot_power-figure.docx.i
  17. BIN  draft/.hg/store/data/abstract.tex.i
  18. BIN  draft/.hg/store/data/analysis.tex.i
  19. BIN  draft/.hg/store/data/background.tex.i
  20. BIN  draft/.hg/store/data/conc.tex.i
  21. BIN  draft/.hg/store/data/evalu.tex.i
  22. BIN  draft/.hg/store/data/experiment.tex.i
  23. BIN  draft/.hg/store/data/figures/a.pdf.i
  24. BIN  draft/.hg/store/data/figures/idol.pdf.i
  25. BIN  draft/.hg/store/data/figures/madd-gdev-nvidia-energy.pdf.i
  26. BIN  draft/.hg/store/data/figures/madd-gdev-nvidia-time.pdf.i
  27. BIN  draft/.hg/store/data/figures/madd-nvidia-energy.pdf.i
  28. BIN  draft/.hg/store/data/figures/madd-nvidia-time.pdf.i
  29. BIN  draft/.hg/store/data/figures/madd-time-power.pdf.i
  30. BIN  draft/.hg/store/data/figures/mmul-gdev-nvidia-energy.pdf.i
  31. BIN  draft/.hg/store/data/figures/mmul-gdev-nvidia-time.pdf.i
  32. BIN  draft/.hg/store/data/figures/mmul-nvidia-energy.pdf.i
  33. BIN  draft/.hg/store/data/figures/mmul-nvidia-time.pdf.i
  34. BIN  draft/.hg/store/data/figures/perf-per-watt-abe.pdf.i
  35. BIN  draft/.hg/store/data/figures/perf__per__watt.pdf.i
  36. BIN  draft/.hg/store/data/figures/rodinia-energy.pdf.i
  37. BIN  draft/.hg/store/data/figures/rodinia-time.pdf.i
  38. BIN  draft/.hg/store/data/intro.tex.i
  39. BIN  draft/.hg/store/data/main.pdf.d
  40. BIN  draft/.hg/store/data/main.pdf.i
  41. BIN  draft/.hg/store/data/main.tex.i
  42. BIN  draft/.hg/store/data/mediabb.sty.i
  43. BIN  draft/.hg/store/data/multirow.sty.i
  44. BIN  draft/.hg/store/data/poster__usenix-atc.xlsx.d
  45. BIN  draft/.hg/store/data/poster__usenix-atc.xlsx.i
  46. BIN  draft/.hg/store/data/refer.bib.i
  47. BIN  draft/.hg/store/data/relatedwork.tex.i
  48. BIN  draft/.hg/store/data/subfigure.sty.i
  49. BIN  draft/.hg/store/data/usenix-kato.sty.i
  50. BIN  draft/.hg/store/data/usenix.sty.i
  51. +38 −0 draft/.hg/store/fncache
  52. BIN  draft/HotPower-figure.docx
  53. +24 −0 draft/abstract.tex
  54. +14 −0 draft/analysis.tex
  55. +17 −0 draft/background.tex
  56. +22 −0 draft/bibspacing.sty
  57. +24 −0 draft/conclusion.tex
  58. +202 −0 draft/evaluation.tex
  59. +65 −0 draft/experiment.tex
  60. BIN  draft/figures/a.pdf
  61. BIN  draft/figures/idol.pdf
  62. BIN  draft/figures/madd-gdev-nvidia-energy.pdf
  63. BIN  draft/figures/madd-gdev-nvidia-time.pdf
  64. BIN  draft/figures/madd-nvidia-energy.pdf
  65. BIN  draft/figures/madd-nvidia-time.pdf
  66. BIN  draft/figures/madd-time-power.pdf
  67. BIN  draft/figures/mmul-gdev-nvidia-energy.pdf
  68. BIN  draft/figures/mmul-gdev-nvidia-time.pdf
  69. BIN  draft/figures/mmul-nvidia-energy.pdf
  70. BIN  draft/figures/mmul-nvidia-time.pdf
  71. BIN  draft/figures/perf-per-watt-abe.pdf
  72. BIN  draft/figures/perf_per_watt.pdf
  73. BIN  draft/figures/rodinia-energy.pdf
  74. BIN  draft/figures/rodinia-time.pdf
  75. +67 −0 draft/introduction.tex
  76. +64 −0 draft/main.aux
  77. +76 −0 draft/main.bbl
  78. +50 −0 draft/main.blg
  79. BIN  draft/main.dvi
  80. +224 −0 draft/main.log
  81. BIN  draft/main.pdf
  82. +68 −0 draft/main.tex
  83. +20 −0 draft/make.sh
  84. +99 −0 draft/mediabb.sty
  85. +159 −0 draft/multirow.sty
  86. +79 −0 draft/platform.tex
  87. BIN  draft/poster_usenix-atc.xlsx
  88. +126 −0 draft/refer.bib
  89. +66 −0 draft/relatedwork.tex
  90. +104 −0 draft/subfigure.sty
  91. +94 −0 draft/usenix-kato.sty
  92. +94 −0 draft/usenix.sty
View
2  README.md
@@ -1,2 +0,0 @@
-hotpower12
-==========
View
BIN  draft/.DS_Store
Binary file not shown
View
BIN  draft/.hg/00changelog.i
Binary file not shown
View
1  draft/.hg/branch
@@ -0,0 +1 @@
+default
View
2  draft/.hg/cache/branchheads
@@ -0,0 +1,2 @@
+66662897816fe1f74add0762c6025043ea86e71a 1
+66662897816fe1f74add0762c6025043ea86e71a default
View
2  draft/.hg/cache/tags
@@ -0,0 +1,2 @@
+1 66662897816fe1f74add0762c6025043ea86e71a
+
View
BIN  draft/.hg/dirstate
Binary file not shown
View
2  draft/.hg/hgrc
@@ -0,0 +1,2 @@
+[paths]
+default = /Users/abe/Dropbox/soc-lab/abe/20120713-hotpower-ja.draft
View
4 draft/.hg/requires
@@ -0,0 +1,4 @@
+revlogv1
+fncache
+store
+dotencode
View
5 draft/.hg/sourcetreeconfig
@@ -0,0 +1,5 @@
+lastUsedView=0
+autorefresh=1
+savedIncoming=0
+lastCheckedRemotes=2012-07-18 00:53:05 +0000
+savedOutgoing=0
View
BIN  draft/.hg/store/00changelog.i
Binary file not shown
View
BIN  draft/.hg/store/00manifest.i
Binary file not shown
View
BIN  draft/.hg/store/data/20120720-_hot_power-ja-draft.pdf.d
Binary file not shown
View
BIN  draft/.hg/store/data/20120720-_hot_power-ja-draft.pdf.i
Binary file not shown
View
BIN  draft/.hg/store/data/_hot_power-figure.docx.d
Binary file not shown
View
BIN  draft/.hg/store/data/_hot_power-figure.docx.i
Binary file not shown
View
BIN  draft/.hg/store/data/abstract.tex.i
Binary file not shown
View
BIN  draft/.hg/store/data/analysis.tex.i
Binary file not shown
View
BIN  draft/.hg/store/data/background.tex.i
Binary file not shown
View
BIN  draft/.hg/store/data/conc.tex.i
Binary file not shown
View
BIN  draft/.hg/store/data/evalu.tex.i
Binary file not shown
View
BIN  draft/.hg/store/data/experiment.tex.i
Binary file not shown
View
BIN  draft/.hg/store/data/figures/a.pdf.i
Binary file not shown
View
BIN  draft/.hg/store/data/figures/idol.pdf.i
Binary file not shown
View
BIN  draft/.hg/store/data/figures/madd-gdev-nvidia-energy.pdf.i
Binary file not shown
View
BIN  draft/.hg/store/data/figures/madd-gdev-nvidia-time.pdf.i
Binary file not shown
View
BIN  draft/.hg/store/data/figures/madd-nvidia-energy.pdf.i
Binary file not shown
View
BIN  draft/.hg/store/data/figures/madd-nvidia-time.pdf.i
Binary file not shown
View
BIN  draft/.hg/store/data/figures/madd-time-power.pdf.i
Binary file not shown
View
BIN  draft/.hg/store/data/figures/mmul-gdev-nvidia-energy.pdf.i
Binary file not shown
View
BIN  draft/.hg/store/data/figures/mmul-gdev-nvidia-time.pdf.i
Binary file not shown
View
BIN  draft/.hg/store/data/figures/mmul-nvidia-energy.pdf.i
Binary file not shown
View
BIN  draft/.hg/store/data/figures/mmul-nvidia-time.pdf.i
Binary file not shown
View
BIN  draft/.hg/store/data/figures/perf-per-watt-abe.pdf.i
Binary file not shown
View
BIN  draft/.hg/store/data/figures/perf__per__watt.pdf.i
Binary file not shown
View
BIN  draft/.hg/store/data/figures/rodinia-energy.pdf.i
Binary file not shown
View
BIN  draft/.hg/store/data/figures/rodinia-time.pdf.i
Binary file not shown
View
BIN  draft/.hg/store/data/intro.tex.i
Binary file not shown
View
BIN  draft/.hg/store/data/main.pdf.d
Binary file not shown
View
BIN  draft/.hg/store/data/main.pdf.i
Binary file not shown
View
BIN  draft/.hg/store/data/main.tex.i
Binary file not shown
View
BIN  draft/.hg/store/data/mediabb.sty.i
Binary file not shown
View
BIN  draft/.hg/store/data/multirow.sty.i
Binary file not shown
View
BIN  draft/.hg/store/data/poster__usenix-atc.xlsx.d
Binary file not shown
View
BIN  draft/.hg/store/data/poster__usenix-atc.xlsx.i
Binary file not shown
View
BIN  draft/.hg/store/data/refer.bib.i
Binary file not shown
View
BIN  draft/.hg/store/data/relatedwork.tex.i
Binary file not shown
View
BIN  draft/.hg/store/data/subfigure.sty.i
Binary file not shown
View
BIN  draft/.hg/store/data/usenix-kato.sty.i
Binary file not shown
View
BIN  draft/.hg/store/data/usenix.sty.i
Binary file not shown
View
38 draft/.hg/store/fncache
@@ -0,0 +1,38 @@
+data/background.tex.i
+data/intro.tex.i
+data/figures/perf-per-watt-abe.pdf.i
+data/figures/rodinia-time.pdf.i
+data/figures/madd-nvidia-energy.pdf.i
+data/conc.tex.i
+data/poster_usenix-atc.xlsx.i
+data/figures/mmul-nvidia-energy.pdf.i
+data/relatedwork.tex.i
+data/poster_usenix-atc.xlsx.d
+data/main.pdf.d
+data/mediabb.sty.i
+data/figures/rodinia-energy.pdf.i
+data/main.pdf.i
+data/figures/madd-gdev-nvidia-time.pdf.i
+data/abstract.tex.i
+data/subfigure.sty.i
+data/figures/perf_per_watt.pdf.i
+data/evalu.tex.i
+data/main.tex.i
+data/figures/idol.pdf.i
+data/20120720-HotPower-ja-draft.pdf.i
+data/figures/a.pdf.i
+data/figures/mmul-gdev-nvidia-time.pdf.i
+data/20120720-HotPower-ja-draft.pdf.d
+data/figures/mmul-nvidia-time.pdf.i
+data/multirow.sty.i
+data/refer.bib.i
+data/usenix-kato.sty.i
+data/figures/madd-nvidia-time.pdf.i
+data/HotPower-figure.docx.d
+data/figures/madd-time-power.pdf.i
+data/usenix.sty.i
+data/figures/mmul-gdev-nvidia-energy.pdf.i
+data/figures/madd-gdev-nvidia-energy.pdf.i
+data/analysis.tex.i
+data/HotPower-figure.docx.i
+data/experiment.tex.i
View
BIN  draft/HotPower-figure.docx
Binary file not shown
View
24 draft/abstract.tex
@@ -0,0 +1,24 @@
+\begin{abstract}
+
+ Graphics processing units (GPUs) provide significant improvements in
+ performance and performance-per-watt as compared to traditional
+ multicore CPUs.
+ This energy-efficiency of GPUs has facilitated use of GPUs in many
+ application domains.
+ Albeit energy efficient, GPUs still consume non-trivial power
+ independently of CPUs.
+ It is desired to analyze the power and performance characteristics of
+ GPUs and their causal relation with CPUs.
+ In this paper, we provide a power and performance analysis of
+ GPU-accelerated systems for better understandings of these implications.
+ Our analysis discloses that system energy could be
+ reduced by about 28\% retaining a decrease in performance within 1\%.
+ Specifically, we identify that energy saving is particularly significant
+ when (i) reducing the GPU memory clock for compute-intensive workload
+ and (ii) reducing the GPU core clock for memory-intensive workload.
+ We also demonstrate that voltage and frequency scaling of CPUs is
+ trivial and even should not be applied in GPU-accelerated systems.
+ We believe that these findings are useful to develop dynamic voltage and
+ frequency scaling (DVFS) algorithms for GPU-accelerated systems.
+
+\end{abstract}
View
14 draft/analysis.tex
@@ -0,0 +1,14 @@
+\section{Power and Performance Analysis}
+
+本稿では,GPUとCPUにおける電圧と動作周波数を変化させた場合における性能と消費エ
+ネルギーの解析を行う.GPUでは,メモリとコアの動作周波数を変更可能である.本稿で
+は,GPUがカーネルを実行する際には,動作周波数の変更を行わないものとする.本稿で
+の解析は,最大の動作周波数で実行する場合と比較して消費エネルギーを削減可能な場
+合がどういう場合であるかを調査する.まず,GPUで電圧と動作周波数を下げた場合に,
+消費電力が下がるのかについて調査を行う.その後,CPUとGPU両方のワークロードを含
+むベンチマークプログラムを用い消費エネルギーを削減可能な場合について考察してい
+く.電圧と動作周波数を低下させた場合に,消費エネルギーが削減可能な場合を知るこ
+とで,今後GPUを搭載したシステムにおいてDVFSを適用し,消費エネルギーを削減するア
+ルゴリズムを考案するための知見を得る.したがって,GPUを搭載したシステムにおいて
+は,CPU,GPUのコア,メモリのそれぞれで電圧と動作周波数を変更可能であるため,ど
+ういった場合にどの電圧と動作周波数を下げるべきかについて解析する.
View
17 draft/background.tex
@@ -0,0 +1,17 @@
+\section{Background and Motivation}
+\label{sec:background}
+
+GPU technology has continued to enhance computing capabilities with
+highly parallel architectures.
+NVIDIA GPUs, for example, have integrated on a chip about 250 cores in
+2008, 500 cores in 2010, and 1500 cores in 2012.
+Beyond the number of processing cores, recent GPUs implement
+hierarchical cache boosting memory performance in the presence of
+thousands of threads.
+The resulting peak performance of NVIDIA's latest architecture,
+\textit{a.k.a., Kepler}, exceeds 3TFlops and its performance-per-watt
+reaches 16GFlops/W.
+All these advantages of GPUs are made available for general-purpose
+applications due to emerging programming frameworks, such as CUDA and
+OpenCL.
+
View
22 draft/bibspacing.sty
@@ -0,0 +1,22 @@
+\newdimen\bibindent
+\setlength\bibindent{1.5em}
+\newdimen\bibspacing
+\setlength\bibspacing\z@
+\renewenvironment{thebibliography}[1]{%
+ \section*{\refname
+ \@mkboth{\MakeUppercase\refname}{\MakeUppercase\refname}}%
+ \list{\@biblabel{\@arabic\c@enumiv}}%
+ {\settowidth\labelwidth{\@biblabel{#1}}%
+ \leftmargin\labelwidth
+ \advance\leftmargin\labelsep
+ \itemsep\z@skip % should this be commented out?
+ \parsep\z@skip % should this be commented out?
+ \@openbib@code
+ \usecounter{enumiv}%
+ \let\p@enumiv\@empty
+ \renewcommand\theenumiv{\@arabic\c@enumiv}}%
+ \sloppy\clubpenalty4000\widowpenalty4000%
+ \sfcode`\.\@m}
+ {\def\@noitemerr
+ {\@latex@warning{Empty `thebibliography' environment}}%
+ \endlist}
View
24 draft/conclusion.tex
@@ -0,0 +1,24 @@
+\section{Conclusion and Future Work}
+\label{sec:conclusion}
+
+We have presented a power and performance analysis of GPU-accelerated
+systems based on the NVIDIA Fermi architecture.
+Our findings include that the CPU is a weak factor for energy savings of
+GPU-accelerated systems unless power gating is supported by the GPU.
+In contrast, voltage and frequency scaling of the GPU is significant to
+reduce energy consumption.
+Our experimental results demonstrated that system energy could be reduced
+by about 28\% retaining a decrease in performance within 1\%, if the
+performance level of the GPU can be scaled effectively.
+
+In future work, we plan to develop DVFS algorithms for GPU-accelerated
+systems, using the characteristic identified in this paper.
+We basically consider such an approach that controls the GPU core clock
+for memory-intensive workload while controlling the GPU memory clock for
+compute-intensive workload.
+To this end, we integrate PTX code analysis~\cite{Hong2009,Hong2010}
+into DVFS algorithms so that energy optimization can be provided at
+runtime.
+We also consider a further dynamic scheme that scales the performance
+level of the GPU during the execution of GPU code, whereas we restricted
+a scaling point at the boundary of GPU code in this paper.
View
202 draft/evaluation.tex
@@ -0,0 +1,202 @@
+\section{Evaluation}
+\label{sec:evaluation}
+
+\begin{figure}[!t]
+\centering
+ \includegraphics[width=0.45\textwidth]{figures/madd-time-power.pdf}
+ \caption{Power consumption and execution time of the $512\times512$
+ matrix addition program.}
+ \label{fig:madd-time-power}
+\end{figure}
+
+We first investigate the impact of GPU core and memory clocks on
+GPU-intensive workload executing twenty thousand loops of $512\times512$
+matrix addition.
+The voltage and frequency of the GPU are changed three times during the
+operation, while the CPU is fixed at the minimum level to focus on the
+behavior of the GPU.
+Figure \ref{fig:madd-time-power} shows the power consumption of the
+system in this setup, where ``c-*'' and ``m-*'' stand for the GPU core
+and memory clocks respectively, while ``E*'' represents the cumulative
+energy consumption of the corresponding duration.
+What is learned from this experiment is that energy consumption is
+sensitive to the GPU core and memory clocks.
+Lowering the memory clock to 135MHz successfully reduces energy
+consumption, but the further downscaling of the core clock to 405MHz
+counter-increases energy consumption.
+This indeed implies DVFS algorithms dominate the power and performance
+of GPU-accelerated systems.
+
+We next coordinate the GPU and the CPU using more realistic workload
+from the Rodinia benchmark suite.
+To simplify the setup, we consider only high (maximum) and low
+(minimum) core clocks, meaning that we evaluate four configurations of
+(GPU-L, CPU-L), (GPU-H, CPU-L), (GPU-L, CPU-H), and (GPU-H, CPU-H),
+where ``*-L'' and ``*-H'' represent the low and high core clocks.
+In an idle state, however, the clocks are always down-scaled to the
+minimum level.
+We also add another configuration that keeps at the maximum clocks even
+though the GPU is idle, in order to see the impact of elementary
+coordinated DVFS on GPU-accelerated systems.
+Figure~\ref{fig:rodinia-time}~and~\ref{fig:rodinia-energy} respectively
+show the execution time and energy consumption of four representative
+programs of the Rodinia benchmark suite.
+Regarding the execution time, ``all-H'' always takes the shortest
+execution time, as it consistently keeps at the maximum performance level.
+Other configurations however depend on workload.
+For example, the execution time of \texttt{heartwall} -- GPU-intensive
+workload -- can be decreased by setting the high GPU clock, whereas that
+of \texttt{hotspot} is rather affected by the CPU clock.
+The characteristic of energy consumption is more complicated.
+For some workload, lowering the clock causes an increase in energy
+consumption, as the duration of execution is increased, resulting in higher
+cumulative energy consumption.
+In other words, GPU-intensive workload should generally use the high GPU
+clock so that it completes operation as soon as possible to minimize
+energy.
+Apparently, ``all-H'' is not a good idea in terms of energy; the clock
+should be minimized when the device is not used.
+Hence, DVFS is certainly desired but the design of its algorithms is
+left as an open issue.
+
+\begin{figure}[!t]
+\centering
+ \includegraphics[width=0.45\textwidth]{figures/rodinia-time.pdf}
+ \caption{Execution time of the Rodinia programs.}
+ \label{fig:rodinia-time}
+\end{figure}
+
+\begin{figure}[!t]
+\centering
+ \includegraphics[width=0.45\textwidth]{figures/rodinia-energy.pdf}
+ \caption{Energy consumption of the Rodinia programs.}
+ \label{fig:rodinia-energy}
+\end{figure}
+
+\begin{figure}[!t]
+\centering
+ \includegraphics[width=0.45\textwidth]{figures/idol.pdf}
+ \caption{Power consumption in an idle state.}
+ \label{fig:idle}
+\end{figure}
+
+\begin{figure*}[!t]
+ \centering
+ \subfigure[\texttt{Matrix Addition}]
+ {\includegraphics[width=0.45\textwidth]{figures/madd-nvidia-time.pdf}
+ \label{madd-time}}
+ \subfigure[\texttt{Matrix Multiplication}]
+ {\includegraphics[width=0.45\textwidth]{figures/mmul-nvidia-time.pdf}
+ \label{mmul-time}}
+ \vspace{-5.0mm}
+ \caption{Execution time of the matrix addition and multiplication
+ programs.}
+ \label{fig:gpu-time}
+\end{figure*}
+
+\begin{figure*}[!t]
+ \centering
+ \subfigure[\texttt{Matrix Addition}]
+ {\includegraphics[width=0.45\textwidth]{figures/madd-nvidia-energy.pdf}
+ }
+ \subfigure[\texttt{Matrix Multiplication}]
+ {\includegraphics[width=0.45\textwidth]{figures/mmul-nvidia-energy.pdf}
+ }
+ \vspace{-5.0mm}
+ \caption{Energy consumption of the matrix addition and multiplication
+ programs.}
+ \label{fig:gpu-energy}
+\end{figure*}
+
+\begin{figure}[!t]
+ \centering
+ \subfigure[\texttt{Execution Time}]
+ {\includegraphics[width=0.45\textwidth]{figures/madd-gdev-nvidia-time.pdf}
+ }
+ \subfigure[\texttt{Energy Consumption}]
+ {\includegraphics[width=0.45\textwidth]{figures/madd-gdev-nvidia-energy.pdf}
+ }
+ \vspace{-5.0mm}
+ \caption{Comparison of the NVIDIA proprietary and the Gdev open-source
+ runtimes and drivers.}
+ \label{fig:nvidia-gdev}
+\end{figure}
+
+In the above experiments, we have never observed that energy consumption
+is reduced by lowering the CPU clock.
+This is because lowering the CPU clock causes the GPU to increase the
+duration of an idle state, and there is no power gating support for the
+GPU at the moment.
+Hence, energy is always wasted when the GPU is idle.
+We demonstrate how energy is wasted in an idle state, when (i) the GPU
+is not present and (ii) is present with three levels of a set of voltage
+and frequency.
+Figure~\ref{fig:idle} shows the average power consumption of those four
+cases obtained by running the system for 60 seconds.
+The CPU consumes no more than 38W on average, whereas the GPU-installed
+systems consume a different scale of power depending on the configured
+set of voltage and frequency.
+This observation encourages the system not to downscale the voltage and
+frequency of the CPU, unless the GPU supports power gating to totally
+cut off its consuming power.
+Another lesson learned from this experiment is that the power
+consumption of the GPU is significant even in an idle state, meaning
+that DVFS is strongly desired for the GPU with whatever overhead it has
+to pay for changing the performance level.
+
+The preceding evaluation indicates that the CPU is a weak factor for
+energy savings of GPU-accelerated systems.
+Henceforth, we restrict our attention to the GPU.
+According to the traditional power modeling~\cite{Hsu2001}, lowering the
+core clock is often effective for memory-intensive workload.
+Our next evaluation verifies if the same is true for the GPU.
+We use matrix addition and multiplication programs with varied sizes of
+data.
+A small size of data reduces memory accesses, while a large size of data
+makes the workload memory-intensive.
+Figure~\ref{fig:gpu-time}~and~\ref{fig:gpu-energy} show the execution
+time and energy consumption of those matrix computations, where ``s-*''
+represents the number of matrix row/column.
+A difference between ``s-64'' and ``s-8192'' explains that memory-clock
+scaling is more effective for such computations that use a smaller size
+of data.
+This is because the execution time of such computations is not
+affected by lowering the memory clock.
+Another observation is that energy cannot be saved by lowering the core
+clock, because these matrix computations are consistently
+compute-intensive.
+If the core clock is downscaled, their execution time is highly
+increased, which results in an increase in cumulative power
+consumption.
+
+Seen from the above experiments, system energy could be reduced
+by about 28\% retaining a decrease in performance within 1\%.
+These experimental results encourage that DVFS algorithms for
+GPU-accelerated systems should be weighted on the GPU rather than the CPU,
+though their energy optimization is very challenging, given many factors
+of design knobs including CPU/GPU, core/memory, and workload
+characteristics.
+
+Finally, we compare NVIDIA's proprietary software and Gdev.
+This is an important and practical investigation in that NVIDIA's
+proprietary software does not expose a system interface to change
+the voltage and frequency of the GPU dynamically at runtime, and hence
+the development of DVFS algorithms in future work will inevitably depend
+on Gdev.
+The basic performance of Gdev is competitive to NVIDIA's proprietary
+software~\cite{Kato2012}, but we have to evince that Gdev is also
+reliable for power management.
+The test program exploits matrix addition with varied sizes of data.
+Figure~\ref{fig:nvidia-gdev} shows the execution time and energy
+consumption of the matrix addition programs using different scales of
+GPU core clocks, where the GPU memory clock is fixed at 135MHz.
+In this experiment, ``s-8192'' benefits from lowering the core clock,
+because the workload is memory-bound due to a large size of matrix, and
+the execution time is not much affected by the core clock, while energy
+is effectively saved.
+The most remarkable observation is that NVIDIA's proprietary software
+and Gdev exhibit almost the same results on the execution time and energy
+consumption.
+This implies that the result of our on-going research using Gdev could
+be easily propagated to the real product, once energy management
+interfaces are employed by vendor's software.
View
65 draft/experiment.tex
@@ -0,0 +1,65 @@
+\section{Experimental Setup}
+
+本稿で使用するGPUの設定可能の動作周波数を表\ref{GPU-frequency}に示す.また,本
+稿で使用するCPUは,Intel社のCore i5 2400を使用し,選択可能な動作周波数を表
+\ref{CPU-frequency}に示す.主記憶メモリは,4GByteである.用いたベンチマークプロ
+グラムは,ヘテロジニアス環境向けのベンチマークセットである
+Rodinia2.0.1\cite{Che2010}と行列積と行列和を計算するプログラムである.入力サイ
+ズは,Rodiniaに関しては,動作する最大サイズ,行列積と行列和に関しては,動作する
+最大サイズを含む3種類ずつ用意した.
+
+次に,電力測定環境について述べる.電力測定装置には,横河電機社のWT1600を用い
+た.WT1600では,50[ms]ごとに消費電力の測定が可能である.測定する消費電力は,シス
+テム全体の消費電力を測定した.具体的には,電源プラグから電圧と電流を測定し,そ
+れらの積を計算することでシステム全体の消費電力を計測している.測定した消費電力
+の和を計算し消費エネルギーとする.
+
+GPUの電圧と動作周波数を変更する際,Nvidiaの公式ドライバでは,動作周波数をユーザ
+側で任意に変更するためには,ドライバをリロードする必要がある.そのため,ユーザ
+側で任意にプログラム実行中などに電圧と動作周波数の変更を行うことはできない.ま
+た,Nvidiaのドライバでは,GPUが動作しない状況では最低の動作周波数において待機
+する.GPUが動作する際に,指定した動作周波数で実行が行われる.一方で,Gdevは動作
+周波数を変更する際に,ドライバのリロードを行う必要はないが,メモリの動作周波数
+を変更することができず,135[MHz]で固定された値を取る.Gdevにおいて動作周波数を
+変更する際には,linuxのインターフェイスを用いたCPUの動作周波数を変更するのと同
+様に,あるファイルの値を書き換えることで実現する.Gdevでは,アイドル状態でも動
+作周波数を高く維持することが可能である.本稿中では,Gdevを用いてGPUの動作周波数
+をプログラム実行中に変更する際には,プログラム中からsystem関数を呼び出し,動作
+周波数の変更を行う.
+
+
+\begin{table}[!t]
+ \caption{GTX480の選択可能動作周波数}
+ \vspace{-2.0mm}
+ \label{GPU-frequency}
+ \footnotesize
+ \begin{center}
+ \begin{tabular}{|c|c|c|c|}
+ \hline
+ Clock Domains &Min [MHz]&Low [MHz] & High [MHz]\\
+ \hline\hline
+ Core & 50 & 405 & 700\\
+ \hline
+ Memory & 135 & 324 & 1848\\
+% Core / Shader / Memory & 50 / 101 / 135 & 405 / 810 / 135 & 700 / 1401 / 135\\
+ \hline
+ \end{tabular}
+ \vspace{-5.0mm}
+ \end{center}
+\end{table}
+
+
+\begin{table}[!t]
+ \caption{CPUの選択可能動作周波数}
+ \small
+ \label{CPU-frequency}
+ \begin{center}
+ \begin{tabular}{|c|c|c|c|}
+ \hline
+ Platforms&Min [MHz]&Low [MHz]&High [MHz]\\
+ \hline\hline
+ Core i5-2400 &1600& 2700 & 3300.1\\
+ \hline
+ \end{tabular}
+ \end{center}
+ \end{table}
View
BIN  draft/figures/a.pdf
Binary file not shown
View
BIN  draft/figures/idol.pdf
Binary file not shown
View
BIN  draft/figures/madd-gdev-nvidia-energy.pdf
Binary file not shown
View
BIN  draft/figures/madd-gdev-nvidia-time.pdf
Binary file not shown
View
BIN  draft/figures/madd-nvidia-energy.pdf
Binary file not shown
View
BIN  draft/figures/madd-nvidia-time.pdf
Binary file not shown
View
BIN  draft/figures/madd-time-power.pdf
Binary file not shown
View
BIN  draft/figures/mmul-gdev-nvidia-energy.pdf
Binary file not shown
View
BIN  draft/figures/mmul-gdev-nvidia-time.pdf
Binary file not shown
View
BIN  draft/figures/mmul-nvidia-energy.pdf
Binary file not shown
View
BIN  draft/figures/mmul-nvidia-time.pdf
Binary file not shown
View
BIN  draft/figures/perf-per-watt-abe.pdf
Binary file not shown
View
BIN  draft/figures/perf_per_watt.pdf
Binary file not shown
View
BIN  draft/figures/rodinia-energy.pdf
Binary file not shown
View
BIN  draft/figures/rodinia-time.pdf
Binary file not shown
View
67 draft/introduction.tex
@@ -0,0 +1,67 @@
+\section{Introduction}
+
+Graphics processing units (GPUs) have been increasingly deployed in
+general-purpose application domains due to their significant
+improvements in performance and performance-per-watt.
+As depicted in Figure~\ref{fig:perf_per_watt}, the performance-per-watt
+of GPUs greatly exceeds that of traditional multicore CPUs.
+Albeit energy efficient, GPUs still consume non-trivial power during
+operation.
+Commodity system software for GPUs is unfortunately not well designed to
+control their power consumption while primarily tailored to accelerate
+computations.
+To the best of our knowledge, commodity system software does not employ
+even a basic scheme of voltage and frequency scaling for GPUs, though
+most computational pieces of GPU-accelerated systems are offloaded on to
+GPUs.
+In order to develop truly energy-efficient GPU-accelerated systems, it
+is essential to identify the trade-off in power and performance
+of GPUs and its causal relation with CPUs.
+
+\begin{figure}[!t]
+\centering
+ \includegraphics[width=0.46\textwidth]{figures/perf_per_watt.pdf}
+ \caption{Performance-per-watt trends on representative NVIDIA GPUs and
+ Intel CPUs.}
+ \label{fig:perf_per_watt}
+\end{figure}
+
+Despite a rapid growth of GPU technology, there has not been much
+understanding of power and performance implications of GPU-accelerated
+systems.
+According to vendor's specifications, the thermal design power (TDP) of
+state-of-the-art GPUs is around 200W, while that of today's multicore
+CPUs is typically below 100W.
+Such a difference in the scale of TDP between CPUs and GPUs prevents
+system designers from predicting the power and performance of
+GPU-accelerated systems, which makes it difficult, if not impossible, to
+optimize their energy savings.
+Previous work~\cite{Hong2009,Hong2010,Jiao2010,Lee2011,Nagasaka2010} on
+the power and performance analysis of GPU-accelerated systems is based
+on either simulation studies or limited hardware functionality.
+None of previous work has ever disclosed a fundamental approach to the
+power and performance analysis of GPU-accelerated systems.
+
+The contribution of this paper is to provide a power and performance
+analysis of GPU-accelerated systems using NVIDIA's \textit{Fermi}
+architectures.
+Specifically, we identify when to scale the frequency and voltage of
+GPUs and CPUs in order to minimize overall system energy.
+Our analysis opens up important problems of dynamic voltage and
+frequency scaling (DVFS) algorithms for growing GPU-accelerated
+systems.
+We also provide an open method and tool to scale voltage and frequency
+of GPUs.
+The black box feature of current GPU drivers and runtimes prevents
+researchers from tackling correlative power and performance optimization
+problems.
+Sharing such a common method and tool with researchers
+would further facilitate use of GPUs.
+
+The rest of this paper is organized as follows.
+%Section~\ref{sec:background} describes the background and motivation
+%behind this work.
+Section~\ref{sec:platform} presents our system platform.
+Section~\ref{sec:evaluation} provides our evaluation.
+Section~\ref{sec:related_work} discusses related work, and this paper
+concludes in Section~\ref{sec:conclusion}.
View
64 draft/main.aux
@@ -0,0 +1,64 @@
+\relax
+\citation{Hong2009}
+\citation{Hong2010}
+\citation{Jiao2010}
+\citation{Lee2011}
+\citation{Nagasaka2010}
+\@writefile{toc}{\contentsline {section}{\numberline {1}Introduction}{1}}
+\newlabel{fig:perf_per_watt}{{1}{1}}
+\@writefile{lof}{\contentsline {figure}{\numberline {1}{\ignorespaces Performance-per-watt trends on representative NVIDIA GPUs and Intel CPUs.}}{1}}
+\citation{BLOB}
+\citation{Kato2012}
+\citation{Che2010}
+\citation{CUDA40}
+\@writefile{lot}{\contentsline {table}{\numberline {1}{\ignorespaces Performance levels of GTX 480 (GPU).}}{2}}
+\newlabel{tab:GPU-frequency}{{1}{2}}
+\@writefile{lot}{\contentsline {table}{\numberline {2}{\ignorespaces Performance levels of Core i5 2400 (CPU).}}{2}}
+\newlabel{tab:CPU-frequency}{{2}{2}}
+\@writefile{toc}{\contentsline {section}{\numberline {2}System Platform}{2}}
+\newlabel{sec:platform}{{2}{2}}
+\@writefile{lof}{\contentsline {figure}{\numberline {2}{\ignorespaces Power consumption and execution time of the $512\times 512$ matrix addition program.}}{2}}
+\newlabel{fig:madd-time-power}{{2}{2}}
+\@writefile{toc}{\contentsline {section}{\numberline {3}Evaluation}{2}}
+\newlabel{sec:evaluation}{{3}{2}}
+\citation{Hsu2001}
+\@writefile{lof}{\contentsline {figure}{\numberline {3}{\ignorespaces Execution time of the Rodinia programs.}}{3}}
+\newlabel{fig:rodinia-time}{{3}{3}}
+\@writefile{lof}{\contentsline {figure}{\numberline {4}{\ignorespaces Energy consumption of the Rodinia programs.}}{3}}
+\newlabel{fig:rodinia-energy}{{4}{3}}
+\@writefile{lof}{\contentsline {figure}{\numberline {5}{\ignorespaces Power consumption in an idle state.}}{3}}
+\newlabel{fig:idle}{{5}{3}}
+\@writefile{lof}{\contentsline {figure}{\numberline {8}{\ignorespaces Comparison of the NVIDIA proprietary and the Gdev open-source runtimes and drivers.}}{3}}
+\newlabel{fig:nvidia-gdev}{{8}{3}}
+\citation{Kato2012}
+\citation{Nagasaka2010}
+\newlabel{madd-time}{{3}{4}}
+\newlabel{mmul-time}{{3}{4}}
+\@writefile{lof}{\contentsline {figure}{\numberline {6}{\ignorespaces Execution time of the matrix addition and multiplication programs.}}{4}}
+\newlabel{fig:gpu-time}{{6}{4}}
+\@writefile{lof}{\contentsline {figure}{\numberline {7}{\ignorespaces Energy consumption of the matrix addition and multiplication programs.}}{4}}
+\newlabel{fig:gpu-energy}{{7}{4}}
+\@writefile{toc}{\contentsline {section}{\numberline {4}Related Work}{4}}
+\newlabel{sec:related_work}{{4}{4}}
+\citation{Hong2009}
+\citation{Hong2010}
+\citation{Lee2011}
+\citation{Jiao2010}
+\citation{ying2011}
+\citation{Hong2009}
+\citation{Hong2010}
+\bibstyle{acm}
+\bibdata{refer}
+\bibcite{Che2010}{1}
+\bibcite{Hong2009}{2}
+\bibcite{Hong2010}{3}
+\bibcite{Hsu2001}{4}
+\bibcite{Jiao2010}{5}
+\bibcite{Lee2011}{6}
+\bibcite{Nagasaka2010}{7}
+\bibcite{CUDA40}{8}
+\bibcite{BLOB}{9}
+\bibcite{Kato2012}{10}
+\bibcite{ying2011}{11}
+\@writefile{toc}{\contentsline {section}{\numberline {5}Conclusion and Future Work}{5}}
+\newlabel{sec:conclusion}{{5}{5}}
View
76 draft/main.bbl
@@ -0,0 +1,76 @@
+\begin{thebibliography}{10}
+
+\bibitem{Che2010}
+{\sc Che, S., Sheaffer, J.~W., Boyer, M., Szafaryn, L.~G., Wang, L., and
+ Skadron, K.}
+\newblock A characterization of the rodinia benchmark suite with comparison to
+ contemporary cmp workloads.
+\newblock In {\em Proc. of the IEEE international symposium on Workload
+ characterization\/} (2010), pp.~1--11.
+
+\bibitem{Hong2009}
+{\sc Hong, S., and Kim, H.}
+\newblock An analytical model for a gpu architecture with memory-level and
+ thread-level parallelism awareness.
+\newblock In {\em Proc. of the annual international symposium on Computer
+ architecture\/} (2009), pp.~152--163.
+
+\bibitem{Hong2010}
+{\sc Hong, S., and Kim, H.}
+\newblock An integrated gpu power and performance model.
+\newblock In {\em Proc. of the annual international symposium on Computer
+ architecture\/} (2010), pp.~280--289.
+
+\bibitem{Hsu2001}
+{\sc Hsu, C.-H., Kremer, U., and Hsiao, M.}
+\newblock Compiler-directed dynamic voltage/frequency scheduling for energy
+ reduction in microprocessors.
+\newblock In {\em Proc. of the international symposium on Low power electronics
+ and design\/} (2001), pp.~275--278.
+
+\bibitem{Jiao2010}
+{\sc Jiao, Y., Lin, H., Balaji, P., and Feng, W.}
+\newblock Power and performance characterization of computational kernels on
+ the gpu.
+\newblock In {\em Proc. of the IEEE/ACM international conference on Green
+ computing and communications \& international conference on Cyber, physical
+ and social computing\/} (2010), pp.~221 --228.
+
+\bibitem{Lee2011}
+{\sc Lee, J., Sathisha, V., Schulte, M., Compton, K., and Kim, N.~S.}
+\newblock Improving throughput of power-constrained gpus using dynamic
+ voltage/frequency and core scaling.
+\newblock In {\em Proc. of the international conference on Parallel
+ architectures and compilation techniques\/} (2011), pp.~111--120.
+
+\bibitem{Nagasaka2010}
+{\sc Nagasaka, H., Maruyama, N., Nukada, A., Endo, T., and Matsuoka, S.}
+\newblock Statistical power modeling of gpu kernels using performance counters.
+\newblock In {\em Proc. of the international conference on Green computing\/}
+ (2010), pp.~115--122.
+
+\bibitem{CUDA40}
+{\sc NVIDIA}.
+\newblock {CUDA 4.0}.
+\newblock \url{http://developer.nvidia.com/cuda-toolkit-40}, 2011.
+
+\bibitem{BLOB}
+{\sc NVIDIA}.
+\newblock {Linux~X64~Display Driver}.
+\newblock
+ \url{http://www.nvidia.com/object/linux-display-amd64-295.59-driver.html},
+ 2012.
+
+\bibitem{Kato2012}
+{\sc Shinpei, K., Michael, M., Carlos, M., and Scott, B.}
+\newblock Gdev: First-class gpu resource management in the operating system.
+\newblock In {\em Proc. of the USENIX conference on USENIX annual technical
+ conference\/} (2012).
+
+\bibitem{ying2011}
+{\sc Zhang, Y., Hu, Y., Li, B., and Peng, L.}
+\newblock Performance and power analysis of ati gpu: A statistical approach.
+\newblock In {\em Proc. of the IEEE international conference on Networking,
+ architecture, and storage\/} (2011), pp.~149 --158.
+
+\end{thebibliography}
View
50 draft/main.blg
@@ -0,0 +1,50 @@
+This is BibTeX, Version 0.99c (Web2C 7.5.6)
+The top-level auxiliary file: main.aux
+The style file: acm.bst
+Database file #1: refer.bib
+Name 3 in "C-H. Hsu and U. Kremer and M. Hsiao," has a comma at the end for entry Hsu2001
+while executing---line 1086 of file acm.bst
+Name 3 in "C-H. Hsu and U. Kremer and M. Hsiao," has a comma at the end for entry Hsu2001
+while executing---line 1127 of file acm.bst
+You've used 11 entries,
+ 2253 wiz_defined-function locations,
+ 591 strings with 6043 characters,
+and the built_in function-call counts, 4786 in all, are:
+= -- 457
+> -- 207
+< -- 0
++ -- 88
+- -- 74
+* -- 316
+:= -- 695
+add.period$ -- 33
+call.type$ -- 11
+change.case$ -- 70
+chr.to.int$ -- 0
+cite$ -- 11
+duplicate$ -- 224
+empty$ -- 411
+format.name$ -- 74
+if$ -- 1068
+int.to.chr$ -- 0
+int.to.str$ -- 11
+missing$ -- 9
+newline$ -- 58
+num.names$ -- 22
+pop$ -- 115
+preamble$ -- 1
+purify$ -- 59
+quote$ -- 0
+skip$ -- 171
+stack$ -- 0
+substring$ -- 303
+swap$ -- 86
+text.length$ -- 0
+text.prefix$ -- 0
+top$ -- 0
+type$ -- 44
+warning$ -- 0
+while$ -- 44
+width$ -- 13
+write$ -- 111
+(There were 2 error messages)
View
BIN  draft/main.dvi
Binary file not shown
View
224 draft/main.log
@@ -0,0 +1,224 @@
+This is pdfTeXk, Version 3.141592-1.40.3 (Web2C 7.5.6) (format=pdflatex 2012.5.15) 20 JUL 2012 13:58
+entering extended mode
+ %&-line parsing enabled.
+**main
+(./main.tex
+LaTeX2e <2005/12/01>
+Babel <v3.8h> and hyphenation patterns for english, usenglishmax, dumylang, noh
+yphenation, arabic, basque, bulgarian, coptic, welsh, czech, slovak, german, ng
+erman, danish, esperanto, spanish, catalan, galician, estonian, farsi, finnish,
+ french, greek, monogreek, ancientgreek, croatian, hungarian, interlingua, ibyc
+us, indonesian, icelandic, italian, latin, mongolian, dutch, norsk, polish, por
+tuguese, pinyin, romanian, russian, slovenian, uppersorbian, serbian, swedish,
+turkish, ukenglish, ukrainian, loaded.
+(/usr/share/texmf/tex/latex/base/article.cls
+Document Class: article 2005/09/16 v1.4f Standard LaTeX document class
+(/usr/share/texmf/tex/latex/base/size10.clo
+File: size10.clo 2005/09/16 v1.4f Standard LaTeX file (size option)
+)
+\c@part=\count79
+\c@section=\count80
+\c@subsection=\count81
+\c@subsubsection=\count82
+\c@paragraph=\count83
+\c@subparagraph=\count84
+\c@figure=\count85
+\c@table=\count86
+\abovecaptionskip=\skip41
+\belowcaptionskip=\skip42
+\bibindent=\dimen102
+) (./usenix-kato.sty
+(/usr/share/texmf/tex/latex/psnfss/times.sty
+Package: times 2005/04/12 PSNFSS-v9.2a (SPQR)
+))
+(/usr/share/texmf/tex/latex/graphics/epsfig.sty
+Package: epsfig 1999/02/16 v1.7a (e)psfig emulation (SPQR)
+
+(/usr/share/texmf/tex/latex/graphics/graphicx.sty
+Package: graphicx 1999/02/16 v1.0f Enhanced LaTeX Graphics (DPC,SPQR)
+
+(/usr/share/texmf/tex/latex/graphics/keyval.sty
+Package: keyval 1999/03/16 v1.13 key=value parser (DPC)
+\KV@toks@=\toks14
+)
+(/usr/share/texmf/tex/latex/graphics/graphics.sty
+Package: graphics 2006/02/20 v1.0o Standard LaTeX Graphics (DPC,SPQR)
+
+(/usr/share/texmf/tex/latex/graphics/trig.sty
+Package: trig 1999/03/16 v1.09 sin cos tan (DPC)
+)
+(/usr/share/texmf/tex/latex/config/graphics.cfg
+File: graphics.cfg 2007/01/18 v1.5 graphics configuration of teTeX/TeXLive
+)
+Package graphics Info: Driver file: pdftex.def on input line 90.
+
+(/usr/share/texmf/tex/latex/pdftex-def/pdftex.def
+File: pdftex.def 2007/01/08 v0.04d Graphics/color for pdfTeX
+\Gread@gobject=\count87
+))
+\Gin@req@height=\dimen103
+\Gin@req@width=\dimen104
+)
+\epsfxsize=\dimen105
+\epsfysize=\dimen106
+)
+(/usr/share/texmf/tex/latex/ltxmisc/endnotes.sty
+\c@endnote=\count88
+\endnotesep=\dimen107
+\@enotes=\write3
+) (./multirow.sty
+\bigstrutjot=\dimen108
+)
+(./subfigure.sty
+\c@subfigure=\count89
+) (/usr/share/texmf/tex/latex/oberdiek/epstopdf.sty
+Package: epstopdf 2006/08/26 v1.3 Conversion with epstopdf on the fly (HO)
+
+
+Package epstopdf Warning: Shell escape feature is not enabled.
+
+) (/usr/share/texmf/tex/latex/comment/comment.sty
+\CommentStream=\write4
+ Excluding comment 'comment')
+(/usr/share/texmf/tex/latex/ltxmisc/url.sty
+\Urlmuskip=\muskip10
+Package: url 2005/06/27 ver 3.2 Verb mode for urls, etc.
+) (./bibspacing.sty
+\bibindent=\dimen109
+\bibspacing=\dimen110
+) (./main.aux)
+\openout1 = `main.aux'.
+
+LaTeX Font Info: Checking defaults for OML/cmm/m/it on input line 34.
+LaTeX Font Info: ... okay on input line 34.
+LaTeX Font Info: Checking defaults for T1/cmr/m/n on input line 34.
+LaTeX Font Info: ... okay on input line 34.
+LaTeX Font Info: Checking defaults for OT1/cmr/m/n on input line 34.
+LaTeX Font Info: ... okay on input line 34.
+LaTeX Font Info: Checking defaults for OMS/cmsy/m/n on input line 34.
+LaTeX Font Info: ... okay on input line 34.
+LaTeX Font Info: Checking defaults for OMX/cmex/m/n on input line 34.
+LaTeX Font Info: ... okay on input line 34.
+LaTeX Font Info: Checking defaults for U/cmr/m/n on input line 34.
+LaTeX Font Info: ... okay on input line 34.
+LaTeX Font Info: Try loading font information for OT1+ptm on input line 34.
+
+(/usr/share/texmf/tex/latex/psnfss/ot1ptm.fd
+File: ot1ptm.fd 2001/06/04 font definitions for OT1/ptm.
+)
+LaTeX Font Info: Font shape `OT1/ptm/bx/n' in size <14.4> not available
+(Font) Font shape `OT1/ptm/b/n' tried instead on input line 53.
+LaTeX Font Info: External font `cmex10' loaded for size
+(Font) <12> on input line 53.
+LaTeX Font Info: External font `cmex10' loaded for size
+(Font) <8> on input line 53.
+LaTeX Font Info: External font `cmex10' loaded for size
+(Font) <6> on input line 53.
+ (./abstract.tex
+LaTeX Font Info: Font shape `OT1/ptm/bx/n' in size <12> not available
+(Font) Font shape `OT1/ptm/b/n' tried instead on input line 1.
+)
+(./introduction.tex
+<figures/perf_per_watt.pdf, id=1, 559.08875pt x 199.74625pt>
+File: figures/perf_per_watt.pdf Graphic file (type pdf)
+
+<use figures/perf_per_watt.pdf> [1{/usr/share/texmf/fonts/map/pdftex/updmap/pdf
+tex.map}
+
+
+ <./figures/perf_per_watt.pdf>]) (./platform.tex
+LaTeX Font Info: External font `cmex10' loaded for size
+(Font) <5> on input line 10.
+LaTeX Font Info: External font `cmex10' loaded for size
+(Font) <9> on input line 29.
+
+Underfull \hbox (badness 1902) in paragraph at lines 62--72
+[]\OT1/ptm/m/n/10 The ex-per-i-men-tal work-load ex-e-cutes the Ro-dinia
+ []
+
+LaTeX Font Info: External font `cmex10' loaded for size
+(Font) <7> on input line 75.
+) (./evaluation.tex
+<figures/madd-time-power.pdf, id=53, 746.79pt x 279.0425pt>
+File: figures/madd-time-power.pdf Graphic file (type pdf)
+
+<use figures/madd-time-power.pdf>
+LaTeX Font Info: Try loading font information for OT1+pcr on input line 47.
+ (/usr/share/texmf/tex/latex/psnfss/ot1pcr.fd
+File: ot1pcr.fd 2001/06/04 font definitions for OT1/pcr.
+) [2 <./figures/madd-time-power.pdf>]
+<figures/rodinia-time.pdf, id=144, 566.115pt x 283.0575pt>
+File: figures/rodinia-time.pdf Graphic file (type pdf)
+
+<use figures/rodinia-time.pdf>
+<figures/rodinia-energy.pdf, id=145, 569.12625pt x 283.0575pt>
+File: figures/rodinia-energy.pdf Graphic file (type pdf)
+
+<use figures/rodinia-energy.pdf>
+<figures/idol.pdf, id=146, 497.86pt x 144.54pt>
+File: figures/idol.pdf Graphic file (type pdf)
+ <use figures/idol.pdf>
+<figures/madd-nvidia-time.pdf, id=147, 597.23125pt x 268.00125pt>
+File: figures/madd-nvidia-time.pdf Graphic file (type pdf)
+
+<use figures/madd-nvidia-time.pdf>
+<figures/mmul-nvidia-time.pdf, id=148, 597.23125pt x 272.01625pt>
+File: figures/mmul-nvidia-time.pdf Graphic file (type pdf)
+
+<use figures/mmul-nvidia-time.pdf>
+<figures/madd-nvidia-energy.pdf, id=149, 598.235pt x 270.00874pt>
+File: figures/madd-nvidia-energy.pdf Graphic file (type pdf)
+
+<use figures/madd-nvidia-energy.pdf>
+<figures/mmul-nvidia-energy.pdf, id=150, 597.23125pt x 272.01625pt>
+File: figures/mmul-nvidia-energy.pdf Graphic file (type pdf)
+
+<use figures/mmul-nvidia-energy.pdf>
+<figures/madd-gdev-nvidia-time.pdf, id=151, 597.23125pt x 270.00874pt>
+File: figures/madd-gdev-nvidia-time.pdf Graphic file (type pdf)
+
+<use figures/madd-gdev-nvidia-time.pdf>
+<figures/madd-gdev-nvidia-energy.pdf, id=152, 596.2275pt x 269.005pt>
+File: figures/madd-gdev-nvidia-energy.pdf Graphic file (type pdf)
+
+<use figures/madd-gdev-nvidia-energy.pdf>
+Underfull \vbox (badness 1286) has occurred while \output is active []
+
+ [3 <./figures/rodinia-time.pdf> <./figures/rodinia-energy.pdf> <./figures/idol
+.pdf> <./figures/madd-gdev-nvidia-time.pdf> <./figures/madd-gdev-nvidia-energy.
+pdf>])
+(./relatedwork.tex [4 <./figures/madd-nvidia-time.pdf> <./figures/mmul-nvidia-t
+ime.pdf> <./figures/madd-nvidia-energy.pdf> <./figures/mmul-nvidia-energy.pdf>]
+) (./conclusion.tex) (./main.bbl
+Underfull \hbox (badness 3746) in paragraph at lines 53--56
+[]\OT1/ptm/m/sc/7 NVIDIA\OT1/ptm/m/n/7 . CUDA 4.0. $\OT1/pcr/m/n/7 http : / /
+ developer . nvidia . com /
+ []
+
+
+Underfull \hbox (badness 10000) in paragraph at lines 58--63
+\OT1/pcr/m/n/7 object / linux-[]display-[]amd64-[]295 . 59-[]driver . html$\OT1
+/ptm/m/n/7 ,
+ []
+
+) [5] (./main.aux) )
+Here is how much of TeX's memory you used:
+ 1026 strings out of 256216
+ 13555 string characters out of 1917072
+ 65264 words of memory out of 1500000
+ 4274 multiletter control sequences out of 10000+200000
+ 27293 words of font info for 55 fonts, out of 1200000 for 2000
+ 645 hyphenation exceptions out of 8191
+ 32i,8n,21p,213b,276s stack positions out of 5000i,500n,6000p,200000b,15000s
+{/usr/share/texmf/fonts/enc/dvips/base/8r.enc}</usr/share/t
+exmf/fonts/type1/bluesky/cm/cmmi10.pfb></usr/share/texmf/fonts/type1/bluesky/cm
+/cmr10.pfb></usr/share/texmf/fonts/type1/bluesky/cm/cmsy10.pfb></usr/share/texm
+f/fonts/type1/urw/courier/ucrr8a.pfb></usr/share/texmf/fonts/type1/urw/times/ut
+mb8a.pfb></usr/share/texmf/fonts/type1/urw/times/utmr8a.pfb></usr/share/texmf/f
+onts/type1/urw/times/utmri8a.pfb>
+Output written on main.pdf (5 pages, 374278 bytes).
+PDF statistics:
+ 410 PDF objects out of 1000 (max. 8388607)
+ 0 named destinations out of 1000 (max. 131072)
+ 56 words of extra memory for PDF output out of 10000 (max. 10000000)
+
View
BIN  draft/main.pdf
Binary file not shown
View
68 draft/main.tex
@@ -0,0 +1,68 @@
+% TEMPLATE for Usenix papers, specifically to meet requirements of
+% USENIX '05
+% originally a template for producing IEEE-format articles using LaTeX.
+% written by Matthew Ward, CS Department, Worcester Polytechnic Institute.
+% adapted by David Beazley for his excellent SWIG paper in Proceedings,
+% Tcl 96
+% turned into a smartass generic template by De Clarke, with thanks to
+% both the above pioneers
+% use at your own risk. Complaints to /dev/null. make it two column with no
+% page numbering, default is 10 point
+
+% Munged by Fred Douglis <douglis@research.att.com> 10/97 to separate the .sty
+% file from the LaTeX source template, so that people can more easily include
+% the .sty file into an existing document. Also changed to more closely follow
+% the style guidelines as represented by the Word sample file.
+
+% Note that since 2010, USENIX does not require endnotes. If you want foot of
+% page notes, don't include the endnotes package in the usepackage command,
+% below.
+
+% This version uses the latex2e styles, not the very ancient 2.09 stuff.
+\documentclass[letterpaper,twocolumn,10pt]{article}
+\usepackage{usenix-kato,epsfig,endnotes}
+\usepackage{graphicx}
+\usepackage{multirow}
+\usepackage{subfigure}
+\usepackage{epstopdf}
+\usepackage{comment}
+\usepackage{url}
+\usepackage{bibspacing}
+% \usepackage{mediabb}
+
+
+\begin{document}
+
+%don't want date printed
+\date{}
+
+%make title bold and 14 pt font (Latex default is non-bold, 16 pt)
+\title{\Large \bf Power and Performance Analysis of GPU-Accelerated Systems}
+
+%author block (multiple authors are separated by \and)
+\author{\and\and{\rm Yuki Abe}\\ Kyushu University \and
+ {\rm Hiroshi Sasaki}\\ Kyushu University \and
+ {\rm Martin Peres}\\ Laboratoire Bordelais de \\ Recherche en Informatique \and
+ {\rm Koji Inoue}\\ Kyushu University \and
+ {\rm Kazuaki Murakami}\\ Kyushu University \and
+ {\rm Shinpei Kato}\\ Nagoya University
+} % end author
+% copy the following lines to add more authors \and {\rm Name}\\
+%Name Institution
+
+\maketitle
+\input{abstract}
+\input{introduction}
+\input{platform}
+\input{evaluation}
+\input{relatedwork}
+\input{conclusion}
+
+\setlength{\bibspacing}{\baselineskip}
+{\scriptsize \bibliographystyle{acm}
+\bibliography{refer}}
+
+% Use the following at camera-ready time to suppress page numbers. Comment it
+% out when you first submit the paper for review.
+\thispagestyle{empty}
+\end{document}
View
20 draft/make.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+
+TARGET=main
+
+pdflatex ${TARGET}
+bibtex ${TARGET}
+pdflatex ${TARGET}
+pdflatex ${TARGET}
+
+
+# for japanese
+# platex -synctex=1 "${TARGET}" && \
+# dvipdfmx "`basename "${TARGET}" .tex`"
+# bibtex ${TARGET}
+# platex -synctex=1 "${TARGET}" && \
+# dvipdfmx "`basename "${TARGET}" .tex`"
+# platex -synctex=1 "${TARGET}" && \
+# dvipdfmx "`basename "${TARGET}" .tex`"
+
+
View
99 draft/mediabb.sty
@@ -0,0 +1,99 @@
+\NeedsTeXFormat{LaTeX2e}
+\ProvidesPackage{mediabb}[2006/10/26 v1.9 iNOUE Koich! <inoue@ma.ns.musashi-tech.ac.jp>]
+\@ifpackageloaded{graphics}{}{\RequirePackage[dvipdfm]{graphicx}}
+% Key "usebb" (default value: true): sets \if@usebb to \if<value>.
+% Initially false. When true, \Gread@pdf does not scan the PDF file itself.
+\define@key{Gin}{usebb}[true]{\expandafter\let\expandafter\if@usebb\csname if#1\endcsname}
+\let\if@usebb\iffalse
+% Key "mediaboxonly" (default value: /MediaBox).
+% If no finder macro \Gread@find@<value> exists, the box searched for is
+% /MediaBox and \if@Gread@find@only is set to \if<value>; otherwise the
+% value is taken as the box name to search for and \if@Gread@find@only is
+% set to true. Initially: box /MediaBox, find-only false.
+\define@key{Gin}{mediaboxonly}[/MediaBox]{\@ifundefined{Gread@find@#1}%
+ {\def\Gread@rect@box{/MediaBox}\expandafter\let\expandafter\if@Gread@find@only\csname if#1\endcsname}%
+ {\def\Gread@rect@box{#1}\let\if@Gread@find@only\iftrue}}
+\let\if@Gread@find@only\iffalse
+\def\Gread@rect@box{/MediaBox}
+% Key "autoebb" (default value: always): dispatches to \Gread@ebb@<value>.
+% "always"/"true" set \if@Gread@ebb@always; "never"/"false" set
+% \if@Gread@ebb@never. Both flags are initially false. They are consulted
+% by \Gread@bmp.
+\define@key{Gin}{autoebb}[always]{\csname Gread@ebb@#1\endcsname}
+\def\Gread@ebb@always{\let\if@Gread@ebb@always\iftrue}
+\let\if@Gread@ebb@always\iffalse
+\let\Gread@ebb@true\Gread@ebb@always
+\def\Gread@ebb@never{\let\if@Gread@ebb@never\iftrue}
+\let\Gread@ebb@false\Gread@ebb@never
+\let\if@Gread@ebb@never\iffalse
+% Include .pdf files with the same macro that is used for .eps files.
+\let\Ginclude@pdf\Ginclude@eps
+% \Gread@pdf{<file>}: obtain the bounding box for a PDF graphic.
+% Unless \if@usebb is true, the file #1 is read line by line:
+%  - characters 0..31 and 127..255 are given catcode 14 (comment), the
+%    \dospecials characters are made "other", and space and the
+%    end-of-line character get catcode 10, so the file can be \read;
+%  - each line is joined with the previous one (\@tempb holds the previous
+%    line), the current box name is appended, and the finder macro
+%    \Gread@find@<box> is applied with the sentinel [\@nnil];
+%  - reading stops at end of file or when a finder clears \ifGread@.
+% A missing file raises a LaTeX error.
+% If no box was found (\ifGin@bbox is false), \Gread@bmp is called on
+% \Gin@base.bb. All of this happens inside a group; afterwards the
+% globally defined \@gtempa is passed to \Gread@parse@bb.
+\def\Gread@pdf#1{%
+ \begingroup
+ \if@usebb\else
+ \@tempcnta\z@
+ \loop
+ \ifnum\@tempcnta<\@xxxii
+ \catcode\@tempcnta14
+ \advance\@tempcnta\@ne
+ \repeat
+ \@tempcnta127
+ \loop
+ \ifnum\@tempcnta<\@cclvi
+ \catcode\@tempcnta14
+ \advance\@tempcnta\@ne
+ \repeat
+ \let\do\@makeother\dospecials\catcode`\ 10
+ \catcode\endlinechar10
+ \immediate\openin\@inputcheck#1
+ \ifeof\@inputcheck
+ \@latex@error{File `#1' not found}\@ehc
+ \else
+ \Gread@true
+ \let\@tempa\@empty
+ \let\@tempb\@empty
+ \loop
+ \read\@inputcheck to\@tempc
+ \ifeof\@inputcheck
+ \Gread@false
+ \else
+ \edef\@tempa{\@tempb\@tempc}\let\@tempb\@tempc
+ \edef\@tempc{\@tempa\Gread@rect@box}%
+ \csname Gread@find@\Gread@rect@box\expandafter\endcsname\@tempc[\@nnil]\\%
+ \fi
+ \ifGread@
+ \repeat
+ \immediate\closein\@inputcheck
+ \fi
+ \fi
+ \ifGin@bbox
+ \else
+ \Gread@bmp{\Gin@base.bb}%
+ \fi
+ \endgroup
+ \expandafter\Gread@parse@bb\@gtempa\\}
+% \def@Gread@find{<box>}{<next box>}...\@nnil
+% Recursively defines one finder macro \Gread@find@<box> per box name.
+% Each finder is delimited by the box name and a following [...] pair:
+%  - if the bracket content is not the \@nnil sentinel, it is stored
+%    globally in \@gtempa and \ifGin@bbox is set; in find-only mode
+%    reading is stopped (\Gread@false), otherwise \Gread@rect@box is
+%    set to this box name;
+%  - unless in find-only mode, the finder for <next box> is then applied
+%    to \@tempa.
+% The recursion ends when <next box> is \@nnil; in that case no finder is
+% defined for the last name by this macro.
+\def\def@Gread@find#1#2{\ifx\@nnil#2\expandafter\@gobble\else
+ \@namedef{Gread@find@#1}##1#1##2[##3##4]##5\\{%
+ \ifx\@nnil##3\else
+ \gdef\@gtempa{##3##4 }%
+ \if@Gread@find@only
+ \Gread@false
+ \else
+ \def\Gread@rect@box{#1}%
+ \fi
+ \Gin@bboxtrue
+ \fi
+ \if@Gread@find@only\else
+ \csname Gread@find@#2\expandafter\endcsname\@tempa#2[\@nnil]\\%
+ \fi}%
+ \expandafter\def@Gread@find\fi{#2}}
+% Define the chained finders /MediaBox -> /CropBox -> /BleedBox -> /TrimBox
+% (each one continues with the next name in the list).
+\def@Gread@find{/MediaBox}{/CropBox}{/BleedBox}{/TrimBox}{/ArtBox}\@nnil
+% Finder for /ArtBox, the last box name in the chain: when a bracketed
+% value is found it is stored globally in \@gtempa, reading is stopped
+% (\Gread@false) and \ifGin@bbox is set. It does not call a further finder.
+\@namedef{Gread@find@/ArtBox}#1/ArtBox#2[#3#4]#5\\{%
+ \ifx\@nnil#3\else
+ \gdef\@gtempa{#3#4 }%
+ \Gread@false
+ \Gin@bboxtrue
+ \fi}
+% \Gread@bmp{<bb file>}: make sure a .bb file exists, then read it.
+% Unless autoebb is "never":
+%  - unless autoebb is "always", try to open #1; if it cannot be read,
+%    switch to "always" mode, otherwise close it again;
+%  - run a shell command through \write18: in non-"always" mode it is
+%    prefixed by a test that the graphic file is newer than the .bb file;
+%    it optionally changes into \filename@area and then runs `ebb` on the
+%    graphic file.
+% Finally the bounding box is read from #1 with \Gread@eps.
+% NOTE(review): file names are placed inside single quotes in the shell
+% command; a name containing a single quote would break that quoting.
+% NOTE(review): \ifx compares the tokens \@nnil and \filename@area without
+% expanding them, so this does not look like a test for an empty
+% \filename@area -- confirm the intent.
+\def\Gread@bmp#1{%
+ \if@Gread@ebb@never\else
+ \if@Gread@ebb@always\else
+ \immediate\openin\@inputcheck#1 %
+ \ifeof\@inputcheck
+ \Gread@ebb@always
+ \else
+ \immediate\closein\@inputcheck
+ \fi
+ \fi
+ \immediate\write18{\if@Gread@ebb@always\else[ '\Gin@base\Gin@ext' -nt '\Gin@base.bb' ] && \fi
+ \ifx\@nnil\filename@area\@nnil\else cd '\filename@area' && \fi ebb '\filename@base\Gin@ext'}%
+ \fi
+ \Gread@eps{#1}}
+% Register the .pdf extension with graphics type "pdf"; the read-file
+% argument is *, and no external command is given.
+\DeclareGraphicsRule{.pdf}{pdf}{*}{}
View
159 draft/multirow.sty
@@ -0,0 +1,159 @@
+%%
+%% multirow.sty V1.6 version (5-May-2004)
+%%
+%% Author: Jerry Leichter <jerrold.leichter@smarts.com>
+%% Piet van Oostrum <piet@cs.uu.nl>
+%%
+%% This file may be distributed under the terms of the LaTeX Project Public
+%% License, as described in lppl.txt in the base LaTeX distribution.
+%% Either version 1 or, at your option, any later version.
+%%
+%% V1.0 was distributed anonymously, based on a Usenet posting that was
+%% not intended for stand-alone use.
+%% V1.1 was modified by Piet van Oostrum <piet@cs.uu.nl> to allow it to
+%% work without bigstrut.sty.
+%% V1.2 was modified by Jerry Leichter for the same goal, but using a
+%% different approach which will work properly in conjunction with
+%% bigstrut.sty.
+%% V1.2a was modified by Piet van Oostrum <piet@cs.uu.nl> to use \vskip
+%% instead of \raise in positioning, avoiding making rows too high
+%% when the adjustment is large.
+%% V1.3 was modified by Piet van Oostrum to work properly in a p{} column
+%% (\leavevmode added)
+%% V1.4 was modified by Piet van Oostrum to check for the special case that
+%% the width is given as an *. In this case the natural
+%% width of the text argument will be used and the argument
+%% is processed in LR-mode.
+%% V1.5 was modified by Piet van Oostrum: Added a % after \hbox{#5}\vfill}.
+%% Added \struts around #5 for better vertical positioning.
+%% Additional coding for negative value of nrows.
+%% V1.6 was modified by Piet van Oostrum: Replace a space by \relax after
+%% \advance\@tempdima#4.
+%%
+% Make an entry that will span multiple rows of a table.
+%
+% \multirow{nrows}[bigstruts]{width}[fixup]{text}
+%
+% nrows is the number of rows to span. It's up to you to leave the other
+% rows empty, or the stuff created by \multirow will over-write it.
+% With a positive value of nrows the spanned columns are this row and
+% (nrows-1) rows below it. With a negative value of nrows they are
+% this row and (1-nrows) above it.
+%
+% bigstruts is mainly used if you've used bigstrut.sty. In that case it
+% is the total number of uses of \bigstrut within the rows being
+% spanned. Count 2 uses for each \bigstrut, 1 for each \bigstrut[x]
+% where x is either t or b. The default is 0.
+% width is the width to which the text is to be set, or * to indicate that
+% the text argument's natural width is to be used.
+%
+% text is the actual text. If the width was set explicitly, the text will
+% be set in a parbox of that width; you can use \\ to force linebreaks
+% where you like.
+%
+% If the width was given as * the text will be set in LR mode. If you
+% want a multiline entry in this case you should use a tabular or array
+% in the text parameter.
+%
+% The text is centered vertically within the range spanned by nrows.
+%
+% fixup is a length used for fine tuning: The text will be raised (or
+% lowered, if fixup is negative) by that length above (below) wherever
+% it would otherwise have gone.
+%
+% For example (using both multirow and bigstrut)
+%
+% \newcommand{\minitab}[2][l]{\begin{tabular}{#1}#2\end{tabular}}
+% \begin{tabular}{|c|c|}
+% \hline
+% \multirow{4}{1in}{Common g text} & Column g2a\\
+% & Column g2b \\
+% & Column g2c \\
+% & Column g2d \\
+% \hline
+% \multirow{3}[6]*{Common g text} & Column g2a\bigstrut\\\cline{2-2}
+% & Column g2b \bigstrut\\\cline{2-2}
+% & Column g2c \bigstrut\\
+% \hline
+% \multirow{4}[8]{1in}{Common g text} & Column g2a\bigstrut\\\cline{2-2}
+% & Column g2b \bigstrut\\\cline{2-2}
+% & Column g2c \bigstrut\\\cline{2-2}
+% & Column g2d \bigstrut\\
+% \hline
+% \multirow{4}*{\minitab[c]{Common \\ g text}} & Column g2a\\
+% & Column g2b \\
+% & Column g2c \\
+% & Column g2d \\
+% \hline
+% \end{tabular}
+%
+% If any of the spanned rows are unusually large, or if you're using
+% bigstrut.sty and \bigstrut's are used asymmetrically about the centerline of
+% the spanned rows, the vertical centering may not come out right. Use the
+% fixup argument in this case.
+%
+% Just before "text" is expanded, the \multirowsetup macro is expanded to
+% set up any special environment. Initially, \multirowsetup contains just
+% \raggedright. It can be redefined with \renewcommand.
+%
+% Bugs: It's just about impossible to deal correctly with descenders. The
+% text will be set up centered, but it may then have a baseline that doesn't
+% match the baseline of the stuff beside it, in particular if the stuff
+% beside it has descenders and "text" does not. This may result in a small
+% misalignment. About all that can be done is to do a final touchup on
+% "text", using the fixup optional argument. (Hint: If you use a measure
+% like .1ex, there's a reasonable chance that the fixup will still be correct
+% if you change the point size.)
+%
+% \multirow is mainly designed for use with table, as opposed to array,
+% environments. It will not work well in an array environment since the lines
+% have an extra \jot of space between them which it won't account for. Fixing
+% this is difficult in general, and doesn't seem worth it. The bigstruts
+% argument can be used to provide a semi-automatic fix: First set
+% \bigstrutjot to .5\jot. Then simply repeat nrows as the bigstruts argument.
+% This will be close, but probably not exact; you can use the fixup argument
+% to refine the result. (If you do this repeatedly, you'll probably want to
+% wrap these steps up in a simple macro. Note that the modified \bigstrutjot
+% value will not give reasonable results if you have bigstruts and use this
+% argument for its intended purpose elsewhere. In that case, you might want
+% to set it locally.)
+%
+% If you use \multirow with the colortbl package you have to take
+% precautions if you want to color the column that has the \multirow in it.
+% colortbl works by coloring each cell separately. So if you use \multirow
+% with a positive nrows value, colortbl will first color the top cell, then
+% \multirow will typeset nrows cells starting with this cell, and later
+% colortbl will color the other cells, effectively hiding the text in that
+% area. This can be solved by putting the \multirow in the last row with a
+% negative nrows value.
+% Example:
+%
+% \begin{tabular}{l>{\columncolor{yellow}}l}
+% aaaa & \\
+% cccc & \\
+% dddd & \multirow{-3}*{bbbb}\\
+% \end{tabular}
+%
+% Hook placed just before the text argument in \@xmultirow; the default
+% is \raggedright.
+\def\multirowsetup{\raggedright}
+% User entry point. #1 = nrows. If no optional [bigstruts] argument
+% follows, the default [0] is supplied before handing over to \@multirow.
+\def\multirow#1{\relax\@ifnextchar
+ [{\@multirow{#1}}{\@multirow{#1}[0]}}
+% Second stage. #1 = nrows, #2 = bigstruts, #3 = width. If no optional
+% [fixup] argument follows, the default [0pt] is supplied before handing
+% over to \@xmultirow.
+\def\@multirow#1[#2]#3{\@ifnextchar [{\@xmultirow{#1}[#2]{#3}}%
+ {\@xmultirow{#1}[#2]{#3}[0pt]}}
+% Worker macro. #1 = nrows, #2 = bigstruts, #3 = width (or *),
+% #4 = fixup, #5 = text.
+%  1. \@tempdima is set to nrows times (height + depth of \@arstrutbox),
+%     negated if nrows is negative so that it is positive, and then
+%     increased by bigstruts times \bigstrutjot.
+%  2. The text is set in box 0, a \vtop of that height with \vfill above
+%     and below: inside an \hbox when the width is *, otherwise as a
+%     paragraph with \hsize set to the width. \multirowsetup is expanded
+%     just before the text, and the text is surrounded by \strut's.
+%  3. The height and depth of box 0 are set to zero.
+%  4. \@tempdima is turned into the shift amount: for negative nrows the
+%     depth of \@arstrutbox is subtracted from it; otherwise it becomes
+%     the height of \@arstrutbox, plus \bigstrutjot when bigstruts > 0.
+%  5. The fixup #4 is added, and box 0 is output in a \vtop preceded by
+%     \vskip-\@tempdima.
+\def\@xmultirow#1[#2]#3[#4]#5{\@tempcnta=#1%
+ \@tempdima\@tempcnta\ht\@arstrutbox
+ \advance\@tempdima\@tempcnta\dp\@arstrutbox
+ \ifnum\@tempcnta<0\@tempdima=-\@tempdima\fi
+ \advance\@tempdima#2\bigstrutjot
+ \if*#3\setbox0\vtop to \@tempdima{\vfill\multirowsetup
+ \hbox{\strut#5\strut}\vfill}%
+ \else
+ \setbox0\vtop to \@tempdima{\hsize#3\@parboxrestore
+ \vfill \multirowsetup \strut#5\strut\par\vfill}%
+ \fi
+ \ht0\z@\dp0\z@
+ \ifnum\@tempcnta<0\advance\@tempdima-\dp\@arstrutbox
+ \else\@tempdima=\ht\@arstrutbox
+ \ifnum#2>0 \advance\@tempdima\bigstrutjot \fi
+ \fi
+ \advance\@tempdima#4\relax\leavevmode\vtop{\vskip-\@tempdima\box0\vss}}
+% Allocate \bigstrutjot and initialise it to \jot, but only when it is not
+% already defined.
+\@ifundefined{bigstrutjot}{\newdimen\bigstrutjot \bigstrutjot\jot}{}
View
79 draft/platform.tex
@@ -0,0 +1,79 @@
+\section{System Platform}
+\label{sec:platform}
+
+\begin{table}[!t]
+ \caption{Performance levels of GTX 480 (GPU).}
+ \vspace{-2.0mm}
+ \label{tab:GPU-frequency}
+ \footnotesize
+ \begin{center}
+ \begin{tabular}{|c|c|c|c|}
+ \hline
+ Clock Domains &Min [MHz]&Low [MHz] & High [MHz]\\
+ \hline
+ \hline
+ Core & 50 & 405 & 700\\
+ \hline
+ Memory & 135 & 324 & 1848\\
+ \hline
+ \end{tabular}
+ \vspace{-5.0mm}
+ \end{center}
+\end{table}
+
+\begin{table}[!t]
+ \caption{Performance levels of Core i5 2400 (CPU).}
+ \small
+ \label{tab:CPU-frequency}
+ \begin{center}
+ \begin{tabular}{|c|c|c|c|}
+ \hline
+ Platforms&Min [MHz]&Low [MHz]&High [MHz]\\
+ \hline
+ \hline
+ Core i5-2400 &1600& 2700 & 3300.1\\
+ \hline
+ \end{tabular}
+ \end{center}
+ \end{table}
+
+We use an NVIDIA GeForce GTX 480 graphics card and an Intel Core i5 2400
+processor with the Linux kernel 3.3.0.
+Tables~\ref{tab:GPU-frequency}~and~\ref{tab:CPU-frequency} present their
+available performance levels, respectively.
+To perform the experiment, we use NVIDIA's proprietary
+software~\cite{BLOB} and Gdev~\cite{Kato2012} case by case.
+NVIDIA's proprietary software does not provide a system interface to
+scale the performance level of the GPU.
+We hence provide the modified BIOS files for the GPU that force the
+binary driver to configure the GPU with the specified performance level
+when loaded.
+This method enables us to choose any set of the GPU core and memory
+clocks, but requires the driver to reload, and the configuration is
+static while the driver is running.
+On the other hand, Gdev allows the system to change the performance
+level of the GPU dynamically at runtime through the Linux ``/proc'' file
+system interface.
+However, it is available only for the GPU core clock at the moment, and
+the GPU memory clock is fixed at 135~MHz.
+This limitation is due to the open-source implementation on Linux, but
+will be removed in a future release.
+
+The experimental workload executes the Rodinia benchmark suite
+2.0.1~\cite{Che2010} and our original microbenchmark programs of matrix
+computation.
+The input data for each Rodinia program is set to the maximum feasible
+size, while the microbenchmark programs vary the data size to conduct
+fine-grained measurements. All of these programs are written in CUDA.
+We use the NVIDIA CUDA Compiler (NVCC) 4.0~\cite{CUDA40} to compile the
+programs.
+Both NVIDIA's proprietary software and Gdev receive the same program
+binary.
+
+The power and energy consumption of the system are measured by the
+YOKOGAWA WT1600 digital power meter.
+This instrument obtains the voltage and electric current every 50~ms
+from the power plug of the machine.
+The power consumption is calculated by multiplying the voltage and
+current, while the energy consumption is derived by accumulation of
+power consumption.
View
BIN  draft/poster_usenix-atc.xlsx
Binary file not shown
View
126 draft/refer.bib
@@ -0,0 +1,126 @@
+@inproceedings{Che2010,
+author = {S. Che and J. W. Sheaffer and M. Boyer and L. G. Szafaryn and L. Wang and K. Skadron },
+title = {A characterization of the Rodinia benchmark suite with comparison to contemporary CMP workloads},
+booktitle = {Proc. of the IEEE international symposium on Workload characterization},
+year = {2010},
+pages = {1--11},
+}
+
+@inproceedings{Hsu2001,
+author = {C-H. Hsu and U. Kremer and M. Hsiao},
+title = {Compiler-directed dynamic voltage/frequency scheduling for energy reduction in microprocessors},
+booktitle = {Proc. of the international symposium on Low power electronics and design},
+year = {2001},
+pages = {275--278},
+}
+
+
+@inproceedings{Hsu2003,
+author = { C-H. Hsu and U. Kremer },
+title = {The design, implementation, and evaluation of a compiler algorithm for CPU energy reduction},
+booktitle = {Proc. of the ACM SIGPLAN conference on Programming language design and implementation},
+year = {2003},
+pages = {38--48},
+}
+
+@inproceedings{Magklis2003,
+author = {G. Magklis and M. L. Scott and G. Semeraro and D. H. Albonesi and S. Dropsho },
+title = {Profile-based dynamic voltage and frequency scaling for a multiple clock domain microprocessor},
+booktitle = {Proc. of the annual international symposium on Computer architecture},
+pages = {14--27},
+year = {2003},
+}
+
+@inproceedings{Lee2011,
+author = {J. Lee and V. Sathisha and M. Schulte and K. Compton and N. S. Kim},
+title = {Improving Throughput of Power-Constrained GPUs Using Dynamic Voltage/Frequency and Core Scaling},
+booktitle = {Proc. of the international conference on Parallel architectures and compilation techniques},
+year = {2011},
+pages = {111--120},
+}
+
+@inproceedings{Kato2012,
+author = {S. Kato and M. McThrow and C. Maltzahn and S. Brandt},
+title = {Gdev: First-Class GPU Resource Management in the Operating System},
+booktitle = {Proc. of the USENIX annual technical conference},
+year = {2012},
+}
+
+
+@misc{NVIDIA_Fermi,
+author = {NVIDIA},
+title = {{NVIDIA's next generation CUDA compute architecture: Fermi}},
+howpublished = {\url{http://www.nvidia.com/content/PDF/fermi_white_papers/NVIDIA_Fermi_Compute_Architecture_Whitepaper.pdf}},
+year = {2009}
+}
+
+@misc{NVIDIA_Kepler,
+author = {NVIDIA},
+title = {{NVIDIA GeForce GTX 680: The fastest, most efficient GPU ever built}},
+howpublished = {\url{http://www.geforce.com/Active/en_US/en_US/pdf/GeForce-GTX-680-Whitepaper-FINAL.pdf}},
+year = {2012}
+}
+
+
+@inproceedings{Hong2009,
+author = {S. Hong and H. Kim},
+title = {An analytical model for a GPU architecture with memory-level and thread-level parallelism awareness},
+booktitle = {Proc. of the annual international symposium on Computer architecture},
+year = {2009},
+pages = {152--163},
+}
+
+
+@inproceedings{Hong2010,
+author = {S. Hong and H. Kim },
+title = {An integrated GPU power and performance model},
+booktitle = {Proc. of the annual international symposium on Computer architecture},
+year = {2010},
+pages = {280--289},
+}
+
+@inproceedings{Jiao2010,
+author={Y. Jiao and H. Lin and P. Balaji and W. Feng},
+title={Power and Performance Characterization of Computational Kernels on the GPU},
+booktitle={Proc. of the IEEE/ACM international conference on Green computing and communications \& international conference on Cyber, physical and social computing},
+year={2010},
+pages={221--228},
+}
+
+@inproceedings{Nagasaka2010,
+author = {H. Nagasaka and N. Maruyama and A. Nukada and T. Endo and S. Matsuoka},
+title = {Statistical power modeling of GPU kernels using performance counters},
+booktitle = {Proc. of the international conference on Green computing},
+year = {2010},
+pages = {115--122},
+}
+
+@inproceedings{Ying2011,
+author={Y. Zhang and Y. Hu and B. Li and L. Peng},
+title={Performance and Power Analysis of ATI GPU: A Statistical Approach},
+booktitle={Proc. of the IEEE international conference on Networking, architecture, and storage},
+year={2011},
+pages={149--158},
+}
+@misc{BLOB,
+author = {NVIDIA},
+title = {{Linux~X64~Display Driver}},
+howpublished = {\url{http://www.nvidia.com/object/linux-display-amd64-295.59-driver.html}},
+year = {2012}
+}
+
+@misc{CUDA40,
+author = {NVIDIA},
+title = {{CUDA 4.0}},
+howpublished = {\url{http://developer.nvidia.com/cuda-toolkit-40}},
+year = {2011}
+}
+
+@misc{envytools,
+author = {M. Koscielnicki},
+title = {{envytools}},
+howpublished = {\url{git://0x04.net/envytools.git}},
+year = {2012}
+}
+
+
View
66 draft/relatedwork.tex
@@ -0,0 +1,66 @@
+\section{Related Work}
+\label{sec:related_work}
+
+Nagasaka \textit{et al.} estimated the energy consumption of GPUs based on
+hardware performance counters~\cite{Nagasaka2010}.
+This performance counter, however, is not adequate in that power
+consumption rises even in an idle state when voltage and frequency are
+scaled, though the performance counter does not change in an idle
+state.
+Hence, this approach would require an additional model to precisely
+analyze the power consumption of the GPU.
+
+Hong \textit{et al.} studied energy savings of GPUs, assuming power
+gating available to limit the number of active
+cores~\cite{Hong2009,Hong2010}.
+In particular, they analyze PTX code to model the power and performance
+of GPUs based on the number of instructions and memory accesses.
+Unfortunately, none of the current GPU architectures yet supports power
+gating, which limited their contribution to simulation studies.
+Therefore, it is questionable if the presented power and performance
+model is applicable to the real world, and GPU power gating is also not
+a realistic assumption at the moment.
+Nonetheless, we consider that an offline PTX analysis for power and
+performance prediction is a useful approach to the design of DVFS
+algorithms.
+What is lacking in this approach, however, is a runtime analysis for input
+data.
+In this paper, we have analyzed the power and performance
+characteristics depending on the size of input data.
+
+Lee \textit{et al.} presented a method to apply DVFS algorithms to the
+GPU.
+They particularly aimed at maximizing performance under the given power
+constraint~\cite{Lee2011}.
+A strong limitation of their work, however, is that the evaluation of
+power consumption is based on a conceptual model but not on real-world
+hardware.
+They also failed to discuss how to determine the voltage and frequency.
+In this paper, we have rather explored how to minimize the energy
+consumption of GPU-accelerated systems using the cutting-edge real-world
+hardware.
+
+Jiao \textit{et al.} evaluated the power and performance of an old
+NVIDIA GTX~280 GPU~\cite{Jiao2010}.
+They examined compute-intensive and memory-intensive programs.
+According to their analysis, energy consumption could often be reduced
+by lowering the core clock when workload is memory-intensive.
+This is exactly the same as what we have identified for an NVIDIA
+GTX~480 GPU.
+Therefore, we conjecture that this observation and knowledge could be
+applied to future GPU architectures as well.
+In addition, we have disclosed that energy consumption could also be
+reduced by scaling the memory clock.
+This opens up a new insight into DVFS algorithms for GPU-accelerated
+systems.
+
+Zhang \textit{et al.} analyzed the power and performance of an AMD
+HD~5870 GPU using a random forest method with the profile
+counter~\cite{Ying2011}.
+They revealed that activating fewer ALUs reduces power
+consumption.
+However, this approach incurs an increase in execution time, and does
+not successfully reduce energy consumption.
+This is attributed to the fact that they use only software management.
+Meanwhile, we have demonstrated that energy can be reduced by scaling
+the voltage and frequency of the GPU.
View
104 draft/subfigure.sty
@@ -0,0 +1,104 @@
+%$% USC IRIS/SIPI TeX/LaTeX Macro Library
+%%%
+%$% subfigure.sty
+%$% Subfigure command for use with the LaTeX figure environment.
+%%%
+%$% $Header: subfigure.doc,v 1.1 88/11/02 11:03:46 cochran Exp $
+%%%
+%$% $Log: subfigure.doc,v $
+%$% Revision 1.1 88/11/02 11:03:46 cochran
+%$% Initial revision
+%$%
+%$% 05 Mar 86 cochran @ dworkin.usc.edu (USC-IRIS)
+%$% Created.
+%%%
+%$% Usage: \documentstyle[...,subfigure,...]{...}
+%%%
+%$% \subfigure[CAPTIONtext]{FIGUREbox}
+%%%
+%%% Creates a subfigure box with an optional CAPTION under a FIGURE. The
+%%% FIGURE is centered with \subfigtopskip of vertical space added above
+%%% and \subfigcapskip vertical space added below it followed by the
+%%% CAPTION. The subfigure is followed by another \subfigtopskip of
+%%% vertical space added at the bottom.
+%%%
+%%% If a CAPTION is given (including a null CAPTION '[]') then the subfigure
+%%% is labeled with a counter supplied by the macro '\thesubfigure' which
+%%% returns '(a) ', '(b) ', etc. If desired, this macro may be redefined.
+%%% The counter used for labeling the subfigures is 'subfigure' and is
+%%% incremented for each subfigure regardless of whether a CAPTION was
+%%% printed.
+%%%
+%%% One example of use is to surround the figure in a tabular environment:
+%%% \begin{figure}
+%%% \begin{center}
+%%% \begin{tabular}[t]{c}
+%%% \subfigure[First]{...}
+%%% \subfigure[Second]{...} \\
+%%% \subfigure[Third]{...}
+%%% \end{tabular}
+%%% \end{center}
+%%% \caption{Three Subfigures}
+%%% \end{figure}
+
+%% Subfigure counter. Declared with [figure], so it is reset each time
+%% the figure counter is stepped.
+\newcounter{subfigure}[figure]
+
+%% Generate CAPTION preface: the counter as a lowercase letter in
+%% parentheses followed by a space, e.g. '(a) '.
+\def\thesubfigure{(\alph{subfigure})\space}
+
+%% Size of the CAPTION font.
+\def\subcapsize{\footnotesize}
+
+%% Length from the top of the subfigure box to the beginning of the FIGURE
+%% box. Also from the bottom of the CAPTION to the bottom of the subfigure.
+\def\subfigtopskip{10pt}
+
+%% Length from the bottom of the FIGURE to the beginning of the CAPTION.
+\def\subfigcapskip{10pt}
+
+%% Indentation of the caption from the sides of the subfigure box.
+\def\subfigcapmargin{10pt}
+
+%%
+%% Define the subfigure command
+%%
+%% User-level entry point. Leaves vertical mode, then looks ahead for an
+%% optional '[CAPTION]' argument. If one is present, \@subfigure reads it
+%% directly; otherwise \@subfigure is invoked with \@empty as the caption
+%% argument, which \@subfigure uses as its "no caption given" marker.
+%%
+\def\subfigure{%
+ \leavevmode
+ \@ifnextchar [%
+ \@subfigure
+ {\@subfigure[\@empty]}}
+
+%% \@subfigure[CAPTION]{FIGURE} -- internal worker for \subfigure.
+%%   #1  caption text, or \@empty when \subfigure was called without '[...]'
+%%   #2  the figure material
+%% Steps the 'subfigure' counter, sets #2 in an \hbox and records its width
+%% in \@tempdima. It then builds a \vtop containing, from top to bottom:
+%% \subfigtopskip, the figure box, \subfigcapskip (reduced by 0.5em, a
+%% local tweak), a caption line exactly as wide as the figure, and a final
+%% \subfigtopskip.
+%% The caption is first set on one line in \@tempboxa. If that box is wider
+%% than the figure width minus twice \subfigcapmargin (held in \@tempdimb),
+%% the caption is re-set in a \parbox of that width; otherwise the one-line
+%% box is used, centred by the surrounding \hfil glue.
+%% NOTE(review): '\ifx #1\@empty' compares the first token of #1 with the
+%% token that follows it, not the whole caption with \@empty. A caption
+%% whose first two tokens are identical would presumably be misdetected as
+%% "no caption" -- confirm before relying on such captions.
+\long\def\@subfigure[#1]#2{%
+ \stepcounter{subfigure}%
+ \setbox\@tempboxa \hbox{#2}%
+ \@tempdima=\wd\@tempboxa
+ \vtop{%
+ \vbox{% Put the figure in the top box
+ \vskip\subfigtopskip
+ \box\@tempboxa}
+ %% Now skip down and add the caption paragraph.
+ \vskip\subfigcapskip
+ \vspace{-0.5em} % added manually by shinpei
+ \begingroup
+ \@parboxrestore
+ \setbox\@tempboxa
+ \ifx #1\@empty
+ \hbox{\subcapsize\strut\hfil}%
+ \else
+ \hbox{\subcapsize\strut\thesubfigure#1}%
+ \fi
+ \@tempdimb=-\subfigcapmargin
+ \multiply\@tempdimb\tw@
+ \advance\@tempdimb\@tempdima
+ \hbox to\@tempdima{%
+ \hfil
+ \ifdim \wd\@tempboxa >\@tempdimb % (Comment too wide for one line)
+ \parbox{\@tempdimb}{\subcapsize\thesubfigure#1}%
+ \else
+ \box\@tempboxa
+ \fi
+ \hfil}
+ \endgroup
+ \vskip\subfigtopskip}}
+
View
94 draft/usenix-kato.sty
@@ -0,0 +1,94 @@
+% usenix.sty - to be used with latex2e for USENIX.
+% To use this style file, look at the template usenix_template.tex
+%
+% $Id: usenix.sty,v 1.2 2005/02/16 22:30:47 maniatis Exp $
+%
+% The following definitions are modifications of standard article.sty
+% definitions, arranged to do a better job of matching the USENIX
+% guidelines.
+% It will automatically select two-column mode and the Times-Roman
+% font.
+
+%
+% USENIX papers are two-column.
+% Times-Roman font is nice if you can get it (requires NFSS,
+% which is in latex2e).
+% If the document is not already in two-column mode, the two-column
+% definitions are loaded here; the 'times' package selects the font.
+
+\if@twocolumn\else\input twocolumn.sty\fi
+\usepackage{times}
+
+%
+% USENIX wants margins of: 1" sides, 1" bottom, and 1" top.
+% 0.25" gutter between columns.
+% Gives active areas of 6.5" x 9"
+%
+% NOTE(review): only the text block size, the column gutter and the
+% top/head lengths are set below. No side margin (\oddsidemargin /
+% \evensidemargin) is set in this file, so the 1" side margins presumably
+% rely on the document class defaults -- confirm.
+%
+\setlength{\textheight}{9.0in}
+\setlength{\columnsep}{0.25in}
+\setlength{\textwidth}{6.50in}
+
+\setlength{\topmargin}{0.0in}
+
+\setlength{\headheight}{0.0in}
+
+\setlength{\headsep}{0.0in}
+
+% Usenix wants no page numbers for camera-ready papers, so that they can
+% number them themselves. But submitted papers should have page numbers
+% for the reviewers' convenience.
+%
+%
+% \pagestyle{empty}
+
+%
+% Usenix titles are in 14-point bold type, with no date, and with no
+% change in the empty page headers. The whole author section is 12 point
+% italic--- you must use {\rm } around the actual author names to get
+% them in roman.
+%
+% \maketitle works inside a group: it switches footnote marks to symbols
+% (\fnsymbol) and locally redefines \@makefnmark and \@makefntext so the
+% mark is set as a math superscript. In two-column mode the title block
+% (\@maketitle) is passed as the optional argument of \twocolumn;
+% otherwise a new page is started, \@topnum is set to zero and
+% \@maketitle is typeset directly. \@thanks is emitted afterwards.
+% After the group closes, the footnote counter is reset to 0, and
+% \maketitle, \@maketitle and \thanks are set to \relax while \@thanks,
+% \@author and \@title are cleared, so the title can be produced only once.
+%
+\def\maketitle{\par
+ \begingroup
+ \renewcommand\thefootnote{\fnsymbol{footnote}}%
+ \def\@makefnmark{\hbox to\z@{$\m@th^{\@thefnmark}$\hss}}%
+ \long\def\@makefntext##1{\parindent 1em\noindent
+ \hbox to1.8em{\hss$\m@th^{\@thefnmark}$}##1}%
+ \if@twocolumn
+ \twocolumn[\@maketitle]%
+ \else \newpage
+ \global\@topnum\z@
+ \@maketitle \fi\@thanks
+ \endgroup
+ \setcounter{footnote}{0}%
+ \let\maketitle\relax
+ \let\@maketitle\relax
+ \gdef\@thanks{}\gdef\@author{}\gdef\@title{}\let\thanks\relax}
+
+% \@maketitle lays out the title block in a \vbox of fixed height (2.2in
+% in this variant). Inside the box, \fill glue above and below surrounds a
+% centered block: the title in \Large bold, a shrinkable 0.375in skip, and
+% the author block in \large italic set as a one-column, top-aligned
+% tabular. No date is typeset.
+\def\@maketitle{\newpage
+ \vbox to 2.2in{ %% Modified by abe - it was 2.5in originally.
+ \vspace*{\fill}
+ \vskip 2em
+ \begin{center}%
+ {\Large\bf \@title \par}%
+ \vskip 0.375in minus 0.300in
+ {\large\it
+ \lineskip .5em
+ \begin{tabular}[t]{c}\@author
+ \end{tabular}\par}%
+ \end{center}%
+ \par
+ \vspace*{\fill}
+% \vskip 1.5em
+ }
+}
+
+%
+% The abstract is preceded by a 12-pt bold centered heading
+% \abstract typesets only that heading (\abstractname in \large bold,
+% inside a center environment, pulled up by 0.5em); the center
+% environment is closed before the abstract body starts. \endabstract is
+% empty, so nothing is added after the body.
+\def\abstract{\begin{center}%
+{\large\bf \abstractname\vspace{-.5em}\vspace{\z@}}%
+\end{center}}
+\def\endabstract{}
+
+%
+% Main section titles are 12-pt bold. Others can be same or smaller.
+%
+% \section is redefined through \@startsection with: name 'section',
+% level 1, indent \z@, before-skip -3.5ex plus -1ex minus -.2ex,
+% after-skip 2.3ex plus .2ex, and heading style \reset@font\large\bf.
+\def\section{\@startsection {section}{1}{\z@}{-3.5ex plus-1ex minus
+ -.2ex}{2.3ex plus.2ex}{\reset@font\large\bf}}
View
94 draft/usenix.sty
@@ -0,0 +1,94 @@
+% usenix.sty - to be used with latex2e for USENIX.
+% To use this style file, look at the template usenix_template.tex
+%
+% $Id: usenix.sty,v 1.2 2005/02/16 22:30:47 maniatis Exp $
+%
+% The following definitions are modifications of standard article.sty
+% definitions, arranged to do a better job of matching the USENIX
+% guidelines.
+% It will automatically select two-column mode and the Times-Roman
+% font.
+
+%
+% USENIX papers are two-column.
+% Times-Roman font is nice if you can get it (requires NFSS,
+% which is in latex2e).
+% If the document is not already in two-column mode, the two-column
+% definitions are loaded here; 'mathptmx' selects the font.
+
+\if@twocolumn\else\input twocolumn.sty\fi
+\usepackage{mathptmx} % times roman, including math (where possible)
+
+%
+% USENIX wants margins of: 1" sides, 1" bottom, and 1" top.
+% 0.25" gutter between columns.
+% Gives active areas of 6.5" x 9"
+%
+% NOTE(review): only the text block size, the column gutter and the
+% top/head lengths are set below. No side margin (\oddsidemargin /
+% \evensidemargin) is set in this file, so the 1" side margins presumably
+% rely on the document class defaults -- confirm.
+%
+\setlength{\textheight}{9.0in}
+\setlength{\columnsep}{0.25in}
+\setlength{\textwidth}{6.50in}
+
+\setlength{\topmargin}{0.0in}
+
+\setlength{\headheight}{0.0in}
+
+\setlength{\headsep}{0.0in}
+
+% Usenix wants no page numbers for camera-ready papers, so that they can
+% number them themselves. But submitted papers should have page numbers
+% for the reviewers' convenience.
+%
+%
+% \pagestyle{empty}
+
+%
+% Usenix titles are in 14-point bold type, with no date, and with no
+% change in the empty page headers. The whole author section is 12 point
+% italic--- you must use {\rm } around the actual author names to get
+% them in roman.
+%
+% \maketitle works inside a group: it switches footnote marks to symbols
+% (\fnsymbol) and locally redefines \@makefnmark and \@makefntext so the
+% mark is set as a math superscript. In two-column mode the title block
+% (\@maketitle) is passed as the optional argument of \twocolumn;
+% otherwise a new page is started, \@topnum is set to zero and
+% \@maketitle is typeset directly. \@thanks is emitted afterwards.
+% After the group closes, the footnote counter is reset to 0, and
+% \maketitle, \@maketitle and \thanks are set to \relax while \@thanks,
+% \@author and \@title are cleared, so the title can be produced only once.
+%
+\def\maketitle{\par
+ \begingroup
+ \renewcommand\thefootnote{\fnsymbol{footnote}}%
+ \def\@makefnmark{\hbox to\z@{$\m@th^{\@thefnmark}$\hss}}%
+ \long\def\@makefntext##1{\parindent 1em\noindent
+ \hbox to1.8em{\hss$\m@th^{\@thefnmark}$}##1}%
+ \if@twocolumn
+ \twocolumn[\@maketitle]%
+ \else \newpage
+ \global\@topnum\z@
+ \@maketitle \fi\@thanks
+ \endgroup
+ \setcounter{footnote}{0}%
+ \let\maketitle\relax
+ \let\@maketitle\relax
+ \gdef\@thanks{}\gdef\@author{}\gdef\@title{}\let\thanks\relax}
+
+% \@maketitle lays out the title block in a \vbox of fixed height 2.5in.
+% Inside the box, \fill glue above and below surrounds a centered block:
+% the title in \Large bold, a shrinkable 0.375in skip, and the author
+% block in \large italic set as a one-column, top-aligned tabular.
+% No date is typeset.
+\def\@maketitle{\newpage
+ \vbox to 2.5in{
+ \vspace*{\fill}
+ \vskip 2em
+ \begin{center}%
+ {\Large\bf \@title \par}%
+ \vskip 0.375in minus 0.300in
+ {\large\it
+ \lineskip .5em
+ \begin{tabular}[t]{c}\@author
+ \end{tabular}\par}%
+ \end{center}%
+ \par
+ \vspace*{\fill}
+% \vskip 1.5em
+ }
+}
+
+%
+% The abstract is preceded by a 12-pt bold centered heading
+% \abstract typesets only that heading (\abstractname in \large bold,
+% inside a center environment, pulled up by 0.5em); the center
+% environment is closed before the abstract body starts. \endabstract is
+% empty, so nothing is added after the body.
+\def\abstract{\begin{center}%
+{\large\bf \abstractname\vspace{-.5em}\vspace{\z@}}%
+\end{center}}
+\def\endabstract{}
+
+%
+% Main section titles are 12-pt bold. Others can be same or smaller.
+%
+% \section is redefined through \@startsection with: name 'section',
+% level 1, indent \z@, before-skip -3.5ex plus -1ex minus -.2ex,
+% after-skip 2.3ex plus .2ex, and heading style \reset@font\large\bf.
+\def\section{\@startsection {section}{1}{\z@}{-3.5ex plus-1ex minus
+ -.2ex}{2.3ex plus.2ex}{\reset@font\large\bf}}
Please sign in to comment.
Something went wrong with that request. Please try again.