From bc279a7c36a75d388deea4074d346062445feaae Mon Sep 17 00:00:00 2001
From: jgp
Date: Tue, 26 Mar 2019 17:44:40 +0100
Subject: [PATCH 1/6] roofline_intel

---
 .../intel_advisor_roofline.py                 | 158 +++++++
 .../src/roofline/API/cscs.py                  |  67 +++
 .../src/roofline/LICENSE                      | 254 +++++++++++
 .../src/roofline/Makefile                     |  16 +
 .../src/roofline/TEMPL/cscs.tmpl              |  47 +++
 .../src/roofline/roofline_template.cpp        | 398 ++++++++++++++++++
 6 files changed, 940 insertions(+)
 create mode 100644 cscs-checks/tools/profiling_and_debugging/intel_advisor_roofline.py
 create mode 100644 cscs-checks/tools/profiling_and_debugging/src/roofline/API/cscs.py
 create mode 100644 cscs-checks/tools/profiling_and_debugging/src/roofline/LICENSE
 create mode 100644 cscs-checks/tools/profiling_and_debugging/src/roofline/Makefile
 create mode 100644 cscs-checks/tools/profiling_and_debugging/src/roofline/TEMPL/cscs.tmpl
 create mode 100644 cscs-checks/tools/profiling_and_debugging/src/roofline/roofline_template.cpp

diff --git a/cscs-checks/tools/profiling_and_debugging/intel_advisor_roofline.py b/cscs-checks/tools/profiling_and_debugging/intel_advisor_roofline.py
new file mode 100644
index 0000000000..1fb14bfb3f
--- /dev/null
+++ b/cscs-checks/tools/profiling_and_debugging/intel_advisor_roofline.py
@@ -0,0 +1,158 @@
+import reframe as rfm
+import reframe.utility.sanity as sn
+
+
+@rfm.required_version('>=2.14')
+@rfm.parameterized_test(*[[repeat, datalayout]
+                          for repeat in ['50000']
+                          for datalayout in ['G3_AOS_SCALAR', 'G3_SOA_SCALAR',
+                                             'G3_AOS_VECTOR', 'G3_SOA_VECTOR',
+                                             'G3_SOA_VECTOR_FMAS']])
+class IntelRooflineTest(rfm.RegressionTest):
+    '''This test checks the values reported by Intel Advisor's roofline model:
+    https://software.intel.com/en-us/intel-advisor-xe
+
+    The roofline model is based on GFLOPS and Arithmetic Intensity (AI):
+        "Self GFLOPS" = "Self GFLOP" / "Self Elapsed Time"
+        "Self GB/s" = "Self Memory GB" / "Self Elapsed Time"
+        "Self AI" = "Self GFLOPS" / "Self GB/s"
+
+    While a roofline analysis flag exists ('advixe-cl -collect roofline'), it
+    may not be used to collect data on MPI applications; in that case, the
+    survey and flops analysis must be collected separately: first run a survey
+    analysis ('advixe-cl -collect survey') and then run a tripcounts+flops
+    analysis ('advixe-cl -collect tripcounts -flop') using the same project
+    directory for both steps.
+    '''
+    def __init__(self, repeat, datalayout):
+        super().__init__()
+        self.name = 'Intel_Roofline_%s_%s' % (repeat, datalayout)
+        self.descr = 'repeat=%s' % repeat
+        self.valid_systems = ['daint:mc', 'dom:mc']
+        # Reporting MFLOPS is not available on Intel Haswell cpus, see
+        # https://www.intel.fr/content/dam/www/public/us/en/documents/manuals/
+        # 64-ia-32-architectures-software-developer-vol-1-manual.pdf
+        self.valid_prog_environs = ['PrgEnv-intel']
+        self.prgenv_flags = {
+            'PrgEnv-intel': ['-O2', '-g', '-std=c++11'],
+        }
+        self.sourcesdir = 'src/roofline'
+        self.build_system = 'Make'
+        self.prebuild_cmd = [
+            'sed -e "s-XXXX-%s-" -e "s-YYYY-%s-" %s &> %s' %
+            (repeat, datalayout, 'roofline_template.cpp', '_roofline.cpp')
+        ]
+        self.num_tasks = 1
+        self.num_tasks_per_node = 1
+        self.num_cpus_per_task = 1
+        self.variables = {
+            'OMP_NUM_THREADS': str(self.num_cpus_per_task),
+            'CRAYPE_LINK_TYPE': 'dynamic',
+        }
+        self.pre_run = [
+            # Testing with advisor/2018:
+            # advisor/2019 is broken on dom ("Exceeded job memory limit"),
+            # and advisor/2019 is not installed on daint,
+            'source $INTEL_PATH/../advisor_2018/advixe-vars.sh',
+            'advixe-cl -help collect |head -20',
+        ]
+        self.executable = 'advixe-cl'
+        self.exe = './roof.exe'
+        self.roofdir = './roof.dir'
+        self.executable_opts = [
+            '--collect survey --project-dir=%s --search-dir src:rp=. '
+            '--data-limit=0 --no-auto-finalize --trace-mpi -- %s ' %
+            (self.roofdir, self.exe)
+        ]
+        self.version_rpt = 'Intel_Advisor_version.rpt'
+        self.post_run = [
+            # collecting the performance data for the roofline model is a 2
+            # steps process:
+            'srun %s --collect tripcounts --flop --project-dir=%s '
+            '--search-dir src:rp=. --data-limit=0 --no-auto-finalize '
+            '--trace-mpi -- %s' % (self.executable, self.roofdir, self.exe),
+            # check tool's version:
+            'advixe-cl -V &> %s' % self.version_rpt,
+            # "advixe-cl --report" looks for e000/ in the output directory;
+            # if not found, it will fail with:
+            # IOError: Survey result cannot be loaded
+            'cd %s;ln -s nid* e000;cd -' % self.roofdir,
+        ]
+        self.roofline_ref = 'Intel_Advisor_roofline_reference.rpt'
+        self.roofline_rpt = 'Intel_Advisor_roofline_results.rpt'
+        self.post_run += [
+            # report reference values/boundaries (roofline_ref):
+            'advixe-cl --report=roofs --project-dir=%s &> %s' %
+            (self.roofdir, self.roofline_ref),
+            'python2 API/cscs.py %s &> %s' % (self.roofdir, self.roofline_rpt),
+            # 'advixe-cl --format=csv' seems to be not working (empty report),
+            # keeping as reference for later check:
+            # 'advixe-cl --show-all-columns -csv-delimiter=";"'
+            # ' --report=tripcounts --format=csv --project-dir=%s &> %s'
+            # This can be used instead (see advisor/config/report/roofs.tmpl):
+            # 'advixe-cl --report custom --report-template ./TEMPL/cscs.tmpl'
+            # ' --project-dir=%s &> %s'
+        ]
+        self.maintainers = ['JG']
+        self.tags = {'production'}
+
+    def setup(self, partition, environ, **job_opts):
+        super().setup(partition, environ, **job_opts)
+        environ_name = self.current_environ.name
+        prgenv_flags = self.prgenv_flags[environ_name]
+        self.build_system.cxxflags = prgenv_flags
+        toolsversion = '551025'  # 2018 Update 2 (build 551025)
+        # Reference roofline boundaries for Intel Broadwell CPU (E5-2695 v4):
+        _L1bw = 293*1024**3
+        _L2bw = 79*1024**3
+        _L3bw = 33*1024**3
+        _DPfmabw = 49*1024**3
+        _DPaddbw = 12*1024**3
+        _ScalarAddbw = 3*1024**3
+        self.sanity_patterns = sn.all([
+            # check the job status:
+            sn.assert_found('loop complete.', self.stdout),
+            # check the tool's version:
+            sn.assert_eq(sn.extractsingle(
+                r'I*.\(build\s(?P<toolsversion>\d+)\s*.',
+                self.version_rpt, 'toolsversion'), toolsversion),
+            # --- roofline boundaries:
+            # check --report=roofs (L1 bandwidth):
+            sn.assert_reference(sn.extractsingle(
+                r'^L1\sBandwidth\s\(single-threaded\)\s+(?P<L1bw>\d+)\s+'
+                r'memory$', self.roofline_ref, 'L1bw', int),
+                _L1bw, -0.08, 0.08),
+            # check --report=roofs (L2 bandwidth):
+            sn.assert_reference(sn.extractsingle(
+                r'^L2\sBandwidth\s\(single-threaded\)\s+(?P<L2bw>\d+)\s+'
+                r'memory$', self.roofline_ref, 'L2bw', int),
+                _L2bw, -0.08, 0.08),
+            # check --report=roofs (L3 bandwidth):
+            sn.assert_reference(sn.extractsingle(
+                r'^L3\sBandwidth\s\(single-threaded\)\s+(?P<L3bw>\d+)\s+'
+                r'memory$', self.roofline_ref, 'L3bw', int),
+                _L3bw, -0.08, 0.08),
+            # check --report=roofs (DP FMA):
+            sn.assert_reference(sn.extractsingle(
+                r'^DP Vector FMA Peak\s\(single-threaded\)\s+'
+                r'(?P<DPfmabw>\d+)\s+compute$', self.roofline_ref,
+                'DPfmabw', int), _DPfmabw, -0.08, 0.08),
+            # check --report=roofs (DP Add):
+            sn.assert_reference(sn.extractsingle(
+                r'^DP Vector Add Peak\s\(single-threaded\)\s+'
+                r'(?P<DPaddbw>\d+)\s+compute$', self.roofline_ref,
+                'DPaddbw', int), _DPaddbw, -0.08, 0.08),
+            # check --report=roofs (Scalar Add):
+            sn.assert_reference(sn.extractsingle(
+                r'^Scalar Add Peak\s\(single-threaded\)\s+'
+                r'(?P<ScalarAddbw>\d+)\s+compute$', self.roofline_ref,
+                'ScalarAddbw', int), _ScalarAddbw, -0.08, 0.08),
+            # --- check Arithmetic_intensity:
+            sn.assert_reference(sn.extractsingle(
+                r'^returned\sAI\sgap\s=\s(?P<Intensity>.*)', self.roofline_rpt,
+                'Intensity', float), 0.0, -0.01, 0.01),
+            # --- check GFLOPS:
+            sn.assert_reference(sn.extractsingle(
+                r'^returned\sGFLOPS\sgap\s=\s(?P<Flops>.*)', self.roofline_rpt,
+                'Flops', float), 0.0, -0.01, 0.01),
+        ])
diff --git a/cscs-checks/tools/profiling_and_debugging/src/roofline/API/cscs.py
b/cscs-checks/tools/profiling_and_debugging/src/roofline/API/cscs.py
new file mode 100644
index 0000000000..73bd683e62
--- /dev/null
+++ b/cscs-checks/tools/profiling_and_debugging/src/roofline/API/cscs.py
@@ -0,0 +1,67 @@
+# Taken from /opt/intel/advisor_2018/pythonapi/examples/roofline.py
+#
+# The roofline model is based on GFLOPS and Arithmetic Intensity (AI):
+# "Self GFLOPS" = "Self GFLOP" / "Self Elapsed Time"
+# "Self GB/s" = "Self Memory GB" / "Self Elapsed Time"
+# "Self AI" = "Self GFLOPS" / "Self GB/s"
+import sys
+try:
+    import advisor
+except ImportError:
+    print('''Import error: Python could not load advisor python library.
+ Possible reasons:\n 1. Python cannot resolve path to Advisor\'s pythonapi
+directory. To fix, either manually add path to the pythonapi directory into
+PYTHONPATH environment variable, or use advixe-vars.* scripts to set up
+product environment variables automatically.\n 2. Incompatible runtime
+versions used by advisor python library and other packages (such as
+matplotlib or pandas). To fix, either try to change import order or update
+other package version if possible. 3. cscs: try
+sys.path.append(\'/opt/intel/advisor/pythonapi\')''')
+    sys.exit(1)  # exit status 1: the advisor module could not be imported
+
+if len(sys.argv) < 2:
+    print('Usage: "python {} path_to_project_dir"'.format(__file__))
+    sys.exit(2)  # exit status 2: the project directory argument is missing
+
+project = advisor.open_project(sys.argv[1])
+data = project.load(advisor.SURVEY)
+# data = project.load(advisor.ALL)
+rows = [{col: row[col] for col in row} for row in data.bottomup]
+
+# --- Read reported values from rows[0]; '_'-prefixed names are recomputed:
+self_elapsed_time = float(rows[0]['self_elapsed_time'])
+
+self_memory_gb = float(rows[0]['self_memory_gb'])
+self_gb_s = float(rows[0]['self_gb_s'])
+_self_gb_s = self_memory_gb / self_elapsed_time  # recomputed "Self GB/s"
+
+self_gflop = float(rows[0]['self_gflop'])
+self_gflops = float(rows[0]['self_gflops'])
+_self_gflops = self_gflop / self_elapsed_time  # recomputed "Self GFLOPS"
+
+self_arithmetic_intensity = float(rows[0]['self_arithmetic_intensity'])
+_self_arithmetic_intensity = _self_gflops / _self_gb_s  # recomputed "Self AI"
+
+# --- Reported values (as read from the first bottom-up row):
+print('self_elapsed_time' , self_elapsed_time)
+print('self_memory_gb' , self_memory_gb)
+print('self_gb_s' , self_gb_s)
+print('self_gflop' , self_gflop)
+print('self_gflops' , self_gflops)
+print('self_arithmetic_intensity', self_arithmetic_intensity)
+
+print('_self_gb_s', _self_gb_s, self_gb_s)
+print('_self_gflops', _self_gflops, self_gflops)
+print('_self_arithmetic_intensity', _self_arithmetic_intensity,
+      self_arithmetic_intensity)
+
+print('gap _self_gb_s', _self_gb_s-self_gb_s)
+print('gap _self_gflops', _self_gflops-self_gflops)
+print('gap _self_arithmetic_intensity',
+      _self_arithmetic_intensity-self_arithmetic_intensity)
+
+# --- Gaps (recomputed minus reported), printed with 16 decimals:
+print('returned AI gap = {:.16f}'.
+      format(_self_arithmetic_intensity-self_arithmetic_intensity))
+print('returned GFLOPS gap = {:.16f}'.
+      format(_self_gflops-self_gflops))
diff --git a/cscs-checks/tools/profiling_and_debugging/src/roofline/LICENSE b/cscs-checks/tools/profiling_and_debugging/src/roofline/LICENSE
new file mode 100644
index 0000000000..d982501d7d
--- /dev/null
+++ b/cscs-checks/tools/profiling_and_debugging/src/roofline/LICENSE
@@ -0,0 +1,254 @@
+Copyright 2016-2017 Intel(R) Corporation
+
+http://software.intel.com/en-us/articles/intel-sample-source-code-license-agreement/
+
+Intel Sample Source Code License Agreement
+
+Code Samples License Agreement (Version December 2015)
+
+IMPORTANT - READ BEFORE COPYING, INSTALLING OR USING.
+Do not copy, install or use the Materials (as defined below) provided under
+this license agreement ("Agreement") from Intel Corporation (?Intel?), until
+you (?You?) have carefully read the following terms and conditions. By copying,
+installing or otherwise using the Materials, You agree to be bound by the terms
+of this Agreement. If You do not agree to the terms of this Agreement, do not
+copy, install or use the Materials.
+
+If You are agreeing to the terms and conditions of this Agreement on behalf of
+a company or other legal entity (?Legal Entity?), You represent and warrant
+that You have the legal authority to bind that Legal Entity to the Agreement,
+in which case, "You" or "Your" will mean such Legal Entity.
+
+By agreeing to this Agreement, You affirm that You are of legal age (18 years
+old or older) to enter into this Agreement. If You are not of legal age You may
+not enter into this Agreement, and either Your parent, legal guardian or Legal
+Entity must agree to the terms and conditions of this Agreement and enter into
+this Agreement, in which case, "You" or "Your" will mean such parent, legal
+guardian, or Legal Entity.
+
+Third Party Programs (as defined below), even if included with the distribution
+of the Materials, are governed by separate third party license terms, including
+without limitation, open source software license terms.
Such third party +license terms (and not this Agreement) govern Your use of the Third Party +Programs, and Intel is not liable for the Third Party Programs. + +1. LICENSE DEFINITIONS: + +?Licensed Patent Claims? means the claims of Intel?s patents that are +necessarily and directly infringed by the reproduction and distribution of the +Materials that is authorized in Section 2 below, when the Materials are in its +unmodified form as delivered by Intel to You and not modified or combined with +anything else. Licensed Patent Claims are only those claims that Intel can +license without paying, or getting the consent of, a third party. + +?Materials? means Sample Source Code, Redistributables, and End-User +Documentation but do not include Third Party Programs. + +?Sample Source Code? means Source Code files that are identified as sample code +and which may include example interface or application source code, and any +updates, provided under this Agreement. + +?Source Code? is defined as the software (and not documentation or text) +portion of the Materials provided in human readable format, and includes +modifications that You make or are made on Your behalf as expressly permitted +under the terms of this Agreement. + +?Redistributables? means header, library, and dynamically linkable library +files, and any updates, provided under this Agreement. + +?Third Party Programs? (if any) are the third party software files that may be +included with the Materials for the applicable software that include a separate +third party license agreement in an attached text file. + +?End-User Documentation? means textual materials intended for end users +relating to the Materials. + +2. LICENSE GRANT: + +Subject to the terms and conditions of this Agreement, Intel grants You a +non-exclusive, worldwide, non-assignable, royalty-free limited right and +license: + +A. 
under its copyrights, to: + +1) Copy, modify, and compile the Sample Source Code and distribute it solely in +Your products in executable and source code form; +2) Copy and distribute the Redistributables solely with Your products; +3) Copy, modify, and distribute the End User Documentation solely with Your +products. + +B. Under its patents, to: + +1) make copies of the Materials internally only; +2) use the Materials internally only; and +3) offer to distribute, and distribute, but not sell, the Materials only as +part of or with Your products, under Intel?s copyright license granted in +Section 2(A) but only under the terms of that copyright license and not as a +sale (but this right does not include the right to sub-license); +4) provided, further, that the license under the Licensed Patent Claims does +not and will not apply to any modifications to, or derivative works of, the +Materials, whether made by You, Your end user (which, for all purposes under +this Agreement, will mean either an end user, a customer, reseller, distributor +or other channel partner), or any third party even if the modification and +creation of derivative works are permitted under 2(A). + +3. LICENSE RESTRICTIONS: + +Except as expressly provided in this Agreement, You may not: + +i. use, copy, distribute or publicly display the Materials; +ii. reverse-assemble, reverse-compile, or otherwise reverse-engineer any +software provided solely in binary form, iii. rent or lease the Materials to +any third party; +iv. assign this Agreement or display the Materials; +v. assign this Agreement or transfer the Materials; +vi. modify, adapt or translate the Materials in whole or in part; +vii. distribute, sublicense or transfer the source code form of the Materials +or derivatives thereof to any third party; viii. distribute the Materials +except as part of Your products; +ix. include the Materials in malicious, deceptive, or unlawful programs or +products; +x. 
modify, create a derivative work, link or distribute the Materials so that +any part of it becomes subject to an Excluded License. + +Upon Intel's release of an update, upgrade, or new version of the Materials, +you will make reasonable efforts to discontinue distribution of the enclosed +Materials and you will make reasonable efforts to distribute such updates, +upgrades, or new versions to your customers who have received the Materials +herein. + +Distribution of the Materials is also subject to the following limitations. +You: + +i. will be solely responsible to your customers for any update or support +obligation or other liability which may arise from the distribution; +ii. will not make any statement that your product is "certified," or that its +performance is guaranteed, by Intel; +iii. will not use Intel's name or trademarks to market your product without +written permission; +iv. will prohibit disassembly and reverse engineering of the Materials provided +in executable form; +v. will not publish reviews of Materials without written permission by Intel, +and +vi. will indemnify, hold harmless, and defend Intel and its suppliers from and +against any claims or lawsuits, including attorney's fees, that arise or result +from your distribution of any product. + +4. OWNERSHIP: Title to the Materials and all copies thereof remain with Intel +or its suppliers. The Materials are copyrighted and are protected by United +States copyright laws and international treaty provisions. You will not remove +any copyright notice from the Materials. You agree to prevent unauthorized +copying of the Materials. Except as expressly provided herein, Intel does not +grant any express or implied right to you under Intel patents, copyrights, +trademarks, or trade secret information. + +5. NO WARRANTY AND NO SUPPORT: Disclaimer. 
Intel disclaims all warranties of +any kind and the terms and remedies provided in this Agreement are instead of +any other warranty or condition, express, implied or statutory, including those +regarding merchantability, fitness for any particular purpose, non-infringement +or any warranty arising out of any course of dealing, usage of trade, proposal, +specification or sample. Intel does not assume (and does not authorize any +person to assume on its behalf) any other liability. + +Intel may make changes to the Materials, or to items referenced therein, at any +time without notice, but is not obligated to support, update or provide +training for the Materials. Intel may in its sole discretion offer such +support, update or training services under separate terms at Intel?s +then-current rates. You may request additional information on Intel?s service +offerings from an Intel sales representative. + +6. USER SUBMISSIONS: You agree that any material, information, or other +communication, including all data, images, sounds, text, and other things +embodied therein, you transmit or post to an Intel website will be considered +non-confidential ("Communications"). Intel will have no confidentiality +obligations with respect to the Communications. You agree that Intel and its +designees will be free to copy, modify, create derivative works, publicly +display, disclose, distribute, license and sublicense through multiple tiers of +distribution and licensees, incorporate, and otherwise use the Communications, +including derivative works thereto, for any and all commercial or +non-commercial purposes. + +7. LIMITATION OF LIABILITY: Neither Intel nor its suppliers shall be liable for +any damages whatsoever (including, without limitation, damages for loss of +business profits, business interruption, loss of business information, or other +loss) arising out of the use of or inability to use the Materials, even if +Intel has been advised of the possibility of such damages. 
Because some +jurisdictions prohibit the exclusion or limitation of liability for +consequential or incidental damages, the above limitation may not apply to You. + +8. TERM AND TERMINATION: This Agreement commences upon Your copying, installing +or using the Materials and continues until terminated. Either You or Intel may +terminate this Agreement at any time upon 30 days prior written notice to the +other party. Intel may terminate this license at any time if you are in breach +of any of its terms and conditions. Upon termination, You will immediately +destroy the Materials or return all copies of the Materials to Intel along with +any copies You have made. After termination, the license grant to any Materials +or Redistributables distributed by You in accordance with the terms and +conditions of this Agreement, prior to the effective date of such termination, +will survive any such termination of this Agreement. + +9. U.S. GOVERNMENT RESTRICTED RIGHTS: The technical data and computer software +covered by this license is a ?Commercial Item,? as such term is defined by the +FAR 2.101 (48 C.F.R. 2.101) and is ?commercial computer software? and +?commercial computer software documentation? as specified under FAR 12.212 (48 +C.F.R. 12.212) or DFARS 227.7202 (48 C.F.R. 227.7202), as applicable. This +commercial computer software and related documentation is provided to end users +for use by and on behalf of the U.S. Government, with only those rights as are +granted to all other end users pursuant to the terms and conditions herein. Use +for or on behalf of the U.S. Government is permitted only if the party +acquiring or using this software is properly authorized by an appropriate U.S. +Government official. This use by or for the U.S. Government clause is in lieu +of, and supersedes, any other FAR, DFARS, or other provision that addresses +Government rights in the computer software or documentation covered by this +license. All copyright licenses granted to the U.S. 
Government are coextensive +with the technical data and computer software licenses granted herein. The U.S. +Government will only have the right to reproduce, distribute, perform, display, +and prepare derivative works as needed to implement those rights. + +10. APPLICABLE LAWS: All disputes arising out of or related to this Agreement, +whether based on contract, tort, or any other legal or equitable theory, will +in all respects be governed by, and construed and interpreted under, the laws +of the United States of America and the State of Delaware, without reference to +conflict of laws principles. The parties agree that the United Nations +Convention on Contracts for the International Sale of Goods (1980) is +specifically excluded from and will not apply to this Agreement. All disputes +arising out of or related to this Agreement, whether based on contract, tort, +or any other legal or equitable theory, will be subject to the exclusive +jurisdiction of the courts of the State of Delaware or of the Federal courts +sitting in that State. Each party submits to the personal jurisdiction of those +courts and waives all objections to that jurisdiction and venue for those +disputes. + +11. SEVERABILITY: The parties intend that if a court holds that any provision +or part of this Agreement is invalid or unenforceable under applicable law, the +court will modify the provision to the minimum extent necessary to make it +valid and enforceable, or if it cannot be made valid and enforceable, the +parties intend that the court will sever and delete the provision or part from +this Agreement. Any change to or deletion of a provision or part of this +Agreement under this Section will not affect the validity or enforceability of +the remainder of this Agreement, which will continue in full force and effect. + +12. 
EXPORT: You must comply with all laws and regulations of the United States +and other countries governing the export, re-export, import, transfer, +distribution, use, and servicing of Software. In particular, You must not: (a) +sell or transfer Software to a country subject to sanctions, or to any entity +listed on a denial order published by the United States government or any other +relevant government; or (b) use, sell, or transfer Software for the +development, design, manufacture, or production of nuclear, missile, chemical +or biological weapons, or for any other purpose prohibited by the United States +government or other applicable government; without first obtaining all +authorizations required by all applicable laws. For more details on Your export +obligations, please visit http://www.intel.com/content/www/us/en/legal/ +export-compliance.html. + +13. ENTIRE AGREEMENT: This Agreement contains the complete and exclusive +agreement and understanding between the parties concerning the subject matter +of this Agreement, and supersedes all prior and contemporaneous proposals, +agreements, understanding, negotiations, representations, warranties, +conditions, and communications, oral or written, between the parties relating +to the same subject matter. No modification or amendment to this Agreement will +be effective unless in writing and signed by authorized representatives of each +party, and must specifically identify this Agreement. 
+
+For more complete information about compiler optimizations, see our Optimization
+Notice: https://software.intel.com/en-us/articles/optimization-notice#opt-en
diff --git a/cscs-checks/tools/profiling_and_debugging/src/roofline/Makefile b/cscs-checks/tools/profiling_and_debugging/src/roofline/Makefile
new file mode 100644
index 0000000000..d5900fa44a
--- /dev/null
+++ b/cscs-checks/tools/profiling_and_debugging/src/roofline/Makefile
@@ -0,0 +1,16 @@
+RM := rm -f
+EXECUTABLE := roof.exe
+all: $(EXECUTABLE) # first rule in the file, i.e. the default goal
+OBJS := _roofline.o
+
+$(OBJS): # no prerequisites listed; the source name comes from $(@:.o=.cpp)
+	$(PREP) $(CXX) $(CXXFLAGS) -c -o $(@) $(@:.o=.cpp)
+
+$(EXECUTABLE): $(OBJS)
+	$(PREP) $(CXX) $(CXXFLAGS) -o $(@) $(OBJS) $(LDFLAGS)
+
+clean: # removes the object files only
+	-$(RM) $(OBJS)
+
+distclean: # removes the object files and the executable
+	-$(RM) $(OBJS) $(EXECUTABLE)
diff --git a/cscs-checks/tools/profiling_and_debugging/src/roofline/TEMPL/cscs.tmpl b/cscs-checks/tools/profiling_and_debugging/src/roofline/TEMPL/cscs.tmpl
new file mode 100644
index 0000000000..5d1d1684f3
--- /dev/null
+++ b/cscs-checks/tools/profiling_and_debugging/src/roofline/TEMPL/cscs.tmpl
@@ -0,0 +1,47 @@
+# -----------------------------------------------------------------------------
+# Taken from /opt/intel/advisor_2018/config/report/roofs.tmpl
+# NOTE(review): environment/data_input/_progress/_messenger: set by advixe-cl?
+# advixe-cl --report custom --report-template ./cscs.tmpl --project-dir=./mydir
+# -----------------------------------------------------------------------------
+import advixe_discpythonapi1 as discmodels
+import sys
+import disccli2 as common  # 'common' is not referenced in this template
+
+def create_table(roofs):  # builds [header, row, ...]; not called below
+    data = []
+    columns = ["Name", "Bandwidth", "Type"]
+    data.append(columns)
+    if not roofs:
+        return data  # header row only when there are no roofs
+    for roof in roofs:
+        row = [roof.name, str(roof.bandwidth)]
+        if roof.has_type(discmodels.roof_type.compute):
+            row.append("compute")
+        else:
+            row.append("memory")
+        data.append(row)
+    return data
+
+format = environment.cmd_args['output_format']  # not used below
+output = environment.cmd_args['output_file']
+delim = environment.cmd_args['csv_delimiter']  # not used below
+
+if output:
+    prev_stdout = sys.stdout
+    sys.stdout = open(output, 'w')  # print() output now goes to this file
+
+project = discmodels.open_project(data_input, _progress,_messenger)
+result = project.load(discmodels.ModelType.SURVEY)
+if result:
+#ok print(result.get_column_descriptions())
+    # bottomup: only the first row returned by the iterator is printed
+    iterjg = result.get_bottomup_rows()
+    dictjg0 = next(iterjg)
+    print(dictjg0['source_location'],
+          dictjg0['total_elapsed_time'], dictjg0['self_elapsed_time'],
+          dictjg0['total_time_percent'],dictjg0['self_time_percent'],
+          dictjg0['self_gflop'],dictjg0['self_gflops'],
+          )
+
+if output:
+    sys.stdout = prev_stdout  # restore stdout; the opened file is not closed
diff --git a/cscs-checks/tools/profiling_and_debugging/src/roofline/roofline_template.cpp b/cscs-checks/tools/profiling_and_debugging/src/roofline/roofline_template.cpp
new file mode 100644
index 0000000000..f931551d66
--- /dev/null
+++ b/cscs-checks/tools/profiling_and_debugging/src/roofline/roofline_template.cpp
@@ -0,0 +1,398 @@
+//==============================================================
+// https://software.intel.com/sites/default/files/managed/5f/33/roofline_demo_samples.zip
+//
+// SAMPLE SOURCE CODE - SUBJECT TO THE TERMS OF SAMPLE CODE LICENSE AGREEMENT,
+// http://software.intel.com/en-us/articles/intel-sample-source-code-license-agreement/
+//
+// Copyright 2016-2018 Intel Corporation
+//
+// THIS FILE IS PROVIDED "AS IS" WITH NO WARRANTIES, EXPRESS OR IMPLIED, INCLUDING BUT
+// NOT LIMITED TO ANY IMPLIED WARRANTY OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
+// PURPOSE, NON-INFRINGEMENT OF INTELLECTUAL PROPERTY RIGHTS.
+//
+// =============================================================
+
+/*
+* This file is intended for use with the "Roofline Analysis in
+* Intel(R) Advisor 2017" tutorial video. The tutorial video
+* depicts the use of this code with an Intel(R) Core(TM) i5-6300U
+* processor, compiled with the Intel(R) C++ compiler (version
+* 17.0).
Due to the system-dependent nature of Roofline, no +* guarantee is made that this code will produce the results +* depicted in the video on your machine (which may have different +* roofs and/or cache sizes) or with your compiler (which may alter +* algorithms and therefore arithmetic intensities). +*/ +#include +#include +#include +#include +using namespace std; + +/*******************************/ +/* Control Panel */ +/*******************************/ +//#define GROUP_1 +#ifdef GROUP_1 + //#define G1_AOS_SCALAR + #define G1_SOA_SCALAR + //#define G1_SOA_VECTOR +#endif /************************/ +//#define GROUP_2 +#ifdef GROUP_2 + //#define G2_AOS_SCALAR + //#define G2_SOA_SCALAR + #define G2_SOA_VECTOR +#endif /************************/ +#define GROUP_3 +#ifdef GROUP_3 + #define YYYY + //#define G3_AOS_SCALAR + //#define G3_SOA_SCALAR + //#define G3_AOS_VECTOR + //#define G3_SOA_VECTOR + //#define G3_SOA_VECTOR_FMAS +#endif /************************/ +//#define GROUP_4 +#ifdef GROUP_4 + #define G4_SOA + //#define G4_AOSOA +#endif /************************/ + +#define MAXVALUE 1000000 + +#define ARRAY_SIZE_1 1328 +//#define REPEAT_1 10000000 +//#define REPEAT_1 1000000 +//#define REPEAT_1 100000 +//#define REPEAT_1 10000 +#define REPEAT_1 XXXX + +#define ARRAY_SIZE_2 2000 +#define REPEAT_2 30000000 +#define UNROLL_COUNT 2 +#define VECTOR_LENGTH 4 +void setupArrays(); + +// ---- Timer ---- +std::chrono::time_point start, stop; + +// int elapsed_seconds = std::chrono::duration_cast +// (end-start).count(); +// std::time_t end_time = std::chrono::system_clock::to_time_t(end); +// +// std::cout << "finished computation at " << std::ctime(&end_time) +// << "elapsed time: " << elapsed_seconds << "s\n"; + +//typedef std::chrono::high_resolution_clock Clock; +//typedef std::chrono::time_point TimePoint; +//typedef std::chrono::duration Time; +//TimePoint start, stop; + +/********** Data Set 1 **********/ +// Array of Structures +typedef struct S1_AoS +{ + double a; + 
double b; + double pad1; + double pad2; +} S1_AoS; +double AoS1_X[ARRAY_SIZE_1]; +S1_AoS AoS1_Y[ARRAY_SIZE_1]; +// Structure of Arrays +typedef struct S1_SoA +{ + double a[ARRAY_SIZE_1]; + double b[ARRAY_SIZE_1]; + double pad1[ARRAY_SIZE_1]; + double pad2[ARRAY_SIZE_1]; +} S1_SoA; +double SoA1_X[ARRAY_SIZE_1]; +S1_SoA SoA1_Y; +/********** Data Set 2 **********/ +// Structure of Arrays +typedef struct S2_SoA +{ + double a[ARRAY_SIZE_2]; + double b[ARRAY_SIZE_2]; +} S2_SoA; +double SoA2_X[ARRAY_SIZE_2]; +S2_SoA SoA2_Y; +// Array of Structure of Arrays +typedef struct AoSoA +{ + double a[ARRAY_SIZE_2 / 2]; + double b[ARRAY_SIZE_2 / 2]; +} AoSoA; +double AoSoA_X[ARRAY_SIZE_2]; +AoSoA AoSoA_Y[2]; + +//int main() +int main(int argc, char *argv[]) +{ + MPI::Init(argc, argv); + setupArrays(); + cout << "Setup complete.\n"; + + //############################## Group 1 ##############################// + // Group 1 is a low arithmetic intensity algorithm intended to display + // roofline behavior which may initially seem counter-intuitive. 
+ #ifdef GROUP_1 + cout << "####################### Group 1 #######################\n" + << " Algorithm: X = Ya + Yb\n Data Set 1: " << ARRAY_SIZE_1 + << " doubles/array.\n"; + #endif + /******************** AOS - Unvectorized ********************/ + #ifdef G1_AOS_SCALAR + for (int r = 0; r < REPEAT_1; r++) + { + #pragma unroll (UNROLL_COUNT) + #pragma novector + for (int i = 0; i < ARRAY_SIZE_1; i++) + { + AoS1_X[i] = AoS1_Y[i].a + AoS1_Y[i].b; + } + } + cout << "Unvectorized AOS loop complete.\n"; + #endif + /******************** SOA - Unvectorized ********************/ + #ifdef G1_SOA_SCALAR + for (int r = 0; r < REPEAT_1; r++) + { + #pragma unroll (UNROLL_COUNT) + #pragma novector + for (int i = 0; i < ARRAY_SIZE_1; i++) + { + SoA1_X[i] = SoA1_Y.a[i] + SoA1_Y.b[i]; + } + } + cout << "Unvectorized SOA loop complete.\n"; + #endif + /********************* SOA - Vectorized *********************/ + #ifdef G1_SOA_VECTOR + for (int r = 0; r < REPEAT_1; r++) + { + #pragma unroll (UNROLL_COUNT) + #pragma omp simd simdlen(VECTOR_LENGTH) + for (int i = 0; i < ARRAY_SIZE_1; i++) + { + SoA1_X[i] = SoA1_Y.a[i] + SoA1_Y.b[i]; + } + } + cout << "Vectorized SOA loop complete.\n"; + #endif + + //############################## Group 2 ##############################// + // Group 2 is not explored in the tutorial video, but it's here if you + // wish to experiment with it. It has an AI between Groups 1 and 3. 
+ #ifdef GROUP_2 + cout << "####################### Group 2 #######################\n" + << " Algorithm: X = Ya + Yb + Yb\n Data Set 1: " << ARRAY_SIZE_1 + << " doubles/array.\n"; + #endif + /******************** AOS - Unvectorized ********************/ + #ifdef G2_AOS_SCALAR + for (int r = 0; r < REPEAT_1; r++) + { + #pragma unroll (UNROLL_COUNT) + #pragma novector + for (int i = 0; i < ARRAY_SIZE_1; i++) + { + AoS1_X[i] = AoS1_Y[i].a + AoS1_Y[i].b + AoS1_Y[i].b; + } + } + cout << "Unvectorized AOS loop complete.\n"; + #endif + /******************** SOA - Unvectorized ********************/ + #ifdef G2_SOA_SCALAR + for (int r = 0; r < REPEAT_1; r++) + { + #pragma unroll (UNROLL_COUNT) + #pragma novector + for (int i = 0; i < ARRAY_SIZE_1; i++) + { + SoA1_X[i] = SoA1_Y.a[i] + SoA1_Y.b[i] + SoA1_Y.b[i]; + } + } + cout << "Unvectorized SOA loop complete.\n"; + #endif + /********************* SOA - Vectorized *********************/ + #ifdef G2_SOA_VECTOR + for (int r = 0; r < REPEAT_1; r++) + { + #pragma unroll (UNROLL_COUNT) + #pragma omp simd simdlen(VECTOR_LENGTH) + for (int i = 0; i < ARRAY_SIZE_1; i++) + { + SoA1_X[i] = SoA1_Y.a[i] + SoA1_Y.b[i] + SoA1_Y.b[i]; + } + } + cout << "Vectorized SOA loop complete.\n"; + #endif + + //############################## Group 3 ##############################// + // Group 3 is a high arithmetic intensity algorithm that is intended + // to demonstrate compute binding and compiler-induced AI changes. 
+ #ifdef GROUP_3 + cout << "####################### Group 3 #######################\n" + << " Algorithm: X = Ya + Ya + Yb + Yb + Yb\n Data Set 1: " << ARRAY_SIZE_1 + << " doubles/array.\n"; + #endif + /******************** AOS - Unvectorized ********************/ + #ifdef G3_AOS_SCALAR + start = std::chrono::system_clock::now(); + for (int r = 0; r < REPEAT_1; r++) + { + #pragma unroll (UNROLL_COUNT) + #pragma novector + for (int i = 0; i < ARRAY_SIZE_1; i++) + { + AoS1_X[i] = AoS1_Y[i].a + AoS1_Y[i].a + AoS1_Y[i].b + AoS1_Y[i].b + AoS1_Y[i].b; + } + } + cout << "Unvectorized AOS loop complete.\n"; + stop = std::chrono::system_clock::now(); + std::cout << "elapsed time: " + << std::chrono::duration_cast (stop-start).count() + << "ms\n"; + + #endif + /******************** SOA - Unvectorized ********************/ + #ifdef G3_SOA_SCALAR + start = std::chrono::system_clock::now(); + for (int r = 0; r < REPEAT_1; r++) + { + #pragma unroll (UNROLL_COUNT) + #pragma novector + for (int i = 0; i < ARRAY_SIZE_1; i++) + { + SoA1_X[i] = SoA1_Y.a[i] + SoA1_Y.a[i] + SoA1_Y.b[i] + SoA1_Y.b[i] + SoA1_Y.b[i]; + } + } + cout << "Unvectorized SOA loop complete.\n"; + stop = std::chrono::system_clock::now(); + std::cout << "elapsed time: " + << std::chrono::duration_cast (stop-start).count() + << "ms\n"; + #endif + /********************* AOS - Vectorized *********************/ + #ifdef G3_AOS_VECTOR + start = std::chrono::system_clock::now(); + for (int r = 0; r < REPEAT_1; r++) + { + #pragma unroll (UNROLL_COUNT) + #pragma omp simd simdlen(VECTOR_LENGTH) + for (int i = 0; i < ARRAY_SIZE_1; i++) + { + AoS1_X[i] = AoS1_Y[i].a + AoS1_Y[i].a + AoS1_Y[i].b + AoS1_Y[i].b + AoS1_Y[i].b; + } + } + cout << "Vectorized AOS loop complete.\n"; + stop = std::chrono::system_clock::now(); + std::cout << "elapsed time: " + << std::chrono::duration_cast (stop-start).count() + << "ms\n"; + #endif + /********************* SOA - Vectorized *********************/ + #ifdef G3_SOA_VECTOR + start = 
std::chrono::system_clock::now(); + for (int r = 0; r < REPEAT_1; r++) + { + #pragma unroll (UNROLL_COUNT) + #pragma omp simd simdlen(VECTOR_LENGTH) + for (int i = 0; i < ARRAY_SIZE_1; i++) + { + SoA1_X[i] = SoA1_Y.a[i] + SoA1_Y.a[i] + SoA1_Y.b[i] + SoA1_Y.b[i] + SoA1_Y.b[i]; + } + } + cout << "Vectorized SOA loop complete.\n"; + stop = std::chrono::system_clock::now(); + std::cout << "elapsed time: " + << std::chrono::duration_cast (stop-start).count() + << "ms\n"; + #endif + /**************** SOA - Vectorized with FMAs ****************/ + #ifdef G3_SOA_VECTOR_FMAS + start = std::chrono::system_clock::now(); + for (int r = 0; r < REPEAT_1; r++) + { + #pragma unroll (UNROLL_COUNT) + #pragma omp simd simdlen(VECTOR_LENGTH) + for (int i = 0; i < ARRAY_SIZE_1; i++) + { + SoA1_X[i] = (2.0 * SoA1_Y.b[i] + SoA1_Y.b[i]) + SoA1_Y.a[i] * 2.0; + } + } + cout << "Vectorized SOA with FMAs loop complete.\n"; + stop = std::chrono::system_clock::now(); + std::cout << "elapsed time: " + << std::chrono::duration_cast (stop-start).count() + << "ms\n"; + #endif + + //############################## Group 4 ##############################// + // Group 4 uses a different data set than the other Groups, and has + // a medium AI. It is intended to demonstrate cache bandwidth binding. 
+ #ifdef GROUP_4 + cout << "####################### Group 4 #######################\n" + << " Algorithm: X = Ya + Ya + Yb + Yb\n Data Set 2: " << ARRAY_SIZE_2 + << " doubles/array.\n"; + #endif + /**************************** SOA ***************************/ + #ifdef G4_SOA + for (int r = 0; r < REPEAT_2; r++) + { + #pragma nounroll + #pragma omp simd simdlen(VECTOR_LENGTH) + for (int i = 0; i < ARRAY_SIZE_2; i++) + { + SoA2_X[i] = SoA2_Y.a[i] + SoA2_Y.a[i] + SoA2_Y.b[i] + SoA2_Y.b[i]; + } + } + cout << "SOA loop complete.\n"; + #endif + /*************************** AOSOA **************************/ + #ifdef G4_AOSOA + for (int r = 0; r < REPEAT_2; r++) + { + for (int j = 0; j < 2; j++) + { + #pragma nounroll + #pragma omp simd simdlen(VECTOR_LENGTH) + for (int i = 0; i < ARRAY_SIZE_2 / 2; i++) + { + AoSoA_X[(j * (ARRAY_SIZE_2 / 2)) + i] = AoSoA_Y[j].a[i] + AoSoA_Y[j].a[i] + + AoSoA_Y[j].b[i] + AoSoA_Y[j].b[i]; + } + } + } + cout << "AOSOA loop complete.\n"; + #endif + + MPI::Finalize(); + return EXIT_SUCCESS; +} + +void setupArrays() +{ + for (int i = 0; i < ARRAY_SIZE_1; i++) + { + SoA1_Y.a[i] = ((rand() % MAXVALUE) + 1) / 3; + SoA1_Y.b[i] = ((rand() % MAXVALUE) + 1) / 3; + AoS1_Y[i].a = SoA1_Y.a[i]; + AoS1_Y[i].b = SoA1_Y.b[i]; + } + for (int i = 0; i < ARRAY_SIZE_2; i++) + { + SoA2_Y.a[i] = ((rand() % MAXVALUE) + 1) / 3; + SoA2_Y.b[i] = ((rand() % MAXVALUE) + 1) / 3; + } + for (int i = 0; i < ARRAY_SIZE_2 / 2; i++) + { + AoSoA_Y[0].a[i] = SoA2_Y.a[i]; + AoSoA_Y[1].a[i] = SoA2_Y.a[i + (ARRAY_SIZE_2 / 2)]; + AoSoA_Y[0].b[i] = SoA2_Y.b[i]; + AoSoA_Y[1].b[i] = SoA2_Y.b[i + (ARRAY_SIZE_2 / 2)]; + } +} From 323ebae03f5a2ad7e8f236226a8f92b3629e4791 Mon Sep 17 00:00:00 2001 From: jgp Date: Tue, 26 Mar 2019 17:50:58 +0100 Subject: [PATCH 2/6] typo --- .../intel_advisor_roofline.py | 6 +++--- .../src/roofline/API/cscs.py | 20 +++++++++---------- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git 
a/cscs-checks/tools/profiling_and_debugging/intel_advisor_roofline.py b/cscs-checks/tools/profiling_and_debugging/intel_advisor_roofline.py index 1fb14bfb3f..e9b16c6e35 100644 --- a/cscs-checks/tools/profiling_and_debugging/intel_advisor_roofline.py +++ b/cscs-checks/tools/profiling_and_debugging/intel_advisor_roofline.py @@ -85,13 +85,13 @@ def __init__(self, repeat, datalayout): 'advixe-cl --report=roofs --project-dir=%s &> %s' % (self.roofdir, self.roofline_ref), 'python2 API/cscs.py %s &> %s' % (self.roofdir, self.roofline_rpt), - # 'advixe-cl --format=csv' seems to be not working (empty report), + # 'advixe-cl --format=csv' seems to be not working (empty report), # keeping as reference for later check: # 'advixe-cl --show-all-columns -csv-delimiter=";"' # ' --report=tripcounts --format=csv --project-dir=%s &> %s' # This can be used instead (see advisor/config/report/roofs.tmpl): # 'advixe-cl --report custom --report-template ./TEMPL/cscs.tmpl' - # ' --project-dir=%s &> %s' + # ' --project-dir=%s &> %s' ] self.maintainers = ['JG'] self.tags = {'production'} @@ -155,4 +155,4 @@ def setup(self, partition, environ, **job_opts): sn.assert_reference(sn.extractsingle( r'^returned\sGFLOPS\sgap\s=\s(?P.*)', self.roofline_rpt, 'Flops', float), 0.0, -0.01, 0.01), - ]) + ]) diff --git a/cscs-checks/tools/profiling_and_debugging/src/roofline/API/cscs.py b/cscs-checks/tools/profiling_and_debugging/src/roofline/API/cscs.py index 73bd683e62..99cc3709cb 100644 --- a/cscs-checks/tools/profiling_and_debugging/src/roofline/API/cscs.py +++ b/cscs-checks/tools/profiling_and_debugging/src/roofline/API/cscs.py @@ -1,5 +1,5 @@ # Taken from /opt/intel/advisor_2018/pythonapi/examples/roofline.py -# +# # The roofline model is based on GFLOPS and Arithmetic Intensity (AI): # "Self GFLOPS" = "Self GFLOP" / "Self Elapsed Time" # "Self GB/s" = "Self Memory GB" / "Self Elapsed Time" @@ -43,25 +43,25 @@ _self_arithmetic_intensity = _self_gflops / _self_gb_s # --- Reported values: 
-print('self_elapsed_time' , self_elapsed_time) -print('self_memory_gb' , self_memory_gb) -print('self_gb_s' , self_gb_s) -print('self_gflop' , self_gflop) -print('self_gflops' , self_gflops) +print('self_elapsed_time', self_elapsed_time) +print('self_memory_gb', self_memory_gb) +print('self_gb_s', self_gb_s) +print('self_gflop', self_gflop) +print('self_gflops', self_gflops) print('self_arithmetic_intensity', self_arithmetic_intensity) print('_self_gb_s', _self_gb_s, self_gb_s) print('_self_gflops', _self_gflops, self_gflops) print('_self_arithmetic_intensity', _self_arithmetic_intensity, - self_arithmetic_intensity) + self_arithmetic_intensity) print('gap _self_gb_s', _self_gb_s-self_gb_s) print('gap _self_gflops', _self_gflops-self_gflops) print('gap _self_arithmetic_intensity', - _self_arithmetic_intensity-self_arithmetic_intensity) + _self_arithmetic_intensity-self_arithmetic_intensity) # --- Compare the roofline report: print('returned AI gap = {:.16f}'. - format(_self_arithmetic_intensity-self_arithmetic_intensity)) + format(_self_arithmetic_intensity-self_arithmetic_intensity)) print('returned GFLOPS gap = {:.16f}'. 
- format(_self_gflops-self_gflops)) + format(_self_gflops-self_gflops)) From c6eebc6db2ddbcc4980229072f6274c41765b99f Mon Sep 17 00:00:00 2001 From: jgp Date: Wed, 27 Mar 2019 20:10:00 +0100 Subject: [PATCH 3/6] pullrequestreview-219555285 --- .../intel_advisor_roofline.py | 3 +- .../src/roofline/API/cscs.py | 67 --- .../src/roofline/LICENSE | 254 ----------- .../src/roofline/Makefile | 16 - .../src/roofline/TEMPL/cscs.tmpl | 47 --- .../src/roofline/roofline_template.cpp | 398 ------------------ 6 files changed, 2 insertions(+), 783 deletions(-) delete mode 100644 cscs-checks/tools/profiling_and_debugging/src/roofline/API/cscs.py delete mode 100644 cscs-checks/tools/profiling_and_debugging/src/roofline/LICENSE delete mode 100644 cscs-checks/tools/profiling_and_debugging/src/roofline/Makefile delete mode 100644 cscs-checks/tools/profiling_and_debugging/src/roofline/TEMPL/cscs.tmpl delete mode 100644 cscs-checks/tools/profiling_and_debugging/src/roofline/roofline_template.cpp diff --git a/cscs-checks/tools/profiling_and_debugging/intel_advisor_roofline.py b/cscs-checks/tools/profiling_and_debugging/intel_advisor_roofline.py index e9b16c6e35..bd4714f647 100644 --- a/cscs-checks/tools/profiling_and_debugging/intel_advisor_roofline.py +++ b/cscs-checks/tools/profiling_and_debugging/intel_advisor_roofline.py @@ -36,7 +36,8 @@ def __init__(self, repeat, datalayout): self.prgenv_flags = { 'PrgEnv-intel': ['-O2', '-g', '-std=c++11'], } - self.sourcesdir = 'src/roofline' + self.sourcesdir = os.path.join(self.current_system.resourcesdir, + 'Espresso', 'intel_advisor') self.build_system = 'Make' self.prebuild_cmd = [ 'sed -e "s-XXXX-%s-" -e "s-YYYY-%s-" %s &> %s' % diff --git a/cscs-checks/tools/profiling_and_debugging/src/roofline/API/cscs.py b/cscs-checks/tools/profiling_and_debugging/src/roofline/API/cscs.py deleted file mode 100644 index 99cc3709cb..0000000000 --- a/cscs-checks/tools/profiling_and_debugging/src/roofline/API/cscs.py +++ /dev/null @@ -1,67 +0,0 @@ -# Taken 
from /opt/intel/advisor_2018/pythonapi/examples/roofline.py -# -# The roofline model is based on GFLOPS and Arithmetic Intensity (AI): -# "Self GFLOPS" = "Self GFLOP" / "Self Elapsed Time" -# "Self GB/s" = "Self Memory GB" / "Self Elapsed Time" -# "Self AI" = "Self GFLOPS" / "Self GB/s" -import sys -try: - import advisor -except ImportError: - print('''Import error: Python could not load advisor python library. - Possible reasons:\n 1. Python cannot resolve path to Advisor\'s pythonapi -directory. To fix, either manually add path to the pythonapi directory into -PYTHONPATH environment variable, or use advixe-vars.* scripts to set up -product environment variables automatically.\n 2. Incompatible runtime -versions used by advisor python library and other packages (such as -matplotlib or pandas). To fix, either try to change import order or update -other package version if possible. 3. cscs: try -sys.path.append(\'/opt/intel/advisor/pythonapi\')''') - sys.exit(1) - -if len(sys.argv) < 2: - print('Usage: "python {} path_to_project_dir"'.format(__file__)) - sys.exit(2) - -project = advisor.open_project(sys.argv[1]) -data = project.load(advisor.SURVEY) -# data = project.load(advisor.ALL) -rows = [{col: row[col] for col in row} for row in data.bottomup] - -# --- Extract values from the report and compute our arithmetic_intensity: -self_elapsed_time = float(rows[0]['self_elapsed_time']) - -self_memory_gb = float(rows[0]['self_memory_gb']) -self_gb_s = float(rows[0]['self_gb_s']) -_self_gb_s = self_memory_gb / self_elapsed_time - -self_gflop = float(rows[0]['self_gflop']) -self_gflops = float(rows[0]['self_gflops']) -_self_gflops = self_gflop / self_elapsed_time - -self_arithmetic_intensity = float(rows[0]['self_arithmetic_intensity']) -_self_arithmetic_intensity = _self_gflops / _self_gb_s - -# --- Reported values: -print('self_elapsed_time', self_elapsed_time) -print('self_memory_gb', self_memory_gb) -print('self_gb_s', self_gb_s) -print('self_gflop', self_gflop) 
-print('self_gflops', self_gflops) -print('self_arithmetic_intensity', self_arithmetic_intensity) - -print('_self_gb_s', _self_gb_s, self_gb_s) -print('_self_gflops', _self_gflops, self_gflops) -print('_self_arithmetic_intensity', _self_arithmetic_intensity, - self_arithmetic_intensity) - -print('gap _self_gb_s', _self_gb_s-self_gb_s) -print('gap _self_gflops', _self_gflops-self_gflops) -print('gap _self_arithmetic_intensity', - _self_arithmetic_intensity-self_arithmetic_intensity) - -# --- Compare the roofline report: -print('returned AI gap = {:.16f}'. - format(_self_arithmetic_intensity-self_arithmetic_intensity)) -print('returned GFLOPS gap = {:.16f}'. - format(_self_gflops-self_gflops)) diff --git a/cscs-checks/tools/profiling_and_debugging/src/roofline/LICENSE b/cscs-checks/tools/profiling_and_debugging/src/roofline/LICENSE deleted file mode 100644 index d982501d7d..0000000000 --- a/cscs-checks/tools/profiling_and_debugging/src/roofline/LICENSE +++ /dev/null @@ -1,254 +0,0 @@ -Copyright 2016-2017 Intel(R) Corporation - -http://software.intel.com/en-us/articles/intel-sample-source-code-license-agreement/ - -Intel Sample Source Code License Agreement - -Code Samples License Agreement (Version December 2015) - -IMPORTANT - READ BEFORE COPYING, INSTALLING OR USING. -Do not copy, install or use the Materials (as defined below) provided under -this license agreement ("Agreement") from Intel Corporation (?Intel?), until -you (?You?) have carefully read the following terms and conditions. By copying, -installing or otherwise using the Materials, You agree to be bound by the terms -of this Agreement. If You do not agree to the terms of this Agreement, do not -copy, install or use the Materials. 
- -If You are agreeing to the terms and conditions of this Agreement on behalf of -a company or other legal entity (?Legal Entity?), You represent and warrant -that You have the legal authority to bind that Legal Entity to the Agreement, -in which case, "You" or "Your" will mean such Legal Entity. - -By agreeing to this Agreement, You affirm that You are of legal age (18 years -old or older) to enter into this Agreement. If You are not of legal age You may -not enter into this Agreement, and either Your parent, legal guardian or Legal -Entity must agree to the terms and conditions of this Agreement and enter into -this Agreement, in which case, "You" or "Your" will mean such parent, legal -guardian, or Legal Entity. - -Third Party Programs (as defined below), even if included with the distribution -of the Materials, are governed by separate third party license terms, including -without limitation, open source software license terms. Such third party -license terms (and not this Agreement) govern Your use of the Third Party -Programs, and Intel is not liable for the Third Party Programs. - -1. LICENSE DEFINITIONS: - -?Licensed Patent Claims? means the claims of Intel?s patents that are -necessarily and directly infringed by the reproduction and distribution of the -Materials that is authorized in Section 2 below, when the Materials are in its -unmodified form as delivered by Intel to You and not modified or combined with -anything else. Licensed Patent Claims are only those claims that Intel can -license without paying, or getting the consent of, a third party. - -?Materials? means Sample Source Code, Redistributables, and End-User -Documentation but do not include Third Party Programs. - -?Sample Source Code? means Source Code files that are identified as sample code -and which may include example interface or application source code, and any -updates, provided under this Agreement. - -?Source Code? 
is defined as the software (and not documentation or text) -portion of the Materials provided in human readable format, and includes -modifications that You make or are made on Your behalf as expressly permitted -under the terms of this Agreement. - -?Redistributables? means header, library, and dynamically linkable library -files, and any updates, provided under this Agreement. - -?Third Party Programs? (if any) are the third party software files that may be -included with the Materials for the applicable software that include a separate -third party license agreement in an attached text file. - -?End-User Documentation? means textual materials intended for end users -relating to the Materials. - -2. LICENSE GRANT: - -Subject to the terms and conditions of this Agreement, Intel grants You a -non-exclusive, worldwide, non-assignable, royalty-free limited right and -license: - -A. under its copyrights, to: - -1) Copy, modify, and compile the Sample Source Code and distribute it solely in -Your products in executable and source code form; -2) Copy and distribute the Redistributables solely with Your products; -3) Copy, modify, and distribute the End User Documentation solely with Your -products. - -B. 
Under its patents, to: - -1) make copies of the Materials internally only; -2) use the Materials internally only; and -3) offer to distribute, and distribute, but not sell, the Materials only as -part of or with Your products, under Intel?s copyright license granted in -Section 2(A) but only under the terms of that copyright license and not as a -sale (but this right does not include the right to sub-license); -4) provided, further, that the license under the Licensed Patent Claims does -not and will not apply to any modifications to, or derivative works of, the -Materials, whether made by You, Your end user (which, for all purposes under -this Agreement, will mean either an end user, a customer, reseller, distributor -or other channel partner), or any third party even if the modification and -creation of derivative works are permitted under 2(A). - -3. LICENSE RESTRICTIONS: - -Except as expressly provided in this Agreement, You may not: - -i. use, copy, distribute or publicly display the Materials; -ii. reverse-assemble, reverse-compile, or otherwise reverse-engineer any -software provided solely in binary form, iii. rent or lease the Materials to -any third party; -iv. assign this Agreement or display the Materials; -v. assign this Agreement or transfer the Materials; -vi. modify, adapt or translate the Materials in whole or in part; -vii. distribute, sublicense or transfer the source code form of the Materials -or derivatives thereof to any third party; viii. distribute the Materials -except as part of Your products; -ix. include the Materials in malicious, deceptive, or unlawful programs or -products; -x. modify, create a derivative work, link or distribute the Materials so that -any part of it becomes subject to an Excluded License. 
- -Upon Intel's release of an update, upgrade, or new version of the Materials, -you will make reasonable efforts to discontinue distribution of the enclosed -Materials and you will make reasonable efforts to distribute such updates, -upgrades, or new versions to your customers who have received the Materials -herein. - -Distribution of the Materials is also subject to the following limitations. -You: - -i. will be solely responsible to your customers for any update or support -obligation or other liability which may arise from the distribution; -ii. will not make any statement that your product is "certified," or that its -performance is guaranteed, by Intel; -iii. will not use Intel's name or trademarks to market your product without -written permission; -iv. will prohibit disassembly and reverse engineering of the Materials provided -in executable form; -v. will not publish reviews of Materials without written permission by Intel, -and -vi. will indemnify, hold harmless, and defend Intel and its suppliers from and -against any claims or lawsuits, including attorney's fees, that arise or result -from your distribution of any product. - -4. OWNERSHIP: Title to the Materials and all copies thereof remain with Intel -or its suppliers. The Materials are copyrighted and are protected by United -States copyright laws and international treaty provisions. You will not remove -any copyright notice from the Materials. You agree to prevent unauthorized -copying of the Materials. Except as expressly provided herein, Intel does not -grant any express or implied right to you under Intel patents, copyrights, -trademarks, or trade secret information. - -5. NO WARRANTY AND NO SUPPORT: Disclaimer. 
Intel disclaims all warranties of -any kind and the terms and remedies provided in this Agreement are instead of -any other warranty or condition, express, implied or statutory, including those -regarding merchantability, fitness for any particular purpose, non-infringement -or any warranty arising out of any course of dealing, usage of trade, proposal, -specification or sample. Intel does not assume (and does not authorize any -person to assume on its behalf) any other liability. - -Intel may make changes to the Materials, or to items referenced therein, at any -time without notice, but is not obligated to support, update or provide -training for the Materials. Intel may in its sole discretion offer such -support, update or training services under separate terms at Intel?s -then-current rates. You may request additional information on Intel?s service -offerings from an Intel sales representative. - -6. USER SUBMISSIONS: You agree that any material, information, or other -communication, including all data, images, sounds, text, and other things -embodied therein, you transmit or post to an Intel website will be considered -non-confidential ("Communications"). Intel will have no confidentiality -obligations with respect to the Communications. You agree that Intel and its -designees will be free to copy, modify, create derivative works, publicly -display, disclose, distribute, license and sublicense through multiple tiers of -distribution and licensees, incorporate, and otherwise use the Communications, -including derivative works thereto, for any and all commercial or -non-commercial purposes. - -7. LIMITATION OF LIABILITY: Neither Intel nor its suppliers shall be liable for -any damages whatsoever (including, without limitation, damages for loss of -business profits, business interruption, loss of business information, or other -loss) arising out of the use of or inability to use the Materials, even if -Intel has been advised of the possibility of such damages. 
Because some -jurisdictions prohibit the exclusion or limitation of liability for -consequential or incidental damages, the above limitation may not apply to You. - -8. TERM AND TERMINATION: This Agreement commences upon Your copying, installing -or using the Materials and continues until terminated. Either You or Intel may -terminate this Agreement at any time upon 30 days prior written notice to the -other party. Intel may terminate this license at any time if you are in breach -of any of its terms and conditions. Upon termination, You will immediately -destroy the Materials or return all copies of the Materials to Intel along with -any copies You have made. After termination, the license grant to any Materials -or Redistributables distributed by You in accordance with the terms and -conditions of this Agreement, prior to the effective date of such termination, -will survive any such termination of this Agreement. - -9. U.S. GOVERNMENT RESTRICTED RIGHTS: The technical data and computer software -covered by this license is a ?Commercial Item,? as such term is defined by the -FAR 2.101 (48 C.F.R. 2.101) and is ?commercial computer software? and -?commercial computer software documentation? as specified under FAR 12.212 (48 -C.F.R. 12.212) or DFARS 227.7202 (48 C.F.R. 227.7202), as applicable. This -commercial computer software and related documentation is provided to end users -for use by and on behalf of the U.S. Government, with only those rights as are -granted to all other end users pursuant to the terms and conditions herein. Use -for or on behalf of the U.S. Government is permitted only if the party -acquiring or using this software is properly authorized by an appropriate U.S. -Government official. This use by or for the U.S. Government clause is in lieu -of, and supersedes, any other FAR, DFARS, or other provision that addresses -Government rights in the computer software or documentation covered by this -license. All copyright licenses granted to the U.S. 
Government are coextensive -with the technical data and computer software licenses granted herein. The U.S. -Government will only have the right to reproduce, distribute, perform, display, -and prepare derivative works as needed to implement those rights. - -10. APPLICABLE LAWS: All disputes arising out of or related to this Agreement, -whether based on contract, tort, or any other legal or equitable theory, will -in all respects be governed by, and construed and interpreted under, the laws -of the United States of America and the State of Delaware, without reference to -conflict of laws principles. The parties agree that the United Nations -Convention on Contracts for the International Sale of Goods (1980) is -specifically excluded from and will not apply to this Agreement. All disputes -arising out of or related to this Agreement, whether based on contract, tort, -or any other legal or equitable theory, will be subject to the exclusive -jurisdiction of the courts of the State of Delaware or of the Federal courts -sitting in that State. Each party submits to the personal jurisdiction of those -courts and waives all objections to that jurisdiction and venue for those -disputes. - -11. SEVERABILITY: The parties intend that if a court holds that any provision -or part of this Agreement is invalid or unenforceable under applicable law, the -court will modify the provision to the minimum extent necessary to make it -valid and enforceable, or if it cannot be made valid and enforceable, the -parties intend that the court will sever and delete the provision or part from -this Agreement. Any change to or deletion of a provision or part of this -Agreement under this Section will not affect the validity or enforceability of -the remainder of this Agreement, which will continue in full force and effect. - -12. 
EXPORT: You must comply with all laws and regulations of the United States -and other countries governing the export, re-export, import, transfer, -distribution, use, and servicing of Software. In particular, You must not: (a) -sell or transfer Software to a country subject to sanctions, or to any entity -listed on a denial order published by the United States government or any other -relevant government; or (b) use, sell, or transfer Software for the -development, design, manufacture, or production of nuclear, missile, chemical -or biological weapons, or for any other purpose prohibited by the United States -government or other applicable government; without first obtaining all -authorizations required by all applicable laws. For more details on Your export -obligations, please visit http://www.intel.com/content/www/us/en/legal/ -export-compliance.html. - -13. ENTIRE AGREEMENT: This Agreement contains the complete and exclusive -agreement and understanding between the parties concerning the subject matter -of this Agreement, and supersedes all prior and contemporaneous proposals, -agreements, understanding, negotiations, representations, warranties, -conditions, and communications, oral or written, between the parties relating -to the same subject matter. No modification or amendment to this Agreement will -be effective unless in writing and signed by authorized representatives of each -party, and must specifically identify this Agreement. 
- -For more complete information about compiler optimizations, see our Optimization -Notice: https://software.intel.com/en-us/articles/optimization-notice#opt-en diff --git a/cscs-checks/tools/profiling_and_debugging/src/roofline/Makefile b/cscs-checks/tools/profiling_and_debugging/src/roofline/Makefile deleted file mode 100644 index d5900fa44a..0000000000 --- a/cscs-checks/tools/profiling_and_debugging/src/roofline/Makefile +++ /dev/null @@ -1,16 +0,0 @@ -RM := rm -f -EXECUTABLE := roof.exe -all: $(EXECUTABLE) -OBJS := _roofline.o - -$(OBJS): - $(PREP) $(CXX) $(CXXFLAGS) -c -o $(@) $(@:.o=.cpp) - -$(EXECUTABLE): $(OBJS) - $(PREP) $(CXX) $(CXXFLAGS) -o $(@) $(OBJS) $(LDFLAGS) - -clean: - -$(RM) $(OBJS) - -distclean: - -$(RM) $(OBJS) $(EXECUTABLE) diff --git a/cscs-checks/tools/profiling_and_debugging/src/roofline/TEMPL/cscs.tmpl b/cscs-checks/tools/profiling_and_debugging/src/roofline/TEMPL/cscs.tmpl deleted file mode 100644 index 5d1d1684f3..0000000000 --- a/cscs-checks/tools/profiling_and_debugging/src/roofline/TEMPL/cscs.tmpl +++ /dev/null @@ -1,47 +0,0 @@ -# ----------------------------------------------------------------------------- -# Taken from /opt/intel/advisor_2018/config/report/roofs.tmpl -# -# advixe-cl --report custom --report-template ./cscs.tmpl --project-dir=./mydir -# ----------------------------------------------------------------------------- -import advixe_discpythonapi1 as discmodels -import sys -import disccli2 as common - -def create_table(roofs): - data = [] - columns = ["Name", "Bandwidth", "Type"] - data.append(columns) - if not roofs: - return data - for roof in roofs: - row = [roof.name, str(roof.bandwidth)] - if roof.has_type(discmodels.roof_type.compute): - row.append("compute") - else: - row.append("memory") - data.append(row) - return data - -format = environment.cmd_args['output_format'] -output = environment.cmd_args['output_file'] -delim = environment.cmd_args['csv_delimiter'] - -if output: - prev_stdout = sys.stdout - sys.stdout 
= open(output, 'w') - -project = discmodels.open_project(data_input, _progress,_messenger) -result = project.load(discmodels.ModelType.SURVEY) -if result: -#ok print(result.get_column_descriptions()) - # bottomup: - iterjg = result.get_bottomup_rows() - dictjg0 = next(iterjg) - print(dictjg0['source_location'], - dictjg0['total_elapsed_time'], dictjg0['self_elapsed_time'], - dictjg0['total_time_percent'],dictjg0['self_time_percent'], - dictjg0['self_gflop'],dictjg0['self_gflops'], - ) - -if output: - sys.stdout = prev_stdout diff --git a/cscs-checks/tools/profiling_and_debugging/src/roofline/roofline_template.cpp b/cscs-checks/tools/profiling_and_debugging/src/roofline/roofline_template.cpp deleted file mode 100644 index f931551d66..0000000000 --- a/cscs-checks/tools/profiling_and_debugging/src/roofline/roofline_template.cpp +++ /dev/null @@ -1,398 +0,0 @@ -//============================================================== -// https://software.intel.com/sites/default/files/managed/5f/33/roofline_demo_samples.zip -// -// SAMPLE SOURCE CODE - SUBJECT TO THE TERMS OF SAMPLE CODE LICENSE AGREEMENT, -// http://software.intel.com/en-us/articles/intel-sample-source-code-license-agreement/ -// -// Copyright 2016-2018 Intel Corporation -// -// THIS FILE IS PROVIDED "AS IS" WITH NO WARRANTIES, EXPRESS OR IMPLIED, INCLUDING BUT -// NOT LIMITED TO ANY IMPLIED WARRANTY OF MERCHANTABILITY, FITNESS FOR A PARTICULAR -// PURPOSE, NON-INFRINGEMENT OF INTELLECTUAL PROPERTY RIGHTS. -// -// ============================================================= - -/* -* This file is intended for use with the "Roofline Analysis in -* Intel(R) Advisor 2017" tutorial video. The tutorial video -* depicts the use of this code with an Intel(R) Core(TM) i5-6300U -* processor, compiled with the Intel(R) C++ compiler (version -* 17.0). 
Due to the system-dependent nature of Roofline, no -* guarantee is made that this code will produce the results -* depicted in the video on your machine (which may have different -* roofs and/or cache sizes) or with your compiler (which may alter -* algorithms and therefore arithmetic intensities). -*/ -#include -#include -#include -#include -using namespace std; - -/*******************************/ -/* Control Panel */ -/*******************************/ -//#define GROUP_1 -#ifdef GROUP_1 - //#define G1_AOS_SCALAR - #define G1_SOA_SCALAR - //#define G1_SOA_VECTOR -#endif /************************/ -//#define GROUP_2 -#ifdef GROUP_2 - //#define G2_AOS_SCALAR - //#define G2_SOA_SCALAR - #define G2_SOA_VECTOR -#endif /************************/ -#define GROUP_3 -#ifdef GROUP_3 - #define YYYY - //#define G3_AOS_SCALAR - //#define G3_SOA_SCALAR - //#define G3_AOS_VECTOR - //#define G3_SOA_VECTOR - //#define G3_SOA_VECTOR_FMAS -#endif /************************/ -//#define GROUP_4 -#ifdef GROUP_4 - #define G4_SOA - //#define G4_AOSOA -#endif /************************/ - -#define MAXVALUE 1000000 - -#define ARRAY_SIZE_1 1328 -//#define REPEAT_1 10000000 -//#define REPEAT_1 1000000 -//#define REPEAT_1 100000 -//#define REPEAT_1 10000 -#define REPEAT_1 XXXX - -#define ARRAY_SIZE_2 2000 -#define REPEAT_2 30000000 -#define UNROLL_COUNT 2 -#define VECTOR_LENGTH 4 -void setupArrays(); - -// ---- Timer ---- -std::chrono::time_point start, stop; - -// int elapsed_seconds = std::chrono::duration_cast -// (end-start).count(); -// std::time_t end_time = std::chrono::system_clock::to_time_t(end); -// -// std::cout << "finished computation at " << std::ctime(&end_time) -// << "elapsed time: " << elapsed_seconds << "s\n"; - -//typedef std::chrono::high_resolution_clock Clock; -//typedef std::chrono::time_point TimePoint; -//typedef std::chrono::duration Time; -//TimePoint start, stop; - -/********** Data Set 1 **********/ -// Array of Structures -typedef struct S1_AoS -{ - double a; - 
double b; - double pad1; - double pad2; -} S1_AoS; -double AoS1_X[ARRAY_SIZE_1]; -S1_AoS AoS1_Y[ARRAY_SIZE_1]; -// Structure of Arrays -typedef struct S1_SoA -{ - double a[ARRAY_SIZE_1]; - double b[ARRAY_SIZE_1]; - double pad1[ARRAY_SIZE_1]; - double pad2[ARRAY_SIZE_1]; -} S1_SoA; -double SoA1_X[ARRAY_SIZE_1]; -S1_SoA SoA1_Y; -/********** Data Set 2 **********/ -// Structure of Arrays -typedef struct S2_SoA -{ - double a[ARRAY_SIZE_2]; - double b[ARRAY_SIZE_2]; -} S2_SoA; -double SoA2_X[ARRAY_SIZE_2]; -S2_SoA SoA2_Y; -// Array of Structure of Arrays -typedef struct AoSoA -{ - double a[ARRAY_SIZE_2 / 2]; - double b[ARRAY_SIZE_2 / 2]; -} AoSoA; -double AoSoA_X[ARRAY_SIZE_2]; -AoSoA AoSoA_Y[2]; - -//int main() -int main(int argc, char *argv[]) -{ - MPI::Init(argc, argv); - setupArrays(); - cout << "Setup complete.\n"; - - //############################## Group 1 ##############################// - // Group 1 is a low arithmetic intensity algorithm intended to display - // roofline behavior which may initially seem counter-intuitive. 
- #ifdef GROUP_1 - cout << "####################### Group 1 #######################\n" - << " Algorithm: X = Ya + Yb\n Data Set 1: " << ARRAY_SIZE_1 - << " doubles/array.\n"; - #endif - /******************** AOS - Unvectorized ********************/ - #ifdef G1_AOS_SCALAR - for (int r = 0; r < REPEAT_1; r++) - { - #pragma unroll (UNROLL_COUNT) - #pragma novector - for (int i = 0; i < ARRAY_SIZE_1; i++) - { - AoS1_X[i] = AoS1_Y[i].a + AoS1_Y[i].b; - } - } - cout << "Unvectorized AOS loop complete.\n"; - #endif - /******************** SOA - Unvectorized ********************/ - #ifdef G1_SOA_SCALAR - for (int r = 0; r < REPEAT_1; r++) - { - #pragma unroll (UNROLL_COUNT) - #pragma novector - for (int i = 0; i < ARRAY_SIZE_1; i++) - { - SoA1_X[i] = SoA1_Y.a[i] + SoA1_Y.b[i]; - } - } - cout << "Unvectorized SOA loop complete.\n"; - #endif - /********************* SOA - Vectorized *********************/ - #ifdef G1_SOA_VECTOR - for (int r = 0; r < REPEAT_1; r++) - { - #pragma unroll (UNROLL_COUNT) - #pragma omp simd simdlen(VECTOR_LENGTH) - for (int i = 0; i < ARRAY_SIZE_1; i++) - { - SoA1_X[i] = SoA1_Y.a[i] + SoA1_Y.b[i]; - } - } - cout << "Vectorized SOA loop complete.\n"; - #endif - - //############################## Group 2 ##############################// - // Group 2 is not explored in the tutorial video, but it's here if you - // wish to experiment with it. It has an AI between Groups 1 and 3. 
- #ifdef GROUP_2 - cout << "####################### Group 2 #######################\n" - << " Algorithm: X = Ya + Yb + Yb\n Data Set 1: " << ARRAY_SIZE_1 - << " doubles/array.\n"; - #endif - /******************** AOS - Unvectorized ********************/ - #ifdef G2_AOS_SCALAR - for (int r = 0; r < REPEAT_1; r++) - { - #pragma unroll (UNROLL_COUNT) - #pragma novector - for (int i = 0; i < ARRAY_SIZE_1; i++) - { - AoS1_X[i] = AoS1_Y[i].a + AoS1_Y[i].b + AoS1_Y[i].b; - } - } - cout << "Unvectorized AOS loop complete.\n"; - #endif - /******************** SOA - Unvectorized ********************/ - #ifdef G2_SOA_SCALAR - for (int r = 0; r < REPEAT_1; r++) - { - #pragma unroll (UNROLL_COUNT) - #pragma novector - for (int i = 0; i < ARRAY_SIZE_1; i++) - { - SoA1_X[i] = SoA1_Y.a[i] + SoA1_Y.b[i] + SoA1_Y.b[i]; - } - } - cout << "Unvectorized SOA loop complete.\n"; - #endif - /********************* SOA - Vectorized *********************/ - #ifdef G2_SOA_VECTOR - for (int r = 0; r < REPEAT_1; r++) - { - #pragma unroll (UNROLL_COUNT) - #pragma omp simd simdlen(VECTOR_LENGTH) - for (int i = 0; i < ARRAY_SIZE_1; i++) - { - SoA1_X[i] = SoA1_Y.a[i] + SoA1_Y.b[i] + SoA1_Y.b[i]; - } - } - cout << "Vectorized SOA loop complete.\n"; - #endif - - //############################## Group 3 ##############################// - // Group 3 is a high arithmetic intensity algorithm that is intended - // to demonstrate compute binding and compiler-induced AI changes. 
- #ifdef GROUP_3 - cout << "####################### Group 3 #######################\n" - << " Algorithm: X = Ya + Ya + Yb + Yb + Yb\n Data Set 1: " << ARRAY_SIZE_1 - << " doubles/array.\n"; - #endif - /******************** AOS - Unvectorized ********************/ - #ifdef G3_AOS_SCALAR - start = std::chrono::system_clock::now(); - for (int r = 0; r < REPEAT_1; r++) - { - #pragma unroll (UNROLL_COUNT) - #pragma novector - for (int i = 0; i < ARRAY_SIZE_1; i++) - { - AoS1_X[i] = AoS1_Y[i].a + AoS1_Y[i].a + AoS1_Y[i].b + AoS1_Y[i].b + AoS1_Y[i].b; - } - } - cout << "Unvectorized AOS loop complete.\n"; - stop = std::chrono::system_clock::now(); - std::cout << "elapsed time: " - << std::chrono::duration_cast (stop-start).count() - << "ms\n"; - - #endif - /******************** SOA - Unvectorized ********************/ - #ifdef G3_SOA_SCALAR - start = std::chrono::system_clock::now(); - for (int r = 0; r < REPEAT_1; r++) - { - #pragma unroll (UNROLL_COUNT) - #pragma novector - for (int i = 0; i < ARRAY_SIZE_1; i++) - { - SoA1_X[i] = SoA1_Y.a[i] + SoA1_Y.a[i] + SoA1_Y.b[i] + SoA1_Y.b[i] + SoA1_Y.b[i]; - } - } - cout << "Unvectorized SOA loop complete.\n"; - stop = std::chrono::system_clock::now(); - std::cout << "elapsed time: " - << std::chrono::duration_cast (stop-start).count() - << "ms\n"; - #endif - /********************* AOS - Vectorized *********************/ - #ifdef G3_AOS_VECTOR - start = std::chrono::system_clock::now(); - for (int r = 0; r < REPEAT_1; r++) - { - #pragma unroll (UNROLL_COUNT) - #pragma omp simd simdlen(VECTOR_LENGTH) - for (int i = 0; i < ARRAY_SIZE_1; i++) - { - AoS1_X[i] = AoS1_Y[i].a + AoS1_Y[i].a + AoS1_Y[i].b + AoS1_Y[i].b + AoS1_Y[i].b; - } - } - cout << "Vectorized AOS loop complete.\n"; - stop = std::chrono::system_clock::now(); - std::cout << "elapsed time: " - << std::chrono::duration_cast (stop-start).count() - << "ms\n"; - #endif - /********************* SOA - Vectorized *********************/ - #ifdef G3_SOA_VECTOR - start = 
std::chrono::system_clock::now(); - for (int r = 0; r < REPEAT_1; r++) - { - #pragma unroll (UNROLL_COUNT) - #pragma omp simd simdlen(VECTOR_LENGTH) - for (int i = 0; i < ARRAY_SIZE_1; i++) - { - SoA1_X[i] = SoA1_Y.a[i] + SoA1_Y.a[i] + SoA1_Y.b[i] + SoA1_Y.b[i] + SoA1_Y.b[i]; - } - } - cout << "Vectorized SOA loop complete.\n"; - stop = std::chrono::system_clock::now(); - std::cout << "elapsed time: " - << std::chrono::duration_cast (stop-start).count() - << "ms\n"; - #endif - /**************** SOA - Vectorized with FMAs ****************/ - #ifdef G3_SOA_VECTOR_FMAS - start = std::chrono::system_clock::now(); - for (int r = 0; r < REPEAT_1; r++) - { - #pragma unroll (UNROLL_COUNT) - #pragma omp simd simdlen(VECTOR_LENGTH) - for (int i = 0; i < ARRAY_SIZE_1; i++) - { - SoA1_X[i] = (2.0 * SoA1_Y.b[i] + SoA1_Y.b[i]) + SoA1_Y.a[i] * 2.0; - } - } - cout << "Vectorized SOA with FMAs loop complete.\n"; - stop = std::chrono::system_clock::now(); - std::cout << "elapsed time: " - << std::chrono::duration_cast (stop-start).count() - << "ms\n"; - #endif - - //############################## Group 4 ##############################// - // Group 4 uses a different data set than the other Groups, and has - // a medium AI. It is intended to demonstrate cache bandwidth binding. 
- #ifdef GROUP_4 - cout << "####################### Group 4 #######################\n" - << " Algorithm: X = Ya + Ya + Yb + Yb\n Data Set 2: " << ARRAY_SIZE_2 - << " doubles/array.\n"; - #endif - /**************************** SOA ***************************/ - #ifdef G4_SOA - for (int r = 0; r < REPEAT_2; r++) - { - #pragma nounroll - #pragma omp simd simdlen(VECTOR_LENGTH) - for (int i = 0; i < ARRAY_SIZE_2; i++) - { - SoA2_X[i] = SoA2_Y.a[i] + SoA2_Y.a[i] + SoA2_Y.b[i] + SoA2_Y.b[i]; - } - } - cout << "SOA loop complete.\n"; - #endif - /*************************** AOSOA **************************/ - #ifdef G4_AOSOA - for (int r = 0; r < REPEAT_2; r++) - { - for (int j = 0; j < 2; j++) - { - #pragma nounroll - #pragma omp simd simdlen(VECTOR_LENGTH) - for (int i = 0; i < ARRAY_SIZE_2 / 2; i++) - { - AoSoA_X[(j * (ARRAY_SIZE_2 / 2)) + i] = AoSoA_Y[j].a[i] + AoSoA_Y[j].a[i] - + AoSoA_Y[j].b[i] + AoSoA_Y[j].b[i]; - } - } - } - cout << "AOSOA loop complete.\n"; - #endif - - MPI::Finalize(); - return EXIT_SUCCESS; -} - -void setupArrays() -{ - for (int i = 0; i < ARRAY_SIZE_1; i++) - { - SoA1_Y.a[i] = ((rand() % MAXVALUE) + 1) / 3; - SoA1_Y.b[i] = ((rand() % MAXVALUE) + 1) / 3; - AoS1_Y[i].a = SoA1_Y.a[i]; - AoS1_Y[i].b = SoA1_Y.b[i]; - } - for (int i = 0; i < ARRAY_SIZE_2; i++) - { - SoA2_Y.a[i] = ((rand() % MAXVALUE) + 1) / 3; - SoA2_Y.b[i] = ((rand() % MAXVALUE) + 1) / 3; - } - for (int i = 0; i < ARRAY_SIZE_2 / 2; i++) - { - AoSoA_Y[0].a[i] = SoA2_Y.a[i]; - AoSoA_Y[1].a[i] = SoA2_Y.a[i + (ARRAY_SIZE_2 / 2)]; - AoSoA_Y[0].b[i] = SoA2_Y.b[i]; - AoSoA_Y[1].b[i] = SoA2_Y.b[i + (ARRAY_SIZE_2 / 2)]; - } -} From ceeeaf7506713caa70dec7641effa6bbca76a71d Mon Sep 17 00:00:00 2001 From: jgp Date: Thu, 28 Mar 2019 12:12:45 +0100 Subject: [PATCH 4/6] common --- .../tools/profiling_and_debugging/intel_advisor_roofline.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/cscs-checks/tools/profiling_and_debugging/intel_advisor_roofline.py 
b/cscs-checks/tools/profiling_and_debugging/intel_advisor_roofline.py index bd4714f647..72f1e6cc5d 100644 --- a/cscs-checks/tools/profiling_and_debugging/intel_advisor_roofline.py +++ b/cscs-checks/tools/profiling_and_debugging/intel_advisor_roofline.py @@ -1,3 +1,5 @@ +import os + import reframe as rfm import reframe.utility.sanity as sn @@ -37,7 +39,7 @@ def __init__(self, repeat, datalayout): 'PrgEnv-intel': ['-O2', '-g', '-std=c++11'], } self.sourcesdir = os.path.join(self.current_system.resourcesdir, - 'Espresso', 'intel_advisor') + 'roofline', 'intel_advisor') self.build_system = 'Make' self.prebuild_cmd = [ 'sed -e "s-XXXX-%s-" -e "s-YYYY-%s-" %s &> %s' % @@ -87,7 +89,7 @@ def __init__(self, repeat, datalayout): (self.roofdir, self.roofline_ref), 'python2 API/cscs.py %s &> %s' % (self.roofdir, self.roofline_rpt), # 'advixe-cl --format=csv' seems to be not working (empty report), - # keeping as reference for later check: + # keeping as reference for a future check: # 'advixe-cl --show-all-columns -csv-delimiter=";"' # ' --report=tripcounts --format=csv --project-dir=%s &> %s' # This can be used instead (see advisor/config/report/roofs.tmpl): From 17cb45b20eacbf8c411f776ddf4a2f9fe293488e Mon Sep 17 00:00:00 2001 From: jgp Date: Fri, 29 Mar 2019 08:27:27 +0100 Subject: [PATCH 5/6] pullrequestreview-220231653 --- .../intel_advisor_roofline.py | 110 +++++++++--------- 1 file changed, 55 insertions(+), 55 deletions(-) diff --git a/cscs-checks/tools/profiling_and_debugging/intel_advisor_roofline.py b/cscs-checks/tools/profiling_and_debugging/intel_advisor_roofline.py index 72f1e6cc5d..5890e448b5 100644 --- a/cscs-checks/tools/profiling_and_debugging/intel_advisor_roofline.py +++ b/cscs-checks/tools/profiling_and_debugging/intel_advisor_roofline.py @@ -5,8 +5,9 @@ @rfm.required_version('>=2.14') -@rfm.parameterized_test(*[[repeat, datalayout] +@rfm.parameterized_test(*[[repeat, toolsversion, datalayout] for repeat in ['50000'] + for toolsversion in ['551025'] for 
datalayout in ['G3_AOS_SCALAR', 'G3_SOA_SCALAR', 'G3_AOS_VECTOR', 'G3_SOA_VECTOR', 'G3_SOA_VECTOR_FMAS']]) @@ -26,10 +27,9 @@ class IntelRooflineTest(rfm.RegressionTest): analysis ('advixe-cl -collect tripcounts -flop') using the same project directory for both steps. ''' - def __init__(self, repeat, datalayout): + def __init__(self, repeat, toolsversion, datalayout): super().__init__() - self.name = 'Intel_Roofline_%s_%s' % (repeat, datalayout) - self.descr = 'repeat=%s' % repeat + self.descr = 'Roofline Analysis test with Intel Advisor' self.valid_systems = ['daint:mc', 'dom:mc'] # Reporting MFLOPS is not available on Intel Haswell cpus, see # https://www.intel.fr/content/dam/www/public/us/en/documents/manuals/ @@ -53,65 +53,30 @@ def __init__(self, repeat, datalayout): 'CRAYPE_LINK_TYPE': 'dynamic', } self.pre_run = [ - # Testing with advisor/2018: + # Testing with advisor/2018 Update 2 (build 551025): # advisor/2019 is broken on dom ("Exceeded job memory limit"), # and advisor/2019 is not installed on daint, 'source $INTEL_PATH/../advisor_2018/advixe-vars.sh', - 'advixe-cl -help collect |head -20', + 'advixe-cl -help collect | head -20', ] self.executable = 'advixe-cl' - self.exe = './roof.exe' + self.target_executable = './roof.exe' self.roofdir = './roof.dir' self.executable_opts = [ '--collect survey --project-dir=%s --search-dir src:rp=. ' '--data-limit=0 --no-auto-finalize --trace-mpi -- %s ' % - (self.roofdir, self.exe) + (self.roofdir, self.target_executable) ] self.version_rpt = 'Intel_Advisor_version.rpt' - self.post_run = [ - # collecting the performance data for the roofline model is a 2 - # steps process: - 'srun %s --collect tripcounts --flop --project-dir=%s ' - '--search-dir src:rp=. 
--data-limit=0 --no-auto-finalize ' - '--trace-mpi -- %s' % (self.executable, self.roofdir, self.exe), - # check tool's version: - 'advixe-cl -V &> %s' % self.version_rpt, - # "advixe-cl --report" looks for e000/ in the output directory; - # if not found, it will fail with: - # IOError: Survey result cannot be loaded - 'cd %s;ln -s nid* e000;cd -' % self.roofdir, - ] self.roofline_ref = 'Intel_Advisor_roofline_reference.rpt' self.roofline_rpt = 'Intel_Advisor_roofline_results.rpt' - self.post_run += [ - # report reference values/boundaries (roofline_ref): - 'advixe-cl --report=roofs --project-dir=%s &> %s' % - (self.roofdir, self.roofline_ref), - 'python2 API/cscs.py %s &> %s' % (self.roofdir, self.roofline_rpt), - # 'advixe-cl --format=csv' seems to be not working (empty report), - # keeping as reference for a future check: - # 'advixe-cl --show-all-columns -csv-delimiter=";"' - # ' --report=tripcounts --format=csv --project-dir=%s &> %s' - # This can be used instead (see advisor/config/report/roofs.tmpl): - # 'advixe-cl --report custom --report-template ./TEMPL/cscs.tmpl' - # ' --project-dir=%s &> %s' - ] - self.maintainers = ['JG'] - self.tags = {'production'} - - def setup(self, partition, environ, **job_opts): - super().setup(partition, environ, **job_opts) - environ_name = self.current_environ.name - prgenv_flags = self.prgenv_flags[environ_name] - self.build_system.cxxflags = prgenv_flags - toolsversion = '551025' # 2018 Update 2 (build 551025) # Reference roofline boundaries for Intel Broadwell CPU (E5-2695 v4): - _L1bw = 293*1024**3 - _L2bw = 79*1024**3 - _L3bw = 33*1024**3 - _DPfmabw = 49*1024**3 - _DPaddbw = 12*1024**3 - _ScalarAddbw = 3*1024**3 + L1bw = 293*1024**3 + L2bw = 79*1024**3 + L3bw = 33*1024**3 + DPfmabw = 49*1024**3 + DPaddbw = 12*1024**3 + ScalarAddbw = 3*1024**3 self.sanity_patterns = sn.all([ # check the job status: sn.assert_found('loop complete.', self.stdout), @@ -124,32 +89,32 @@ def setup(self, partition, environ, **job_opts): 
sn.assert_reference(sn.extractsingle( r'^L1\sBandwidth\s\(single-threaded\)\s+(?P\d+)\s+' r'memory$', self.roofline_ref, 'L1bw', int), - _L1bw, -0.08, 0.08), + L1bw, -0.08, 0.08), # check --report=roofs (L2 bandwidth): sn.assert_reference(sn.extractsingle( r'^L2\sBandwidth\s\(single-threaded\)\s+(?P\d+)\s+' r'memory$', self.roofline_ref, 'L2bw', int), - _L2bw, -0.08, 0.08), + L2bw, -0.08, 0.08), # check --report=roofs (L3 bandwidth): sn.assert_reference(sn.extractsingle( r'^L3\sBandwidth\s\(single-threaded\)\s+(?P\d+)\s+' r'memory$', self.roofline_ref, 'L3bw', int), - _L3bw, -0.08, 0.08), + L3bw, -0.08, 0.08), # check --report=roofs (DP FMA): sn.assert_reference(sn.extractsingle( r'^DP Vector FMA Peak\s\(single-threaded\)\s+' r'(?P\d+)\s+compute$', self.roofline_ref, - 'DPfmabw', int), _DPfmabw, -0.08, 0.08), + 'DPfmabw', int), DPfmabw, -0.08, 0.08), # check --report=roofs (DP Add): sn.assert_reference(sn.extractsingle( r'^DP Vector Add Peak\s\(single-threaded\)\s+' r'(?P\d+)\s+compute$', self.roofline_ref, - 'DPaddbw', int), _DPaddbw, -0.08, 0.08), + 'DPaddbw', int), DPaddbw, -0.08, 0.08), # check --report=roofs (Scalar Add): sn.assert_reference(sn.extractsingle( r'^Scalar Add Peak\s\(single-threaded\)\s+' r'(?P\d+)\s+compute$', self.roofline_ref, - 'ScalarAddbw', int), _ScalarAddbw, -0.08, 0.08), + 'ScalarAddbw', int), ScalarAddbw, -0.08, 0.08), # --- check Arithmetic_intensity: sn.assert_reference(sn.extractsingle( r'^returned\sAI\sgap\s=\s(?P.*)', self.roofline_rpt, @@ -159,3 +124,38 @@ def setup(self, partition, environ, **job_opts): r'^returned\sGFLOPS\sgap\s=\s(?P.*)', self.roofline_rpt, 'Flops', float), 0.0, -0.01, 0.01), ]) + self.maintainers = ['JG'] + self.tags = {'production'} + + def setup(self, partition, environ, **job_opts): + super().setup(partition, environ, **job_opts) + environ_name = self.current_environ.name + prgenv_flags = self.prgenv_flags[environ_name] + self.build_system.cxxflags = prgenv_flags + launcher_cmd = 
''.join(self.job.launcher.command(self.job)) + self.post_run = [ + # collecting the performance data for the roofline model is a 2 + # steps process: + '%s %s --collect tripcounts --flop --project-dir=%s ' + '--search-dir src:rp=. --data-limit=0 --no-auto-finalize ' + '--trace-mpi -- %s' % + (launcher_cmd, self.executable, self.roofdir, + self.target_executable), + # check tool's version: + 'advixe-cl -V &> %s' % self.version_rpt, + # "advixe-cl --report" looks for e000/ in the output directory; + # if not found, it will fail with: + # IOError: Survey result cannot be loaded + 'cd %s;ln -s nid* e000;cd -' % self.roofdir, + # report reference values/boundaries (roofline_ref): + 'advixe-cl --report=roofs --project-dir=%s &> %s' % + (self.roofdir, self.roofline_ref), + 'python2 API/cscs.py %s &> %s' % (self.roofdir, self.roofline_rpt), + # 'advixe-cl --format=csv' seems to be not working (empty report), + # keeping as reference for a future check: + # 'advixe-cl --show-all-columns -csv-delimiter=";"' + # ' --report=tripcounts --format=csv --project-dir=%s &> %s' + # This can be used instead (see advisor/config/report/roofs.tmpl): + # 'advixe-cl --report custom --report-template ./TEMPL/cscs.tmpl' + # ' --project-dir=%s &> %s' + ] From f5935c8735be783070917ff73a0acc441af179b3 Mon Sep 17 00:00:00 2001 From: jgp Date: Fri, 29 Mar 2019 10:07:33 +0100 Subject: [PATCH 6/6] discussion_r270323548 --- .../tools/profiling_and_debugging/intel_advisor_roofline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cscs-checks/tools/profiling_and_debugging/intel_advisor_roofline.py b/cscs-checks/tools/profiling_and_debugging/intel_advisor_roofline.py index 5890e448b5..bbaafcae0e 100644 --- a/cscs-checks/tools/profiling_and_debugging/intel_advisor_roofline.py +++ b/cscs-checks/tools/profiling_and_debugging/intel_advisor_roofline.py @@ -132,7 +132,7 @@ def setup(self, partition, environ, **job_opts): environ_name = self.current_environ.name prgenv_flags = 
self.prgenv_flags[environ_name] self.build_system.cxxflags = prgenv_flags - launcher_cmd = ''.join(self.job.launcher.command(self.job)) + launcher_cmd = ' '.join(self.job.launcher.command(self.job)) self.post_run = [ # collecting the performance data for the roofline model is a 2 # steps process: