Fix profiling and add profiling for CUDA

pypr · Sep 5, 2020 · ac454e8 · ac454e8
1 parent 3919f34
commit ac454e8
Show file tree

Hide file tree

Showing 5 changed files with 19 additions and 43 deletions.
diff --git a/pysph/base/gpu_nnps_helper.py b/pysph/base/gpu_nnps_helper.py
@@ -13,7 +13,7 @@ def get_simple_kernel(kernel_name, args, src, wgs, preamble=""):
         kernel_name, preamble=preamble
     )
 
-    return profile_kernel(knl, kernel_name)
+    return profile_kernel(knl, kernel_name, backend='opencl')
 
 
 def get_elwise_kernel(kernel_name, args, src, preamble=""):
@@ -23,7 +23,7 @@ def get_elwise_kernel(kernel_name, args, src, preamble=""):
         ctx, args, src,
         kernel_name, preamble=preamble
     )
-    return profile_kernel(knl, kernel_name)
+    return profile_kernel(knl, kernel_name, backend='opencl')
 
 
 class GPUNNPSHelper(object):

diff --git a/pysph/base/tree/point_tree.py b/pysph/base/tree/point_tree.py
@@ -23,7 +23,7 @@ class IncompatibleTreesException(Exception):
     pass
 
 
-@named_profile('neighbor_count_prefix_sum')
+@named_profile('neighbor_count_prefix_sum', backend='opencl')
 @memoize
 def _get_neighbor_count_prefix_sum_kernel(ctx):
     return GenericScanKernel(ctx, np.int32,
@@ -386,7 +386,8 @@ def set_node_bounds(self):
             params['node_operation'], params['output_expr'],
             preamble=_get_macros_preamble(self.c_type, self.sorted, self.dim)
         )
-        set_node_bounds = profile_kernel(set_node_bounds, 'set_node_bounds')
+        set_node_bounds = profile_kernel(set_node_bounds, 'set_node_bounds',
+                                         backend='opencl')
 
         pa_gpu = self.pa.gpu
         dtype = ctype_to_dtype(self.c_type)
@@ -441,7 +442,9 @@ def find_neighbor_cids(self, tree_src):
             output_expr="cnt[i] = count;"
         )
         find_neighbor_cid_counts = profile_kernel(
-            find_neighbor_cid_counts, 'find_neighbor_cid_count')
+            find_neighbor_cid_counts, 'find_neighbor_cid_count',
+            backend='opencl'
+        )
         find_neighbor_cid_counts(tree_src.pbounds.dev,
                                  neighbor_cid_count.dev)
 
@@ -463,7 +466,7 @@ def find_neighbor_cids(self, tree_src):
             output_expr=""
         )
         find_neighbor_cids = profile_kernel(
-            find_neighbor_cids, 'find_neighbor_cids')
+            find_neighbor_cids, 'find_neighbor_cids', backend='opencl')
         find_neighbor_cids(tree_src.pbounds.dev,
                            neighbor_cid_count.dev, neighbor_cids.dev)
         return neighbor_cid_count, neighbor_cids

diff --git a/pysph/base/tree/tree.py b/pysph/base/tree/tree.py
@@ -176,7 +176,7 @@ def get_M_array_initialization(k):
 # The array of vectors M are just a math trick. M[j] gives a vector with j
 # zeros and k - j ones (With k = 4, M[1] = {0, 1, 1, 1}). Adding this up in
 # the prefix sum directly gives us the required result.
-@named_profile('particle_reordering')
+@named_profile('particle_reordering', backend='opencl')
 @memoize
 def _get_particle_kernel(ctx, k, args, index_code):
     return GenericScanKernel(
@@ -196,7 +196,7 @@ def _get_particle_kernel(ctx, k, args, index_code):
 # offset of first child in next layer + k * (number of non-leaf nodes before
 # given node).
 # If the node is a leaf, we set this value to be -1.
-@named_profile('set_offset')
+@named_profile('set_offset', backend='opencl')
 @memoize
 def _get_set_offset_kernel(ctx, k, leaf_size):
     return GenericScanKernel(
@@ -224,7 +224,7 @@ def _get_set_offset_kernel(ctx, k, leaf_size):
 #
 # Note that unique_cids also gives us the list of leaves / last layer nodes
 # which are not empty.
-@named_profile('unique_cids')
+@named_profile('unique_cids', backend='opencl')
 @memoize
 def _get_unique_cids_kernel(ctx):
     return GenericScanKernel(
@@ -245,7 +245,7 @@ def _get_unique_cids_kernel(ctx):
 
 # A lot of leaves are going to be empty. Not really sure if this guy is of
 # any use.
-@named_profile('leaves')
+@named_profile('leaves', backend='opencl')
 @memoize
 def _get_leaves_kernel(ctx, leaf_size):
     return GenericScanKernel(
@@ -265,7 +265,7 @@ def _get_leaves_kernel(ctx, leaf_size):
     )
 
 
-@named_profile("group_cids")
+@named_profile("group_cids", backend='opencl')
 @memoize
 def _get_cid_groups_kernel(ctx):
     return GenericScanKernel(

diff --git a/pysph/solver/application.py b/pysph/solver/application.py
@@ -1,5 +1,6 @@
 # Standard imports.
-from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
+from argparse import ArgumentDefaultsHelpFormatter
+from compyle.utils import ArgumentParser
 import glob
 import inspect
 import json
@@ -430,13 +431,6 @@ def _setup_argparse(self):
             help="Dump output in the specified directory.")
 
         # --openmp
-        parser.add_argument(
-            "--openmp",
-            action="store_true",
-            dest="with_openmp",
-            default=None,
-            help="Use OpenMP to run the "
-            "simulation using multiple cores.")
         parser.add_argument(
             "--no-openmp",
             action="store_false",
@@ -479,22 +473,6 @@ def _setup_argparse(self):
             default=False,
             help="Use local memory with OpenCL (Experimental)"
         )
-        # --profile
-        parser.add_argument(
-            "--profile",
-            action="store_true",
-            dest="profile",
-            default=False,
-            help="Enable profiling with OpenCL.")
-
-        # --use-double
-        parser.add_argument(
-            "--use-double",
-            action="store_true",
-            dest="use_double",
-            default=False,
-            help="Use double precision for OpenCL/CUDA code.")
-
         # --kernel
         all_kernels = list_all_kernels()
         parser.add_argument(
@@ -1586,9 +1564,6 @@ def run(self, argv=None):
         end_time = time.time()
         run_duration = end_time - start_time
         self._message("Run took: %.5f secs" % (run_duration))
-        if self.options.with_opencl and self.options.profile:
-            from compyle.opencl import print_profile
-            print_profile()
         self._write_info(
             self.info_filename, completed=True, cpu_time=run_duration)
 

diff --git a/pysph/sph/acceleration_eval_gpu_helper.py b/pysph/sph/acceleration_eval_gpu_helper.py
@@ -170,11 +170,9 @@ def get_queue(backend):
 
 
 def profile_kernel(knl, backend):
-    if backend == 'cuda':
-        return knl
-    elif backend == 'opencl':
-        from compyle.opencl import profile_kernel
-        return profile_kernel(knl, knl.function_name)
+    if backend == 'cuda' or backend == 'opencl':
+        from compyle.profile import profile_kernel
+        return profile_kernel(knl, knl.function_name, backend=backend)
     else:
         raise RuntimeError('Unsupported GPU backend %s' % backend)