Skip to content

Commit

Permalink
Fix profiling and add profiling for cuda
Browse files Browse the repository at this point in the history
  • Loading branch information
adityapb authored and prabhuramachandran committed Sep 5, 2020
1 parent 3919f34 commit ac454e8
Show file tree
Hide file tree
Showing 5 changed files with 19 additions and 43 deletions.
4 changes: 2 additions & 2 deletions pysph/base/gpu_nnps_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ def get_simple_kernel(kernel_name, args, src, wgs, preamble=""):
kernel_name, preamble=preamble
)

return profile_kernel(knl, kernel_name)
return profile_kernel(knl, kernel_name, backend='opencl')


def get_elwise_kernel(kernel_name, args, src, preamble=""):
Expand All @@ -23,7 +23,7 @@ def get_elwise_kernel(kernel_name, args, src, preamble=""):
ctx, args, src,
kernel_name, preamble=preamble
)
return profile_kernel(knl, kernel_name)
return profile_kernel(knl, kernel_name, backend='opencl')


class GPUNNPSHelper(object):
Expand Down
11 changes: 7 additions & 4 deletions pysph/base/tree/point_tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ class IncompatibleTreesException(Exception):
pass


@named_profile('neighbor_count_prefix_sum')
@named_profile('neighbor_count_prefix_sum', backend='opencl')
@memoize
def _get_neighbor_count_prefix_sum_kernel(ctx):
return GenericScanKernel(ctx, np.int32,
Expand Down Expand Up @@ -386,7 +386,8 @@ def set_node_bounds(self):
params['node_operation'], params['output_expr'],
preamble=_get_macros_preamble(self.c_type, self.sorted, self.dim)
)
set_node_bounds = profile_kernel(set_node_bounds, 'set_node_bounds')
set_node_bounds = profile_kernel(set_node_bounds, 'set_node_bounds',
backend='opencl')

pa_gpu = self.pa.gpu
dtype = ctype_to_dtype(self.c_type)
Expand Down Expand Up @@ -441,7 +442,9 @@ def find_neighbor_cids(self, tree_src):
output_expr="cnt[i] = count;"
)
find_neighbor_cid_counts = profile_kernel(
find_neighbor_cid_counts, 'find_neighbor_cid_count')
find_neighbor_cid_counts, 'find_neighbor_cid_count',
backend='opencl'
)
find_neighbor_cid_counts(tree_src.pbounds.dev,
neighbor_cid_count.dev)

Expand All @@ -463,7 +466,7 @@ def find_neighbor_cids(self, tree_src):
output_expr=""
)
find_neighbor_cids = profile_kernel(
find_neighbor_cids, 'find_neighbor_cids')
find_neighbor_cids, 'find_neighbor_cids', backend='opencl')
find_neighbor_cids(tree_src.pbounds.dev,
neighbor_cid_count.dev, neighbor_cids.dev)
return neighbor_cid_count, neighbor_cids
Expand Down
10 changes: 5 additions & 5 deletions pysph/base/tree/tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,7 @@ def get_M_array_initialization(k):
# The array of vectors M is just a math trick. M[j] gives a vector with j
# zeros and k - j ones (with k = 4, M[1] = {0, 1, 1, 1}). Adding this up in
# the prefix sum directly gives us the required result.
@named_profile('particle_reordering')
@named_profile('particle_reordering', backend='opencl')
@memoize
def _get_particle_kernel(ctx, k, args, index_code):
return GenericScanKernel(
Expand All @@ -196,7 +196,7 @@ def _get_particle_kernel(ctx, k, args, index_code):
# offset of first child in next layer + k * (number of non-leaf nodes before
# given node).
# If the node is a leaf, we set this value to be -1.
@named_profile('set_offset')
@named_profile('set_offset', backend='opencl')
@memoize
def _get_set_offset_kernel(ctx, k, leaf_size):
return GenericScanKernel(
Expand Down Expand Up @@ -224,7 +224,7 @@ def _get_set_offset_kernel(ctx, k, leaf_size):
#
# Note that unique_cids also gives us the list of leaves / last layer nodes
# which are not empty.
@named_profile('unique_cids')
@named_profile('unique_cids', backend='opencl')
@memoize
def _get_unique_cids_kernel(ctx):
return GenericScanKernel(
Expand All @@ -245,7 +245,7 @@ def _get_unique_cids_kernel(ctx):

# A lot of leaves are going to be empty. Not really sure if this guy is of
# any use.
@named_profile('leaves')
@named_profile('leaves', backend='opencl')
@memoize
def _get_leaves_kernel(ctx, leaf_size):
return GenericScanKernel(
Expand All @@ -265,7 +265,7 @@ def _get_leaves_kernel(ctx, leaf_size):
)


@named_profile("group_cids")
@named_profile("group_cids", backend='opencl')
@memoize
def _get_cid_groups_kernel(ctx):
return GenericScanKernel(
Expand Down
29 changes: 2 additions & 27 deletions pysph/solver/application.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# Standard imports.
from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
from argparse import ArgumentDefaultsHelpFormatter
from compyle.utils import ArgumentParser
import glob
import inspect
import json
Expand Down Expand Up @@ -430,13 +431,6 @@ def _setup_argparse(self):
help="Dump output in the specified directory.")

# --openmp
parser.add_argument(
"--openmp",
action="store_true",
dest="with_openmp",
default=None,
help="Use OpenMP to run the "
"simulation using multiple cores.")
parser.add_argument(
"--no-openmp",
action="store_false",
Expand Down Expand Up @@ -479,22 +473,6 @@ def _setup_argparse(self):
default=False,
help="Use local memory with OpenCL (Experimental)"
)
# --profile
parser.add_argument(
"--profile",
action="store_true",
dest="profile",
default=False,
help="Enable profiling with OpenCL.")

# --use-double
parser.add_argument(
"--use-double",
action="store_true",
dest="use_double",
default=False,
help="Use double precision for OpenCL/CUDA code.")

# --kernel
all_kernels = list_all_kernels()
parser.add_argument(
Expand Down Expand Up @@ -1586,9 +1564,6 @@ def run(self, argv=None):
end_time = time.time()
run_duration = end_time - start_time
self._message("Run took: %.5f secs" % (run_duration))
if self.options.with_opencl and self.options.profile:
from compyle.opencl import print_profile
print_profile()
self._write_info(
self.info_filename, completed=True, cpu_time=run_duration)

Expand Down
8 changes: 3 additions & 5 deletions pysph/sph/acceleration_eval_gpu_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,11 +170,9 @@ def get_queue(backend):


def profile_kernel(knl, backend):
if backend == 'cuda':
return knl
elif backend == 'opencl':
from compyle.opencl import profile_kernel
return profile_kernel(knl, knl.function_name)
if backend == 'cuda' or backend == 'opencl':
from compyle.profile import profile_kernel
return profile_kernel(knl, knl.function_name, backend=backend)
else:
raise RuntimeError('Unsupported GPU backend %s' % backend)

Expand Down

0 comments on commit ac454e8

Please sign in to comment.