In [37]:
import json
import itertools
from pathlib import Path
from pprint import pprint

In [38]:
bench = Path("../results/simple_matrixmul/simple_matrixmul-64-128-128-32/")
bench = Path("../results/vectorAdd/vectorAdd-100-32/")
trace_dir = bench / "trace"
accel_trace_dir = bench / "accelsim-trace"

In [40]:
allocations_json_path = trace_dir / "allocations.json"
with open(allocations_json_path, "rb") as f:
    allocations = json.load(f)
commands_json_path = trace_dir / "commands.json"
with open(commands_json_path, "rb") as f:
    commands = json.load(f)

pprint([e for e in commands if "KernelLaunch" in e])
pprint(allocations)

[{'KernelLaunch': {'binary_version': 61,
                   'block': {'x': 1024, 'y': 1, 'z': 1},
                   'grid': {'x': 1, 'y': 1, 'z': 1},
                   'id': 0,
                   'local_mem_base_addr': 140508345139200,
                   'name': 'void vecAdd<float>(float*, float*, float*, int)',
                   'num_registers': 8,
                   'nvbit_version': '1.5.5',
                   'shared_mem_base_addr': 140508378693632,
                   'shared_mem_bytes': 0,
                   'stream_id': 0,
                   'trace_file': 'kernel-0.msgpack'}}]
[{'device_ptr': 140507664613376, 'num_bytes': 400},
 {'device_ptr': 140507664613888, 'num_bytes': 400},
 {'device_ptr': 140507664614400, 'num_bytes': 400},
 {'device_ptr': 140507664614912, 'num_bytes': 32},
 {'device_ptr': 140507664615424, 'num_bytes': 32},
 {'device_ptr': 140507664615936, 'num_bytes': 32},
 {'device_ptr': 140507664616448, 'num_bytes': 32},
 {'device_ptr': 140507664616960, 'num_bytes': 32

In [24]:
trace_json_path = trace_dir / "trace.json"
with open(trace_json_path, "rb") as f:
    trace = json.load(f)

TypeError: unhashable type: 'dict'

In [25]:
accel_commands_path = accel_trace_dir / "kernelslist.g"
accel_allocations = []
with open(accel_commands_path, "rb") as f:
    for i, line in enumerate(f.readlines()):
        line = line.decode('utf-8').strip()
        # print(line)
        if line.startswith("MemcpyHtoD"):
            start, size = [s.strip() for s in line.split(",")][-2:]
            accel_allocations.append((int(start, 16), int(size)))

pprint(accel_allocations)

[(140154537771008, 400), (140154537771520, 400), (140154537772032, 400)]


In [27]:
accel_trace_path = accel_trace_dir / "kernel-1.traceg"

class Trace:
    def __init__(
        self,
        block,
        warp,
        pc,
        mask,
        opcode,
        mem_width,
        mem_addresses,
        reg_srcs = None,
        reg_dests = None,
        line_num = None,
    ):
        self.block = block
        self.warp = warp
        
        self.pc = pc
        self.mask = mask
        self.opcode = opcode
        self.mem_width = mem_width
        self.mem_addresses = mem_addresses

        self.reg_dests = reg_dests or []
        self.reg_srcs = reg_srcs or []
        self.line_num = line_num

    @property
    def dest_num(self):
        return len(reg_dests)
        
    @property
    def src_num(self):
        return len(reg_srcs)

    def __repr__(self):
        return str(self.__dict__())
    
    def __dict__(self):
        return dict(
            block = self.block,
            warp = self.warp,
            pc = self.pc,
            mask = self.mask,
            opcode = self.opcode,
            mem_width = self.mem_width,
            mem_addresses = self.mem_addresses,
            reg_srcs = self.reg_srcs,
            reg_dests = self.reg_dests,
            line_num = self.line_num,
        )

def read_accel_trace():
    from collections import deque
    
    with open(accel_trace_path, "rb") as f:
        in_block = False
        warp = None
        block = None
        for i, line in enumerate(f.readlines()):
            line = line.decode('utf-8').strip()
    
            # traces format = [line_num] PC mask dest_num [reg_dests] opcode src_num [reg_srcs] mem_width [adrrescompress?] [mem_addresses]
            if line == "#BEGIN_TB":
                in_block = True
            elif line == "#END_TB":
                in_block = False
            elif line.strip() == "":
                pass
            elif "=" in line:
                key, value = [v.strip() for v in line.split("=")][:2]
                # print(key, value)
                if key == "thread block":
                    assert(in_block)
                    block = [int(x) for x in value.split(",")]
                if key == "warp":
                    assert(in_block)
                    warp = int(value)
            else:
                assert(in_block)
                values = deque([v.strip() for v in line.split(" ")])
                # print(values)
                pc = int(values.popleft(), 16)
                mask = int(values.popleft(), 16)
                dest_num = int(values.popleft())
                reg_dests = [values.popleft() for r in range(dest_num)]
                opcode = values.popleft()
                src_num = int(values.popleft())
                reg_srcs = [values.popleft() for r in range(src_num)]
                mem_width = int(values.popleft())
                mem_addresses = None
                # print(reg_dests)
                # print(reg_srcs)
                # print(mem_width)
                if mem_width > 0:
                    # have memory addresses
                    assert(len(values) > 0)
                    print(line)
                    print(values)
                    LIST_ALL = 0
                    BASE_STRIDE = 1
                    BASE_DELTA = 2
                    type = int(values.popleft())
                    if type == LIST_ALL:
                        mem_addresses = [0] * 32
                        for w in range(32):
                            active = mask & (1 << w) == 1
                            if active:
                                mem_addresses[w] = int(values.popleft(), 16)
                    elif type == BASE_STRIDE:
                        base = int(values.popleft(), 16)
                        stride = int(values.popleft())
                        pass
                    elif type == BASE_DELTA:
                        base = int(values.popleft(), 16)
                        pass
                
                if False:
                    values[0] = int(values[0], 16)
                    assert(len(values) > 4)
                    for vi in range(len(values)):
                        if isinstance(values[vi], str) and "0x" in values[vi]:
                            values[vi] = int(values[vi], 16)
                        elif isinstance(values[vi], str):
                            try:
                                values[vi] = int(values[vi])
                            except ValueError:
                                pass
                                
                trace = Trace(
                    block=block,
                    warp=warp,
                    pc=pc,
                    mask=mask,
                    opcode=opcode,
                    mem_width=mem_width,
                    mem_addresses=mem_addresses,
                    reg_srcs=reg_srcs,
                    reg_dests=reg_dests,
                )
                # print(trace)
                yield trace

accel_trace = list(read_accel_trace())
# accel_trace = list(itertools.islice(read_accel_trace(), 1000))
# pprint(accel_trace[:10])

0098 ffffffff 1 R4 LDG.E.CG 1 R4 4 1 0x7f7845700000 4
deque(['1', '0x7f7845700000', '4'])
00b0 ffffffff 1 R2 LDG.E.CG 1 R2 4 1 0x7f7845700200 4
deque(['1', '0x7f7845700200', '4'])
00d8 ffffffff 0 STG.E 2 R6 R0 4 1 0x7f7845700400 4
deque(['1', '0x7f7845700400', '4'])
0098 ffffffff 1 R4 LDG.E.CG 1 R4 4 1 0x7f7845700080 4
deque(['1', '0x7f7845700080', '4'])
00b0 ffffffff 1 R2 LDG.E.CG 1 R2 4 1 0x7f7845700280 4
deque(['1', '0x7f7845700280', '4'])
00d8 ffffffff 0 STG.E 2 R6 R0 4 1 0x7f7845700480 4
deque(['1', '0x7f7845700480', '4'])
0098 ffffffff 1 R4 LDG.E.CG 1 R4 4 1 0x7f7845700100 4
deque(['1', '0x7f7845700100', '4'])
00b0 ffffffff 1 R2 LDG.E.CG 1 R2 4 1 0x7f7845700300 4
deque(['1', '0x7f7845700300', '4'])
00d8 ffffffff 0 STG.E 2 R6 R0 4 1 0x7f7845700500 4
deque(['1', '0x7f7845700500', '4'])
0098 0000000f 1 R4 LDG.E.CG 1 R4 4 1 0x7f7845700180 4
deque(['1', '0x7f7845700180', '4'])
00b0 0000000f 1 R2 LDG.E.CG 1 R2 4 1 0x7f7845700380 4
deque(['1', '0x7f7845700380', '4'])
00d8 0000000f 0 STG

In [29]:
print(len(accel_trace))
for t in accel_trace:
    if t.mem_addresses is not None:
        print(t)

344


In [32]:
# [Allocation(id=1, name=, range=139699707445248..139699707478016, size=32768)
# Allocation(id=2, name=, range=139699707478016..139699707543552, size=65536)]
# 139699707478016 < 139699707478144 < 139699707478016 + 65536
# accel_trace

all_accesses = set()
# for t in [tt for tt in accel_trace if tt[2] == 728 and tt[1] == 1]:
for t in accel_trace:
     if is_mem(t) and get_accel_relative(t[-2]) is not None:
         all_accesses.add(get_accel_relative(t[-2]))

all_accesses = sorted(list(all_accesses))
#pprint(all_accesses)
#print(len(all_accesses))
         
#subset = [
    # [(to_block_id(t[0]), t[1], t[2], rel_addr) for rel_addr in [get_relative(addr) for addr in t["addrs"]])
#    None if not is_mem(t) else get_accel_relative(t[-2]) for t in accel_trace if t[2] == 728 and t[1] == 1 and is_mem(t)
#]
#pprint(subset)

NameError: name 'is_mem' is not defined

In [34]:
def is_mem(inst):
    return len(inst) > 12

def get_relative(addr):
    for alloc_id, allocation in enumerate(allocations):
        start_addr = allocation["device_ptr"]
        end_addr = start_addr + allocation["num_bytes"]
        if start_addr <= addr <= end_addr:
            return alloc_id, addr - start_addr
    return None

def get_accel_relative(addr):
    for alloc_id, (start, size) in enumerate(accel_allocations):
        start_addr = start
        end_addr = start + size
        if start_addr <= addr <= end_addr:
            return alloc_id, addr - start_addr
    return None

def to_block_id(block_id):
    return (block_id["x"], block_id["y"], block_id["z"])

In [35]:
sorted(list(set([to_block_id(t["thread_id"]) for t in trace])))

[(0, 0, 0),
 (32, 0, 0),
 (64, 0, 0),
 (96, 0, 0),
 (128, 0, 0),
 (160, 0, 0),
 (192, 0, 0),
 (224, 0, 0),
 (256, 0, 0),
 (288, 0, 0),
 (320, 0, 0),
 (352, 0, 0),
 (384, 0, 0),
 (416, 0, 0),
 (448, 0, 0),
 (480, 0, 0),
 (512, 0, 0),
 (544, 0, 0),
 (576, 0, 0),
 (608, 0, 0),
 (640, 0, 0),
 (672, 0, 0),
 (704, 0, 0),
 (736, 0, 0),
 (768, 0, 0),
 (800, 0, 0),
 (832, 0, 0),
 (864, 0, 0),
 (896, 0, 0),
 (928, 0, 0),
 (960, 0, 0),
 (992, 0, 0)]

In [41]:
print("instr offsets")
pprint(sorted(list(set([t["instr_offset"] for t in trace]))))
print("instr data width")
pprint(sorted(list(set([t["instr_data_width"] for t in trace]))))
# pprint(sorted(list(set([t[2] for t in accel_trace]))))

instr offsets
[88, 152, 176, 216, 240]
instr data width
[4]


In [156]:
# trace.sort(key=lambda x: (x["kernel_id"], to_block_id(x["block_id"]), x["warp_id_in_block"]))

In [None]:
all_rel_addr = set([
    [(to_block_id(t[0]), t[1], rel_addr) for rel_addr in [get_relative(addr) for addr in t["addrs"]])
    for t in accel_trace
])
all_block_ids = sorted(list(all_block_ids))
pprint(all_block_ids)

In [138]:
# -grid dim = (4,2,1)
# -block dim = (32,32,1)
all_block_ids = set([
    (to_block_id(t["block_id"]), t["warp_id_in_block"], t["warp_id_in_sm"]) # lane id is stupid, t["lane_id"]) # t["unique_thread_id"], to_block_id(t["thread_id"]))
    for t in trace
])
all_block_ids = sorted(list(all_block_ids))
pprint(all_block_ids)

[((0, 0, 0), 0, 0),
 ((0, 0, 0), 0, 1),
 ((0, 0, 0), 0, 2),
 ((0, 0, 0), 0, 3),
 ((0, 0, 0), 0, 4),
 ((0, 0, 0), 0, 5),
 ((0, 0, 0), 0, 6),
 ((0, 0, 0), 0, 7),
 ((0, 0, 0), 0, 8),
 ((0, 0, 0), 0, 9),
 ((0, 0, 0), 0, 10),
 ((0, 0, 0), 0, 11),
 ((0, 0, 0), 0, 12),
 ((0, 0, 0), 0, 13),
 ((0, 0, 0), 0, 14),
 ((0, 0, 0), 0, 15),
 ((0, 0, 0), 0, 16),
 ((0, 0, 0), 0, 17),
 ((0, 0, 0), 0, 18),
 ((0, 0, 0), 0, 19),
 ((0, 0, 0), 0, 20),
 ((0, 0, 0), 0, 21),
 ((0, 0, 0), 0, 22),
 ((0, 0, 0), 0, 23),
 ((0, 0, 0), 0, 24),
 ((0, 0, 0), 0, 25),
 ((0, 0, 0), 0, 26),
 ((0, 0, 0), 0, 27),
 ((0, 0, 0), 0, 28),
 ((0, 0, 0), 0, 29),
 ((0, 0, 0), 0, 30),
 ((0, 0, 0), 0, 31),
 ((0, 1, 0), 0, 0),
 ((0, 1, 0), 0, 1),
 ((0, 1, 0), 0, 2),
 ((0, 1, 0), 0, 3),
 ((0, 1, 0), 0, 4),
 ((0, 1, 0), 0, 5),
 ((0, 1, 0), 0, 6),
 ((0, 1, 0), 0, 7),
 ((0, 1, 0), 0, 8),
 ((0, 1, 0), 0, 9),
 ((0, 1, 0), 0, 10),
 ((0, 1, 0), 0, 11),
 ((0, 1, 0), 0, 12),
 ((0, 1, 0), 0, 13),
 ((0, 1, 0), 0, 14),
 ((0, 1, 0), 0, 15),
 ((0, 1, 0),

In [100]:
# pprint([(to_block_id(t["block_id"]), to_block_id(t["thread_id"])) for t in trace])

In [132]:
# find trace pc = OP_LDG[pc=648,warp=32]
pprint([
    (t["instr_offset"], t["instr_opcode"], t["warp_id_in_sm"], to_block_id(t["block_id"]), t["warp_id_in_block"], [get_relative(addr) for addr in t["addrs"]])
    for t in trace if all([
        t["instr_offset"] == 648,
        # get_relative(t["addrs"][0])[0] == 0),
        # any([False if a is None else (a[1] == 0) for a in [get_relative(addr) for addr in t["addrs"]]]),
        # t["warp_id_in_sm"] == 31
    ])
])

[]


In [97]:
# sort
trace[0]

{'cuda_ctx': 94237064672928, 'kernel_id': 0, 'block_id': {'x': 2, 'y': 0, 'z': 0}, 'thread_id': {'x': 0, 'y': 8, 'z': 0}, 'warp_id_in_sm': 8, 'warp_id_in_block': 8, 'warp_size': 32, 'line_num': 0, 'instr_data_width': 4, 'instr_opcode': 'EXIT', 'instr_offset': 168, 'instr_idx': 15, 'instr_predicate': {'num': 0, 'is_neg': False, 'is_uniform': False}, 'instr_mem_space': 'None', 'instr_is_mem': False, 'instr_is_load': False, 'instr_is_store': False, 'instr_is_extended': False, 'dest_regs': [0], 'num_dest_regs': 0, 'src_regs': [0, 0, 0, 0, 0], 'num_src_regs': 0, 'active_mask': 0, 'addrs': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}

In [39]:
hex(648)
# this is pc=648
# note: 1 0x7fe781700000 0  is the address in base and stride format (1 is base stride)

# 0288 ffffffff 1 R30 LDG.E 1 R6 4 1 0x7fe781700000 0 

'0x288'

In [91]:
# pprint(accel_trace[:5])
per_pc = list([l for l in accel_trace if l[2] == 648])
# per_pc = [int(per_pc[-2] + per_pc[-1], 16) for per_pc in per_pc]
for i in range(1, len(per_pc)):
    last = per_pc[i-1]
    last_addr = int(last[-2] + last[-1], 16)
    current = per_pc[i]
    current_addr = int(current[-2] + current[-1], 16)

    print(last[0], last[1], last_addr, "<=", current[0], current[1], current_addr)
    assert last_addr <= current_addr

# make sure this is sorted!
# assert sorted(per_pc) == per_pc
# all(l[i] <= l[i+1] for i in range(len(per_pc) - 1))
pprint(per_pc)

[0, 0, 0] 0 2250116572381184 <= [0, 0, 0] 0 2250116572381440
[0, 0, 0] 0 2250116572381440 <= [0, 0, 0] 0 2250116572381696
[0, 0, 0] 0 2250116572381696 <= [0, 0, 0] 0 2250116572381952
[0, 0, 0] 0 2250116572381952 <= [0, 0, 0] 0 2250116572382208
[0, 0, 0] 0 2250116572382208 <= [0, 0, 0] 0 2250116572382464
[0, 0, 0] 0 2250116572382464 <= [0, 0, 0] 0 2250116572382720
[0, 0, 0] 0 2250116572382720 <= [0, 0, 0] 0 2250116572382976
[0, 0, 0] 0 2250116572382976 <= [0, 0, 0] 0 2250116572383232
[0, 0, 0] 0 2250116572383232 <= [0, 0, 0] 0 2250116572383488
[0, 0, 0] 0 2250116572383488 <= [0, 0, 0] 0 2250116572383744
[0, 0, 0] 0 2250116572383744 <= [0, 0, 0] 0 2250116572384000
[0, 0, 0] 0 2250116572384000 <= [0, 0, 0] 0 2250116572384256
[0, 0, 0] 0 2250116572384256 <= [0, 0, 0] 0 2250116572384512
[0, 0, 0] 0 2250116572384512 <= [0, 0, 0] 0 2250116572384768
[0, 0, 0] 0 2250116572384768 <= [0, 0, 0] 0 2250116572385024
[0, 0, 0] 0 2250116572385024 <= [0, 0, 0] 0 2250116572385280
[0, 0, 0] 0 225011657238

AssertionError: 