This repository has been archived by the owner on Mar 23, 2019. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
bench-pread-wg.cpp
120 lines (104 loc) · 3.01 KB
/
bench-pread-wg.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
#include <hc.hpp>
#include <hc_syscalls.h>
#include <cstdio>
#include <fstream>
#include <iostream>
#include <stdexcept>
#include <string>
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>
#include "test.h"
// Length in bytes of the reference data buffer used by the GPU run (which
// reads `size * wg_size` bytes per work-group). Overridden by the --size
// option; defaults to 4096.
static size_t size{4096};
// Write the usage line for the single option this test understands (--size)
// to stderr. Neither argc nor argv is consulted.
static void help(int argc, char *argv[])
{
	static const char usage[] =
		"\t--size\tamount of space to allocate (Default: 4kB)\n";

	(void)argc;
	(void)argv;
	::std::cerr << usage;
}
// Handle one command-line option/argument pair for this test.
//
// opt: option name as given on the command line.
// arg: the option's argument text.
//
// Returns true when `opt` is "--size" and `arg` parses to a non-negative
// byte count, which is then stored in the file-level `size`. Returns false
// for any other option. Also returns false, after printing a diagnostic to
// stderr and leaving `size` untouched, when the --size argument is not a
// usable number.
static bool parse(const ::std::string &opt, const ::std::string &arg)
{
	if (opt != "--size")
		return false;

	try {
		// stoll instead of stoi: byte counts above INT_MAX are legitimate,
		// and a signed result lets negatives be rejected before they wrap
		// around in the conversion to size_t.
		const long long requested = ::std::stoll(arg);
		if (requested < 0) {
			::std::cerr << "--size must not be negative: " << arg << "\n";
			return false;
		}
		size = static_cast<size_t>(requested);
	} catch (const ::std::invalid_argument &) {
		// Previously escaped as an uncaught exception.
		::std::cerr << "--size expects a number, got: " << arg << "\n";
		return false;
	} catch (const ::std::out_of_range &) {
		::std::cerr << "--size value out of range: " << arg << "\n";
		return false;
	}
	return true;
}
// GPU benchmark body: times pread64 requests issued once per tile
// (work-group) through the syscall interface `sc`.
//
// A temporary file is set up via init_tmp_file() from a `size`-byte buffer
// filled with 'x' (presumably written p.parallel * p.serial times -- confirm
// in test.h). Every tile gets one result slot and one receive buffer of
// `size * p.wg_size` bytes. Inside the kernels only the work-item whose local
// index is 0 sends SYS_pread64, reading that many bytes at offset
// `tile index * bytes-per-tile`.
//
// p:    test parameters; parallel, serial and wg_size are read here.
// O:    receives the wall-clock duration of test_run() in microseconds.
// sc:   syscall interface invoked from inside the kernels.
// argc, argv: not used by this test.
//
// Returns 0 when every tile's result equals the requested byte count and the
// first `size` bytes of each receive buffer equal the reference data;
// returns 1 (after a message on stderr) otherwise.
static int run_gpu(const test_params &p, ::std::ostream &O, syscalls &sc,
		   int argc, char *argv[])
{
	::std::vector<char> pattern(size, 'x');
	char path[] = "/tmp/XXXXXXX";
	FILE *backing = init_tmp_file(pattern, p.parallel * p.serial, path);
	int fd = fileno(backing);

	// HCC copes badly with globals, so everything the kernels need is
	// copied into locals first.
	size_t tile_bytes = size * p.wg_size;
	uint64_t kernel_fd = fd;

	// One result slot and one zero-initialised receive buffer per tile.
	const auto tiles = p.parallel / p.wg_size;
	::std::vector<ssize_t> results(tiles);
	::std::vector<::std::vector<char>> received(
		tiles, ::std::vector<char>(tile_bytes));

	// Kernel variant that synchronises the tile only after the request.
	auto f = [&](hc::tiled_index<1> idx) [[hc]] {
		int tile = idx.tile[0];
		int lane = idx.local[0];
		for (size_t iter = 0; iter < p.serial; ++iter) {
			// No wait is required before sending: the blocking
			// operation guarantees available slots.
			if (lane == 0) {
				uint64_t dst = reinterpret_cast<uint64_t>(
					received[tile].data());
				results[tile] = sc.send(SYS_pread64,
					{kernel_fd, dst, tile_bytes,
					 tile_bytes * tile});
			}
			idx.barrier.wait();
		}
	};

	// Kernel variant that additionally takes a barrier before the request.
	auto f_s = [&](hc::tiled_index<1> idx) [[hc]] {
		int tile = idx.tile[0];
		int lane = idx.local[0];
		for (size_t iter = 0; iter < p.serial; ++iter) {
			// Checking for a free slot is unnecessary (the blocking
			// operation guarantees one); the barrier up front is
			// there purely for synchronisation.
			idx.barrier.wait();
			if (lane == 0) {
				uint64_t dst = reinterpret_cast<uint64_t>(
					received[tile].data());
				results[tile] = sc.send(SYS_pread64,
					{kernel_fd, dst, tile_bytes,
					 tile_bytes * tile});
			}
			idx.barrier.wait();
		}
	};

	// The remaining kernel slots expected by test_run() are intentionally
	// empty for this test.
	auto f_n = [&](hc::tiled_index<1> idx) [[hc]] {
	};
	auto f_w_n = [&](hc::tiled_index<1> idx) [[hc]] {
	};
	auto f_s_n = [&](hc::tiled_index<1> tidx) [[hc]] {
	};

	// Only the test_run() call is timed.
	const auto t_begin = ::std::chrono::high_resolution_clock::now();
	test_run(p, sc, f, f_s, f_n, f_s_n, f_w_n);
	const auto t_end = ::std::chrono::high_resolution_clock::now();
	const auto elapsed =
		::std::chrono::duration_cast<::std::chrono::microseconds>(
			t_end - t_begin);
	O << elapsed.count() << std::endl;

	// Summary diagnostic if any tile did not get the full byte count.
	bool any_failed = false;
	for (ssize_t got : results) {
		if (got != tile_bytes) {
			any_failed = true;
			break;
		}
	}
	if (any_failed)
		::std::cerr << "Failed reads\n";

	// The backing file is no longer needed once the kernels have run.
	fclose(backing);
	remove(path);

	// Per-tile verification; the first problem found ends the test.
	for (size_t t = 0; t < results.size(); ++t) {
		if (results[t] != tile_bytes) {
			::std::cerr << "Failed read " << t << " ("
				    << results[t] << ")\n";
			return 1;
		}
		// Only the leading `size` bytes of the buffer are compared.
		const bool matches = ::std::memcmp(pattern.data(),
						   received[t].data(),
						   pattern.size()) == 0;
		if (!matches) {
			::std::cerr << "GPU read data do not match\n";
			return 1;
		}
	}
	return 0;
}
// Descriptor for this benchmark: stores the callbacks defined in this file
// (GPU run routine, option parser, help printer) together with the test's
// display name in the global `test_instance` object.
// NOTE(review): presumably picked up by the harness declared in test.h --
// confirm there.
struct test test_instance = {
.run_gpu = run_gpu,
.parse_option = parse,
.help = help,
.name = "pread (work-group scope)",
};