#ifndef _RIGEL_H_
#define _RIGEL_H_


#ifdef __cplusplus
extern "C" {
#endif

// Labels for global data and data used in inline __asm__ blocks
#define RIGEL_GLOBAL volatile
#define RIGEL_ASM volatile

// Get the includes shared with RigelSim
#include <rigellib.h>
#include <stdint.h>
#include <stdlib.h>
#include "rigel-sim.h"
#include "machine/suds_primops.h"

// Aligned data structures to mitigate the effects of false sharing
typedef struct rigel_line_aligned_t {
  uint32_t val;
  uint32_t pad[__CACHE_LINE_SIZE_WORDS-1];
} rigel_line_aligned_t __attribute__ ((aligned(32)));

typedef struct rigel_aligned_int32_t {
  int32_t pad0[__CACHE_LINE_SIZE_WORDS];
  int32_t val __attribute__ ((aligned(32)));
  int32_t pad1[__CACHE_LINE_SIZE_WORDS-1];
} __attribute__ ((aligned(32))) rigel_aligned_int32_t;

typedef struct rigel_aligned_f32_t {
  int32_t pad0[__CACHE_LINE_SIZE_WORDS];
  float val;
  int32_t pad[__CACHE_LINE_SIZE_WORDS-1];
} rigel_aligned_f32_t __attribute__ ((aligned(32)));
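
// Example (illustrative sketch): one padded counter per core so that updates
// from different cores never land on the same cache line. MAX_CORES and the
// function body are hypothetical.
#if 0
rigel_aligned_int32_t per_core_hits[MAX_CORES];

void count_hit(void)
{
  // Each core touches only its own line, so there is no false sharing.
  per_core_hits[RigelGetCoreNum()].val++;
}
#endif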

// TODO: Is this still used anywhere, or is it deprecated?
typedef struct BarrierInfo {
  int corenum;
  int numcores;
  int local_sense;
  int *global_sense;
  int *barrier_count;
} BarrierInfo;

// Barrier info supporting multithreading
typedef struct BarrierMTInfo {
  int corenum;
  //int numcores;
  int threadnum;
  int numthreads;
  int local_sense;
  int *global_sense;
  int *barrier_count;
} BarrierMTInfo;

extern int RigelIncoherentMallocEnabled();


extern int RigelGetCoreNum();
extern int RigelGetClusterNum();
extern int RigelGetThreadNum();
extern uint32_t RigelGetCycle();


// Base address of the hybrid CC table.
extern uint32_t hybridCC_base_addr;
void RigelSetCoherentLine(intptr_t addr);
void RigelSetIncoherentLine(intptr_t addr);

extern int RigelGetNumCores();
extern int RigelGetNumCoresPerCluster();
extern int RigelGetNumThreads();
extern int RigelGetNumThreadsPerCluster();
extern int RigelGetNumThreadsPerCore();

void RigelBreakpoint();

void RigelBarrier(BarrierInfo *bi);
float LocalReduceFloat(BarrierInfo *bi, float val, volatile float *ReduceVals);
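
// Example (illustrative sketch): initializing a BarrierInfo and synchronizing
// all cores. 'shared_sense' and 'shared_count' are hypothetical globals,
// assumed zero-initialized and visible to every core.
#if 0
int shared_sense, shared_count;

void sync_all_cores(void)
{
  BarrierInfo bi;
  bi.corenum       = RigelGetCoreNum();
  bi.numcores      = RigelGetNumCores();
  bi.local_sense   = 0;   // real code would persist this across barriers
  bi.global_sense  = &shared_sense;
  bi.barrier_count = &shared_count;
  RigelBarrier(&bi);      // returns once every core has arrived
}
#endif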

// Enable/disable non-blocking atomics on a thread-by-thread basis. The SPR for
// non-blocking atomics is $r14.
#define ENABLE_NONBLOCKING_ATOMICS() do { \
  __asm__ __volatile__ ( " ori $1, $zero, 1;\n" \
                         " mtsr $14, $1; "); \
} while (0)
#define DISABLE_NONBLOCKING_ATOMICS() do { \
  __asm__ __volatile__ ( "mtsr $14, $zero; "); \
} while (0)

// TODO: make the rf num take the enum value?
#define SIM_SLEEP_ON() do { \
  int tid = RigelGetThreadNum(); \
  if (tid == 0) { \
    __asm__ __volatile__ ( " ori $1, $zero, 1;\n" \
                           " mtsr $9, $1; "); \
  } \
} while (0)

// TODO: make the rf num take the enum value?
#define SIM_SLEEP_OFF() do { \
  int tid = RigelGetThreadNum(); \
  if (tid == 0) { \
    __asm__ __volatile__ ( "mtsr $9, $zero; "); \
  } \
} while (0)
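
// Example (illustrative sketch): bracketing a region of interest, assuming
// SIM_SLEEP_ON() puts the simulator into its fast 'sleep' mode and
// SIM_SLEEP_OFF() restores detailed simulation. do_setup()/do_kernel() are
// hypothetical.
#if 0
void run(void)
{
  SIM_SLEEP_ON();    // fast-forward through initialization
  do_setup();
  SIM_SLEEP_OFF();   // simulate the kernel in detail
  do_kernel();
}
#endif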

// XXX: Fast timers using single instructions
#define StartTimer_FAST(x) do { \
  __asm__ __volatile__ ( "timer.start %0" \
                         : \
                         : "r"(x) \
                       ); \
} while (0)

#define StopTimer_FAST(x) do { \
  __asm__ __volatile__ ( "timer.stop %0" \
                         : \
                         : "r"(x) \
                       ); \
} while (0)

// TODO: Implement new instruction
#define ClearTimer_FAST(x) do { \
  __asm__ __volatile__ ( "timer.clear %0" \
                         : \
                         : "r"(x) \
                       ); \
} while (0)
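
// Example (illustrative sketch): timing a region with the single-instruction
// timers. Timer numbers are small integers chosen by the application;
// do_work() is hypothetical. (ClearTimer_FAST is marked TODO above.)
#if 0
void profile_region(void)
{
  StartTimer_FAST(0);
  do_work();
  StopTimer_FAST(0);
}
#endif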

#define RigelAbort() do { asm volatile ("abort"); } while (0)

#define RigelCountLeadingZeros(leadingZeros, x) do { \
  __asm__ __volatile__ ( "clz %0, %1; \n" \
                         : "=r"(leadingZeros) \
                         : "r"(x) \
                       ); \
} while (0)

#define RigelGetNumClusters(num_clusters) do { \
  __asm__ __volatile__ ( "mfsr %0, $5; \n" \
                         : "=r"(num_clusters) \
                       ); \
} while (0)


#if 0
#define RigelGetClusterNum(cluster_id) do { \
  __asm__ __volatile__ ( "mfsr %0, $4;" \
                         : "=r"(cluster_id) \
                       ); \
} while (0)
#endif
// XXX: Block prefetch instruction
// Inputs are the starting address and # of lines to fetch
#define RigelPrefetchBlock(addr, lines) do { \
  __asm__ __volatile__ ( "pref.b.gc %1, %0, 0 \n" \
                         : \
                         : "r"(addr), "r"(lines)); \
} while (0)

#define RigelPrefetchBlockCC(addr, lines) do { \
  __asm__ __volatile__ ( "pref.b.cc %1, %0, 0 \n" \
                         : \
                         : "r"(addr), "r"(lines)); \
} while (0)
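
// Example (illustrative sketch): warming the cache before streaming through an
// input buffer. The 1024-word buffer size is hypothetical;
// __CACHE_LINE_SIZE_WORDS comes from rigellib.h.
#if 0
void warm_cache(uint32_t *buf)
{
  int lines = 1024 / __CACHE_LINE_SIZE_WORDS;
  RigelPrefetchBlock(buf, lines);
}
#endif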

// XXX: GLOBAL ATOMIC DEC - Decrement ['addr'] and put the result in 'val'
#define RigelAtomicDEC(val, addr) do { \
  volatile unsigned int *_addr = (volatile unsigned int *)(&addr); \
  int temp; \
  __asm__ __volatile__ ( "ldw %1, %2; \n" \
                         "atom.dec %1, %1, 0; \n" \
                         "ori %0, %1, 0; \n" \
                         : "=r"(val), "=r"(temp) \
                         : "m"(_addr) \
                         : "1", "memory"); \
} while (0)

// XXX: GLOBAL ATOMIC INC - Increment ['addr'] and put the result in 'val'
#define RigelAtomicINC(val, addr) do { \
  volatile unsigned int *_addr = (volatile unsigned int *)(&addr); \
  int temp; \
  __asm__ __volatile__ ( "ldw %1, %2; \n" \
                         "atom.inc %1, %1, 0; \n" \
                         "ori %0, %1, 0; \n" \
                         : "=r"(val), "=r"(temp) \
                         : "m"(_addr) \
                         : "1", "memory"); \
} while (0)
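
// Example (illustrative sketch): handing out unique work items with the atomic
// increment. 'next_item', NUM_ITEMS, and process() are hypothetical; this
// assumes 'val' receives the post-increment value.
#if 0
unsigned int next_item;   // shared counter, initialized to zero

void worker(void)
{
  unsigned int item;
  for (;;) {
    RigelAtomicINC(item, next_item);
    if (item > NUM_ITEMS) break;
    process(item);
  }
}
#endif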

// XXX: GLOBAL ATOMIC EXCHANGE - Exchange 'val' for data at 'addr'
#define RigelAtomicXCHG(val, addr) do { \
  volatile unsigned int *_addr = (volatile unsigned int *)(&addr); \
  int temp0, temp1; \
  __asm__ __volatile__ ( "or %1, $0, %4; \n" \
                         "ldw %2, %3; \n" \
                         "atom.xchg %1, %2, 0; \n" \
                         "or %0, %1, $0; \n" \
                         : "=r"(val), "=r"(temp0), "=r"(temp1) \
                         : "m"(_addr), "r"(val) \
                         : "memory"); \
} while (0)
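
// Example (illustrative sketch): a test-and-set spinlock on top of the atomic
// exchange. 'lock_word' is a hypothetical shared word (0 = free, 1 = held);
// after the macro, 'val' holds the previous memory contents.
#if 0
unsigned int lock_word;

void acquire(void)
{
  unsigned int got;
  do {
    got = 1;
    RigelAtomicXCHG(got, lock_word);
  } while (got != 0);   // retry while another core held the lock
}

void release(void)
{
  unsigned int zero = 0;
  RigelAtomicXCHG(zero, lock_word);
}
#endif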

// XXX: GLOBAL COMPARE+SWAP - If 'compare' == ['addr'], the two are exchanged:
// ['addr'] <- 'swapval' and 'swapval' <- the old value at ['addr']

/*
   "printreg $22; " \
   "printreg $23; " \
   "printreg $1; " \
*/

// FIXME: This could be made a bit more streamlined
#define RigelAtomicADDU(addval, addr, retval) do { \
  volatile unsigned int *_addr = (volatile unsigned int *)(&addr); \
  int temp0, temp1, temp2; \
  __asm__ __volatile__ ( "or %2, $0, %5; \n" \
                         "ldw %3, %4; \n" \
                         "atom.addu %1, %2, %3; \n" \
                         "or %0, %1, $0; \n" \
                         : "=r"(retval), "=r"(temp0), "=r"(temp1), "=r"(temp2) \
                         : "m"(_addr), "r"(addval) \
                         : "memory"); \
} while (0)

#define RigelAtomicMAX(addval, addr, retval) do { \
  volatile unsigned int *_addr = (volatile unsigned int *)(&addr); \
  int temp0, temp1, temp2; \
  __asm__ __volatile__ ( "or %2, $0, %5; \n" \
                         "ldw %3, %4; \n" \
                         "atom.max %1, %2, %3; \n" \
                         "or %0, %1, $0; \n" \
                         : "=r"(retval), "=r"(temp0), "=r"(temp1), "=r"(temp2) \
                         : "m"(_addr), "r"(addval) \
                         : "memory"); \
} while (0)

#define RigelAtomicMIN(addval, addr, retval) do { \
  volatile unsigned int *_addr = (volatile unsigned int *)(&addr); \
  int temp0, temp1, temp2; \
  __asm__ __volatile__ ( "or %2, $0, %5; \n" \
                         "ldw %3, %4; \n" \
                         "atom.min %1, %2, %3; \n" \
                         "or %0, %1, $0; \n" \
                         : "=r"(retval), "=r"(temp0), "=r"(temp1), "=r"(temp2) \
                         : "m"(_addr), "r"(addval) \
                         : "memory"); \
} while (0)

#define RigelAtomicOR(addval, addr, retval) do { \
  volatile unsigned int *_addr = (volatile unsigned int *)(&addr); \
  int temp0, temp1, temp2; \
  __asm__ __volatile__ ( "or %2, $0, %5; \n" \
                         "ldw %3, %4; \n" \
                         "atom.or %1, %2, %3; \n" \
                         "or %0, %1, $0; \n" \
                         : "=r"(retval), "=r"(temp0), "=r"(temp1), "=r"(temp2) \
                         : "m"(_addr), "r"(addval) \
                         : "memory"); \
} while (0)


#define RigelAtomicAND(addval, addr, retval) do { \
  volatile unsigned int *_addr = (volatile unsigned int *)(&addr); \
  int temp0, temp1, temp2; \
  __asm__ __volatile__ ( "or %2, $0, %5; \n" \
                         "ldw %3, %4; \n" \
                         "atom.and %1, %2, %3; \n" \
                         "or %0, %1, $0; \n" \
                         : "=r"(retval), "=r"(temp0), "=r"(temp1), "=r"(temp2) \
                         : "m"(_addr), "r"(addval) \
                         : "memory"); \
} while (0)

#define RigelAtomicXOR(addval, addr, retval) do { \
  volatile unsigned int *_addr = (volatile unsigned int *)(&addr); \
  int temp0, temp1, temp2; \
  __asm__ __volatile__ ( "or %2, $0, %5; \n" \
                         "ldw %3, %4; \n" \
                         "atom.xor %1, %2, %3; \n" \
                         "or %0, %1, $0; \n" \
                         : "=r"(retval), "=r"(temp0), "=r"(temp1), "=r"(temp2) \
                         : "m"(_addr), "r"(addval) \
                         : "memory"); \
} while (0)
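
// Example (illustrative sketch): a global sum reduction with atom.addu.
// 'global_sum' is a hypothetical shared accumulator; each core contributes its
// partial sum with a single atomic.
#if 0
unsigned int global_sum;

void contribute(unsigned int partial)
{
  unsigned int ret;
  RigelAtomicADDU(partial, global_sum, ret);   // ret <- result of the add
}
#endif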

#define RigelAtomicCAS(compare, swapval, addr) do { \
  volatile unsigned int *_addr = (volatile unsigned int *)(&addr); \
  __asm__ __volatile__ ( "or $26, $0, %2; \n" \
                         "or $27, $0, %3; \n" \
                         "ldw $1, %1; \n" \
                         "atom.cas $26, $27, $1; \n" \
                         "or %0, $26, $0; " \
                         : "=r"(swapval) \
                         : "m"(_addr), "r"(swapval), "r"(compare) \
                         : "1", "26", "27", "memory"); \
} while (0)
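
// Example (illustrative sketch): an atomic read-modify-write retry loop built
// on compare-and-swap. 'shared_word' is hypothetical; this assumes 'swapval'
// comes back holding the old memory value, so success is detected by comparing
// it against the expected value.
#if 0
unsigned int shared_word;

void atomic_double(void)
{
  unsigned int old, desired;
  do {
    old = *(volatile unsigned int *)&shared_word;  // read the current value
    desired = old * 2;                             // compute the update
    RigelAtomicCAS(old, desired, shared_word);     // desired <- old memory value
  } while (desired != old);                        // mismatch: the CAS lost a race
}
#endif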

// XXX: GLOBAL MEMORY ACCESSES - Load/store the value 'val' from/to the
// address 'addr'.


#define RigelGlobalLoadX(LocalValue, GlobalValue) do { \
  volatile unsigned int *_addr = (volatile unsigned int *)(&GlobalValue); \
  __asm__ __volatile__ ( "g.ldw %0, %1, 0 \n" \
                         : "=r"(LocalValue) \
                         : "r"(_addr) \
                       ); \
} while (0)

#define RigelGlobalStoreX(LocalValue, GlobalValue) do { \
  volatile unsigned int *_addr = (volatile unsigned int *)(&GlobalValue); \
  __asm__ __volatile__ ( "g.stw %0, %1, 0 \n" \
                         : \
                         : "r"(LocalValue), "r"(_addr) \
                         : "memory" \
                       ); \
} while (0)

#define RigelGlobalLoad(val, addr) do { \
  volatile unsigned int *_addr = (volatile unsigned int *)(&addr); \
  __asm__ __volatile__ ( "ldw $1, %1; \n" \
                         "g.ldw $1, $1, 0; \n" \
                         "or %0, $1, $0;" \
                         : "=r"(val) \
                         : "m"(_addr) \
                         : "1", "memory"); \
} while (0)

#define RigelGlobalStore(val, addr) do { \
  volatile unsigned int *_addr = (volatile unsigned int *)(&addr); \
  __asm__ __volatile__ ( "ldw $1, %1; \n" \
                         "g.stw %0, $1, 0; " \
                         : \
                         : "r"(val), "m"(_addr) \
                         : "1", "memory"); \
} while (0)
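
// Example (illustrative sketch): publishing a flag through the global cache so
// that cores in other clusters observe it. 'done_flag' is hypothetical.
#if 0
unsigned int done_flag;

void publish(void)
{
  RigelGlobalStore(1, done_flag);   // write straight to the global cache
}

void wait_for_flag(void)
{
  unsigned int v;
  do {
    RigelGlobalLoad(v, done_flag);  // read straight from the global cache
  } while (v == 0);
}
#endif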


// XXX: CLUSTER-LEVEL ATOMICS - Load-linked and store-conditional. Load-linked
// is nothing special from an API point of view. Store-conditional has an extra
// parameter that is set to '1' when the STC succeeds and '0' should it fail.
#define RigelStoreCond(val, addr, succ) do { \
  volatile unsigned int *_addr = (volatile unsigned int *)(&addr); \
  __asm__ __volatile__ ( "ldw $1, %2; \n" \
                         "stc $1, %1, $1; " \
                         "ori %0, $1, 0; " \
                         : "=r"(succ) \
                         : "r"(val), "m"(_addr) \
                         : "1", "memory"); \
} while (0)

#define RigelLoadLinked(val, addr) do { \
  volatile unsigned int *_addr = (volatile unsigned int *)(&addr); \
  __asm__ __volatile__ ( "ldw $1, %1; \n" \
                         "ldl $1, $1; \n" \
                         "or %0, $1, $0;" \
                         : "=r"(val) \
                         : "m"(_addr) \
                         : "1", "memory"); \
} while (0)
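
// Example (illustrative sketch): a cluster-level atomic increment built from
// load-linked/store-conditional. 'counter' is a hypothetical word shared
// within the cluster.
#if 0
unsigned int counter;

void llsc_increment(void)
{
  unsigned int v, ok;
  do {
    RigelLoadLinked(v, counter);          // v <- counter, link established
    RigelStoreCond(v + 1, counter, ok);   // ok <- 1 iff the link still held
  } while (!ok);
}
#endif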

// XXX: BROADCASTS - Write a value to memory and send notifications to all of
// the cluster caches informing them of its modification
#define RigelBroadcastUpdate(val, addr) do { \
  volatile unsigned int *_addr = (volatile unsigned int *)(&addr); \
  __asm__ __volatile__ ( "ldw $1, %1; \n" \
                         "bcast.u %0, $1, 0; " \
                         : \
                         : "r"(val), "m"(_addr) \
                         : "1", "memory"); \
} while (0)


// XXX: PIPELINE SYNC - Forces all in-flight instructions to complete before
// the SYNC retires. When the SYNC retires, fetch is restarted.
#define RigelSync() do { \
  __asm__ ( "sync;" ); \
} while (0)
// XXX: MEMORY BARRIER - Forces all previous memory operations leaving the
// cluster cache to complete before any future requests are serviced.
#define RigelMemoryBarrier() do { \
  __asm__ ( "mb;" ); \
} while (0)
// XXX: WRITEBACK LINE - Flushes the line back to the global cache but does not
// invalidate it in the cluster cache.
// void RigelWritebackLine(void *addr)
#define RigelWritebackLine(addr) \
do { __asm__ ( "line.wb %0;" \
               : \
               : "r"(addr) \
               : "memory" \
             ); \
} while (0)
// XXX: INVALIDATE LINE - Invalidates the line in the cluster cache without
// writing it back. If the line is not present, the request is dropped.
// void RigelInvalidateLine(void *addr)
#define RigelInvalidateLine(addr) \
do { __asm__ ( "line.inv %0;" \
               : \
               : "r"(addr) \
               : "memory" \
             ); \
} while (0)
// XXX: FLUSH LINE - Flushes the line back to the global cache and invalidates
// it in the cluster cache.
// void RigelFlushLine(void *addr)
#define RigelFlushLine(addr) \
do { __asm__ ( "line.wb %0;" \
               "line.inv %0;" \
               : \
               : "r"(addr) \
               : "memory" \
             ); \
} while (0)
// XXX: PREFETCH LINE - Brings the line associated with 'addr' into the cluster
// cache if it is not already present.
// void RigelPrefetchLine(void *addr)
#define RigelPrefetchLine(addr) \
do { __asm__ ( "pref.l %0, 0;" \
               : \
               : "r"(addr) \
               : "memory" \
             ); \
} while (0)
// XXX: PREFETCH LINE, NO GLOBAL ALLOCATE ('pref.nga')
#define RigelPrefetchNoGlobalAlloc(addr) \
do { __asm__ ( "pref.nga %0, 0;" \
               : \
               : "r"(addr) \
               : "memory" \
             ); \
} while (0)
// void RigelPrint(uint32_t x)

#define RigelPrint(x) \
  __asm__ ( "printreg %0;" : : "r"(x))

/*
//#define RigelPrint(x) \
// do { asm(" addi $28, %0, 0;"\
// " printreg $28;" \
// : \
// : "r"(x) \
// ); } while(0);
*/

void RigelPrintFunc(uint32_t x);

// Timer library interface
int StartTimer(int timer_num);
int GetTimer(int timer_num);
int StopTimer(int timer_num);
int ClearTimer(int timer_num);
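
// Example (illustrative sketch): the coarse timer interface, assuming
// GetTimer() returns the timer's accumulated count. do_phase() is
// hypothetical.
#if 0
void time_phase(void)
{
  ClearTimer(3);
  StartTimer(3);
  do_phase();
  StopTimer(3);
  RigelPrint(GetTimer(3));   // report the reading via printreg
}
#endif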

// Syscalls to seed/generate random numbers.
// Seed the RNG with 64 bits of state.
// Use this to make sure subsequent runs with identical timing have identical results.
static inline void RigelSRand(unsigned int s0, unsigned int s1)
{
  __suds_syscall(0x40, s0, s1, 0);
}

// Random float between min and max (inclusive)
static inline float RigelRandFloat(const float min, const float max)
{
  uint32_t minu = *((const uint32_t *)&min);
  uint32_t maxu = *((const uint32_t *)&max);
  uint32_t ret = __suds_syscall(0x3D, minu, maxu, 0);
  return *((float *)&ret);
}

// Random (signed) int between min and max (inclusive)
static inline int RigelRandInt(const int min, const int max)
{
  uint32_t minu = *((const uint32_t *)&min);
  uint32_t maxu = *((const uint32_t *)&max);
  return (int)(__suds_syscall(0x3E, minu, maxu, 0));
}

// Random unsigned int between min and max (inclusive)
static inline unsigned int RigelRandUInt(const unsigned int min, const unsigned int max)
{
  return __suds_syscall(0x3F, min, max, 0);
}
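
// Example (illustrative sketch): deterministic random input generation. A
// fixed 64-bit seed makes runs with identical timing reproducible.
#if 0
void fill_random(float *buf, int n)
{
  RigelSRand(0x12345678, 0x9ABCDEF0);
  for (int i = 0; i < n; i++)
    buf[i] = RigelRandFloat(0.0f, 1.0f);
}
#endif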

// Returns 1 if we're running in RigelSim, 0 otherwise (real hardware, RTL, etc.)
int RigelIsSim(void);

#ifdef __cplusplus
}
#endif

#endif // _RIGEL_H_