-
Notifications
You must be signed in to change notification settings - Fork 53
/
Copy pathabtx_prof.h
2385 lines (2245 loc) · 93.8 KB
/
abtx_prof.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
* See COPYRIGHT in top-level directory.
*/
/*
* How to use ABTX_profiler
*
* This header file implements an example of a simple profiler that measures the
* basic performance numbers. This example profiler should be sufficiently
* useful in practice.
*
* 1. Compile Argobots with --enable-tool
* 2. Modify your program as follows:
* 2.1 Copy this header file and include it.
* 2.2 Right after ABT_init(), call ABTX_prof_init().
* 2.3 When you want to start a profiler, call ABTX_prof_start().
* 2.4 When you want to stop a profiler, call ABTX_prof_stop().
* 2.5 Call ABTX_prof_print() to print out the obtained results.
* 2.6 Repeat 2.3 - 2.5 if needed.
* 2.7 Right before ABT_finalize(), call ABTX_prof_finalize().
* 3. Compile your program and run it with tool-enabled Argobots. You might
* need to add -lpthread to compile your code with this header.
*
* Your program will be modified as follows:
*
* ============================================================================
*
* #include "abtx_prof.h" // Added
*
* ABTX_prof_context g_prof_context; // Added
* int main_func() {
* ...
* ABT_init(argc, argv);
* ABTX_prof_init(&g_prof_context); // Added
* ...
* ABTX_prof_start(g_prof_context, ABTX_PROF_MODE_BASIC); // Added
* for (int iter = 0; iter < niters; iter++) {
* ...
* ...
* ...
* }
* ABTX_prof_stop(g_prof_context); // Added
* ABTX_prof_print(g_prof_context, stdout,
* ABTX_PRINT_MODE_SUMMARY
* | ABTX_PRINT_MODE_FANCY); // Added
* ...
* ABTX_prof_finalize(g_prof_context); // Added
* ABT_finalize();
* ...
* }
*
* ============================================================================
*
* - Profiling mode (prof_mode)
*
* Users can pass either ABTX_PROF_MODE_BASIC or ABTX_PROF_MODE_DETAILED.
* ABTX_PROF_MODE_BASIC is lighter but can gather less information.
* ABTX_PROF_MODE_DETAILED is quite heavy, so please use ABTX_PROF_MODE_BASIC
* if it is enough (and in many cases, it should be enough).
*
* - Print mode (print_mode)
*
* Users can pass [either ABTX_PRINT_MODE_RAW or ABTX_PRINT_MODE_SUMMARY] and
* [either ABTX_PRINT_MODE_CSV or ABTX_PRINT_MODE_FANCY]. For the first
* trial, "(ABTX_PRINT_MODE_SUMMARY | ABTX_PRINT_MODE_FANCY)" (where "|" is a
* bitwise OR) is recommended.
*
* ABTX_PRINT_MODE_SUMMARY prints the common performance metrics, while
* ABTX_PRINT_MODE_RAW shows the raw performance data. If users do not deeply
* understand the mechanism of Argobots, ABTX_PRINT_MODE_SUMMARY is
* recommended.
*
* ABTX_PRINT_MODE_CSV outputs the data in a CSV format while the text
* displayed by ABTX_PRINT_MODE_FANCY is easier to read in the terminal.
*
* Note that profiling imposes certain overheads. For accurate performance
* analysis, please compare the performance with the original performance
* without a profiler.
*/
#ifndef ABTX_PROF_H_INCLUDED
#define ABTX_PROF_H_INCLUDED
#include <stdio.h>
/* Profiling modes accepted by ABTX_prof_start() through prof_mode. */
#define ABTX_PROF_MODE_BASIC 0
#define ABTX_PROF_MODE_DETAILED 1
/* Print-mode flags accepted by ABTX_prof_print() through print_mode. The
 * values are distinct bits: combine one of RAW/SUMMARY with one of CSV/FANCY
 * using bitwise OR. */
#define ABTX_PRINT_MODE_RAW 0x1
#define ABTX_PRINT_MODE_SUMMARY 0x2
#define ABTX_PRINT_MODE_CSV 0x4
#define ABTX_PRINT_MODE_FANCY 0x8
/* Opaque profiler handle. It is created through ABTX_prof_init() and passed
 * to every other ABTX_prof_* function. */
typedef struct ABTX_prof_context_opaque *ABTX_prof_context;
/* Public API. Every function is declared static, so each translation unit
 * that includes this header gets its own private copy.
 * Expected call order: init -> (start -> stop -> print)* -> finalize.
 * NOTE(review): ABTX_prof_clean() is not described in the usage notes at the
 * top of this file; presumably it discards the collected data -- confirm
 * against its definition. */
static int ABTX_prof_init(ABTX_prof_context *p_new_context);
static int ABTX_prof_start(ABTX_prof_context context, int prof_mode);
static int ABTX_prof_stop(ABTX_prof_context context);
static int ABTX_prof_clean(ABTX_prof_context context);
static int ABTX_prof_print(ABTX_prof_context context, FILE *stream,
int print_mode);
static int ABTX_prof_finalize(ABTX_prof_context context);
/*
* Profiler configurations that might affect compilation.
*
* - ABTX_PROF_USE_BUILTIN_EXPECT
* Set 0 if your compiler does not support __builtin_expect(). This builtin
* should be supported by sufficiently new GCC, Clang, ICC, and XLC compilers.
* Setting it to 0 may result in poor code optimization.
*
* - ABTX_PROF_USE_ALWAYS_INLINE
* Set 0 if your compiler does not support __attribute__((always_inline)).
* This attribute should be supported by sufficiently new GCC, Clang, ICC, and
* XLC compilers.
*
* - ABTX_PROF_ASSUME_SCHED_ALWAYS_ACTIVE
* Set 0 if schedulers might not always be running in your program because of
* oversubscription of OS-level threads (= Pthreads) or scheduler sleep.
* Although many applications create as many execution streams as there are
* cores and let them burn all the CPU resources, this assumption does not
* hold in some cases. For example, if Argobots is used as a backend event
* engine, Argobots should avoid wasting CPU cores when there is no work.
* Setting 0 makes the profiler use a heavier per-thread timer.
*
* - ABTX_PROF_USE_HARDWARE_CYCLES
* Set 0 if your compiler does not support assembly code that gets
* architecture-dependent hardware clock. This code should work with
* sufficiently new GCC, Clang, ICC, and XLC compilers.
*
*/
#ifndef ABTX_PROF_USE_BUILTIN_EXPECT
#if defined(__SUNPRO_C) && __SUNPRO_C < 0x5150
/* Solaris Studio <= 12.5 (Sun C 5.14) does not support __builtin_expect() */
#define ABTX_PROF_USE_BUILTIN_EXPECT 0
#else
#define ABTX_PROF_USE_BUILTIN_EXPECT 1
#endif
#endif /* ABTX_PROF_USE_BUILTIN_EXPECT */
#ifndef ABTX_PROF_USE_ALWAYS_INLINE
#define ABTX_PROF_USE_ALWAYS_INLINE 1
#endif
#ifndef ABTX_PROF_ASSUME_SCHED_ALWAYS_ACTIVE
#define ABTX_PROF_ASSUME_SCHED_ALWAYS_ACTIVE 1
#endif
#ifndef ABTX_PROF_USE_HARDWARE_CYCLES
#define ABTX_PROF_USE_HARDWARE_CYCLES 1
#endif
/*
* Internal implementation. This should not be modified by profiler users.
*/
#include <stdint.h>
#include <pthread.h>
#include <string.h>
#include <time.h>
#include <assert.h>
#include <stdarg.h>
#include <abt.h>
/* Branch-prediction hints. With ABTX_PROF_USE_BUILTIN_EXPECT enabled they
 * wrap __builtin_expect(); "!!" normalizes the condition to 0 or 1 first.
 * Otherwise they expand to the bare condition. */
#if ABTX_PROF_USE_BUILTIN_EXPECT
#define ABTXI_prof_likely(cond) __builtin_expect(!!(cond), 1)
#define ABTXI_prof_unlikely(cond) __builtin_expect(!!(cond), 0)
#else
#define ABTXI_prof_likely(cond) (cond)
#define ABTXI_prof_unlikely(cond) (cond)
#endif
/* Function specifier that forces inlining when the compiler supports
 * __attribute__((always_inline)); falls back to plain "inline" otherwise. */
#if ABTX_PROF_USE_ALWAYS_INLINE
#define ABTXI_prof_always_inline inline __attribute__((always_inline))
#else
#define ABTXI_prof_always_inline inline
#endif
/* When ABTXI_PROF_USE_SYNC_BUILTIN is defined, the spinlock and atomic
 * helpers below use the legacy __sync builtins and volatile accesses instead
 * of the __atomic builtins. */
#undef ABTXI_PROF_USE_SYNC_BUILTIN
#if defined(__PGIC__) || defined(__ibmxl__)
/* Their __atomic implementations are not trustworthy. See #162 and #211. */
#define ABTXI_PROF_USE_SYNC_BUILTIN
#endif
/* Size in bytes of each memory block that
 * ABTXI_prof_xstream_info_alloc_thread_info() allocates and carves into
 * ABTXI_prof_thread_info entries. */
#ifndef ABTXI_PROF_MEM_BLOCK_SIZE
#define ABTXI_PROF_MEM_BLOCK_SIZE (32 * 1024) /* bytes */
#endif
/* Length of the per-depth arrays in ABTXI_prof_xstream_data. */
#ifndef ABTXI_PROF_MAX_DEPTH
#define ABTXI_PROF_MAX_DEPTH 4
#endif
/* NOTE(review): not referenced in this part of the file; presumably the
 * initial length of ABTXI_prof_global::p_xstreams -- confirm at its use. */
#ifndef ABTXI_PROF_DEFAULT_NUM_XSTREAMS
#define ABTXI_PROF_DEFAULT_NUM_XSTREAMS 32
#endif
#undef ABTXI_PROF_USE_CYCLES
#if ABTX_PROF_USE_HARDWARE_CYCLES
/* Use "faster" hardware cycles. */
#if defined(__x86_64__)
/* x86/64 (Intel and AMD) */
/*
 * Read the x86-64 time-stamp counter with RDTSCP.
 *
 * Returns the 64-bit counter value. The instruction delivers it as two 32-bit
 * halves in EAX (low) and EDX (high); RCX is listed as clobbered because
 * RDTSCP also writes ECX.
 */
static inline uint64_t ABTXI_prof_get_cycles(void)
{
    unsigned hi, lo;
    __asm__ __volatile__("rdtscp" : "=a"(lo), "=d"(hi)::"rcx");
    /* Widen as *unsigned* before shifting: shifting a signed 64-bit value so
     * that a one lands in the sign bit is undefined behavior, which would be
     * hit as soon as the high half reaches 2^31. */
    return ((uint64_t)hi << 32) | (uint64_t)lo;
}
#define ABTXI_PROF_USE_CYCLES 1
#elif defined(__aarch64__)
/* 64-bit ARM */
/* Returns the current value of the cntvct_el0 system register as a 64-bit
 * tick count. An "isb" is issued immediately before the read.
 * NOTE(review): the isb presumably keeps the counter read from being executed
 * early relative to preceding instructions -- confirm against the Arm
 * architecture reference. Ticks are converted to seconds with the factor
 * measured by ABTXI_prof_get_time_to_sec(). */
static inline uint64_t ABTXI_prof_get_cycles()
{
register uint64_t cycles;
__asm__ __volatile__("isb; mrs %0, cntvct_el0" : "=r"(cycles));
return cycles;
}
#define ABTXI_PROF_USE_CYCLES 1
#elif defined(__powerpc__)
/* POWER */
/* Returns the value of special-purpose register 268 read with "mfspr".
 * NOTE(review): SPR 268 is presumably the time base register -- confirm
 * against the Power ISA. Ticks are converted to seconds with the factor
 * measured by ABTXI_prof_get_time_to_sec(). */
static inline uint64_t ABTXI_prof_get_cycles()
{
register uint64_t cycles;
__asm__ __volatile__("mfspr %0, 268" : "=r"(cycles));
return cycles;
}
#define ABTXI_PROF_USE_CYCLES 1
/* Unknown hardware. */
#endif
#endif /* ABTX_PROF_USE_HARDWARE_CYCLES */
#ifdef ABTXI_PROF_USE_CYCLES
#define ABTXI_PROF_T int64_t
#define ABTXI_PROF_T_INVALID ((int64_t)0xFFFFFFFFFFFFFFFF)
#define ABTXI_PROF_T_ZERO ((int64_t)0)
#define ABTXI_prof_get_time() ABTXI_prof_get_cycles()
#define ABTXI_PROF_T_STRING "HW cycles"
/*
 * Measure how many seconds one hardware tick corresponds to.
 *
 * The tick counter is sampled twice, roughly one second apart. Each sample is
 * bracketed by two ABT_get_wtime() reads whose midpoint is taken as the
 * wall-clock time of that sample. Returns (elapsed seconds) / (elapsed ticks).
 * Note that this call busy-waits for about one second.
 */
static double ABTXI_prof_get_time_to_sec()
{
    /* First sample. */
    const double wall_before_begin = ABT_get_wtime();
    const ABTXI_PROF_T ticks_begin = ABTXI_prof_get_cycles();
    const double wall_after_begin = ABT_get_wtime();
    const double begin_sec = (wall_after_begin + wall_before_begin) / 2.0;

    /* Spin until one second of wall-clock time has passed. */
    do {
    } while (ABT_get_wtime() < begin_sec + 1.0);

    /* Second sample. */
    const double wall_before_end = ABT_get_wtime();
    const ABTXI_PROF_T ticks_end = ABTXI_prof_get_cycles();
    const double wall_after_end = ABT_get_wtime();
    const double end_sec = (wall_after_end + wall_before_end) / 2.0;

    return (end_sec - begin_sec) / (ticks_end - ticks_begin);
}
#else
#define ABTXI_PROF_T double
#define ABTXI_PROF_T_INVALID ((double)-1.0)
#define ABTXI_PROF_T_ZERO ((double)0.0)
#define ABTXI_prof_get_time() ABT_get_wtime()
#define ABTXI_PROF_T_STRING "s"
#define ABTXI_prof_get_time_to_sec() 1.0
#endif
#if ABTX_PROF_ASSUME_SCHED_ALWAYS_ACTIVE
#define ABTXI_PROF_USE_TIME_LOCAL 0
#else
#define ABTXI_PROF_USE_TIME_LOCAL 1
#define ABTXI_PROF_LOCAL_T_INVALID ((double)-1.0)
#define ABTXI_PROF_LOCAL_T_ZERO ((double)0.0)
#define ABTXI_PROF_LOCAL_T double
/*
 * Return the calling thread's CPU-time clock (CLOCK_THREAD_CPUTIME_ID)
 * in seconds.
 * NOTE(review): the return value of clock_gettime() is not checked.
 */
static ABTXI_PROF_LOCAL_T ABTXI_prof_get_time_local()
{
    struct timespec thread_cpu_time;
    clock_gettime(CLOCK_THREAD_CPUTIME_ID, &thread_cpu_time);
    /* Seconds plus nanoseconds scaled to seconds. */
    return thread_cpu_time.tv_sec + thread_cpu_time.tv_nsec * 1.0e-9;
}
#define ABTXI_PROF_LOCAL_T_STRING "s"
#define ABTXI_prof_get_local_time_to_sec() 1.0
#endif
/* Event IDs. They index ABTXI_prof_xstream_data::num_events and are mapped to
 * display names by ABTXI_get_prof_event_name(). END_ is the number of IDs. */
#define ABTXI_PROF_EVENT_THREAD_CREATE 0
#define ABTXI_PROF_EVENT_THREAD_JOIN 1
#define ABTXI_PROF_EVENT_THREAD_FREE 2
#define ABTXI_PROF_EVENT_THREAD_REVIVE 3
#define ABTXI_PROF_EVENT_THREAD_RUN 4
#define ABTXI_PROF_EVENT_THREAD_FINISH 5
#define ABTXI_PROF_EVENT_THREAD_CANCEL 6
#define ABTXI_PROF_EVENT_THREAD_YIELD 7
#define ABTXI_PROF_EVENT_THREAD_SUSPEND 8
#define ABTXI_PROF_EVENT_THREAD_RESUME 9
#define ABTXI_PROF_EVENT_END_ 10
/* Indices into ABTXI_prof_xstream_data::wu_times (per-work-unit time
 * statistics). END_ is the number of entries. */
#define ABTXI_PROF_WU_TIME_THREAD_ELAPSED 0
#define ABTXI_PROF_WU_TIME_THREAD_CREATE_FIRST_RUN 1
#define ABTXI_PROF_WU_TIME_THREAD_FIRST_RUN_LAST_FINISH 2
#define ABTXI_PROF_WU_TIME_THREAD_CREATE_LAST_FINISH 3
#define ABTXI_PROF_WU_TIME_THREAD_CREATE_FREE 4
#define ABTXI_PROF_WU_TIME_END_ 5
/* Indices into ABTXI_prof_xstream_data::wu_local_times, which exists only
 * when ABTXI_PROF_USE_TIME_LOCAL is enabled. */
#define ABTXI_PROF_WU_LOCAL_TIME_THREAD_ELAPSED 0
#define ABTXI_PROF_WU_LOCAL_TIME_END_ 1
/* Indices into ABTXI_prof_xstream_data::wu_counts (per-work-unit counters). */
#define ABTXI_PROF_WU_COUNT_THREAD_NUM_REVIVALS 0
#define ABTXI_PROF_WU_COUNT_THREAD_NUM_YIELDS 1
#define ABTXI_PROF_WU_COUNT_THREAD_NUM_SUSPENSIONS 2
#define ABTXI_PROF_WU_COUNT_THREAD_NUM_XSTREAM_CHANGES 3
#define ABTXI_PROF_WU_COUNT_END_ 4
/* Sentinels for "last run time not set"; aliases of the timer-type INVALID
 * values. The LOCAL variant expands to a macro that is only defined when
 * ABTXI_PROF_USE_TIME_LOCAL is enabled. */
#define ABTXI_PROF_TIME_LAST_RUN_INVALID ABTXI_PROF_T_INVALID
#define ABTXI_PROF_TIME_LAST_RUN_LOCAL_INVALID ABTXI_PROF_LOCAL_T_INVALID
static const char *ABTXI_get_prof_event_name(int event)
{
switch (event) {
case ABTXI_PROF_EVENT_THREAD_CREATE:
return "ULT/create";
case ABTXI_PROF_EVENT_THREAD_JOIN:
return "ULT/join";
case ABTXI_PROF_EVENT_THREAD_FREE:
return "ULT/free";
case ABTXI_PROF_EVENT_THREAD_REVIVE:
return "ULT/revive";
case ABTXI_PROF_EVENT_THREAD_RUN:
return "ULT/run";
case ABTXI_PROF_EVENT_THREAD_FINISH:
return "ULT/finish";
case ABTXI_PROF_EVENT_THREAD_CANCEL:
return "ULT/cancel";
case ABTXI_PROF_EVENT_THREAD_YIELD:
return "ULT/yield";
case ABTXI_PROF_EVENT_THREAD_SUSPEND:
return "ULT/suspend";
case ABTXI_PROF_EVENT_THREAD_RESUME:
return "ULT/resume";
default:
return "ERR";
}
}
static const char *ABTXI_get_prof_wu_time_name(int wu_time)
{
switch (wu_time) {
case ABTXI_PROF_WU_TIME_THREAD_ELAPSED:
return "ULT/elapsed";
case ABTXI_PROF_WU_TIME_THREAD_CREATE_FIRST_RUN:
return "ULT/T_firstrun-T_create";
case ABTXI_PROF_WU_TIME_THREAD_FIRST_RUN_LAST_FINISH:
return "ULT/T_lastfinish-T_firstrun";
case ABTXI_PROF_WU_TIME_THREAD_CREATE_LAST_FINISH:
return "ULT/T_lastfinish-T_create";
case ABTXI_PROF_WU_TIME_THREAD_CREATE_FREE:
return "ULT/T_free-T_create";
default:
return "ERR";
}
}
#if ABTXI_PROF_USE_TIME_LOCAL
/*
 * Map an ABTXI_PROF_WU_LOCAL_TIME_* index to its display name.
 * Returns "ERR" for any value that is not a known index.
 */
static const char *ABTXI_get_prof_wu_local_time_name(int wu_local_time)
{
    if (wu_local_time == ABTXI_PROF_WU_LOCAL_TIME_THREAD_ELAPSED) {
        return "ULT/actual_elapsed";
    }
    return "ERR";
}
#endif
static const char *ABTXI_get_prof_wu_count_name(int wu_count)
{
switch (wu_count) {
case ABTXI_PROF_WU_COUNT_THREAD_NUM_REVIVALS:
return "ULT/revive";
case ABTXI_PROF_WU_COUNT_THREAD_NUM_YIELDS:
return "ULT/yield";
case ABTXI_PROF_WU_COUNT_THREAD_NUM_SUSPENSIONS:
return "ULT/suspend";
case ABTXI_PROF_WU_COUNT_THREAD_NUM_XSTREAM_CHANGES:
return "ULT/ES-change";
default:
return "ERR";
}
}
/* Generic max/min. These are macros, so each argument may be evaluated
 * twice; do not pass expressions with side effects. */
#define ABTXI_prof_max(a, b) ((a) > (b) ? (a) : (b))
#define ABTXI_prof_min(a, b) ((a) < (b) ? (a) : (b))
/* "Safe" division: yields 0 instead of dividing when b is zero. b is
 * evaluated twice. */
#define ABTXI_prof_div_s(a, b) ((b) != 0 ? ((a) / (b)) : 0)
/*
 * Integer power by repeated squaring: returns base raised to exp.
 * A negative exponent yields the reciprocal of the positive power.
 */
static inline double ABTXI_prof_pow(double base, int exp)
{
    if (exp < 0) {
        return 1.0 / ABTXI_prof_pow(base, -exp);
    }
    if (exp == 0) {
        return 1.0;
    }
    /* base^exp = (base^(exp/2))^2, times one extra base when exp is odd. */
    const double half_power = ABTXI_prof_pow(base, exp / 2);
    double result = half_power * half_power;
    if (exp % 2 != 0) {
        result *= base;
    }
    return result;
}
/*
 * Return the decimal exponent of val: the number of times val must be scaled
 * by 10 (negative result) or by 0.1 (positive result) until its magnitude
 * lies in [1, 10].
 * Values with magnitude below 1e-10 are treated as zero and yield -99.
 * A magnitude of exactly 10 yields 0, because only values strictly beyond 10
 * are scaled down.
 */
static inline int ABTXI_prof_digit(double val)
{
    const int is_zero = (-1.0e-10 < val) && (val < 1.0e-10);
    if (is_zero) {
        /* Too small. This is zero. */
        return -99;
    }
    if (-1.0 < val && val < 1.0) {
        /* Magnitude below one: scale up and go one exponent lower. */
        return ABTXI_prof_digit(val * 10.0) - 1;
    }
    if (val < -10.0 || 10.0 < val) {
        /* Magnitude above ten: scale down and go one exponent higher. */
        return ABTXI_prof_digit(val * 0.1) + 1;
    }
    return 0;
}
typedef struct ABTXI_prof_wu_time ABTXI_prof_wu_time;
typedef struct ABTXI_prof_wu_count ABTXI_prof_wu_count;
#if ABTXI_PROF_USE_TIME_LOCAL
typedef struct ABTXI_prof_wu_local_time ABTXI_prof_wu_local_time;
#endif
typedef struct ABTXI_prof_thread_info ABTXI_prof_thread_info;
typedef struct ABTXI_prof_thread_data ABTXI_prof_thread_data;
typedef struct ABTXI_prof_xstream_data ABTXI_prof_xstream_data;
typedef struct ABTXI_prof_xstream_info ABTXI_prof_xstream_info;
typedef struct ABTXI_prof_global ABTXI_prof_global;
typedef struct ABTXI_prof_data_table ABTXI_prof_data_table;
typedef struct ABTXI_prof_str_mem ABTXI_prof_str_mem;
typedef struct ABTXI_prof_spinlock ABTXI_prof_spinlock;
/* Minimal spinlock used by the ABTXI_prof_spin_* helpers. */
struct ABTXI_prof_spinlock {
/* Lock word: cleared/zero when free, set by the test-and-set in
 * ABTXI_prof_spin_lock() while held. */
volatile int val;
};
/* Running statistics over time samples of type ABTXI_PROF_T. */
struct ABTXI_prof_wu_time {
    ABTXI_PROF_T max_val; /* Largest sample; meaningful only if cnt > 0 */
    ABTXI_PROF_T min_val; /* Smallest sample; meaningful only if cnt > 0 */
    ABTXI_PROF_T sum;     /* Sum of all samples */
    uint64_t cnt;         /* Number of samples */
};
/* Running statistics over unsigned counter samples. */
struct ABTXI_prof_wu_count {
    uint64_t max_val; /* Largest sample; meaningful only if cnt > 0 */
    uint64_t min_val; /* Smallest sample; meaningful only if cnt > 0 */
    uint64_t sum;     /* Sum of all samples */
    uint64_t cnt;     /* Number of samples */
};
#if ABTXI_PROF_USE_TIME_LOCAL
/* Running statistics over per-thread timer samples (ABTXI_PROF_LOCAL_T). */
struct ABTXI_prof_wu_local_time {
    ABTXI_PROF_LOCAL_T max_val; /* Largest sample; meaningful only if cnt > 0 */
    ABTXI_PROF_LOCAL_T min_val; /* Smallest sample; meaningful only if cnt > 0 */
    ABTXI_PROF_LOCAL_T sum;     /* Sum of all samples */
    uint64_t cnt;               /* Number of samples */
};
#endif
/* Per-work-unit profiling record. It is zero-cleared on init/reset, and
 * ABTXI_prof_merge_thread_data() treats a zero time value as "not recorded". */
struct ABTXI_prof_thread_data {
/* Counters folded into the xstream's wu_counts when the record is merged. */
int num_revivals;
int num_yields;
int num_suspensions;
int num_xstream_changes; /* At least one if it runs on only one xstream */
/* Timestamps in ABTXI_PROF_T units. */
ABTXI_PROF_T time_created;
ABTXI_PROF_T time_first_run;
ABTXI_PROF_T time_last_run;
#if ABTXI_PROF_USE_TIME_LOCAL
ABTXI_PROF_LOCAL_T time_last_run_local;
#endif
ABTXI_PROF_T time_last_finish;
/* Elapsed time reported as the "ULT/elapsed" statistic. */
ABTXI_PROF_T time_elapsed;
#if ABTXI_PROF_USE_TIME_LOCAL
/* Elapsed time reported as the "ULT/actual_elapsed" statistic. */
ABTXI_PROF_LOCAL_T time_elapsed_local;
#endif
/* NOTE(review): presumably the xstream this work unit last ran on, used to
 * detect xstream changes -- confirm at the point where it is written. */
ABT_xstream prev_xstream;
/* Work unit this record currently belongs to. ABTXI_prof_get_thread_info()
 * compares it with the queried thread to detect a stale record. */
ABT_thread owner;
};
/* Pool entry wrapping one ABTXI_prof_thread_data. Each entry sits on two
 * singly-linked lists owned by an ABTXI_prof_xstream_info. */
struct ABTXI_prof_thread_info {
ABTXI_prof_thread_data d;
/* Next entry on the free list headed by p_thread_unused. */
ABTXI_prof_thread_info *p_next_unused; /* p_thread_unused. */
/* Next entry on the list of every allocated entry, headed by p_thread_all. */
ABTXI_prof_thread_info *p_next_all; /* p_thread_all */
};
/* Statistics accumulated for one execution stream (or external thread).
 * Which timer fields exist depends on ABTXI_PROF_USE_TIME_LOCAL: the global
 * timer (ABTXI_PROF_T) when it is 0, the per-thread timer
 * (ABTXI_PROF_LOCAL_T) when it is 1. */
struct ABTXI_prof_xstream_data {
int cur_depth; /* Stack depth value "+1" (0: uninitialized) */
/* Number of occurrences of each event, indexed by ABTXI_PROF_EVENT_*. */
uint64_t num_events[ABTXI_PROF_EVENT_END_];
#if !ABTXI_PROF_USE_TIME_LOCAL
/* One slot per depth, ABTXI_PROF_MAX_DEPTH slots in total. */
ABTXI_PROF_T times_last_run[ABTXI_PROF_MAX_DEPTH];
#else
/* First and last values of ABTXI_prof_get_time_local() */
ABTXI_PROF_LOCAL_T time_first_run_local;
ABTXI_PROF_LOCAL_T time_last_run_local;
ABTXI_PROF_LOCAL_T times_last_run_local[ABTXI_PROF_MAX_DEPTH];
#endif
#if !ABTXI_PROF_USE_TIME_LOCAL
ABTXI_PROF_T times_elapsed[ABTXI_PROF_MAX_DEPTH]; /* index = depth */
#else
ABTXI_PROF_LOCAL_T times_elapsed_local[ABTXI_PROF_MAX_DEPTH];
#endif
/* Per-work-unit statistics, filled by ABTXI_prof_merge_thread_data() and
 * indexed by ABTXI_PROF_WU_TIME_* and ABTXI_PROF_WU_COUNT_*. */
ABTXI_prof_wu_time wu_times[ABTXI_PROF_WU_TIME_END_];
ABTXI_prof_wu_count wu_counts[ABTXI_PROF_WU_COUNT_END_];
#if ABTXI_PROF_USE_TIME_LOCAL
/* Indexed by ABTXI_PROF_WU_LOCAL_TIME_*. */
ABTXI_prof_wu_local_time wu_local_times[ABTXI_PROF_WU_LOCAL_TIME_END_];
#endif
};
/* Per-execution-stream profiler state: the accumulated statistics plus a
 * private pool of ABTXI_prof_thread_info entries. */
struct ABTXI_prof_xstream_info {
int rank; /* -1 if external threads */
int tag; /* Tag for consistency. Odd value means dirty. */
ABTXI_prof_xstream_data d;
/* Memory pool */
/* Head of the list of every entry ever carved out (linked by p_next_all).
 * It is published with a release store because a printing thread may read
 * it asynchronously. */
ABTXI_prof_thread_info *p_thread_all;
/* Head of the free list (linked by p_next_unused). */
ABTXI_prof_thread_info *p_thread_unused;
/* The first pointer-sized slot of each block links to the next block. */
void *p_memblock_head; /* List of memory blocks */
/* Back pointer to the owning profiler instance. */
ABTXI_prof_global *p_global;
/* NOTE(review): presumably the next element of the list headed by
 * ABTXI_prof_global::p_xstream_info_head -- confirm at the insertion site. */
ABTXI_prof_xstream_info *p_next;
};
struct ABTXI_prof_data_table {
int num_columns;
const char **column_names;
int num_rows;
const char **row_names;
double *values; /* [row * num_cols + col] */
};
/* One block of a chained string arena. ABTXI_prof_sprintf() appends
 * NUL-terminated strings at `cursor` and links a new block once the current
 * one is too small. */
struct ABTXI_prof_str_mem {
    int len;                    /* Capacity of s in bytes */
    int cursor;                 /* Offset of the first unused byte in s */
    char *s;                    /* Character storage of this block */
    ABTXI_prof_str_mem *p_next; /* Next block in the chain, or 0 */
};
/* Values of ABTXI_prof_global::state. */
#define ABTXI_PROF_GLOBAL_STATE_CLEAN 0
#define ABTXI_PROF_GLOBAL_STATE_RUNNING 1
#define ABTXI_PROF_GLOBAL_STATE_STOPPED 2
/* State of one profiler instance. */
struct ABTXI_prof_global {
/* Work-unit-specific key under which each work unit's
 * ABTXI_prof_thread_info pointer is stored (see
 * ABTXI_prof_get_thread_info()). */
ABT_key prof_key;
/* NOTE(review): presumably one of ABTX_PROF_MODE_* as passed to
 * ABTX_prof_start() -- confirm at the assignment. */
int prof_mode;
double to_sec; /* ABTXI_PROF_T -> double [s] */
int state; /* 0: clean, 1: running, 2: stopped */
ABTXI_PROF_T start_prof_time;
ABTXI_PROF_T stop_prof_time;
/* xstream_info_key is for external threads. */
pthread_key_t xstream_info_key;
ABTXI_prof_spinlock xstreams_lock; /* spinlock */
ABTXI_prof_xstream_info *p_xstream_info_head;
int len_p_xstreams; /* Length of p_xstreams. */
ABTXI_prof_xstream_info **p_xstreams; /* Can be referenced by "rank" */
void *mem_p_xstreams; /* Memory list of p_xstreams */
};
/* Put the spinlock into the unlocked state. */
static inline void ABTXI_prof_spin_init(ABTXI_prof_spinlock *p_lock)
{
#ifdef ABTXI_PROF_USE_SYNC_BUILTIN
    p_lock->val = 0;
#else
    __atomic_clear(&p_lock->val, __ATOMIC_RELAXED);
#endif
}
/* Tear down the spinlock. No resource is held, so this only resets the lock
 * word to the unlocked state. */
static inline void ABTXI_prof_spin_destroy(ABTXI_prof_spinlock *p_lock)
{
#ifdef ABTXI_PROF_USE_SYNC_BUILTIN
    p_lock->val = 0;
#else
    __atomic_clear(&p_lock->val, __ATOMIC_RELAXED);
#endif
}
/* Acquire the spinlock, busy-waiting until the test-and-set reports that the
 * lock word was previously clear. */
static inline void ABTXI_prof_spin_lock(ABTXI_prof_spinlock *p_lock)
{
#ifdef ABTXI_PROF_USE_SYNC_BUILTIN
    for (;;) {
        if (!__sync_lock_test_and_set(&p_lock->val, 1)) {
            break;
        }
        /* Still held by someone else: full barrier, then retry. */
        __sync_synchronize();
    }
#else
    for (;;) {
        if (!__atomic_test_and_set(&p_lock->val, __ATOMIC_ACQUIRE)) {
            break;
        }
    }
#endif
}
/* Release the spinlock with release semantics. */
static inline void ABTXI_prof_spin_unlock(ABTXI_prof_spinlock *p_lock)
{
#ifdef ABTXI_PROF_USE_SYNC_BUILTIN
    __sync_lock_release(&p_lock->val);
#else
    __atomic_clear(&p_lock->val, __ATOMIC_RELEASE);
#endif
}
/* Atomically read *p_int with no ordering guarantee. */
static inline int ABTXI_prof_atomic_relaxed_load_int(int *p_int)
{
#ifdef ABTXI_PROF_USE_SYNC_BUILTIN
    /* Fallback: a volatile read stands in for the relaxed atomic load. */
    return *((volatile int *)p_int);
#else
    return __atomic_load_n(p_int, __ATOMIC_RELAXED);
#endif
}
/*
 * Atomically read *p_int with acquire semantics: memory accesses that follow
 * this load cannot be reordered before it. Pairs with
 * ABTXI_prof_atomic_release_store_int().
 *
 * Returns the value read.
 */
static inline int ABTXI_prof_atomic_acquire_load_int(int *p_int)
{
#ifndef ABTXI_PROF_USE_SYNC_BUILTIN
    /* Must be ACQUIRE, not RELAXED; a relaxed load would not synchronize
     * with the matching release store. */
    return __atomic_load_n(p_int, __ATOMIC_ACQUIRE);
#else
    /* Fallback: a volatile read fenced by full barriers on both sides. */
    int ret;
    __sync_synchronize();
    ret = *((volatile int *)p_int);
    __sync_synchronize();
    return ret;
#endif
}
/* Atomically write val to *p_int with no ordering guarantee. */
static inline void ABTXI_prof_atomic_relaxed_store_int(int *p_int, int val)
{
#ifdef ABTXI_PROF_USE_SYNC_BUILTIN
    /* Fallback: a volatile write stands in for the relaxed atomic store. */
    *((volatile int *)p_int) = val;
#else
    __atomic_store_n(p_int, val, __ATOMIC_RELAXED);
#endif
}
/* Atomically write val to *p_int with release semantics: memory accesses
 * that precede this store cannot be reordered after it. */
static inline void ABTXI_prof_atomic_release_store_int(int *p_int, int val)
{
#ifdef ABTXI_PROF_USE_SYNC_BUILTIN
    /* Fallback: a volatile write fenced by full barriers on both sides. */
    __sync_synchronize();
    *((volatile int *)p_int) = val;
    __sync_synchronize();
#else
    __atomic_store_n(p_int, val, __ATOMIC_RELEASE);
#endif
}
/* Atomically read the pointer stored at *p_ptr with no ordering guarantee. */
static inline void *ABTXI_prof_atomic_relaxed_load_ptr(void **p_ptr)
{
#ifdef ABTXI_PROF_USE_SYNC_BUILTIN
    /* Fallback: a volatile read stands in for the relaxed atomic load. */
    return *((void *volatile *)p_ptr);
#else
    return __atomic_load_n(p_ptr, __ATOMIC_RELAXED);
#endif
}
/*
 * Atomically read the pointer stored at *p_ptr with acquire semantics:
 * memory accesses that follow this load cannot be reordered before it, so
 * data published before the matching ABTXI_prof_atomic_release_store_ptr()
 * is visible through the returned pointer.
 *
 * Returns the pointer read.
 */
static inline void *ABTXI_prof_atomic_acquire_load_ptr(void **p_ptr)
{
#ifndef ABTXI_PROF_USE_SYNC_BUILTIN
    /* Must be ACQUIRE, not RELAXED; a relaxed load would not synchronize
     * with the matching release store. */
    return __atomic_load_n(p_ptr, __ATOMIC_ACQUIRE);
#else
    /* Fallback: a volatile read fenced by full barriers on both sides. */
    void *ret;
    __sync_synchronize();
    ret = *((void *volatile *)p_ptr);
    __sync_synchronize();
    return ret;
#endif
}
/* Atomically write val to *p_ptr with no ordering guarantee. */
static inline void ABTXI_prof_atomic_relaxed_store_ptr(void **p_ptr, void *val)
{
#ifdef ABTXI_PROF_USE_SYNC_BUILTIN
    /* Fallback: a volatile write stands in for the relaxed atomic store. */
    *((void *volatile *)p_ptr) = val;
#else
    __atomic_store_n(p_ptr, val, __ATOMIC_RELAXED);
#endif
}
/* Atomically write val to *p_ptr with release semantics: memory accesses
 * that precede this store cannot be reordered after it. */
static inline void ABTXI_prof_atomic_release_store_ptr(void **p_ptr, void *val)
{
#ifdef ABTXI_PROF_USE_SYNC_BUILTIN
    /* Fallback: a volatile write fenced by full barriers on both sides. */
    __sync_synchronize();
    *((void *volatile *)p_ptr) = val;
    __sync_synchronize();
#else
    __atomic_store_n(p_ptr, val, __ATOMIC_RELEASE);
#endif
}
/* Fold one time sample into the running max/min/sum/count statistics. The
 * first sample (cnt == 0) initializes both max_val and min_val. */
static inline void ABTXI_prof_wu_time_add(ABTXI_prof_wu_time *p_wu_time,
                                          ABTXI_PROF_T val)
{
    const int is_first = ABTXI_prof_unlikely(p_wu_time->cnt == 0);

    if (is_first || p_wu_time->max_val < val) {
        p_wu_time->max_val = val;
    }
    if (is_first || p_wu_time->min_val > val) {
        p_wu_time->min_val = val;
    }
    p_wu_time->sum += val;
    p_wu_time->cnt += 1;
}
/* Accumulate the statistics in *p_src into *p_dest. If *p_dest is empty it
 * adopts the source extremes as-is; otherwise the extremes are combined, but
 * only when the source actually holds samples. */
static void ABTXI_prof_wu_time_merge(ABTXI_prof_wu_time *p_dest,
                                     const ABTXI_prof_wu_time *p_src)
{
    const int dest_is_empty = (p_dest->cnt == 0);
    const int src_has_samples = (p_src->cnt != 0);

    if (dest_is_empty ||
        (src_has_samples && p_dest->max_val < p_src->max_val)) {
        p_dest->max_val = p_src->max_val;
    }
    if (dest_is_empty ||
        (src_has_samples && p_dest->min_val > p_src->min_val)) {
        p_dest->min_val = p_src->min_val;
    }
    p_dest->sum += p_src->sum;
    p_dest->cnt += p_src->cnt;
}
/* Fold one counter sample into the running max/min/sum/count statistics. The
 * first sample (cnt == 0) initializes both max_val and min_val. */
static inline void ABTXI_prof_wu_count_add(ABTXI_prof_wu_count *p_wu_count,
                                           uint64_t val)
{
    const int is_first = ABTXI_prof_unlikely(p_wu_count->cnt == 0);

    if (is_first || p_wu_count->max_val < val) {
        p_wu_count->max_val = val;
    }
    if (is_first || p_wu_count->min_val > val) {
        p_wu_count->min_val = val;
    }
    p_wu_count->sum += val;
    p_wu_count->cnt += 1;
}
/* Accumulate the statistics in *p_src into *p_dest. If *p_dest is empty it
 * adopts the source extremes as-is; otherwise the extremes are combined, but
 * only when the source actually holds samples. */
static void ABTXI_prof_wu_count_merge(ABTXI_prof_wu_count *p_dest,
                                      const ABTXI_prof_wu_count *p_src)
{
    const int dest_is_empty = (p_dest->cnt == 0);
    const int src_has_samples = (p_src->cnt != 0);

    if (dest_is_empty ||
        (src_has_samples && p_dest->max_val < p_src->max_val)) {
        p_dest->max_val = p_src->max_val;
    }
    if (dest_is_empty ||
        (src_has_samples && p_dest->min_val > p_src->min_val)) {
        p_dest->min_val = p_src->min_val;
    }
    p_dest->sum += p_src->sum;
    p_dest->cnt += p_src->cnt;
}
#if ABTXI_PROF_USE_TIME_LOCAL
/* Fold one per-thread-timer sample into the running max/min/sum/count
 * statistics. The first sample (cnt == 0) initializes both extremes. */
static inline void
ABTXI_prof_wu_local_time_add(ABTXI_prof_wu_local_time *p_wu_local_time,
                             ABTXI_PROF_LOCAL_T val)
{
    const int is_first = ABTXI_prof_unlikely(p_wu_local_time->cnt == 0);

    if (is_first || p_wu_local_time->max_val < val) {
        p_wu_local_time->max_val = val;
    }
    if (is_first || p_wu_local_time->min_val > val) {
        p_wu_local_time->min_val = val;
    }
    p_wu_local_time->sum += val;
    p_wu_local_time->cnt += 1;
}
/* Accumulate the statistics in *p_src into *p_dest. If *p_dest is empty it
 * adopts the source extremes as-is; otherwise the extremes are combined, but
 * only when the source actually holds samples. */
static void
ABTXI_prof_wu_local_time_merge(ABTXI_prof_wu_local_time *p_dest,
                               const ABTXI_prof_wu_local_time *p_src)
{
    const int dest_is_empty = (p_dest->cnt == 0);
    const int src_has_samples = (p_src->cnt != 0);

    if (dest_is_empty ||
        (src_has_samples && p_dest->max_val < p_src->max_val)) {
        p_dest->max_val = p_src->max_val;
    }
    if (dest_is_empty ||
        (src_has_samples && p_dest->min_val > p_src->min_val)) {
        p_dest->min_val = p_src->min_val;
    }
    p_dest->sum += p_src->sum;
    p_dest->cnt += p_src->cnt;
}
#endif
static ABTXI_prof_str_mem *ABTXI_prof_str_mem_alloc(int reserved)
{
ABTXI_prof_str_mem *p_str =
(ABTXI_prof_str_mem *)malloc(sizeof(ABTXI_prof_str_mem) + reserved);
p_str->s = ((char *)p_str) + sizeof(ABTXI_prof_str_mem);
p_str->len = reserved;
p_str->cursor = 0;
p_str->p_next = 0;
return p_str;
}
/*
 * Format a string into the arena and return a pointer to it.
 *
 * p_str  : any block of the arena chain; the string is appended to the last
 *          block, and a new block is linked if that one lacks space.
 * max_n  : number of bytes the caller expects the result to need, including
 *          the terminating NUL.
 * format : printf-style format followed by its arguments.
 *
 * Returns the NUL-terminated string, which stays valid until the arena is
 * released with ABTXI_prof_str_mem_free().
 *
 * The write is bounded by the space left in the block, so output longer than
 * expected is truncated instead of overrunning the buffer.
 */
static char *ABTXI_prof_sprintf(ABTXI_prof_str_mem *p_str, size_t max_n,
                                const char *format, ...)
{
    /* Only the tail block of the chain has free space. */
    while (p_str->p_next) {
        p_str = p_str->p_next;
    }
    /* Even an empty string needs one byte for its terminator. */
    const size_t needed = max_n > 0 ? max_n : 1;
    int avail = p_str->len - p_str->cursor;
    if (avail < 0 || (size_t)avail < needed) {
        int newlen = needed > 4096 ? (int)needed : 4096;
        ABTXI_prof_str_mem *p_new = ABTXI_prof_str_mem_alloc(newlen);
        p_str->p_next = p_new;
        p_str = p_new;
        avail = p_str->len - p_str->cursor;
    }
    char *s = p_str->s + p_str->cursor;

    va_list args;
    va_start(args, format);
    int len = vsnprintf(s, (size_t)avail, format, args);
    va_end(args);

    if (len < 0) {
        /* Encoding error: hand back an empty string. */
        s[0] = '\0';
        len = 0;
    } else if (len >= avail) {
        /* vsnprintf() reports the untruncated length; only avail - 1
         * characters plus the NUL were actually stored. */
        len = avail - 1;
    }
    p_str->cursor += len + 1;
    return s;
}
/* Free every block of the arena chain starting at p_str. A null p_str is
 * accepted and does nothing. */
static void ABTXI_prof_str_mem_free(ABTXI_prof_str_mem *p_str)
{
    ABTXI_prof_str_mem *p_following;
    for (ABTXI_prof_str_mem *p_cur = p_str; p_cur; p_cur = p_following) {
        /* Read the link before the block that holds it is released. */
        p_following = p_cur->p_next;
        free(p_cur);
    }
}
/*
 * Grow the thread_info pool of p_xstream_info by one memory block.
 *
 * A zero-filled block of ABTXI_PROF_MEM_BLOCK_SIZE bytes is allocated and
 * pushed onto the memblock list. Everything past its first 128 bytes is
 * carved into ABTXI_prof_thread_info entries, each of which is pushed onto
 * both the free list and the all-entries list.
 * NOTE(review): the result of calloc() is not checked.
 */
static void ABTXI_prof_xstream_info_alloc_thread_info(
    ABTXI_prof_xstream_info *p_xstream_info)
{
    char *p_block = (char *)calloc(1, ABTXI_PROF_MEM_BLOCK_SIZE);

    /* Push the newly allocated block onto the memblock list. The block's
     * first pointer-sized slot stores the link to the previous head. */
    *(void **)p_block = p_xstream_info->p_memblock_head;
    p_xstream_info->p_memblock_head = (void *)p_block;

    /* Carve entries out of the block. The first 128 bytes are skipped for
     * safe alignment and because they hold the memblock link. */
    ABTXI_prof_thread_info *p_unused_head = p_xstream_info->p_thread_unused;
    ABTXI_prof_thread_info *p_all_head = p_xstream_info->p_thread_all;
    for (size_t pos = 128;
         pos + sizeof(ABTXI_prof_thread_info) <= ABTXI_PROF_MEM_BLOCK_SIZE;
         pos += sizeof(ABTXI_prof_thread_info)) {
        ABTXI_prof_thread_info *p_entry =
            (ABTXI_prof_thread_info *)(p_block + pos);
        p_entry->p_next_unused = p_unused_head;
        p_entry->p_next_all = p_all_head;
        p_unused_head = p_entry;
        p_all_head = p_entry;
    }
    p_xstream_info->p_thread_unused = p_unused_head;
    /* p_thread_all must be published atomically because a printing thread
     * may read it asynchronously. */
    ABTXI_prof_atomic_release_store_ptr((void **)&p_xstream_info->p_thread_all,
                                        (void *)p_all_head);
}
/* Zero-clear the data part of a thread_info entry. The two list links are
 * left untouched. */
static inline void
ABTXI_prof_init_thread_info(ABTXI_prof_thread_info *p_thread_info)
{
    memset(&p_thread_info->d, 0, sizeof(p_thread_info->d));
}
/* Reset a thread_info entry for a profiler restart: its free-list link is
 * pointed at its all-entries successor, so that the whole all-entries list
 * becomes the free list, and its data is zero-cleared. */
static void ABTXI_prof_reset_thread_info(ABTXI_prof_thread_info *p_thread_info)
{
    p_thread_info->p_next_unused = p_thread_info->p_next_all;
    memset(&p_thread_info->d, 0, sizeof(p_thread_info->d));
}
/*
 * Return the profiling record attached to `thread`, attaching a fresh one
 * from p_xstream_info's pool if the thread has none or its record is stale.
 *
 * p_global       : profiler instance; supplies the work-unit-specific key.
 * p_xstream_info : pool from which a new record is taken when needed.
 * thread         : work unit whose record is requested.
 *
 * Multiple thread events will not be invoked for the same thread, so the
 * lookup needs no locking.
 */
static inline ABTXI_prof_thread_info *
ABTXI_prof_get_thread_info(ABTXI_prof_global *p_global,
                           ABTXI_prof_xstream_info *p_xstream_info,
                           ABT_thread thread)
{
    /* Start from NULL so that a failed lookup, which may leave the output
     * untouched, is treated as "no record" instead of reading an
     * indeterminate pointer. */
    ABTXI_prof_thread_info *p_thread_info = NULL;
    ABT_key prof_key = p_global->prof_key;
    ABT_thread_get_specific(thread, prof_key, (void **)&p_thread_info);

    /* owner can be changed if thread_info has been reset by restarting the
     * profiler. In that case this p_thread_info no longer belongs to this
     * thread, so a new one must be allocated. */
    if (ABTXI_prof_likely(p_thread_info && p_thread_info->d.owner == thread)) {
        return p_thread_info;
    }

    /* Refill the pool if the free list is empty, then pop its head. */
    if (!p_xstream_info->p_thread_unused) {
        ABTXI_prof_xstream_info_alloc_thread_info(p_xstream_info);
    }
    p_thread_info = p_xstream_info->p_thread_unused;
    /* This p_thread_info has been already initialized. */
    p_xstream_info->p_thread_unused = p_thread_info->p_next_unused;
    ABT_thread_set_specific(thread, prof_key, (void *)p_thread_info);
    p_thread_info->d.owner = thread;
    return p_thread_info;
}
/*
 * Fold one work unit's record into the per-xstream statistics.
 *
 * p_xstream_data : destination statistics.
 * p_thread_data  : record of the work unit being merged.
 * time_freed     : time at which the work unit was freed, or
 *                  ABTXI_PROF_T_ZERO if unknown.
 *
 * The four counters are always added. A time statistic is added only when
 * every timestamp it depends on is non-zero, i.e. was actually recorded.
 */
static inline void
ABTXI_prof_merge_thread_data(ABTXI_prof_xstream_data *p_xstream_data,
                             const ABTXI_prof_thread_data *p_thread_data,
                             ABTXI_PROF_T time_freed)
{
    ABTXI_prof_wu_count *const p_counts = p_xstream_data->wu_counts;
    ABTXI_prof_wu_time *const p_times = p_xstream_data->wu_times;

    /* Counters. */
    ABTXI_prof_wu_count_add(&p_counts[ABTXI_PROF_WU_COUNT_THREAD_NUM_REVIVALS],
                            p_thread_data->num_revivals);
    ABTXI_prof_wu_count_add(&p_counts[ABTXI_PROF_WU_COUNT_THREAD_NUM_YIELDS],
                            p_thread_data->num_yields);
    ABTXI_prof_wu_count_add(
        &p_counts[ABTXI_PROF_WU_COUNT_THREAD_NUM_SUSPENSIONS],
        p_thread_data->num_suspensions);
    ABTXI_prof_wu_count_add(
        &p_counts[ABTXI_PROF_WU_COUNT_THREAD_NUM_XSTREAM_CHANGES],
        p_thread_data->num_xstream_changes);

    /* Times: snapshot the timestamps and note which ones were recorded. */
    const ABTXI_PROF_T t_created = p_thread_data->time_created;
    const ABTXI_PROF_T t_first_run = p_thread_data->time_first_run;
    const ABTXI_PROF_T t_last_finish = p_thread_data->time_last_finish;
    const ABTXI_PROF_T t_elapsed = p_thread_data->time_elapsed;
    const int has_created = (t_created != ABTXI_PROF_T_ZERO);
    const int has_first_run = (t_first_run != ABTXI_PROF_T_ZERO);
    const int has_last_finish = (t_last_finish != ABTXI_PROF_T_ZERO);
    const int has_freed = (time_freed != ABTXI_PROF_T_ZERO);

    if (ABTXI_prof_likely(t_elapsed != ABTXI_PROF_T_ZERO)) {
        ABTXI_prof_wu_time_add(&p_times[ABTXI_PROF_WU_TIME_THREAD_ELAPSED],
                               t_elapsed);
    }
    if (ABTXI_prof_likely(has_created && has_first_run)) {
        ABTXI_prof_wu_time_add(
            &p_times[ABTXI_PROF_WU_TIME_THREAD_CREATE_FIRST_RUN],
            t_first_run - t_created);
    }
    if (ABTXI_prof_likely(has_first_run && has_last_finish)) {
        ABTXI_prof_wu_time_add(
            &p_times[ABTXI_PROF_WU_TIME_THREAD_FIRST_RUN_LAST_FINISH],
            t_last_finish - t_first_run);
    }
    if (ABTXI_prof_likely(has_created && has_last_finish)) {
        ABTXI_prof_wu_time_add(
            &p_times[ABTXI_PROF_WU_TIME_THREAD_CREATE_LAST_FINISH],
            t_last_finish - t_created);
    }
    if (ABTXI_prof_likely(has_created && has_freed)) {
        ABTXI_prof_wu_time_add(&p_times[ABTXI_PROF_WU_TIME_THREAD_CREATE_FREE],
                               time_freed - t_created);
    }
#if ABTXI_PROF_USE_TIME_LOCAL
    /* Per-thread-timer statistics. */
    const ABTXI_PROF_LOCAL_T t_elapsed_local =
        p_thread_data->time_elapsed_local;
    if (ABTXI_prof_likely(t_elapsed_local != ABTXI_PROF_LOCAL_T_ZERO)) {
        ABTXI_prof_wu_local_time_add(
            &p_xstream_data
                 ->wu_local_times[ABTXI_PROF_WU_LOCAL_TIME_THREAD_ELAPSED],
            t_elapsed_local);
    }
#endif
}
/*
 * Retire a work unit's record: merge it into the xstream statistics,
 * zero-clear its data, and push the entry back onto the xstream's free list.
 * time_freed is forwarded to the merge as the work unit's free time.
 */
static inline void
ABTXI_prof_release_thread_info(ABTXI_prof_xstream_info *p_xstream_info,
                               ABTXI_prof_thread_info *p_thread_info,
                               ABTXI_PROF_T time_freed)
{
    /* Account for the record before its contents are wiped. */
    ABTXI_prof_merge_thread_data(&p_xstream_info->d, &p_thread_info->d,
                                 time_freed);
    /* Zero-clear the data so that the entry is ready for reuse. */
    memset(&p_thread_info->d, 0, sizeof(p_thread_info->d));
    /* Push onto the free list. */
    p_thread_info->p_next_unused = p_xstream_info->p_thread_unused;
    p_xstream_info->p_thread_unused = p_thread_info;
}
/* Initialize an xstream_info: every field is zero-cleared, then the back
 * pointer to the profiler instance and the rank are recorded. */
static void
ABTXI_prof_init_xstream_info(ABTXI_prof_global *p_global,
                             ABTXI_prof_xstream_info *p_xstream_info, int rank)
{
    memset(p_xstream_info, 0, sizeof(*p_xstream_info));
    p_xstream_info->rank = rank;
    p_xstream_info->p_global = p_global;
}
/*
 * Reset an xstream_info for a profiler restart: the statistics are
 * zero-cleared and every thread_info entry ever allocated is reset and
 * returned to the free list. Memory blocks, rank, and the other fields are
 * kept.
 */
static void
ABTXI_prof_reset_xstream_info(ABTXI_prof_xstream_info *p_xstream_info)
{
    ABTXI_prof_thread_info *const p_all_head = p_xstream_info->p_thread_all;

    memset(&p_xstream_info->d, 0, sizeof(p_xstream_info->d));

    /* Reset each entry; resetting leaves p_next_all intact, so it is safe
     * to follow that link afterwards. */
    for (ABTXI_prof_thread_info *p_entry = p_all_head; p_entry;
         p_entry = p_entry->p_next_all) {
        ABTXI_prof_reset_thread_info(p_entry);
    }
    /* The whole all-entries list is now the free list. */
    p_xstream_info->p_thread_unused = p_all_head;
}
static void
ABTXI_prof_destroy_xstream_info(ABTXI_prof_xstream_info *p_xstream_info)
{