/
sincostan.c
1102 lines (839 loc) · 34.7 KB
/
sincostan.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/* sincostan.c -- sin, cos, and tan for standard math library.
These are preliminary versions of sin, cos, and tan written to provide
argument reduction using the correct value of pi, rather than pi rounded to
66-bits as used by the Intel trigonometric instructions. Versions to
enhance accuracy and speed may follow.
*/
#include <math.h>
// Describe the destination floating-point type.
#define SignificandBits 53
#define ExponentBits 11
/* Define a structure for accessing the internal components of a double.
s.exponent is the raw exponent field.
s.significand2 is the least two significant bits of the significand field.
The other fields are unused.
*/
typedef union
{
double d;
struct
{
#if defined __BIG_ENDIAN__
unsigned int sign : 1;
unsigned int exponent : ExponentBits;
unsigned int significand0 : SignificandBits-1-32;
unsigned int significand1 : 30;
unsigned int significand2 : 2;
#else // defined __BIG_ENDIAN__
unsigned int significand2 : 2;
unsigned int significand1 : 30;
unsigned int significand0 : SignificandBits-1-32;
unsigned int exponent : ExponentBits;
unsigned int sign : 1;
#endif // defined __BIG_ENDIAN__
} s;
} Double;
/* Each of the following numbers is a strict upper bound on some technique
for calculating sin, cos, or tan. That is, the technique is known to apply
when |x| < bound, where x is the function argument. (ReduceFull also
requires that \x| be greater than some amount around 0x1p-622, to avoid
underflow or loss of accuracy.)
If |x| < BoundDenormal, x is denormal or zero.
If |x| < BoundTrivialSin, x is the correctly rounded sine of x.
If |x| < BoundTrivialCos, 1 is the correctly rounded cosine of x.
If |x| < BoundTrivialTan, x is the correctly rounded tangent of x.
If |x| < BoundPolynomial, sinp(x) or cosp(x) may be used for sin(x) or
cos(x), with no argument reduction.
If |x| < BoundMedium, ReduceMedium may be used to reduce x.
If |x| < BoundFull, ReduceFull may be used to reduce x.
Otherwise, x is an infinity or a NaN.
*/
#define BoundDenormal 0x1p-1022
#define BoundTrivialSin 0x1.7137449123ef7p-26
#define BoundTrivialCos 0x1.6A09E667F3BCDp-27
#define BoundTrivialTan 0x1.250BFE1B082F5p-26
#define BoundPolynomial 0x3.243f6a8885a31p-2
#define BoundMedium 0x1.000013be57a40p19
#define BoundFull INFINITY
/* Below, we define methods to test whether a function argument, x, is within
bounds suitable for a techique for implementing the function. Several
methods are defined, using natural floating-point comparisons or using
integer comparisons on 32 or 64 bits of the floating-point representation.
Implementations may choose whichever method is best for performance.
To use, first prepare the key used for checking bounds:
BoundKeyType xk = Key(x);
That is done once at the start of a routine. Then, as many times as
desired, bounds are checked:
if (Within(xk, bound))
...
The "bound" must be one of the names defined above, such as BoundMedium.
If Within(xk, bound) evaluates to true, then |x| < bound. The converse is
not necessarily true; if Within(xk, bound) is false, |x| might be less than
bound. This is because some implementations of Within do not check all
bits of xk and bound. They are intended only as fast screens that get the
majority of values. However, if floor(log[2](|x|)) < floor(log[2](bound)),
Within(xk, bound) is guaranteed to be true -- its evaluation includes at
least all of the floating-point exponent.
*/
#if defined __i386__ // (Add any other desired architectures.)
/* This method uses integer comparisons with the most significant 32 bits
of the IEEE-754 representation of the floating-point number.
*/
#include <inttypes.h>
typedef uint32_t BoundKeyType;
typedef union
{
double d;
#if defined __BIG_ENDIAN__
struct { uint32_t key, notkey; } s;
#else
struct { uint32_t notkey, key; } s;
#endif
} DoubleForBound;
// Prepare constants containing the bounds in our desired form.
#define DefineBound(bound) \
static const DoubleForBound Double##bound = { bound };
DefineBound(BoundDenormal)
DefineBound(BoundTrivialSin)
DefineBound(BoundTrivialCos)
DefineBound(BoundTrivialTan)
DefineBound(BoundPolynomial)
DefineBound(BoundMedium)
DefineBound(BoundFull)
// Get the key for |x|.
static BoundKeyType Key(double x)
{
const DoubleForBound X = { x };
BoundKeyType t = X.s.key;
return t & 0x7fffffff;
}
#define Within(xk, bound) ((xk) < Double##bound.s.key)
#elif defined __x86_64__ // (Add any other desired architectures.)
/* This method uses integer comparisons with all 64 bits of the IEEE-754
representation of the floating-point number.
*/
#include <inttypes.h>
typedef uint64_t BoundKeyType;
typedef union
{
double d;
uint64_t key;
} DoubleForBound;
// Prepare constants containing the bounds in our desired form.
#define DefineBound(bound) \
static const DoubleForBound Double##bound = { bound };
DefineBound(BoundDenormal)
DefineBound(BoundTrivialSin)
DefineBound(BoundTrivialCos)
DefineBound(BoundTrivialTan)
DefineBound(BoundPolynomial)
DefineBound(BoundMedium)
DefineBound(BoundFull)
// Get the key for |x|.
static BoundKeyType Key(double x)
{
const DoubleForBound X = { x };
BoundKeyType t = X.key;
return t & 0x7fffffffffffffff;
}
#define Within(xk, bound) ((xk) < Double##bound.key)
#else
// If no other method is selected, use regular floating-point comparison.
typedef double BoundKeyType;
#define DefineBound(bound)
// Get the key for |x|.
static BoundKeyType Key(double x)
{
#if defined __i386__ || defined __x86_64__
// On Intel, just clear the sign bit.
double xk = -0.;
__asm__("andnpd %[x], %[xk]" : [xk] "+x" (xk) : [x] "x" (x));
return xk;
#elif 1
return x < 0 ? -x : +x;
#else
return fabs(x);
#endif
}
#define Within(xk, bound) ((xk) < bound)
#endif
/* Several argument-reduction routines follow. Each has this specification:
static void Reduce<Name>(double *xp, int *a, double x).
Input:
x is a number to be reduced modulo pi/2. Each routine has
qualifications on what values it supports for x. Generally, x must not
be a NaN or infinity.
Output:
*xp is set to a residue of x modulo pi/2. Mostly, -pi/4 <= x <= +pi/4,
but *xp may be outside this interval by as much as 3.24128e-11.
*a is set to the arc of the circle x is in. 0 <= *a < 4.
On output, x is approximately k * 2*pi + *a * pi/2 + *xp for some
integer k. This holds even if x is outside [-pi/4, pi/4]; *a matches
*xp.
This means all the reduction routines are returning *xp in radians.
ReduceFull goes through steps in which it has period-fraction units (0 to 1
corresponding to 0 to pi/2 radians), so the restoration of radians is
forced and introduces additional rounding error and execution time. We
might want to do something about that.
Since *xp is a single double, it is insufficient to represent the residue
precisely enough to compute a faithfully rounded function. For that, we
will have to return extended precision, perhaps a long double or perhaps
two doubles.
*/
/* static void ReduceFull(double *xp, int *a, double x).
Input:
x is a number to be reduced modulo pi/2. This routine requires 1 <=
|x|. (The actual limit is something smaller, perhaps around 0x1p-622,
but 1 suffices.)
Output:
*xp is set to x modulo pi/2.
*a is set to the arc of the circle x is in. 0 <= *a < 4.
On output, x is approximately k * 2*pi + *a * pi/2 + *xp for some
integer k.
This routine was adapted from the _sin function in double_fns.h (which
implements vvsin).
*/
static void ReduceFull(double *xp, int *a, double x)
{
// The rows are 8-bit shifts of 27-bit windows on 2/pi * 0x1p400.
static const double TwoOverPiWithOffset[4][45] =
{
{
+0x0.0000000p+0, +0x1.45f3070p+399, -0x1.1b1bbe8p+372,
-0x1.6b01ec8p+345, +0x1.5f47d50p+318, -0x1.6447e48p+291,
-0x1.3ad4ce0p+263, +0x1.e21c820p+237, +0x1.fe51640p+208,
-0x1.5085110p+182, +0x1.586dca0p+154, -0x1.c8e2df0p+129,
+0x1.374b800p+102, +0x1.924bbb0p+74, -0x1.f62e6e0p+48,
+0x1.cfe1df0p+20, -0x1.38d3b58p-6, -0x1.63045e0p-34,
+0x1.1afa980p-63, -0x1.44bb7b0p-88, -0x1.6638fe0p-116,
+0x1.ad17e00p-142, -0x1.bec66e0p-168, -0x1.4e33e58p-195,
+0x1.9cfa4e0p-223, +0x1.08bf178p-249, -0x1.036be40p-279,
+0x1.8ffc4c0p-303, -0x1.0fd3000p-339, -0x1.fc04340p-358,
-0x1.dce94c0p-385, +0x1.4da3ee0p-413, -0x1.64c0980p-439,
-0x1.b069ec8p-465, -0x1.1617380p-493, -0x1.32c3400p-521,
-0x1.5d28ae0p-548, +0x1.eeb1fb0p-574, -0x1.a0e8500p-604,
+0x1.e839cf8p-627, +0x1.e294a48p-654, +0x1.d4d7f68p-681,
+0x1.fb11f90p-708, -0x1.517bd50p-735, +0x1.9823800p-767
},
{
+0x0.0000000p+0, +0x1.45f3000p+399, +0x1.b727240p+377,
-0x1.f56b020p+353, +0x1.3abe900p+325, -0x1.5964480p+299,
+0x1.b6c52b0p+271, +0x1.93c4390p+244, +0x1.07f9480p+214,
-0x1.38a8428p+191, -0x1.0ea7920p+162, -0x1.b7238c0p+135,
+0x1.09374b8p+110, +0x1.924c000p+74, -0x1.15f62e8p+56,
+0x1.21cfe20p+28, -0x1.0a71a70p+1, -0x1.acb1820p-25,
-0x1.77dca08p-52, -0x1.68a25d8p-79, -0x1.ec598e0p-106,
-0x1.fb29740p-133, -0x1.037d8d0p-161, +0x1.1d63980p-188,
+0x1.a99cfa0p-215, +0x1.3908bf0p-241, +0x1.77bf250p-269,
+0x1.d8ffc80p-299, -0x1.a000880p-322, +0x1.6603fc0p-350,
-0x1.a1dce90p-377, -0x1.2fac970p-403, -0x1.2593040p-433,
+0x1.9e4f960p-457, +0x1.36e9e90p-485, -0x1.c099620p-512,
+0x1.7fa8b60p-538, -0x1.5b08a70p-565, -0x1.41a0e80p-596,
-0x1.30be300p-622, -0x1.821d6b8p-646, +0x1.25d4d80p-673,
-0x1.2813b80p-702, -0x1.ca8be00p-730, +0x1.580cc10p-754
},
{
+0x0.0000000p+0, +0x1.4600000p+399, -0x1.9f246c0p+386,
-0x1.bbead60p+360, -0x1.ec54100p+329, -0x1.c159648p+307,
+0x1.c0db628p+280, +0x1.5993c40p+252, +0x1.c820ff0p+225,
+0x1.458eaf0p+198, +0x1.ebbc560p+172, +0x1.b7246e0p+144,
+0x1.d2126f0p+117, -0x1.a3ff370p+91, +0x1.2eea0a0p+64,
-0x1.736f180p+37, -0x1.e214e40p+8, +0x1.62534e8p-17,
-0x1.177dc80p-48, -0x1.0568a28p-71, +0x1.1213a68p-98,
-0x1.c7eca60p-127, +0x1.7df9040p-154, +0x1.cc8eb20p-179,
-0x1.9f2b318p-206, -0x1.6c6f780p-237, +0x1.f8bbdf8p-260,
+0x1.283b200p-288, -0x1.da00080p-318, -0x1.fa67f00p-344,
-0x1.0d0ee80p-372, +0x1.6b414e0p-397, -0x1.7049650p-423,
+0x1.fb3c9f0p-450, +0x1.6136ea0p-477, -0x1.7381320p-505,
-0x1.8680578p-530, +0x1.aea4f78p-557, -0x1.38141a0p-584,
-0x1.d098600p-613, +0x1.ce7de28p-638, +0x1.4a4baa0p-666,
-0x1.404a050p-692, +0x1.1f8d5d0p-720, +0x1.0ac0680p-749
},
{
+0x0.0000000p+0, +0x1.8000000p+399, -0x1.d067c90p+396,
-0x1.b1bbeb0p+368, +0x1.4fe13b0p+341, -0x1.05c1598p+315,
+0x1.bb81b70p+287, -0x1.d6a66c0p+260, -0x1.de37df0p+233,
-0x1.ae9c400p+200, -0x1.4214438p+180, -0x1.4f246e0p+153,
+0x1.b8e9090p+126, +0x1.ba5c010p+99, -0x1.b6d1160p+72,
+0x1.3a32440p+43, -0x1.80f10a0p+17, -0x1.c69dac8p-9,
-0x1.8c11780p-36, +0x1.1afa978p-63, -0x1.12edec8p-90,
+0x1.338e050p-117, -0x1.4ba0818p-144, -0x1.f633718p-171,
+0x1.8e60d50p-198, -0x1.8c16c70p-225, +0x1.17e2f00p-254,
-0x1.036be28p-279, +0x1.ff89800p-308, -0x1.0fd3400p-339,
+0x1.fde5e00p-365, +0x1.18b5a10p-388, -0x1.64b8248p-414,
-0x1.9302618p-441, -0x1.834f648p-468, -0x1.6173820p-497,
+0x1.9a797f8p-522, +0x1.45aea50p-549, -0x1.14e0500p-578,
-0x1.a0e84c0p-604, -0x1.7c63040p-631, -0x1.d6b5b40p-658,
-0x1.59404a0p-684, -0x1.3b81cc0p-714, +0x1.7421580p-738
}
};
Double X = { x };
/* Set ec to the unbiased exponent minus 33. Why 33? I do not know.
This was in the vvsin code in double_fns.h.
*/
int ec = X.s.exponent - (1023+33);
// Set k to ceiling(ec / 27) and m to residue.
int k = (ec + 26) * (607*4) >> 16;
int m = 27*k - ec;
// offset is used to select a row in the reduction table. See below.
int offset = m >> 3;
/* The reduction table, TwoOverPiWithOffset, contains bits of 2/pi.
First, all entries are scaled by 2**400 to avoid overflow/underflow
issues. Each entry contains 27 bits of 2/pi, except in the first two
columns. The first column contains zeroes because no reduction is done
for numbers that are already small. The entries in the second row
start before (at a bit with higher value) the leading bit of 2/pi, so
they contain some leading zeroes. The sign bits form part of the 27
bits represented.
Each row forms a contiguous string of bits of 2/pi. That is, adding
all the entries with sufficient precision yields a single bit string
representing 2/pi. The rows differ in their starting points; row 0
begins with the leading bit of 2/pi, and rows 1, 2, and 3 begin 8, 16,
and 24 bits before that.
*/
/* Scale x to avoid overflow in Dekker split. This is compensated for
in the entries in TwoOverPiWithOffset.
*/
x *= 0x1p-400;
/* Use Dekker's algorithm to split x into 26 bits and 27 bits. This
requires round-to-nearest mode.
*/
double xDekker = x * (0x1p27 + 1);
double x0 = xDekker - (xDekker - x);
double x1 = x - x0;
// Get address of starting point in table.
const double *p0 = &TwoOverPiWithOffset[offset][k];
// Get table entries.
const double fp0 = p0[0];
const double fp1 = p0[1];
const double fp2 = p0[2];
const double fp3 = p0[3];
// Get high bits of x * f, where f is the part of 2/pi we are using.
const double f0 = x1 * fp0 + fp1 * x0;
double f = x1 * fp1 + fp2 * x0;
// Combine to do integer work.
const double fi = f0 + f;
static const double IntegerBias = 0x1.8p52;
// Force the integer bits into a specific position.
Double Fi = { fi + IntegerBias };
/* |fi| is less than 0x1p36, so fi + IntegerBias is well within
[0x1p52, 0x1p53), so it has a known exponent, and the bits with
weight 2 and 1 are the least significant bits in its significand.
We know |fi| is less than 0x1p36 because it is the sum of x1 * fp0,
fp1 * x0, and something small in f. x0 has the same exponent as
x. Say 2**e <= |x| < 2**(e+1). (That is the original x, before we
scaled it by 0x1p-400.) The table entry we look up for fp0 has
magnitude less than 2**(400+27-27*ceiling((e-33)/27)), and fp1 is
less, bounded by 2**(400-27*ceiling((e-33)/27)). Including the
scaling in x, fp1 * x0 is less than 2**(e+1-400) *
2**(400-27*ceiling((e-33)/27)) <= 2**34. Similarly, fp0 * x1 is
less than 2**34, so their sum is less than 2**35.
*/
// Get the two least significant integer bits.
*a = Fi.s.significand2;
double fint = Fi.d - IntegerBias;
const double fp4 = p0[4];
const double fp5 = p0[5];
const double fp6 = p0[6];
f = f0 - fint + f;
f += x1 * fp2 + fp3 * x0;
f += x1 * fp3 + fp4 * x0;
f += x1 * fp4 + fp5 * x0;
f += x1 * fp5 + fp6 * x0;
// Convert to radians by multiplying by pi/2.
*xp = f * 0x3.243F6A8885A3p-1;
}
/* static void ReduceMedium(double *xp, int *a, double x).
Input:
x is a number to be reduced modulo pi/2. This routine requires |x| <=
X. X is described below.
Output:
*xp is set to x modulo pi/2.
*a is set to the arc of the circle x is in. 0 <= *a < 4.
On output, x is approximately k * 2*pi + *a * pi/2 + *xp for some
integer k.
Nomenclature:
p is the period, pi/2.
X is the maximum value x may have. X is 0x1.000013be57a3fp19.
InversePeriod is 1/p, rounded to the nearest double, ties to even.
n is x * InversePeriod, rounded to the nearest double, ties to even,
and then rounded to the nearest integer, ties to even.
N is the maximum value n may have. N is X * InversePeriod, rounded to
the nearest double, ties to even, and then rounded to the nearest
integer, ties to even. N is 333773.
Notes:
In comments in this routine, unquoted expressions are mathematical and
quoted expressions are floating-point. Thus, "a + b" refers to the
floating-point operation of addition, including rounding.
"n * Period[0]" is exact for all |n| <= N. It is inexact for n = N+1,
so this is the source of the bound on x. X is the greatest value for
which |n| <= N. (It is happenstance that Period[0] limits x; if the
period were different, its bits might result in Period[1] causing the
limit. n has some other roles in the errors in this routine that might
limit x when used with other periods or different precisions.)
The difference between InversePeriod and 1/p can cause n to differ from
the ideal value when x is near a multiple of p. The result is that
instead of producing a result inside the target interval [-p/2, p/2], a
result slightly outside the interval is produced. The result may be as
much as 3.24128e-11 outside the interval. The polynomial and the
polynomial evaluation must be satisfactory over this extended interval.
*/
static void ReduceMedium(double *xp, int *a, double x)
{
static const double InversePeriod =
2 / 0x3.243f6a8885a308d313198a2e03707344ap0;
/* Period is an extended-precision representation of the reduction period.
Each element except the last has the property that multiplication by
any integer with magnitude at most N is exact.
*/
static const double Period[] = {
+0x1.921FB54440000p-0,
+0x1.68C234C4C0000p-39,
+0x1.98A2E03707345p-77
};
// Estimate x / p and round to nearest integer.
double n = x * InversePeriod + 0x1.8p52 - 0x1.8p52;
// Record which arc of the circle x lies in.
*a = (int) n & 3;
// This is exact, per design of Period.
double np0 = n * Period[0];
// This is exact, per design of Period.
double np1 = n * Period[1];
/* The error here is at most 1/2 ULP(n * Period[2]), which is
1/2 ULP(n * +0x1.98A2E03707345p-77) = ULP(n * +0x1.98A2E03707345p-78).
n <= N = 333773, so an absolute bound on that is ULP(333773 *
+0x1.98A2E03707345p-78) = 0x1p-111, but we need a bound dependent on n
to partition a proof below.
This is called error (a).
*/
double np2 = n * Period[2];
/* Set x00 to x - n * Period[0]. np0 is very near x, so this is exact,
per David Goldberg, _What Every Computer Scientist Should Know About
Floating-Point Arithmetic_, Theorem 11.
The nomenclature used here is:
x0 is x - n * (Period[0]),
x1 is x - n * (Period[0] + Period[1]), and
x2 is x - n * (Period[0] + Period[1] + Period[2]).
x00 is the first and only "word" of an extended precision
representation of x0.
x10 is the first "word" of x1; it has the most significant bits.
x11 is the next "word" of x1; it has the following bits.
*/
double x00 = x - np0;
/* Subtract n * Period[1] from x0 to produce x1. These operations yield
an exact result, per Knuth, _The Art of Computer Programming_ 2, second
edition, page 221, section 4.2.2, Theorem C, in the sense that x10 +
x11 will exactly equal x0 - n * Period[1]. The first statement puts
the high bits in x10, and the second statement determines what was
rounded away in x10 and puts it in x11.
Knuth assumes |np1| <= |x00|. This is often the case, but perhaps
cancellation has reduced the magnitude of x00. Hoever, if |x00| <
|np1|, this arithmetic is exact, because x00's least significant bit is
at least as large as the ULP of Period[1].
*/
double x10 = x00 - np1;
double x11 = x00 - x10 - np1;
/* This comment demonstrates the rounding error in the following
statement, "t = x11 - np2", is tiny relative to x10.
This is called error (b).
Let LSB(x) be the weight of the least significant bit set in a
floating-point number x. E.g, LSB(1.25) is 1/4, although ULP(1.25) is
2**-52.
Here, x10 and x11 are each multiples of LSB(Period[1]). If |x10| <
2**53 LSB(Period[1]), then x11 is zero, because all the bits of x10 +
x11 fit into x10. In that case, "x11 - np2" is exact.
Otherwise, 2**53 LSB(Period[1]) <= |x10|, so 2 LSB(Period[1]) <=
ULP(x10). LSB(Period[1]) = 0x1p-73, so 0x1p-72 <= ULP(x10).
By construction of x10 and x11, |x11| <= 1/2 ULP(x10).
|np2| <= |"n * Period[2]"| <= |n * Period[2] * (1+2**-53)| = 333733 *
0x1.98A2E03707345p-77 * (1+2**-53) < 0x1.0426p-58.
|"x11 - np2"| <= 2*max(|x11|, |np2|) <= 2*max(1/2 ULP(x10),
0x1.0426p-58) = max(ULP(x10), 0x1.0426p-57). The rounding error in
"x11 - np2" is at most 2**-53 times this, so it is at most max(2**-53
ULP(x10), 0x1p-110). Since ULP(x10) is at least 0x1p-72, the rounding
error is at most 0x1p-38 ULP(x10).
*/
double t = x11 - np2;
/* Add t to x1 to produce x2. As before, we could use Knuth's technique
to produce an exact sum, x20 + x21 = x10 + t. For now, we only return
x20. When we want extended precision, we could return x21 as well.
(If so, be careful to show that |t| <= |x10|, as required for Knuth's
technique, or otherwise show the arithmetic is sufficiently accurate.)
The difference between x20 and x10 + t is called error (c). It is at
most 1/2 ULP(x20).
*/
double x20 = x10 + t;
// double x21 = x10 - x20 + t;
*xp = x20;
/* Having subtracted n * (Period[0] + Period[1] + Period[2]) from x, we
have an error caused by the difference between p and Period[0] +
Period[1] + Period[2], which is less than 0x1.6fdb2p-131. So the error
is less than n * 0x1.6fdb2p-131.
n <= N = 333773, so an absolute bound on that is 333773 *
0x1.6fdb2p-131 < 0x1.d45fp-113, but we need a bound dependent on n to
partition a proof below.
This is called error (d).
Our reduction has four errors:
(a), which is at most ULP(n * +0x1.98A2E03707345p-78) <= 0x1p-111.
(b), which is at most 0x1p-38 ULP(x10).
(c), which is at most 1/2 ULP(x20).
(d), which is at most n * 0x1.6fdb2p-131 < 0x1.d45fp-113.
From the discussion of (b), if x11 is not zero, then 0x1p-72 <=
ULP(x10). Then x10 dominates the result, because np2 is so small that
subtracting it from x1 to produce x2 cannot reduce the exponent by more
than one. So the errors (a), (b), and (d) are tiny relative to x10,
and the total error is very nearly (c), 1/2 ULP(x20).
However, if x11 is zero, then (b) is zero, and we only need to consider
(a), (c), and (d). We partition this into two cases.
Case 0: |x| < 0x1p12.
The double-precision floating-point number closest to a multiple of
pi/2 in that interval is 0x1.6c6cbc45dc8dep6 (according to Maple
code from Muller, _Elementary Functions_), and it is about
0x1.6d61b58c99c43p-60 away from a multiple of pi/2.
n is at most 0x1p12 * InversePi, rounded to a double, so n < 2608.
Then:
(a) is at most ULP(2608 * +0x1.98A2E03707345p-78) = 0x1p-119.
(b) is zero.
(c) is at most 1/2 ULP(x20).
(d) is at most 2608 * 0x1.6fdb2p-131 < 0x1p-119.
The final result might be as low as about 0x1.6d6p-60. That has an
ULP of 0x1p-113, so (a) and (d) are small relative to it, and the
total error is nearly (c), 1/2 ULP(x20).
Case 1: 0x1p12 <= |x|.
We still have |x| <= 0x1.000013be57a3fp19. The double-precision
floating-point number closest to a multiple of pi/2 in that set is
0x1.6c6cbc45dc8dep11 (Muller again), and it is about
0x1.6d61b58c99c43p-55 away from a multiple of pi/2.
Then:
(a) is at most ULP(333773 * +0x1.98A2E03707345p-78) = 0x1p-111.
(b) is zero.
(c) is at most 1/2 ULP(x20).
(d) is at most 333773 * 0x1.6fdb2p-131 < 0x1.d45fp-113.
The final result might be as low as about 0x1.6d6p-55. That has an
ULP of 0x1p-108, so (a) and (d) are small relative to it, and the
total error is nearly (c), 1/2 ULP(x20).
Therefore, the total error is never much more than 1/2 ULP of the value
returned in *xp.
*/
}
/* static double sinp(double r)
Return sine(r) using only a polynomial, no reduction.
Input:
r, with |r| < pi/4 + 3.24128e-11.
Output:
Approximately sine(r) is returned.
*/
static double sinp(double r)
{
double rr = r * r;
/* Derived from Cephes Math Library Release 2.8: June, 2000. Maple's
infnorm routine says this polynomial is within .0688 ULP of
sine(r) for |r| < pi/4 + 3.24128e-11.
*/
return r + r * rr * ((((((
+ 1.58962301576546568060E-10) * rr
- 2.50507477628578072866E-8 ) * rr
+ 2.75573136213857245213E-6 ) * rr
- 1.98412698295895385996E-4 ) * rr
+ 8.33333333332211858878E-3 ) * rr
- 1.66666666666666307295E-1 );
}
/* static double cosp(double r)
Return cosine(r) using only a polynomial, no reduction.
Input:
r, with |r| < pi/4 + 3.24128e-11.
Output:
Approximately cosine(r) is returned.
*/
static double cosp(double r)
{
double rr = r * r;
/* Derived from Cephes Math Library Release 2.8: June, 2000. Maple's
infnorm routine says this polynomial is within .00709 ULP of
cosine(r) for |r| < pi/4 + 3.24128e-11.
*/
return (((((((
- 1.13585365213876817300E-11) * rr
+ 2.08757008419747316778E-9 ) * rr
- 2.75573141792967388112E-7 ) * rr
+ 2.48015872888517045348E-5 ) * rr
- 1.38888888888730564116E-3 ) * rr
+ 4.16666666666665929218E-2 ) * rr
- .5 ) * rr
+ 1;
}
/* double sin(double x).
Notes:
Citations in parentheses below indicate the source of a requirement.
"C" stands for ISO/IEC 9899:TC2.
The Open Group specification (IEEE Std 1003.1, 2004 edition) adds no
requirements since it defers to C and requires errno behavior only if
we choose to support it by arranging for "math_errhandling &
MATH_ERRNO" to be non-zero, which we do not.
Return value:
For +/- zero, return zero with same sign (C F.9 12 and F.9.1.6).
For +/- infinity, return a NaN (C F.9.1.6).
For a NaN, return the same NaN (C F.9 11 and 13).
Otherwise:
If the rounding mode is round-to-nearest, return sine(x) within a
few ULP. The maximum error of this routine is not precisely
known. The maximum error of the reduction might be around 3 ULP,
although this is partly a guess. The polynomials have small
errors. The polynomial evaluation might have an error under 1
ULP. So the worst error for this routine might be under 4 ULP.
Not currently implemented: In other rounding modes, return sine(x)
possibly with slightly worse error, not necessarily honoring the
rounding mode (Ali Sazegari narrowing C F.9 10).
All results are in [-1, 1].
Exceptions:
Raise underflow for a denormal result (C F.9 7 and Draft Standard for
Floating-Point Arithmetic P754 Draft 1.2.5 9.5). If the input is the
smallest normal, underflow may or may not be raised. This is stricter
than the older 754 standard.
May or may not raise inexact, even if the result is exact (C F.9 8).
Raise invalid if the input is a signalling NaN (C 5.2.4.2.2 3, in spite
of C 4.2.1) or an infinity (C F.9.1.6) but not if the input is a quiet
NaN (C F.9 11).
May not raise exceptions otherwise (C F.9 9).
Properties:
Desired to be monotonic. Not yet proven!
*/
double sin(double x)
{
// Get |x| in form used by the bound-check operations for a key.
BoundKeyType xk = Key(x);
double r;
int j0;
/* Handle denormal numbers and zero here to generate underflow for
denormals numbers and to avoid generating inexact for zero.
*/
if (Within(xk, BoundDenormal))
return x * (1 - 0x1p-53);
/* For small numbers, we can return x for sine(x). Unfortunately,
adding this check slows down the other cases, which may be more
frequent.
*/
else if (Within(xk, BoundTrivialSin))
{
/* Make t0 volatile to force compiler to fetch it at run-time
rather than optimizing away the multiplication.
*/
static volatile const double t0 = 1/3.;
/* Get t0 once. If we wrote "t0*t0", the compiler would load it
twice, since it is volatile.
*/
const double t1 = t0;
/* Perform a multiplication and pass it into an assembly construct to
prevent the compiler from knowing we do not use the result and
optimizing away the multiplication.
*/
__asm__("" : : "X" (t1*t1));
// Return the floating-point number nearest sine(x), which is x.
return x;
}
// If |x| is small, no reduction is necessary.
else if (Within(xk, BoundPolynomial))
r = x, j0 = 0;
/* The interval for this could be enlarged by examining the sine
polynomial and figuring out how big r can get before the error is
too large.
*/
// If |x| is medium, we can use a fast reduction routine.
else if (Within(xk, BoundMedium))
ReduceMedium(&r, &j0, x);
// Otherwise, we need the full, slow reduction routine.
else if (Within(xk, BoundFull))
ReduceFull(&r, &j0, x);
else
return x-x;
switch (j0)
{
default:
case 0: return +sinp(r);
case 1: return +cosp(r);
case 2: return -sinp(r);
case 3: return -cosp(r);
}
}
/* double cos(double x).
Notes:
Citations in parentheses below indicate the source of a requirement.
"C" stands for ISO/IEC 9899:TC2.
The Open Group specification (IEEE Std 1003.1, 2004 edition) adds no
requirements since it defers to C and requires errno behavior only if
we choose to support it by arranging for "math_errhandling &
MATH_ERRNO" to be non-zero, which we do not.
Return value:
For +/- infinity, return a NaN (C F.9.1.6).
For a NaN, return the same NaN (C F.9 11 and 13).
Otherwise:
If the rounding mode is round-to-nearest, return cosine(x) within a
few ULP. The maximum error of this routine is not precisely
known. The maximum error of the reduction might be around 3 ULP,
although this is partly a guess. The polynomials have small
errors. The polynomial evaluation might have an error under 1
ULP. So the worst error for this routine might be under 4 ULP.
Not currently implemented: In other rounding modes, return
cosine(x) possibly with slightly worse error, not necessarily
honoring the rounding mode (Ali Sazegari narrowing C F.9 10).
All results are in [-1, 1].
Exceptions:
Raise underflow for a denormal result (C F.9 7 and Draft Standard for
Floating-Point Arithmetic P754 Draft 1.2.5 9.5). If the input is the
smallest normal, underflow may or may not be raised. This is stricter
than the older 754 standard.
May or may not raise inexact, even if the result is exact (C F.9 8).
Raise invalid if the input is a signalling NaN (C 5.2.4.2.2 3, in spite
of C 4.2.1) or an infinity (C F.9.1.5) but not if the input is a quiet
NaN (C F.9 11).
May not raise exceptions otherwise (C F.9 9).
Properties:
Desired to be monotonic. Not yet proven!
*/
double cos(double x)
{
// Get |x| in form used by the bound-check operations for a key.
BoundKeyType xk = Key(x);
double r;
int j0;
// Avoid generating inexact for zero.
if (x == 0)
return 1;
/* For small numbers, we can return 1 for cosine(x). Unfortunately,
adding this check slows down the other cases, which may be more
frequent. It greatly speeds up cases where |x| < 0x1p-492, by
avoiding arithmetic with denormals.
*/
else if (Within(xk, BoundTrivialCos))
{
/* Make t0 volatile to force compiler to fetch it at run-time
rather than optimizing away the multiplication.
*/
static volatile const double t0 = 1/3.;
/* Get t0 once. If we wrote "t0*t0", the compiler would load it
twice, since it is volatile.
*/
const double t1 = t0;
/* Perform a multiplication and pass it into an assembly construct to
prevent the compiler from knowing we do not use the result and
optimizing away the multiplication.
*/
__asm__("" : : "X" (t1*t1));
// Return the floating-point number nearest cosine(x), which is 1.
return 1;
}
// If |x| is small, no reduction is necessary.
else if (Within(xk, BoundPolynomial))
r = x, j0 = 0;
/* The interval for this could be enlarged by examining the cosine
polynomial and figuring out how big r can get before the error is
too large.
*/
// If |x| is medium, we can use a fast reduction routine.
else if (Within(xk, BoundMedium))
ReduceMedium(&r, &j0, x);
// Otherwise, we need the full, slow reduction routine.
else if (Within(xk, BoundFull))
ReduceFull(&r, &j0, x);
else
return x-x;
switch (j0)
{
default:
case 0: return +cosp(r);
case 1: return -sinp(r);
case 2: return -cosp(r);
case 3: return +sinp(r);
}
}
/* double tan(double x).
Notes:
Citations in parentheses below indicate the source of a requirement.
"C" stands for ISO/IEC 9899:TC2.
The Open Group specification (IEEE Std 1003.1, 2004 edition) adds no
requirements since it defers to C and requires errno behavior only if
we choose to support it by arranging for "math_errhandling &
MATH_ERRNO" to be non-zero, which we do not.
Return value:
For +/- zero, return zero with same sign (C F.9 12 and F.9.1.7).
For +/- infinity, return a NaN (C F.9.1.7).