Source/Intel/sincostan.c

/*	sincostan.c -- sin, cos, and tan for standard math library.

	These are preliminary versions of sin, cos, and tan written to provide
	argument reduction using the correct value of pi, rather than pi rounded to
	66-bits as used by the Intel trigonometric instructions.  Versions to
	enhance accuracy and speed may follow.
*/


#include <math.h>


// Describe the destination floating-point type.
#define	SignificandBits	53
#define	ExponentBits	11


/*	Define a structure for accessing the internal components of a double.

	s.exponent is the raw exponent field.

	s.significand2 is the least two significant bits of the significand field.

	The other fields are unused.
*/
typedef union
{
	double d;
	struct
	{
		#if defined __BIG_ENDIAN__
			unsigned int sign         : 1;
			unsigned int exponent     : ExponentBits;
			unsigned int significand0 : SignificandBits-1-32;
			unsigned int significand1 : 30;
			unsigned int significand2 :  2;
		#else	// defined __BIG_ENDIAN__
			unsigned int significand2 :  2;
			unsigned int significand1 : 30;
			unsigned int significand0 : SignificandBits-1-32;
			unsigned int exponent     : ExponentBits;
			unsigned int sign         : 1;
		#endif	// defined __BIG_ENDIAN__
	} s;
} Double;


/*	Each of the following numbers is a strict upper bound on some technique
	for calculating sin, cos, or tan.  That is, the technique is known to apply
	when |x| < bound, where x is the function argument.  (ReduceFull also
	requires that \x| be greater than some amount around 0x1p-622, to avoid
	underflow or loss of accuracy.)

	If |x| < BoundDenormal, x is denormal or zero.

	If |x| < BoundTrivialSin, x is the correctly rounded sine of x.

	If |x| < BoundTrivialCos, 1 is the correctly rounded cosine of x.

	If |x| < BoundTrivialTan, x is the correctly rounded tangent of x.

	If |x| < BoundPolynomial, sinp(x) or cosp(x) may be used for sin(x) or
	cos(x), with no argument reduction.

	If |x| < BoundMedium, ReduceMedium may be used to reduce x.

	If |x| < BoundFull, ReduceFull may be used to reduce x.

	Otherwise, x is an infinity or a NaN.
*/
#define	BoundDenormal	0x1p-1022
#define	BoundTrivialSin	0x1.7137449123ef7p-26
#define	BoundTrivialCos	0x1.6A09E667F3BCDp-27
#define	BoundTrivialTan	0x1.250BFE1B082F5p-26
#define	BoundPolynomial	0x3.243f6a8885a31p-2
#define	BoundMedium		0x1.000013be57a40p19
#define	BoundFull		INFINITY


/*	Below, we define methods to test whether a function argument, x, is within
	bounds suitable for a techique for implementing the function.  Several
	methods are defined, using natural floating-point comparisons or using
	integer comparisons on 32 or 64 bits of the floating-point representation.
	Implementations may choose whichever method is best for performance.

	To use, first prepare the key used for checking bounds:

		BoundKeyType xk = Key(x);

	That is done once at the start of a routine.  Then, as many times as
	desired, bounds are checked:

		if (Within(xk, bound))
			...

	The "bound" must be one of the names defined above, such as BoundMedium.

	If Within(xk, bound) evaluates to true, then |x| < bound.  The converse is
	not necessarily true; if Within(xk, bound) is false, |x| might be less than
	bound.  This is because some implementations of Within do not check all
	bits of xk and bound.  They are intended only as fast screens that get the
	majority of values.  However, if floor(log[2](|x|)) < floor(log[2](bound)),
	Within(xk, bound) is guaranteed to be true -- its evaluation includes at
	least all of the floating-point exponent.
*/
#if defined __i386__	// (Add any other desired architectures.)

	/*	This method uses integer comparisons with the most significant 32 bits
		of the IEEE-754 representation of the floating-point number.
	*/

	#include <inttypes.h>

	typedef uint32_t BoundKeyType;

	typedef union
	{
		double d;
		#if defined __BIG_ENDIAN__
			struct { uint32_t key, notkey; } s;
		#else
			struct { uint32_t notkey, key; } s;
		#endif
	} DoubleForBound;

	// Prepare constants containing the bounds in our desired form.
	#define	DefineBound(bound)	\
		static const DoubleForBound Double##bound = { bound };
	DefineBound(BoundDenormal)
	DefineBound(BoundTrivialSin)
	DefineBound(BoundTrivialCos)
	DefineBound(BoundTrivialTan)
	DefineBound(BoundPolynomial)
	DefineBound(BoundMedium)
	DefineBound(BoundFull)

	// Get the key for |x|.
	static BoundKeyType Key(double x)
	{
		const DoubleForBound X = { x };
		BoundKeyType t = X.s.key;
		return t & 0x7fffffff;
	}

	#define	Within(xk, bound)	((xk) < Double##bound.s.key)

#elif defined __x86_64__	// (Add any other desired architectures.)

	/*	This method uses integer comparisons with all 64 bits of the IEEE-754
		representation of the floating-point number.
	*/

	#include <inttypes.h>

	typedef uint64_t BoundKeyType;

	typedef union
	{
		double d;
		uint64_t key;
	} DoubleForBound;

	// Prepare constants containing the bounds in our desired form.
	#define	DefineBound(bound)	\
		static const DoubleForBound Double##bound = { bound };
	DefineBound(BoundDenormal)
	DefineBound(BoundTrivialSin)
	DefineBound(BoundTrivialCos)
	DefineBound(BoundTrivialTan)
	DefineBound(BoundPolynomial)
	DefineBound(BoundMedium)
	DefineBound(BoundFull)

	// Get the key for |x|.
	static BoundKeyType Key(double x)
	{
		const DoubleForBound X = { x };
		BoundKeyType t = X.key;
		return t & 0x7fffffffffffffff;
	}

	#define	Within(xk, bound)	((xk) < Double##bound.key)

#else

	// If no other method is selected, use regular floating-point comparison.

	typedef double BoundKeyType;

	#define	DefineBound(bound)

	// Get the key for |x|.
	static BoundKeyType Key(double x)
	{
		#if defined __i386__ || defined __x86_64__
			// On Intel, just clear the sign bit.
			double xk = -0.;
			__asm__("andnpd %[x], %[xk]" : [xk] "+x" (xk) : [x] "x" (x));
			return xk;
		#elif 1
			return x < 0 ? -x : +x;
		#else
			return fabs(x);
		#endif
	}

	#define	Within(xk, bound)	((xk) < bound)

#endif


/*	Several argument-reduction routines follow.  Each has this specification:

		static void Reduce<Name>(double *xp, int *a, double x).

	Input:
	
		x is a number to be reduced modulo pi/2.  Each routine has
		qualifications on what values it supports for x.  Generally, x must not
		be a NaN or infinity.

	Output:

		*xp is set to a residue of x modulo pi/2.  Mostly, -pi/4 <= x <= +pi/4,
		but *xp may be outside this interval by as much as 3.24128e-11.

		*a is set to the arc of the circle x is in.  0 <= *a < 4.

		On output, x is approximately k * 2*pi + *a * pi/2 + *xp for some
		integer k.  This holds even if x is outside [-pi/4, pi/4]; *a matches
		*xp.

	This means all the reduction routines are returning *xp in radians.
	ReduceFull goes through steps in which it has period-fraction units (0 to 1
	corresponding to 0 to pi/2 radians), so the restoration of radians is
	forced and introduces additional rounding error and execution time.  We
	might want to do something about that.

	Since *xp is a single double, it is insufficient to represent the residue
	precisely enough to compute a faithfully rounded function.  For that, we
	will have to return extended precision, perhaps a long double or perhaps
	two doubles.
*/


/*	static void ReduceFull(double *xp, int *a, double x).

	Input:
	
		x is a number to be reduced modulo pi/2.  This routine requires 1 <=
		|x|.  (The actual limit is something smaller, perhaps around 0x1p-622,
		but 1 suffices.)

	Output:

		*xp is set to x modulo pi/2.

		*a is set to the arc of the circle x is in.  0 <= *a < 4.

		On output, x is approximately k * 2*pi + *a * pi/2 + *xp for some
		integer k.

	This routine was adapted from the _sin function in double_fns.h (which
	implements vvsin).
*/
static void ReduceFull(double *xp, int *a, double x)
{
	// The rows are 8-bit shifts of 27-bit windows on 2/pi * 0x1p400.
	static const double TwoOverPiWithOffset[4][45] =
	{
		{
			+0x0.0000000p+0, +0x1.45f3070p+399, -0x1.1b1bbe8p+372,
			-0x1.6b01ec8p+345, +0x1.5f47d50p+318, -0x1.6447e48p+291,
			-0x1.3ad4ce0p+263, +0x1.e21c820p+237, +0x1.fe51640p+208,
			-0x1.5085110p+182, +0x1.586dca0p+154, -0x1.c8e2df0p+129,
			+0x1.374b800p+102, +0x1.924bbb0p+74, -0x1.f62e6e0p+48,
			+0x1.cfe1df0p+20, -0x1.38d3b58p-6, -0x1.63045e0p-34,
			+0x1.1afa980p-63, -0x1.44bb7b0p-88, -0x1.6638fe0p-116,
			+0x1.ad17e00p-142, -0x1.bec66e0p-168, -0x1.4e33e58p-195,
			+0x1.9cfa4e0p-223, +0x1.08bf178p-249, -0x1.036be40p-279,
			+0x1.8ffc4c0p-303, -0x1.0fd3000p-339, -0x1.fc04340p-358,
			-0x1.dce94c0p-385, +0x1.4da3ee0p-413, -0x1.64c0980p-439,
			-0x1.b069ec8p-465, -0x1.1617380p-493, -0x1.32c3400p-521,
			-0x1.5d28ae0p-548, +0x1.eeb1fb0p-574, -0x1.a0e8500p-604,
			+0x1.e839cf8p-627, +0x1.e294a48p-654, +0x1.d4d7f68p-681,
			+0x1.fb11f90p-708, -0x1.517bd50p-735, +0x1.9823800p-767
		},
		{
			+0x0.0000000p+0, +0x1.45f3000p+399, +0x1.b727240p+377,
			-0x1.f56b020p+353, +0x1.3abe900p+325, -0x1.5964480p+299,
			+0x1.b6c52b0p+271, +0x1.93c4390p+244, +0x1.07f9480p+214,
			-0x1.38a8428p+191, -0x1.0ea7920p+162, -0x1.b7238c0p+135,
			+0x1.09374b8p+110, +0x1.924c000p+74, -0x1.15f62e8p+56,
			+0x1.21cfe20p+28, -0x1.0a71a70p+1, -0x1.acb1820p-25,
			-0x1.77dca08p-52, -0x1.68a25d8p-79, -0x1.ec598e0p-106,
			-0x1.fb29740p-133, -0x1.037d8d0p-161, +0x1.1d63980p-188,
			+0x1.a99cfa0p-215, +0x1.3908bf0p-241, +0x1.77bf250p-269,
			+0x1.d8ffc80p-299, -0x1.a000880p-322, +0x1.6603fc0p-350,
			-0x1.a1dce90p-377, -0x1.2fac970p-403, -0x1.2593040p-433,
			+0x1.9e4f960p-457, +0x1.36e9e90p-485, -0x1.c099620p-512,
			+0x1.7fa8b60p-538, -0x1.5b08a70p-565, -0x1.41a0e80p-596,
			-0x1.30be300p-622, -0x1.821d6b8p-646, +0x1.25d4d80p-673,
			-0x1.2813b80p-702, -0x1.ca8be00p-730, +0x1.580cc10p-754
		},
		{
			+0x0.0000000p+0, +0x1.4600000p+399, -0x1.9f246c0p+386,
			-0x1.bbead60p+360, -0x1.ec54100p+329, -0x1.c159648p+307,
			+0x1.c0db628p+280, +0x1.5993c40p+252, +0x1.c820ff0p+225,
			+0x1.458eaf0p+198, +0x1.ebbc560p+172, +0x1.b7246e0p+144,
			+0x1.d2126f0p+117, -0x1.a3ff370p+91, +0x1.2eea0a0p+64,
			-0x1.736f180p+37, -0x1.e214e40p+8, +0x1.62534e8p-17,
			-0x1.177dc80p-48, -0x1.0568a28p-71, +0x1.1213a68p-98,
			-0x1.c7eca60p-127, +0x1.7df9040p-154, +0x1.cc8eb20p-179,
			-0x1.9f2b318p-206, -0x1.6c6f780p-237, +0x1.f8bbdf8p-260,
			+0x1.283b200p-288, -0x1.da00080p-318, -0x1.fa67f00p-344,
			-0x1.0d0ee80p-372, +0x1.6b414e0p-397, -0x1.7049650p-423,
			+0x1.fb3c9f0p-450, +0x1.6136ea0p-477, -0x1.7381320p-505,
			-0x1.8680578p-530, +0x1.aea4f78p-557, -0x1.38141a0p-584,
			-0x1.d098600p-613, +0x1.ce7de28p-638, +0x1.4a4baa0p-666,
			-0x1.404a050p-692, +0x1.1f8d5d0p-720, +0x1.0ac0680p-749
		},
		{
			+0x0.0000000p+0, +0x1.8000000p+399, -0x1.d067c90p+396,
			-0x1.b1bbeb0p+368, +0x1.4fe13b0p+341, -0x1.05c1598p+315,
			+0x1.bb81b70p+287, -0x1.d6a66c0p+260, -0x1.de37df0p+233,
			-0x1.ae9c400p+200, -0x1.4214438p+180, -0x1.4f246e0p+153,
			+0x1.b8e9090p+126, +0x1.ba5c010p+99, -0x1.b6d1160p+72,
			+0x1.3a32440p+43, -0x1.80f10a0p+17, -0x1.c69dac8p-9,
			-0x1.8c11780p-36, +0x1.1afa978p-63, -0x1.12edec8p-90,
			+0x1.338e050p-117, -0x1.4ba0818p-144, -0x1.f633718p-171,
			+0x1.8e60d50p-198, -0x1.8c16c70p-225, +0x1.17e2f00p-254,
			-0x1.036be28p-279, +0x1.ff89800p-308, -0x1.0fd3400p-339,
			+0x1.fde5e00p-365, +0x1.18b5a10p-388, -0x1.64b8248p-414,
			-0x1.9302618p-441, -0x1.834f648p-468, -0x1.6173820p-497,
			+0x1.9a797f8p-522, +0x1.45aea50p-549, -0x1.14e0500p-578,
			-0x1.a0e84c0p-604, -0x1.7c63040p-631, -0x1.d6b5b40p-658,
			-0x1.59404a0p-684, -0x1.3b81cc0p-714, +0x1.7421580p-738
		}
	};

	Double X = { x };

	/*	Set ec to the unbiased exponent minus 33.  Why 33?  I do not know.
		This was in the vvsin code in double_fns.h.
	*/
	int ec = X.s.exponent - (1023+33);

	// Set k to ceiling(ec / 27) and m to residue.
	int k = (ec + 26) * (607*4) >> 16;
	int m = 27*k - ec;

	// offset is used to select a row in the reduction table.  See below.
	int offset = m >> 3;

	/*	The reduction table, TwoOverPiWithOffset, contains bits of 2/pi.
		First, all entries are scaled by 2**400 to avoid overflow/underflow
		issues.  Each entry contains 27 bits of 2/pi, except in the first two
		columns.  The first column contains zeroes because no reduction is done
		for numbers that are already small.  The entries in the second row
		start before (at a bit with higher value) the leading bit of 2/pi, so
		they contain some leading zeroes.  The sign bits form part of the 27
		bits represented.

		Each row forms a contiguous string of bits of 2/pi.  That is, adding
		all the entries with sufficient precision yields a single bit string
		representing 2/pi.  The rows differ in their starting points; row 0
		begins with the leading bit of 2/pi, and rows 1, 2, and 3 begin 8, 16,
		and 24 bits before that.
	*/

	/*	Scale x to avoid overflow in Dekker split.  This is compensated for
		in the entries in TwoOverPiWithOffset.
	*/
	x *= 0x1p-400;

	/*	Use Dekker's algorithm to split x into 26 bits and 27 bits.  This
		requires round-to-nearest mode.
	*/
	double xDekker = x * (0x1p27 + 1);
	double x0 = xDekker - (xDekker - x);
	double x1 = x - x0;

	// Get address of starting point in table.
	const double *p0 = &TwoOverPiWithOffset[offset][k];

	// Get table entries.
	const double fp0 = p0[0];
	const double fp1 = p0[1];
	const double fp2 = p0[2];
	const double fp3 = p0[3];

	// Get high bits of x * f, where f is the part of 2/pi we are using.
	const double f0 = x1 * fp0 + fp1 * x0;

	double f = x1 * fp1 + fp2 * x0;

	// Combine to do integer work.
	const double fi = f0 + f;

	static const double IntegerBias = 0x1.8p52;

	// Force the integer bits into a specific position.
	Double Fi = { fi + IntegerBias };
		/*	|fi| is less than 0x1p36, so fi + IntegerBias is well within
			[0x1p52, 0x1p53), so it has a known exponent, and the bits with
			weight 2 and 1 are the least significant bits in its significand.

			We know |fi| is less than 0x1p36 because it is the sum of x1 * fp0,
			fp1 * x0, and something small in f.  x0 has the same exponent as
			x.  Say 2**e <= |x| < 2**(e+1).  (That is the original x, before we
			scaled it by 0x1p-400.)  The table entry we look up for fp0 has
			magnitude less than 2**(400+27-27*ceiling((e-33)/27)), and fp1 is
			less, bounded by 2**(400-27*ceiling((e-33)/27)).  Including the
			scaling in x, fp1 * x0 is less than 2**(e+1-400) *
			2**(400-27*ceiling((e-33)/27)) <= 2**34.  Similarly, fp0 * x1 is
			less than 2**34, so their sum is less than 2**35.
		*/

	// Get the two least significant integer bits.
	*a = Fi.s.significand2;

	double fint = Fi.d - IntegerBias;

	const double fp4 = p0[4];
	const double fp5 = p0[5];
	const double fp6 = p0[6];

	f = f0 - fint + f;
	f += x1 * fp2 + fp3 * x0;
	f += x1 * fp3 + fp4 * x0;
	f += x1 * fp4 + fp5 * x0;
	f += x1 * fp5 + fp6 * x0;

	// Convert to radians by multiplying by pi/2.
	*xp = f * 0x3.243F6A8885A3p-1;
}


/*	static void ReduceMedium(double *xp, int *a, double x).

	Input:
	
		x is a number to be reduced modulo pi/2.  This routine requires |x| <=
		X.  X is described below.

	Output:

		*xp is set to x modulo pi/2.

		*a is set to the arc of the circle x is in.  0 <= *a < 4.

		On output, x is approximately k * 2*pi + *a * pi/2 + *xp for some
		integer k.

	Nomenclature:

		p is the period, pi/2.

		X is the maximum value x may have.  X is 0x1.000013be57a3fp19.

		InversePeriod is 1/p, rounded to the nearest double, ties to even.

		n is x * InversePeriod, rounded to the nearest double, ties to even,
		and then rounded to the nearest integer, ties to even.

		N is the maximum value n may have.  N is X * InversePeriod, rounded to
		the nearest double, ties to even, and then rounded to the nearest
		integer, ties to even.  N is 333773.

	Notes:

		In comments in this routine, unquoted expressions are mathematical and
		quoted expressions are floating-point.  Thus, "a + b" refers to the
		floating-point operation of addition, including rounding.

		"n * Period[0]" is exact for all |n| <= N.  It is inexact for n = N+1,
		so this is the source of the bound on x.  X is the greatest value for
		which |n| <= N.  (It is happenstance that Period[0] limits x; if the
		period were different, its bits might result in Period[1] causing the
		limit.  n has some other roles in the errors in this routine that might
		limit x when used with other periods or different precisions.)

		The difference between InversePeriod and 1/p can cause n to differ from
		the ideal value when x is near a multiple of p.  The result is that
		instead of producing a result inside the target interval [-p/2, p/2], a
		result slightly outside the interval is produced.  The result may be as
		much as 3.24128e-11 outside the interval.  The polynomial and the
		polynomial evaluation must be satisfactory over this extended interval.
*/
static void ReduceMedium(double *xp, int *a, double x)
{
	static const double InversePeriod =
		2 / 0x3.243f6a8885a308d313198a2e03707344ap0;

	/*	Period is an extended-precision representation of the reduction period.
		Each element except the last has the property that multiplication by
		any integer with magnitude at most N is exact.
	*/
	static const double Period[] = {
			+0x1.921FB54440000p-0,
			+0x1.68C234C4C0000p-39,
			+0x1.98A2E03707345p-77
		};

	// Estimate x / p and round to nearest integer.
	double n = x * InversePeriod + 0x1.8p52 - 0x1.8p52;

	// Record which arc of the circle x lies in.
	*a = (int) n & 3;

	// This is exact, per design of Period.
	double np0 = n * Period[0];

	// This is exact, per design of Period.
	double np1 = n * Period[1];

	/*	The error here is at most 1/2 ULP(n * Period[2]), which is
		1/2 ULP(n * +0x1.98A2E03707345p-77) = ULP(n * +0x1.98A2E03707345p-78).

		n <= N = 333773, so an absolute bound on that is ULP(333773 *
		+0x1.98A2E03707345p-78) = 0x1p-111, but we need a bound dependent on n
		to partition a proof below.

		This is called error (a).
	*/
	double np2 = n * Period[2];

	/*	Set x00 to x - n * Period[0].  np0 is very near x, so this is exact,
		per David Goldberg, _What Every Computer Scientist Should Know About
		Floating-Point Arithmetic_, Theorem 11.

		The nomenclature used here is:

			x0 is x - n * (Period[0]),
			x1 is x - n * (Period[0] + Period[1]), and
			x2 is x - n * (Period[0] + Period[1] + Period[2]).

			x00 is the first and only "word" of an extended precision
			representation of x0.

			x10 is the first "word" of x1; it has the most significant bits.
			x11 is the next "word" of x1; it has the following bits.
	*/
	double x00 = x - np0;

	/*	Subtract n * Period[1] from x0 to produce x1.  These operations yield
		an exact result, per Knuth, _The Art of Computer Programming_ 2, second
		edition, page 221, section 4.2.2, Theorem C, in the sense that x10 +
		x11 will exactly equal x0 - n * Period[1].  The first statement puts
		the high bits in x10, and the second statement determines what was
		rounded away in x10 and puts it in x11.

		Knuth assumes |np1| <= |x00|.  This is often the case, but perhaps
		cancellation has reduced the magnitude of x00.  Hoever, if |x00| <
		|np1|, this arithmetic is exact, because x00's least significant bit is
		at least as large as the ULP of Period[1].
	*/
	double x10 = x00 - np1;
	double x11 = x00 - x10 - np1;

	/*	This comment demonstrates the rounding error in the following
		statement, "t = x11 - np2", is tiny relative to x10.

		This is called error (b).

		Let LSB(x) be the weight of the least significant bit set in a
		floating-point number x.  E.g, LSB(1.25) is 1/4, although ULP(1.25) is
		2**-52.
		
		Here, x10 and x11 are each multiples of LSB(Period[1]).  If |x10| <
		2**53 LSB(Period[1]), then x11 is zero, because all the bits of x10 +
		x11 fit into x10.  In that case, "x11 - np2" is exact.

		Otherwise, 2**53 LSB(Period[1]) <= |x10|, so 2 LSB(Period[1]) <=
		ULP(x10).  LSB(Period[1]) = 0x1p-73, so 0x1p-72 <= ULP(x10).

		By construction of x10 and x11, |x11| <= 1/2 ULP(x10).
		
		|np2| <= |"n * Period[2]"| <= |n * Period[2] * (1+2**-53)| = 333733 *
		0x1.98A2E03707345p-77 * (1+2**-53) < 0x1.0426p-58.

		|"x11 - np2"| <= 2*max(|x11|, |np2|) <= 2*max(1/2 ULP(x10),
		0x1.0426p-58) = max(ULP(x10), 0x1.0426p-57).  The rounding error in
		"x11 - np2" is at most 2**-53 times this, so it is at most max(2**-53
		ULP(x10), 0x1p-110).  Since ULP(x10) is at least 0x1p-72, the rounding
		error is at most 0x1p-38 ULP(x10).
	*/
	double t = x11 - np2;

	/*	Add t to x1 to produce x2.  As before, we could use Knuth's technique
		to produce an exact sum, x20 + x21 = x10 + t.  For now, we only return
		x20.  When we want extended precision, we could return x21 as well.
		(If so, be careful to show that |t| <= |x10|, as required for Knuth's
		technique, or otherwise show the arithmetic is sufficiently accurate.)

		The difference between x20 and x10 + t is called error (c).  It is at
		most 1/2 ULP(x20).
	*/
	double x20 = x10 + t;
	// double x21 = x10 - x20 + t;

	*xp = x20;

	/*	Having subtracted n * (Period[0] + Period[1] + Period[2]) from x, we
		have an error caused by the difference between p and Period[0] +
		Period[1] + Period[2], which is less than 0x1.6fdb2p-131.  So the error
		is less than n * 0x1.6fdb2p-131.
	   
		n <= N = 333773, so an absolute bound on that is 333773 *
		0x1.6fdb2p-131 < 0x1.d45fp-113, but we need a bound dependent on n to
		partition a proof below.

		This is called error (d).

		Our reduction has four errors:

			(a), which is at most ULP(n * +0x1.98A2E03707345p-78) <= 0x1p-111.
			(b), which is at most 0x1p-38 ULP(x10).
			(c), which is at most 1/2 ULP(x20).
			(d), which is at most n * 0x1.6fdb2p-131 < 0x1.d45fp-113.

		From the discussion of (b), if x11 is not zero, then 0x1p-72 <=
		ULP(x10).  Then x10 dominates the result, because np2 is so small that
		subtracting it from x1 to produce x2 cannot reduce the exponent by more
		than one.  So the errors (a), (b), and (d) are tiny relative to x10,
		and the total error is very nearly (c), 1/2 ULP(x20).

		However, if x11 is zero, then (b) is zero, and we only need to consider
		(a), (c), and (d).  We partition this into two cases.

		Case 0:  |x| < 0x1p12.

			The double-precision floating-point number closest to a multiple of
			pi/2 in that interval is 0x1.6c6cbc45dc8dep6 (according to Maple
			code from Muller, _Elementary Functions_), and it is about
			0x1.6d61b58c99c43p-60 away from a multiple of pi/2.

			n is at most 0x1p12 * InversePi, rounded to a double, so n < 2608.
			Then:

				(a) is at most ULP(2608 * +0x1.98A2E03707345p-78) = 0x1p-119.
				(b) is zero.
				(c) is at most 1/2 ULP(x20).
				(d) is at most 2608 * 0x1.6fdb2p-131 < 0x1p-119.

			The final result might be as low as about 0x1.6d6p-60.  That has an
			ULP of 0x1p-113, so (a) and (d) are small relative to it, and the
			total error is nearly (c), 1/2 ULP(x20).

		Case 1:  0x1p12 <= |x|.

			We still have |x| <= 0x1.000013be57a3fp19.  The double-precision
			floating-point number closest to a multiple of pi/2 in that set is
			0x1.6c6cbc45dc8dep11 (Muller again), and it is about
			0x1.6d61b58c99c43p-55 away from a multiple of pi/2.

			Then:

				(a) is at most ULP(333773 * +0x1.98A2E03707345p-78) = 0x1p-111.
				(b) is zero.
				(c) is at most 1/2 ULP(x20).
				(d) is at most 333773 * 0x1.6fdb2p-131 < 0x1.d45fp-113.

			The final result might be as low as about 0x1.6d6p-55.  That has an
			ULP of 0x1p-108, so (a) and (d) are small relative to it, and the
			total error is nearly (c), 1/2 ULP(x20).

		Therefore, the total error is never much more than 1/2 ULP of the value
		returned in *xp.
	*/
}


/*	static double sinp(double r)
	
	Return sine(r) using only a polynomial, no reduction.

	Input:
	
		r, with |r| < pi/4 + 3.24128e-11.

	Output:

		Approximately sine(r) is returned.
*/
static double sinp(double r)
{
	double rr = r * r;

	/*	Derived from Cephes Math Library Release 2.8:  June, 2000.  Maple's
		infnorm routine says this polynomial is within .0688 ULP of
		sine(r) for |r| < pi/4 + 3.24128e-11.
	*/
	return  r + r * rr * ((((((
		+ 1.58962301576546568060E-10) * rr
		- 2.50507477628578072866E-8 ) * rr
		+ 2.75573136213857245213E-6 ) * rr
		- 1.98412698295895385996E-4 ) * rr
		+ 8.33333333332211858878E-3 ) * rr
		- 1.66666666666666307295E-1 );
}


/*	static double cosp(double r)
	
	Return cosine(r) using only a polynomial, no reduction.

	Input:
	
		r, with |r| < pi/4 + 3.24128e-11.

	Output:

		Approximately cosine(r) is returned.
*/
static double cosp(double r)
{
	double rr = r * r;

	/*	Derived from Cephes Math Library Release 2.8:  June, 2000.  Maple's
		infnorm routine says this polynomial is within .00709 ULP of
		cosine(r) for |r| < pi/4 + 3.24128e-11.
	*/
	return (((((((
		- 1.13585365213876817300E-11) * rr
		+ 2.08757008419747316778E-9 ) * rr
		- 2.75573141792967388112E-7 ) * rr
		+ 2.48015872888517045348E-5 ) * rr
		- 1.38888888888730564116E-3 ) * rr
		+ 4.16666666666665929218E-2 ) * rr
		-  .5                       ) * rr
		+ 1;
}


/*	double sin(double x).

	Notes:

		Citations in parentheses below indicate the source of a requirement.

		"C" stands for ISO/IEC 9899:TC2.

		The Open Group specification (IEEE Std 1003.1, 2004 edition) adds no
		requirements since it defers to C and requires errno behavior only if
		we choose to support it by arranging for "math_errhandling &
		MATH_ERRNO" to be non-zero, which we do not.

	Return value:

		For +/- zero, return zero with same sign (C F.9 12 and F.9.1.6).

		For +/- infinity, return a NaN (C F.9.1.6).

		For a NaN, return the same NaN (C F.9 11 and 13).

		Otherwise:

			If the rounding mode is round-to-nearest, return sine(x) within a
			few ULP.  The maximum error of this routine is not precisely
			known.  The maximum error of the reduction might be around 3 ULP,
			although this is partly a guess.  The polynomials have small
			errors.  The polynomial evaluation might have an error under 1
			ULP.  So the worst error for this routine might be under 4 ULP.
		  
			Not currently implemented:  In other rounding modes, return sine(x)
			possibly with slightly worse error, not necessarily honoring the
			rounding mode (Ali Sazegari narrowing C F.9 10).

		All results are in [-1, 1].

	Exceptions:

		Raise underflow for a denormal result (C F.9 7 and Draft Standard for
		Floating-Point Arithmetic P754 Draft 1.2.5 9.5).  If the input is the
		smallest normal, underflow may or may not be raised.  This is stricter
		than the older 754 standard.

		May or may not raise inexact, even if the result is exact (C F.9 8).

		Raise invalid if the input is a signalling NaN (C 5.2.4.2.2 3, in spite
		of C 4.2.1) or an infinity (C F.9.1.6) but not if the input is a quiet
		NaN (C F.9 11).

		May not raise exceptions otherwise (C F.9 9).

	Properties:

		Desired to be monotonic.  Not yet proven!
*/
double sin(double x)
{
	// Get |x| in form used by the bound-check operations for a key.
	BoundKeyType xk = Key(x);

	double r;
	int j0;

	/*	Handle denormal numbers and zero here to generate underflow for
		denormals numbers and to avoid generating inexact for zero.
	*/
	if (Within(xk, BoundDenormal))
		return x * (1 - 0x1p-53);

	/*	For small numbers, we can return x for sine(x).  Unfortunately,
		adding this check slows down the other cases, which may be more
		frequent.
	*/
	else if (Within(xk, BoundTrivialSin))
	{
		/*	Make t0 volatile to force compiler to fetch it at run-time
			rather than optimizing away the multiplication.
		*/
		static volatile const double t0 = 1/3.;

		/*	Get t0 once.  If we wrote "t0*t0", the compiler would load it
			twice, since it is volatile.
		*/
		const double t1 = t0;

		/*	Perform a multiplication and pass it into an assembly construct to
			prevent the compiler from knowing we do not use the result and
			optimizing away the multiplication.
		*/
		__asm__("" : : "X" (t1*t1));

		// Return the floating-point number nearest sine(x), which is x.
		return x;
	}

	// If |x| is small, no reduction is necessary.
	else if (Within(xk, BoundPolynomial))
		r = x, j0 = 0;
		/*	The interval for this could be enlarged by examining the sine
			polynomial and figuring out how big r can get before the error is
			too large.
		*/

	// If |x| is medium, we can use a fast reduction routine.
	else if (Within(xk, BoundMedium))
		ReduceMedium(&r, &j0, x);

	// Otherwise, we need the full, slow reduction routine.
	else if (Within(xk, BoundFull))
		ReduceFull(&r, &j0, x);

	else
		return x-x;

	switch (j0)
	{
		default:
		case 0:	return +sinp(r);
		case 1:	return +cosp(r);
		case 2:	return -sinp(r);
		case 3:	return -cosp(r);
	}
}


/*	double cos(double x).

	Notes:

		Citations in parentheses below indicate the source of a requirement.

		"C" stands for ISO/IEC 9899:TC2.

		The Open Group specification (IEEE Std 1003.1, 2004 edition) adds no
		requirements since it defers to C and requires errno behavior only if
		we choose to support it by arranging for "math_errhandling &
		MATH_ERRNO" to be non-zero, which we do not.

	Return value:

		For +/- infinity, return a NaN (C F.9.1.6).

		For a NaN, return the same NaN (C F.9 11 and 13).

		Otherwise:

			If the rounding mode is round-to-nearest, return cosine(x) within a
			few ULP.  The maximum error of this routine is not precisely
			known.  The maximum error of the reduction might be around 3 ULP,
			although this is partly a guess.  The polynomials have small
			errors.  The polynomial evaluation might have an error under 1
			ULP.  So the worst error for this routine might be under 4 ULP.
		  
			Not currently implemented:  In other rounding modes, return
			cosine(x) possibly with slightly worse error, not necessarily
			honoring the rounding mode (Ali Sazegari narrowing C F.9 10).

		All results are in [-1, 1].

	Exceptions:

		Raise underflow for a denormal result (C F.9 7 and Draft Standard for
		Floating-Point Arithmetic P754 Draft 1.2.5 9.5).  If the input is the
		smallest normal, underflow may or may not be raised.  This is stricter
		than the older 754 standard.

		May or may not raise inexact, even if the result is exact (C F.9 8).

		Raise invalid if the input is a signalling NaN (C 5.2.4.2.2 3, in spite
		of C 4.2.1) or an infinity (C F.9.1.5) but not if the input is a quiet
		NaN (C F.9 11).

		May not raise exceptions otherwise (C F.9 9).

	Properties:

		Desired to be monotonic.  Not yet proven!
*/
double cos(double x)
{
	// Get |x| in form used by the bound-check operations for a key.
	BoundKeyType xk = Key(x);

	double r;
	int j0;

	// Avoid generating inexact for zero.
	if (x == 0)
		return 1;

	/*	For small numbers, we can return 1 for cosine(x).  Unfortunately,
		adding this check slows down the other cases, which may be more
		frequent.  It greatly speeds up cases where |x| < 0x1p-492, by
		avoiding arithmetic with denormals.
	*/
	else if (Within(xk, BoundTrivialCos))
	{
		/*	Make t0 volatile to force compiler to fetch it at run-time
			rather than optimizing away the multiplication.
		*/
		static volatile const double t0 = 1/3.;

		/*	Get t0 once.  If we wrote "t0*t0", the compiler would load it
			twice, since it is volatile.
		*/
		const double t1 = t0;

		/*	Perform a multiplication and pass it into an assembly construct to
			prevent the compiler from knowing we do not use the result and
			optimizing away the multiplication.
		*/
		__asm__("" : : "X" (t1*t1));

		// Return the floating-point number nearest cosine(x), which is 1.
		return 1;
	}

	// If |x| is small, no reduction is necessary.
	else if (Within(xk, BoundPolynomial))
		r = x, j0 = 0;
		/*	The interval for this could be enlarged by examining the cosine
			polynomial and figuring out how big r can get before the error is
			too large.
		*/

	// If |x| is medium, we can use a fast reduction routine.
	else if (Within(xk, BoundMedium))
		ReduceMedium(&r, &j0, x);

	// Otherwise, we need the full, slow reduction routine.
	else if (Within(xk, BoundFull))
		ReduceFull(&r, &j0, x);

	else
		return x-x;

	switch (j0)
	{
		default:
		case 0:	return +cosp(r);
		case 1:	return -sinp(r);
		case 2:	return -cosp(r);
		case 3:	return +sinp(r);
	}
}


/*	double tan(double x).

	Notes:

		Citations in parentheses below indicate the source of a requirement.

		"C" stands for ISO/IEC 9899:TC2.

		The Open Group specification (IEEE Std 1003.1, 2004 edition) adds no
		requirements since it defers to C and requires errno behavior only if
		we choose to support it by arranging for "math_errhandling &
		MATH_ERRNO" to be non-zero, which we do not.

	Return value:

		For +/- zero, return zero with same sign (C F.9 12 and F.9.1.7).

		For +/- infinity, return a NaN (C F.9.1.7).

		For a NaN, return the same NaN (C F.9 11 and 13).

		Otherwise:

			If the rounding mode is round-to-nearest, return tangent(x) within
			a few ULP.  The maximum error of this routine is not precisely
			known.  The maximum error of the reduction might be around 3 ULP,
			although this is partly a guess.  The polynomials have small
			errors.  The polynomial evaluation might have an error under 1
			ULP.  How the final division affects error has not been considered
			yet.  4.55 ULP has been observed.
		  
			Not currently implemented:  In other rounding modes, return
			tangent(x) possibly with slightly worse error, not necessarily
			honoring the rounding mode (Ali Sazegari narrowing C F.9 10).

	Exceptions:

		Raise underflow for a denormal result (C F.9 7 and Draft Standard for
		Floating-Point Arithmetic P754 Draft 1.2.5 9.5).  If the input is the
		smallest normal, underflow may or may not be raised.  This is stricter
		than the older 754 standard.

		May or may not raise inexact, even if the result is exact (C F.9 8).

		Raise invalid if the input is a signalling NaN (C 5.2.4.2.2 3, in spite
		of C 4.2.1) or an infinity (C F.9.1.7) but not if the input is a quiet
		NaN (C F.9 11).

		May not raise exceptions otherwise (C F.9 9).

	Properties:

		Desired to be monotonic.  Not yet proven!
*/
double tan(double x)
{
	// Get |x| in form used by the bound-check operations for a key.
	BoundKeyType xk = Key(x);

	double r;
	int j0;

	/*	Handle denormal numbers and zero here to generate underflow for
		denormals numbers and to avoid generating inexact for zero.
	*/
	if (Within(xk, BoundDenormal))
		return x * (1 - 0x1p-53);

	/*	For small numbers, we can return x for tangent(x).  Unfortunately,
		adding this check slows down the other cases, which may be more
		frequent.
	*/
	else if (Within(xk, BoundTrivialTan))
	{
		/*	Make t0 volatile to force compiler to fetch it at run-time
			rather than optimizing away the multiplication.
		*/
		static volatile const double t0 = 1/3.;

		/*	Get t0 once.  If we wrote "t0*t0", the compiler would load it
			twice, since it is volatile.
		*/
		const double t1 = t0;

		/*	Perform a multiplication and pass it into an assembly construct to
			prevent the compiler from knowing we do not use the result and
			optimizing it away.
		*/
		__asm__("" : : "X" (t1*t1));

		// Return the floating-point number nearest tangent(x), which is x.
		return x;
	}

	// If |x| is small, no reduction is necessary.
	else if (Within(xk, BoundPolynomial))
		r = x, j0 = 0;
		/*	The interval for this could be enlarged by examining the
			polynomials and figuring out how big r can get before the error is
			too large.
		*/

	// If |x| is medium, we can use a fast reduction routine.
	else if (Within(xk, BoundMedium))
		ReduceMedium(&r, &j0, x);

	// Otherwise, we need the full, slow reduction routine.
	else if (Within(xk, BoundFull))
		ReduceFull(&r, &j0, x);

	else
		return x-x;

	switch (j0)
	{
		default:
		case 0:	case 2: return + sinp(r) / cosp(r);
		case 1:	case 3: return - cosp(r) / sinp(r);
	}
}