Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Fetching contributors…

Cannot retrieve contributors at this time

369 lines (323 sloc) 9.929 kb
#include "doomtype.h"
#include "doomdef.h"
#include "x86.h"
extern "C"
{
CPUInfo CPU;
}
#if !defined(__amd64__) && !defined(__i386__) && !defined(_M_IX86) && !defined(_M_X64)
void CheckCPUID(CPUInfo *cpu)
{
memset(cpu, 0, sizeof(*cpu));
cpu->DataL1LineSize = 32; // Assume a 32-byte cache line
}
void DumpCPUInfo(const CPUInfo *cpu)
{
}
#else
#ifdef _MSC_VER
#include <intrin.h>
#endif
#include <mmintrin.h>
#include <emmintrin.h>
#ifdef __GNUC__
#if defined(__i386__) && defined(__PIC__)
// %ebx may by the PIC register. */
#define __cpuid(output, func) \
__asm__ __volatile__("xchgl\t%%ebx, %1\n\t" \
"cpuid\n\t" \
"xchgl\t%%ebx, %1\n\t" \
: "=a" ((output)[0]), "=r" ((output)[1]), "=c" ((output)[2]), "=d" ((output)[3]) \
: "a" (func));
#else
#define __cpuid(output, func) __asm__ __volatile__("cpuid" : "=a" ((output)[0]),\
"=b" ((output)[1]), "=c" ((output)[2]), "=d" ((output)[3]) : "a" (func));
#endif
#endif
void CheckCPUID(CPUInfo *cpu)
{
int foo[4];
unsigned int maxext;
memset(cpu, 0, sizeof(*cpu));
cpu->DataL1LineSize = 32; // Assume a 32-byte cache line
#if !defined(_M_IX86) && !defined(__i386__) && !defined(_M_X64) && !defined(__amd64__)
return;
#else
#if defined(_M_IX86) || defined(__i386__)
// Old 486s do not have CPUID, so we must test for its presence.
// This code is adapted from the samples in AMD's document
// entitled "AMD-K6™ MMX Processor Multimedia Extensions."
#ifndef __GNUC__
__asm
{
pushfd // save EFLAGS
pop eax // store EFLAGS in EAX
mov ecx,eax // save in ECX for later testing
xor eax,0x00200000 // toggle bit 21
push eax // put to stack
popfd // save changed EAX to EFLAGS
pushfd // push EFLAGS to TOS
pop eax // store EFLAGS in EAX
cmp eax,ecx // see if bit 21 has changed
jne haveid // if no change, then no CPUID
}
return;
haveid:
#else
int oldfd, newfd;
__asm__ __volatile__("\t"
"pushf\n\t"
"popl %0\n\t"
"movl %0,%1\n\t"
"xorl $0x200000,%0\n\t"
"pushl %0\n\t"
"popf\n\t"
"pushf\n\t"
"popl %0\n\t"
: "=r" (newfd), "=r" (oldfd));
if (oldfd == newfd)
{
return;
}
#endif
#endif
// Get vendor ID
__cpuid(foo, 0);
cpu->dwVendorID[0] = foo[1];
cpu->dwVendorID[1] = foo[3];
cpu->dwVendorID[2] = foo[2];
if (foo[1] == MAKE_ID('A','u','t','h') &&
foo[3] == MAKE_ID('e','n','t','i') &&
foo[2] == MAKE_ID('c','A','M','D'))
{
cpu->bIsAMD = true;
}
// Get features flags and other info
__cpuid(foo, 1);
cpu->FeatureFlags[0] = foo[1]; // Store brand index and other stuff
cpu->FeatureFlags[1] = foo[2]; // Store extended feature flags
cpu->FeatureFlags[2] = foo[3]; // Store feature flags
// If CLFLUSH instruction is supported, get the real cache line size.
if (foo[3] & (1 << 19))
{
cpu->DataL1LineSize = (foo[1] & 0xFF00) >> (8 - 3);
}
cpu->Stepping = foo[0] & 0x0F;
cpu->Type = (foo[0] & 0x3000) >> 12; // valid on Intel only
cpu->Model = (foo[0] & 0xF0) >> 4;
cpu->Family = (foo[0] & 0xF00) >> 8;
if (cpu->Family == 15)
{ // Add extended family.
cpu->Family += (foo[0] >> 20) & 0xFF;
}
if (cpu->Family == 6 || cpu->Family == 15)
{ // Add extended model ID.
cpu->Model |= (foo[0] >> 12) & 0xF0;
}
// Check for extended functions.
__cpuid(foo, 0x80000000);
maxext = (unsigned int)foo[0];
if (maxext >= 0x80000004)
{ // Get processor brand string.
__cpuid((int *)&cpu->dwCPUString[0], 0x80000002);
__cpuid((int *)&cpu->dwCPUString[4], 0x80000003);
__cpuid((int *)&cpu->dwCPUString[8], 0x80000004);
}
if (cpu->bIsAMD)
{
if (maxext >= 0x80000005)
{ // Get data L1 cache info.
__cpuid(foo, 0x80000005);
cpu->AMD_DataL1Info = foo[2];
}
if (maxext >= 0x80000001)
{ // Get AMD-specific feature flags.
__cpuid(foo, 0x80000001);
cpu->AMDStepping = foo[0] & 0x0F;
cpu->AMDModel = (foo[0] & 0xF0) >> 4;
cpu->AMDFamily = (foo[0] & 0xF00) >> 8;
if (cpu->AMDFamily == 15)
{ // Add extended model and family.
cpu->AMDFamily += (foo[0] >> 20) & 0xFF;
cpu->AMDModel |= (foo[0] >> 12) & 0xF0;
}
cpu->FeatureFlags[3] = foo[3]; // AMD feature flags
}
}
#endif
}
void DumpCPUInfo(const CPUInfo *cpu)
{
char cpustring[4*4*3+1];
// Why does Intel right-justify this string (on P4s)
// or add extra spaces (on Cores)?
const char *f = cpu->CPUString;
char *t;
// Skip extra whitespace at the beginning.
while (*f == ' ')
{
++f;
}
// Copy string to temp buffer, but condense consecutive
// spaces to a single space character.
for (t = cpustring; *f != '\0'; ++f)
{
if (*f == ' ' && *(f - 1) == ' ')
{
continue;
}
*t++ = *f;
}
*t = '\0';
if (cpu->VendorID[0])
{
Printf("CPU Vendor ID: %s\n", cpu->VendorID);
if (cpustring[0])
{
Printf(" Name: %s\n", cpustring);
}
if (cpu->bIsAMD)
{
Printf(" Family %d (%d), Model %d, Stepping %d\n",
cpu->Family, cpu->AMDFamily, cpu->AMDModel, cpu->AMDStepping);
}
else
{
Printf(" Family %d, Model %d, Stepping %d\n",
cpu->Family, cpu->Model, cpu->Stepping);
}
Printf(" Features:");
if (cpu->bMMX) Printf(" MMX");
if (cpu->bMMXPlus) Printf(" MMX+");
if (cpu->bSSE) Printf(" SSE");
if (cpu->bSSE2) Printf(" SSE2");
if (cpu->bSSE3) Printf(" SSE3");
if (cpu->bSSSE3) Printf(" SSSE3");
if (cpu->bSSE41) Printf(" SSE4.1");
if (cpu->bSSE42) Printf(" SSE4.2");
if (cpu->b3DNow) Printf(" 3DNow!");
if (cpu->b3DNowPlus) Printf(" 3DNow!+");
Printf ("\n");
}
}
#if 0
// Compiler output for this function is crap compared to the assembly
// version, which is why it isn't used.
void DoBlending_MMX2(const PalEntry *from, PalEntry *to, int count, int r, int g, int b, int a)
{
__m64 blendcolor;
__m64 blendalpha;
__m64 zero;
__m64 blending256;
__m64 color1;
__m64 color2;
zero = _mm_setzero_si64();
#ifndef __GNUC__
blending256.m64_i64 = 0x10001000100;
#else
blending256 = (__m64)0x10001000100ll;
#endif
blendcolor = _mm_unpacklo_pi8(_m_from_int((r << 16) | (g << 8) | b), zero); // 000000RR 00GG00BB
blendalpha = _mm_unpacklo_pi8(_m_from_int((a << 16) | (a << 8) | a), zero); // 000000AA 00AA00AA
blendcolor = _mm_mullo_pi16(blendcolor, blendalpha); // premultiply blend by alpha
blendalpha = _mm_subs_pu16(blending256, blendalpha); // one minus alpha
// Do two colors per iteration: Count must be even
for (count >>= 1; count > 0; --count)
{
color1 = *(__m64 *)from; // 00r2g2b2 00r1g1b1
from += 2;
color2 = _mm_unpackhi_pi8(color1, zero); // 000000r2 00g200b2
color1 = _mm_unpacklo_pi8(color1, zero); // 000000r1 00g100b1
color1 = _mm_mullo_pi16(blendalpha, color1); // 0000r1rr g1ggb1bb
color2 = _mm_mullo_pi16(blendalpha, color2); // 0000r2rr g2ggb2bb
color1 = _mm_adds_pu16(blendcolor, color1);
color2 = _mm_adds_pu16(blendcolor, color2);
color1 = _mm_srli_pi16(color1, 8);
color2 = _mm_srli_pi16(color2, 8);
*(__m64 *)to = _mm_packs_pu16(color1, color2); // 00r2g2b2 00r1g1b1
to += 2;
}
_mm_empty();
}
#endif
#ifdef X86_ASM
extern "C" void STACK_ARGS DoBlending_MMX(const PalEntry *from, PalEntry *to, int count, int r, int g, int b, int a);
#endif
void DoBlending_SSE2(const PalEntry *from, PalEntry *to, int count, int r, int g, int b, int a)
{
__m128i blendcolor;
__m128i blendalpha;
__m128i zero;
__m128i blending256;
__m128i color1;
__m128i color2;
size_t unaligned;
unaligned = ((size_t)from | (size_t)to) & 0xF;
#ifdef X86_ASM
// For unaligned accesses, the assembly MMX version is slightly faster.
// Note that using unaligned SSE loads and stores is still faster than
// the compiler-generated MMX version.
if (unaligned)
{
DoBlending_MMX(from, to, count, r, g, b, a);
return;
}
#endif
#if defined(__amd64__) || defined(_M_IX64)
long long color;
blending256 = _mm_set_epi64x(0x10001000100ll, 0x10001000100ll);
color = ((long long)r << 32) | (g << 16) | b;
blendcolor = _mm_set_epi64x(color, color);
color = ((long long)a << 32) | (a << 16) | a;
blendalpha = _mm_set_epi64x(color, color);
#else
int color;
blending256 = _mm_set_epi32(0x100, 0x1000100, 0x100, 0x1000100);
color = (g << 16) | b;
blendcolor = _mm_set_epi32(r, color, r, color);
color = (a << 16) | a;
blendalpha = _mm_set_epi32(a, color, a, color);
#endif
blendcolor = _mm_mullo_epi16(blendcolor, blendalpha); // premultiply blend by alpha
blendalpha = _mm_subs_epu16(blending256, blendalpha); // one minus alpha
zero = _mm_setzero_si128();
#ifndef X86_ASM
if (unaligned)
{
for (count >>= 2; count > 0; --count)
{
color1 = _mm_loadu_si128((__m128i *)from);
from += 4;
color2 = _mm_unpackhi_epi8(color1, zero);
color1 = _mm_unpacklo_epi8(color1, zero);
color1 = _mm_mullo_epi16(blendalpha, color1);
color2 = _mm_mullo_epi16(blendalpha, color2);
color1 = _mm_adds_epu16(blendcolor, color1);
color2 = _mm_adds_epu16(blendcolor, color2);
color1 = _mm_srli_epi16(color1, 8);
color2 = _mm_srli_epi16(color2, 8);
_mm_storeu_si128((__m128i *)to, _mm_packus_epi16(color1, color2));
to += 4;
}
}
else
#endif
{
for (count >>= 2; count > 0; --count)
{
color1 = _mm_load_si128((__m128i *)from);
from += 4;
color2 = _mm_unpackhi_epi8(color1, zero);
color1 = _mm_unpacklo_epi8(color1, zero);
color1 = _mm_mullo_epi16(blendalpha, color1);
color2 = _mm_mullo_epi16(blendalpha, color2);
color1 = _mm_adds_epu16(blendcolor, color1);
color2 = _mm_adds_epu16(blendcolor, color2);
color1 = _mm_srli_epi16(color1, 8);
color2 = _mm_srli_epi16(color2, 8);
_mm_store_si128((__m128i *)to, _mm_packus_epi16(color1, color2));
to += 4;
}
}
}
#endif
Jump to Line
Something went wrong with that request. Please try again.