Skip to content
Permalink
Browse files

Optimizations, 20-30% speed up

  • Loading branch information...
rawrunprotected authored and rawrunprotected committed Apr 28, 2018
1 parent 5699c2c commit d2fa65fd6b5df8b94d8d4b02914354c9aca2d46d
@@ -15,3 +15,6 @@
*.user
*.idb
*.ilk
*.cfg
*.vtss
*.aux
@@ -23,6 +23,6 @@ Requirements
License
============

[!["Creative Commons Licence"](https://i.creativecommons.org/l/by/4.0/80x15.png)](http://creativecommons.org/licenses/by/4.0/)

[!["Creative Commons Licence"](https://i.creativecommons.org/l/by/4.0/80x15.png)](http://creativecommons.org/licenses/by/4.0/)

This work is licensed under a [Creative Commons Attribution 4.0 International License](http://creativecommons.org/licenses/by/4.0/).
@@ -83,8 +83,8 @@ int APIENTRY wWinMain(HINSTANCE hInstance, HINSTANCE, LPWSTR, int)

g_rasterizer = std::make_unique<Rasterizer>(WINDOW_WIDTH, WINDOW_HEIGHT);

// Pad to a multiple of 4 quads
while (indices.size() % 16 != 0)
// Pad to a multiple of 8 quads
while (indices.size() % 32 != 0)
{
indices.push_back(indices[0]);
}
@@ -100,7 +100,7 @@ int APIENTRY wWinMain(HINSTANCE hInstance, HINSTANCE, LPWSTR, int)
quadAabbs.push_back(aabb);
}

auto batchAssignment = SurfaceAreaHeuristic::generateBatches(quadAabbs, 512, 4);
auto batchAssignment = SurfaceAreaHeuristic::generateBatches(quadAabbs, 512, 8);

Aabb refAabb;
for (auto v : vertices)
@@ -204,7 +204,7 @@ LRESULT CALLBACK WndProc(HWND hWnd, UINT message, WPARAM wParam, LPARAM lParam)
float rasterTime = std::chrono::duration<float, std::milli>(raster_end - raster_start).count();
static float avgRasterTime = rasterTime;

float alpha = 0.035f;
float alpha = 0.0035f;
avgRasterTime = rasterTime * alpha + avgRasterTime * (1.0f - alpha);

int fps = int(1000.0f / avgRasterTime);
@@ -248,7 +248,7 @@ LRESULT CALLBACK WndProc(HWND hWnd, UINT message, WPARAM wParam, LPARAM lParam)
auto now = std::chrono::high_resolution_clock::now();

XMVECTOR right = XMVector3Normalize(XMVector3Cross(g_cameraDirection, g_upVector));
float translateSpeed = 0.005f * std::chrono::duration<float, std::milli>(now - lastPaint).count();
float translateSpeed = 0.01f * std::chrono::duration<float, std::milli>(now - lastPaint).count();
float rotateSpeed = 0.002f * std::chrono::duration<float, std::milli>(now - lastPaint).count();

lastPaint = now;
@@ -41,7 +41,7 @@ std::unique_ptr<Occluder> Occluder::bake(const std::vector<__m128>& vertices, __
__m128 normal = quadNormals[j];

__m128 bestDistance = _mm_set1_ps(-std::numeric_limits<float>::infinity());
int bestCentroid = -1;
uint32_t bestCentroid = 0;
for (int k = 0; k < centroids.size(); ++k)
{
__m128 distance = _mm_dp_ps(centroids[k], normal, 0x7F);
@@ -78,7 +78,7 @@ std::unique_ptr<Occluder> Occluder::bake(const std::vector<__m128>& vertices, __
}

std::vector<__m128> orderedVertices;
for (int k = 0; k < centroids.size(); ++k)
for (uint32_t k = 0; k < centroids.size(); ++k)
{
for (int j = 0; j < vertices.size() / 4; ++j)
{
@@ -102,33 +102,58 @@ std::unique_ptr<Occluder> Occluder::bake(const std::vector<__m128>& vertices, __

__m128 half = _mm_set1_ps(0.5f);

for (size_t i = 0; i < orderedVertices.size(); i += 16)
occluder->m_packetCount = 0;
occluder->m_vertexData = reinterpret_cast<__m256i*>(_aligned_malloc(orderedVertices.size() * 4, 32));

for (size_t i = 0; i < orderedVertices.size(); i += 32)
{
__m128i v[8];

for (auto j = 0; j < 4; ++j)
{
// Transform into [0,1] space relative to bounding box
__m128 v0 = _mm_mul_ps(_mm_sub_ps(orderedVertices[i + j + 0], refMin), invExtents);
__m128 v1 = _mm_mul_ps(_mm_sub_ps(orderedVertices[i + j + 4], refMin), invExtents);
__m128 v2 = _mm_mul_ps(_mm_sub_ps(orderedVertices[i + j + 8], refMin), invExtents);
__m128 v0 = _mm_mul_ps(_mm_sub_ps(orderedVertices[i + j + 0], refMin), invExtents);
__m128 v1 = _mm_mul_ps(_mm_sub_ps(orderedVertices[i + j + 4], refMin), invExtents);
__m128 v2 = _mm_mul_ps(_mm_sub_ps(orderedVertices[i + j + 8], refMin), invExtents);
__m128 v3 = _mm_mul_ps(_mm_sub_ps(orderedVertices[i + j + 12], refMin), invExtents);
__m128 v4 = _mm_mul_ps(_mm_sub_ps(orderedVertices[i + j + 16], refMin), invExtents);
__m128 v5 = _mm_mul_ps(_mm_sub_ps(orderedVertices[i + j + 20], refMin), invExtents);
__m128 v6 = _mm_mul_ps(_mm_sub_ps(orderedVertices[i + j + 24], refMin), invExtents);
__m128 v7 = _mm_mul_ps(_mm_sub_ps(orderedVertices[i + j + 28], refMin), invExtents);

// Transpose into [xxxx][yyyy][zzzz][wwww]
_MM_TRANSPOSE4_PS(v0, v1, v2, v3);
_MM_TRANSPOSE4_PS(v4, v5, v6, v7);

// Scale, add 0.5 and truncate to int (i.e. round to nearest for non-negative values)
v0 = _mm_fmadd_ps(v0, scalingX, half);
v1 = _mm_fmadd_ps(v1, scalingY, half);
v2 = _mm_fmadd_ps(v2, scalingZ, half);

__m128i X = _mm_cvttps_epi32(v0);
__m128i Y = _mm_cvttps_epi32(v1);
__m128i Z = _mm_cvttps_epi32(v2);
v4 = _mm_fmadd_ps(v4, scalingX, half);
v5 = _mm_fmadd_ps(v5, scalingY, half);
v6 = _mm_fmadd_ps(v6, scalingZ, half);

__m128i X0 = _mm_sub_epi32(_mm_cvttps_epi32(v0), _mm_set1_epi32(1024));
__m128i Y0 = _mm_cvttps_epi32(v1);
__m128i Z0 = _mm_cvttps_epi32(v2);

__m128i X1 = _mm_sub_epi32(_mm_cvttps_epi32(v4), _mm_set1_epi32(1024));
__m128i Y1 = _mm_cvttps_epi32(v5);
__m128i Z1 = _mm_cvttps_epi32(v6);

// Pack to 11/11/10 format
__m128i XYZ = _mm_or_si128(_mm_slli_epi32(X, 21), _mm_or_si128(_mm_slli_epi32(Y, 10), Z));
__m128i XYZ0 = _mm_or_si128(_mm_slli_epi32(X0, 21), _mm_or_si128(_mm_slli_epi32(Y0, 10), Z0));
__m128i XYZ1 = _mm_or_si128(_mm_slli_epi32(X1, 21), _mm_or_si128(_mm_slli_epi32(Y1, 10), Z1));

occluder->m_vertexData.push_back(XYZ);
v[2 * j + 0] = XYZ0;
v[2 * j + 1] = XYZ1;
}

occluder->m_vertexData[occluder->m_packetCount++] = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(v + 0));
occluder->m_vertexData[occluder->m_packetCount++] = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(v + 2));
occluder->m_vertexData[occluder->m_packetCount++] = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(v + 4));
occluder->m_vertexData[occluder->m_packetCount++] = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(v + 6));
}

occluder->m_refMin = refMin;
@@ -2,6 +2,7 @@

#include <memory>
#include <vector>
#include <intrin.h>

struct Occluder
{
@@ -15,7 +16,8 @@ struct Occluder
__m128 m_boundsMin;
__m128 m_boundsMax;

std::vector<__m128i> m_vertexData;
__m256i* m_vertexData;
uint32_t m_packetCount;
};


Oops, something went wrong.

0 comments on commit d2fa65f

Please sign in to comment.
You can’t perform that action at this time.