Skip to content

Commit d2fa65f

Browse files
rawrunprotected
authored and committed
Optimizations, 20-30% speed up
1 parent 5699c2c commit d2fa65f

File tree

7 files changed

+625
-470
lines changed

7 files changed

+625
-470
lines changed

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -15,3 +15,6 @@
1515
*.user
1616
*.idb
1717
*.ilk
18+
*.cfg
19+
*.vtss
20+
*.aux

README.md

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -23,6 +23,6 @@ Requirements
2323
License
2424
============
2525

26-
[!["Creative Commons Licence"](https://i.creativecommons.org/l/by/4.0/80x15.png)](http://creativecommons.org/licenses/by/4.0/)
27-
26+
[!["Creative Commons Licence"](https://i.creativecommons.org/l/by/4.0/80x15.png)](http://creativecommons.org/licenses/by/4.0/)
27+
2828
This work is licensed under a [Creative Commons Attribution 4.0 International License](http://creativecommons.org/licenses/by/4.0/).

SoftwareRasterizer/Main.cpp

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -83,8 +83,8 @@ int APIENTRY wWinMain(HINSTANCE hInstance, HINSTANCE, LPWSTR, int)
8383

8484
g_rasterizer = std::make_unique<Rasterizer>(WINDOW_WIDTH, WINDOW_HEIGHT);
8585

86-
// Pad to a multiple of 4 quads
87-
while (indices.size() % 16 != 0)
86+
// Pad to a multiple of 8 quads
87+
while (indices.size() % 32 != 0)
8888
{
8989
indices.push_back(indices[0]);
9090
}
@@ -100,7 +100,7 @@ int APIENTRY wWinMain(HINSTANCE hInstance, HINSTANCE, LPWSTR, int)
100100
quadAabbs.push_back(aabb);
101101
}
102102

103-
auto batchAssignment = SurfaceAreaHeuristic::generateBatches(quadAabbs, 512, 4);
103+
auto batchAssignment = SurfaceAreaHeuristic::generateBatches(quadAabbs, 512, 8);
104104

105105
Aabb refAabb;
106106
for (auto v : vertices)
@@ -204,7 +204,7 @@ LRESULT CALLBACK WndProc(HWND hWnd, UINT message, WPARAM wParam, LPARAM lParam)
204204
float rasterTime = std::chrono::duration<float, std::milli>(raster_end - raster_start).count();
205205
static float avgRasterTime = rasterTime;
206206

207-
float alpha = 0.035f;
207+
float alpha = 0.0035f;
208208
avgRasterTime = rasterTime * alpha + avgRasterTime * (1.0f - alpha);
209209

210210
int fps = int(1000.0f / avgRasterTime);
@@ -248,7 +248,7 @@ LRESULT CALLBACK WndProc(HWND hWnd, UINT message, WPARAM wParam, LPARAM lParam)
248248
auto now = std::chrono::high_resolution_clock::now();
249249

250250
XMVECTOR right = XMVector3Normalize(XMVector3Cross(g_cameraDirection, g_upVector));
251-
float translateSpeed = 0.005f * std::chrono::duration<float, std::milli>(now - lastPaint).count();
251+
float translateSpeed = 0.01f * std::chrono::duration<float, std::milli>(now - lastPaint).count();
252252
float rotateSpeed = 0.002f * std::chrono::duration<float, std::milli>(now - lastPaint).count();
253253

254254
lastPaint = now;

SoftwareRasterizer/Occluder.cpp

Lines changed: 36 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -41,7 +41,7 @@ std::unique_ptr<Occluder> Occluder::bake(const std::vector<__m128>& vertices, __
4141
__m128 normal = quadNormals[j];
4242

4343
__m128 bestDistance = _mm_set1_ps(-std::numeric_limits<float>::infinity());
44-
int bestCentroid = -1;
44+
uint32_t bestCentroid = 0;
4545
for (int k = 0; k < centroids.size(); ++k)
4646
{
4747
__m128 distance = _mm_dp_ps(centroids[k], normal, 0x7F);
@@ -78,7 +78,7 @@ std::unique_ptr<Occluder> Occluder::bake(const std::vector<__m128>& vertices, __
7878
}
7979

8080
std::vector<__m128> orderedVertices;
81-
for (int k = 0; k < centroids.size(); ++k)
81+
for (uint32_t k = 0; k < centroids.size(); ++k)
8282
{
8383
for (int j = 0; j < vertices.size() / 4; ++j)
8484
{
@@ -102,33 +102,58 @@ std::unique_ptr<Occluder> Occluder::bake(const std::vector<__m128>& vertices, __
102102

103103
__m128 half = _mm_set1_ps(0.5f);
104104

105-
for (size_t i = 0; i < orderedVertices.size(); i += 16)
105+
occluder->m_packetCount = 0;
106+
occluder->m_vertexData = reinterpret_cast<__m256i*>(_aligned_malloc(orderedVertices.size() * 4, 32));
107+
108+
for (size_t i = 0; i < orderedVertices.size(); i += 32)
106109
{
110+
__m128i v[8];
111+
107112
for (auto j = 0; j < 4; ++j)
108113
{
109114
// Transform into [0,1] space relative to bounding box
110-
__m128 v0 = _mm_mul_ps(_mm_sub_ps(orderedVertices[i + j + 0], refMin), invExtents);
111-
__m128 v1 = _mm_mul_ps(_mm_sub_ps(orderedVertices[i + j + 4], refMin), invExtents);
112-
__m128 v2 = _mm_mul_ps(_mm_sub_ps(orderedVertices[i + j + 8], refMin), invExtents);
115+
__m128 v0 = _mm_mul_ps(_mm_sub_ps(orderedVertices[i + j + 0], refMin), invExtents);
116+
__m128 v1 = _mm_mul_ps(_mm_sub_ps(orderedVertices[i + j + 4], refMin), invExtents);
117+
__m128 v2 = _mm_mul_ps(_mm_sub_ps(orderedVertices[i + j + 8], refMin), invExtents);
113118
__m128 v3 = _mm_mul_ps(_mm_sub_ps(orderedVertices[i + j + 12], refMin), invExtents);
119+
__m128 v4 = _mm_mul_ps(_mm_sub_ps(orderedVertices[i + j + 16], refMin), invExtents);
120+
__m128 v5 = _mm_mul_ps(_mm_sub_ps(orderedVertices[i + j + 20], refMin), invExtents);
121+
__m128 v6 = _mm_mul_ps(_mm_sub_ps(orderedVertices[i + j + 24], refMin), invExtents);
122+
__m128 v7 = _mm_mul_ps(_mm_sub_ps(orderedVertices[i + j + 28], refMin), invExtents);
114123

115124
// Transpose into [xxxx][yyyy][zzzz][wwww]
116125
_MM_TRANSPOSE4_PS(v0, v1, v2, v3);
126+
_MM_TRANSPOSE4_PS(v4, v5, v6, v7);
117127

118128
// Scale and truncate to int
119129
v0 = _mm_fmadd_ps(v0, scalingX, half);
120130
v1 = _mm_fmadd_ps(v1, scalingY, half);
121131
v2 = _mm_fmadd_ps(v2, scalingZ, half);
122132

123-
__m128i X = _mm_cvttps_epi32(v0);
124-
__m128i Y = _mm_cvttps_epi32(v1);
125-
__m128i Z = _mm_cvttps_epi32(v2);
133+
v4 = _mm_fmadd_ps(v4, scalingX, half);
134+
v5 = _mm_fmadd_ps(v5, scalingY, half);
135+
v6 = _mm_fmadd_ps(v6, scalingZ, half);
136+
137+
__m128i X0 = _mm_sub_epi32(_mm_cvttps_epi32(v0), _mm_set1_epi32(1024));
138+
__m128i Y0 = _mm_cvttps_epi32(v1);
139+
__m128i Z0 = _mm_cvttps_epi32(v2);
140+
141+
__m128i X1 = _mm_sub_epi32(_mm_cvttps_epi32(v4), _mm_set1_epi32(1024));
142+
__m128i Y1 = _mm_cvttps_epi32(v5);
143+
__m128i Z1 = _mm_cvttps_epi32(v6);
126144

127145
// Pack to 11/11/10 format
128-
__m128i XYZ = _mm_or_si128(_mm_slli_epi32(X, 21), _mm_or_si128(_mm_slli_epi32(Y, 10), Z));
146+
__m128i XYZ0 = _mm_or_si128(_mm_slli_epi32(X0, 21), _mm_or_si128(_mm_slli_epi32(Y0, 10), Z0));
147+
__m128i XYZ1 = _mm_or_si128(_mm_slli_epi32(X1, 21), _mm_or_si128(_mm_slli_epi32(Y1, 10), Z1));
129148

130-
occluder->m_vertexData.push_back(XYZ);
149+
v[2 * j + 0] = XYZ0;
150+
v[2 * j + 1] = XYZ1;
131151
}
152+
153+
occluder->m_vertexData[occluder->m_packetCount++] = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(v + 0));
154+
occluder->m_vertexData[occluder->m_packetCount++] = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(v + 2));
155+
occluder->m_vertexData[occluder->m_packetCount++] = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(v + 4));
156+
occluder->m_vertexData[occluder->m_packetCount++] = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(v + 6));
132157
}
133158

134159
occluder->m_refMin = refMin;

SoftwareRasterizer/Occluder.h

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,7 @@
22

33
#include <memory>
44
#include <vector>
5+
#include <intrin.h>
56

67
struct Occluder
78
{
@@ -15,7 +16,8 @@ struct Occluder
1516
__m128 m_boundsMin;
1617
__m128 m_boundsMax;
1718

18-
std::vector<__m128i> m_vertexData;
19+
__m256i* m_vertexData;
20+
uint32_t m_packetCount;
1921
};
2022

2123

0 commit comments

Comments
 (0)