@@ -41,7 +41,7 @@ std::unique_ptr<Occluder> Occluder::bake(const std::vector<__m128>& vertices, __
4141 __m128 normal = quadNormals[j];
4242
4343 __m128 bestDistance = _mm_set1_ps (-std::numeric_limits<float >::infinity ());
44- int bestCentroid = - 1 ;
44+ uint32_t bestCentroid = 0 ;
4545 for (int k = 0 ; k < centroids.size (); ++k)
4646 {
4747 __m128 distance = _mm_dp_ps (centroids[k], normal, 0x7F );
@@ -78,7 +78,7 @@ std::unique_ptr<Occluder> Occluder::bake(const std::vector<__m128>& vertices, __
7878 }
7979
8080 std::vector<__m128> orderedVertices;
81- for (int k = 0 ; k < centroids.size (); ++k)
81+ for (uint32_t k = 0 ; k < centroids.size (); ++k)
8282 {
8383 for (int j = 0 ; j < vertices.size () / 4 ; ++j)
8484 {
@@ -102,33 +102,58 @@ std::unique_ptr<Occluder> Occluder::bake(const std::vector<__m128>& vertices, __
102102
103103 __m128 half = _mm_set1_ps (0 .5f );
104104
105- for (size_t i = 0 ; i < orderedVertices.size (); i += 16 )
105+ occluder->m_packetCount = 0 ;
106+ occluder->m_vertexData = reinterpret_cast <__m256i*>(_aligned_malloc (orderedVertices.size () * 4 , 32 ));
107+
108+ for (size_t i = 0 ; i < orderedVertices.size (); i += 32 )
106109 {
110+ __m128i v[8 ];
111+
107112 for (auto j = 0 ; j < 4 ; ++j)
108113 {
109114 // Transform into [0,1] space relative to bounding box
110- __m128 v0 = _mm_mul_ps (_mm_sub_ps (orderedVertices[i + j + 0 ], refMin), invExtents);
111- __m128 v1 = _mm_mul_ps (_mm_sub_ps (orderedVertices[i + j + 4 ], refMin), invExtents);
112- __m128 v2 = _mm_mul_ps (_mm_sub_ps (orderedVertices[i + j + 8 ], refMin), invExtents);
115+ __m128 v0 = _mm_mul_ps (_mm_sub_ps (orderedVertices[i + j + 0 ], refMin), invExtents);
116+ __m128 v1 = _mm_mul_ps (_mm_sub_ps (orderedVertices[i + j + 4 ], refMin), invExtents);
117+ __m128 v2 = _mm_mul_ps (_mm_sub_ps (orderedVertices[i + j + 8 ], refMin), invExtents);
113118 __m128 v3 = _mm_mul_ps (_mm_sub_ps (orderedVertices[i + j + 12 ], refMin), invExtents);
119+ __m128 v4 = _mm_mul_ps (_mm_sub_ps (orderedVertices[i + j + 16 ], refMin), invExtents);
120+ __m128 v5 = _mm_mul_ps (_mm_sub_ps (orderedVertices[i + j + 20 ], refMin), invExtents);
121+ __m128 v6 = _mm_mul_ps (_mm_sub_ps (orderedVertices[i + j + 24 ], refMin), invExtents);
122+ __m128 v7 = _mm_mul_ps (_mm_sub_ps (orderedVertices[i + j + 28 ], refMin), invExtents);
114123
115124 // Transpose into [xxxx][yyyy][zzzz][wwww]
116125 _MM_TRANSPOSE4_PS (v0, v1, v2, v3);
126+ _MM_TRANSPOSE4_PS (v4, v5, v6, v7);
117127
118128 // Scale and truncate to int
119129 v0 = _mm_fmadd_ps (v0, scalingX, half);
120130 v1 = _mm_fmadd_ps (v1, scalingY, half);
121131 v2 = _mm_fmadd_ps (v2, scalingZ, half);
122132
123- __m128i X = _mm_cvttps_epi32 (v0);
124- __m128i Y = _mm_cvttps_epi32 (v1);
125- __m128i Z = _mm_cvttps_epi32 (v2);
133+ v4 = _mm_fmadd_ps (v4, scalingX, half);
134+ v5 = _mm_fmadd_ps (v5, scalingY, half);
135+ v6 = _mm_fmadd_ps (v6, scalingZ, half);
136+
137+ __m128i X0 = _mm_sub_epi32 (_mm_cvttps_epi32 (v0), _mm_set1_epi32 (1024 ));
138+ __m128i Y0 = _mm_cvttps_epi32 (v1);
139+ __m128i Z0 = _mm_cvttps_epi32 (v2);
140+
141+ __m128i X1 = _mm_sub_epi32 (_mm_cvttps_epi32 (v4), _mm_set1_epi32 (1024 ));
142+ __m128i Y1 = _mm_cvttps_epi32 (v5);
143+ __m128i Z1 = _mm_cvttps_epi32 (v6);
126144
127145 // Pack to 11/11/10 format
128- __m128i XYZ = _mm_or_si128 (_mm_slli_epi32 (X, 21 ), _mm_or_si128 (_mm_slli_epi32 (Y, 10 ), Z));
146+ __m128i XYZ0 = _mm_or_si128 (_mm_slli_epi32 (X0, 21 ), _mm_or_si128 (_mm_slli_epi32 (Y0, 10 ), Z0));
147+ __m128i XYZ1 = _mm_or_si128 (_mm_slli_epi32 (X1, 21 ), _mm_or_si128 (_mm_slli_epi32 (Y1, 10 ), Z1));
129148
130- occluder->m_vertexData .push_back (XYZ);
149+ v[2 * j + 0 ] = XYZ0;
150+ v[2 * j + 1 ] = XYZ1;
131151 }
152+
153+ occluder->m_vertexData [occluder->m_packetCount ++] = _mm256_loadu_si256 (reinterpret_cast <const __m256i*>(v + 0 ));
154+ occluder->m_vertexData [occluder->m_packetCount ++] = _mm256_loadu_si256 (reinterpret_cast <const __m256i*>(v + 2 ));
155+ occluder->m_vertexData [occluder->m_packetCount ++] = _mm256_loadu_si256 (reinterpret_cast <const __m256i*>(v + 4 ));
156+ occluder->m_vertexData [occluder->m_packetCount ++] = _mm256_loadu_si256 (reinterpret_cast <const __m256i*>(v + 6 ));
132157 }
133158
134159 occluder->m_refMin = refMin;
0 commit comments