Skip to content
Permalink
Browse files

Added support for Fuji X-Trans raw files.

  • Loading branch information...
rom9 committed Jun 5, 2019
1 parent 1288f88 commit b3be6ee0e3c289bb55cd986b1e3bc1bb7bfe8d09
Showing with 90 additions and 30 deletions.
  1. +1 −1 rtengine/improccoordinator.cc
  2. +88 −28 rtengine/rawimagesource.cc
  3. +1 −1 rtgui/toolpanelcoord.cc
@@ -279,7 +279,7 @@ void ImProcCoordinator::updatePreviewImage(int todo, bool panningRelatedChange)
highDetailPreprocessComputed = highDetailNeeded;

// After preprocess, run film negative processing if enabled
if((todo & M_RAW) && imgsrc->getSensorType() == ST_BAYER && params->filmNegative.enabled) {
if((todo & M_RAW) && (imgsrc->getSensorType() == ST_BAYER || imgsrc->getSensorType() == ST_FUJI_XTRANS) && params->filmNegative.enabled) {
imgsrc->filmNegativeProcess (params->filmNegative);
}
}
@@ -3499,6 +3499,9 @@ bool RawImageSource::channelsAvg(Coord spotPos, int spotSize, float avgs[3], con
{
avgs[0] = avgs[1] = avgs[2] = 0.f; // Channel averages

if(ri->getSensorType() != ST_BAYER && ri->getSensorType() != ST_FUJI_XTRANS)
return false;

if (settings->verbose)
printf("Spot coord: x=%d y=%d\n", spotPos.x, spotPos.y);

@@ -3510,10 +3513,13 @@ bool RawImageSource::channelsAvg(Coord spotPos, int spotSize, float avgs[3], con
if(x1<0 || x2>W || y1<0 || y2>H)
return false; // Spot goes outside bounds, bail out.

int pxCount[3] = {0}; // Per-channel sample counts
for(int c=spotPos.x-spotSize; c<spotPos.x+spotSize; c++) {
for(int r=spotPos.y-spotSize; r<spotPos.y+spotSize; r++) {

int ch = FC(r,c);
int ch = (ri->getSensorType() == ST_BAYER) ? FC(r,c) : ri->XTRANSFC(r,c);

pxCount[ch]++;
// If film negative is currently enabled, undo the effect by elevating to 1/exp,
// in order to sample the original, linear value
if(params.enabled)
@@ -3523,9 +3529,8 @@ bool RawImageSource::channelsAvg(Coord spotPos, int spotSize, float avgs[3], con
}
}

avgs[0] = avgs[0] / (spotSize * spotSize);
avgs[1] = avgs[1] / (spotSize * spotSize * 2); // Double pixel count for Green
avgs[2] = avgs[2] / (spotSize * spotSize);
for(int ch=0; ch<3; ch++)
avgs[ch] = avgs[ch] / (pxCount[ch]);

return true;
}
@@ -3590,28 +3595,53 @@ void RawImageSource::filmNegativeProcess(const procparams::FilmNegativeParams &p
MyTime t1, t2, t3,t4, t5;
t1.set();

if(ri->getSensorType() == ST_BAYER) {
#ifdef _OPENMP
#pragma omp parallel
#pragma omp parallel
#endif
{
{

#ifdef _OPENMP
#pragma omp for nowait
#pragma omp for nowait
#endif
for (int row = 0; row < H; row ++) {
for (int col = 0; col < W; col++) {
float val = rawData[row][col];
int c = FC(row, col); // three colors, 0=R, 1=G, 2=B
for (int row = 0; row < H; row ++) {
for (int col = 0; col < W; col++) {
float val = rawData[row][col];
int c = FC(row, col); // three colors, 0=R, 1=G, 2=B

// Exponents are expressed as positive in the parameters, so negate them in order
// to get the reciprocals. Avoid trouble with zeroes, minimum pixel value is 1.
val = pow_F(max(val, 1.f), -exps[c]);
// Exponents are expressed as positive in the parameters, so negate them in order
// to get the reciprocals. Avoid trouble with zeroes, minimum pixel value is 1.
val = pow_F(max(val, 1.f), -exps[c]);

rawData[row][col] = (val);
rawData[row][col] = (val);
}
}
}
} else if(ri->getSensorType() == ST_FUJI_XTRANS) {
#ifdef _OPENMP
#pragma omp parallel
#endif
{

#ifdef _OPENMP
#pragma omp for nowait
#endif
for (int row = 0; row < H; row ++) {
for (int col = 0; col < W; col++) {
float val = rawData[row][col];
int c = ri->XTRANSFC(row, col); // three colors, 0=R, 1=G, 2=B

// Exponents are expressed as positive in the parameters, so negate them in order
// to get the reciprocals. Avoid trouble with zeroes, minimum pixel value is 1.
val = pow_F(max(val, 1.f), -exps[c]);

rawData[row][col] = (val);
}
}
}
}


t2.set();
if (settings->verbose)
printf("Pow loop time us: %d\n", t2.etime(t1));
@@ -3625,14 +3655,23 @@ void RawImageSource::filmNegativeProcess(const procparams::FilmNegativeParams &p

// Sample one every 5 pixels, and push the value in the appropriate channel vector.
// Chose an odd step, not multiple of the CFA size, to get a chance to visit each channel.
for (int row = 0; row < H; row+=5) {
for (int col = 0; col < W; col+=5) {
float val = rawData[row][col];
int c = FC(row, col); // three colors, 0=R, 1=G, 2=B
cvs[c].push_back(val);
if(ri->getSensorType() == ST_BAYER) {
for (int row = 0; row < H; row+=5) {
for (int col = 0; col < W; col+=5) {
int c = FC(row, col); // three colors, 0=R, 1=G, 2=B
cvs[c].push_back(rawData[row][col]);
}
}
} else if(ri->getSensorType() == ST_FUJI_XTRANS) {
for (int row = 0; row < H; row+=5) {
for (int col = 0; col < W; col+=5) {
int c = ri->XTRANSFC(row, col); // three colors, 0=R, 1=G, 2=B
cvs[c].push_back(rawData[row][col]);
}
}
}


t3.set();
if (settings->verbose)
printf("Median vector fill loop time us: %d\n", t3.etime(t2));
@@ -3658,24 +3697,45 @@ void RawImageSource::filmNegativeProcess(const procparams::FilmNegativeParams &p
}


if(ri->getSensorType() == ST_BAYER) {

#ifdef _OPENMP
#pragma omp parallel
#pragma omp parallel
#endif
{
{

#ifdef _OPENMP
#pragma omp for nowait
#pragma omp for nowait
#endif
for (int row = 0; row < H; row ++) {
for (int col = 0; col < W; col++) {
int c = FC(row, col); // three colors, 0=R, 1=G, 2=B
// Apply the multipliers
rawData[row][col] *= mults[c];
for (int row = 0; row < H; row ++) {
for (int col = 0; col < W; col++) {
int c = FC(row, col); // three colors, 0=R, 1=G, 2=B
// Apply the multipliers
rawData[row][col] *= mults[c];
}
}
}
} else if(ri->getSensorType() == ST_FUJI_XTRANS) {

#ifdef _OPENMP
#pragma omp parallel
#endif
{

#ifdef _OPENMP
#pragma omp for nowait

This comment has been minimized.

Copy link
@heckflosse

heckflosse Jun 6, 2019

The nowait doesn't make sense in this context, because execution will wait at line 3735 anyway.

This comment has been minimized.

Copy link
@heckflosse

heckflosse Jun 6, 2019

You can simply write

#ifdef _OPENMP
        #pragma omp parallel for
#endif
        for (int row = 0; row < H; row ++) {
            for (int col = 0; col < W; col++) {
                int c  = ri->XTRANSFC(row, col);                        // three colors,  0=R, 1=G,  2=B
                // Apply the multipliers
                rawData[row][col] *= mults[c];
            }
        }
    }

instead of

#ifdef _OPENMP
        #pragma omp parallel
#endif
        {

#ifdef _OPENMP
            #pragma omp for nowait
#endif
            for (int row = 0; row < H; row ++) {
                for (int col = 0; col < W; col++) {
                    int c  = ri->XTRANSFC(row, col);                        // three colors,  0=R, 1=G,  2=B
                    // Apply the multipliers
                    rawData[row][col] *= mults[c];
                }
            }
        }
    }

This comment has been minimized.

Copy link
@rom9

rom9 Jun 6, 2019

Author Owner

Done. Thanks again ;-)

This comment has been minimized.

Copy link
@heckflosse

heckflosse Jun 7, 2019

You're welcome :)

#endif
for (int row = 0; row < H; row ++) {
for (int col = 0; col < W; col++) {
int c = ri->XTRANSFC(row, col); // three colors, 0=R, 1=G, 2=B
// Apply the multipliers
rawData[row][col] *= mults[c];
}
}
}
}


t5.set();
if (settings->verbose)
printf("Mult loop time us: %d\n", t5.etime(t4));
@@ -324,7 +324,7 @@ void ToolPanelCoordinator::imageTypeChanged (bool isRaw, bool isBayer, bool isXt
sensorbayer->FoldableToolPanel::hide();
preprocess->FoldableToolPanel::show();
flatfield->FoldableToolPanel::show();
filmNegative->FoldableToolPanel::hide();
filmNegative->FoldableToolPanel::show();
retinex->FoldableToolPanel::setGrayedOut(false);

return false;

4 comments on commit b3be6ee

@heckflosse

This comment has been minimized.

Copy link

replied Jun 6, 2019

@rom9 Here's a patch with optimized pow-loops and also sse-code

diff --git a/rtengine/rawimagesource.cc b/rtengine/rawimagesource.cc
index 9a9c9d08f..7ba99e893 100644
--- a/rtengine/rawimagesource.cc
+++ b/rtengine/rawimagesource.cc
@@ -3597,47 +3597,57 @@ void RawImageSource::filmNegativeProcess(const procparams::FilmNegativeParams &p
 
     if(ri->getSensorType() == ST_BAYER) {
 #ifdef _OPENMP
-        #pragma omp parallel
-#endif
-        {
-
-#ifdef _OPENMP
-            #pragma omp for nowait
+        #pragma omp parallel for schedule(dynamic, 16)
 #endif
-            for (int row = 0; row < H; row ++) {
-                for (int col = 0; col < W; col++) {
-                    float val = rawData[row][col];
-                    int c  = FC(row, col);                        // three colors,  0=R, 1=G,  2=B
-
-                    // Exponents are expressed as positive in the parameters, so negate them in order
-                    // to get the reciprocals. Avoid trouble with zeroes, minimum pixel value is 1.
-                    val = pow_F(max(val, 1.f), -exps[c]);
-
-                    rawData[row][col] = (val);
-                }
+        for (int row = 0; row < H; row ++) {
+            int col = 0;
+            // Exponents are expressed as positive in the parameters, so negate them in order
+            // to get the reciprocals. Avoid trouble with zeroes, minimum pixel value is 1.
+            const float exps0 = -exps[FC(row, col)];
+            const float exps1 = -exps[FC(row, col + 1)];
+#ifdef __SSE2__
+            const vfloat expsv = _mm_setr_ps(exps0, exps1, exps0, exps1);
+            const vfloat onev = F2V(1.f);
+            for (; col < W - 3; col+=4) {
+                STVFU(rawData[row][col], pow_F(vmaxf(LVFU(rawData[row][col]), onev), expsv));
+            }
+#endif // __SSE2__
+            for (; col < W - 1; col+=2) {
+                rawData[row][col] = pow_F(max(rawData[row][col], 1.f), exps0);
+                rawData[row][col + 1] = pow_F(max(rawData[row][col + 1], 1.f), exps1);
+            }
+            if (col < W) {
+                rawData[row][col] = pow_F(max(rawData[row][col], 1.f), exps0);
             }
         }
     } else if(ri->getSensorType() == ST_FUJI_XTRANS) {
 #ifdef _OPENMP
-        #pragma omp parallel
-#endif
-        {
-
-#ifdef _OPENMP
-            #pragma omp for nowait
+        #pragma omp parallel for schedule(dynamic, 16)
 #endif
-            for (int row = 0; row < H; row ++) {
-                for (int col = 0; col < W; col++) {
-                    float val = rawData[row][col];
-                    int c  = ri->XTRANSFC(row, col);                        // three colors,  0=R, 1=G,  2=B
-
-                    // Exponents are expressed as positive in the parameters, so negate them in order
-                    // to get the reciprocals. Avoid trouble with zeroes, minimum pixel value is 1.
-                    val = pow_F(max(val, 1.f), -exps[c]);
-
-                    rawData[row][col] = (val);
+        for (int row = 0; row < H; row ++) {
+            int col = 0;
+            // Exponents are expressed as positive in the parameters, so negate them in order
+            // to get the reciprocals. Avoid trouble with zeroes, minimum pixel value is 1.
+            const float expsc[6] = {-exps[ri->XTRANSFC(row, 0)], -exps[ri->XTRANSFC(row, 1)], -exps[ri->XTRANSFC(row, 2)], -exps[ri->XTRANSFC(row, 3)], -exps[ri->XTRANSFC(row, 4)], -exps[ri->XTRANSFC(row, 5)]};
+#ifdef __SSE2__
+            const vfloat expsv0 = _mm_setr_ps(expsc[0], expsc[1], expsc[2], expsc[3]);
+            const vfloat expsv1 = _mm_setr_ps(expsc[4], expsc[5], expsc[0], expsc[1]);
+            const vfloat expsv2 = _mm_setr_ps(expsc[2], expsc[3], expsc[4], expsc[5]);
+            const vfloat onev = F2V(1.f);
+            for (; col < W - 11; col+=12) {
+                STVFU(rawData[row][col], pow_F(vmaxf(LVFU(rawData[row][col]), onev), expsv0));
+                STVFU(rawData[row][col + 4], pow_F(vmaxf(LVFU(rawData[row][col + 4]), onev), expsv1));
+                STVFU(rawData[row][col + 8], pow_F(vmaxf(LVFU(rawData[row][col + 8]), onev), expsv2));
+            }
+#endif // __SSE2__
+            for (; col < W - 5; col+=6) {
+                for (int c = 0; c < 6; ++c) {
+                    rawData[row][col + c] = pow_F(max(rawData[row][col + c], 1.f), expsc[c]);
                 }
             }
+            for (int c = 0; col < W; col++, c++) {
+                rawData[row][col + c] = pow_F(max(rawData[row][col + c], 1.f), expsc[c]);
+            }
         }
     }
 
@rom9

This comment has been minimized.

Copy link
Owner Author

replied Jun 6, 2019

Holy cow, this thing flies! Another ~4x improvement:
Pow loop time us: 84016
I had to stare at the code for quite a while to get what you're doing here. If I understand correctly, you're pre-loading the channel pattern only once per row, and then repeating it in chunks, instead of calling (XTRANS)FC each time.
That's brilliant, thank you very much ;-)

@heckflosse

This comment has been minimized.

Copy link

replied Jun 6, 2019

The preloading was needed to vectorize it. For bayer, where the pattern repeats every 2 pixels, it was sufficient to use a +=4 as increment. For xtrans, where the pattern repeats every 6 pixels, I had to use += 12 (the least common multiple of 4 and 6) as increment and process 3 vectors of 4 floats per iteration. The vectorization then led to the ~4x improvement. Avoiding the calls to FC in the inner loop was only a small part of the optimization.

@heckflosse

This comment has been minimized.

Copy link

replied Jun 6, 2019

@rom9 Let me explain some things:

  1. STVFU (SToreVectorofFloatstoUnalignedmemory) writes 4 floats to memory in one instruction
  2. LVFU (LoadVectorofFloatsfromUnalignedmemory) reads 4 floats from memory in one instruction
  3. _mm_setr_ps(a, b, c, d) builds a vector of 4 floats (a,b,c,d)
  4. F2V(a) (Float2Vector) builds a vector of 4 floats (a,a,a,a)
  5. vmaxf just calculates the max of 2 vectors (per element) in one instruction
  6. The vectorized pow_F is of course more than one instruction, but it is branch-free and processes all elements of the vector in the same time it would take for a single element.

Being able to write branch-free vectorized code sometimes leads to speedups even larger than the 4x one would expect from using vectors of 4 floats.

Please sign in to comment.
You can’t perform that action at this time.