Merge pull request #34 from robertknight/orient-detect

Add simple orientation detection
robertknight · Jun 5, 2022 · 5a5853c · 5a5853c
2 parents 42d7cc2 + 26893cc
commit 5a5853c
Show file tree

Hide file tree

Showing 7 changed files with 158 additions and 4 deletions.
diff --git a/examples/web/ocr-app.js b/examples/web/ocr-app.js
@@ -96,6 +96,17 @@ function OCRWordBox({ box, imageWidth, imageHeight }) {
   );
 }
 
+/**
+ * Return true if `orientation` has a positive confidence and a rotation of 0.
+ *
+ * A result with zero confidence is never treated as normal.
+ */
+function isNormalOrientation(orientation) {
+  const { confidence, rotation } = orientation;
+  return rotation === 0 && confidence > 0;
+}
+
+/**
+ * Format an orientation for display: "Unknown" when the confidence is zero,
+ * otherwise the rotation followed by a degree sign.
+ */
+function formatOrientation(orientation) {
+  const { confidence, rotation } = orientation;
+  return confidence === 0 ? "Unknown" : `${rotation}°`;
+}
+
 function OCRDemoApp() {
   const ocrClient = useRef(null);
   const [documentImage, setDocumentImage] = useState(null);
@@ -104,6 +115,7 @@ function OCRDemoApp() {
   const [ocrProgress, setOCRProgress] = useState(null);
   const [status, setStatus] = useState(null);
   const [wordBoxes, setWordBoxes] = useState([]);
+  const [orientation, setOrientation] = useState(null);
 
   const canvasRef = useRef(null);
 
@@ -114,6 +126,7 @@ function OCRDemoApp() {
 
     setError(null);
     setWordBoxes(null);
+    setOrientation(null);
 
     // Set progress to `0` rather than `null` here to show the progress bar
     // immediately after an image is selected.
@@ -154,6 +167,9 @@ function OCRDemoApp() {
         setStatus("Loading image");
         await ocr.loadImage(documentImage);
 
+        const orientation = await ocr.getOrientation();
+        setOrientation(orientation);
+
         // Perform OCR and display progress.
         setStatus("Recognizing text");
         let boxes = await ocr.getTextBoxes("word", setOCRProgress);
@@ -227,6 +243,9 @@ function OCRDemoApp() {
       <FileDropZone onDrop={loadImage} />
       {status !== null && <div>{status}…</div>}
       {ocrProgress !== null && <ProgressBar value={ocrProgress} />}
+      {orientation !== null &&
+        !isNormalOrientation(orientation) &&
+        `Orientation: ${formatOrientation(orientation)}`}
       {documentImage && (
         <div className="OCRDemoApp__output">
           <canvas

diff --git a/src/lib.cpp b/src/lib.cpp
@@ -29,6 +29,11 @@ struct TextRect {
   std::string text;
 };
 
+// Result of orientation detection, as returned by `OCREngine::GetOrientation`.
+struct Orientation {
+  // Detected rotation of the image in degrees.
+  int rotation{0};
+
+  // Confidence in `rotation`. Zero means the orientation is unknown.
+  float confidence{0.0f};
+};
+
 enum class TextUnit {
   Word,
   Line,
@@ -176,6 +181,54 @@ class OCREngine {
     return string_from_raw(tesseract_->GetUTF8Text());
   }
 
+  // Estimate the orientation of the engine's current image.
+  //
+  // Returns a rotation of 0, 90, 180 or 270 degrees with a confidence of 1.
+  // If detection reports an error, or yields no evidence for any orientation,
+  // a default `Orientation` (zero confidence) is returned instead.
+  Orientation GetOrientation() {
+    // Tesseract's orientation detection is part of the legacy (non-LSTM)
+    // engine, which is not compiled in to reduce binary size. Hence we use
+    // Leptonica's orientation detection instead. See comments for
+    // `pixOrientDetect` in the Leptonica source for details of how it works.
+    //
+    // The method is simplistic, and is designed for Latin text, but it serves
+    // as a baseline that can be improved upon later.
+    auto pix = tesseract_->GetThresholdedImage();
+
+    // Metric that indicates whether the image is right-side up vs upside down.
+    // +ve indicates right-side up.
+    float up_conf = 0;
+
+    // Metric that indicates whether the image is right-side up after being
+    // rotated 90 degrees clockwise.
+    float left_conf = 0;
+
+    const auto had_error = pixOrientDetect(pix, &up_conf, &left_conf,
+                                           0 /* min_count */, 0 /* debug */);
+    pixDestroy(&pix);
+
+    if (had_error) {
+      // If there is an error, we currently report a result with zero confidence
+      // score.
+      return {};
+    }
+
+    // With both scores at exactly zero there is no evidence for any
+    // orientation, so report "unknown" rather than claiming a rotation with
+    // full confidence.
+    if (up_conf == 0.0f && left_conf == 0.0f) {
+      return {};
+    }
+
+    // Take magnitudes by hand instead of calling unqualified `abs`, which can
+    // resolve to the C `int abs(int)` overload and silently truncate the
+    // scores to integers.
+    const float up_magnitude = up_conf < 0 ? -up_conf : up_conf;
+    const float left_magnitude = left_conf < 0 ? -left_conf : left_conf;
+
+    // Are we more confident that the image is rotated at 0/180 degrees than
+    // 90/270?
+    constexpr float kMinUpDownMargin = 5.0f;
+    const bool is_up_or_down =
+        up_magnitude - left_magnitude > kMinUpDownMargin;
+
+    int rotation = 0;
+    if (is_up_or_down) {
+      rotation = up_conf > 0 ? 0 : 180;
+    } else {
+      rotation = left_conf < 0 ? 90 : 270;
+    }
+    return {.rotation = rotation, .confidence = 1};
+  }
+
  private:
   std::vector<TextRect> GetBoxes(TextUnit unit, bool with_text) {
     auto iter = unique_from_raw(tesseract_->GetIterator());
@@ -242,6 +295,10 @@ EMSCRIPTEN_BINDINGS(ocrlib) {
       .field("confidence", &TextRect::confidence)
       .field("text", &TextRect::text);
 
+  value_object<Orientation>("Orientation")
+      .field("rotation", &Orientation::rotation)
+      .field("confidence", &Orientation::confidence);
+
   class_<Image>("Image").constructor<int, int>().function("data", &Image::Data);
 
   class_<OCREngine>("OCREngine")
@@ -250,6 +307,7 @@ EMSCRIPTEN_BINDINGS(ocrlib) {
       .function("loadModel", &OCREngine::LoadModel)
       .function("loadImage", &OCREngine::LoadImage)
       .function("getBoundingBoxes", &OCREngine::GetBoundingBoxes)
+      .function("getOrientation", &OCREngine::GetOrientation)
       .function("getTextBoxes", &OCREngine::GetTextBoxes)
       .function("getText", &OCREngine::GetText);
 

diff --git a/src/ocr-client.js b/src/ocr-client.js
@@ -10,6 +10,7 @@ import { imageDataFromBitmap } from "./utils";
 
 /**
  * @typedef {import('./ocr-engine').BoxItem} BoxItem
+ * @typedef {import('./ocr-engine').Orientation} Orientation
  * @typedef {import('./ocr-engine').TextItem} TextItem
  * @typedef {import('./ocr-engine').TextUnit} TextUnit
  */
@@ -203,6 +204,22 @@ export class OCRClient {
     }
   }
 
+  /**
+   * Estimate the orientation of the loaded image.
+   *
+   * The underlying algorithm [1] is simplistic and designed for non-uppercase
+   * Latin text. Expect poor results for other scripts, or for text that is
+   * entirely uppercase.
+   *
+   * [1] See http://www.leptonica.org/papers/skew-measurement.pdf
+   *
+   * @return {Promise<Orientation>}
+   */
+  async getOrientation() {
+    // Wait for the engine to become available, then delegate to it.
+    return (await this._ocrEngine).getOrientation();
+  }
+
   /** @param {ProgressListener} listener */
   _addProgressListener(listener) {
     this._progressListeners.push(listener);

diff --git a/src/ocr-engine.js b/src/ocr-engine.js
@@ -65,6 +65,14 @@ export const layoutFlags = {
  * @prop {string} text
  */
 
+/**
+ * Result of orientation detection.
+ *
+ * @typedef Orientation
+ * @prop {number} rotation - Rotation of the image in degrees (0, 90, 180 or 270)
+ * @prop {number} confidence - Confidence value in [0, 1]
+ */
+
 /**
  * @typedef {'line'|'word'} TextUnit
  */
@@ -241,6 +249,22 @@ class OCREngine {
     );
   }
 
+  /**
+   * Attempt to determine the orientation of the document image in degrees.
+   *
+   * This currently uses a simplistic algorithm [1] which is designed for
+   * non-uppercase Latin text. It will likely perform badly for other scripts or
+   * if the text is all uppercase.
+   *
+   * An image must be loaded before calling this method (it is guarded by
+   * `_checkImageLoaded`, which presumably throws otherwise — confirm against
+   * that helper).
+   *
+   * [1] See http://www.leptonica.org/papers/skew-measurement.pdf
+   *
+   * @return {Orientation}
+   */
+  getOrientation() {
+    // Validate state first so the native engine is never queried without an
+    // image.
+    this._checkImageLoaded();
+    return this._engine.getOrientation();
+  }
+
   _checkModelLoaded() {
     if (!this._modelLoaded) {
       throw new Error("No text recognition model loaded");

diff --git a/test/ocr-client-test.js b/test/ocr-client-test.js
@@ -108,4 +108,14 @@ describe("OCRClient", () => {
       }
     }
   });
+
+  // Test orientation detection method returns a result. Detailed tests for
+  // different orientations are handled in the OCREngine tests.
+  it("can determine image orientation", async () => {
+    const pageImage = await loadImage(resolve("./small-test-page.jpg"));
+    await ocr.loadImage(pageImage);
+
+    // The test page is stored upright, so no rotation is expected.
+    const { rotation, confidence } = await ocr.getOrientation();
+    assert.equal(rotation, 0);
+    assert.equal(confidence, 1.0);
+  });
 });
diff --git a/test/ocr-engine-test.js b/test/ocr-engine-test.js
@@ -2,13 +2,14 @@ import { dirname } from "node:path";
 import { readFile } from "node:fs/promises";
 
 import { assert } from "chai";
+import sharp from "sharp";
 
 import {
   createOCREngine,
   layoutFlags,
   supportsFastBuild,
 } from "../dist/lib.js";
-import { loadImage, resolve } from "./util.js";
+import { loadImage, resolve, toImageData } from "./util.js";
 
 const { StartOfLine, EndOfLine } = layoutFlags;
 
@@ -293,4 +294,18 @@ describe("OCREngine", () => {
     });
     assert.deepEqual(progressSteps, [100]);
   });
+
+  it("can determine image orientation", async () => {
+    const imagePath = resolve("./small-test-page.jpg");
+
+    for (const rotation of [0, 90, 180, 270]) {
+      // Building the sharp pipeline is synchronous; only reading the pixels in
+      // `toImageData` needs to be awaited.
+      const image = sharp(imagePath).ensureAlpha().rotate(rotation);
+
+      ocr.loadImage(await toImageData(image));
+      const estimatedOrient = ocr.getOrientation();
+
+      // Name the failing case, since all rotations share these assertions.
+      const context = `input rotated by ${rotation} degrees`;
+      assert.equal(estimatedOrient.rotation, rotation, context);
+      assert.equal(estimatedOrient.confidence, 1, context);
+    }
+  });
 });
diff --git a/test/util.js b/test/util.js
@@ -12,14 +12,25 @@ export function resolve(path, moduleURL = import.meta.url) {
 }
 
 /**
- * @param {string} path
+ * Convert a sharp image to an ImageData-like object that can be passed to
+ * OCREngine and OCRClient.
+ *
+ * @param {import('sharp').Sharp} image
+ * @return {Promise<{ data: Buffer, width: number, height: number }>}
  */
-export async function loadImage(path) {
-  const image = await sharp(path).ensureAlpha();
+export async function toImageData(image) {
-  const { width, height } = await image.metadata();
-  return {
-    data: await image.raw().toBuffer(),
-    width,
-    height,
-  };
+  // Take the dimensions from the output `info` rather than `metadata()`.
+  // `metadata()` describes the input image and ignores pending operations
+  // such as `rotate`, so width and height would be swapped for 90/270 degree
+  // rotations.
+  const { data, info } = await image
+    .raw()
+    .toBuffer({ resolveWithObject: true });
+  return {
+    data,
+    width: info.width,
+    height: info.height,
+  };
 }
+
+/**
+ * Load and decode an image into an ImageData-like object.
+ *
+ * @param {string} path
+ * @return {Promise<ImageData>}
+ */
+export async function loadImage(path) {
+  // Building the sharp pipeline is synchronous; decoding happens inside
+  // `toImageData`.
+  const image = sharp(path).ensureAlpha();
+  return toImageData(image);
+}