Skip to content

Commit

Permalink
Merge pull request #34 from robertknight/orient-detect
Browse files Browse the repository at this point in the history
Add simple orientation detection
  • Loading branch information
robertknight committed Jun 5, 2022
2 parents 42d7cc2 + 26893cc commit 5a5853c
Show file tree
Hide file tree
Showing 7 changed files with 158 additions and 4 deletions.
19 changes: 19 additions & 0 deletions examples/web/ocr-app.js
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,17 @@ function OCRWordBox({ box, imageWidth, imageHeight }) {
);
}

/**
 * Report whether an orientation result describes a confidently detected,
 * unrotated image.
 *
 * @param {{rotation: number, confidence: number}} orientation
 * @return {boolean} `true` only if `confidence` is greater than zero and
 *   `rotation` is exactly 0.
 */
function isNormalOrientation(orientation) {
  // A result without positive confidence is never treated as "normal".
  if (!(orientation.confidence > 0)) {
    return false;
  }
  return orientation.rotation === 0;
}

/**
 * Produce a short human-readable label for an orientation result.
 *
 * @param {{rotation: number, confidence: number}} orientation
 * @return {string} "Unknown" when the confidence is exactly zero, otherwise
 *   the rotation followed by a degree sign.
 */
function formatOrientation(orientation) {
  const isUnknown = orientation.confidence === 0;
  return isUnknown ? "Unknown" : `${orientation.rotation}°`;
}

function OCRDemoApp() {
const ocrClient = useRef(null);
const [documentImage, setDocumentImage] = useState(null);
Expand All @@ -104,6 +115,7 @@ function OCRDemoApp() {
const [ocrProgress, setOCRProgress] = useState(null);
const [status, setStatus] = useState(null);
const [wordBoxes, setWordBoxes] = useState([]);
const [orientation, setOrientation] = useState(null);

const canvasRef = useRef(null);

Expand All @@ -114,6 +126,7 @@ function OCRDemoApp() {

setError(null);
setWordBoxes(null);
setOrientation(null);

// Set progress to `0` rather than `null` here to show the progress bar
// immediately after an image is selected.
Expand Down Expand Up @@ -154,6 +167,9 @@ function OCRDemoApp() {
setStatus("Loading image");
await ocr.loadImage(documentImage);

const orientation = await ocr.getOrientation();
setOrientation(orientation);

// Perform OCR and display progress.
setStatus("Recognizing text");
let boxes = await ocr.getTextBoxes("word", setOCRProgress);
Expand Down Expand Up @@ -227,6 +243,9 @@ function OCRDemoApp() {
<FileDropZone onDrop={loadImage} />
{status !== null && <div>{status}</div>}
{ocrProgress !== null && <ProgressBar value={ocrProgress} />}
{orientation !== null &&
!isNormalOrientation(orientation) &&
`Orientation: ${formatOrientation(orientation)}`}
{documentImage && (
<div className="OCRDemoApp__output">
<canvas
Expand Down
58 changes: 58 additions & 0 deletions src/lib.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,11 @@ struct TextRect {
std::string text;
};

// Result of orientation detection. A default-constructed value (rotation 0,
// confidence 0) represents "no usable estimate".
struct Orientation {
  // Estimated rotation of the image, in degrees.
  int rotation{0};

  // Confidence in the estimate. Zero means the orientation is unknown.
  float confidence{0.0f};
};

enum class TextUnit {
Word,
Line,
Expand Down Expand Up @@ -176,6 +181,54 @@ class OCREngine {
return string_from_raw(tesseract_->GetUTF8Text());
}

// Estimate the orientation of the currently loaded image.
//
// Returns an `Orientation` whose `rotation` is one of 0, 90, 180 or 270 with
// `confidence` 1 when an estimate could be made, or a default-constructed
// `Orientation` (confidence 0) when detection failed or produced no evidence.
Orientation GetOrientation() {
  // Tesseract's orientation detection is part of the legacy (non-LSTM)
  // engine, which is not compiled in to reduce binary size. Hence we use
  // Leptonica's orientation detection instead. See comments for
  // `pixOrientDetect` in the Leptonica source for details of how it works.
  //
  // The method is simplistic, and is designed for Latin text, but it serves
  // as a baseline that can be improved upon later.
  auto pix = tesseract_->GetThresholdedImage();

  // Metric that indicates whether the image is right-side up vs upside down.
  // +ve indicates right-side up.
  float up_conf = 0;

  // Metric that indicates whether the image is right-side up after being
  // rotated 90 degrees clockwise.
  float left_conf = 0;

  const auto had_error = pixOrientDetect(pix, &up_conf, &left_conf,
                                         0 /* min_count */, 0 /* debug */);
  pixDestroy(&pix);

  if (had_error) {
    // If there is an error, we currently report a result with zero confidence
    // score.
    return {};
  }

  // Both metrics are still zero when the detector found no evidence for
  // either axis (per the Leptonica docs a confidence it could not compute is
  // reported as 0). Report "unknown" rather than falling through to the
  // 90/270 branch and claiming full confidence in an arbitrary rotation.
  if (up_conf == 0 && left_conf == 0) {
    return {};
  }

  // Explicit floating-point magnitude. An unqualified `abs(float)` may
  // resolve to the C `int abs(int)` depending on which headers are in scope,
  // which would silently truncate the confidence values.
  const auto magnitude = [](float value) { return value < 0 ? -value : value; };

  // Minimum amount by which the up/down metric must exceed the left/right
  // metric for the image to be treated as rotated by 0 or 180 degrees.
  constexpr float kUpDownMargin = 5.0f;

  // Are we more confident that the image is rotated at 0/180 degrees than
  // 90/270?
  const bool is_up_or_down =
      magnitude(up_conf) - magnitude(left_conf) > kUpDownMargin;

  int rotation = 0;
  if (is_up_or_down) {
    rotation = up_conf > 0 ? 0 : 180;
  } else {
    rotation = left_conf < 0 ? 90 : 270;
  }
  return {.rotation = rotation, .confidence = 1};
}

private:
std::vector<TextRect> GetBoxes(TextUnit unit, bool with_text) {
auto iter = unique_from_raw(tesseract_->GetIterator());
Expand Down Expand Up @@ -242,6 +295,10 @@ EMSCRIPTEN_BINDINGS(ocrlib) {
.field("confidence", &TextRect::confidence)
.field("text", &TextRect::text);

value_object<Orientation>("Orientation")
.field("rotation", &Orientation::rotation)
.field("confidence", &Orientation::confidence);

class_<Image>("Image").constructor<int, int>().function("data", &Image::Data);

class_<OCREngine>("OCREngine")
Expand All @@ -250,6 +307,7 @@ EMSCRIPTEN_BINDINGS(ocrlib) {
.function("loadModel", &OCREngine::LoadModel)
.function("loadImage", &OCREngine::LoadImage)
.function("getBoundingBoxes", &OCREngine::GetBoundingBoxes)
.function("getOrientation", &OCREngine::GetOrientation)
.function("getTextBoxes", &OCREngine::GetTextBoxes)
.function("getText", &OCREngine::GetText);

Expand Down
17 changes: 17 additions & 0 deletions src/ocr-client.js
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import { imageDataFromBitmap } from "./utils";

/**
* @typedef {import('./ocr-engine').BoxItem} BoxItem
* @typedef {import('./ocr-engine').Orientation} Orientation
* @typedef {import('./ocr-engine').TextItem} TextItem
* @typedef {import('./ocr-engine').TextUnit} TextUnit
*/
Expand Down Expand Up @@ -203,6 +204,22 @@ export class OCRClient {
}
}

/**
* Attempt to determine the orientation of the image.
*
* This currently uses a simplistic algorithm [1] which is designed for
* non-uppercase Latin text. It will likely perform badly for other scripts or
* if the text is all uppercase.
*
* [1] See http://www.leptonica.org/papers/skew-measurement.pdf
*
* @return {Promise<Orientation>}
*/
async getOrientation() {
const engine = await this._ocrEngine;
return engine.getOrientation();
}

/** @param {ProgressListener} listener */
_addProgressListener(listener) {
this._progressListeners.push(listener);
Expand Down
24 changes: 24 additions & 0 deletions src/ocr-engine.js
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,14 @@ export const layoutFlags = {
* @prop {string} text
*/

/**
* Result of orientation detection.
*
* @typedef Orientation
* @prop {number} rotation - Estimated rotation in degrees (0, 90, 180 or 270)
* @prop {number} confidence - Confidence value in [0, 1]
*/

/**
* @typedef {'line'|'word'} TextUnit
*/
Expand Down Expand Up @@ -241,6 +249,22 @@ class OCREngine {
);
}

/**
* Attempt to determine the orientation of the document image in degrees.
*
* This currently uses a simplistic algorithm [1] which is designed for
* non-uppercase Latin text. It will likely perform badly for other scripts or
* if the text is all uppercase.
*
* [1] See http://www.leptonica.org/papers/skew-measurement.pdf
*
* @return {Orientation}
*/
getOrientation() {
this._checkImageLoaded();
return this._engine.getOrientation();
}

_checkModelLoaded() {
if (!this._modelLoaded) {
throw new Error("No text recognition model loaded");
Expand Down
10 changes: 10 additions & 0 deletions test/ocr-client-test.js
Original file line number Diff line number Diff line change
Expand Up @@ -108,4 +108,14 @@ describe("OCRClient", () => {
}
}
});

// Smoke test that orientation detection returns a result through the client.
// The OCREngine tests cover the individual rotations in detail.
it("can determine image orientation", async () => {
  await ocr.loadImage(await loadImage(resolve("./small-test-page.jpg")));

  const { rotation, confidence } = await ocr.getOrientation();

  assert.equal(rotation, 0);
  assert.equal(confidence, 1.0);
});
});
17 changes: 16 additions & 1 deletion test/ocr-engine-test.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,14 @@ import { dirname } from "node:path";
import { readFile } from "node:fs/promises";

import { assert } from "chai";
import sharp from "sharp";

import {
createOCREngine,
layoutFlags,
supportsFastBuild,
} from "../dist/lib.js";
import { loadImage, resolve } from "./util.js";
import { loadImage, resolve, toImageData } from "./util.js";

const { StartOfLine, EndOfLine } = layoutFlags;

Expand Down Expand Up @@ -293,4 +294,18 @@ describe("OCREngine", () => {
});
assert.deepEqual(progressSteps, [100]);
});

// Rotate the test page by each supported angle and check that the engine
// recovers that angle with full confidence.
it("can determine image orientation", async () => {
  const imagePath = resolve("./small-test-page.jpg");

  for (const rotation of [0, 90, 180, 270]) {
    // Building the sharp pipeline is synchronous; only the conversion to
    // pixel data below needs to be awaited.
    const image = sharp(imagePath).ensureAlpha().rotate(rotation);

    ocr.loadImage(await toImageData(image));
    const estimatedOrient = ocr.getOrientation();

    // Messages identify which iteration failed.
    assert.equal(
      estimatedOrient.rotation,
      rotation,
      `estimated rotation for image rotated by ${rotation} degrees`
    );
    assert.equal(
      estimatedOrient.confidence,
      1,
      `confidence for image rotated by ${rotation} degrees`
    );
  }
});
});
17 changes: 14 additions & 3 deletions test/util.js
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,25 @@ export function resolve(path, moduleURL = import.meta.url) {
}

/**
* @param {string} path
* Convert a sharp image to an ImageData-like object that can be passed to
* OCREngine and OCRClient.
*/
export async function loadImage(path) {
const image = await sharp(path).ensureAlpha();
/**
 * Render a sharp pipeline to raw pixels and package the result as an
 * ImageData-like object.
 *
 * @param {object} image - sharp image / pipeline to render
 * @return {Promise<{data: Uint8Array, width: number, height: number}>}
 */
export async function toImageData(image) {
  // Take the dimensions from the rendered output rather than from
  // `image.metadata()`: metadata describes the *input* image and ignores
  // pending operations such as `rotate()`, so it reports swapped dimensions
  // for non-square images rotated by 90 or 270 degrees.
  const { data, info } = await image
    .raw()
    .toBuffer({ resolveWithObject: true });
  return {
    data,
    width: info.width,
    height: info.height,
  };
}

/**
 * Load and decode an image file into an ImageData-like object.
 *
 * An alpha channel is added if the image does not already have one.
 *
 * @param {string} path - Path of the image file to load
 * @return {Promise<ImageData>} Resolves to an ImageData-like object with
 *   `data`, `width` and `height` properties
 */
export async function loadImage(path) {
  // Constructing the sharp pipeline is synchronous, so nothing needs to be
  // awaited before handing it to `toImageData`.
  return toImageData(sharp(path).ensureAlpha());
}

0 comments on commit 5a5853c

Please sign in to comment.