Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add simple orientation detection #34

Merged
merged 4 commits into from
Jun 5, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
19 changes: 19 additions & 0 deletions examples/web/ocr-app.js
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,17 @@ function OCRWordBox({ box, imageWidth, imageHeight }) {
);
}

/**
 * Report whether an orientation result describes an upright image.
 *
 * A result only counts as "normal" when the detector produced a usable
 * estimate (non-zero confidence) and that estimate is a rotation of 0.
 *
 * @param {{ rotation: number, confidence: number }} orientation
 * @return {boolean}
 */
function isNormalOrientation(orientation) {
  const hasEstimate = orientation.confidence > 0;
  if (!hasEstimate) {
    return false;
  }
  return orientation.rotation === 0;
}

/**
 * Produce a short human-readable label for an orientation result.
 *
 * A confidence of exactly 0 means no estimate is available, in which case
 * the label is "Unknown"; otherwise the rotation is shown in degrees.
 *
 * @param {{ rotation: number, confidence: number }} orientation
 * @return {string}
 */
function formatOrientation(orientation) {
  const isUnknown = orientation.confidence === 0;
  return isUnknown ? "Unknown" : `${orientation.rotation}°`;
}

function OCRDemoApp() {
const ocrClient = useRef(null);
const [documentImage, setDocumentImage] = useState(null);
Expand All @@ -104,6 +115,7 @@ function OCRDemoApp() {
const [ocrProgress, setOCRProgress] = useState(null);
const [status, setStatus] = useState(null);
const [wordBoxes, setWordBoxes] = useState([]);
const [orientation, setOrientation] = useState(null);

const canvasRef = useRef(null);

Expand All @@ -114,6 +126,7 @@ function OCRDemoApp() {

setError(null);
setWordBoxes(null);
setOrientation(null);

// Set progress to `0` rather than `null` here to show the progress bar
// immediately after an image is selected.
Expand Down Expand Up @@ -154,6 +167,9 @@ function OCRDemoApp() {
setStatus("Loading image");
await ocr.loadImage(documentImage);

const orientation = await ocr.getOrientation();
setOrientation(orientation);

// Perform OCR and display progress.
setStatus("Recognizing text");
let boxes = await ocr.getTextBoxes("word", setOCRProgress);
Expand Down Expand Up @@ -227,6 +243,9 @@ function OCRDemoApp() {
<FileDropZone onDrop={loadImage} />
{status !== null && <div>{status}</div>}
{ocrProgress !== null && <ProgressBar value={ocrProgress} />}
{orientation !== null &&
!isNormalOrientation(orientation) &&
`Orientation: ${formatOrientation(orientation)}`}
{documentImage && (
<div className="OCRDemoApp__output">
<canvas
Expand Down
58 changes: 58 additions & 0 deletions src/lib.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,11 @@ struct TextRect {
std::string text;
};

// Result of orientation detection, as returned by `OCREngine::GetOrientation`.
struct Orientation {
  // Estimated rotation of the image in degrees. `GetOrientation` reports one
  // of 0, 90, 180 or 270.
  int rotation{0};

  // Confidence in `rotation`. A default-constructed result has confidence 0,
  // which `GetOrientation` uses to signal that no estimate is available.
  float confidence{0.0f};
};

enum class TextUnit {
Word,
Line,
Expand Down Expand Up @@ -176,6 +181,54 @@ class OCREngine {
return string_from_raw(tesseract_->GetUTF8Text());
}

// Estimate the orientation of the currently loaded image.
//
// Returns a rotation of 0, 90, 180 or 270 degrees with confidence 1 when an
// estimate could be made, or a default `Orientation` (rotation 0, confidence
// 0) when detection failed or produced no usable signal.
Orientation GetOrientation() {
  // Tesseract's orientation detection is part of the legacy (non-LSTM)
  // engine, which is not compiled in to reduce binary size. Hence we use
  // Leptonica's orientation detection instead. See comments for
  // `pixOrientDetect` in the Leptonica source for details of how it works.
  //
  // The method is simplistic, and is designed for latin text, but it serves
  // as a baseline that can be improved upon later.
  auto pix = tesseract_->GetThresholdedImage();

  // Metric that indicates whether the image is right-side up vs upside down.
  // +ve indicates right-side up.
  float up_conf = 0;

  // Metric that indicates whether the image is right-side up after being
  // rotated 90 degrees clockwise.
  float left_conf = 0;

  const auto had_error = pixOrientDetect(pix, &up_conf, &left_conf,
                                         0 /* min_count */, 0 /* debug */);
  pixDestroy(&pix);

  if (had_error) {
    // If there is an error, we currently report a result with zero confidence
    // score.
    return {};
  }

  // Both scores still being zero means the detector gave us no signal at all
  // (per the Leptonica docs this happens when there is too little text to
  // decide). Report "unknown" instead of letting the comparison below fall
  // through to a confident-looking 270 degree result.
  if (up_conf == 0.0f && left_conf == 0.0f) {
    return {};
  }

  // Compute magnitudes locally rather than via unqualified `abs`, which can
  // resolve to the integer overload and silently truncate the scores when no
  // floating-point overload is visible in the global namespace.
  const auto magnitude = [](float value) {
    return value < 0 ? -value : value;
  };

  // Are we more confident that the image is rotated at 0/180 degrees than
  // 90/270?
  const bool is_up_or_down = magnitude(up_conf) - magnitude(left_conf) > 5.0;

  int rotation;
  if (is_up_or_down) {
    rotation = up_conf > 0 ? 0 : 180;
  } else {
    rotation = left_conf < 0 ? 90 : 270;
  }
  return {.rotation = rotation, .confidence = 1};
}

private:
std::vector<TextRect> GetBoxes(TextUnit unit, bool with_text) {
auto iter = unique_from_raw(tesseract_->GetIterator());
Expand Down Expand Up @@ -242,6 +295,10 @@ EMSCRIPTEN_BINDINGS(ocrlib) {
.field("confidence", &TextRect::confidence)
.field("text", &TextRect::text);

value_object<Orientation>("Orientation")
.field("rotation", &Orientation::rotation)
.field("confidence", &Orientation::confidence);

class_<Image>("Image").constructor<int, int>().function("data", &Image::Data);

class_<OCREngine>("OCREngine")
Expand All @@ -250,6 +307,7 @@ EMSCRIPTEN_BINDINGS(ocrlib) {
.function("loadModel", &OCREngine::LoadModel)
.function("loadImage", &OCREngine::LoadImage)
.function("getBoundingBoxes", &OCREngine::GetBoundingBoxes)
.function("getOrientation", &OCREngine::GetOrientation)
.function("getTextBoxes", &OCREngine::GetTextBoxes)
.function("getText", &OCREngine::GetText);

Expand Down
17 changes: 17 additions & 0 deletions src/ocr-client.js
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import { imageDataFromBitmap } from "./utils";

/**
* @typedef {import('./ocr-engine').BoxItem} BoxItem
* @typedef {import('./ocr-engine').Orientation} Orientation
* @typedef {import('./ocr-engine').TextItem} TextItem
* @typedef {import('./ocr-engine').TextUnit} TextUnit
*/
Expand Down Expand Up @@ -203,6 +204,22 @@ export class OCRClient {
}
}

/**
* Attempt to determine the orientation of the image.
*
* This currently uses a simplistic algorithm [1] which is designed for
* non-uppercase Latin text. It will likely perform badly for other scripts or
* if the text is all uppercase.
*
* [1] See http://www.leptonica.org/papers/skew-measurement.pdf
*
* @return {Promise<Orientation>}
*/
async getOrientation() {
const engine = await this._ocrEngine;
return engine.getOrientation();
}

/** @param {ProgressListener} listener */
_addProgressListener(listener) {
this._progressListeners.push(listener);
Expand Down
24 changes: 24 additions & 0 deletions src/ocr-engine.js
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,14 @@ export const layoutFlags = {
* @prop {string} text
*/

/**
* Result of orientation detection.
*
* @typedef Orientation
* @prop {number} rotation - Estimated rotation of the image in degrees (0, 90, 180 or 270)
* @prop {number} confidence - Confidence value in [0, 1]
*/

/**
* @typedef {'line'|'word'} TextUnit
*/
Expand Down Expand Up @@ -241,6 +249,22 @@ class OCREngine {
);
}

/**
* Attempt to determine the orientation of the document image in degrees.
*
* This currently uses a simplistic algorithm [1] which is designed for
* non-uppercase Latin text. It will likely perform badly for other scripts or
* if the text is all uppercase.
*
* [1] See http://www.leptonica.org/papers/skew-measurement.pdf
*
* @return {Orientation}
*/
getOrientation() {
this._checkImageLoaded();
return this._engine.getOrientation();
}

_checkModelLoaded() {
if (!this._modelLoaded) {
throw new Error("No text recognition model loaded");
Expand Down
10 changes: 10 additions & 0 deletions test/ocr-client-test.js
Original file line number Diff line number Diff line change
Expand Up @@ -108,4 +108,14 @@ describe("OCRClient", () => {
}
}
});

// Smoke test: the client's orientation method returns a result for an
// upright page. Per-rotation coverage lives in the OCREngine tests.
it("can determine image orientation", async () => {
  const page = await loadImage(resolve("./small-test-page.jpg"));
  await ocr.loadImage(page);

  const { rotation, confidence } = await ocr.getOrientation();

  assert.equal(rotation, 0);
  assert.equal(confidence, 1.0);
});
});
17 changes: 16 additions & 1 deletion test/ocr-engine-test.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,14 @@ import { dirname } from "node:path";
import { readFile } from "node:fs/promises";

import { assert } from "chai";
import sharp from "sharp";

import {
createOCREngine,
layoutFlags,
supportsFastBuild,
} from "../dist/lib.js";
import { loadImage, resolve } from "./util.js";
import { loadImage, resolve, toImageData } from "./util.js";

const { StartOfLine, EndOfLine } = layoutFlags;

Expand Down Expand Up @@ -293,4 +294,18 @@ describe("OCREngine", () => {
});
assert.deepEqual(progressSteps, [100]);
});

// Rotate the sample page by each quarter turn and check that the engine
// reports the same rotation back with full confidence.
it("can determine image orientation", async () => {
  const imagePath = resolve("./small-test-page.jpg");
  const quarterTurns = [0, 90, 180, 270];

  for (const expectedRotation of quarterTurns) {
    const rotated = await sharp(imagePath)
      .ensureAlpha()
      .rotate(expectedRotation);
    const imageData = await toImageData(rotated);

    ocr.loadImage(imageData);
    const { rotation, confidence } = ocr.getOrientation();

    assert.equal(rotation, expectedRotation);
    assert.equal(confidence, 1);
  }
});
});
17 changes: 14 additions & 3 deletions test/util.js
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,25 @@ export function resolve(path, moduleURL = import.meta.url) {
}

/**
* @param {string} path
* Convert a sharp image to an ImageData-like object that can be passed to
* OCREngine and OCRClient.
*/
export async function loadImage(path) {
const image = await sharp(path).ensureAlpha();
/**
 * Convert a sharp image to an ImageData-like object that can be passed to
 * OCREngine and OCRClient.
 *
 * The dimensions are taken from the `info` that sharp returns alongside the
 * raw pixel buffer, so they describe the pixels actually produced. sharp's
 * `metadata()` describes the *input* image and ignores pending operations
 * such as `rotate()`, which would yield swapped width/height for a 90 or 270
 * degree rotation of a non-square image.
 *
 * @param {import('sharp').Sharp} image
 * @return {Promise<{ data: Buffer, width: number, height: number }>}
 */
export async function toImageData(image) {
  const { data, info } = await image
    .raw()
    .toBuffer({ resolveWithObject: true });
  return {
    data,
    width: info.width,
    height: info.height,
  };
}

/**
 * Load and decode the image at `path` into an ImageData-like object.
 *
 * An alpha channel is added via `ensureAlpha()` before conversion.
 *
 * @param {string} path
 * @return {Promise<ImageData>} Resolves to the result of `toImageData`
 */
export async function loadImage(path) {
  const decoded = await sharp(path).ensureAlpha();
  const imageData = toImageData(decoded);
  return imageData;
}