-
Notifications
You must be signed in to change notification settings - Fork 24
/
ocr-engine.ts
401 lines (355 loc) · 11.3 KB
/
ocr-engine.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
// @ts-ignore - Don't error if library hasn't been built yet.
import initTesseractCore from "../build/tesseract-core";
import { imageDataFromBitmap } from "./utils";
/**
* JS interface to a `std::vector` returned from a C++ method wrapped by
* Embind.
*
* Only the two members this module relies on are declared: elements are read
* one at a time with `get`, for indices below `size()`.
*/
type StdVector<T> = {
/** Number of elements in the vector. */
size: () => number;
/** Return the element at position `index`. */
get: (index: number) => T;
};
/**
 * Copy the contents of an Embind `std::vector` wrapper into a plain JS array.
 *
 * The vector's size is read once, then each element is fetched in ascending
 * index order via `get`.
 *
 * @param vec - Embind wrapper exposing `size()` and `get(index)`
 * @returns A new array holding the vector's elements in order
 */
function jsArrayFromStdVector<T>(vec: StdVector<T>): T[] {
  const length = vec.size();
  return Array.from({ length }, (_unused, index) => vec.get(index));
}
/**
 * Bit flags describing where a text item sits within its line.
 *
 * Each flag occupies its own bit so that several can be combined in the
 * `flags` field of a result item.
 *
 * Keep this in sync with `LayoutFlags` in lib.cpp.
 */
export const layoutFlags = {
  StartOfLine: 1 << 0,
  EndOfLine: 1 << 1,
};
/**
* Rectangle described by the coordinates of its four edges.
*
* NOTE(review): presumably integer pixel coordinates within the loaded image,
* with `right`/`bottom` conventions defined by the C++ side — confirm against
* lib.cpp.
*/
export type IntRect = {
left: number;
top: number;
right: number;
bottom: number;
};
/**
* Item of text found in a document image by layout analysis.
*
* This is the element type returned by `OCREngine.getBoundingBoxes`.
*/
export type BoxItem = {
/** Bounding box of the item. */
rect: IntRect;
/** Combination of flags from {@link layoutFlags} */
flags: number;
};
/**
* Item of text found in a document image by layout analysis and OCR.
*
* This is the element type returned by `OCREngine.getTextBoxes`.
*/
export type TextItem = {
/** Bounding box of the item. */
rect: IntRect;
/** Combination of flags from {@link layoutFlags} */
flags: number;
/** Confidence score for this item (a word or a line) in [0, 1] */
confidence: number;
/** Recognized text content of the item. */
text: string;
};
/**
* Result of orientation detection.
*
* This is the value returned by `OCREngine.getOrientation`.
*/
export type Orientation = {
/**
* Detected rotation of the image.
*
* NOTE(review): presumably in degrees, as stated in the `getOrientation`
* docs; the sign/direction convention is not visible here — confirm.
*/
rotation: number;
/** Confidence value in [0, 1] */
confidence: number;
};
/**
* Granularity at which bounding boxes and text are reported: one item per
* line of text, or one item per word.
*/
export type TextUnit = "line" | "word";
/**
* Handler that receives OCR operation progress updates.
*
* It may be invoked multiple times during a single recognition call.
* NOTE(review): the range of `progress` (e.g. 0-100 vs 0-1) is determined by
* the C++ side and is not visible in this file — confirm before documenting.
*/
export type ProgressListener = (progress: number) => void;
/**
 * Low-level synchronous API for performing OCR.
 *
 * Instances are constructed using {@link createOCREngine}.
 */
export class OCREngine {
  private readonly _tesseractLib: any;
  private _engine: any;
  private _modelLoaded: boolean;
  private _imageLoaded: boolean;
  private readonly _progressChannel?: MessagePort;

  /**
   * Initialize the OCREngine.
   *
   * Use {@link createOCREngine} rather than calling this directly.
   *
   * @param tessLib - Emscripten entry point for the compiled WebAssembly module.
   * @param progressChannel - Channel used to report progress
   *   updates when OCREngine is run on a background thread
   */
  constructor(tessLib: any, progressChannel?: MessagePort) {
    this._tesseractLib = tessLib;
    this._engine = new tessLib.OCREngine();
    this._modelLoaded = false;
    this._imageLoaded = false;
    this._progressChannel = progressChannel;
  }

  /**
   * Shut down the OCR engine and free up resources.
   *
   * Calling this more than once is harmless; only the first call releases
   * the underlying engine object.
   */
  destroy() {
    if (!this._engine) {
      return;
    }
    this._engine.delete();
    this._engine = null;
  }

  /**
   * Get the value, represented as a string, of a Tesseract configuration variable.
   *
   * See {@link setVariable} for available variables.
   *
   * @throws Error if the engine reports that the lookup did not succeed
   */
  getVariable(name: string): string {
    const result = this._engine.getVariable(name);
    if (!result.success) {
      throw new Error(`Unable to get variable ${name}`);
    }
    return result.value;
  }

  /**
   * Set the value of a Tesseract configuration variable.
   *
   * For a list of configuration variables, see
   * https://github.com/tesseract-ocr/tesseract/blob/677f5822f247ccb12b4e026265e88b959059fb59/src/ccmain/tesseractclass.cpp#L53
   *
   * If you have Tesseract installed locally, executing `tesseract --print-parameters`
   * will also display a list of configuration variables.
   *
   * @throws Error if the engine reports an error
   */
  setVariable(name: string, value: string) {
    const result = this._engine.setVariable(name, value);
    if (result.error) {
      throw new Error(`Unable to set variable ${name}`);
    }
  }

  /**
   * Load a trained text recognition model.
   *
   * @param model - Model file contents
   * @throws Error if the engine reports an error
   */
  loadModel(model: Uint8Array | ArrayBuffer) {
    const modelArray =
      model instanceof ArrayBuffer ? new Uint8Array(model) : model;
    const result = this._engine.loadModel(modelArray);
    if (result.error) {
      throw new Error("Text recognition model failed to load");
    }
    this._modelLoaded = true;
  }

  /**
   * Load a document image for processing by subsequent operations.
   *
   * This is a cheap operation as expensive processing is deferred until
   * bounding boxes or text content is requested.
   *
   * If the image fails validation, the previously loaded image (if any) is
   * left in place. If loading fails after validation, no image is loaded.
   *
   * @throws Error if the pixel data is too short for the stated dimensions,
   *   if a dimension is not positive, or if the engine reports an error
   */
  loadImage(image: ImageBitmap | ImageData) {
    let imageData: ImageData;
    // `ImageBitmap` does not exist in every runtime, hence the `typeof` guard.
    if (typeof ImageBitmap !== "undefined" && image instanceof ImageBitmap) {
      imageData = imageDataFromBitmap(image);
    } else {
      imageData = image as ImageData;
    }

    const { width, height, data } = imageData;
    if (data.length < width * height * 4) {
      throw new Error("Image data length does not match width/height");
    }
    if (width <= 0 || height <= 0) {
      throw new Error("Image width or height is zero");
    }

    // Free up resources used by the previous image, if any. Doing this before
    // creating the buffer for the new image reduces peak memory usage.
    this._engine.clearImage();
    // The previous image is gone from here on. Record that now so that a
    // failure below cannot leave the flag claiming an image is still loaded.
    this._imageLoaded = false;

    // Allocate a temporary internal image for transferring the image data into
    // Tesseract
    const engineImage = new this._tesseractLib.Image(width, height);
    let result;
    try {
      // View exactly `width * height` RGBA pixels as 32-bit words. `data` may
      // be longer than needed or may start part-way into its buffer, so the
      // offset and length are given explicitly. A 32-bit view requires a
      // 4-byte aligned offset; otherwise the needed bytes are copied first.
      const pixelCount = width * height;
      const pixels =
        data.byteOffset % 4 === 0
          ? new Uint32Array(data.buffer, data.byteOffset, pixelCount)
          : new Uint32Array(data.slice(0, pixelCount * 4).buffer);
      engineImage.data().set(pixels);

      // Load the image. This will take a copy of the image within Tesseract, so
      // we can release the original afterwards.
      result = this._engine.loadImage(engineImage);
    } finally {
      // Release the temporary image even if the copy or load threw.
      engineImage.delete();
    }

    if (result.error) {
      throw new Error("Failed to load image");
    }
    this._imageLoaded = true;
  }

  /**
   * Clear the current image and text recognition results.
   *
   * This will clear the loaded image data internally, but keep the text
   * recognition model loaded.
   *
   * At present there is no way to shrink WebAssembly memory, so this will not
   * return the memory used by the image to the OS/browser. To release memory,
   * the `OCREngine` instance needs to be destroyed via {@link destroy}.
   */
  clearImage() {
    this._engine.clearImage();
    this._imageLoaded = false;
  }

  /**
   * Perform layout analysis on the current image, if not already done, and
   * return bounding boxes for a given unit of text.
   *
   * This operation is relatively cheap compared to text recognition, so can
   * provide much faster results if only the location of lines/words etc. on
   * the page is required, not the text content. This operation can also be
   * performed before a text recognition model is loaded.
   *
   * This method may return a different number/positions of words on a line
   * compared to {@link getTextBoxes} due to the simpler analysis. After full
   * OCR has been performed by {@link getTextBoxes} or {@link getText}, this
   * method should return the same results.
   *
   * @throws Error if no image is loaded or `unit` is not a known text unit
   */
  getBoundingBoxes(unit: TextUnit): BoxItem[] {
    this._checkImageLoaded();
    const textUnit = this._textUnitForUnit(unit);
    return jsArrayFromStdVector(this._engine.getBoundingBoxes(textUnit));
  }

  /**
   * Perform layout analysis and text recognition on the current image, if
   * not already done, and return bounding boxes and text content for a given
   * unit of text.
   *
   * A text recognition model must be loaded with {@link loadModel} before this
   * is called.
   *
   * @param unit - Granularity of the returned items
   * @param onProgress - Optional handler invoked with progress updates
   * @throws Error if no image or no model is loaded, or `unit` is invalid
   */
  getTextBoxes(unit: TextUnit, onProgress?: ProgressListener): TextItem[] {
    this._checkImageLoaded();
    this._checkModelLoaded();
    const textUnit = this._textUnitForUnit(unit);
    return jsArrayFromStdVector(
      this._engine.getTextBoxes(textUnit, this._progressReporter(onProgress))
    );
  }

  /**
   * Perform layout analysis and text recognition on the current image, if
   * not already done, and return the page text as a string.
   *
   * A text recognition model must be loaded with {@link loadModel} before this
   * is called.
   *
   * @param onProgress - Optional handler invoked with progress updates
   * @throws Error if no image or no model is loaded
   */
  getText(onProgress?: ProgressListener): string {
    this._checkImageLoaded();
    this._checkModelLoaded();
    return this._engine.getText(this._progressReporter(onProgress));
  }

  /**
   * Perform layout analysis and text recognition on the current image, if
   * not already done, and return the page text in hOCR format.
   *
   * A text recognition model must be loaded with {@link loadModel} before this
   * is called.
   *
   * @param onProgress - Optional handler invoked with progress updates
   * @throws Error if no image or no model is loaded
   */
  getHOCR(onProgress?: ProgressListener): string {
    this._checkImageLoaded();
    this._checkModelLoaded();
    return this._engine.getHOCR(this._progressReporter(onProgress));
  }

  /**
   * Attempt to determine the orientation of the document image in degrees.
   *
   * This currently uses a simplistic algorithm [1] which is designed for
   * non-uppercase Latin text. It will likely perform badly for other scripts or
   * if the text is all uppercase.
   *
   * [1] See http://www.leptonica.org/papers/skew-measurement.pdf
   *
   * @throws Error if no image is loaded
   */
  getOrientation(): Orientation {
    this._checkImageLoaded();
    return this._engine.getOrientation();
  }

  /**
   * Build the progress callback handed to the engine for recognition calls.
   *
   * Each update is passed to the caller's listener, if given, and then posted
   * to the progress channel, if one was supplied at construction.
   */
  private _progressReporter(onProgress?: ProgressListener) {
    return (progress: number) => {
      onProgress?.(progress);
      this._progressChannel?.postMessage({ progress });
    };
  }

  /** Throw unless a model has been loaded via {@link loadModel}. */
  private _checkModelLoaded() {
    if (!this._modelLoaded) {
      throw new Error("No text recognition model loaded");
    }
  }

  /** Throw unless an image is currently loaded via {@link loadImage}. */
  private _checkImageLoaded() {
    if (!this._imageLoaded) {
      throw new Error("No image loaded");
    }
  }

  /** Map the public string text unit to the library's `TextUnit` value. */
  private _textUnitForUnit(unit: TextUnit) {
    const { TextUnit } = this._tesseractLib;
    switch (unit) {
      case "word":
        return TextUnit.Word;
      case "line":
        return TextUnit.Line;
      default:
        throw new Error("Invalid text unit");
    }
  }
}
/**
 * Report whether the runtime accepts a WebAssembly module that uses SIMD.
 *
 * The check validates a tiny precompiled module whose only function produces
 * a `v128` value. It was generated with `wat2wasm` from:
 *
 *   (module
 *     (func (result v128)
 *       i32.const 0
 *       i8x16.splat
 *       i8x16.popcnt
 *     )
 *   )
 */
function wasmSIMDSupported() {
  const probeModule = new Uint8Array([
    // Magic number and version
    0, 97, 115, 109, 1, 0, 0, 0,
    // Type section
    1, 5, 1, 96, 0, 1, 123,
    // Function section
    3, 2, 1, 0,
    // Code section
    10, 10, 1, 8, 0, 65, 0, 253, 15, 253, 98, 11,
  ]);
  const accepted = WebAssembly.validate(probeModule);
  return accepted;
}
/**
 * Resolve `path` against `baseURL` and return the resulting absolute URL as
 * a string.
 *
 * @param path - Relative or absolute URL
 * @param baseURL - Absolute URL that relative paths are resolved against
 */
function resolve(path: string, baseURL: string) {
  const absolute = new URL(path, baseURL);
  return absolute.href;
}
/**
 * Report whether the current JS runtime has every WebAssembly feature that
 * the "fast" WebAssembly build depends on.
 *
 * When this returns false, the "fallback" build must be used instead.
 */
export function supportsFastBuild(): boolean {
  // SIMD is the only feature checked at present.
  const hasSIMD = wasmSIMDSupported();
  return hasSIMD;
}
/**
* Options accepted by {@link createOCREngine}. All fields are optional.
*/
export type CreateOCREngineOptions = {
/**
* WebAssembly binary to load. This can be used to customize how the binary URL
* is determined and fetched. {@link supportsFastBuild} can be used to
* determine which build to load.
*
* If omitted, the binary is fetched from a path relative to this module's URL.
*/
wasmBinary?: Uint8Array | ArrayBuffer;
/**
* Channel that the created engine posts `{ progress }` messages to during
* recognition, for use when the engine runs on a background thread.
*/
progressChannel?: MessagePort;
};
/**
 * Initialize the OCR library and return a new {@link OCREngine}.
 *
 * If no `wasmBinary` is supplied, the binary is fetched from a path relative
 * to this module: the standard build when {@link supportsFastBuild} returns
 * true, the fallback build otherwise.
 *
 * @param options - See {@link CreateOCREngineOptions}
 * @returns A promise for the initialized engine
 * @throws Error if the default WebAssembly binary cannot be fetched
 */
export async function createOCREngine({
  wasmBinary,
  progressChannel,
}: CreateOCREngineOptions = {}) {
  if (!wasmBinary) {
    const wasmPath = supportsFastBuild()
      ? "./tesseract-core.wasm"
      : "./tesseract-core-fallback.wasm";

    // nb. If this code is included in a non-ESM bundle, Rollup will replace
    // `import.meta.url` with code that uses `document.currentScript` /
    // `location.href`.
    const wasmURL = resolve(wasmPath, import.meta.url);

    const wasmBinaryResponse = await fetch(wasmURL);
    // `fetch` only rejects on network failure. Without this check, an HTTP
    // error page would be passed on as if it were the WebAssembly binary and
    // fail later with a much less helpful error.
    if (!wasmBinaryResponse.ok) {
      throw new Error(
        `Failed to fetch WebAssembly binary from ${wasmURL} (HTTP status ${wasmBinaryResponse.status})`
      );
    }
    wasmBinary = await wasmBinaryResponse.arrayBuffer();
  }

  const tessLib = await initTesseractCore({ wasmBinary });
  return new OCREngine(tessLib, progressChannel);
}