Skip to content

Commit d89c224

Browse files
committed
Upgrade to latest Tesseract.js, refs #1
1 parent 4caf66d commit d89c224

File tree

1 file changed

+8
-13
lines changed

1 file changed

+8
-13
lines changed

pdf-ocr.html

+8 -13
Original file line number | Diff line number | Diff line change
@@ -5,7 +5,7 @@
55
<meta name="viewport" content="width=device-width, initial-scale=1.0">
66
<script defer data-domain="tools.simonwillison.net" src="https://plausible.io/js/script.js"></script>
77
<script src="https://cdnjs.cloudflare.com/ajax/libs/pdf.js/2.9.359/pdf.min.js"></script>
8-
<script src="https://unpkg.com/tesseract.js@v2.1.0/dist/tesseract.min.js"></script>
8+
<script src="https://cdn.jsdelivr.net/npm/tesseract.js@5/dist/tesseract.min.js"></script>
99
<style>
1010
body {
1111
padding: 1em;
@@ -92,8 +92,6 @@ <h2>Pages</h2>
9292
ta.style.height = (ta.scrollHeight + 5) + 'px';
9393
}
9494

95-
const worker = Tesseract.createWorker();
96-
9795
dropzone.addEventListener('dragover', handleDragOver);
9896
dropzone.addEventListener('dragleave', handleDragLeave);
9997
dropzone.addEventListener('drop', handleDrop);
@@ -137,6 +135,7 @@ <h2>Pages</h2>
137135
});
138136

139137
async function processFile(file) {
138+
const worker = await Tesseract.createWorker();
140139
fullDocumentTextarea.value = '';
141140
fullDocumentSection.style.display = 'none';
142141
imageContainer.innerHTML = '';
@@ -145,26 +144,22 @@ <h2>Pages</h2>
145144
dropzone.classList.add('disabled');
146145
fileSelectionAllowed = false;
147146

148-
await worker.load();
149-
await worker.loadLanguage("eng");
150-
await worker.initialize("eng");
151-
152147
if (file.type === 'application/pdf') {
153148
const { numPages, imageIterator } = await convertPDFToImages(file);
154149
let done = 0;
155150
dropzone.innerText = `Processing ${numPages} page${numPages > 1 ? 's' : ''}`;
156151
for await (const { imageURL } of imageIterator) {
157-
const ta = await processImage(imageURL);
158-
const { text } = await ocrImage(imageURL);
152+
const ta = await displayImage(imageURL);
153+
const { text } = await ocrImage(worker, imageURL);
159154
setTextarea(ta, text);
160155
showFullDocument();
161156
done += 1;
162157
dropzone.innerText = `Done ${done} of ${numPages}`;
163158
}
164159
} else {
165160
const imageURL = URL.createObjectURL(file);
166-
const ta = await processImage(imageURL);
167-
const { text } = await ocrImage(imageURL);
161+
const ta = await displayImage(imageURL);
162+
const { text } = await ocrImage(worker, imageURL);
168163
setTextarea(ta, text);
169164
showFullDocument();
170165
}
@@ -175,7 +170,7 @@ <h2>Pages</h2>
175170
fileSelectionAllowed = true;
176171
}
177172

178-
async function processImage(imageURL) {
173+
async function displayImage(imageURL) {
179174
const imgElement = document.createElement('img');
180175
imgElement.src = imageURL;
181176
imageContainer.appendChild(imgElement);
@@ -216,7 +211,7 @@ <h2>Pages</h2>
216211
return {numPages: numPages, imageIterator: images()};
217212
}
218213

219-
async function ocrImage(imageUrl) {
214+
async function ocrImage(worker, imageUrl) {
220215
const {
221216
data: { text },
222217
} = await worker.recognize(imageUrl);

0 commit comments

Comments (0)