Skip to content

Commit 5dffa7e

Browse files
committed
Change URL to /ocr - since it does more than PDFs now, refs #1
1 parent d89c224 commit 5dffa7e

File tree

2 files changed

+228
-216
lines changed

2 files changed

+228
-216
lines changed

ocr.html

+222
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,222 @@
1+
<!DOCTYPE html>
2+
<html>
3+
<head>
4+
<title>OCR PDFs and images directly in your browser</title>
5+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
6+
<script defer data-domain="tools.simonwillison.net" src="https://plausible.io/js/script.js"></script>
7+
<script src="https://cdnjs.cloudflare.com/ajax/libs/pdf.js/2.9.359/pdf.min.js"></script>
8+
<script src="https://cdn.jsdelivr.net/npm/tesseract.js@5/dist/tesseract.min.js"></script>
9+
<style>
10+
/* Page-wide spacing and typography. */
body {
  padding: 1em;
  font-family: helvetica, sans-serif;
  line-height: 1.3;
}

/* Click / drag-and-drop target; flex centring keeps the prompt in the middle. */
.dropzone {
  box-sizing: border-box;
  width: 100%;
  height: 10em;
  border: 2px dashed #ccc;
  display: flex;
  justify-content: center;
  align-items: center;
  font-size: 24px;
  cursor: pointer;
  padding: 1em;
}

/* Added by the script while a file is being processed. */
.dropzone.disabled {
  cursor: not-allowed;
}

/* Added by the script while a file is dragged over the dropzone. */
.dropzone.drag-over {
  background-color: pink;
}

/* Page images rendered into the results container. */
.image-container img {
  margin-bottom: 10px;
  max-width: 100%;
}

/* Default size for OCR textareas; the script overrides the height to fit content. */
textarea {
  width: 100%;
  height: 10em;
  margin-bottom: 20px;
  box-sizing: border-box;
}

/* Hidden until the script finds more than one page with recognised text. */
.full-document-section {
  display: none;
  margin-bottom: 20px;
}
47+
</style>
48+
</head>
49+
<body>
50+
<h1>OCR PDFs and images directly in your browser</h1>
51+
<p>This tool runs entirely in your browser. No files are uploaded to a server.</p>
52+
<p>It uses <a href="https://tesseract.projectnaptha.com/">Tesseract.js</a> for OCR and <a href="https://mozilla.github.io/pdf.js/">PDF.js</a> to convert PDFs into images.</p>
53+
<input type="file" id="fileInput" accept=".pdf,.jpg,.jpeg,.png,.gif" style="display: none;" />
54+
<div class="dropzone" id="dropzone">
55+
Drag and drop a PDF, JPG, PNG, or GIF file here or click to select a file
56+
</div>
57+
<div class="full-document-section" id="fullDocumentSection">
58+
<h2>Full document</h2>
59+
<textarea class="full-document" id="fullDocument"></textarea>
60+
<h2>Pages</h2>
61+
</div>
62+
<div class="image-container"></div>
63+
64+
<script>
65+
// Pixel width every PDF page is rendered at; the height is scaled to match.
const desiredWidth = 1000;

// DOM handles shared by the functions below.
const dropzone = document.getElementById('dropzone');
const fileInput = document.getElementById('fileInput');
const imageContainer = document.querySelector('.image-container');
const fullDocumentTextarea = document.getElementById('fullDocument');
const fullDocumentSection = document.getElementById('fullDocumentSection');

// Set to false while a file is being processed; the dropzone handlers check it.
let fileSelectionAllowed = true;
73+
74+
/**
 * Rebuild the combined "Full document" textarea from the per-page textareas.
 *
 * The combined view is shown only when more than one page textarea contains
 * text; otherwise it is cleared and hidden.
 */
function showFullDocument() {
  const pageTexts = Array.from(
    document.querySelectorAll('.image-container textarea'),
    (ta) => ta.value.trim(),
  ).filter((pageText) => pageText.length > 0);

  const hasMultiplePages = pageTexts.length > 1;
  fullDocumentTextarea.value = hasMultiplePages ? pageTexts.join('\n\n') : '';
  fullDocumentSection.style.display = hasMultiplePages ? 'block' : 'none';
}
87+
88+
/**
 * Put text into a textarea and resize the textarea to fit it.
 *
 * @param {HTMLTextAreaElement} ta - Textarea to fill.
 * @param {string} text - Text to show; surrounding whitespace is trimmed.
 */
function setTextarea(ta, text) {
  ta.value = text.trim();
  // Drop any previous fixed height before reading scrollHeight, then add a
  // few pixels of slack on top of the measured content height.
  ta.style.height = 'auto';
  ta.style.height = `${ta.scrollHeight + 5}px`;
}
94+
95+
// Wire the dropzone up for drag-and-drop and for click-to-browse.
dropzone.addEventListener('dragover', handleDragOver);
dropzone.addEventListener('dragleave', handleDragLeave);
dropzone.addEventListener('drop', handleDrop);
dropzone.addEventListener('click', handleClick);

// Tell PDF.js where to load its worker script from (same version as pdf.min.js).
pdfjsLib.GlobalWorkerOptions.workerSrc = 'https://cdnjs.cloudflare.com/ajax/libs/pdf.js/2.9.359/pdf.worker.min.js';
101+
102+
/**
 * Highlight the dropzone while something is dragged over it.
 *
 * @param {DragEvent} event - The dragover event.
 */
async function handleDragOver(event) {
  // Always cancel the default action, even while processing, so the browser
  // does not fall back to its own handling of the drag.
  event.preventDefault();
  if (!fileSelectionAllowed) {
    return;
  }
  dropzone.classList.add('drag-over');
}
108+
109+
/**
 * Remove the dropzone highlight when the dragged item leaves it.
 *
 * @param {DragEvent} event - The dragleave event.
 */
async function handleDragLeave(event) {
  event.preventDefault();
  if (!fileSelectionAllowed) {
    return;
  }
  dropzone.classList.remove('drag-over');
}
115+
116+
/**
 * Process the first file dropped onto the dropzone.
 *
 * Drops are ignored while another file is being processed, and when the drop
 * carries no files (for example dragged text or a link).
 *
 * @param {DragEvent} event - The drop event.
 */
async function handleDrop(event) {
  // Cancel the default action so the browser does not open the dropped file.
  event.preventDefault();
  if (!fileSelectionAllowed) {
    return;
  }
  dropzone.classList.remove('drag-over');

  const droppedFiles = event.dataTransfer.files;
  const file = droppedFiles[0];
  if (!file) {
    return;
  }

  // Mirror the drop into the hidden file input so both entry points agree.
  fileInput.files = droppedFiles;
  try {
    await processFile(file);
  } catch (error) {
    // A listener's returned promise is never observed, so report failures here.
    console.error('Error processing dropped file:', error);
  }
}
125+
126+
/**
 * Open the native file picker when the dropzone is clicked, unless a file is
 * currently being processed.
 */
async function handleClick() {
  if (!fileSelectionAllowed) {
    return;
  }
  fileInput.click();
}
131+
132+
// Process the file chosen through the native file picker.
fileInput.addEventListener('change', (event) => {
  const file = event.target.files[0];
  if (!file) {
    // The selection can be empty, e.g. when a previous choice is cleared.
    return;
  }
  processFile(file).catch((error) => {
    console.error('Error processing selected file:', error);
  });
});
136+
137+
/**
 * OCR a file and render each page image followed by its recognised text.
 *
 * A file whose type is 'application/pdf' is converted to one image per page;
 * any other file is treated as a single image. While this runs, the dropzone
 * shows progress and file selection is disabled. The Tesseract worker is
 * terminated and the dropzone restored even when processing fails.
 *
 * @param {File} file - The file to process; a missing file is ignored.
 * @returns {Promise<void>} Rejects with the underlying error if processing fails.
 */
async function processFile(file) {
  if (!file) {
    return;
  }

  const originalText = dropzone.innerText;
  // Lock the UI before the first await so a second click or drop cannot start
  // a concurrent run while the worker is still being created.
  fileSelectionAllowed = false;
  dropzone.classList.add('disabled');
  dropzone.innerText = 'Processing file...';
  fullDocumentTextarea.value = '';
  fullDocumentSection.style.display = 'none';
  imageContainer.innerHTML = '';

  let worker = null;

  // Show one image, OCR it, and fill in its textarea.
  const recogniseImage = async (imageURL) => {
    const ta = await displayImage(imageURL);
    const { text } = await ocrImage(worker, imageURL);
    setTextarea(ta, text);
    showFullDocument();
  };

  try {
    worker = await Tesseract.createWorker();

    if (file.type === 'application/pdf') {
      const { numPages, imageIterator } = await convertPDFToImages(file);
      let done = 0;
      dropzone.innerText = `Processing ${numPages} page${numPages > 1 ? 's' : ''}`;
      for await (const { imageURL } of imageIterator) {
        await recogniseImage(imageURL);
        done += 1;
        dropzone.innerText = `Done ${done} of ${numPages}`;
      }
    } else {
      // The object URL stays alive because the displayed <img> keeps using it.
      await recogniseImage(URL.createObjectURL(file));
    }
  } finally {
    try {
      if (worker) {
        await worker.terminate();
      }
    } finally {
      dropzone.innerText = originalText;
      dropzone.classList.remove('disabled');
      fileSelectionAllowed = true;
    }
  }
}
172+
173+
/**
 * Append an image, followed by an empty textarea for its OCR text, to the
 * results container.
 *
 * @param {string} imageURL - URL assigned to the image's src.
 * @returns {Promise<HTMLTextAreaElement>} The textarea created for the image.
 */
async function displayImage(imageURL) {
  const imgElement = document.createElement('img');
  imgElement.src = imageURL;

  const altTextarea = document.createElement('textarea');
  altTextarea.classList.add('textarea-alt');
  // Placeholder is visible until the caller fills in the recognised text.
  altTextarea.placeholder = 'OCRing image...';

  imageContainer.appendChild(imgElement);
  imageContainer.appendChild(altTextarea);
  return altTextarea;
}
185+
186+
/**
 * Open a PDF and expose its pages as lazily rendered JPEG data URLs.
 *
 * Each page is rendered onto a canvas `desiredWidth` pixels wide, with the
 * height scaled to keep the page's aspect ratio. A page that fails to render
 * is logged and skipped, so the iterator may yield fewer than `numPages` items.
 *
 * @param {File} file - The PDF file to read.
 * @returns {Promise<{numPages: number, imageIterator: AsyncGenerator<{imageURL: string}>}>}
 */
async function convertPDFToImages(file) {
  const pdfURL = URL.createObjectURL(file);
  let pdf;
  try {
    pdf = await pdfjsLib.getDocument(pdfURL).promise;
  } catch (error) {
    // The document never opened, so nothing will need the URL again.
    URL.revokeObjectURL(pdfURL);
    throw error;
  }
  const { numPages } = pdf;

  async function* images() {
    try {
      for (let pageNumber = 1; pageNumber <= numPages; pageNumber++) {
        try {
          const page = await pdf.getPage(pageNumber);
          const baseViewport = page.getViewport({ scale: 1 });
          const scale = desiredWidth / baseViewport.width;

          const canvas = document.createElement('canvas');
          const context = canvas.getContext('2d');
          canvas.width = desiredWidth;
          canvas.height = scale * baseViewport.height;

          await page.render({
            canvasContext: context,
            viewport: page.getViewport({ scale }),
          }).promise;

          const imageURL = canvas.toDataURL('image/jpeg', 0.8);
          yield { imageURL };
        } catch (error) {
          // Best effort: keep going with the remaining pages.
          console.error(`Error rendering page ${pageNumber}:`, error);
        }
      }
    } finally {
      // Runs when iteration finishes or the consumer stops early.
      URL.revokeObjectURL(pdfURL);
    }
  }

  return { numPages, imageIterator: images() };
}
213+
214+
/**
 * Run OCR on one image and return only the recognised text.
 *
 * @param {object} worker - Worker exposing `recognize(image)`, as created by Tesseract.createWorker().
 * @param {string} imageUrl - Image URL passed to `worker.recognize`.
 * @returns {Promise<{text: string}>} The `data.text` field of the recognition result.
 */
async function ocrImage(worker, imageUrl) {
  const result = await worker.recognize(imageUrl);
  return { text: result.data.text };
}
220+
</script>
221+
</body>
222+
</html>

0 commit comments

Comments
 (0)