Skip to content

Commit 4971788

Browse files
authored
Create pdf-ocr.html
Refs #1
0 parents  commit 4971788

File tree

1 file changed

+152
-0
lines changed

1 file changed

+152
-0
lines changed

pdf-ocr.html

+152
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,152 @@
1+
<!DOCTYPE html>
2+
<html>
3+
<head>
  <!-- Declare the encoding first so the parser never has to guess it. -->
  <meta charset="utf-8">
  <!-- Lay the page out at device width instead of a zoomed-out desktop width. -->
  <meta name="viewport" content="width=device-width, initial-scale=1">
  <title>PDF to Images with OCR</title>
  <!--
    Third-party libraries. They are loaded synchronously on purpose: the inline
    script at the end of the body uses the pdfjsLib and Tesseract globals.
  -->
  <script src="https://cdnjs.cloudflare.com/ajax/libs/pdf.js/2.9.359/pdf.min.js"></script>
  <script src="https://unpkg.com/tesseract.js@v2.1.0/dist/tesseract.min.js"></script>
  <style>
    body {
      padding: 2em;
    }
    /* Click / drag-and-drop target for choosing the PDF. */
    .dropzone {
      width: 100%;
      height: 10em;
      border: 2px dashed #ccc;
      display: flex;
      justify-content: center;
      align-items: center;
      font-size: 24px;
      cursor: pointer;
    }
    /* Applied by the script while a drag hovers over the dropzone. */
    .dropzone.drag-over {
      background-color: pink;
    }
    .image-container img {
      margin-bottom: 10px;
    }
    /* Per-page OCR text box created by the script. */
    .textarea-alt {
      width: 100%;
      height: 10em;
      margin-bottom: 20px;
    }
    /* Combined OCR text of the whole document. */
    .full-document {
      width: 100%;
      height: 30em;
      margin-top: 20px;
    }
  </style>
</head>
39+
<body>
40+
<!-- Hidden native file picker; the script opens it when the dropzone is clicked. -->
<input type="file" id="fileInput" accept=".pdf" style="display: none;">
<!-- NOTE(review): the dropzone is operable by mouse only (click / drag); it has no keyboard support. -->
<div class="dropzone" id="dropzone">
  Drag and drop PDF file here or click to select a file
</div>
<!-- The script appends one image plus one OCR textarea per PDF page here. -->
<div class="image-container"></div>
<h2 id="fullDocumentHeading">Full document</h2>
<!-- aria-labelledby gives the textarea an accessible name taken from the heading above. -->
<textarea class="full-document" id="fullDocument" aria-labelledby="fullDocumentHeading"></textarea>
47+
48+
<script>
49+
// Pixel width used for every rendered page canvas.
const desiredWidth = 800;

// DOM handles shared by the handlers below.
const dropzone = document.getElementById('dropzone');
const fileInput = document.getElementById('fileInput');
const imageContainer = document.querySelector('.image-container');
const fullDocumentTextarea = document.getElementById('fullDocument');

// Attach every dropzone listener from a single table. The handlers are
// function declarations, so they are hoisted and already defined here.
const dropzoneListeners = [
  ['dragover', handleDragOver],
  ['dragleave', handleDragLeave],
  ['drop', handleDrop],
  ['click', handleClick],
];
for (const [eventName, listener] of dropzoneListeners) {
  dropzone.addEventListener(eventName, listener);
}

// Point pdf.js at the worker script that matches the library version loaded in <head>.
pdfjsLib.GlobalWorkerOptions.workerSrc = 'https://cdnjs.cloudflare.com/ajax/libs/pdf.js/2.9.359/pdf.worker.min.js';
61+
62+
/**
 * `dragover` listener for the dropzone.
 * Cancels the browser default and switches the hover highlight on.
 *
 * @param {DragEvent} event
 */
async function handleDragOver(event) {
  event.preventDefault();
  dropzone.classList.toggle('drag-over', true);
}
66+
67+
/**
 * `dragleave` listener for the dropzone.
 * Cancels the browser default and switches the hover highlight off.
 *
 * @param {DragEvent} event
 */
async function handleDragLeave(event) {
  event.preventDefault();
  dropzone.classList.toggle('drag-over', false);
}
71+
72+
/**
 * `drop` listener for the dropzone.
 * Clears the hover highlight, mirrors the dropped files into the hidden file
 * input, and starts processing the first dropped file.
 *
 * @param {DragEvent} event
 */
async function handleDrop(event) {
  event.preventDefault();
  dropzone.classList.remove('drag-over');

  const droppedFiles = event.dataTransfer ? event.dataTransfer.files : null;
  if (!droppedFiles || droppedFiles.length === 0) {
    // Nothing file-like was dropped (e.g. dragged text or a link): leave the
    // current results untouched instead of processing `undefined`.
    return;
  }

  fileInput.files = droppedFiles;
  await processFile(droppedFiles[0]);
}
79+
80+
/**
 * `click` listener for the dropzone: opens the hidden native file picker.
 */
async function handleClick() {
  // Reset the current selection first; otherwise choosing the same file again
  // does not change the input's value and no `change` event is fired.
  fileInput.value = '';
  fileInput.click();
}
83+
84+
// Process the file chosen through the native picker.
fileInput.addEventListener('change', (event) => {
  const file = event.target.files[0];
  if (!file) {
    // The selection is empty (e.g. the dialog was cancelled): nothing to process.
    return;
  }
  processFile(file);
});
88+
89+
/**
 * Renders each page of the given PDF as an image, OCRs it, and shows the
 * results: one image plus one textarea per page inside the image container,
 * and the concatenated text of all pages in the full-document textarea.
 *
 * @param {File} file PDF file to process; a missing file is ignored.
 * @returns {Promise<void>}
 */
async function processFile(file) {
  if (!file) {
    return;
  }

  // Start from a clean slate so output of a previously processed file does
  // not pile up underneath the new one.
  imageContainer.textContent = '';
  fullDocumentTextarea.value = '';

  const pageTexts = [];
  let pageNumber = 0;

  for await (const { imageURL } of convertPDFToImages(file)) {
    pageNumber += 1;

    const imgElement = document.createElement('img');
    imgElement.src = imageURL;
    imgElement.alt = `Page ${pageNumber} of ${file.name}`;
    imageContainer.appendChild(imgElement);

    const altTextarea = document.createElement('textarea');
    altTextarea.classList.add('textarea-alt');
    altTextarea.setAttribute('aria-label', `OCR text for page ${pageNumber}`);
    altTextarea.placeholder = 'Processing...';
    imageContainer.appendChild(altTextarea);

    let text = '';
    try {
      ({ text } = await ocrImage(imageURL));
      altTextarea.placeholder = '';
    } catch (error) {
      // One failing page must not abort the rest of the document or leave
      // the textarea stuck on "Processing...".
      console.error('Error:', error);
      altTextarea.placeholder = 'OCR failed';
    }
    altTextarea.value = text;
    pageTexts.push(text);
  }

  // Pages are separated by a blank line; surrounding whitespace is trimmed.
  fullDocumentTextarea.value = pageTexts.join('\n\n').trim();
}
111+
112+
/**
 * Async generator that renders each page of a PDF onto a canvas that is
 * `desiredWidth` pixels wide and yields it as a JPEG data URL.
 *
 * Errors are logged to the console and end the iteration early; they are not
 * re-thrown to the consumer.
 *
 * @param {File} file PDF file to render.
 * @yields {{ imageURL: string }} JPEG data URL of one page, in page order.
 */
async function* convertPDFToImages(file) {
  let objectURL = null;
  try {
    objectURL = URL.createObjectURL(file);
    const pdf = await pdfjsLib.getDocument(objectURL).promise;
    const numPages = pdf.numPages;

    for (let pageNumber = 1; pageNumber <= numPages; pageNumber++) {
      const page = await pdf.getPage(pageNumber);

      // Scale factor that maps the page's natural width onto desiredWidth.
      const naturalViewport = page.getViewport({ scale: 1 });
      const scale = desiredWidth / naturalViewport.width;

      const canvas = document.createElement('canvas');
      canvas.width = desiredWidth;
      canvas.height = scale * naturalViewport.height;

      await page.render({
        canvasContext: canvas.getContext('2d'),
        viewport: page.getViewport({ scale }),
      }).promise;

      yield { imageURL: canvas.toDataURL('image/jpeg', 0.8) };
    }
  } catch (error) {
    console.error('Error:', error);
  } finally {
    // Release the blob URL. Without this the browser keeps the file
    // referenced until the document is unloaded.
    if (objectURL !== null) {
      URL.revokeObjectURL(objectURL);
    }
  }
}
136+
137+
/**
 * Runs English OCR on a single image with a fresh Tesseract worker.
 *
 * @param {string} imageUrl Image to recognize (the caller passes a data URL).
 * @returns {Promise<{ text: string }>} The recognized text.
 */
async function ocrImage(imageUrl) {
  const worker = Tesseract.createWorker();
  try {
    await worker.load();
    await worker.loadLanguage("eng");
    await worker.initialize("eng");

    const {
      data: { text },
    } = await worker.recognize(imageUrl);

    return { text };
  } finally {
    // Always shut the worker down, including when loading or recognition
    // throws; otherwise the worker is leaked.
    await worker.terminate();
  }
}
150+
</script>
151+
</body>
152+
</html>

0 commit comments

Comments
 (0)