|
| 1 | +<!DOCTYPE html> |
| 2 | +<html> |
<head>
  <!-- Declare the document encoding explicitly instead of leaving the browser to guess. -->
  <meta charset="utf-8">
  <!-- Lay the page out at device width on small screens. -->
  <meta name="viewport" content="width=device-width, initial-scale=1">
  <title>PDF to Images with OCR</title>
  <!-- pdf.js renders PDF pages to canvas; tesseract.js performs the OCR. Both are loaded from CDNs. -->
  <script src="https://cdnjs.cloudflare.com/ajax/libs/pdf.js/2.9.359/pdf.min.js"></script>
  <script src="https://unpkg.com/tesseract.js@v2.1.0/dist/tesseract.min.js"></script>
  <style>
    body {
      padding: 2em;
    }

    /* Click / drag-and-drop target used to pick the PDF. */
    .dropzone {
      width: 100%;
      height: 10em;
      border: 2px dashed #ccc;
      display: flex;
      justify-content: center;
      align-items: center;
      font-size: 24px;
      cursor: pointer;
    }

    /* Applied by the script while a dragged item hovers over the drop zone. */
    .dropzone.drag-over {
      background-color: pink;
    }

    .image-container img {
      margin-bottom: 10px;
    }

    /* Per-page OCR text box inserted after each rendered page image. */
    .textarea-alt {
      width: 100%;
      height: 10em;
      margin-bottom: 20px;
    }

    /* Combined OCR text of the whole document. */
    .full-document {
      width: 100%;
      height: 30em;
      margin-top: 20px;
    }
  </style>
</head>
| 39 | +<body> |
<!-- Hidden native file picker; the script forwards clicks on the drop zone to it.
     Both the extension and the MIME type are listed so the picker filters
     consistently across platforms. -->
<input type="file" id="fileInput" accept=".pdf,application/pdf" style="display: none;" />
<div class="dropzone" id="dropzone">
  Drag and drop PDF file here or click to select a file
</div>
<!-- Filled by the script with one rendered image and one OCR textarea per page. -->
<div class="image-container"></div>
<h2 id="fullDocumentHeading">Full document</h2>
<!-- aria-labelledby gives the textarea an accessible name taken from the heading above. -->
<textarea class="full-document" id="fullDocument" aria-labelledby="fullDocumentHeading"></textarea>
| 47 | + |
| 48 | + <script> |
// Pixel width every rendered PDF page is scaled to (see convertPDFToImages).
const desiredWidth = 800;

// Page elements the script reads from and writes to.
const dropzone = document.getElementById('dropzone');
const fileInput = document.getElementById('fileInput');
const imageContainer = document.querySelector('.image-container');
const fullDocumentTextarea = document.getElementById('fullDocument');

// Attach the drop zone's drag-and-drop and click handlers.
for (const [eventName, listener] of [
  ['dragover', handleDragOver],
  ['dragleave', handleDragLeave],
  ['drop', handleDrop],
  ['click', handleClick],
]) {
  dropzone.addEventListener(eventName, listener);
}

// pdf.js parses documents in a web worker; point it at the worker script
// whose version matches the pdf.js build loaded by the page.
pdfjsLib.GlobalWorkerOptions.workerSrc = 'https://cdnjs.cloudflare.com/ajax/libs/pdf.js/2.9.359/pdf.worker.min.js';
| 61 | + |
/**
 * `dragover` listener for the drop zone: marks it as an active drop target.
 *
 * Cancelling the event is what tells the browser that dropping is allowed
 * here; without it the `drop` event never fires.
 *
 * Not `async`: nothing is awaited, and this event fires continuously while
 * dragging, so there is no reason to allocate a Promise per event.
 *
 * @param {DragEvent} event - The dragover event.
 * @returns {void}
 */
function handleDragOver(event) {
  event.preventDefault();
  dropzone.classList.add('drag-over');
}
| 66 | + |
/**
 * `dragleave` listener for the drop zone: removes the active-target
 * highlight once the dragged item leaves.
 *
 * Not `async`: nothing is awaited here.
 *
 * @param {DragEvent} event - The dragleave event.
 * @returns {void}
 */
function handleDragLeave(event) {
  event.preventDefault();
  dropzone.classList.remove('drag-over');
}
| 71 | + |
/**
 * `drop` listener for the drop zone: takes the first dropped file, mirrors
 * the dropped file list onto the hidden file input, and processes the file.
 *
 * @param {DragEvent} event - The drop event.
 * @returns {Promise<void>} Resolves once processing has finished or failed.
 */
async function handleDrop(event) {
  // Stop the browser from navigating to / opening the dropped file.
  event.preventDefault();
  dropzone.classList.remove('drag-over');

  const { files } = event.dataTransfer;
  // `files` is empty when the dragged item is not a file (selected text,
  // a link, ...); there is nothing to process in that case.
  if (files.length === 0) {
    return;
  }

  fileInput.files = files;

  // Await inside try/catch so a failure is reported instead of surfacing as
  // an unhandled promise rejection.
  try {
    await processFile(files[0]);
  } catch (error) {
    console.error('Error:', error);
  }
}
| 79 | + |
/**
 * `click` listener for the drop zone: opens the hidden file input's native
 * file picker.
 *
 * Not `async`: nothing is awaited here.
 *
 * @returns {void}
 */
function handleClick() {
  fileInput.click();
}
| 83 | + |
// Process the file chosen through the native file picker.
fileInput.addEventListener('change', async (event) => {
  const file = event.target.files[0];
  // The selection can be empty (for example when a previous choice is
  // cleared by cancelling the dialog); there is nothing to process then.
  if (!file) {
    return;
  }

  // Await inside try/catch so a failure is reported instead of surfacing as
  // an unhandled promise rejection.
  try {
    await processFile(file);
  } catch (error) {
    console.error('Error:', error);
  }
});
| 88 | + |
/**
 * Renders every page of a PDF into the image container, runs OCR on each
 * page, and writes the combined text into the full-document textarea.
 *
 * For each page an <img> and a textarea are appended to the image container;
 * the textarea shows "Processing..." until that page's OCR result arrives.
 * Pages are handled one after another.
 *
 * @param {File} file - The PDF to process. A missing file is ignored.
 * @returns {Promise<void>} Resolves when every page has been handled.
 */
async function processFile(file) {
  if (!file) {
    return;
  }

  // Drop the output of any previously processed file so the page images and
  // the full-document text always describe the same document.
  imageContainer.textContent = '';
  fullDocumentTextarea.value = '';

  const pageTexts = [];
  let pageNumber = 0;

  for await (const { imageURL } of convertPDFToImages(file)) {
    pageNumber += 1;

    const imgElement = document.createElement('img');
    imgElement.src = imageURL;
    imgElement.alt = `Page ${pageNumber}`;
    imageContainer.appendChild(imgElement);

    const altTextarea = document.createElement('textarea');
    altTextarea.classList.add('textarea-alt');
    altTextarea.placeholder = 'Processing...';
    imageContainer.appendChild(altTextarea);

    try {
      const { text } = await ocrImage(imageURL);
      altTextarea.value = text;
      altTextarea.placeholder = '';
      pageTexts.push(text);
    } catch (error) {
      // One unreadable page must not abort the remaining pages or leave its
      // textarea stuck on "Processing...".
      console.error('Error:', error);
      altTextarea.placeholder = 'OCR failed';
    }
  }

  // Pages are separated by a blank line; surrounding whitespace is trimmed.
  fullDocumentTextarea.value = pageTexts.join('\n\n').trim();
}
| 111 | + |
/**
 * Lazily renders each page of a PDF file to a JPEG data URL.
 *
 * Every page is drawn onto an off-screen canvas scaled so that it is
 * `desiredWidth` pixels wide, preserving the page's aspect ratio.
 *
 * Errors while loading or rendering are logged and end the sequence early;
 * they are not propagated to the consumer.
 *
 * @param {File|Blob} file - The PDF to render.
 * @yields {{ imageURL: string }} One JPEG data URL per page, in page order.
 */
async function* convertPDFToImages(file) {
  let objectURL = null;
  let loadingTask = null;

  try {
    objectURL = URL.createObjectURL(file);
    loadingTask = pdfjsLib.getDocument(objectURL);
    const pdf = await loadingTask.promise;

    for (let pageNumber = 1; pageNumber <= pdf.numPages; pageNumber++) {
      const page = await pdf.getPage(pageNumber);

      // Measure the page at its natural size to derive the scale factor that
      // makes it exactly `desiredWidth` pixels wide.
      const naturalViewport = page.getViewport({ scale: 1 });
      const scale = desiredWidth / naturalViewport.width;

      const canvas = document.createElement('canvas');
      canvas.width = desiredWidth;
      canvas.height = scale * naturalViewport.height;

      await page.render({
        canvasContext: canvas.getContext('2d'),
        viewport: page.getViewport({ scale }),
      }).promise;

      yield { imageURL: canvas.toDataURL('image/jpeg', 0.8) };
    }
  } catch (error) {
    console.error('Error:', error);
  } finally {
    // Runs on completion, on error, and when the consumer stops iterating
    // early, so the blob URL and the pdf.js document are always released.
    if (objectURL !== null) {
      URL.revokeObjectURL(objectURL);
    }
    if (loadingTask !== null) {
      await loadingTask.destroy();
    }
  }
}
| 136 | + |
/**
 * Runs English OCR on a single image with a dedicated Tesseract worker.
 *
 * A fresh worker is created for every call and is always terminated, even
 * when loading or recognition fails, so failed pages do not leak workers.
 *
 * @param {string} imageUrl - Image source to recognise (for example a data URL).
 * @returns {Promise<{ text: string }>} The recognised text.
 * @throws Whatever the Tesseract worker rejects with while loading or recognising.
 */
async function ocrImage(imageUrl) {
  const worker = Tesseract.createWorker();

  try {
    await worker.load();
    await worker.loadLanguage('eng');
    await worker.initialize('eng');

    const {
      data: { text },
    } = await worker.recognize(imageUrl);

    return { text };
  } finally {
    await worker.terminate();
  }
}
| 150 | + </script> |
| 151 | +</body> |
| 152 | +</html> |
0 commit comments