<!DOCTYPE html>
<html lang="en">
<head>
<!-- Declare the encoding first so the parser never has to guess it. -->
<meta charset="utf-8">
<title>OCR PDFs and images directly in your browser</title>
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<script defer data-domain="tools.simonwillison.net" src="https://plausible.io/js/script.js"></script>
<!-- PDF.js renders PDF pages to canvas; Tesseract.js performs the OCR. Both are used by the inline script below. -->
<script src="https://cdnjs.cloudflare.com/ajax/libs/pdf.js/2.9.359/pdf.min.js"></script>
<script src="https://unpkg.com/tesseract.js@v2.1.0/dist/tesseract.min.js"></script>
89 < style >
910 body {
10- padding : 2 em ;
11+ padding : 1 em ;
1112 font-family : helvetica, sans-serif;
13+ line-height : 1.3 ;
1214 }
1315 .dropzone {
16+ box-sizing : border-box;
1417 width : 100% ;
1518 height : 10em ;
1619 border : 2px dashed # ccc ;
1922 align-items : center;
2023 font-size : 24px ;
2124 cursor : pointer;
25+ padding : 1em ;
26+ }
27+ .dropzone .disabled {
28+ cursor : not-allowed;
2229 }
2330 .dropzone .drag-over {
2431 background-color : pink;
2532 }
2633 .image-container img {
2734 margin-bottom : 10px ;
35+ max-width : 100% ;
2836 }
29- . textarea-alt {
37+ textarea {
3038 width : 100% ;
3139 height : 10em ;
3240 margin-bottom : 20px ;
41+ box-sizing : border-box;
3342 }
34- .full-document {
35- width : 100% ;
36- height : 30em ;
37- margin-top : 20px ;
43+ .full-document-section {
44+ display : none;
45+ margin-bottom : 20px ;
3846 }
3947 </ style >
4048</ head >
4149< body >
42- < h1 > OCR a PDF </ h1 >
50+ < h1 > OCR PDFs and images directly in your browser </ h1 >
4351 < p > This tool runs entirely in your browser. No files are uploaded to a server.</ p >
44- < input type ="file " id ="fileInput " accept =".pdf " style ="display: none; " />
52+ < p > It uses < a href ="https://tesseract.projectnaptha.com/ "> Tesseract.js</ a > for OCR and < a href ="https://mozilla.github.io/pdf.js/ "> PDF.js</ a > to convert PDFs into images.</ p >
53+ < input type ="file " id ="fileInput " accept =".pdf,.jpg,.jpeg,.png,.gif " style ="display: none; " />
4554 < div class ="dropzone " id ="dropzone ">
46- Drag and drop PDF file here or click to select a file
55+ Drag and drop a PDF, JPG, PNG, or GIF file here or click to select a file
56+ </ div >
57+ < div class ="full-document-section " id ="fullDocumentSection ">
58+ < h2 > Full document</ h2 >
59+ < textarea class ="full-document " id ="fullDocument "> </ textarea >
60+ < h2 > Pages</ h2 >
4761 </ div >
4862 < div class ="image-container "> </ div >
49- < h2 > Full document</ h2 >
50- < textarea class ="full-document " id ="fullDocument "> </ textarea >
5163
5264 < script >
// Pixel width every PDF page is rendered at before it is handed to OCR.
const desiredWidth = 800;

// Elements the script reads from and writes to.
const dropzone = document.getElementById('dropzone');
const fileInput = document.getElementById('fileInput');
const imageContainer = document.querySelector('.image-container');
const fullDocumentTextarea = document.getElementById('fullDocument');
const fullDocumentSection = document.getElementById('fullDocumentSection');

// Cleared while a file is being processed so the handlers ignore drops and clicks.
let fileSelectionAllowed = true;
73+
/**
 * Refresh the combined "Full document" textarea from the per-image textareas.
 *
 * The section is shown only when more than one per-image textarea holds
 * non-blank text; otherwise it is emptied and hidden.
 */
function showFullDocument() {
    const pageTexts = [];
    for (const area of document.querySelectorAll('.image-container textarea')) {
        const trimmed = area.value.trim();
        if (trimmed.length) {
            pageTexts.push(trimmed);
        }
    }

    const hasSeveralPages = pageTexts.length > 1;
    fullDocumentTextarea.value = hasSeveralPages ? pageTexts.join("\n\n") : '';
    fullDocumentSection.style.display = hasSeveralPages ? 'block' : 'none';
}
87+
/**
 * Put trimmed text into a textarea and resize it to fit that text.
 *
 * @param {HTMLTextAreaElement} ta - textarea to fill.
 * @param {string} text - text to display; surrounding whitespace is removed.
 */
function setTextarea(ta, text) {
    ta.value = text.trim();

    const { style } = ta;
    // Reset to 'auto' first so scrollHeight is measured against the new content.
    style.height = 'auto';
    style.height = `${ta.scrollHeight + 5}px`;
}
94+
95+ const worker = Tesseract . createWorker ( ) ;
5896
5997 dropzone . addEventListener ( 'dragover' , handleDragOver ) ;
6098 dropzone . addEventListener ( 'dragleave' , handleDragLeave ) ;
@@ -65,24 +103,32 @@ <h2>Full document</h2>
65103
/**
 * dragover handler: highlight the dropzone while a drag hovers over it.
 * The default action is always cancelled; the highlight is skipped while
 * file selection is disabled.
 */
async function handleDragOver(event) {
    event.preventDefault();
    if (!fileSelectionAllowed) {
        return;
    }
    dropzone.classList.add('drag-over');
}
70110
/**
 * dragleave handler: remove the dropzone highlight when the drag leaves.
 * The default action is always cancelled; the class is left untouched while
 * file selection is disabled.
 */
async function handleDragLeave(event) {
    event.preventDefault();
    if (!fileSelectionAllowed) {
        return;
    }
    dropzone.classList.remove('drag-over');
}
75117
/**
 * drop handler: start processing the first dropped file.
 *
 * The default action is always cancelled. Nothing else happens while file
 * selection is disabled. A drop that carries no files (for example dragged
 * text or a link) only clears the highlight instead of being passed on to
 * processFile as `undefined`.
 */
async function handleDrop(event) {
    event.preventDefault();
    if (!fileSelectionAllowed) {
        return;
    }
    dropzone.classList.remove('drag-over');

    const files = event.dataTransfer ? event.dataTransfer.files : null;
    if (!files || files.length === 0) {
        return;
    }

    // Mirror the dropped files onto the hidden input, then process the first one.
    fileInput.files = files;
    processFile(files[0]);
}
83127
/**
 * click handler for the dropzone: open the file picker by clicking the
 * hidden file input, unless file selection is currently disabled.
 */
async function handleClick() {
    if (!fileSelectionAllowed) {
        return;
    }
    fileInput.click();
}
87133
88134 fileInput . addEventListener ( 'change' , ( event ) => {
@@ -91,65 +137,91 @@ <h2>Full document</h2>
91137 } ) ;
92138
// Promise for the one-time initialisation of the shared Tesseract worker;
// null until the first file is processed (or after a failed attempt).
let workerReady = null;

/**
 * Load and initialise the shared Tesseract worker for English exactly once.
 *
 * The worker is deliberately never terminated: it is created once for the
 * whole page, so terminating it after a file would leave every later file
 * talking to a dead worker.
 *
 * @returns {Promise<void>} resolves when the worker can accept recognize() calls.
 */
function ensureWorkerReady() {
    if (!workerReady) {
        workerReady = (async () => {
            await worker.load();
            await worker.loadLanguage("eng");
            await worker.initialize("eng");
        })().catch((error) => {
            // Forget the failed attempt so the next file can retry.
            workerReady = null;
            throw error;
        });
    }
    return workerReady;
}

/**
 * OCR a user-selected file and show the results on the page.
 *
 * PDFs are rendered page by page and each page image is OCRed in turn; any
 * other file is treated as a single image. While work is in progress the
 * dropzone shows progress text, carries the 'disabled' class, and file
 * selection is switched off. The dropzone is always restored afterwards,
 * even when processing fails.
 *
 * @param {File} file - file to process; a missing file is ignored.
 */
async function processFile(file) {
    if (!file) {
        return;
    }

    // Clear the output of any previous run.
    fullDocumentTextarea.value = '';
    fullDocumentSection.style.display = 'none';
    imageContainer.innerHTML = '';

    const originalText = dropzone.innerText;
    dropzone.innerText = 'Processing file...';
    dropzone.classList.add('disabled');
    fileSelectionAllowed = false;

    try {
        await ensureWorkerReady();

        if (file.type === 'application/pdf') {
            const { numPages, imageIterator } = await convertPDFToImages(file);
            let done = 0;
            dropzone.innerText = `Processing ${numPages} page${numPages > 1 ? 's' : ''}`;
            for await (const { imageURL } of imageIterator) {
                const ta = await processImage(imageURL);
                const { text } = await ocrImage(imageURL);
                setTextarea(ta, text);
                showFullDocument();
                done += 1;
                dropzone.innerText = `Done ${done} of ${numPages}`;
            }
        } else {
            const imageURL = URL.createObjectURL(file);
            const ta = await processImage(imageURL);
            const { text } = await ocrImage(imageURL);
            setTextarea(ta, text);
            showFullDocument();
        }
    } finally {
        // Restore the dropzone whether processing succeeded or threw.
        dropzone.innerText = originalText;
        dropzone.classList.remove('disabled');
        fileSelectionAllowed = true;
    }
}
114177
115- async function * convertPDFToImages ( file ) {
116- try {
117- const pdf = await pdfjsLib . getDocument ( URL . createObjectURL ( file ) ) . promise ;
118- const numPages = pdf . numPages ;
/**
 * Append an image and an empty result textarea for it to the image container.
 *
 * The image gets alternative text and the textarea gets an accessible name,
 * because the placeholder alone does not label the control.
 *
 * @param {string} imageURL - URL used as the image source.
 * @returns {Promise<HTMLTextAreaElement>} the textarea that will receive the OCR text.
 */
async function processImage(imageURL) {
    const imgElement = document.createElement('img');
    imgElement.src = imageURL;
    imgElement.alt = 'Source image for the OCR text that follows';
    imageContainer.appendChild(imgElement);

    const altTextarea = document.createElement('textarea');
    altTextarea.classList.add('textarea-alt');
    altTextarea.placeholder = 'OCRing image...';
    altTextarea.setAttribute('aria-label', 'OCR text for the preceding image');
    imageContainer.appendChild(altTextarea);

    return altTextarea;
}
190+
/**
 * Open a PDF file and expose its pages as lazily rendered JPEG data URLs.
 *
 * Each page is drawn onto a canvas `desiredWidth` pixels wide, with the
 * height scaled to keep the page's aspect ratio. A page that fails to render
 * is logged and skipped, so the iterator may yield fewer items than numPages.
 *
 * The blob URL created for the file is revoked when opening the PDF fails or
 * once iteration has finished (normally or early). It is not revoked if the
 * returned iterator is never iterated.
 *
 * @param {File} file - the PDF to read.
 * @returns {Promise<{numPages: number, imageIterator: AsyncGenerator<{imageURL: string}>}>}
 */
async function convertPDFToImages(file) {
    const pdfURL = URL.createObjectURL(file);
    let pdf;
    try {
        pdf = await pdfjsLib.getDocument(pdfURL).promise;
    } catch (error) {
        URL.revokeObjectURL(pdfURL);
        throw error;
    }
    const numPages = pdf.numPages;

    async function* images() {
        try {
            for (let i = 1; i <= numPages; i++) {
                try {
                    const page = await pdf.getPage(i);
                    // Measure the page at scale 1 to derive the factor that gives the target width.
                    const baseViewport = page.getViewport({ scale: 1 });
                    const canvas = document.createElement('canvas');
                    const context = canvas.getContext('2d');
                    canvas.width = desiredWidth;
                    canvas.height = (desiredWidth / baseViewport.width) * baseViewport.height;
                    const renderContext = {
                        canvasContext: context,
                        viewport: page.getViewport({ scale: desiredWidth / baseViewport.width }),
                    };
                    await page.render(renderContext).promise;
                    const imageURL = canvas.toDataURL('image/jpeg', 0.8);
                    yield { imageURL };
                } catch (error) {
                    console.error(`Error rendering page ${i}:`, error);
                }
            }
        } finally {
            // Every page has been requested (or iteration was abandoned): release the blob URL.
            URL.revokeObjectURL(pdfURL);
        }
    }

    return { numPages, imageIterator: images() };
}
139218
/**
 * Run OCR on one image with the shared Tesseract worker.
 *
 * @param {string} imageUrl - URL of the image to recognise.
 * @returns {Promise<{text: string}>} the recognised text.
 */
async function ocrImage(imageUrl) {
    const result = await worker.recognize(imageUrl);
    return { text: result.data.text };
}
153225 </ script >
154226</ body >
155- </ html >
227+ </ html >
0 commit comments