55 < meta name ="viewport " content ="width=device-width, initial-scale=1.0 ">
66 < script defer data-domain ="tools.simonwillison.net " src ="https://plausible.io/js/script.js "> </ script >
77 < script src ="https://cdnjs.cloudflare.com/ajax/libs/pdf.js/2.9.359/pdf.min.js "> </ script >
8- < script src ="https://unpkg.com/ tesseract.js@v2.1.0 /dist/tesseract.min.js "> </ script >
8+ < script src ="https://cdn.jsdelivr.net/npm/ tesseract.js@5 /dist/tesseract.min.js "> </ script >
99 < style >
1010 body {
1111 padding : 1em ;
@@ -92,8 +92,6 @@ <h2>Pages</h2>
9292 ta . style . height = ( ta . scrollHeight + 5 ) + 'px' ;
9393 }
9494
95- const worker = Tesseract . createWorker ( ) ;
96-
9795 dropzone . addEventListener ( 'dragover' , handleDragOver ) ;
9896 dropzone . addEventListener ( 'dragleave' , handleDragLeave ) ;
9997 dropzone . addEventListener ( 'drop' , handleDrop ) ;
@@ -137,6 +135,7 @@ <h2>Pages</h2>
137135 } ) ;
138136
139137 async function processFile ( file ) {
138+ const worker = await Tesseract . createWorker ( ) ;
140139 fullDocumentTextarea . value = '' ;
141140 fullDocumentSection . style . display = 'none' ;
142141 imageContainer . innerHTML = '' ;
@@ -145,26 +144,22 @@ <h2>Pages</h2>
145144 dropzone . classList . add ( 'disabled' ) ;
146145 fileSelectionAllowed = false ;
147146
148- await worker . load ( ) ;
149- await worker . loadLanguage ( "eng" ) ;
150- await worker . initialize ( "eng" ) ;
151-
152147 if ( file . type === 'application/pdf' ) {
153148 const { numPages, imageIterator } = await convertPDFToImages ( file ) ;
154149 let done = 0 ;
155150 dropzone . innerText = `Processing ${ numPages } page${ numPages > 1 ? 's' : '' } ` ;
156151 for await ( const { imageURL } of imageIterator ) {
157- const ta = await processImage ( imageURL ) ;
158- const { text } = await ocrImage ( imageURL ) ;
152+ const ta = await displayImage ( imageURL ) ;
153+ const { text } = await ocrImage ( worker , imageURL ) ;
159154 setTextarea ( ta , text ) ;
160155 showFullDocument ( ) ;
161156 done += 1 ;
162157 dropzone . innerText = `Done ${ done } of ${ numPages } ` ;
163158 }
164159 } else {
165160 const imageURL = URL . createObjectURL ( file ) ;
166- const ta = await processImage ( imageURL ) ;
167- const { text } = await ocrImage ( imageURL ) ;
161+ const ta = await displayImage ( imageURL ) ;
162+ const { text } = await ocrImage ( worker , imageURL ) ;
168163 setTextarea ( ta , text ) ;
169164 showFullDocument ( ) ;
170165 }
@@ -175,7 +170,7 @@ <h2>Pages</h2>
175170 fileSelectionAllowed = true ;
176171 }
177172
178- async function processImage ( imageURL ) {
173+ async function displayImage ( imageURL ) {
179174 const imgElement = document . createElement ( 'img' ) ;
180175 imgElement . src = imageURL ;
181176 imageContainer . appendChild ( imgElement ) ;
@@ -216,7 +211,7 @@ <h2>Pages</h2>
216211 return { numPages : numPages , imageIterator : images ( ) } ;
217212 }
218213
219- async function ocrImage ( imageUrl ) {
214+ async function ocrImage ( worker , imageUrl ) {
220215 const {
221216 data : { text } ,
222217 } = await worker . recognize ( imageUrl ) ;
0 commit comments