1+ <!DOCTYPE html>
2+ < html >
3+ < head >
4+ < title > OCR PDFs and images directly in your browser</ title >
5+ < meta name ="viewport " content ="width=device-width, initial-scale=1.0 ">
6+ < script defer data-domain ="tools.simonwillison.net " src ="https://plausible.io/js/script.js "> </ script >
7+ < script src ="https://cdnjs.cloudflare.com/ajax/libs/pdf.js/2.9.359/pdf.min.js "> </ script >
8+ < script src ="https://cdn.jsdelivr.net/npm/tesseract.js@5/dist/tesseract.min.js "> </ script >
9+ < style >
10+ body {
11+ padding : 1em ;
12+ font-family : helvetica, sans-serif;
13+ line-height : 1.3 ;
14+ }
15+ .dropzone {
16+ box-sizing : border-box;
17+ width : 100% ;
18+ height : 10em ;
19+ border : 2px dashed # ccc ;
20+ display : flex;
21+ justify-content : center;
22+ align-items : center;
23+ font-size : 24px ;
24+ cursor : pointer;
25+ padding : 1em ;
26+ }
27+ .dropzone .disabled {
28+ cursor : not-allowed;
29+ }
30+ .dropzone .drag-over {
31+ background-color : pink;
32+ }
33+ .image-container img {
34+ margin-bottom : 10px ;
35+ max-width : 100% ;
36+ }
37+ textarea {
38+ width : 100% ;
39+ height : 10em ;
40+ margin-bottom : 20px ;
41+ box-sizing : border-box;
42+ }
43+ .full-document-section {
44+ display : none;
45+ margin-bottom : 20px ;
46+ }
47+ </ style >
48+ </ head >
49+ < body >
50+ < h1 > OCR PDFs and images directly in your browser</ h1 >
51+ < p > This tool runs entirely in your browser. No files are uploaded to a server.</ p >
52+ < p > It uses < a href ="https://tesseract.projectnaptha.com/ "> Tesseract.js</ a > for OCR and < a href ="https://mozilla.github.io/pdf.js/ "> PDF.js</ a > to convert PDFs into images.</ p >
53+ < input type ="file " id ="fileInput " accept =".pdf,.jpg,.jpeg,.png,.gif " style ="display: none; " />
54+ < div class ="dropzone " id ="dropzone ">
55+ Drag and drop a PDF, JPG, PNG, or GIF file here or click to select a file
56+ </ div >
57+ < div class ="full-document-section " id ="fullDocumentSection ">
58+ < h2 > Full document</ h2 >
59+ < textarea class ="full-document " id ="fullDocument "> </ textarea >
60+ < h2 > Pages</ h2 >
61+ </ div >
62+ < div class ="image-container "> </ div >
63+
64+ < script >
65+ const desiredWidth = 1000 ;
66+ const dropzone = document . getElementById ( 'dropzone' ) ;
67+ const fileInput = document . getElementById ( 'fileInput' ) ;
68+ const imageContainer = document . querySelector ( '.image-container' ) ;
69+ const fullDocumentTextarea = document . getElementById ( 'fullDocument' ) ;
70+ const fullDocumentSection = document . getElementById ( 'fullDocumentSection' ) ;
71+
72+ let fileSelectionAllowed = true ;
73+
74+ function showFullDocument ( ) {
75+ // Only shows if there are multiple populated textareas
76+ const populatedTextareas = Array . from (
77+ document . querySelectorAll ( '.image-container textarea' )
78+ ) . filter ( ta => ta . value . trim ( ) . length ) ;
79+ if ( populatedTextareas . length > 1 ) {
80+ fullDocumentTextarea . value = populatedTextareas . map ( ta => ta . value . trim ( ) ) . join ( "\n\n" ) ;
81+ fullDocumentSection . style . display = 'block' ;
82+ } else {
83+ fullDocumentTextarea . value = '' ;
84+ fullDocumentSection . style . display = 'none' ;
85+ }
86+ }
87+
88+ function setTextarea ( ta , text ) {
89+ ta . value = text . trim ( ) ;
90+ // Set textarea height to fit content
91+ ta . style . height = 'auto' ;
92+ ta . style . height = ( ta . scrollHeight + 5 ) + 'px' ;
93+ }
94+
95+ dropzone . addEventListener ( 'dragover' , handleDragOver ) ;
96+ dropzone . addEventListener ( 'dragleave' , handleDragLeave ) ;
97+ dropzone . addEventListener ( 'drop' , handleDrop ) ;
98+ dropzone . addEventListener ( 'click' , handleClick ) ;
99+
100+ pdfjsLib . GlobalWorkerOptions . workerSrc = 'https://cdnjs.cloudflare.com/ajax/libs/pdf.js/2.9.359/pdf.worker.min.js' ;
101+
102+ async function handleDragOver ( event ) {
103+ event . preventDefault ( ) ;
104+ if ( fileSelectionAllowed ) {
105+ dropzone . classList . add ( 'drag-over' ) ;
106+ }
107+ }
108+
109+ async function handleDragLeave ( event ) {
110+ event . preventDefault ( ) ;
111+ if ( fileSelectionAllowed ) {
112+ dropzone . classList . remove ( 'drag-over' ) ;
113+ }
114+ }
115+
116+ async function handleDrop ( event ) {
117+ event . preventDefault ( ) ;
118+ if ( fileSelectionAllowed ) {
119+ dropzone . classList . remove ( 'drag-over' ) ;
120+ const file = event . dataTransfer . files [ 0 ] ;
121+ fileInput . files = event . dataTransfer . files ;
122+ processFile ( file ) ;
123+ }
124+ }
125+
126+ async function handleClick ( ) {
127+ if ( fileSelectionAllowed ) {
128+ fileInput . click ( ) ;
129+ }
130+ }
131+
132+ fileInput . addEventListener ( 'change' , ( event ) => {
133+ const file = event . target . files [ 0 ] ;
134+ processFile ( file ) ;
135+ } ) ;
136+
137+ async function processFile ( file ) {
138+ const worker = await Tesseract . createWorker ( ) ;
139+ fullDocumentTextarea . value = '' ;
140+ fullDocumentSection . style . display = 'none' ;
141+ imageContainer . innerHTML = '' ;
142+ const originalText = dropzone . innerText ;
143+ dropzone . innerText = 'Processing file...' ;
144+ dropzone . classList . add ( 'disabled' ) ;
145+ fileSelectionAllowed = false ;
146+
147+ if ( file . type === 'application/pdf' ) {
148+ const { numPages, imageIterator } = await convertPDFToImages ( file ) ;
149+ let done = 0 ;
150+ dropzone . innerText = `Processing ${ numPages } page${ numPages > 1 ? 's' : '' } ` ;
151+ for await ( const { imageURL } of imageIterator ) {
152+ const ta = await displayImage ( imageURL ) ;
153+ const { text } = await ocrImage ( worker , imageURL ) ;
154+ setTextarea ( ta , text ) ;
155+ showFullDocument ( ) ;
156+ done += 1 ;
157+ dropzone . innerText = `Done ${ done } of ${ numPages } ` ;
158+ }
159+ } else {
160+ const imageURL = URL . createObjectURL ( file ) ;
161+ const ta = await displayImage ( imageURL ) ;
162+ const { text } = await ocrImage ( worker , imageURL ) ;
163+ setTextarea ( ta , text ) ;
164+ showFullDocument ( ) ;
165+ }
166+
167+ await worker . terminate ( ) ;
168+ dropzone . innerText = originalText ;
169+ dropzone . classList . remove ( 'disabled' ) ;
170+ fileSelectionAllowed = true ;
171+ }
172+
173+ async function displayImage ( imageURL ) {
174+ const imgElement = document . createElement ( 'img' ) ;
175+ imgElement . src = imageURL ;
176+ imageContainer . appendChild ( imgElement ) ;
177+
178+ const altTextarea = document . createElement ( 'textarea' ) ;
179+ altTextarea . classList . add ( 'textarea-alt' ) ;
180+ altTextarea . placeholder = 'OCRing image...' ;
181+ imageContainer . appendChild ( altTextarea ) ;
182+
183+ return altTextarea ;
184+ }
185+
186+ async function convertPDFToImages ( file ) {
187+ // returns { numPages, imageIterator }
188+ const pdf = await pdfjsLib . getDocument ( URL . createObjectURL ( file ) ) . promise ;
189+ const numPages = pdf . numPages ;
190+ async function * images ( ) {
191+ for ( let i = 1 ; i <= numPages ; i ++ ) {
192+ try {
193+ const page = await pdf . getPage ( i ) ;
194+ const viewport = page . getViewport ( { scale : 1 } ) ;
195+ const canvas = document . createElement ( 'canvas' ) ;
196+ const context = canvas . getContext ( '2d' ) ;
197+ canvas . width = desiredWidth ;
198+ canvas . height = ( desiredWidth / viewport . width ) * viewport . height ;
199+ const renderContext = {
200+ canvasContext : context ,
201+ viewport : page . getViewport ( { scale : desiredWidth / viewport . width } ) ,
202+ } ;
203+ await page . render ( renderContext ) . promise ;
204+ const imageURL = canvas . toDataURL ( 'image/jpeg' , 0.8 ) ;
205+ yield { imageURL } ;
206+ } catch ( error ) {
207+ console . error ( `Error rendering page ${ i } :` , error ) ;
208+ }
209+ }
210+ }
211+ return { numPages : numPages , imageIterator : images ( ) } ;
212+ }
213+
214+ async function ocrImage ( worker , imageUrl ) {
215+ const {
216+ data : { text } ,
217+ } = await worker . recognize ( imageUrl ) ;
218+ return { text } ;
219+ }
220+ </ script >
221+ </ body >
222+ </ html >
0 commit comments