<h1>OCR PDFs and images directly in your browser</h1>
<p>This tool runs entirely in your browser. No files are uploaded to a server.</p>
<p>It uses <a href="https://tesseract.projectnaptha.com/">Tesseract.js</a> for OCR and <a href="https://mozilla.github.io/pdf.js/">PDF.js</a> to convert PDFs into images.</p>
<p><label>Language: <select id="id_language"><option>ENG</option></select></label></p>
<input type="file" id="fileInput" accept=".pdf,.jpg,.jpeg,.png,.gif" style="display: none;" />
5960 < div class ="dropzone " id ="dropzone ">
6061 Drag and drop a PDF, JPG, PNG, or GIF file here or click to select a file
@@ -74,9 +75,131 @@ <h2>Pages</h2>
// Cached handles to the page elements the script reads from and writes to.
const imageContainer = document.querySelector('.image-container');
const fullDocumentTextarea = document.querySelector('#fullDocument');
const fullDocumentSection = document.querySelector('#fullDocumentSection');
// The language <select>; its current value is passed to Tesseract.createWorker().
const languageSelect = document.querySelector('#id_language');

// Mutable flag, initially true; this section only initializes it.
let fileSelectionAllowed = true;
7981
/**
 * Human-readable display names for OCR languages, keyed by the lowercase
 * language code that is used both as the <option> value in the language
 * picker and as the argument to Tesseract.createWorker().
 *
 * Frozen because it is a shared, read-only lookup table: nothing should
 * add, remove or rename entries at runtime.
 */
const LANGUAGES = Object.freeze({
  afr: 'Afrikaans',
  amh: 'Amharic',
  ara: 'Arabic',
  asm: 'Assamese',
  aze: 'Azerbaijani',
  aze_cyrl: 'Azerbaijani - Cyrillic',
  bel: 'Belarusian',
  ben: 'Bengali',
  bod: 'Tibetan',
  bos: 'Bosnian',
  bul: 'Bulgarian',
  cat: 'Catalan; Valencian',
  ceb: 'Cebuano',
  ces: 'Czech',
  chi_sim: 'Chinese - Simplified',
  chi_tra: 'Chinese - Traditional',
  chr: 'Cherokee',
  cym: 'Welsh',
  dan: 'Danish',
  deu: 'German',
  dzo: 'Dzongkha',
  ell: 'Greek, Modern (1453-)',
  eng: 'English',
  enm: 'English, Middle (1100-1500)',
  epo: 'Esperanto',
  est: 'Estonian',
  eus: 'Basque',
  fas: 'Persian',
  fin: 'Finnish',
  fra: 'French',
  frk: 'German Fraktur',
  frm: 'French, Middle (ca. 1400-1600)',
  gle: 'Irish',
  glg: 'Galician',
  grc: 'Greek, Ancient (-1453)',
  guj: 'Gujarati',
  hat: 'Haitian; Haitian Creole',
  heb: 'Hebrew',
  hin: 'Hindi',
  hrv: 'Croatian',
  hun: 'Hungarian',
  iku: 'Inuktitut',
  ind: 'Indonesian',
  isl: 'Icelandic',
  ita: 'Italian',
  ita_old: 'Italian - Old',
  jav: 'Javanese',
  jpn: 'Japanese',
  kan: 'Kannada',
  kat: 'Georgian',
  kat_old: 'Georgian - Old',
  kaz: 'Kazakh',
  khm: 'Central Khmer',
  kir: 'Kirghiz; Kyrgyz',
  kor: 'Korean',
  kur: 'Kurdish',
  lao: 'Lao',
  lat: 'Latin',
  lav: 'Latvian',
  lit: 'Lithuanian',
  mal: 'Malayalam',
  mar: 'Marathi',
  mkd: 'Macedonian',
  mlt: 'Maltese',
  msa: 'Malay',
  mya: 'Burmese',
  nep: 'Nepali',
  nld: 'Dutch; Flemish',
  nor: 'Norwegian',
  ori: 'Oriya',
  pan: 'Panjabi; Punjabi',
  pol: 'Polish',
  por: 'Portuguese',
  pus: 'Pushto; Pashto',
  ron: 'Romanian; Moldavian; Moldovan',
  rus: 'Russian',
  san: 'Sanskrit',
  sin: 'Sinhala; Sinhalese',
  slk: 'Slovak',
  slv: 'Slovenian',
  spa: 'Spanish; Castilian',
  spa_old: 'Spanish; Castilian - Old',
  sqi: 'Albanian',
  srp: 'Serbian',
  srp_latn: 'Serbian - Latin',
  swa: 'Swahili',
  swe: 'Swedish',
  syr: 'Syriac',
  tam: 'Tamil',
  tel: 'Telugu',
  tgk: 'Tajik',
  tgl: 'Tagalog',
  tha: 'Thai',
  tir: 'Tigrinya',
  tur: 'Turkish',
  uig: 'Uighur; Uyghur',
  ukr: 'Ukrainian',
  urd: 'Urdu',
  uzb: 'Uzbek',
  uzb_cyrl: 'Uzbek - Cyrillic',
  vie: 'Vietnamese',
  yid: 'Yiddish',
});
186+
// Populate the language <select> with every language Tesseract.js exposes.
// Clear out the static placeholder option from the markup first.
languageSelect.innerHTML = '';

// Build the options off-document and attach them in a single insertion.
const languageOptions = document.createDocumentFragment();
for (const code of Object.values(Tesseract.languages)) {
  const option = document.createElement('option');
  option.value = code;
  // Fall back to the raw code so a language without a (non-empty) entry in
  // LANGUAGES still gets a visible label instead of a blank option.
  option.textContent = LANGUAGES[code] || code;
  // English is preselected as the default OCR language.
  if (code === 'eng') {
    option.selected = true;
  }
  languageOptions.appendChild(option);
}
languageSelect.appendChild(languageOptions);
202+
80203function showFullDocument ( ) {
81204 // Only shows if there are multiple populated textareas
82205 const populatedTextareas = Array . from (
@@ -139,7 +262,7 @@ <h2>Pages</h2>
139262} ) ;
140263
141264async function processFile ( file ) {
142- const worker = await Tesseract . createWorker ( ) ;
265+ const worker = await Tesseract . createWorker ( languageSelect . value ) ;
143266 fullDocumentTextarea . value = '' ;
144267 fullDocumentSection . style . display = 'none' ;
145268 imageContainer . innerHTML = '' ;
0 commit comments