Skip to content

Commit 8872d19

Browse files
committed
Language support for OCR, closes #4
1 parent 711cb7c commit 8872d19

File tree

1 file changed

+124
-1
lines changed

1 file changed

+124
-1
lines changed

ocr.html

+124-1
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@
5555
<h1>OCR PDFs and images directly in your browser</h1>
5656
<p>This tool runs entirely in your browser. No files are uploaded to a server.</p>
5757
<p>It uses <a href="https://tesseract.projectnaptha.com/">Tesseract.js</a> for OCR and <a href="https://mozilla.github.io/pdf.js/">PDF.js</a> to convert PDFs into images.</p>
58+
<p><label>Language: <select id="id_language"><option>ENG</option></select></label></p>
5859
<input type="file" id="fileInput" accept=".pdf,.jpg,.jpeg,.png,.gif" style="display: none;" />
5960
<div class="dropzone" id="dropzone">
6061
Drag and drop a PDF, JPG, PNG, or GIF file here or click to select a file
@@ -74,9 +75,131 @@ <h2>Pages</h2>
7475
const imageContainer = document.querySelector('.image-container');
7576
const fullDocumentTextarea = document.getElementById('fullDocument');
7677
const fullDocumentSection = document.getElementById('fullDocumentSection');
78+
const languageSelect = document.getElementById('id_language');
7779

7880
let fileSelectionAllowed = true;
7981

82+
/**
 * Human-readable display names keyed by language code.
 *
 * The keys are the codes looked up when building the language <select>
 * (one lookup per value of `Tesseract.languages`); the values are used as
 * the visible option labels. Frozen because it is a shared, read-only
 * lookup table.
 */
const LANGUAGES = Object.freeze({
  afr: 'Afrikaans',
  amh: 'Amharic',
  ara: 'Arabic',
  asm: 'Assamese',
  aze: 'Azerbaijani',
  aze_cyrl: 'Azerbaijani - Cyrillic',
  bel: 'Belarusian',
  ben: 'Bengali',
  bod: 'Tibetan',
  bos: 'Bosnian',
  bul: 'Bulgarian',
  cat: 'Catalan; Valencian',
  ceb: 'Cebuano',
  ces: 'Czech',
  chi_sim: 'Chinese - Simplified',
  chi_tra: 'Chinese - Traditional',
  chr: 'Cherokee',
  cym: 'Welsh',
  dan: 'Danish',
  deu: 'German',
  dzo: 'Dzongkha',
  ell: 'Greek, Modern (1453-)',
  eng: 'English',
  enm: 'English, Middle (1100-1500)',
  epo: 'Esperanto',
  est: 'Estonian',
  eus: 'Basque',
  fas: 'Persian',
  fin: 'Finnish',
  fra: 'French',
  frk: 'German Fraktur',
  frm: 'French, Middle (ca. 1400-1600)',
  gle: 'Irish',
  glg: 'Galician',
  grc: 'Greek, Ancient (-1453)',
  guj: 'Gujarati',
  hat: 'Haitian; Haitian Creole',
  heb: 'Hebrew',
  hin: 'Hindi',
  hrv: 'Croatian',
  hun: 'Hungarian',
  iku: 'Inuktitut',
  ind: 'Indonesian',
  isl: 'Icelandic',
  ita: 'Italian',
  ita_old: 'Italian - Old',
  jav: 'Javanese',
  jpn: 'Japanese',
  kan: 'Kannada',
  kat: 'Georgian',
  kat_old: 'Georgian - Old',
  kaz: 'Kazakh',
  khm: 'Central Khmer',
  kir: 'Kirghiz; Kyrgyz',
  kor: 'Korean',
  kur: 'Kurdish',
  lao: 'Lao',
  lat: 'Latin',
  lav: 'Latvian',
  lit: 'Lithuanian',
  mal: 'Malayalam',
  mar: 'Marathi',
  mkd: 'Macedonian',
  mlt: 'Maltese',
  msa: 'Malay',
  mya: 'Burmese',
  nep: 'Nepali',
  nld: 'Dutch; Flemish',
  nor: 'Norwegian',
  ori: 'Oriya',
  pan: 'Panjabi; Punjabi',
  pol: 'Polish',
  por: 'Portuguese',
  pus: 'Pushto; Pashto',
  ron: 'Romanian; Moldavian; Moldovan',
  rus: 'Russian',
  san: 'Sanskrit',
  sin: 'Sinhala; Sinhalese',
  slk: 'Slovak',
  slv: 'Slovenian',
  spa: 'Spanish; Castilian',
  spa_old: 'Spanish; Castilian - Old',
  sqi: 'Albanian',
  srp: 'Serbian',
  srp_latn: 'Serbian - Latin',
  swa: 'Swahili',
  swe: 'Swedish',
  syr: 'Syriac',
  tam: 'Tamil',
  tel: 'Telugu',
  tgk: 'Tajik',
  tgl: 'Tagalog',
  tha: 'Thai',
  tir: 'Tigrinya',
  tur: 'Turkish',
  uig: 'Uighur; Uyghur',
  ukr: 'Ukrainian',
  urd: 'Urdu',
  uzb: 'Uzbek',
  uzb_cyrl: 'Uzbek - Cyrillic',
  vie: 'Vietnamese',
  yid: 'Yiddish',
});
186+
187+
// Populate the languages select box.
// First remove whatever children the <select> currently has, so only the
// generated options remain.
while (languageSelect.firstChild) {
  languageSelect.removeChild(languageSelect.firstChild);
}

// Add one <option> per language code exposed by Tesseract.languages.
for (const code of Object.values(Tesseract.languages)) {
  const option = document.createElement('option');
  option.value = code;
  // Fall back to the raw code when LANGUAGES has no display name for it;
  // otherwise the option would be rendered with an empty label.
  option.textContent = LANGUAGES[code] || code;
  // 'eng' is the pre-selected default.
  if (code === 'eng') {
    option.selected = true;
  }
  languageSelect.appendChild(option);
}
202+
80203
function showFullDocument() {
81204
// Only shows if there are multiple populated textareas
82205
const populatedTextareas = Array.from(
@@ -139,7 +262,7 @@ <h2>Pages</h2>
139262
});
140263

141264
async function processFile(file) {
142-
const worker = await Tesseract.createWorker();
265+
const worker = await Tesseract.createWorker(languageSelect.value);
143266
fullDocumentTextarea.value = '';
144267
fullDocumentSection.style.display = 'none';
145268
imageContainer.innerHTML = '';

0 commit comments

Comments
 (0)