-
Notifications
You must be signed in to change notification settings - Fork 0
/
translator_new.py
54 lines (41 loc) · 1.71 KB
/
translator_new.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
from PIL import Image
#import goslate
from langdetect import detect
from googletrans import Translator
#gs = goslate.Goslate()
# -*- coding: utf-8 -*-
#print(detect("इस वॉशिंग मशीन में कुछ खराबी है।"))
#detect("Ein, zwei, drei, vier")
# india.txt contains the output of the customised OCR; its text may be in English, Hindi, Tamil, Urdu, or Bengali.
# Read the OCR output from india.txt, detach punctuation that is glued to the
# end of a word, report the detected language and print an English translation.
import io

# Punctuation the OCR output may leave attached to the preceding word:
# Devanagari danda, comma and full stop.
separators = (u"।", u",", u".")

# Decode as UTF-8 while reading. io.open accepts `encoding` on both Python 2
# and Python 3 (a plain str has no .decode() on Python 3), and the `with`
# block guarantees the file handle is closed.
with io.open("india.txt", encoding="utf-8") as ocr_file:
    text = ocr_file.read()

# Break the text on whitespace, yielding a list of words.
words = text.split()

# Turn a trailing separator into a token of its own, e.g. "word," -> "word", ",".
# A word that consists only of a separator is kept as it is.
tokens = []
for word in words:
    if len(word) > 1 and word[-1] in separators:
        tokens.append(word[:-1])
        tokens.append(word[-1])
    else:
        tokens.append(word)

# Every token, including the last one, is followed by a single space.
output = "".join(token + u" " for token in tokens)

# Language detection and translation need some text to work on; stop with a
# clear message instead of failing inside the libraries.
if not tokens:
    raise SystemExit("india.txt contains no text to translate")

print("Input from OCR")
print(output)
# detect() returns an ISO 639-1 language code, e.g. 'hi' for Hindi,
# 'bn' for Bengali, 'ur' for Urdu, 'ta' for Tamil, 'en' for English.
print("language is in " + detect(output))

# The source language is left for googletrans to detect automatically; the
# destination is always English (English input therefore needs no real
# translation).
translator = Translator()
translated = translator.translate(output, dest='en')
print("Translation of Input")
print(translated.text)
#translator('ur', 'en', output)
#print(gs.translate(output, 'hi'))