In [1]:
# https://www.analyticsvidhya.com/blog/2020/01/3-important-nlp-libraries-indian-languages-python/
# https://github.com/anoopkunchukuttan/indic_nlp_library
# https://github.com/anoopkunchukuttan/indic_nlp_resources

In [2]:
INDIC_NLP_LIB_HOME = r"/root/marathi/indic_nlp_library/"
INDIC_NLP_RESOURCES = r"/root/marathi/indic_nlp_library/indic_nlp_resources/"

In [3]:
import sys

sys.path.append(r"{}".format(INDIC_NLP_LIB_HOME))

In [4]:
from indicnlp import common

common.set_resources_path(INDIC_NLP_RESOURCES)

In [5]:
from indicnlp import loader

loader.load()

In [6]:
# Text Normalization

In [7]:
from indicnlp.normalize.indic_normalize import IndicNormalizerFactory

input_text = "\u0958 \u0915\u093c"
remove_nuktas = False
factory = IndicNormalizerFactory()
normalizer = factory.get_normalizer("hi", remove_nuktas=False)
output_text = normalizer.normalize(input_text)

In [8]:
print(input_text)
print()

print("Before normalization")
print(" ".join([hex(ord(c)) for c in input_text]))
print("Length: {}".format(len(input_text)))
print()
print("After normalization")
print(" ".join([hex(ord(c)) for c in output_text]))
print("Length: {}".format(len(output_text)))

क़ क़

Before normalization
0x958 0x20 0x915 0x93c
Length: 4

After normalization
0x915 0x93c 0x20 0x915 0x93c
Length: 5


In [9]:
from indicnlp.tokenize import sentence_tokenize

indic_string = """तो क्या विश्व कप 2019 में मैच का बॉस टॉस है? यानी मैच में हार-जीत में \
टॉस की भूमिका अहम है? आप ऐसा सोच सकते हैं। विश्वकप के अपने-अपने पहले मैच में बुरी तरह हारने वाली एशिया की दो टीमों \
पाकिस्तान और श्रीलंका के कप्तान ने हालांकि अपने हार के पीछे टॉस की दलील तो नहीं दी, लेकिन यह जरूर कहा था कि वह एक अहम टॉस हार गए थे।"""
sentences = sentence_tokenize.sentence_split(indic_string, lang="hi")
for t in sentences:
    print(t)

तो क्या विश्व कप 2019 में मैच का बॉस टॉस है?
यानी मैच में हार-जीत में टॉस की भूमिका अहम है?
आप ऐसा सोच सकते हैं।
विश्वकप के अपने-अपने पहले मैच में बुरी तरह हारने वाली एशिया की दो टीमों पाकिस्तान और श्रीलंका के कप्तान ने हालांकि अपने हार के पीछे टॉस की दलील तो नहीं दी, लेकिन यह जरूर कहा था कि वह एक अहम टॉस हार गए थे।


In [10]:
from indicnlp.tokenize import indic_tokenize

indic_string = "सुनो, कुछ आवाज़ आ रही है। फोन?"

print("Input String: {}".format(indic_string))
print("Tokens: ")
for t in indic_tokenize.trivial_tokenize(indic_string):
    print(t)

Input String: सुनो, कुछ आवाज़ आ रही है। फोन?
Tokens: 
सुनो
,
कुछ
आवाज़
आ
रही
है
।
फोन
?


In [11]:
from indicnlp.tokenize import indic_detokenize  
indic_string='" सुनो , कुछ आवाज़ आ रही है । " , उसने कहा । '

print('Input String: {}'.format(indic_string))
print('Detokenized String: {}'.format(indic_detokenize.trivial_detokenize(indic_string,lang='hi')))


Input String: " सुनो , कुछ आवाज़ आ रही है । " , उसने कहा । 
Detokenized String: "सुनो, कुछ आवाज़ आ रही है।", उसने कहा। 


In [12]:
# script conversion Transliterate and romanization

In [13]:
from indicnlp.transliterate.unicode_transliterate import UnicodeIndicTransliterator

input_text = "राजस्थान"
print(UnicodeIndicTransliterator.transliterate(input_text, "hi", "gu"))

રાજસ્થાન


In [14]:
from indicnlp.transliterate.unicode_transliterate import ItransTransliterator

ItransTransliterator.to_itrans("राजस्थान", "hi")

'raajasthaana'

In [15]:
from indicnlp.transliterate.unicode_transliterate import ItransTransliterator

print(ItransTransliterator.from_itrans("raajasthaana", "hi"))

राजस्थान


In [16]:
x= 'राजस्थान'
for y in x:
    print(y, "{:x}".format(ord(y)))

र 930
ा 93e
ज 91c
स 938
् 94d
थ 925
ा 93e
न 928


In [17]:
from indicnlp.script import  indic_scripts as isc

c='क'
lang='hi'

isc.get_phonetic_feature_vector(c,lang)

array([0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [18]:
sorted(isc.PV_PROP_RANGES.items(),key=lambda x:x[1][0])

[('basic_type', [0, 6]),
 ('vowel_length', [6, 8]),
 ('vowel_strength', [8, 11]),
 ('vowel_status', [11, 13]),
 ('consonant_type', [13, 18]),
 ('articulation_place', [18, 23]),
 ('aspiration', [23, 25]),
 ('voicing', [25, 27]),
 ('nasalization', [27, 29]),
 ('vowel_horizontal', [29, 32]),
 ('vowel_vertical', [32, 36]),
 ('vowel_roundness', [36, 38])]

In [19]:
from indicnlp.langinfo import *

c='क'
lang='hi'

print('Is vowel?:  {}'.format(is_vowel(c,lang)))
print('Is consonant?:  {}'.format(is_consonant(c,lang)))
print('Is velar?:  {}'.format(is_velar(c,lang)))
print('Is palatal?:  {}'.format(is_palatal(c,lang)))
print('Is aspirated?:  {}'.format(is_aspirated(c,lang)))
print('Is unvoiced?:  {}'.format(is_unvoiced(c,lang)))
print('Is nasal?:  {}'.format(is_nasal(c,lang)))

Is vowel?:  False
Is consonant?:  True
Is velar?:  True
Is palatal?:  False
Is aspirated?:  False
Is unvoiced?:  True
Is nasal?:  False


In [20]:
from indicnlp.script import  indic_scripts as isc
from indicnlp.script import  phonetic_sim as psim

c1='क'
c2='ख'
c3='भ'
lang='hi'

print('Similarity between {} and {}'.format(c1,c2))
print(psim.cosine(
    isc.get_phonetic_feature_vector(c1,lang),
    isc.get_phonetic_feature_vector(c2,lang)
    ))

print()

print(u'Similarity between {} and {}'.format(c1,c3))
print(psim.cosine(
    isc.get_phonetic_feature_vector(c1,lang),
    isc.get_phonetic_feature_vector(c3,lang)
    ))


Similarity between क and ख
0.8333319444467593

Similarity between क and भ
0.4999991666680556


In [21]:
from indicnlp.script import  indic_scripts as isc
from indicnlp.script import  phonetic_sim as psim


slang='hi'
tlang='ml'
sim_mat=psim.create_similarity_matrix(psim.cosine,slang,tlang,normalize=False)

c1='क'
c2='ഖ'
print('Similarity between {} and {}'.format(c1,c2))
print(sim_mat[isc.get_offset(c1,slang),isc.get_offset(c2,tlang)])

Similarity between क and ഖ
0.8333319444467593


Some similarity functions like `sim` do not generate values in the range [0,1] and it may be more convenient to have the similarity values in the range [0,1]. This can be achieved by setting the `normalize` paramter to `True`

In [22]:
slang='hi'
tlang='ml'
sim_mat=psim.create_similarity_matrix(psim.sim1,slang,tlang,normalize=True)

c1='क'
c2='ഖ'
print(u'Similarity between {} and {}'.format(c1,c2))
print(sim_mat[isc.get_offset(c1,slang),isc.get_offset(c2,tlang)])

Similarity between क and ഖ
0.06860894001932027


In [23]:
# Lexical Similarity

In [24]:
from indicnlp.script import  indic_scripts as isc
from indicnlp.transliterate.unicode_transliterate import UnicodeIndicTransliterator

lang1_str='पिछले दिनों हम लोगों ने कई उत्सव मनाये. कल, हिन्दुस्तान भर में श्री कृष्ण जन्म-महोत्सव मनाया गया.'
lang2_str='વીતેલા દિવસોમાં આપણે કેટલાય ઉત્સવો ઉજવ્યા. હજી ગઇકાલે જ પૂરા હિંદુસ્તાનમાં શ્રીકૃષ્ણ જન્મોત્સવ ઉજવવામાં આવ્યો.'
lang1='hi'
lang2='gu'

lcsr, len1, len2 = isc.lcsr_indic(lang1_str,lang2_str,lang1,lang2)

print('{} string: {}'.format(lang1, lang1_str))
print('{} string: {}'.format(lang2, UnicodeIndicTransliterator.transliterate(lang2_str,lang2,lang1)))
print('Both strings are shown in Devanagari script using script conversion for readability.')
print('LCSR: {}'.format(lcsr))


hi string: पिछले दिनों हम लोगों ने कई उत्सव मनाये. कल, हिन्दुस्तान भर में श्री कृष्ण जन्म-महोत्सव मनाया गया.
gu string: वीतेला दिवसोमां आपणे केटलाय उत्सवो उजव्या. हजी गइकाले ज पूरा हिंदुस्तानमां श्रीकृष्ण जन्मोत्सव उजववामां आव्यो.
Both strings are shown in Devanagari script using script conversion for readability.
LCSR: 0.5545454545454546


In [25]:
# Orthographic Syllabification

In [26]:
from indicnlp.syllable import  syllabifier

w='जगदीशचंद्र'

syllabifier.orthographic_syllabify(w,lang)

['ज', 'ग', 'दी', 'श', 'च', 'ंद्र']

In [27]:
# Word Segmentation

In [28]:
from indicnlp.morph import unsupervised_morph 
analyzer=unsupervised_morph.UnsupervisedMorphAnalyzer('mr')

In [29]:
indic_string='माणसाने माणसाशी माणसासम वागणे'
analyzer.morph_analyze_document(indic_string.split(' '))

['माणस', 'ाने', 'माणस', 'ाशी', 'माणस', 'ा', 'सम', 'वाग', 'णे']

In [30]:
from indicnlp.transliterate import acronym_transliterator

ack_transliterator=acronym_transliterator.LatinToIndicAcronymTransliterator()
ack_transliterator.transliterate('ICICI',lang='hi')

'आईसीआईसीआई'

In [31]:
# Shata-anuvaadak REST API for Translation

In [32]:
import json
import requests
from urllib.parse import quote

text=quote('Mumbai is the capital of Maharashtra')

url='http://www.cfilt.iitb.ac.in/indicnlpweb/indicnlpws/translate/en/mr/{}/'.format(text)
## Note the forward slash '/' at the end of the URL. It's should be there, but please live with it for now!

print(url)
response = requests.get(url)
response.json()

http://www.cfilt.iitb.ac.in/indicnlpweb/indicnlpws/translate/en/mr/Mumbai%20is%20the%20capital%20of%20Maharashtra/


{'mr': 'राजधानी महाराष्ट्र मुंबई आहे . '}

In [33]:
# BrahmiNet REST API for transliteration

In [34]:
import json
import requests
from urllib.parse import  quote

text=quote('manish joe')
# text=quote('मनिश् जोए')
url='http://www.cfilt.iitb.ac.in/indicnlpweb/indicnlpws/transliterate_bulk/en/hi/{}/statistical'.format(text)
    
print(url)
response = requests.get(url)
response.json()

http://www.cfilt.iitb.ac.in/indicnlpweb/indicnlpws/transliterate_bulk/en/hi/manish%20joe/statistical


JSONDecodeError: Expecting value: line 1 column 1 (char 0)