In [5]:
from nltk.tag import tnt
from nltk.corpus import indian
import nltk
nltk.download('indian')
from nltk.tree import Tree

[nltk_data] Downloading package indian to
[nltk_data]     C:\Users\dell\AppData\Roaming\nltk_data...
[nltk_data]   Package indian is already up-to-date!


In [18]:
def hindi_model():
    train_data = indian.tagged_sents('hindi.pos')
    tnt_pos_tagger = tnt.TnT()
    tnt_pos_tagger.train(train_data)
    return tnt_pos_tagger

In [19]:
def get_keywords(pos):
    grammar = r"""NP:{<NN.*>}"""
    chunkParser = nltk.RegexpParser(grammar)
    chunked = chunkParser.parse(pos)
    continuous_chunk = set()
    current_chunk = []
    for i in chunked:
        if type(i) == Tree:
            current_chunk.append(" ".join([token for token, pos in i.leaves()]))
        elif current_chunk:
            named_entity = " ".join(current_chunk)
            if named_entity not in continuous_chunk:
                continuous_chunk.add(named_entity)
                current_chunk = []
            else:
                continue
    return (continuous_chunk)

In [48]:
text1="पाकिस्तान की पूर्व प्रधानमंत्री बेनजीर भुट्टो पर लगे भ्रष्टाचार के आरोप । पाकिस्तान की हाईकोर्ट के द्वारा बेनजीर की सुनवाई स्थगित ।"
text2="भारत के प्रधानमंत्री नरेंद्र मोदी हैं ।"
text3="साईंराज और समीर भाई है ।"

In [49]:
model = hindi_model()
new_tagged = (model.tag(nltk.word_tokenize(text1)))
print(new_tagged)
print()
print("====KEYWORDS===")
print(get_keywords(new_tagged))


[('पाकिस्तान', 'NNP'), ('की', 'PREP'), ('पूर्व', 'JJ'), ('प्रधानमंत्री', 'NN'), ('बेनजीर', 'NNPC'), ('भुट्टो', 'NNP'), ('पर', 'PREP'), ('लगे', 'VFM'), ('भ्रष्टाचार', 'NN'), ('के', 'PREP'), ('आरोप', 'NVB'), ('।', 'PUNC'), ('पाकिस्तान', 'NNP'), ('की', 'PREP'), ('हाईकोर्ट', 'NNPC'), ('के', 'PREP'), ('द्वारा', 'PREP'), ('बेनजीर', 'NNP'), ('की', 'PREP'), ('सुनवाई', 'NN'), ('स्थगित', 'JVB'), ('।', 'PUNC')]

====KEYWORDS===
{'प्रधानमंत्री बेनजीर भुट्टो', 'पाकिस्तान हाईकोर्ट', 'भ्रष्टाचार', 'पाकिस्तान', 'सुनवाई', 'बेनजीर'}


In [50]:
new_tagged = (model.tag(nltk.word_tokenize(text2)))
print(new_tagged)
print()
print("====KEYWORDS===")
print(get_keywords(new_tagged))


[('भारत', 'NNP'), ('के', 'PREP'), ('प्रधानमंत्री', 'NNC'), ('नरेंद्र', 'NNPC'), ('मोदी', 'NNP'), ('हैं', 'VFM'), ('।', 'PUNC')]

====KEYWORDS===
{'प्रधानमंत्री नरेंद्र मोदी', 'भारत'}


In [51]:
new_tagged = (model.tag(nltk.word_tokenize(text3)))
print(new_tagged)
print()
print("====KEYWORDS===")
print(get_keywords(new_tagged))


[('साईंराज', 'NNPC'), ('और', 'CC'), ('समीर', 'NNPC'), ('भाई', 'NN'), ('है', 'VFM'), ('।', 'PUNC')]

====KEYWORDS===
{'समीर भाई', 'साईंराज'}


In [52]:
from collections import defaultdict

def build_conditional_probabilities(corpus):
 

    tokenized_string = corpus.split()
    previous_word = ""
    dictionnary = defaultdict(list)

    for current_word in tokenized_string:
        if previous_word != "":
            dictionnary[previous_word].append(current_word)
        previous_word = current_word
    


    for key in dictionnary.keys():
        next_words = dictionnary[key]
        unique_words = set(next_words) # removes duplicated
        nb_words = len(next_words)
        probabilities_given_key = {}
        for unique_word in unique_words:
            probabilities_given_key[unique_word] = \
                float(next_words.count(unique_word)) / nb_words
        dictionnary[key] = probabilities_given_key

    return dictionnary


def bigram_next_word_predictor(conditional_probabilities, current, next_candidate):


    if current in conditional_probabilities:
        if next_candidate in conditional_probabilities[current]:
            return conditional_probabilities[current][next_candidate]
    return 0.0

# call the conditional probability dictionnary builder function
conditional_probabilities = build_conditional_probabilities(text1)


In [53]:
bigram_next_word_predictor(conditional_probabilities, "पाकिस्तान","की")

1.0

In [54]:
bigram_next_word_predictor(conditional_probabilities, "की", "पूर्व")

0.3333333333333333

In [55]:
bigram_next_word_predictor(conditional_probabilities, "के", "आरोप")

0.5