In [28]:
import nltk
from nltk import word_tokenize, pos_tag

from nltk.corpus import indian
# Corpus reader for NLTK's Indian-languages corpus (same object as nltk.corpus.indian).
X3 = indian
# Sentences from the 'marathi.pos' file; downstream code unpacks each token as a (word, tag) pair.
X3_marathi_sent = X3.tagged_sents('marathi.pos')

In [29]:
# The ten Devanagari digit characters, U+0966 through U+096F, in ascending order.
marathi_numbers = [chr(code_point) for code_point in range(0x0966, 0x0970)]
print("Marathi numbers",marathi_numbers)


def isNumberMarathi(word):
    """Return True when ``word`` is a non-empty string of Devanagari digits only.

    Parameters
    ----------
    word : str
        Token to inspect.

    Returns
    -------
    bool
        True if ``word`` has at least one character and every character is
        in ``marathi_numbers``; False otherwise.
    """
    # An empty token contains no digits. Without this guard the all-characters
    # check below is vacuously true and '' would be reported as a number.
    if not word:
        return False
    return all(ch in marathi_numbers for ch in word)

    
def marathi_features(sentence, index):
    """Build the feature dictionary for the token at ``sentence[index]``.

    Parameters
    ----------
    sentence : sequence of str
        The untagged tokens of one sentence.
    index : int
        Position of the token to describe.

    Returns
    -------
    dict
        The token itself, position flags, prefixes and suffixes of up to
        three characters, the neighbouring tokens ('' at a sentence edge),
        a hyphen flag and a numeric flag.
    """
    word = sentence[index]
    at_start = index == 0
    at_end = index == len(sentence) - 1

    features = {'word': word, 'is_first': at_start, 'is_last': at_end}

    # Slicing never raises on short or empty strings, so an empty token
    # simply yields '' for every prefix and suffix.
    for width in (1, 2, 3):
        features['prefix-%d' % width] = word[:width]
    for width in (1, 2, 3):
        features['suffix-%d' % width] = word[-width:]

    features['prev_word'] = '' if at_start else sentence[index - 1]
    features['next_word'] = '' if at_end else sentence[index + 1]
    features['has_hyphen'] = '-' in word
    # str.isdigit() is tried first; the Marathi-digit helper is only
    # consulted when it returns False.
    features['is_numeric'] = word.isdigit() or isNumberMarathi(word)
    return features

# Sanity check: pretty-print the features extracted for the second token of a sample sentence.
import pprint
pprint.pprint(marathi_features(sentence=['महाराष्ट्र', 'अध्यक्ष', 'यशवंत', 'होते'], index=1))

def untag(tagged_sentence):
    """Strip the tags from a tagged sentence.

    Parameters
    ----------
    tagged_sentence : iterable of (word, tag) pairs

    Returns
    -------
    list
        The words, in their original order.
    """
    words = []
    for token, _tag in tagged_sentence:
        words.append(token)
    return words


# Sequential (unshuffled) split: the first 75% of the sentences are used for
# training and the remainder for evaluation.
cutoff = int(len(X3_marathi_sent) * .75)
training_sentences, test_sentences = X3_marathi_sent[:cutoff], X3_marathi_sent[cutoff:]

# Report the number of sentences in each split, one count per line.
print(len(training_sentences), len(test_sentences), sep='\n')

 
def transform_to_dataset(tagged_sentences):
    """Flatten tagged sentences into per-token feature dicts and labels.

    Parameters
    ----------
    tagged_sentences : iterable of sequences of (word, tag) pairs

    Returns
    -------
    tuple of (list, list)
        ``X`` holds one ``marathi_features`` dict per token and ``y`` holds
        the matching tag, both in corpus order.
    """
    X, y = [], []
    for tagged in tagged_sentences:
        # Strip the tags once per sentence; doing it inside the token loop
        # rebuilt the same word list for every token.
        words = untag(tagged)
        for index, (_word, tag) in enumerate(tagged):
            X.append(marathi_features(words, index))
            y.append(tag)

    return X, y
 
# Per-token feature dicts (X) and their corpus tags (y) for the training split.
X, y = transform_to_dataset(training_sentences)


Marathi numbers ['०', '१', '२', '३', '४', '५', '६', '७', '८', '९']
{'has_hyphen': False,
 'is_first': False,
 'is_last': False,
 'is_numeric': False,
 'next_word': 'यशवंत',
 'prefix-1': 'अ',
 'prefix-2': 'अध',
 'prefix-3': 'अध्',
 'prev_word': 'महाराष्ट्र',
 'suffix-1': 'ष',
 'suffix-2': '्ष',
 'suffix-3': 'क्ष',
 'word': 'अध्यक्ष'}


897
300


In [36]:
from sklearn.naive_bayes  import GaussianNB
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline

# Upper bound on the number of training tokens passed to fit().
size = 10000

# Feature dicts are one-hot encoded by the vectorizer (sparse=False requests a
# dense array) and then classified with Gaussian naive Bayes.
clf = Pipeline(steps=[
    ('vectorizer', DictVectorizer(sparse=False)),
    ('classifier', GaussianNB()),
])
clf.fit(X[:size], y[:size])
print('training OK')

# Score on the held-out sentences: per-token accuracy.
X_test, y_test = transform_to_dataset(test_sentences)
print("Accuracy:", clf.score(X_test, y_test))

training OK
Accuracy: 0.7377425412059253


In [39]:
def pos_tag(sentence):
    """Tag every token of ``sentence`` with the trained ``clf`` pipeline.

    NOTE(review): this definition shadows ``nltk.pos_tag`` imported at the
    top of the notebook. The name is kept so existing calls keep working.

    Parameters
    ----------
    sentence : sequence of str
        Tokens of one sentence (must support ``len`` and indexing).

    Returns
    -------
    zip
        Lazy iterator of ``(token, predicted_tag)`` pairs; empty when
        ``sentence`` is empty.
    """
    print('checking...')
    # Nothing to predict for an empty sentence. Returning early avoids
    # handing the pipeline a zero-row batch, which scikit-learn's input
    # validation is expected to reject.
    if len(sentence) == 0:
        return zip(sentence, [])
    features = [marathi_features(sentence, index) for index in range(len(sentence))]
    return zip(sentence, clf.predict(features))

# Record the interpreter version used for this run.
import platform
print(platform.python_version())

# End-to-end check: tokenize a Marathi sentence and print its (token, tag) pairs.
print([*pos_tag(word_tokenize('मोठ्या देशांमध्ये, या सारख्या गोष्टी लहान असतात.'))])

3.6.4
checking...
[('मोठ्या', 'JJ'), ('देशांमध्ये', 'NN'), (',', 'CC'), ('या', 'DEM'), ('सारख्या', 'JJ'), ('गोष्टी', 'NN'), ('लहान', 'JJ'), ('असतात', 'VM'), ('.', 'SYM')]


In [2]:
import os
import sys

# Local checkout of RDRPOSTagger's pSCRDRtagger package.
# NOTE(review): machine-specific absolute path; adjust for your environment.
rdr_tagger_dir = r'C:\Megatron\Jupyter\RDRPOSTagger-python-3-master\pSCRDRtagger'
sys.path.insert(0, rdr_tagger_dir)

# The recorded failure for this cell was
#   FileNotFoundError: ... './pSCRDRtagger'
# i.e. the import looks up './pSCRDRtagger' relative to the current working
# directory, so adding the folder to sys.path alone is not enough. Changing
# into the tagger directory first lets that relative lookup succeed.
# NOTE(review): this leaves the notebook's working directory inside
# pSCRDRtagger; confirm later cells do not rely on the previous cwd.
os.chdir(rdr_tagger_dir)
import RDRPOSTagger as rpt

FileNotFoundError: [WinError 2] The system cannot find the file specified: './pSCRDRtagger'