In [1]:
import re
import numpy
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
import _pickle as cPickle
import nltk

In [2]:
filename = [
            'Dataset/Corpus/00ne_pos.txt',
            'Dataset/Corpus/01ne_pos.txt',
            'Dataset/Corpus/02ne_pos.txt',
            ]
data=[]
for f in filename:
    file = open(f, "r")
    for line in file:
        data.append(line)


In [3]:
a = [tuple(e.split())for e in data]


In [4]:
tagged_sentences=[]
for i in range(len(a)):
    d=[]
    for e in a[i]:
        pos = re.findall('(?<=\<).*?(?=\>)', e)
        for p in pos:
            word = (e.replace(f'<{p}>',''))
        if len(word)>0 and len(pos)>0:
            d.append((word,' '.join(pos)))
    tagged_sentences.append(d)


In [5]:
def untag(tagged_sentence):
    return [w for w, t in tagged_sentence]


In [6]:
def features(sentence, index):
    return {
        'word': sentence[index],
        'is_first': index == 0,
        'is_last': index == len(sentence) - 1,
        'prev_word': '' if index == 0 else sentence[index - 1],
        'next_word': '' if index == len(sentence) - 1 else sentence[index + 1],
    }


In [7]:
def transform_to_dataset(tagged_sentences):
    X, y = [], []
 
    for tagged in tagged_sentences:
        for index in range(len(tagged)):
            X.append(features(untag(tagged), index))
            y.append(tagged[index][1])
 
    return X, y
 
X, y = transform_to_dataset(tagged_sentences)


In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)


In [9]:
clf = Pipeline([
    ('vectorizer', DictVectorizer()),
    ('classifier', RandomForestClassifier(n_estimators = 20, random_state = 43))
    ])
 
clf.fit(X_train,y_train)
print("Accuracy:", clf.score(X_test, y_test))


Accuracy: 0.7736738174311103


In [10]:
pickle_out = open("RandomForest.pickle","wb")
cPickle.dump(clf, pickle_out)
pickle_out.close()


In [11]:
pickle_out = open("RandomForest.pickle","rb")
clf1=cPickle.load(pickle_out)
pickle_out.close()


In [16]:
print("Accuracy:", clf1.score(X_test, y_test))


Accuracy: 0.7736738174311103


In [18]:
# Predictions
y_pred = clf1.predict(X_test)
# F1-Score
f1 = f1_score(y_test, y_pred, average='weighted')  # Use 'weighted' to handle class imbalances
print("F1-Score:", f1)

F1-Score: 0.7675818461258838


In [19]:
def tagger(sentence):
    tags = clf1.predict([features(sentence, index) for index in range(len(sentence))])
    return zip(sentence, tags)


In [20]:
a = tagger(nltk.word_tokenize('६१ वर्षीय पियरे भिन्केन नोभेम्बर २९ बाट सल्लाहकार को रूप मा सञ्चालक समिति मा आउनुहुनेछ ।'))


In [21]:
for e in a :
    print(e)


('६१', 'CD')
('वर्षीय', 'JJ')
('पियरे', 'NNP')
('भिन्केन', 'NNP')
('नोभेम्बर', 'NNP')
('२९', 'CD')
('बाट', 'POP')
('सल्लाहकार', 'CD')
('को', 'PKO')
('रूप', 'CD')
('मा', 'POP')
('सञ्चालक', 'NN')
('समिति', 'CD')
('मा', 'POP')
('आउनुहुनेछ', 'VBF')
('।', 'YF')
