In [1]:
import glob
import re

In [2]:
text_files = glob.glob("pos_data/data_1/*.txt")

In [3]:
sentences = []
for file in text_files:
    with open(file,'r',encoding="UTF-8-SIG") as f:
        for line in f.readlines():
            sentences.append(line)

In [4]:
len(sentences)

4291

In [5]:
def create_data(sentence):
    tokens = re.split(r"<\w+><\w+>\s?|<\w+>\s?",sentence)[:-1]
    tags = re.findall(r"<\w+><\w+>|<\w+>",sentence)
    return [tokens,tags]

In [6]:
data_set = list(map(create_data,sentences))
len(data_set)

4291

In [7]:
'''
The number of tokens and tags has to be of same length.
So, checking if we have not equal items in the dataset.
If we have unequal item then we take it's index value and store it.

Previously, the tokens and tags were not of same length in some senteces.
So, I needed to change teh regex pattern for getting the tags and tokens.
'''
not_equal = []
for items in data_set:
    if len(items[0]) != len(items[1]):
        not_equal.append(data_set.index(items))

In [8]:
not_equal

[]

In [9]:
'''
Extracting features of the words from a sentence
'''
def extract_features(sentence,index):
    return {
        'word':sentence[index],
        'is_first':index==0,
        'is_last':index ==len(sentence)-1,
        'is_alphanumeric': int(bool((re.match('^(?=.*[0-9]$)(?=.*)',sentence[index])))),
        'prefix-1':sentence[index][0],
        'prefix-2':sentence[index][:2],
        'prefix-3':sentence[index][:3],
        'prefix-3':sentence[index][:4],
        'suffix-1':sentence[index][-1],
        'suffix-2':sentence[index][-2:],
        'suffix-3':sentence[index][-3:],
        'suffix-3':sentence[index][-4:],
        'prev_word':'' if index == 0 else sentence[index-1],
        'next_word':'' if index < len(sentence) else sentence[index+1],
        'has_hyphen': '-' in sentence[index],
        'is_numeric': sentence[index].isdigit(),
  }

In [10]:
'''
This function will use the above extract_features method and give us the feature along with its
respective PoS tag
'''
def transform_to_dataset(tagged_sentences):
    X, y = [], []
    for sentence,tags in tagged_sentences:
        sent_word_features, sent_tags = [],[]
        for index in range(len(sentence)):
            sent_word_features.append(extract_features(sentence,index))
            sent_tags.append(tags[index])
        X.append(sent_word_features)
        y.append(sent_tags)
    return X,y

In [11]:
train_size = int(0.8*len(data_set))
print ("The training size is: ",train_size)
print ("The testing size is: ",len(data_set)-train_size)

train_data = data_set[:train_size]
test_data = data_set[train_size:]

The training size is:  3432
The testing size is:  859


In [12]:
X_train, y_train = transform_to_dataset(train_data)

In [13]:
X_test,y_test = transform_to_dataset(test_data)

In [14]:
'''
Training the tagger
'''

from sklearn_crfsuite import CRF

model = CRF(
    algorithm="lbfgs",
    c1=0.01,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True,
    verbose=True
)

In [15]:
print("Starting to train")
model.fit(X_train,y_train)
print("Finished training")

loading training data to CRFsuite:  11%|█         | 382/3432 [00:00<00:00, 3814.75it/s]

Starting to train


loading training data to CRFsuite: 100%|██████████| 3432/3432 [00:00<00:00, 4257.63it/s]



Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 60841
Seconds required: 0.182

L-BFGS optimization
c1: 0.010000
c2: 0.100000
num_memories: 6
max_iterations: 100
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

Iter 1   time=1.17  loss=313124.60 active=60817 feature_norm=1.00
Iter 2   time=1.22  loss=286093.73 active=60581 feature_norm=15.72
Iter 3   time=0.61  loss=148439.47 active=59328 feature_norm=17.52
Iter 4   time=0.62  loss=116877.51 active=59943 feature_norm=16.44
Iter 5   time=0.60  loss=102131.69 active=60066 feature_norm=16.55
Iter 6   time=0.60  loss=86194.64 active=60255 feature_norm=18.39
Iter 7   time=0.61  loss=63172.98 active=60026 feature_norm=23.71
Iter 8   time=0.59  loss=51281.11 active=59722 feature_norm=30.61
Iter 9   time=0.61  loss=42312.98 active=60332 feature_norm=33.08
I

In [16]:
# predicting
pred_result = model.predict(X_test)

In [17]:
'''
List of the different PoS tags (classes) that we have
'''
model.classes_

['<CD>',
 '<NN>',
 '<POP>',
 '<FB>',
 '<NNP>',
 '<PLE>',
 '<JJ>',
 '<CC>',
 '<VBF>',
 '<YF>',
 '<RBO>',
 '<PKO>',
 '<YM>',
 '<PP>',
 '<VBNE>',
 '<JJM>',
 '<VBI>',
 '<VBKO>',
 '<VBX>',
 '<PLAI>',
 '<DM>',
 '<PPR>',
 '<HRU>',
 '<CS>',
 '<VBO>',
 '<YQ>',
 '<DUM>',
 '<RP>',
 '<CL>',
 '<RBM>',
 '<JJD>',
 '<SYM>',
 '<OD>',
 '<QW>',
 '<UNW>',
 '<HRU><NN>',
 '<NN><NN>',
 '<JJ><JJ>',
 '<VBI><NN>',
 '<PLE><NN>',
 '<YB>',
 '<FW>',
 '<VBKO><VBKO>',
 '<CD><CD>',
 '<YB><VBX>',
 '<PKO><NNP>',
 '<POP><NN>',
 '<YQ><NNP>',
 '<PLAI><NN>',
 '<YM><NNP>',
 '<PP><PP>',
 '<YF><YF>',
 '<YF><VBF>']

In [18]:
len(model.classes_)

53

In [19]:
from sklearn_crfsuite.metrics import flat_classification_report,flat_f1_score,flat_accuracy_score,flat_recall_score,flat_precision_score

print("Accuracy score on Test Data")
print(flat_accuracy_score(y_test,pred_result,))
print("**************")
print("F1 score on Test Data")
print(flat_f1_score(y_test,pred_result,average="weighted",labels=model.classes_))
print("**************")
print("Precision score on Test Data")
print(flat_precision_score(y_test,pred_result,average="weighted",labels=model.classes_))
print("**************")
print("Recall score on Test Data")
print(flat_recall_score(y_test,pred_result,average="weighted",labels=model.classes_))


Accuracy score on Test Data
0.9523727276652705
**************
F1 score on Test Data
0.9518721361828959
**************
Precision score on Test Data
0.9515483971852243
**************
Recall score on Test Data
0.952742980561555


  average, "true nor predicted", 'F-score is', len(true_sum)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [20]:
print("Classification Report")
print(flat_classification_report(y_test,pred_result,labels=model.classes_,digits=3))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Classification Report
              precision    recall  f1-score   support

        <CD>      0.990     0.982     0.986       728
        <NN>      0.929     0.960     0.944      5980
       <POP>      0.983     0.984     0.984      1510
        <FB>      0.944     0.971     0.957        69
       <NNP>      0.902     0.876     0.889      1658
       <PLE>      0.992     0.999     0.995       701
        <JJ>      0.906     0.854     0.879      2014
        <CC>      0.995     0.986     0.990       565
       <VBF>      0.986     0.964     0.975       786
        <YF>      0.999     0.998     0.998       881
       <RBO>      0.845     0.855     0.850       433
       <PKO>      0.995     0.997     0.996      1566
        <YM>      1.000     0.997     0.999       725
        <PP>      0.990     0.993     0.991       286
      <VBNE>      0.960     0.968     0.964       371
       <JJM>      0.904     0.837     0.869       202
       <VBI>      0.961     0.948     0.954       386
     

In [21]:
text = "म आजा भात खान्छु , अनि बल्ल खेल्न जान्छु ।"

tokens = text.split(" ")
features = [extract_features(tokens,index) for index in range(len(tokens))]

result = model.predict_single(features)
# print(result)
for i in range(len(tokens)):
    print(tokens[i],"\t", result[i])

म 	 <PP>
आजा 	 <NN>
भात 	 <NN>
खान्छु 	 <VBF>
, 	 <YM>
अनि 	 <CC>
बल्ल 	 <NN>
खेल्न 	 <VBI>
जान्छु 	 <VBF>
। 	 <YF>


In [22]:
import pickle

file_name = "./pos_model/pos_tag_crf.sav"
pickle.dump(model,open(file_name,'wb'))

In [None]:
# load_model = pickle.load(open(file_name,'rb'))