In [1]:
import pycrfsuite
import features
import data_parser 
import pickle as pickle

freq = features.frequencies('train.txt') 
train_sents=data_parser.load('train.txt')
test_sents=data_parser.load('test.txt')

In [2]:
def word2features(sent, i):
    word = sent[i][0]
    category = sent[i][1]
    if(word not in freq): freq[word]=0
    feat = {
        'bias':1,
        'word': word,
        'word.isdigit':features.isdigit(word),
        'category':str(category),
        'freq':float(freq[word]),
        'BOS':'0',
        'EOS':'0',
    }
    if i > 0:
        if i > 1:
            word2 = sent[i-2][0]
            if(word2 not in freq): freq[word2]=0
            category2 = sent[i-2][1]
            
            feat.update({
                '-2:word':word2,
                '-2:word.isdigit':features.isdigit(word2),
                '-2:word.istitle()':word2.istitle(),
                '-2:category':str(category2),
                '-2:freq':float(freq[word2]),
                
            })
        word1 = sent[i-1][0]
        if(word1 not in freq): freq[word1]=0
        category1 = sent[i-1][1]
        feat.update({
            '-1:word':word1,
            '-1:word.isdigit':features.isdigit(word1),
            '-1:word.istitle()':word1.istitle(),
            '-1:category':str(category1),
            '-1:freq':float(freq[word1]),
        })
       
    else:
        feat.update({'BOS':'1'})
        
    if i < len(sent)-1:
        if i < len(sent)-2:
            word2 = sent[i+2][0]
            if(word2 not in freq): freq[word2]=0
            category2 = sent[i+2][1]
            feat.update({
                '+2:word':word2,
                '+2:word.isdigit':features.isdigit(word2),
                '+2:word.istitle()':word2.istitle(),
                '+2:category':str(category2),
                '+2:freq':float(freq[word2]),
            })
        word1 = sent[i+1][0]
        if(word1 not in freq): freq[word1]=0
        category1 = sent[i+1][1]
        feat.update({
            '+1:word':word1,
            '+1:word.isdigit':features.isdigit(word1),
            '+1:word.istitle()':word1.istitle(),
            '+1:category':str(category1),
            '+1:freq':float(freq[word1]),
        })
    else:
        feat.update({'EOS':'1'})
                
    return feat


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for word, category, label in sent]

def sent2tokens(sent):
    return [word for word,postag,label in sent]

In [3]:
sent2features(train_sents[0])[2]

{'bias': 1,
 'word': 'नलियों',
 'word.isdigit': False,
 'category': 'X',
 'freq': 2.0,
 'BOS': '0',
 'EOS': '0',
 '-2:word': 'सूक्ष्म',
 '-2:word.isdigit': False,
 '-2:word.istitle()': False,
 '-2:category': 'AJ',
 '-2:freq': 2.0,
 '-1:word': 'श्वास',
 '-1:word.isdigit': False,
 '-1:word.istitle()': False,
 '-1:category': 'N',
 '-1:freq': 2.0,
 '+2:word': 'कोई',
 '+2:word.isdigit': False,
 '+2:word.istitle()': False,
 '+2:category': 'N',
 '+2:freq': 6.0,
 '+1:word': 'में',
 '+1:word.isdigit': False,
 '+1:word.istitle()': False,
 '+1:category': 'N',
 '+1:freq': 261.0}

In [4]:
X_train = [sent2features(s) for s in train_sents]
y_train = [sent2labels(s) for s in train_sents]

X_test = [sent2features(s) for s in test_sents]
y_test = [sent2labels(s) for s in test_sents]

In [5]:
trainer = pycrfsuite.Trainer(algorithm = 'lbfgs',verbose=False)

for xseq, yseq in zip(X_train, y_train):
    trainer.append(xseq, yseq)

In [6]:
trainer.set_params({
    'c1': 1.0,   # coefficient for L1 penalty
    'c2': 1e-3,  # coefficient for L2 penalty
    'max_iterations': 500,
    'feature.possible_transitions': True
})

In [7]:
trainer.train('hindiNER.crfsuite') #train and save model to file 'hindiNER.crfsuite'

In [8]:
tagger = pycrfsuite.Tagger()
tagger.open('hindiNER.crfsuite')

<contextlib.closing at 0x7f4b57e994e0>

In [9]:
true_positive_C =true_false_positive_C = true_false_negative_C = 0
true_positive_P =true_false_positive_P = true_false_negative_P = 0
true_positive_S =true_false_positive_S = true_false_negative_S = 0
true_positive_D =true_false_positive_D = true_false_negative_D = 0

for i in test_sents:
    example_sent = i#test_sents[0]
    print(' '.join(sent2tokens(example_sent)))
    predicted = ' '.join(tagger.tag(sent2features(example_sent)))
    print("Predicted:", ' '.join(tagger.tag(sent2features(example_sent))))
    correct=' '.join(sent2labels(example_sent))
    print("Correct:  ", ' '.join(sent2labels(example_sent)))
    predicted=predicted.split(' ')
    correct=correct.split(' ')
    for i in range(len(predicted)):
        if predicted[i]==correct[i]:
            if predicted[i] == 'C':
                true_positive_C = true_positive_C + 1
            elif predicted[i] == 'P':
                true_positive_P = true_positive_P + 1
            elif predicted[i] == 'S':
                true_positive_S= true_positive_S + 1
            elif predicted[i] == 'D':
                true_positive_D = true_positive_D + 1
        
        if predicted[i] == 'C':
                true_false_positive_C = true_false_positive_C + 1
        elif predicted[i] == 'P':
                true_false_positive_P = true_false_positive_P + 1
        elif predicted[i] == 'S':
                true_false_positive_S = true_false_positive_S + 1
        elif predicted[i] == 'D':
                true_false_positive_D = true_false_positive_D + 1
        
        
        if correct[i] == 'C':
                true_false_negative_C = true_false_negative_C + 1
        elif correct[i] == 'P':
                true_false_negative_P = true_false_negative_P + 1
        elif correct[i] == 'S':
                true_false_negative_S = true_false_negative_S + 1
        elif correct[i] == 'D':
                true_false_negative_D = true_false_negative_D + 1
        
precision_C = true_positive_C/true_false_positive_C
precision_P = true_positive_P/true_false_positive_P
precision_S = true_positive_S/true_false_positive_S
precision_D = true_positive_D/true_false_positive_D


recall_C = true_positive_C/true_false_negative_C
recall_P = true_positive_P/true_false_negative_P
recall_S = true_positive_S/true_false_negative_S
recall_D = true_positive_D/true_false_negative_D



f1_score_C = 2*(precision_C*recall_C)/(precision_C+recall_C)
f1_score_P = 2*(precision_C*recall_P)/(precision_C+recall_P)
f1_score_S = 2*(precision_C*recall_S)/(precision_C+recall_S)
f1_score_D = 2*(precision_C*recall_D)/(precision_C+recall_D)

print("\tPrecision\tRecall\tF1_score")
print("Consumable:\t"+str(precision_C)+"\t"+str(recall_C)+"\t"+str(f1_score_C))
print("Person:\t"+str(precision_P)+"\t"+str(recall_P)+"\t"+str(f1_score_P))
print("Symptoms:\t"+str(precision_S)+"\t"+str(recall_S)+"\t"+str(f1_score_S))
print("Disease:\t"+str(precision_D)+"\t"+str(recall_D)+"\t"+str(f1_score_D))
print((f1_score_C+f1_score_P+f1_score_S+f1_score_D)/4)

मांसपेशियों में तनाव रहने या किसी चोट की वजह से भी घुटनों का दर्द आपको परेशान कर सकता है
Predicted: 0 0 S 0 0 0 0 0 0 0 0 0 0 S 0 0 0 0 0
Correct:   0 0 D 0 0 0 S 0 0 0 0 0 0 0 0 0 0 0 0
घुटनों को ठंडा सेक देने से यह रक्त वाहिकाओं को कसता है जिससे रक्त प्रवाह कम होता है और सूजन भी घटती है
Predicted: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Correct:   0 0 S 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
पूरी तरह ठीक होने तक रोजाना इस टॉनिक का सेवन करें
Predicted: 0 0 0 0 0 0 0 0 0 0 0
Correct:   0 0 0 0 0 0 0 C 0 0 0
इससे घुटनों के दर्द में काफी राहत मिलेगी एक चम्मच सेब का सिरका और जैतून के तेल को बराबर भागों में मिलाकर घुटनों की मालिश करें फायदा होगा
Predicted: 0 0 0 S 0 0 0 0 0 0 0 0 0 0 0 0 C 0 0 0 0 0 0 0 0 0 0 0
Correct:   0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 C 0 0 0 0 0 0 0 0 0 0 0
एक से डेढ़ कप तेल में दो बड़े चम्मच लाल मिर्च पाउडर डालकर एक पेस्ट तैयार करें
Predicted: 0 0 0 0 C 0 0 0 0 0 C C 0 0 0 0 0
Correct:   0 0 0 0 C 0 0 0 0 0 C C 0 0 0 0 0
घुटनों के दर्द से राहत मिलेगी एक कप सेब के 

In [10]:
#calulating average f1 score
print((f1_score_C+f1_score_P+f1_score_S+f1_score_D)/4)

0.696813725490196


In [11]:
from collections import Counter
info = tagger.info()

def print_transitions(trans_features):
    for (label_from, label_to), weight in trans_features:
        print("%s -> %s %0.6f" % (label_from, label_to, weight))

print("Top likely transitions:")
print_transitions(Counter(info.transitions).most_common(100))

Top likely transitions:
P -> P 1.528770
C -> C 1.420098
D -> 0 0.715140
S -> S 0.577037
C -> 0 0.572466
0 -> 0 0.565511
0 -> C 0.441020
0 -> P 0.337122
0 -> c -0.018747
0 -> D -0.248614
C -> S -0.913393
S -> C -0.938041


In [12]:
def print_state_features(state_features):
    for (attr, label), weight in state_features:
        print("%0.6f %-6s %s" % (weight, label, attr))    

print("Top positive:")
print_state_features(Counter(info.state_features).most_common(20))

print("\nTop negative:")
print_state_features(Counter(info.state_features).most_common()[-20:])

Top positive:
10.411765 C      word:पानी
9.708881 P      word:व्यक्ति
7.940485 P      word:लोग
7.546601 S      word:संक्रमण
7.124682 D      word:डायरिया
7.099334 S      word:जलन
6.951602 D      word:एड्स
6.907771 C      word:लहसुन
6.560577 P      word:रोगी
6.459804 D      word:माइग्रेन
6.447686 S      word:दस्त
6.403939 S      word:थकान
6.399406 S      word:दर्द
6.398224 c      word:प्याज
6.353085 C      word:दही
6.299573 D      word:एनोरेक्सिया
6.297467 C      word:एसिड
6.228638 S      word:मतली
6.211460 C      word:शहद
6.207718 S      word:वायरल

Top negative:
-0.208670 C      +1:word:के
-0.228758 0      -1:word:में
-0.255242 0      +1:word:का
-0.266007 0      -2:word:में
-0.382804 0      +2:word:समस्या
-0.439070 P      -2:category:N
-0.444048 0      +2:word:को
-0.470445 S      BOS:1
-0.482277 D      +1:category:X
-0.488506 0      -1:word:ही
-0.565288 0      +1:word:होते
-0.637153 D      +2:word:दौरान
-0.762342 0      +1:word:रात
-0.784291 0      word:दूध
-0.795871 0      +1:word:मस्