https://github.com/SuperSai9/NER-using-CRF-on-Clinical-Prescription

In [46]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [49]:
import os

# Location of the token-level prescription dataset (one row per token).
# The original absolute path is kept as the default so existing runs behave
# the same; set the NLP_DATASET_CSV environment variable to point elsewhere
# instead of editing the notebook.
DATA_PATH = os.environ.get(
    'NLP_DATASET_CSV',
    '/Users/sdeshpande/Desktop/bioinformatices/NER-using-CRF-on-Clinical-Prescription/NLP_dataset.csv',
)
# Upper bound on the number of token rows kept for the experiment.
MAX_ROWS = 100000

df = pd.read_csv(DATA_PATH, encoding="ISO-8859-1")
df = df[:MAX_ROWS]
# Only the last expression of a cell is displayed, so the former mid-cell
# `df.head()` call (whose result was discarded) has been dropped.
df.shape

(216, 4)

In [50]:
# Per-column count of missing values.
df.isna().sum()

Sentence #    210
Word            0
POS             0
Tag             0
dtype: int64

In [51]:
# Most rows have no 'Sentence #' value (see the null counts above), so carry
# the last seen value forward to label every token row with its sentence.
# `DataFrame.ffill()` replaces the deprecated `fillna(method='ffill')`.
df = df.ffill()
# (number of sentences, distinct words, distinct tags)
df['Sentence #'].nunique(), df.Word.nunique(), df.Tag.nunique()

(6, 80, 5)

In [52]:
# Number of token rows per tag, shown as a two-column frame (Tag, counts).
tag_sizes = df.groupby('Tag').size()
tag_sizes.reset_index(name='counts')

Unnamed: 0,Tag,counts
0,Dose,14
1,Frequency,43
2,Medicine,35
3,O,113
4,Person,11


In [53]:
# Vectorise every non-label column, one dict record per token row.
# NOTE(review): X and y are reassigned by the later sentence-level feature
# cell and `v` is not referenced again in the visible notebook; only
# `classes` is used downstream.
v = DictVectorizer(sparse=False)
token_records = df.drop(columns='Tag').to_dict('records')
X = v.fit_transform(token_records)
y = df.Tag.values
# Sorted list of the distinct tag names.
classes = np.unique(y).tolist()

In [55]:
!pip install sklearn-crfsuite

Collecting sklearn-crfsuite
  Downloading sklearn_crfsuite-0.3.6-py2.py3-none-any.whl (12 kB)
Collecting python-crfsuite>=0.8.3
  Downloading python_crfsuite-0.9.7-cp38-cp38-macosx_10_13_x86_64.whl (186 kB)
[K     |████████████████████████████████| 186 kB 6.9 MB/s eta 0:00:01
[?25hInstalling collected packages: python-crfsuite, sklearn-crfsuite
Successfully installed python-crfsuite-0.9.7 sklearn-crfsuite-0.3.6


In [56]:
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
from collections import Counter

In [57]:
class SentenceGetter(object):
    """Group a token-level DataFrame into sentences of (word, POS, tag) tuples.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'Sentence #', 'Word', 'POS' and 'Tag', with
        one row per token and 'Sentence #' filled in on every row.

    Attributes
    ----------
    grouped : pandas.Series
        Indexed by the 'Sentence #' value; each element is the list of
        (word, POS, tag) tuples of that sentence, in row order.
    sentences : list
        The elements of ``grouped`` as a plain list (groupby key order).
    n_sent : int
        1-based counter of the next sentence returned by ``get_next``.
    """

    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False

        def agg_func(s):
            # Zip the three columns of one sentence into (word, POS, tag) tuples.
            return list(zip(s['Word'].values.tolist(),
                            s['POS'].values.tolist(),
                            s['Tag'].values.tolist()))

        # Select the three needed columns explicitly so `apply` does not
        # operate on the grouping column (deprecated in recent pandas).
        self.grouped = (
            self.data.groupby('Sentence #')[['Word', 'POS', 'Tag']].apply(agg_func)
        )
        self.sentences = [s for s in self.grouped]

    def get_next(self):
        """Return the next sentence, or None once no such sentence exists.

        Sentences are looked up by the key 'Sentence: <n>' with n starting
        at 1; the counter only advances on a successful lookup.
        """
        try:
            s = self.grouped['Sentence: {}'.format(self.n_sent)]
        except KeyError:
            # Only a missing key means "no more sentences"; any other error
            # is a real problem and is allowed to propagate.
            return None
        self.n_sent += 1
        return s
# Group the token rows of `df` by sentence and keep the resulting list of
# per-sentence (word, POS, tag) tuple lists.
getter = SentenceGetter(data=df)
sentences = getter.sentences

In [58]:
def word2features(sent, i):
    """Build the feature dict for the token at position ``i`` of ``sent``.

    ``sent`` is a sequence of items whose first two elements are the word
    and its POS tag. The result holds features of the token itself plus
    features of its left and right neighbours; a missing neighbour is
    signalled with a 'BOS' (no left neighbour) or 'EOS' (no right
    neighbour) flag instead.
    """
    token = sent[i][0]
    pos = sent[i][1]

    # Features describing the token itself.
    feats = {'bias': 1.0}
    feats['word.lower()'] = token.lower()
    feats['word[-3:]'] = token[-3:]
    feats['word[-2:]'] = token[-2:]
    feats['word.isupper()'] = token.isupper()
    feats['word.istitle()'] = token.istitle()
    feats['word.isdigit()'] = token.isdigit()
    feats['postag'] = pos
    feats['postag[:2]'] = pos[:2]

    # Left context: previous token, or a beginning-of-sentence flag.
    if i <= 0:
        feats['BOS'] = True
    else:
        prev_token, prev_pos = sent[i - 1][0], sent[i - 1][1]
        feats['-1:word.lower()'] = prev_token.lower()
        feats['-1:word.istitle()'] = prev_token.istitle()
        feats['-1:word.isupper()'] = prev_token.isupper()
        feats['-1:postag'] = prev_pos
        feats['-1:postag[:2]'] = prev_pos[:2]

    # Right context: next token, or an end-of-sentence flag.
    last_index = len(sent) - 1
    if i >= last_index:
        feats['EOS'] = True
    else:
        next_token, next_pos = sent[i + 1][0], sent[i + 1][1]
        feats['+1:word.lower()'] = next_token.lower()
        feats['+1:word.istitle()'] = next_token.istitle()
        feats['+1:word.isupper()'] = next_token.isupper()
        feats['+1:postag'] = next_pos
        feats['+1:postag[:2]'] = next_pos[:2]

    return feats
def sent2features(sent):
    """Return the list of per-token feature dicts for one sentence."""
    features = []
    for position in range(len(sent)):
        features.append(word2features(sent, position))
    return features
def sent2labels(sent):
    """Return the tag of every (token, postag, label) triple in ``sent``."""
    labels = []
    for _token, _postag, tag in sent:
        labels.append(tag)
    return labels
def sent2tokens(sent):
    """Return the word of every (token, postag, label) triple in ``sent``."""
    words = []
    for word, _postag, _label in sent:
        words.append(word)
    return words

In [59]:
# Sentence-level inputs for the CRF: one list of feature dicts and one list
# of tags per sentence. These replace the token-level X / y built earlier.
X = list(map(sent2features, sentences))
y = list(map(sent2labels, sentences))
# Hold out 20% of the sentences for evaluation; fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

In [60]:
# Hyper-parameters for the CRF, gathered in one place.
# NOTE(review): c1 / c2 are presumably the L1 / L2 regularisation strengths;
# confirm against the sklearn-crfsuite documentation.
crf_params = dict(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True,
)
crf = sklearn_crfsuite.CRF(**crf_params)
# Fit on the training sentences; the fitted estimator is the cell's output.
crf.fit(X_train, y_train)



CRF(algorithm='lbfgs', all_possible_transitions=True, c1=0.1, c2=0.1,
    keep_tempfiles=None, max_iterations=100)

In [62]:
# Tags to report on: every class except the 'O' tag. Filtering, rather than
# copy + remove, does not raise if the data happens to contain no 'O' tag
# and leaves `classes` untouched.
new_classes = [label for label in classes if label != 'O']
new_classes

['Dose', 'Frequency', 'Medicine', 'Person']

In [63]:
y_pred = crf.predict(X_test)

# Per-tag precision / recall / F1, restricted to the tags in new_classes.
report = metrics.flat_classification_report(y_test, y_pred, labels=new_classes)
print(report)

# Token-level accuracy; no label filter is passed here, so it covers every
# tag, including those left out of the report above.
accuracy = metrics.flat_accuracy_score(y_test, y_pred)
print("Accuracy: %f" % accuracy)

              precision    recall  f1-score   support

        Dose       1.00      0.75      0.86         4
   Frequency       0.89      1.00      0.94         8
    Medicine       1.00      0.75      0.86         8
      Person       1.00      1.00      1.00         4

   micro avg       0.95      0.88      0.91        24
   macro avg       0.97      0.88      0.91        24
weighted avg       0.96      0.88      0.91        24

Accuracy: 0.921569




In [66]:
def print_transitions(trans_features):
    """Print one 'from -> to weight' line per transition feature.

    ``trans_features`` is an iterable of ``((label_from, label_to), weight)``
    pairs.
    """
    row_format = "%-6s -> %-7s %0.6f"
    for label_pair, weight in trans_features:
        source_label, target_label = label_pair
        print(row_format % (source_label, target_label, weight))
# Rank all learned label-to-label transition weights once, highest first,
# then show both ends of the ranking.
ranked_transitions = Counter(crf.transition_features_).most_common()
print("Top likely transitions:")
print_transitions(ranked_transitions[:5])
print("\nTop unlikely transitions:")
print_transitions(ranked_transitions[-5:])

Top likely transitions:
Frequency -> O       1.913464
Medicine -> Medicine 1.636850
Medicine -> Dose    1.231886
Person -> Person  1.052644
Dose   -> Frequency 0.836473

Top unlikely transitions:
Person -> Medicine -0.513618
Person -> Frequency -0.726086
Medicine -> Person  -0.798690
O      -> Person  -0.993984
Medicine -> O       -1.538591


In [67]:
def print_state_features(state_features):
    """Print one 'weight label attribute' line per state feature.

    ``state_features`` is an iterable of ``((attr, label), weight)`` pairs.
    """
    row_format = "%0.6f %-8s %s"
    for feature_key, weight in state_features:
        attribute, tag = feature_key
        print(row_format % (weight, tag, attribute))
# Rank all learned (attribute, label) state weights once, highest first,
# then show the ten largest and the ten smallest.
ranked_state_features = Counter(crf.state_features_).most_common()
print("Top positive:")
print_state_features(ranked_state_features[:10])
print("\nTop negative:")
print_state_features(ranked_state_features[-10:])

Top positive:
1.687348 Dose     word[-2:]:mg
1.609308 Frequency word.isdigit()
1.228623 Person   word.istitle()
1.192162 O        +1:postag[:2]:NN
1.145563 Person   BOS
1.143241 O        -1:postag[:2]:NN
1.027509 Medicine postag:NNP
1.024770 Person   postag:NNP
1.004573 O        +1:word.lower():years
1.004077 O        -1:postag:NNS

Top negative:
-0.288096 O        +1:postag:CD
-0.288096 O        +1:postag[:2]:CD
-0.312406 Dose     bias
-0.320021 Frequency +1:postag:IN
-0.320021 Frequency +1:postag[:2]:IN
-0.324014 Frequency +1:word.istitle()
-0.346968 Frequency -1:postag:NNP
-0.420040 O        word.istitle()
-0.571004 Frequency -1:word.istitle()
-0.862610 O        postag:NNP


In [69]:
pip install eli5

Collecting eli5
  Downloading eli5-0.10.1-py2.py3-none-any.whl (105 kB)
[K     |████████████████████████████████| 105 kB 5.2 MB/s eta 0:00:01
Collecting graphviz
  Downloading graphviz-0.14.2-py2.py3-none-any.whl (18 kB)
Installing collected packages: graphviz, eli5
Successfully installed eli5-0.10.1 graphviz-0.14.2
Note: you may need to restart the kernel to use updated packages.


In [70]:
import eli5
# Render the fitted CRF's learned weights: the label-to-label transition
# table followed by the top-weighted features for each tag (see output below).
eli5.show_weights(crf)



From \ To,Dose,Frequency,Medicine,O,Person
Dose,0.0,0.836,0.0,-0.51,0.0
Frequency,0.0,0.0,-0.232,1.913,-0.088
Medicine,1.232,-0.19,1.637,-1.539,-0.799
O,-0.327,0.832,0.058,0.622,-0.994
Person,-0.298,-0.726,-0.514,0.685,1.053

Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3
Weight?,Feature,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4
+1.687,word[-2:]:mg,,,
+0.951,word[-3:]:0mg,,,
+0.907,postag[:2]:CD,,,
+0.907,postag:CD,,,
+0.238,word[-3:]:5mg,,,
+0.238,word.lower():32.5mg,,,
+0.238,-1:word.lower():solution,,,
+0.238,+1:word.lower():by,,,
+0.081,+1:word.lower():3,,,
+0.053,+1:postag:IN,,,

Weight?,Feature
1.687,word[-2:]:mg
0.951,word[-3:]:0mg
0.907,postag[:2]:CD
0.907,postag:CD
0.238,word[-3:]:5mg
0.238,word.lower():32.5mg
0.238,-1:word.lower():solution
0.238,+1:word.lower():by
0.081,+1:word.lower():3
0.053,+1:postag:IN

Weight?,Feature
+1.609,word.isdigit()
+0.758,"+1:postag:,"
+0.758,"+1:postag[:2]:,"
+0.758,"+1:word.lower():,"
+0.689,+1:word.lower():times
+0.647,+1:word.lower():time
+0.647,word[-2:]:1
+0.647,word.lower():1
+0.647,word[-3:]:1
+0.628,+1:word.lower():days

Weight?,Feature
+1.028,postag:NNP
+0.898,postag[:2]:NN
+0.791,-1:postag:NNP
+0.598,"-1:word.lower():,"
+0.598,"-1:postag:,"
+0.598,"-1:postag[:2]:,"
+0.591,-1:word.lower():tablet
+0.481,word[-2:]:et
+0.481,word[-3:]:let
+0.481,word.lower():tablet

Weight?,Feature
+1.192,+1:postag[:2]:NN
+1.143,-1:postag[:2]:NN
+1.005,+1:word.lower():years
+1.004,-1:postag:NNS
+0.827,bias
+0.814,postag:NNS
+0.783,word[-3:]:day
+0.783,word.lower():day
+0.783,+1:word.lower():for
+0.783,word[-2:]:ay

Weight?,Feature
+1.229,word.istitle()
+1.146,BOS
+1.025,postag:NNP
+0.279,+1:postag[:2]:CD
+0.279,+1:postag:CD
+0.189,postag[:2]:NN
+0.137,+1:word.istitle()
+0.078,word.lower():shekhar
+0.078,-1:word.lower():shweta
+0.078,word[-2:]:ar


In [71]:
# Show only the per-tag weights of features whose name starts with
# 'word.is' (word.isupper() / word.istitle() / word.isdigit()).
# The pattern is a raw string so that `\.` reaches the regex engine without
# Python emitting an invalid-escape-sequence warning.
eli5.show_weights(crf, top=10, feature_re=r'^word\.is',
                  horizontal_layout=False, show=['targets'])

Weight?,Feature
-0.099,word.istitle()
-0.221,word.isdigit()

Weight?,Feature
1.609,word.isdigit()
-0.046,word.istitle()

Weight?,Feature
0.362,word.isupper()

Weight?,Feature
-0.092,word.isdigit()
-0.42,word.istitle()

Weight?,Feature
1.229,word.istitle()
-0.031,word.isdigit()
