In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import Perceptron
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

In [6]:
df = pd.read_csv('train_dataset_BOI_updated.csv')

In [7]:
df.head(100)

Unnamed: 0,Sentence #,Pattern,Word,POS,Tag
0,Sentence 0,"medicare does not cover most dental care, dent...",medicare,PROPN,O
1,,,does,AUX,O
2,,,not,PART,O
3,,,cover,VERB,O
4,,,most,ADJ,O
...,...,...,...,...,...
95,,,as,SCONJ,O
96,,,non,ADJ,B_INCLUSION
97,,,routine,ADJ,I_INCLUSION
98,,,services,NOUN,I_INCLUSION


In [8]:
df.isnull().sum()

Sentence #    654
Pattern       654
Word            0
POS             0
Tag             0
dtype: int64

In [9]:
df = df.fillna(method='ffill')

In [10]:
df['Sentence #'].nunique(), df.Word.nunique(), df.Tag.nunique()

(22, 252, 5)

In [11]:
df.groupby('Tag').size().reset_index(name='counts')

Unnamed: 0,Tag,counts
0,B_EXCLUSION,20
1,B_INCLUSION,28
2,I_EXCLUSION,88
3,I_INCLUSION,80
4,O,460


In [12]:
X = df.drop('Tag', axis=1)
X.head()

Unnamed: 0,Sentence #,Pattern,Word,POS
0,Sentence 0,"medicare does not cover most dental care, dent...",medicare,PROPN
1,Sentence 0,"medicare does not cover most dental care, dent...",does,AUX
2,Sentence 0,"medicare does not cover most dental care, dent...",not,PART
3,Sentence 0,"medicare does not cover most dental care, dent...",cover,VERB
4,Sentence 0,"medicare does not cover most dental care, dent...",most,ADJ


In [13]:
X.columns

Index(['Sentence #', 'Pattern', 'Word', 'POS'], dtype='object')

In [14]:
v = DictVectorizer(sparse=False)
X = v.fit_transform(X.to_dict('records'))
X.shape

(676, 311)

In [15]:
y = df.Tag.values

In [16]:
classes = np.unique(y)

In [17]:
classes = classes.tolist()
classes

['B_EXCLUSION', 'B_INCLUSION', 'I_EXCLUSION', 'I_INCLUSION', 'O']

In [18]:
X.shape, y.shape

((676, 311), (676,))

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state=0)

In [20]:
X_train.shape, y_train.shape

((452, 311), (452,))

In [21]:
new_classes = classes.copy()
new_classes.pop()
new_classes

['B_EXCLUSION', 'B_INCLUSION', 'I_EXCLUSION', 'I_INCLUSION']

In [22]:
per = Perceptron(verbose=10, n_jobs=-1, max_iter=5)
per.partial_fit(X_train, y_train, classes)

-- Epoch 1
-- Epoch 1
Norm: 7.87, NNZs: 47, Bias: -3.000000, T: 452, Avg. loss: 0.103982
Total training time: 0.00 seconds.
-- Epoch 1
Norm: 7.75, NNZs: 54, Bias: -3.000000, T: 452, Avg. loss: 0.139381
Total training time: 0.00 seconds.
-- Epoch 1
-- Epoch 1
Norm: 11.75, NNZs: 98, Bias: -3.000000, T: 452, Avg. loss: 0.362832Norm: 12.49, NNZs: 97, Bias: -3.000000, T: 452, Avg. loss: 0.331858Norm: 15.43, NNZs: 135, Bias: 2.000000, T: 452, Avg. loss: 0.672566
Total training time: 0.00 seconds.

Total training time: 0.01 seconds.

Total training time: 0.01 seconds.


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.0s finished


Perceptron(max_iter=5, n_jobs=-1, verbose=10)

In [23]:
print(classification_report(y_pred=per.predict(X_test), y_true=y_test, labels=new_classes))

              precision    recall  f1-score   support

 B_EXCLUSION       0.00      0.00      0.00         5
 B_INCLUSION       0.50      0.22      0.31         9
 I_EXCLUSION       0.61      0.39      0.48        28
 I_INCLUSION       0.42      0.68      0.52        25

   micro avg       0.44      0.45      0.44        67
   macro avg       0.38      0.32      0.33        67
weighted avg       0.48      0.45      0.44        67



In [24]:
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

In [25]:
class SentenceGetter(object):
    
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, p, t) for w, p, t in zip(s['Word'].values.tolist(), 
                                                           s['POS'].values.tolist(), 
                                                           s['Tag'].values.tolist())]
        self.grouped = self.data.groupby('Sentence #').apply(agg_func)
        self.sentences = [s for s in self.grouped]
        
    def get_next(self):
        try: 
            s = self.grouped['Sentence: {}'.format(self.n_sent)]
            self.n_sent += 1
            return s 
        except:
            return None

In [26]:
getter = SentenceGetter(df)

In [27]:
sent = getter.get_next()
print(sent)

None


In [28]:
sentences = getter.sentences

In [29]:
def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]
    
    features = {
        'bias': 1.0, 
        'word.lower()': word.lower(), 
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],
    }
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True
    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True

    return features

def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, postag, label in sent]

def sent2tokens(sent):
    return [token for token, postag, label in sent]

In [30]:

X = [sent2features(s) for s in sentences]
y = [sent2labels(s) for s in sentences]

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

In [32]:

crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)



CRF(algorithm='lbfgs', all_possible_transitions=True, c1=0.1, c2=0.1,
    keep_tempfiles=None, max_iterations=100)

In [33]:
y_pred = crf.predict(X_test)
metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=new_classes)

0.22775082510432226

In [34]:
print(metrics.flat_classification_report(y_test, y_pred, labels = new_classes))

              precision    recall  f1-score   support

 B_EXCLUSION       0.00      0.00      0.00         4
 B_INCLUSION       0.25      0.40      0.31         5
 I_EXCLUSION       0.21      0.15      0.17        27
 I_INCLUSION       0.30      0.70      0.42        10

   micro avg       0.24      0.28      0.26        46
   macro avg       0.19      0.31      0.23        46
weighted avg       0.22      0.28      0.23        46





In [38]:

import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True
)
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted', labels=new_classes)

# search
rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=50,
                        scoring=f1_scorer)
rs.fit(X_train, y_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    4.5s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:   10.3s finished


RandomizedSearchCV(cv=3,
                   estimator=CRF(algorithm='lbfgs',
                                 all_possible_transitions=True,
                                 keep_tempfiles=None, max_iterations=100),
                   n_iter=50, n_jobs=-1,
                   param_distributions={'c1': <scipy.stats._distn_infrastructure.rv_frozen object at 0x00000273B40B2390>,
                                        'c2': <scipy.stats._distn_infrastructure.rv_frozen object at 0x00000273B40B2630>},
                   scoring=make_scorer(flat_f1_score, average=weighted, labels=['B_EXCLUSION', 'B_INCLUSION', 'I_EXCLUSION', 'I_INCLUSION']),
                   verbose=1)

In [39]:
print('best params:', rs.best_params_)
print('best CV score:', rs.best_score_)
print('model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))

best params: {'c1': 1.0583654901042938, 'c2': 0.009960380485977551}
best CV score: 0.3798732527198914
model size: 0.01M


In [40]:
crf = rs.best_estimator_
y_pred = crf.predict(X_test)
print(metrics.flat_classification_report(y_test, y_pred, labels=new_classes))

              precision    recall  f1-score   support

 B_EXCLUSION       0.00      0.00      0.00         4
 B_INCLUSION       0.33      0.20      0.25         5
 I_EXCLUSION       0.36      0.15      0.21        27
 I_INCLUSION       0.30      0.30      0.30        10

   micro avg       0.30      0.17      0.22        46
   macro avg       0.25      0.16      0.19        46
weighted avg       0.31      0.17      0.22        46



In [41]:
from collections import Counter

def print_transitions(trans_features):
    for (label_from, label_to), weight in trans_features:
        print("%-6s -> %-7s %0.6f" % (label_from, label_to, weight))

print("Top likely transitions:")
print_transitions(Counter(crf.transition_features_).most_common(20))

print("\nTop unlikely transitions:")
print_transitions(Counter(crf.transition_features_).most_common()[-20:])

Top likely transitions:
B_INCLUSION -> I_INCLUSION 4.936456
B_EXCLUSION -> I_EXCLUSION 4.047024
I_EXCLUSION -> I_EXCLUSION 3.696099
O      -> O       3.246837
I_INCLUSION -> I_INCLUSION 3.218298
O      -> B_INCLUSION 0.268701
I_EXCLUSION -> O       0.071171
O      -> B_EXCLUSION 0.005227
I_EXCLUSION -> B_EXCLUSION -0.055687
I_INCLUSION -> I_EXCLUSION -0.535248
I_EXCLUSION -> B_INCLUSION -1.054146
O      -> I_INCLUSION -1.085796
I_INCLUSION -> B_EXCLUSION -1.111218
O      -> I_EXCLUSION -1.345747

Top unlikely transitions:
B_INCLUSION -> I_INCLUSION 4.936456
B_EXCLUSION -> I_EXCLUSION 4.047024
I_EXCLUSION -> I_EXCLUSION 3.696099
O      -> O       3.246837
I_INCLUSION -> I_INCLUSION 3.218298
O      -> B_INCLUSION 0.268701
I_EXCLUSION -> O       0.071171
O      -> B_EXCLUSION 0.005227
I_EXCLUSION -> B_EXCLUSION -0.055687
I_INCLUSION -> I_EXCLUSION -0.535248
I_EXCLUSION -> B_INCLUSION -1.054146
O      -> I_INCLUSION -1.085796
I_INCLUSION -> B_EXCLUSION -1.111218
O      -> I_EXCLUSION -1.34

In [42]:
def print_state_features(state_features):
    for (attr, label), weight in state_features:
        print("%0.6f %-8s %s" % (weight, label, attr))

print("Top positive:")
print_state_features(Counter(crf.state_features_).most_common(30))

print("\nTop negative:")
print_state_features(Counter(crf.state_features_).most_common()[-30:])

Top positive:
1.764252 B_EXCLUSION -1:word.lower():or
1.386426 O        bias
1.267688 I_EXCLUSION -1:word.lower():not
1.240816 B_EXCLUSION -1:word.lower():,
1.176094 O        postag:CCONJ
1.176094 O        postag[:2]:CC
1.174279 B_INCLUSION +1:word.lower():use
1.161673 I_EXCLUSION postag:ADP
1.022461 O        word[-3:]:res
0.971759 I_INCLUSION -1:postag:ADJ
0.836874 I_EXCLUSION -1:postag[:2]:AD
0.827802 B_EXCLUSION -1:word.lower():cover
0.737290 B_EXCLUSION word[-2:]:al
0.679649 B_INCLUSION +1:word.lower():and
0.635002 I_EXCLUSION +1:word.lower():or
0.624919 I_INCLUSION +1:word.lower():and
0.487035 B_EXCLUSION word.lower():methadone
0.487035 B_EXCLUSION word[-3:]:one
0.465356 O        -1:word.lower():,
0.447152 I_EXCLUSION +1:postag:SCONJ
0.447152 I_EXCLUSION +1:postag[:2]:SC
0.407145 O        postag:SCONJ
0.407145 O        postag[:2]:SC
0.388584 I_EXCLUSION postag:PUNCT
0.388584 I_EXCLUSION postag[:2]:PU
0.384053 O        word[-3:]:are
0.384053 O        word[-2:]:re
0.379360 B_EXCLUSI

In [43]:
import eli5

eli5.show_weights(crf, top=10)

Using TensorFlow backend.


From \ To,B_EXCLUSION,B_INCLUSION,I_EXCLUSION,I_INCLUSION,O
B_EXCLUSION,0.0,0.0,4.047,0.0,0.0
B_INCLUSION,0.0,0.0,0.0,4.936,0.0
I_EXCLUSION,-0.056,-1.054,3.696,0.0,0.071
I_INCLUSION,-1.111,0.0,-0.535,3.218,0.0
O,0.005,0.269,-1.346,-1.086,3.247

Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3
Weight?,Feature,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4
+1.764,-1:word.lower():or,,,
+1.241,"-1:word.lower():,",,,
+0.828,-1:word.lower():cover,,,
+0.737,word[-2:]:al,,,
+0.487,word[-3:]:one,,,
+0.487,word.lower():methadone,,,
+0.379,word.lower():dental,,,
+0.370,word.lower():other,,,
+0.370,word[-3:]:her,,,
+0.340,word[-2:]:ne,,,

Weight?,Feature
+1.764,-1:word.lower():or
+1.241,"-1:word.lower():,"
+0.828,-1:word.lower():cover
+0.737,word[-2:]:al
+0.487,word[-3:]:one
+0.487,word.lower():methadone
+0.379,word.lower():dental
+0.370,word.lower():other
+0.370,word[-3:]:her
+0.340,word[-2:]:ne

Weight?,Feature
+1.174,+1:word.lower():use
+0.680,+1:word.lower():and
+0.369,+1:word.lower():services
+0.358,word[-2:]:al
+0.356,-1:postag:PUNCT
+0.356,-1:postag[:2]:PU
+0.217,-1:postag[:2]:DE
+0.217,-1:postag:DET
+0.203,-1:word.lower():and
… 7 more positive …,… 7 more positive …

Weight?,Feature
+1.268,-1:word.lower():not
+1.162,postag:ADP
+0.837,-1:postag[:2]:AD
+0.635,+1:word.lower():or
+0.447,+1:postag[:2]:SC
+0.447,+1:postag:SCONJ
+0.389,postag[:2]:PU
+0.389,postag:PUNCT
… 8 more positive …,… 8 more positive …
… 9 more negative …,… 9 more negative …

Weight?,Feature
+0.972,-1:postag:ADJ
+0.625,+1:word.lower():and
+0.343,word[-3:]:ion
+0.321,word.lower():test
+0.321,word[-3:]:est
+0.239,-1:postag[:2]:NO
+0.239,-1:postag:NOUN
+0.128,postag[:2]:NO
… 5 more positive …,… 5 more positive …
… 3 more negative …,… 3 more negative …

Weight?,Feature
+1.386,bias
+1.176,postag[:2]:CC
+1.176,postag:CCONJ
+1.022,word[-3:]:res
… 34 more positive …,… 34 more positive …
… 16 more negative …,… 16 more negative …
-0.540,-1:postag:CCONJ
-0.540,-1:postag[:2]:CC
-0.697,+1:postag:CCONJ
-0.697,+1:postag[:2]:CC


In [44]:
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=200,
    c2=0.1,
    max_iterations=20,
    all_possible_transitions=False,
)
crf.fit(X_train, y_train)
eli5.show_weights(crf, top=10)



From \ To,B_EXCLUSION,B_INCLUSION,I_EXCLUSION,I_INCLUSION,O
B_EXCLUSION,0.0,0.0,0.0,0.0,0.0
B_INCLUSION,0.0,0.0,0.0,0.0,0.0
I_EXCLUSION,0.0,0.0,0.0,0.0,0.0
I_INCLUSION,0.0,0.0,0.0,0.0,0.0
O,0.0,0.0,0.0,0.0,0.96

0,1,2,3,4
y=B_EXCLUSION  top features,y=B_INCLUSION  top features,y=I_EXCLUSION  top features,y=I_INCLUSION  top features,y=O  top features
,,,,


In [45]:
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True,
)
crf.fit(X_train, y_train);
eli5.show_weights(crf, top=5, show=['transition_features'])



From \ To,B_EXCLUSION,B_INCLUSION,I_EXCLUSION,I_INCLUSION,O
B_EXCLUSION,-0.261,-0.131,2.704,-0.556,-0.89
B_INCLUSION,-0.006,-0.381,-0.463,3.221,-1.254
I_EXCLUSION,-0.24,-1.132,3.398,-0.835,0.0
I_INCLUSION,-1.277,0.0,-1.292,2.396,0.0
O,0.577,0.305,-2.352,-2.509,3.53


In [46]:
eli5.show_weights(crf, top=10, targets=['O', 'B_EXCLUSION', 'I_INCLUSION'])



From \ To,O,B_EXCLUSION,I_INCLUSION
O,3.53,0.577,-2.509
B_EXCLUSION,-0.89,-0.261,-0.556
I_INCLUSION,0.0,-1.277,2.396

Weight?,Feature,Unnamed: 2_level_0
Weight?,Feature,Unnamed: 2_level_1
Weight?,Feature,Unnamed: 2_level_2
+1.096,bias,
+1.061,"-1:word.lower():,",
+0.973,postag:CCONJ,
+0.973,postag[:2]:CC,
+0.910,word[-3:]:res,
+0.896,-1:word.lower():treatment,
… 154 more positive …,… 154 more positive …,
… 82 more negative …,… 82 more negative …,
-0.756,-1:word.lower():cover,
-0.840,+1:word.lower():services,

Weight?,Feature
+1.096,bias
+1.061,"-1:word.lower():,"
+0.973,postag:CCONJ
+0.973,postag[:2]:CC
+0.910,word[-3:]:res
+0.896,-1:word.lower():treatment
… 154 more positive …,… 154 more positive …
… 82 more negative …,… 82 more negative …
-0.756,-1:word.lower():cover
-0.840,+1:word.lower():services

Weight?,Feature
+1.226,"-1:word.lower():,"
+1.200,-1:word.lower():or
+0.986,word.lower():dental
+0.960,BOS
+0.898,word[-2:]:al
+0.818,word[-3:]:one
+0.818,word.lower():methadone
+0.786,word[-2:]:ne
… 45 more positive …,… 45 more positive …
… 5 more negative …,… 5 more negative …

Weight?,Feature
+0.787,word.lower():test
+0.787,word[-3:]:est
+0.757,-1:word.lower():(
+0.743,-1:postag:ADJ
+0.723,+1:word.lower():and
+0.684,-1:word.lower():outpatient
+0.660,-1:postag[:2]:AD
+0.607,-1:postag:NOUN
+0.607,-1:postag[:2]:NO
+0.606,+1:word.lower():agonist


In [47]:
eli5.show_weights(crf, top=10, feature_re='^word\.is',
                  horizontal_layout=False, show=['targets'])



Weight?,Feature
0.006,word.isdigit()
