In [3]:
import pandas as pd
import numpy as np

# Load the token-per-row NER corpus; the file is decoded as Latin-1.
data = pd.read_csv('data/ner_dataset.csv', encoding='latin1')
# Forward-fill missing cells in every column from the row above.
# `fillna(method='ffill')` is deprecated in recent pandas; `.ffill()` is the
# equivalent, supported spelling.
# NOTE(review): presumably only 'Sentence #' is sparse in the CSV (set on the
# first token of each sentence) -- confirm, since this also fills any other
# column that happens to contain NaN.
data = data.ffill()

In [33]:
# Preview the first 30 rows of the loaded frame.
data.head(30)

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,Sentence: 1,of,IN,O
2,Sentence: 1,demonstrators,NNS,O
3,Sentence: 1,have,VBP,O
4,Sentence: 1,marched,VBN,O
5,Sentence: 1,through,IN,O
6,Sentence: 1,London,NNP,B-geo
7,Sentence: 1,to,TO,O
8,Sentence: 1,protest,VB,O
9,Sentence: 1,the,DT,O


In [9]:
# Distinct values found in the Word column, and how many of them there are.
vocab = {word for word in data['Word'].tolist()}
vocab_size = len(vocab)

In [48]:
class SentenceGetter:
    """Group a token-per-row frame into sentences of ``(word, pos, tag)`` tuples.

    ``data`` must provide the columns ``'Sentence #'``, ``'Word'``, ``'POS'``
    and ``'Tag'``; rows sharing a ``'Sentence #'`` value form one sentence.

    Attributes:
        data: The frame passed to the constructor (stored as-is, not copied).
        grouped: Series indexed by the ``'Sentence #'`` value; each element is
            the list of ``(word, pos, tag)`` tuples of that sentence.
        sentences: The same lists as a plain Python list, in the order the
            sentences first appear in ``data``.
        pos: 1-based number of the sentence ``get_next`` will return next.
        empty: Initialised to ``False`` and never read by this class; kept so
            existing code that accesses it keeps working.
    """

    def __init__(self, data):
        self.pos = 1
        self.data = data
        self.empty = False

        # sort=False keeps sentences in document order. The default sorts the
        # string keys lexicographically, which would put 'Sentence: 10' before
        # 'Sentence: 2'. Selecting the three needed columns up front means the
        # callback never touches the grouping column.
        self.grouped = (
            self.data
            .groupby('Sentence #', sort=False)[['Word', 'POS', 'Tag']]
            .apply(self._to_triples)
        )
        self.sentences = list(self.grouped)

    @staticmethod
    def _to_triples(group):
        """Turn one sentence's rows into a list of (word, pos, tag) tuples."""
        return list(zip(group['Word'].tolist(),
                        group['POS'].tolist(),
                        group['Tag'].tolist()))

    def reset(self):
        """Rewind so that the next ``get_next`` call returns sentence 1 again."""
        self.pos = 1

    def get_next(self):
        """Return the sentence labelled ``'Sentence: <pos>'`` and advance ``pos``.

        Returns ``None`` (leaving ``pos`` unchanged) when no sentence carries
        that label, i.e. at the end of the data or at the first gap in the
        numbering.
        """
        key = 'Sentence: {}'.format(self.pos)
        try:
            sentence = self.grouped[key]
        except KeyError:
            # Only a missing label means "no more sentences"; any other error
            # is a real problem and is allowed to propagate.
            return None
        self.pos += 1
        return sentence

In [49]:
# Group the token rows of `data` into per-sentence lists of (word, POS, tag) tuples.
getter = SentenceGetter(data)

In [50]:
# Fetch one sentence; get_next() returns None when no matching sentence exists.
sent = getter.get_next()

In [51]:
# Show the fetched sentence: a list of (word, POS, tag) tuples.
print(sent)

[('Thousands', 'NNS', 'O'), ('of', 'IN', 'O'), ('demonstrators', 'NNS', 'O'), ('have', 'VBP', 'O'), ('marched', 'VBN', 'O'), ('through', 'IN', 'O'), ('London', 'NNP', 'B-geo'), ('to', 'TO', 'O'), ('protest', 'VB', 'O'), ('the', 'DT', 'O'), ('war', 'NN', 'O'), ('in', 'IN', 'O'), ('Iraq', 'NNP', 'B-geo'), ('and', 'CC', 'O'), ('demand', 'VB', 'O'), ('the', 'DT', 'O'), ('withdrawal', 'NN', 'O'), ('of', 'IN', 'O'), ('British', 'JJ', 'B-gpe'), ('troops', 'NNS', 'O'), ('from', 'IN', 'O'), ('that', 'DT', 'O'), ('country', 'NN', 'O'), ('.', '.', 'O')]


In [53]:
# All sentences as a plain list; each one is a list of (word, POS, tag) tuples.
sentences = getter.sentences

In [55]:
def word2features(sent, i):
    """Build the feature dict for the token at index ``i`` of ``sent``.

    ``sent`` is a sequence of tuples whose first element is the word and whose
    second element is its POS tag. The result describes the token itself
    (lower-cased form, 2- and 3-character suffixes, casing/digit flags, POS
    tag and its 2-character prefix), the same kind of information for the
    previous and next tokens where they exist, and ``'BOS'`` / ``'EOS'``
    flags when there is no previous / next token.
    """
    token = sent[i][0]
    tag = sent[i][1]

    # Features of the token itself.
    feats = {'bias': 1.0}
    feats['word.lower()'] = token.lower()
    feats['word[-3:]'] = token[-3:]
    feats['word[-2:]'] = token[-2:]
    feats['word.isupper()'] = token.isupper()
    feats['word.istitle()'] = token.istitle()
    feats['word.isdigit()'] = token.isdigit()
    feats['postag'] = tag
    feats['postag[:2]'] = tag[:2]

    # Left context, or the beginning-of-sentence marker.
    if i <= 0:
        feats['BOS'] = True
    else:
        prev_token, prev_tag = sent[i - 1][0], sent[i - 1][1]
        feats['-1:word.lower()'] = prev_token.lower()
        feats['-1:word.istitle()'] = prev_token.istitle()
        feats['-1:word.isupper()'] = prev_token.isupper()
        feats['-1:postag'] = prev_tag
        feats['-1:postag[:2]'] = prev_tag[:2]

    # Right context, or the end-of-sentence marker.
    if i >= len(sent) - 1:
        feats['EOS'] = True
    else:
        next_token, next_tag = sent[i + 1][0], sent[i + 1][1]
        feats['+1:word.lower()'] = next_token.lower()
        feats['+1:word.istitle()'] = next_token.istitle()
        feats['+1:word.isupper()'] = next_token.isupper()
        feats['+1:postag'] = next_tag
        feats['+1:postag[:2]'] = next_tag[:2]

    return feats


def sent2features(sent):
    """Return one ``word2features`` dict per position of ``sent``, in order."""
    feature_dicts = []
    for position in range(len(sent)):
        feature_dicts.append(word2features(sent, position))
    return feature_dicts

def sent2labels(sent):
    """Return the label (third element) of every 3-tuple in ``sent``, in order."""
    labels = []
    for _token, _postag, label in sent:
        labels.append(label)
    return labels

def sent2tokens(sent):
    """Return the token (first element) of every 3-tuple in ``sent``, in order."""
    tokens = []
    for token, _postag, _label in sent:
        tokens.append(token)
    return tokens

In [61]:
# Per-sentence lists of token feature dicts, and the parallel per-sentence label lists.
X = list(map(sent2features, sentences))
y = list(map(sent2labels, sentences))

In [68]:
from sklearn_crfsuite import CRF

# CRF estimator configuration; parameter meanings follow the sklearn-crfsuite CRF docs.
crf = CRF(algorithm='lbfgs',  # train with the L-BFGS optimizer
          c1=0.1,  # L1 regularization coefficient
          c2=0.1,  # L2 regularization coefficient
          max_iterations=100,  # upper bound on optimizer iterations
          all_possible_transitions=False,  # no features for label transitions unseen in training
          verbose=True)  # print training progress while fitting

In [69]:
# cross_val_predict lives in sklearn.model_selection; the old
# sklearn.cross_validation module was removed in scikit-learn 0.20.
from sklearn.model_selection import cross_val_predict
from sklearn_crfsuite.metrics import flat_classification_report

In [70]:
#pred = cross_val_predict(estimator=crf, X=X, y=y, cv=5, verbose=2)

In [71]:
# Fit on every sentence in X / y; no hold-out split is made before training.
crf.fit(X, y)

loading training data to CRFsuite: 100%|██████████| 47959/47959 [00:17<00:00, 2779.98it/s]



Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 0
0....1....2....3....4....5....6....7....8....9....10
Number of features: 152663
Seconds required: 3.983

L-BFGS optimization
c1: 0.100000
c2: 0.100000
num_memories: 6
max_iterations: 100
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

Iter 1   time=3.30  loss=1685559.31 active=151597 feature_norm=1.00
Iter 2   time=3.39  loss=1321251.59 active=149864 feature_norm=4.40
Iter 3   time=1.72  loss=1032583.16 active=143571 feature_norm=3.85
Iter 4   time=8.45  loss=567601.33 active=145448 feature_norm=3.24
Iter 5   time=1.69  loss=475669.88 active=147274 feature_norm=4.08
Iter 6   time=1.68  loss=369032.77 active=145972 feature_norm=5.87
Iter 7   time=1.68  loss=320589.25 active=138308 feature_norm=7.20
Iter 8   time=1.69  loss=285565.86 active=132252 feature_norm=8.19
Iter 9   time=1.68  loss=246725.43 active=123606 feature_no

CRF(algorithm='lbfgs', all_possible_states=None,
  all_possible_transitions=False, averaging=None, c=None, c1=0.1, c2=0.1,
  calibration_candidates=None, calibration_eta=None,
  calibration_max_trials=None, calibration_rate=None,
  calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
  gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
  max_linesearch=None, min_freq=None, model_filename=None,
  num_memories=None, pa_type=None, period=None, trainer_cls=None,
  variance=None, verbose=True)

In [72]:
# NOTE(review): X is the same data that crf.fit(X, y) was trained on, so the
# report below measures in-sample (training-set) performance, not how well
# the model generalizes. Evaluate on held-out sentences or with
# cross-validated predictions before quoting these scores.
pred = crf.predict(X)
# Token-level precision / recall / F1 per tag, with the sentence structure flattened.
report = flat_classification_report(y_pred=pred, y_true=y)
print(report)


             precision    recall  f1-score   support

      B-art       0.93      0.73      0.82       402
      B-eve       0.85      0.69      0.76       308
      B-geo       0.91      0.95      0.93     37644
      B-gpe       0.98      0.95      0.97     15870
      B-nat       0.90      0.60      0.72       201
      B-org       0.91      0.85      0.88     20143
      B-per       0.94      0.93      0.94     16990
      B-tim       0.96      0.92      0.94     20333
      I-art       0.94      0.77      0.85       297
      I-eve       0.87      0.67      0.76       253
      I-geo       0.90      0.92      0.91      7414
      I-gpe       0.96      0.69      0.80       198
      I-nat       0.95      0.71      0.81        51
      I-org       0.94      0.93      0.94     16784
      I-per       0.94      0.96      0.95     17251
      I-tim       0.94      0.88      0.91      6528
          O       1.00      1.00      1.00    887908

avg / total       0.99      0.99      0.99  