# Building Model CRF
### By **Néstor Suat** in 2020

**Descripción:** Generando un modelo ML para la tarea de NER en tweets de accidentes para las etiquetas `loc` y `time` usando el estándar BIO.

**Input:**
* TSV con dataset etiquetado con BIO

**Output:**
* Model

**Tomado de**: https://www.depends-on-the-definition.com/named-entity-recognition-conditional-random-fields-python/
***

### Importando librerías

In [1]:
import pandas as pd

In [83]:
class SentenceGetter(object):
    """Group a token-level annotated dataframe into sentences.

    Rows sharing the same "Sentence #" value are collapsed into one list of
    ``(word, pos, tag)`` tuples taken from the "Word", "POS" and "Tag"
    columns.
    """

    def __init__(self, data):
        """Build the per-sentence token lists from ``data``.

        Args:
            data: DataFrame with the columns "Sentence #", "Word", "POS"
                and "Tag".
        """
        # Key used by get_next() for the next lookup in self.grouped.
        self.n_sent = 1
        self.data = data
        # Not read anywhere in this class; kept for backward compatibility.
        self.empty = False

        def agg_func(s):
            # Collapse the rows of one sentence into (word, pos, tag) tuples.
            return list(zip(s["Word"].values.tolist(),
                            s["POS"].values.tolist(),
                            s["Tag"].values.tolist()))

        # Series indexed by "Sentence #" whose values are the token lists.
        self.grouped = self.data.groupby("Sentence #").apply(agg_func)
        # The same token lists as a plain Python list, in group order.
        self.sentences = list(self.grouped)

    def get_next(self):
        """Return the sentence stored under ``n_sent`` and advance the counter.

        Returns:
            The list of ``(word, pos, tag)`` tuples for the current sentence,
            or ``None`` when ``self.grouped`` has no entry for ``n_sent``
            (the counter is not advanced in that case).
        """
        try:
            # Only a failed lookup means "no more sentences"; any other
            # error is a real problem and must propagate to the caller.
            s = self.grouped[self.n_sent]
        except (KeyError, IndexError):
            return None
        self.n_sent += 1
        return s

### Craft features

Now we craft a set of features and prepare the dataset.

In [90]:
def word2features(sent, i):
    """Build the feature dict for the token at position ``i`` of ``sent``.

    ``sent`` is a sequence of tuples whose first two items are the word and
    its POS tag. The result describes the token itself and, when they exist,
    the previous ("-1:" keys) and next ("+1:" keys) tokens. 'BOS' marks a
    token with no predecessor and 'EOS' a token with no successor.
    """
    token = sent[i][0]
    tag = sent[i][1]

    # Features of the token itself.
    feats = {}
    feats['bias'] = 1.0
    feats['word.lower()'] = token.lower()
    feats['word[-3:]'] = token[-3:]
    feats['word[-2:]'] = token[-2:]
    feats['word.isupper()'] = token.isupper()
    feats['word.istitle()'] = token.istitle()
    feats['word.isdigit()'] = token.isdigit()
    feats['postag'] = tag
    feats['postag[:2]'] = tag[:2]

    # Left context, or the beginning-of-sentence marker.
    if i <= 0:
        feats['BOS'] = True
    else:
        prev_token = sent[i - 1][0]
        prev_tag = sent[i - 1][1]
        feats['-1:word.lower()'] = prev_token.lower()
        feats['-1:word.istitle()'] = prev_token.istitle()
        feats['-1:word.isupper()'] = prev_token.isupper()
        feats['-1:postag'] = prev_tag
        feats['-1:postag[:2]'] = prev_tag[:2]

    # Right context, or the end-of-sentence marker.
    if i >= len(sent) - 1:
        feats['EOS'] = True
    else:
        next_token = sent[i + 1][0]
        next_tag = sent[i + 1][1]
        feats['+1:word.lower()'] = next_token.lower()
        feats['+1:word.istitle()'] = next_token.istitle()
        feats['+1:word.isupper()'] = next_token.isupper()
        feats['+1:postag'] = next_tag
        feats['+1:postag[:2]'] = next_tag[:2]

    return feats


def sent2features(sent):
    """Return one feature dict per token of ``sent``, in token order."""
    return [word2features(sent, position) for position, _ in enumerate(sent)]

def sent2labels(sent):
    """Return the label (third item) of every (token, postag, label) triple."""
    labels = []
    for _token, _postag, tag in sent:
        labels.append(tag)
    return labels

def sent2tokens(sent):
    """Return the token (first item) of every (token, postag, label) triple."""
    tokens = []
    for word, _postag, _label in sent:
        tokens.append(word)
    return tokens

### Importando dataset anotado

In [42]:
# Load the BIO-annotated corpus from a tab-separated file
# (presumably one token per row -- confirm against the data file).
file = 'ner-crf-training-data.tsv'
dir_ = "../../../data/v1/NER/"
# quoting=3 is csv.QUOTE_NONE, so quote characters are read as literal text.
# The four column names are supplied here rather than read from the file.
data = pd.read_csv(dir_+file, delimiter = "\t", quoting = 3, names=['Sentence #','Word','POS','Tag'])
#dataset[:50]

In [43]:
# Vocabulary: the distinct values of the Word column (order is arbitrary).
words = list(set(data['Word'].values))
n_words = len(words)
# Bare expression so the notebook displays the vocabulary size.
n_words

455

In [84]:
# Group the token rows of `data` into per-sentence (word, pos, tag) lists.
getter = SentenceGetter(data)

In [87]:
# Fetch one sentence and print it to check the (word, pos, tag) structure.
sent = getter.get_next()
print(sent)

[('2', 'NUM', 'O'), ('ciclistas', 'NOUN', 'O'), ('muertos', 'ADJ', 'O'), ('hoy', 'ADV', 'B-time'), ('de', 'ADP', 'O'), ('5:30', 'NUM', 'B-time'), ('am', 'NOUN', 'I-time'), ('a', 'ADP', 'I-time'), ('9:45', 'NUM', 'I-time'), ('am', 'NOUN', 'I-time'), ('en', 'ADP', 'O'), ('el', 'DET', 'O'), ('corto', 'ADJ', 'O'), ('tramo', 'NOUN', 'O'), ('de', 'ADP', 'O'), ('puente', 'NOUN', 'B-loc'), ('guaduas', 'ADJ', 'I-loc'), ('al', 'ADP', 'I-loc'), ('puente', 'NOUN', 'I-loc'), ('de', 'ADP', 'I-loc'), ('la', 'DET', 'I-loc'), ('rotonda', 'NOUN', 'I-loc'), ('de', 'ADP', 'I-loc'), ('Siberia', 'PROPN', 'I-loc'), ('.', 'PUNCT', 'O'), ('.', 'PUNCT', 'O'), ('este', 'DET', 'O'), ('año', 'NOUN', 'O'), ('en', 'ADP', 'O'), ('ese', 'DET', 'O'), ('mismo', 'DET', 'O'), ('tramo', 'NOUN', 'O'), ('van', 'VERB', 'O'), ('por', 'ADP', 'O'), ('poquito', 'PRON', 'O'), ('10', 'NUM', 'O'), ('ciclistas', 'NOUN', 'O'), ('muertos', 'ADJ', 'O'), ('por', 'ADP', 'O'), ('accidente', 'NOUN', 'O'), (',', 'PUNCT', 'O'), ('delicado', '

In [89]:
# All sentences built by SentenceGetter, each a list of (word, pos, tag) tuples.
sentences = getter.sentences
# Bare expression so the notebook displays them.
sentences

[[('sectormovilidad', 'NOUN', 'O'),
  ('giro', 'NOUN', 'O'),
  ('paa', 'NOUN', 'O'),
  ('tomar', 'VERB', 'O'),
  ('la', 'DET', 'O'),
  ('calle', 'NOUN', 'B-loc'),
  ('80', 'NUM', 'I-loc'),
  ('al', 'ADP', 'O'),
  ('occidente', 'NOUN', 'O'),
  ('en', 'ADP', 'O'),
  ('av', 'NOUN', 'B-loc'),
  ('Boyaca', 'PROPN', 'I-loc'),
  ('bloqueado', 'ADJ', 'O'),
  ('por', 'ADP', 'O'),
  ('accidente', 'NOUN', 'O'),
  ('simple', 'ADJ', 'O'),
  (',', 'PUNCT', 'O'),
  ('paso', 'NOUN', 'O'),
  ('bloqueado', 'ADJ', 'O')],
 [('sectormovilidad', 'NOUN', 'O'),
  ('hubo', 'AUX', 'O'),
  ('un', 'DET', 'O'),
  ('choque', 'NOUN', 'O'),
  ('entre', 'ADP', 'O'),
  ('un', 'DET', 'O'),
  ('sitp', 'NOUN', 'O'),
  ('y', 'CCONJ', 'O'),
  ('una', 'DET', 'O'),
  ('camioneta', 'NOUN', 'O'),
  (',', 'PUNCT', 'O'),
  ('en', 'ADP', 'O'),
  ('la', 'DET', 'O'),
  ('oreja', 'NOUN', 'O'),
  ('de', 'ADP', 'O'),
  ('la', 'DET', 'O'),
  ('68', 'NOUN', 'B-loc'),
  ('que', 'PRON', 'I-loc'),
  ('sale', 'VERB', 'I-loc'),
  ('a', 'ADP',

In [91]:
# X: one list of feature dicts per sentence; y: the matching label sequences.
X = list(map(sent2features, sentences))
y = list(map(sent2labels, sentences))

In [92]:
# Display the extracted feature dicts for a visual sanity check.
X

[[{'bias': 1.0,
   'word.lower()': 'sectormovilidad',
   'word[-3:]': 'dad',
   'word[-2:]': 'ad',
   'word.isupper()': False,
   'word.istitle()': False,
   'word.isdigit()': False,
   'postag': 'NOUN',
   'postag[:2]': 'NO',
   'BOS': True,
   '+1:word.lower()': 'giro',
   '+1:word.istitle()': False,
   '+1:word.isupper()': False,
   '+1:postag': 'NOUN',
   '+1:postag[:2]': 'NO'},
  {'bias': 1.0,
   'word.lower()': 'giro',
   'word[-3:]': 'iro',
   'word[-2:]': 'ro',
   'word.isupper()': False,
   'word.istitle()': False,
   'word.isdigit()': False,
   'postag': 'NOUN',
   'postag[:2]': 'NO',
   '-1:word.lower()': 'sectormovilidad',
   '-1:word.istitle()': False,
   '-1:word.isupper()': False,
   '-1:postag': 'NOUN',
   '-1:postag[:2]': 'NO',
   '+1:word.lower()': 'paa',
   '+1:word.istitle()': False,
   '+1:word.isupper()': False,
   '+1:postag': 'NOUN',
   '+1:postag[:2]': 'NO'},
  {'bias': 1.0,
   'word.lower()': 'paa',
   'word[-3:]': 'paa',
   'word[-2:]': 'aa',
   'word.isupper

## Model CRF
Now we can initialize the algorithm. We use the conditional random field (CRF) implementation provided by sklearn-crfsuite.

In [102]:
from sklearn_crfsuite import CRF
from sklearn.model_selection import cross_val_predict
from sklearn_crfsuite.metrics import flat_classification_report
import eli5

In [94]:
from sklearn_crfsuite import CRF

# Baseline CRF trained with L-BFGS.
# c1 / c2 are the L1 / L2 regularization coefficients (per the
# sklearn-crfsuite documentation); both are kept small here.
crf = CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=False,
)

In [98]:
# 5-fold cross-validated predictions for every sentence in X.
pred = cross_val_predict(estimator=crf, X=X, y=y, cv=5)


In [99]:
# Per-label precision / recall / F1 of the cross-validated predictions
# against the gold labels.
report = flat_classification_report(y_pred=pred, y_true=y)
print(report)

              precision    recall  f1-score   support

       B-loc       0.74      0.55      0.63        62
      B-time       0.00      0.00      0.00         8
       I-loc       0.85      0.74      0.80       180
      I-time       0.00      0.00      0.00         8
           O       0.92      0.98      0.95       954

    accuracy                           0.91      1212
   macro avg       0.50      0.45      0.47      1212
weighted avg       0.89      0.91      0.90      1212



  'precision', 'predicted', average, warn_for)


In [100]:
# Fit on the complete dataset so the learned weights can be inspected.
crf.fit(X, y)

CRF(algorithm='lbfgs', all_possible_states=None, all_possible_transitions=False,
    averaging=None, c=None, c1=0.1, c2=0.1, calibration_candidates=None,
    calibration_eta=None, calibration_max_trials=None, calibration_rate=None,
    calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
    gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
    max_linesearch=None, min_freq=None, model_filename=None, num_memories=None,
    pa_type=None, period=None, trainer_cls=None, variance=None, verbose=False)

In [103]:
# Display the learned CRF weights; `top=30` limits how many features are listed.
eli5.show_weights(crf, top=30)

From \ To,O,B-loc,I-loc,B-time,I-time
O,3.694,3.102,0.0,1.322,0.0
B-loc,-1.802,0.0,4.348,0.0,0.0
I-loc,0.029,0.0,4.525,0.0,0.0
B-time,-0.228,0.0,0.0,0.234,1.272
I-time,-0.512,0.0,0.0,0.0,2.732

Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3
Weight?,Feature,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4
+2.314,EOS,,,
+1.530,bias,,,
+1.232,-1:word.lower():sur,,,
+0.989,postag[:2]:PU,,,
+0.989,postag:PUNCT,,,
+0.969,word.lower():accidente,,,
+0.968,word.lower():choque,,,
+0.939,+1:word.lower():5:30,,,
+0.938,BOS,,,
+0.926,-1:word.lower():bogotá,,,

Weight?,Feature
+2.314,EOS
+1.530,bias
+1.232,-1:word.lower():sur
+0.989,postag[:2]:PU
+0.989,postag:PUNCT
+0.969,word.lower():accidente
+0.968,word.lower():choque
+0.939,+1:word.lower():5:30
+0.938,BOS
+0.926,-1:word.lower():bogotá

Weight?,Feature
+1.735,word.istitle()
+1.594,+1:word.lower():norte
+1.351,-1:word.lower():la
+1.165,-1:word.lower():del
+1.135,word.lower():corabastos
+1.125,word.lower():av
+1.063,word.lower():autonorte
+1.022,word[-3:]:uto
+1.022,word.lower():auto
+0.919,+1:word.lower():caro

Weight?,Feature
+1.415,-1:word.lower():con
+1.309,+1:word.lower():choque
+1.233,-1:word.lower():puente
+1.149,-1:word.lower():128
+1.138,-1:word.lower():auto
+1.112,word[-3:]:sur
+1.091,-1:word.lower():8
+0.974,-1:word.lower():calle
+0.974,+1:word.lower():hacia
+0.912,-1:word.lower():autonorte

Weight?,Feature
+1.785,postag:ADV
+1.681,word.lower():viernes
+1.441,word[-3:]:hoy
+1.441,word[-2:]:oy
+1.441,word.lower():hoy
+1.401,word[-3:]:nes
+1.095,word[-3:]::30
+1.095,word.lower():5:30
+1.089,word.lower():mañana
+1.075,word[-3:]:ana

Weight?,Feature
+0.914,-1:postag:NUM
+0.914,-1:postag[:2]:NU
+0.774,-1:word.lower():5:30
+0.727,word[-3:]:am
+0.727,word.lower():am
+0.719,word[-2:]:am
+0.696,word.lower():mayo
+0.696,word[-2:]:yo
+0.696,word[-3:]:ayo
+0.685,+1:word.lower():nuestras


### Improve the model with regularization

In [113]:
# Same configuration as the baseline except for c1, raised from 0.1 to 10
# (a much stronger L1 penalty per the sklearn-crfsuite documentation).
# NOTE(review): in the recorded run the cross-validation report for this
# setting shows lower recall on B-loc / I-loc than the c1=0.1 baseline.
crf = CRF(
    algorithm='lbfgs',
    c1=10,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=False,
)

In [115]:
# 5-fold cross-validated predictions with the currently configured `crf`.
pred = cross_val_predict(estimator=crf, X=X, y=y, cv=5)

In [116]:
# Per-label precision / recall / F1 of the cross-validated predictions
# against the gold labels.
report = flat_classification_report(y_pred=pred, y_true=y)
print(report)

              precision    recall  f1-score   support

       B-loc       0.85      0.27      0.41        62
      B-time       0.00      0.00      0.00         8
       I-loc       0.86      0.35      0.50       180
      I-time       0.00      0.00      0.00         8
           O       0.84      0.99      0.91       954

    accuracy                           0.84      1212
   macro avg       0.51      0.32      0.36      1212
weighted avg       0.84      0.84      0.81      1212



In [114]:
# Fit the currently configured `crf` on the complete dataset for weight inspection.
crf.fit(X, y)

CRF(algorithm='lbfgs', all_possible_states=None, all_possible_transitions=False,
    averaging=None, c=None, c1=10, c2=0.1, calibration_candidates=None,
    calibration_eta=None, calibration_max_trials=None, calibration_rate=None,
    calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
    gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
    max_linesearch=None, min_freq=None, model_filename=None, num_memories=None,
    pa_type=None, period=None, trainer_cls=None, variance=None, verbose=False)

In [117]:
# Display the learned CRF weights; `top=30` limits how many features are listed.
eli5.show_weights(crf, top=30)

From \ To,O,B-loc,I-loc,B-time,I-time
O,2.136,0.83,0.0,0.0,0.0
B-loc,0.0,0.0,2.909,0.0,0.0
I-loc,0.0,0.0,2.646,0.0,0.0
B-time,0.0,0.0,0.0,0.0,0.0
I-time,0.0,0.0,0.0,0.0,0.0

Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
+1.410,bias,,,
+0.130,postag:PUNCT,,,
+0.130,postag[:2]:PU,,,
+0.062,+1:postag[:2]:NO,,,
+0.062,+1:postag:NOUN,,,
+0.009,EOS,,,
-0.009,+1:postag:ADP,,,
-0.087,word.isdigit(),,,
-0.249,+1:postag:NUM,,,
-0.249,+1:postag[:2]:NU,,,

Weight?,Feature
1.41,bias
0.13,postag:PUNCT
0.13,postag[:2]:PU
0.062,+1:postag[:2]:NO
0.062,+1:postag:NOUN
0.009,EOS
-0.009,+1:postag:ADP
-0.087,word.isdigit()
-0.249,+1:postag:NUM
-0.249,+1:postag[:2]:NU

Weight?,Feature
1.721,-1:word.lower():la
-0.165,postag[:2]:AD

Weight?,Feature
1.703,word.isdigit()
0.138,word[-3:]:con
0.138,word.lower():con
0.071,-1:postag:NOUN
0.071,-1:postag[:2]:NO
