# Building Model CRF
### By **Néstor Suat** in 2020

**Descripción:** Generando un modelo ML para la tarea de NER en tweets de accidentes para las etiquetas `loc` y `time` usando el estandar BIO.

**Input:**
* TSV con dataset etiquetado con BIO

**Output:**
* Model

**Tomado de**: https://www.depends-on-the-definition.com/named-entity-recognition-conditional-random-fields-python/
***

### Importando librerías

In [1]:
import pandas as pd
import numpy as np
import fasttext
import fasttext.util

### Source code

La clase `StenteceGetter` es una clase generica en muchos proyectos de NER, permite tomar el dataset y prepararlo en una lista python para trabajar por oraciones.

In [2]:
class SentenceGetter(object):
    
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, p, t, n) for w, p, t, n in zip(s["Word"].values.tolist(),
                                                           s["POS"].values.tolist(),
                                                           s["NER"].values.tolist(),
                                                           s["Tag"].values.tolist())]
        self.grouped = self.data.groupby("Sentence #").apply(agg_func)
        self.sentences = [s for s in self.grouped]
    
    def get_next(self):
        try:
            #s = self.grouped["Sentence: {}".format(self.n_sent)]
            s = self.grouped[self.n_sent]
            self.n_sent += 1
            return s
        except:
            return None

### Selección de características

Para el algoritmo CRF se seleccionan unas caracteristicas respecto a las reglas gramaticas y formologicas de las palabra.

In [3]:
def wordshape(text):
    import re
    t1 = re.sub('[A-ZÁÉÍÓÚñ]', 'X',text)
    t2 = re.sub('[a-záéíóúñ]', 'x', t1)
    return re.sub('[0-9]', 'd', t2)

In [4]:
ft = fasttext.load_model('../../../data/v1/fasttext/cc.es.300.bin')
def word2vector(word):
    return ft.get_word_vector(word)



In [6]:
#word2vector("hola")

In [7]:
def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]
    nertag = sent[i][2]
    tag = sent[i][3]
    vector = word2vector(word)

    features = {
        'bias': 1,
        'word': word,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],    
        'word[-2:]': word[-2:],
        'word.len()': len(word),
        'word.shape()': wordshape(word),
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],        
    }
    for iv,value in enumerate(vector):
        features['v{}'.format(iv)]=value
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        nertag1 = sent[i-1][2]    
        features.update({
            '-1:word': word1,
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
            '-1:nertag': nertag1,           
        })
        
        if i > 2:
            word2 = sent[i-2][0]
            postag2 = sent[i-2][1]
            nertag2 = sent[i-1][2]    
            features.update({
                '-2:word': word2,
                '-2:word.lower()': word2.lower(),
                '-2:word.istitle()': word2.istitle(),
                '-2:word.isupper()': word2.isupper(),
                '-2:postag': postag2,
                '-2:postag[:2]': postag2[:2],
                '-2:nertag': nertag2,             
            })

    else:
        features['BOS'] = True
        
    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.update({
            '+1:word': word1,
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
        })
        
        if i < len(sent)-2:
            word2 = sent[i+2][0]
            postag2 = sent[i+2][1]
            features.update({
                '+2:word': word2,
                '+2:word.lower()': word2.lower(),
                '+2:word.istitle()': word2.istitle(),
                '+2:word.isupper()': word2.isupper(),
                '+2:postag': postag2,
                '+2:postag[:2]': postag2[:2],
            })
        
    else:
        features['EOS'] = True
        
    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, postag, nertag, label in sent]

def sent2tokens(sent):
    return [token for token, postag, nertag, label in sent]

### Importando dataset anotado

El archivo `ner-crf-training-data.tsv` fue construido anteriormente transformando el formato de anotación de Standoff a BIO.

In [8]:
file = 'ner-crf-training-data-v2-corenlp.tsv'
dir_ = "../../../data/v1/NER/train/ner_tag/"
train = pd.read_csv(dir_+file, delimiter = "\t", quoting = 3, names=['Sentence #','Word','POS','NER','Tag'])
#dataset[:50]

In [9]:
train['Tag'].value_counts()

O         20242
I-loc      3768
B-loc      1462
B-time      131
I-time      112
Name: Tag, dtype: int64

In [10]:
file = 'ner-crf-test-data-v2-corenlp.tsv'
dir_ = "../../../data/v1/NER/test/ner_tag/"
test = pd.read_csv(dir_+file, delimiter = "\t", quoting = 3, names=['Sentence #','Word','POS','NER','Tag'])

In [11]:
test['Tag'].value_counts()

O         5038
I-loc      893
B-loc      369
I-time      33
B-time      33
Name: Tag, dtype: int64

In [18]:
test[:50]

Unnamed: 0,Sentence #,Word,POS,NER,Tag
0,1261,bogotatransito,ADJ,O,O
1,1261,Accidente,PROPN,CAUSE_OF_DEATH,O
2,1261,y,CCONJ,O,O
3,1261,bloqueo,NOUN,O,O
4,1261,en,ADP,O,O
5,1261,la,DET,O,O
6,1261,Calle,PROPN,MISC,B-loc
7,1261,13,NUM,NUMBER,I-loc
8,1261,Ori-Occ,PROPN,MISC,O
9,1261,en,ADP,O,O


#### **Preparando el dataset**

Se construye un corpus de todas las palabras presentes en los tweets, se agrega un token esepcial para rellenar llamado ENDPAD, finalmente se calcula el tamaño del corpus de palabras. Esto mismo se hace para las etiquetas, aunque en este caso es más fácil porque son 5: `b-loc`, `i-loc`, `b-time`, `i-time` y `o`.

words = list(set(train['Word'].values))
words.sort()
n_words = len(words); n_words

Se toman los datos y se construye el arreglo de las oraciones a trabajar

In [12]:
getter = SentenceGetter(train)
train_sentences = getter.sentences
#sentences

In [32]:
train_sentences

[[('Grave', 'ADJ', 'O', 'O'),
  ('accidente', 'NOUN', 'CAUSE_OF_DEATH', 'O'),
  ('en', 'ADP', 'O', 'O'),
  ('la', 'DET', 'O', 'O'),
  ('autopista', 'NOUN', 'O', 'B-loc'),
  ('norte', 'ADJ', 'O', 'I-loc'),
  ('con', 'ADP', 'O', 'I-loc'),
  ('calle', 'NOUN', 'O', 'I-loc'),
  ('153', 'NUM', 'NUMBER', 'I-loc'),
  (',', 'PUNCT', 'O', 'O'),
  ('(', 'PUNCT', 'O', 'O'),
  ('N-S', 'NOUN', 'O', 'O'),
  (')', 'PUNCT', 'O', 'O'),
  (',', 'PUNCT', 'O', 'O'),
  ('una', 'DET', 'O', 'O'),
  ('persona', 'NOUN', 'O', 'O'),
  ('falleció', 'VERB', 'O', 'O'),
  ('y', 'CCONJ', 'O', 'O'),
  ('tres', 'NUM', 'NUMBER', 'O'),
  ('más', 'ADV', 'O', 'O'),
  ('heridas', 'ADJ', 'O', 'O'),
  (',', 'PUNCT', 'O', 'O'),
  ('fuerte', 'ADJ', 'O', 'O'),
  ('congestión', 'NOUN', 'O', 'O'),
  ('caracolradio', 'ADJ', 'O', 'O')],
 [('Accidente', 'NOUN', 'CAUSE_OF_DEATH', 'O'),
  ('de', 'ADP', 'O', 'O'),
  ('particular', 'NOUN', 'O', 'O'),
  ('con', 'ADP', 'O', 'O'),
  ('volqueta', 'NOUN', 'O', 'O'),
  ('en', 'ADP', 'O', 'O'),


In [13]:
getter = SentenceGetter(test)
test_sentences = getter.sentences

In [14]:
X_train = [sent2features(s) for s in train_sentences]
y_train = [sent2labels(s) for s in train_sentences]

In [15]:
X_test = [sent2features(s) for s in test_sentences]
y_test = [sent2labels(s) for s in test_sentences]

In [16]:
X_test[4][0]

{'bias': 1,
 'word': 'jos',
 'word.lower()': 'jos',
 'word[-3:]': 'jos',
 'word[-2:]': 'os',
 'word.len()': 3,
 'word.shape()': 'xxx',
 'word.isupper()': False,
 'word.istitle()': False,
 'word.isdigit()': False,
 'postag': 'NOUN',
 'postag[:2]': 'NO',
 'v0': -0.054774597,
 'v1': -0.12820101,
 'v2': 0.017376121,
 'v3': 0.080688976,
 'v4': -0.028134141,
 'v5': 0.0551589,
 'v6': -0.07430153,
 'v7': 0.12057606,
 'v8': -0.22909378,
 'v9': 0.15130052,
 'v10': -0.009647433,
 'v11': -0.0027012527,
 'v12': -0.028943967,
 'v13': 0.07798338,
 'v14': -0.06300786,
 'v15': -0.10963216,
 'v16': -0.060295448,
 'v17': 0.0126417205,
 'v18': -0.051427104,
 'v19': 0.17969474,
 'v20': 0.031245083,
 'v21': 0.22055624,
 'v22': -0.10489838,
 'v23': 0.11296649,
 'v24': -0.26547697,
 'v25': -0.033219915,
 'v26': 0.04425973,
 'v27': 0.015035076,
 'v28': 0.06765505,
 'v29': -0.06941456,
 'v30': 0.24615023,
 'v31': 0.012449157,
 'v32': 0.046670426,
 'v33': 0.00050055515,
 'v34': -0.034879662,
 'v35': -0.069712676

## Train and Test set

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

In [17]:
print("Train:",len(X_train), len(y_train))
print("Test:",len(X_test), len(y_test))

Train: 1072 1072
Test: 268 268


## Model CRF

In [18]:
from sklearn_crfsuite import CRF
from sklearn.model_selection import cross_val_predict
from sklearn_crfsuite.metrics import flat_classification_report
from sklearn_crfsuite import metrics
import eli5

In [49]:
from sklearn_crfsuite import CRF

crf = CRF(algorithm='lbfgs',
          c1=0.1,
          c2=0.1,
          max_iterations=100,
          all_possible_transitions=False)

## Evaluate

### **Cross Validation**
Validación usando cross_validation con K=5

In [19]:
labels = ['B-loc', 'I-loc', 'B-time', 'I-time','O']

### **Split train & test validation**

In [20]:
crf = CRF(algorithm='lbfgs',
          c1=0.1,
          c2=0.1,
          max_iterations=100,
          all_possible_transitions=False,
         verbose=1)
crf.fit(X_train, y_train)

loading training data to CRFsuite: 100%|██████████| 1072/1072 [00:06<00:00, 160.95it/s]



Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 0
0....1....2....3....4....5....6....7....8....9....10
Number of features: 36842
Seconds required: 0.877

L-BFGS optimization
c1: 0.100000
c2: 0.100000
num_memories: 6
max_iterations: 100
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

Iter 1   time=0.32  loss=27558.50 active=36569 feature_norm=1.00
Iter 2   time=0.18  loss=23042.84 active=34831 feature_norm=0.86
Iter 3   time=0.14  loss=21154.07 active=29560 feature_norm=1.05
Iter 4   time=0.14  loss=15418.37 active=36470 feature_norm=1.10
Iter 5   time=0.14  loss=11965.73 active=36605 feature_norm=1.35
Iter 6   time=0.14  loss=10067.55 active=32321 feature_norm=2.14
Iter 7   time=0.15  loss=8288.90  active=35109 feature_norm=2.27
Iter 8   time=0.15  loss=7361.85  active=34222 feature_norm=2.71
Iter 9   time=0.15  loss=6473.01  active=33317 feature_norm=3.19
Iter 10  time=

CRF(algorithm='lbfgs', all_possible_states=None, all_possible_transitions=False,
    averaging=None, c=None, c1=0.1, c2=0.1, calibration_candidates=None,
    calibration_eta=None, calibration_max_trials=None, calibration_rate=None,
    calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
    gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
    max_linesearch=None, min_freq=None, model_filename=None, num_memories=None,
    pa_type=None, period=None, trainer_cls=None, variance=None, verbose=1)

#### **Testset**

In [21]:
labels = ['B-loc', 'I-loc', 'B-time', 'I-time']

y_pred = crf.predict(X_test)
print("F1-score: {:.1%}".format(metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=labels)))

# group B and I results
sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=4
))

F1-score: 91.7%
              precision    recall  f1-score   support

       B-loc     0.9169    0.8374    0.8754       369
       I-loc     0.9523    0.9395    0.9459       893
      B-time     0.8400    0.6364    0.7241        33
      I-time     0.8846    0.6970    0.7797        33

   micro avg     0.9393    0.8976    0.9180      1328
   macro avg     0.8985    0.7776    0.8313      1328
weighted avg     0.9380    0.8976    0.9166      1328



In [22]:
print("Accuracy: "+str(round(metrics.flat_accuracy_score(y_test, y_pred),6)))

print("F1-Micro: "+str(round(metrics.flat_f1_score(y_test, y_pred, average='micro', labels=labels),6)))
print("Recall-Micro: "+str(round(metrics.flat_recall_score(y_test, y_pred, average='micro', labels=labels),6)))
print("Precision-Micro: "+str(round(metrics.flat_precision_score(y_test, y_pred, average='micro', labels=labels),6)))

print("F1-Macro: "+str(round(metrics.flat_f1_score(y_test, y_pred, average='macro', labels=labels),6)))
print("Recall-Macro: "+str(round(metrics.flat_recall_score(y_test, y_pred, average='macro', labels=labels),6)))
print("Precision-Macro: "+str(round(metrics.flat_precision_score(y_test, y_pred, average='macro', labels=labels),6)))

print("F1-Weighted: "+str(round(metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=labels),6)))
print("Recall-Weighted: "+str(round(metrics.flat_recall_score(y_test, y_pred, average='weighted', labels=labels),6)))
print("Precision-Weighted: "+str(round(metrics.flat_precision_score(y_test, y_pred, average='weighted', labels=labels),6)))

Accuracy: 0.970939
F1-Micro: 0.917982
Recall-Micro: 0.89759
Precision-Micro: 0.939322
F1-Macro: 0.83126
Recall-Macro: 0.777565
Precision-Macro: 0.898464
F1-Weighted: 0.916646
Recall-Weighted: 0.89759
Precision-Weighted: 0.938013


Accuracy: 0.970154
F1-Micro: 0.914902
Recall-Micro: 0.894578
Precision-Micro: 0.93617
F1-Macro: 0.823406
Recall-Macro: 0.767957
Precision-Macro: 0.894302
F1-Weighted: 0.91337
Recall-Weighted: 0.894578
Precision-Weighted: 0.934648


### Inspeccionando el Modelo

Visualización de las matrices de probabilidad de transición de una etiqueta a otra. Tambien se puede ver que caracteristicas son más importantes para predecir una etiqueta u otra.

In [69]:
eli5.show_weights(crf, top=30)

From \ To,O,B-loc,I-loc,B-time,I-time
O,1.423,0.086,-5.898,-0.021,0.0
B-loc,-1.883,-0.674,3.61,0.0,0.0
I-loc,-1.422,-2.516,2.731,-1.127,0.0
B-time,-1.04,0.221,0.0,-0.217,7.014
I-time,-0.648,0.0,0.0,-0.146,6.089

Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3
Weight?,Feature,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4
+2.923,BOS,,,
+2.918,postag[:2]:PU,,,
+2.918,postag:PUNCT,,,
+1.744,EOS,,,
+1.728,word.lower():choque,,,
+1.696,word.shape():X-X,,,
+1.662,word.lower():accidente,,,
+1.584,postag:PRON,,,
+1.568,-1:postag:ADV,,,
+1.505,+2:word:Cierres,,,

Weight?,Feature
+2.923,BOS
+2.918,postag[:2]:PU
+2.918,postag:PUNCT
+1.744,EOS
+1.728,word.lower():choque
+1.696,word.shape():X-X
+1.662,word.lower():accidente
+1.584,postag:PRON
+1.568,-1:postag:ADV
+1.505,+2:word:Cierres

Weight?,Feature
+3.432,word.lower():autonorte
+2.057,word.lower():chapinero
+1.948,word.lower():barrio
+1.942,-2:word:Frente
+1.855,word:autonorte
+1.805,word[-2:]:in
+1.744,word.lower():nqs
+1.685,-2:word.lower():frente
+1.638,word.lower():puente
+1.566,word:puente

Weight?,Feature
+3.422,-1:word.lower():av
+3.417,-1:word.lower():av.
+2.132,-1:word.lower():avenida
+2.107,-1:word.lower():cra
+1.984,-1:word:Av.
+1.861,-1:word.lower():calle
+1.634,-1:word.lower():con
+1.628,word.shape():X
+1.625,+1:word.lower():mayo
+1.614,+1:word.lower():avenida

Weight?,Feature
+4.405,word.shape():dd:dd
+3.855,word.shape():d:dd
+1.661,postag:ADV
+1.441,word:anoche
+1.411,-1:word:bogota
+1.400,word.lower():madrugada
+1.400,word:madrugada
+1.358,word[-3:]:che
+1.354,word[-2:]:he
+1.302,-1:word.lower():bogota

Weight?,Feature
+1.355,word.lower():pm
+0.967,word[-3:]:ado
+0.951,word:minutos
+0.951,word.lower():minutos
+0.915,postag:ADJ
+0.886,word[-2:]:as
+0.838,-1:word.lower():a.
+0.798,-2:word:informan
+0.787,-2:word.lower():ocurrió
+0.787,-2:word:ocurrió


### Evaluando desempeño con una sentencia

In [23]:
import numpy as np
i = 1
p = crf.predict_single(X_test[i])

print("{:15} ({:5}): {}".format("Word", "True", "Pred"))
for w,true, pred in zip(X_test[i],y_test[i],p):
    print("{:15} ({:5}): {}".format(w['word.lower()'],true,pred))

Word            (True ): Pred
movilidad       (O    ): O
bogota          (O    ): O
acueducto       (O    ): O
trancon         (O    ): O
accidente       (O    ): O
llevó           (B-time): O
3               (I-time): O
horas           (I-time): O
en              (O    ): O
el              (O    ): O
carro           (O    ): O
bajando         (O    ): O
de              (O    ): O
la              (O    ): O
calera          (B-loc): O
y               (O    ): O
muchos          (O    ): O
buses           (O    ): O
escolares       (O    ): O
con             (O    ): O
niños           (O    ): O
pequeños        (O    ): O
de              (O    ): O
los             (O    ): O
colegios        (O    ): O
,               (O    ): O
nada            (O    ): O
que             (O    ): O
quitan          (O    ): O
el              (O    ): O
camión          (O    ): O
del             (O    ): O
acueducto       (O    ): O
que             (O    ): O
se              (O    ): O
accidentó       (O    

# Mejorando el modelo con regularización

In [115]:
crf = CRF(algorithm='lbfgs',
          c1=10, #Supuestamente mayor número representa que el modelo no dependa tanto de la palabra, sino contexto
          c2=0.1,
          max_iterations=100,
          all_possible_transitions=False)

### Cross Validation

In [None]:
#pred = cross_val_predict(estimator=crf, X=X, y=y, cv=5)
pred = cross_val_predict(estimator=crf, X=X_train, y=y_train, cv=5)

report = flat_classification_report(y_pred=pred, y_true=y)
print(report)

In [None]:
# group B and I results
sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)
print(metrics.flat_classification_report(
    y_train, pred, labels=sorted_labels, digits=2
))

### Testset Evaluate

In [116]:
crf.fit(X_train, y_train)

CRF(algorithm='lbfgs', all_possible_states=None, all_possible_transitions=False,
    averaging=None, c=None, c1=10, c2=0.1, calibration_candidates=None,
    calibration_eta=None, calibration_max_trials=None, calibration_rate=None,
    calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
    gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
    max_linesearch=None, min_freq=None, model_filename=None, num_memories=None,
    pa_type=None, period=None, trainer_cls=None, variance=None, verbose=False)

In [117]:
y_pred = crf.predict(X_test)

In [118]:
#labels = list(crf.classes_)
#labels.remove('O')
#labels

print("F1-score: {:.1%}".format(metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=labels)))

F1-score: 93.1%


In [None]:
# group B and I results
sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=2
))

In [None]:
eli5.show_weights(crf, top=30)

In [None]:
import numpy as np
i = 1
p = crf.predict_single(X_test[i])

print("{:15} ({:5}): {}".format("Word", "True", "Pred"))
for w,true, pred in zip(X_test[i],y_test[i],p):
    print("{:15} ({:5}): {}".format(w['word.lower()'],true,pred))