In [1]:
import numpy as np
import re
from tqdm.notebook import tqdm
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow import keras
from keras.layers import LSTM, Dense, TimeDistributed, Dropout, Bidirectional
from keras.regularizers import L1L2
from tf2crf import CRF
from tensorflow_addons.text.crf import crf_log_likelihood

import keras.backend as K

from nltk.tokenize import word_tokenize, sent_tokenize, WordPunctTokenizer
from razdel import tokenize, sentenize
from tabulate import tabulate


TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of life in May 2024.
Please modify downstream libraries to take dependencies from other repositories in our TensorFlow community (e.g. Keras, Keras-CV, and Keras-NLP). 

For more information see: https://github.com/tensorflow/addons/issues/2807 



In [2]:
def unpack_data(data):
    """Normalise a Keras-style batch into an ``(x, y, sample_weight)`` triple.

    Args:
        data: A sized container holding either ``(x, y)`` or
            ``(x, y, sample_weight)``.

    Returns:
        ``data`` itself when it already has three elements; otherwise a new
        3-tuple whose last element is ``None``.

    Raises:
        TypeError: If ``data`` has neither 2 nor 3 elements.
    """
    size = len(data)
    if size == 3:
        return data
    if size != 2:
        raise TypeError("Expected data to be a tuple of size 2 or 3.")
    features, targets = data[0], data[1]
    return features, targets, None

class ModelWithCRFLoss(tf.keras.Model):
    """Wrap a CRF-terminated model so it trains on the CRF negative log-likelihood.

    The wrapper applies the layers of ``base_model`` one after another. The
    last layer is expected to return the 4-tuple
    ``(viterbi_sequence, potentials, sequence_length, chain_kernel)``, which is
    how ``compute_loss`` unpacks it.

    Args:
        base_model: Model whose ``.layers`` are applied sequentially.
        sparse_target: If True, targets are integer tag ids of rank 2;
            otherwise they are reduced with ``argmax`` over the last axis.
        metrics_fn: Metric updated with ``(y, viterbi_sequence, mask)`` on each
            step. Defaults to a fresh ``Accuracy`` metric per instance.
    """

    def __init__(self, base_model, sparse_target=True, metrics_fn=None):
        super().__init__()
        self.base_model = base_model
        self.model_layers = list(self.base_model.layers)
        self.sparse_target = sparse_target
        # A metric instance used as a default argument would be created once,
        # at definition time, and its accumulated state would then be shared
        # by every model relying on the default. Build it per instance.
        if metrics_fn is None:
            metrics_fn = tf.keras.metrics.Accuracy(name='accuracy')
        self.metrics_fn = metrics_fn
        self.loss_tracker = tf.keras.metrics.Mean(name='loss')

    def call(self, inputs, training=False):
        """Apply the wrapped layers in order.

        Returns the full output of the last layer when ``training`` is truthy,
        and only its first element (the decoded tag sequence) otherwise.
        """
        output = inputs
        for layer in self.model_layers:
            output = layer(output)
        if training:
            return output
        else:
            return output[0]

    def compute_loss(self, x, y, training=False):
        """Run the model and return ``(viterbi_sequence, sequence_length, mean CRF loss)``.

        ``training`` must be truthy: with ``training=False`` ``call`` returns
        only the decoded sequence and the 4-way unpacking below fails.
        """
        viterbi_sequence, potentials, sequence_length, chain_kernel = self(x, training=training)
        # The CRF loss is the negated log-likelihood (first element of the result).
        crf_loss = -crf_log_likelihood(potentials, y, sequence_length, chain_kernel)[0]
        return viterbi_sequence, sequence_length, tf.reduce_mean(crf_loss)

    def _as_sparse_labels(self, y):
        """Return targets as integer tag ids of shape ``(batch_size, seq_length)``."""
        if self.sparse_target:
            assert len(y.shape) == 2
            return y
        return tf.argmax(y, axis=-1)

    def _update_trackers(self, y, viterbi_sequence, sequence_length, loss):
        """Update the loss tracker and the metric, and return the Keras logs dict."""
        self.loss_tracker.update_state(loss)
        self.metrics_fn.update_state(y, viterbi_sequence, tf.sequence_mask(sequence_length, y.shape[1]))
        return {"loss": self.loss_tracker.result(), self.metrics_fn.name: self.metrics_fn.result()}

    def train_step(self, data):
        """One optimisation step on a batch; ``sample_weight`` is unpacked but not used."""
        x, y, sample_weight = unpack_data(data)
        y = self._as_sparse_labels(y)
        with tf.GradientTape() as tape:
            viterbi_sequence, sequence_length, crf_loss = self.compute_loss(x, y, training=True)
            # Add the layers' regularisation losses to the CRF loss.
            loss = crf_loss + tf.cast(tf.reduce_sum(self.losses), crf_loss.dtype)
        gradients = tape.gradient(loss, self.trainable_variables)
        self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))
        return self._update_trackers(y, viterbi_sequence, sequence_length, loss)

    @property
    def metrics(self):
        """Metrics Keras resets between epochs: the loss tracker and ``metrics_fn``."""
        return [self.loss_tracker, self.metrics_fn]

    def test_step(self, data):
        """One evaluation step on a batch; ``sample_weight`` is unpacked but not used."""
        x, y, sample_weight = unpack_data(data)
        y = self._as_sparse_labels(y)
        # training=True is required so that `call` returns the full CRF tuple.
        # NOTE(review): this presumably also leaves Dropout active during
        # evaluation, because Keras propagates the training flag to nested
        # layers - confirm whether that is intended.
        viterbi_sequence, sequence_length, crf_loss = self.compute_loss(x, y, training=True)
        loss = crf_loss + tf.cast(tf.reduce_sum(self.losses), crf_loss.dtype)
        return self._update_trackers(y, viterbi_sequence, sequence_length, loss)

In [3]:
class MacroF1(keras.metrics.Metric):
    """Streaming macro-averaged F1 over ``n_tags`` integer-encoded classes.

    Per-class counts (true positives, predicted positives, actual positives)
    are accumulated in metric variables across ``update_state`` calls, and the
    F1 score is derived from them in ``result``. The score therefore covers
    every batch seen since the last ``reset_state``, not only the latest one.

    The average is taken over all ``n_tags`` classes; a class with no true and
    no predicted occurrences contributes an F1 of 0.

    Args:
        n_tags: Number of classes; tag ids are expected in ``[0, n_tags)``.
        name: Metric name reported in the Keras logs.
        **kwargs: Forwarded to ``keras.metrics.Metric``.
    """

    def __init__(self, n_tags, name='macro_f1', **kwargs):
        super().__init__(name=name, **kwargs)
        self.n_tags = n_tags
        self.true_positives = self.add_weight(
            name='true_positives', shape=(n_tags,), initializer='zeros')
        self.pred_positives = self.add_weight(
            name='pred_positives', shape=(n_tags,), initializer='zeros')
        self.positives = self.add_weight(
            name='positives', shape=(n_tags,), initializer='zeros')

    def recall_m(self, y_true, y_pred):
        """Recall of binary indicator tensors (kept for callers; unused by ``update_state``)."""
        TP = K.sum(K.round(y_true * y_pred))
        Positives = K.sum(K.round(y_true))

        recall = TP / (Positives+K.epsilon())
        return recall

    def precision_m(self, y_true, y_pred):
        """Precision of binary indicator tensors (kept for callers; unused by ``update_state``)."""
        TP = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        Pred_Positives = K.sum(K.round(K.clip(y_pred, 0, 1)))

        precision = TP / (Pred_Positives+K.epsilon())
        return precision

    def _class_totals(self, indicators):
        """Sum a ``(..., n_tags)`` tensor over every axis except the class axis."""
        flat = tf.reshape(indicators, [-1, self.n_tags])
        return tf.cast(tf.reduce_sum(flat, axis=0), self.true_positives.dtype)

    def update_state(self, y_true, y_pred, sample_weight=None):
        """Accumulate per-class counts for one batch.

        Args:
            y_true: Integer tag ids.
            y_pred: Predicted integer tag ids, same shape as ``y_true``.
            sample_weight: Optional mask/weights with the same shape as
                ``y_true``; positions with weight 0 are ignored.
        """
        y_true = tf.one_hot(tf.cast(y_true, tf.int32), depth=self.n_tags)
        y_pred = tf.one_hot(tf.cast(y_pred, tf.int32), depth=self.n_tags)
        hits = y_true * y_pred
        if sample_weight is not None:
            # Broadcast the per-position weight over the class axis.
            weights = tf.expand_dims(tf.cast(sample_weight, hits.dtype), axis=-1)
            hits = hits * weights
            y_true = y_true * weights
            y_pred = y_pred * weights
        # assign_add updates the metric variables in place, so the state
        # survives across batches instead of being replaced by a new tensor.
        self.true_positives.assign_add(self._class_totals(hits))
        self.pred_positives.assign_add(self._class_totals(y_pred))
        self.positives.assign_add(self._class_totals(y_true))

    def result(self):
        """Return the mean of the per-class F1 scores as a scalar tensor."""
        eps = K.epsilon()
        precision = self.true_positives / (self.pred_positives + eps)
        recall = self.true_positives / (self.positives + eps)
        f_classes = 2 * ((precision * recall) / (precision + recall + eps))
        return tf.math.reduce_mean(f_classes)

    def reset_state(self):
        """Zero the accumulated counts while keeping the variables and their shapes."""
        for counter in (self.true_positives, self.pred_positives, self.positives):
            counter.assign(tf.zeros_like(counter))

In [4]:
class NERModel(keras.Model):
    """Sequence tagger: BiLSTM -> LSTM -> time-distributed Dense -> CRF, with dropout in between.

    The layers are stored as attributes in the same order in which ``call``
    applies them. ``ModelWithCRFLoss`` iterates ``.layers`` of the model it
    wraps, so do not reorder these assignments without checking that wrapper
    and any saved checkpoints.

    Args:
        num_tags: Passed to the ``CRF`` layer.
        maxlen: Sequence length used only for the ``input_shape`` hint of the
            first layer.
        embed_dim: Input feature size; also the unit count of the
            bidirectional LSTM (the second LSTM uses ``embed_dim * 2`` units).
    """

    def __init__(
        self, num_tags, maxlen=100, embed_dim=1024
    ):
        super().__init__()
        self.bilstm = Bidirectional(LSTM(units=embed_dim, return_sequences=True,),
                                    input_shape=(maxlen, embed_dim,))
        self.dropout1 = Dropout(0.3)
        # L1 regularisation on the kernels of the second LSTM and the Dense layer.
        self.lstm =LSTM(units=embed_dim * 2, return_sequences=True, kernel_regularizer=L1L2(l1=0.01, l2=0.0))
        self.dropout2 = Dropout(0.3)
        self.dense = TimeDistributed(Dense(512, activation="relu", kernel_regularizer=L1L2(l1=0.01, l2=0.0)))
        self.dropout3 = Dropout(0.5)
        self.crf = CRF(num_tags)

    def call(self, inputs, training=False):
        """Apply the layers in order and return the CRF layer's output.

        ``training`` is not forwarded explicitly to the sub-layers here.
        """
        x = self.bilstm(inputs)
        x = self.dropout1(x)
        x = self.lstm(x)
        x = self.dropout2(x)
        x = self.dense(x)
        x = self.dropout3(x)
        x = self.crf(x)
        
        return x

In [5]:
# Metric and network are both built for 8 tag ids. `tags_inv` lists a ninth
# label (id 8, 'STRUCT'), which is only ever assigned by the regex rules.
macro_f1 = MacroF1(n_tags = 8)

# Wrap the tagger so training uses the CRF loss; no `loss=` is passed to
# compile because the wrapper defines its own train_step/test_step.
ner_model = ModelWithCRFLoss(NERModel(num_tags=8), metrics_fn=macro_f1)
ner_model.compile(optimizer="adam")

In [6]:
# ner_model.load_weights('./checkpoints/elmo_bilstm_crf/best')
# ner_model.save_weights('./models/elmo_bilstm_crf/elmo_bilstm_crf')

In [7]:
# Restore trained weights from a checkpoint path relative to the working directory.
# NOTE(review): the model has not been called yet at this point, so the
# variables are presumably restored lazily on first use - confirm.
ner_model.load_weights('./models/elmo_bilstm_crf')

<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x186cb2bed30>

In [8]:
# Tag labels ordered by integer tag id; the list position is the id.
tag_names = [
    '[PAD]',
    'O',
    'B-PER',
    'I-PER',
    'B-ORG',
    'I-ORG',
    'B-LOC',
    'I-LOC',
    'STRUCT',
]

# Mapping from integer tag id to its label.
tags_inv = dict(enumerate(tag_names))

In [9]:
# Load the ELMo embedder through TensorFlow Hub; `neural_tags` calls its 'tokens' signature.
# NOTE(review): the handle looks like a local directory relative to the working directory - confirm.
elmo = hub.load(r'elmo_ru-news_wmt11-16_1.5M_steps')



In [10]:
def neural_tags(sentences, maxlen=100):
    """Tag tokenised sentences with the ELMo + BiLSTM-CRF model.

    Sentences longer than ``maxlen`` tokens are split into consecutive chunks
    of at most ``maxlen`` tokens, tagged chunk by chunk and stitched back
    together, so the returned tag array always has one tag per input token.
    Previously such sentences were truncated by ``pad_sequences`` while their
    original length was still passed on, which misaligned tokens and tags.

    Args:
        sentences: List of sentences, each a list of token strings.
        maxlen: Number of tokens per row fed to the model.

    Returns:
        A list of ``[tokens, tags]`` pairs, one per input sentence, where
        ``tags`` is a 1-D array of tag ids with ``len(tokens)`` entries.
    """
    if not sentences:
        return []

    # Break every sentence into rows of at most `maxlen` tokens and remember
    # which sentence each row belongs to. An empty sentence still yields one
    # (empty) row so that it is represented in the output.
    chunks, owners = [], []
    for idx, sent in enumerate(sentences):
        for start in range(0, max(len(sent), 1), maxlen):
            chunks.append(sent[start:start + maxlen])
            owners.append(idx)
    ls = [len(chunk) for chunk in chunks]

    emb = tf.keras.utils.pad_sequences(chunks, maxlen=maxlen, padding='post', value='', dtype=object)
    emb = elmo.signatures['tokens'](tokens=tf.cast(emb, tf.string), sequence_len = tf.cast(ls, tf.int32))['elmo']
    tags = ner_model.predict(emb)

    # Drop the padding of every row and group the rows by source sentence.
    pieces = [[] for _ in sentences]
    for owner, row, length in zip(owners, tags, ls):
        pieces[owner].append(row[:length])

    out = [[sent, np.concatenate(parts)] for sent, parts in zip(sentences, pieces)]
    tf.keras.backend.clear_session()
    return out

In [11]:
def print_tags(data):
    """Print one tagged sentence as a two-column table of token and tag label.

    Args:
        data: A ``(tokens, tags)`` pair; every tag must be a key of ``tags_inv``.
    """
    words, tag_ids = data
    labels = list(map(tags_inv.__getitem__, tag_ids))
    rows = zip(words, labels)
    print(tabulate(rows))

In [12]:
def tokenize_text(text):
    """Split ``text`` into sentences and each sentence into token strings.

    Returns:
        A list with one entry per sentence, each a list of token texts.
    """
    tokenized = []
    for sentence in sentenize(text):
        words = [token.text for token in tokenize(sentence.text)]
        tokenized.append(words)
    return tokenized

def neural_predict(text):
    """Tokenise ``text``, tag it with the neural model and print one table per sentence."""
    tagged_sentences = neural_tags(tokenize_text(text))
    for tagged in tagged_sentences:
        print_tags(tagged)

In [13]:
# Regular expressions for structured identifiers. `rules_tags` gives tag id 8
# ('STRUCT') to every token that starts inside a match. None of the patterns
# is anchored, so each can also match inside a longer run of digits.

# Phone number: optional "8" / "+7" prefix, optional 3-digit code (optionally
# in parentheses), then 7-16 characters that are digits, dashes or whitespace.
tel_pattern = r'(((8|\+7)[\- \s]?)?(\(?\d{3}\)?[\-\s]?)?[\d\-\s]{7,16})'
# INN: a run of 12 digits or, failing that, 10 digits.
inn_pattern = r'(([0-9]{12})|([0-9]{10}))'
# SNILS: three 3-digit groups, each optionally followed by a dash or a space,
# then an optional space and two digits.
snils_pattern = r'((\d{3}[\- ]?){3} ?\d{2})'
# Passport: four digits (optionally split 2+2 by one whitespace character),
# up to 10 non-digit characters, then six digits. The non-digit filler is part
# of the match, so tokens inside it are tagged as well.
pass_pattern = r'((\d{2}\s?\d{2})\D{0,10}(\d{6}))'
# Presumably a medical insurance (OMS) policy number, judging by the name:
# four 4-digit groups, each optionally followed by whitespace or a dash.
oms_pattern = r'((\d{4}[\s-]?){4})'

In [14]:
# All patterns applied by `rules_tags`; every pattern is run over every sentence.
rules = [tel_pattern, inn_pattern, snils_pattern, pass_pattern, oms_pattern]

In [15]:
def tokenize_with_pos(text):
    """Split ``text`` into sentences and tokens, keeping token offsets.

    Each sentence is tokenised once and both the token texts and their
    ``(start, stop)`` offsets are taken from that single pass.

    Returns:
        A tuple ``(sentences, tokens, token_pos)`` of equally long lists:
        sentence texts, per-sentence lists of token texts, and per-sentence
        lists of ``(start, stop)`` offsets as reported by the tokenizer for
        the sentence text it was given.
    """
    sentences = [sentence.text for sentence in sentenize(text)]
    tokens = []
    token_pos = []
    for sentence in sentences:
        parts = list(tokenize(sentence))
        tokens.append([part.text for part in parts])
        token_pos.append([(part.start, part.stop) for part in parts])
    return sentences, tokens, token_pos

In [16]:
def rules_tags(text):
    """Tag tokens that fall inside a match of any pattern in ``rules``.

    Returns:
        A list with one ``uint8`` array per sentence. A token gets the value 8
        when its start offset lies inside a regex match, and 0 otherwise.
    """
    sentences, tokens, token_pos = tokenize_with_pos(text)

    tagged = []
    for sentence, words, positions in zip(sentences, tokens, token_pos):
        sentence_tags = np.zeros(len(words), dtype=np.uint8)
        starts = np.array([begin for begin, _ in positions])

        for pattern in rules:
            for hit in re.finditer(pattern, sentence):
                # A token belongs to the match when it starts inside it.
                inside = (starts >= hit.start()) & (starts < hit.end())
                sentence_tags[inside] = 8

        tagged.append(sentence_tags)

    return tagged

In [17]:
def hybrid_out(text):
    """Combine rule-based and neural tags, then tag numbers that follow a location.

    Rule tags take precedence; positions the rules left at 0 receive the
    neural tag. Afterwards a token that starts with a digit and is still
    tagged 0 or 1 is re-tagged as 8 when it appears shortly after a location
    token (tag 6 or 7), or after another number tagged this way, with no
    person/organisation token (tags 2-5) in between.

    "Shortly" is enforced by a token counter. The counter used to be reset but
    never incremented, so the ``count > 4`` cut-off could never trigger; it is
    now incremented for every token that is neither a location nor a freshly
    tagged number.

    Returns:
        A list of ``[tokens, tags]`` pairs, one per sentence.
    """
    loc_tags = (6, 7)
    entity_tags = (2, 3, 4, 5)
    struct_tag = 8
    max_gap = 4

    neural_out = neural_tags(tokenize_text(text))
    rule_out = rules_tags(text)
    h_out = []
    for i, (sent, n_tags) in enumerate(neural_out):
        h_tags = rule_out[i]
        n_tags = np.asarray(n_tags)
        # Fill positions the rules left untagged. Only the prefix covered by
        # both arrays is merged, so a shorter neural tag array no longer
        # raises; tokens beyond it keep the rule tag.
        shared = min(len(h_tags), len(n_tags))
        unset = np.flatnonzero(h_tags[:shared] == 0)
        h_tags[unset] = n_tags[unset]

        was_loc = False
        count = 0  # tokens seen since the last location or tagged number
        for j, token in enumerate(sent):
            tag = h_tags[j]
            if tag in loc_tags:
                was_loc = True
                count = 0
            elif tag in entity_tags or count > max_gap:
                was_loc = False

            if was_loc and tag <= 1 and re.match(r'\d{1,4}', token):
                h_tags[j] = struct_tag
                count = 0
            elif tag not in loc_tags:
                count += 1
        h_out.append([sent, h_tags])
    return h_out

In [18]:
# Sample input for the demo below: two Russian sentences containing names,
# addresses and structured identifiers (INN, SNILS, passport, phone number).
text = '''\
Пожалуйста, предоставьте Михаилу Ивановичу Резниченко (ИНН: 1112223334, снилс 111-111-111 99) офис по адресу г. Москва, улица Орджоникидзе, 15.
Я, Иван Платонович Клопик, паспорт серия 1234 номер 123456, тел: 8 (925) 111 22 33, проживающий по адресу: г. Воркута, ул. Карла Маркса, 34с1, кв. 123, встретил Людмилу Штепсель вчера в 11 часов вечера.
'''

In [19]:
def hybrid_predict(text):
    """Run the hybrid (rules + neural) tagger on ``text`` and print one table per sentence."""
    tagged_sentences = hybrid_out(text)
    for tagged in tagged_sentences:
        print_tags(tagged)

In [20]:
# Tag the sample text with the hybrid tagger and print one table per sentence.
hybrid_predict(text)

------------  ------
Пожалуйста    O
,             O
предоставьте  O
Михаилу       B-PER
Ивановичу     I-PER
Резниченко    I-PER
(             O
ИНН           O
:             O
1112223334    STRUCT
,             O
снилс         O
111-111-111   STRUCT
99            STRUCT
)             O
офис          O
по            O
адресу        O
г             O
.             O
Москва        B-LOC
,             O
улица         O
Орджоникидзе  B-LOC
,             O
15            STRUCT
.             O
------------  ------
-----------  ------
Я            O
,            O
Иван         B-PER
Платонович   I-PER
Клопик       I-PER
,            O
паспорт      O
серия        O
1234         STRUCT
номер        STRUCT
123456       STRUCT
,            O
тел          O
:            O
8            STRUCT
(            STRUCT
925          STRUCT
)            STRUCT
111          STRUCT
22           STRUCT
33           STRUCT
,            O
проживающий  O
по           O
адресу       O
:            O
г            O