In [1]:
import numpy as np
from collections import Counter
from keras_preprocessing.sequence import pad_sequences
import pickle
import platform

In [2]:
def _parse_data(fh):
    #  in windows the new line is '\r\n\r\n' the space is '\r\n' . so if you use windows system,
    #  you have to use recorsponding instructions    
    
    if platform.system() == 'Windows':
        split_text = '\n'
    else:
        split_text = '\n'
    
    string = fh.read().decode('utf-8')
    data = [[row.split() for row in sample.split(split_text)] for
            sample in
            string.strip().split(split_text + split_text)]
    fh.close()
    return data

In [3]:
def _process_data(data, vocab, chunk_tags, maxlen=None, onehot=False):
    if maxlen is None:
        maxlen = max(len(s) for s in data)
    word2idx = dict((w, i) for i, w in enumerate(vocab))
    x = [[word2idx.get(w[0].lower(), 1) for w in s] for s in data]
    y_chunk = [[chunk_tags.index(w[1]) for w in s] for s in data]
    
    x = pad_sequences(x, maxlen)
    y_chunk = pad_sequences(y_chunk, maxlen, value=-1)
    
    if onehot:
        y_chunk = np.eye(len(chunk_tags), dtype='float32')[y_chunk]
    else:
        y_chunk = np.expand_dims(y_chunk, 2)
    return x, y_chunk

In [4]:
def process_data(data, vocab, maxlen=100):
    word2idx = dict((w, i) for i, w in enumerate(vocab))
    x = [word2idx.get(w[0].lower(), 100) for w in data]
    length = len(x)
    x = pad_sequences([x], maxlen)
    return x, length

In [5]:
def load_data():
    train = _parse_data(open('data/train_data.data', 'rb'))
    test = _parse_data(open('data/test_data.data', 'rb'))
    
    word_counts = Counter(row[0].lower() for sample in train for row in sample)
    vocab = [w for w, f in iter(word_counts.items()) if f >= 2]
    chunk_tags = ['O', 'B-PER', 'I-PER', 'B-LOC', 'I-LOC', "B-ORG", "I-ORG"]
    
    with open('models/config.pkl', 'wb') as outp:
        pickle.dump((vocab, chunk_tags), outp)
    
    # save initial config data
    train = _process_data(train, vocab, chunk_tags)
    test = _process_data(test, vocab, chunk_tags)
    return train, test, (vocab, chunk_tags)

In [6]:
train, test, (vocab, chunk_tags) = load_data()

In [7]:
from keras.models import Sequential
from keras.layers import Embedding, Bidirectional, LSTM
from keras_contrib.layers import CRF
import pickle

Using TensorFlow backend.


In [8]:
EMBED_DIM = 200
BIRNN_UNITS = 200

In [9]:
def create_model(train=True):
    if train:
        (train_x, train_y), (test_x, test_y), (vocab, chunk_tags) = load_data()
    else:
        with open('models/config.pkl', 'rb') as inp:
            (vocab, chunk_tags) = pickle.load(inp)
    
    model = Sequential()
    model.add(Embedding(len(vocab), EMBED_DIM, mask_zero=True))
    model.add(Bidirectional(LSTM(BIRNN_UNITS // 2, return_sequences=True)))
    crf = CRF(len(chunk_tags), sparse_target=True)
    model.add(crf)
    model.summary()
    model.compile('adam', loss=crf.loss_function, metrics=[crf.accuracy])
    if train:
        return model, (train_x, train_y), (test_x, test_y)
    else:
        return model, (vocab, chunk_tags)

In [10]:
EPOCH = 10

In [11]:
model, (train_x, train_y), (test_x, test_y) = create_model()

W0723 12:32:18.580166  6724 deprecation_wrapper.py:119] From D:\Anaconda3\envs\tensorflow\lib\site-packages\keras\backend\tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0723 12:32:18.592107  6724 deprecation_wrapper.py:119] From D:\Anaconda3\envs\tensorflow\lib\site-packages\keras\backend\tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0723 12:32:18.594101  6724 deprecation_wrapper.py:119] From D:\Anaconda3\envs\tensorflow\lib\site-packages\keras\backend\tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0723 12:32:18.808525  6724 deprecation.py:323] From D:\Anaconda3\envs\tensorflow\lib\site-packages\keras\backend\tensorflow_backend.py:2974: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 200)         851600    
_________________________________________________________________
bidirectional_1 (Bidirection (None, None, 200)         240800    
_________________________________________________________________
crf_1 (CRF)                  (None, None, 7)           1470      
Total params: 1,093,870
Trainable params: 1,093,870
Non-trainable params: 0
_________________________________________________________________


In [12]:
model.fit(train_x, train_y, batch_size=32, epochs=EPOCH, validation_data=[test_x, test_y])

W0723 12:32:20.682747  6724 deprecation_wrapper.py:119] From D:\Anaconda3\envs\tensorflow\lib\site-packages\keras\backend\tensorflow_backend.py:986: The name tf.assign_add is deprecated. Please use tf.compat.v1.assign_add instead.



Train on 50658 samples, validate on 4631 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x277adab7a90>

In [13]:
model.save('models/crf.h5')

In [14]:
predict_model, (vocab, chunk_tags) = create_model(train=False)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, None, 200)         851600    
_________________________________________________________________
bidirectional_2 (Bidirection (None, None, 200)         240800    
_________________________________________________________________
crf_2 (CRF)                  (None, None, 7)           1470      
Total params: 1,093,870
Trainable params: 1,093,870
Non-trainable params: 0
_________________________________________________________________




In [16]:
predict_model.load_weights('models/crf.h5')

In [23]:
predict_text = '中华人民共和国国务院总理周恩来在外交部长陈毅的陪同下，连续访问了埃塞俄比亚等非洲10国以及阿尔巴尼亚'
# predict_text = '我留言板球球群你们加下找我要回关'
text, length = process_data(predict_text, vocab)
raw = model.predict(text)[0][-length:]
result = [np.argmax(row) for row in raw]
result_tags = [chunk_tags[i] for i in result]

In [24]:
per, loc, org = '', '', ''

for s, t in zip(predict_text, result_tags):
    if t in ('B-PER', 'I-PER'):
        per += ' ' + s if (t == 'B-PER') else s
    if t in ('B-ORG', 'I-ORG'):
        org += ' ' + s if (t == 'B-ORG') else s
    if t in ('B-LOC', 'I-LOC'):
        loc += ' ' + s if (t == 'B-LOC') else s

In [25]:
print(['person:' + per, 'location:' + loc, 'organzation:' + org])

['person: 周恩来 陈毅', 'location: 埃塞俄比亚 非洲 阿尔巴尼亚', 'organzation: 中华人民共和国国务院 外交部']
