# 使用 LSTM 进行命名实体识别



In [1]:
import time
from os.path import join
from codecs import open

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence

import torch.optim as optim

## 读取数据

In [2]:
def build_corpus(split, make_vocab=True, data_dir="./ResumeNER"):
    """读取数据"""
    assert split in ['train', 'dev', 'test']

    word_lists = []
    tag_lists = []
    with open(join(data_dir, split+".char.bmes"), 'r', encoding='utf-8') as f:
        word_list = []
        tag_list = []
        for line in f:
            if line != '\n':
                word, tag = line.strip('\n').split()
                word_list.append(word)
                tag_list.append(tag)
            else:
                word_lists.append(word_list)
                tag_lists.append(tag_list)
                word_list = []
                tag_list = []

    # 如果make_vocab为True，还需要返回word2id和tag2id
    if make_vocab:
        word2id = build_map(word_lists)
        tag2id = build_map(tag_lists)
        return word_lists, tag_lists, word2id, tag2id
    else:
        return word_lists, tag_lists


def build_map(lists):
    maps = {}
    for list_ in lists:
        for e in list_:
            if e not in maps:
                maps[e] = len(maps)

    return maps


In [3]:
train_word_lists, train_tag_lists, word2id, tag2id = build_corpus("train")
dev_word_lists, dev_tag_lists = build_corpus("dev", make_vocab=False)
test_word_lists, test_tag_lists = build_corpus("test", make_vocab=False)

In [4]:
print("训练集输入数据：", ''.join(train_word_lists[0]))
print("训练集标签数据：", ' '.join(train_tag_lists[0]))

训练集输入数据： 高勇：男，中国国籍，无境外居留权，
训练集标签数据： B-NAME E-NAME O O O B-CONT M-CONT M-CONT E-CONT O O O O O O O O


- LSTM模型训练的时候需要在word2id和tag2id加入PAD和UNK
- 如果是加了CRF的lstm还要加入<start>和<end> (解码的时候需要用到)

In [5]:
def extend_maps(word2id, tag2id):
    '''
    添加 PAD 和 UNK 索引
    '''
    word2id['<unk>'] = len(word2id)
    word2id['<pad>'] = len(word2id)
    tag2id['<unk>'] = len(tag2id)
    tag2id['<pad>'] = len(tag2id)
    
    return word2id, tag2id


In [6]:
# LSTM模型训练的时候需要在word2id和tag2id加入PAD和UNK
bilstm_word2id, bilstm_tag2id = extend_maps(word2id, tag2id)
print('UNK 的索引：{}'.format(bilstm_word2id.get('<unk>')))
print('PAD 的索引：{}'.format(bilstm_word2id.get('<pad>')))

UNK 的索引：1792
PAD 的索引：1793


## 建立模型

In [7]:
class BiLSTM(nn.Module):
    def __init__(self, vocab_size, emb_size, hidden_size, out_size):
        """初始化参数：
            vocab_size:字典的大小
            emb_size:词向量的维数
            hidden_size：隐向量的维数
            out_size:标注的种类
        """
        super(BiLSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.bilstm = nn.LSTM(emb_size, hidden_size,
                              batch_first=True,
                              bidirectional=True)

        self.lin = nn.Linear(2*hidden_size, out_size)

    def forward(self, sents_tensor, lengths):
        emb = self.embedding(sents_tensor)  # [B, L, emb_size]

        packed = pack_padded_sequence(emb, lengths, batch_first=True)
        rnn_out, _ = self.bilstm(packed)
        # rnn_out:[B, L, hidden_size*2]
        rnn_out, _ = pad_packed_sequence(rnn_out, batch_first=True)

        scores = self.lin(rnn_out)  # [B, L, out_size]

        return scores

    def test(self, sents_tensor, lengths):
        """
        测试
        """
        logits = self.forward(sents_tensor, lengths)  # [B, L, out_size]
        _, batch_tagids = torch.max(logits, dim=2)

        return batch_tagids


In [8]:
# ******** LSTM模型 工具函数*************

def tensorized(batch, maps):
    '''
    将分割好的 batch 数据，转换为等长的tensor
    参数
    batch：输入的 batch 数据，列表
    maps：字到索引的字典
    '''
    PAD = maps.get('<pad>')
    UNK = maps.get('<unk>')

    max_len = len(batch[0])  # 由于排序了，所以选取第一个最长字符串为 max_len
    batch_size = len(batch)
    # 构建 输入 tensor
    batch_tensor = torch.ones(batch_size, max_len).long() * PAD
    for i, l in enumerate(batch):
        for j, e in enumerate(l):
            batch_tensor[i][j] = maps.get(e, UNK)
    # batch各个元素的长度
    lengths = [len(l) for l in batch]

    return batch_tensor, lengths


def sort_by_lengths(word_lists, tag_lists):
    '''
    将数据排序，最长的句子在第一个，为使用函数pack_padded_sequence() 做准备
    '''
    pairs = list(zip(word_lists, tag_lists))
    # 根据 序列长度，对序列的 index 排序！
    indices = sorted(range(len(pairs)),
                     key=lambda k: len(pairs[k][0]),
                     reverse=True)
    pairs = [pairs[i] for i in indices]
    # pairs.sort(key=lambda pair: len(pair[0]), reverse=True)
    # 添加 * 表示 解压
    word_lists, tag_lists = list(zip(*pairs))

    return word_lists, tag_lists, indices


In [9]:
def cal_loss(logits, targets, tag2id):
    """
    计算损失
    参数:
        logits: [B, L, out_size]
        targets: [B, L]
        lengths: [B]
    """
    PAD = tag2id.get('<pad>')
    assert PAD is not None
    # 选取非 PAD 数据
    mask = (targets != PAD)  # [B, L]
    targets = targets[mask]
    out_size = logits.size(2)
    logits = logits.masked_select(  # 返回一维张量
        mask.unsqueeze(2).expand(-1, -1, out_size)  # 将mask 扩展成 [B, L, out_size] 
    ).contiguous().view(-1, out_size)

    assert logits.size(0) == targets.size(0)
    loss = F.cross_entropy(logits, targets)

    return loss


## 参数设置

In [11]:
# 设置GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [12]:
# 设置lstm训练参数
class TrainingConfig(object):
    batch_size = 64
    # 学习速率
    lr = 0.001
    epoches = 8
    print_step = 5


class LSTMConfig(object):
    emb_size = 128  # 词向量的维数
    hidden_size = 128  # lstm隐向量的维数


In [13]:
# vocab_size:词典大小
# out_size:标注种类
vocab_size = len(word2id)
out_size = len(tag2id)
# 加载模型参数
emb_size = LSTMConfig.emb_size
hidden_size = LSTMConfig.hidden_size

model = BiLSTM(vocab_size, emb_size, hidden_size, out_size).to(device)

In [14]:
# 加载训练参数：
epoches = TrainingConfig.epoches
print_step = TrainingConfig.print_step
lr = TrainingConfig.lr
batch_size = TrainingConfig.batch_size


In [15]:
# 初始化优化器
optimizer = optim.Adam(model.parameters(), lr=lr)

In [16]:
# 初始化其他指标
step = 0
best_val_loss = 1e18
best_model = None

## 训练

In [17]:
def validate(dev_word_lists, dev_tag_lists, word2id, tag2id):
    model.eval()
    with torch.no_grad():
        val_losses = 0.
        val_step = 0
        for ind in range(0, len(dev_word_lists), batch_size):
            val_step += 1
            # 准备batch数据
            batch_sents = dev_word_lists[ind:ind+batch_size]
            batch_tags = dev_tag_lists[ind:ind+batch_size]
            tensorized_sents, lengths = tensorized(batch_sents, word2id)
            tensorized_sents = tensorized_sents.to(device)
            targets, lengths = tensorized(batch_tags, tag2id)
            targets = targets.to(device)

            # forward
            scores = model(tensorized_sents, lengths)

            # 计算损失
            loss = cal_loss(scores, targets, tag2id).to(device)
            val_losses += loss.item()
        val_loss = val_losses / val_step

        if val_loss < best_val_loss:
            print("保存模型...")
            best_model = model
            _best_val_loss = val_loss

        return val_loss


In [18]:
# 对数据集按照长度进行排序
word_lists, tag_lists, _ = sort_by_lengths(train_word_lists, train_tag_lists)
dev_word_lists, dev_tag_lists, _ = sort_by_lengths(dev_word_lists, dev_tag_lists)

B = batch_size
for e in range(1, epoches+1):
    step = 0
    losses = 0.
    for ind in range(0, len(word_lists), B):
        batch_sents = word_lists[ind:ind+B]
        batch_tags = tag_lists[ind:ind+B]
        
        model.train()  # 什么时候需要设置模型用于训练？
        step += 1
        # 准备数据
        tensorized_sents, lengths = tensorized(batch_sents, word2id)
        tensorized_sents = tensorized_sents.to(device)
        targets, lengths = tensorized(batch_tags, tag2id)
        targets = targets.to(device)

        # forward
        scores = model(tensorized_sents, lengths)

        # 计算损失 更新参数
        optimizer.zero_grad()
        loss = cal_loss(scores, targets, tag2id).to(device)
        loss.backward()
        optimizer.step()

        losses += loss.item()

        if step % TrainingConfig.print_step == 0:
            total_step = (len(word_lists) // B + 1)
            print("Epoch {}, step/total_step: {}/{} {:.2f}% Loss:{:.4f}".format(
                e, step, total_step,
                100. * step / total_step,
                losses / print_step
            ))
            losses = 0.

    # 每轮结束测试在验证集上的性能，保存最好的一个
    val_loss = validate(dev_word_lists, dev_tag_lists, word2id, tag2id)
    print("Epoch {}, Val Loss:{:.4f}".format(e, val_loss))


Epoch 1, step/total_step: 5/60 8.33% Loss:3.2675
Epoch 1, step/total_step: 10/60 16.67% Loss:2.9339
Epoch 1, step/total_step: 15/60 25.00% Loss:2.3871
Epoch 1, step/total_step: 20/60 33.33% Loss:1.5599
Epoch 1, step/total_step: 25/60 41.67% Loss:1.2821
Epoch 1, step/total_step: 30/60 50.00% Loss:1.1483
Epoch 1, step/total_step: 35/60 58.33% Loss:1.0458
Epoch 1, step/total_step: 40/60 66.67% Loss:1.0502
Epoch 1, step/total_step: 45/60 75.00% Loss:0.9976
Epoch 1, step/total_step: 50/60 83.33% Loss:0.9972
Epoch 1, step/total_step: 55/60 91.67% Loss:1.4571
Epoch 1, step/total_step: 60/60 100.00% Loss:1.4277
保存模型...
Epoch 1, Val Loss:0.9873
Epoch 2, step/total_step: 5/60 8.33% Loss:1.2247
Epoch 2, step/total_step: 10/60 16.67% Loss:0.8507
Epoch 2, step/total_step: 15/60 25.00% Loss:0.7380
Epoch 2, step/total_step: 20/60 33.33% Loss:0.6014
Epoch 2, step/total_step: 25/60 41.67% Loss:0.5827
Epoch 2, step/total_step: 30/60 50.00% Loss:0.5087
Epoch 2, step/total_step: 35/60 58.33% Loss:0.4742
E

Epoch 13, step/total_step: 45/60 75.00% Loss:0.0243
Epoch 13, step/total_step: 50/60 83.33% Loss:0.0257
Epoch 13, step/total_step: 55/60 91.67% Loss:0.0228
Epoch 13, step/total_step: 60/60 100.00% Loss:0.0169
保存模型...
Epoch 13, Val Loss:0.1459
Epoch 14, step/total_step: 5/60 8.33% Loss:0.0784
Epoch 14, step/total_step: 10/60 16.67% Loss:0.0550
Epoch 14, step/total_step: 15/60 25.00% Loss:0.0514
Epoch 14, step/total_step: 20/60 33.33% Loss:0.0293
Epoch 14, step/total_step: 25/60 41.67% Loss:0.0332
Epoch 14, step/total_step: 30/60 50.00% Loss:0.0244
Epoch 14, step/total_step: 35/60 58.33% Loss:0.0266
Epoch 14, step/total_step: 40/60 66.67% Loss:0.0255
Epoch 14, step/total_step: 45/60 75.00% Loss:0.0200
Epoch 14, step/total_step: 50/60 83.33% Loss:0.0214
Epoch 14, step/total_step: 55/60 91.67% Loss:0.0188
Epoch 14, step/total_step: 60/60 100.00% Loss:0.0135
保存模型...
Epoch 14, Val Loss:0.1490
Epoch 15, step/total_step: 5/60 8.33% Loss:0.0702
Epoch 15, step/total_step: 10/60 16.67% Loss:0.047

Epoch 26, step/total_step: 15/60 25.00% Loss:0.0117
Epoch 26, step/total_step: 20/60 33.33% Loss:0.0053
Epoch 26, step/total_step: 25/60 41.67% Loss:0.0061
Epoch 26, step/total_step: 30/60 50.00% Loss:0.0042
Epoch 26, step/total_step: 35/60 58.33% Loss:0.0052
Epoch 26, step/total_step: 40/60 66.67% Loss:0.0045
Epoch 26, step/total_step: 45/60 75.00% Loss:0.0040
Epoch 26, step/total_step: 50/60 83.33% Loss:0.0042
Epoch 26, step/total_step: 55/60 91.67% Loss:0.0035
Epoch 26, step/total_step: 60/60 100.00% Loss:0.0026
保存模型...
Epoch 26, Val Loss:0.1472
Epoch 27, step/total_step: 5/60 8.33% Loss:0.0165
Epoch 27, step/total_step: 10/60 16.67% Loss:0.0090
Epoch 27, step/total_step: 15/60 25.00% Loss:0.0102
Epoch 27, step/total_step: 20/60 33.33% Loss:0.0045
Epoch 27, step/total_step: 25/60 41.67% Loss:0.0052
Epoch 27, step/total_step: 30/60 50.00% Loss:0.0038
Epoch 27, step/total_step: 35/60 58.33% Loss:0.0046
Epoch 27, step/total_step: 40/60 66.67% Loss:0.0038
Epoch 27, step/total_step: 45/6

# 待整理

In [31]:
def bilstm_train_and_eval(train_data, dev_data, test_data,
                          word2id, tag2id, crf=True, remove_O=False):
    train_word_lists, train_tag_lists = train_data
    dev_word_lists, dev_tag_lists = dev_data
    test_word_lists, test_tag_lists = test_data

    start = time.time()
    
    bilstm_model = BILSTM_Model(vocab_size, out_size, crf=crf)
    bilstm_model.train(train_word_lists, train_tag_lists,
                       dev_word_lists, dev_tag_lists, word2id, tag2id)

    model_name = "bilstm_crf" if crf else "bilstm"
    save_model(bilstm_model, "./ckpts/"+model_name+".pkl")

    print("训练完毕,共用时{}秒.".format(int(time.time()-start)))
    print("评估{}模型中...".format(model_name))
    pred_tag_lists, test_tag_lists = bilstm_model.test(
        test_word_lists, test_tag_lists, word2id, tag2id)

    metrics = Metrics(test_tag_lists, pred_tag_lists, remove_O=remove_O)
    metrics.report_scores()
    metrics.report_confusion_matrix()

    return pred_tag_lists


In [29]:
def test(self, word_lists, tag_lists, word2id, tag2id):
    """返回最佳模型在测试集上的预测结果"""
    # 准备数据
    word_lists, tag_lists, indices = sort_by_lengths(word_lists, tag_lists)
    tensorized_sents, lengths = tensorized(word_lists, word2id)
    tensorized_sents = tensorized_sents.to(self.device)

    self.best_model.eval()
    with torch.no_grad():
        batch_tagids = self.best_model.test(
            tensorized_sents, lengths, tag2id)

    # 将id转化为标注
    pred_tag_lists = []
    id2tag = dict((id_, tag) for tag, id_ in tag2id.items())
    for i, ids in enumerate(batch_tagids):
        tag_list = []
        if self.crf:
            for j in range(lengths[i] - 1):  # crf解码过程中，end被舍弃
                tag_list.append(id2tag[ids[j].item()])
        else:
            for j in range(lengths[i]):
                tag_list.append(id2tag[ids[j].item()])
        pred_tag_lists.append(tag_list)

    # indices存有根据长度排序后的索引映射的信息
    # 比如若indices = [1, 2, 0] 则说明原先索引为1的元素映射到的新的索引是0，
    # 索引为2的元素映射到新的索引是1...
    # 下面根据indices将pred_tag_lists和tag_lists转化为原来的顺序
    ind_maps = sorted(list(enumerate(indices)), key=lambda e: e[1])
    indices, _ = list(zip(*ind_maps))
    pred_tag_lists = [pred_tag_lists[i] for i in indices]
    tag_lists = [tag_lists[i] for i in indices]

    return pred_tag_lists, tag_lists


In [None]:
def save_model(model, file_name):
    """用于保存模型"""
    with open(file_name, "wb") as f:
        pickle.dump(model, f)


def load_model(file_name):
    """用于加载模型"""
    with open(file_name, "rb") as f:
        model = pickle.load(f)
    return model
