In [1]:
import os
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"

"""
cwd_path: 当前工作路径
knowledge_path: 知识库读取路径
train_path: 训练数据读取路径
test_path: 测试数据读取路径
stopwords_path: 停用词读取路径
embedding_path: 中文词向量读取路径
"""

cwd_path = os.getcwd()
knowledge_path = os.path.join(cwd_path, 'QA_system_knowledge.txt')
train_path = os.path.join(cwd_path, 'QA_system_train.txt')
test_path = os.path.join(cwd_path, 'QA_system_test.txt')
stopwords_path = os.path.join(cwd_path, 'QA_system_stop_words.txt')
embedding_path = '/Users/xujie/Desktop/Python/nlp/data/word_embeding'

In [2]:
# 定义函数load_stopwords(), 读取停用词
def load_stopwords(filename):
    """
    IN: 
       filename: 停用词文件保存路径
    OUT:
       stopwords: 加载好的停用词数据
    """
    # 读取停用词, 并输出
    stopwords = set()
    for line in open(filename):
        stopwords.add(line.strip())
    return stopwords

stopwords = load_stopwords(stopwords_path)

In [3]:
import warnings
warnings.filterwarnings('ignore')
import jieba.posseg as psg
    
# 基于jieba库, 定义分词函数tokenizer()
def tokenizer(filename, stopwords):
    """
    IN:
       filename:  目标文本文件保存路径
       stopwords: 加载好的停用词数据
    OUT:
       texts: 去除停用词和低频词后的分词文本
    """
    # 读取文件、分词、去除停用词
    texts = []
    for line in open(filename):
        texts.append([token for token, _ in psg.cut(line.strip()) if token not in stopwords])
    
    # 统计单词词频
    frequency = {}
    for line in texts:
        for word in line:
            frequency[word] = frequency[word]+1 if word in frequency else 1
    
    # 去除词频等于1的单词
    texts = [[word for word in line if frequency[word]>1] for line in texts]
    return texts

In [4]:
import pickle
from gensim import corpora

# 基于gensim库, 生成词典及知识库的词袋数据(CountVectorize)
def gen_corpora_and_dictionary(knowledge_path, train_path, filename, stopwords, loading=False):
    """
    IN:
       knowledge_path: 知识库文件保存路径
       train_path: 训练集文件保存路径
       filename: 数据文本的保存加载路径
       stopwords: 加载好的停用词数据
       loading: bool型, 为True时直接载入数据
    OUT:
       dictionary: 知识库+训练集语料构成的词典
       corpus: doc2bow后的知识库文本数据
    """
    # 判断载入数据还是重新生成数据
    if loading:
        # 读取dictionary数据文件
        with open(os.path.join(filename, 'dictionary'), 'rb') as f:
            dictionary = pickle.load(f)
        
        # 读取corpus数据文件
        with open(os.path.join(filename, 'corpus'), 'rb') as f:
            corpus = pickle.load(f)
    else:
        # 读取知识库文本, 分词、去除停用词和低频词
        knowledge_texts = tokenizer(knowledge_path, stopwords)
    
        # 读取训练集文本, 分词、去除停用词和低频词
        train_texts = tokenizer(train_path, stopwords)
    
        # 基于gensim库中的corpora生成词典
        dictionary = corpora.Dictionary(knowledge_texts+train_texts)
    
        # 将知识库文本转化为CountVectorize形式
        corpus = [dictionary.doc2bow(text) for text in knowledge_texts]
        
        # 保存生成的dictionary数据到相应的文件
        with open(os.path.join(filename, 'dictionary'), 'wb') as f:
            pickle.dump(dictionary, f)
                     
        # 保存生成的corpus数据到相应的文件
        with open(os.path.join(filename, 'corpus'), 'wb') as f:
            pickle.dump(corpus, f)
    return dictionary, corpus

dictionary, corpus = gen_corpora_and_dictionary(knowledge_path, train_path, cwd_path, stopwords, loading=True)

In [5]:
from gensim import models, similarities

# 定义函数topk_similarity_idx()生成top_k相似度矩阵
def topk_similarity_idx(filename, stopwords, k, loading=False):
    """
    IN:
       filename: 需要计算top_k相似度的目标文件
       stopwords: 加载好的停用词数据
       k: 参数, 从知识库中抽取最相似背景数的大小
       loading: bool型, 为True时直接载入数据
    OUT:
       sim_idx: 在知识库中提取与目标文件背景/问题最相关的k条知识所生成的矩阵
    """
    # 生成输出文件sim_idx的保存路径
    sim_path = os.getcwd()+'/sim_'+filename.split('/')[-1].split('_')[-1].split('.')[0]
    
    # 判断载入数据还是重新生成数据
    if loading:
        # 读取保存好的sim_idx数据文件
        with open(sim_path, 'rb') as f:
            sim_idx = pickle.load(f)
    else:
        # 读取dictionary数据文件
        with open(os.path.join(os.getcwd(), 'dictionary'), 'rb') as f:
            dictionary = pickle.load(f)
        
        # 读取corpus数据文件
        with open(os.path.join(os.getcwd(), 'corpus'), 'rb') as f:
            corpus = pickle.load(f)
        
        # 初始化潜在语意模型(Latent Semantic Indexing Model), 设置的主题数为10个
        lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=10)
        
        # 将corpus数据转换到潜在语意空间(Latent Semantic Space), 并做Indexing操作
        idx = similarities.MatrixSimilarity(lsi[corpus])
        
        # 在知识库中提取与目标文件背景/问题最相关的k条知识
        sim_idx, tmp = [], []
        for i, line in enumerate(open(filename)):
            if i%6 == 0: # 提取背景(B)
                tmp.extend([token for token, _ in psg.cut(line.strip()) if token not in stopwords])
            if i%6 == 1: # 提取问题(Q)
                tmp.extend([token for token, _ in psg.cut(line.strip()) if token not in stopwords])
                sim_query = idx[lsi[dictionary.doc2bow(tmp)]]
                sim_topk = [i for i, j in sorted(enumerate(sim_query), key=lambda item: -item[1])[:k]]
                sim_idx.append(sim_topk)
                tmp.clear()
        
        # 保存生成好的sim_idx数据文件
        with open(sim_path, 'wb') as f:
            pickle.dump(sim_idx, f)
    return sim_idx

sim_train = topk_similarity_idx(train_path, stopwords, 5, loading=True) # SIZE(10570, 5)
sim_test = topk_similarity_idx(test_path, stopwords, 5, loading=True) # SIZE(2039, 5)

In [6]:
import numpy as np

# 定义函数gen_and_save_embedding(), 提取原始(未处理)的词向量文件中的数据
def gen_and_save_embedding(filename):
    """
    IN: 
       filename: 原始(未处理)词向量文件的保存路径
    OUT:
       None
    """
    # 初始化参数和变量
    word2idx_ch, weights_ch = {}, []
    
    # 生成word2idx_ch(dict类型)和weights_ch(np.array类型)
    for line in open(os.path.join(filename, 'ch_50d_word2vec')):
        word = line.split()[0]
        vector = list(map(float, line.split()[1:]))
        
        # 确保每个单词读入的词向量维度都是50，如果不是跳过该单词及其词向量
        try:
            assert len(vector) == 50
        except:
            continue
        
        word2idx_ch[word] = len(word2idx_ch)
        weights_ch.append(vector)
    weights_ch = np.array(weights_ch, dtype=np.float64)
    
    # 保存生成的word2idx_ch数据到相应的文件
    with open(os.path.join(filename, 'word2idx_ch'), 'wb') as f:
        pickle.dump(word2idx_ch, f)
                     
    # 保存生成的weights_ch数据到相应的文件
    with open(os.path.join(filename, 'weights_ch'), 'wb') as f:
        pickle.dump(weights_ch, f)
    return
                     
# gen_and_save_embedding(embedding_path)

In [7]:
# 定义函数load_embedding(), 用于读取词向量相关文件
def load_embedding(filename):
    """
    IN:
       filename: 词向量文件的保存路径
    OUT:
       word2idx_ch: 词典类型, 单词和ID的对应表
       weights_ch: Array类型, 词库的词向量矩阵, SIZE(598453, 50)
    """
    # 读取word2idx_ch文件
    with open(os.path.join(filename, 'word2idx_ch'), 'rb') as f:
        word2idx_ch = pickle.load(f)
        
    # 读取weights_ch文件
    with open(os.path.join(filename, 'weights_ch'), 'rb') as f:
        weights_ch = pickle.load(f)
    return word2idx_ch, weights_ch

In [8]:
import re

# 定义函数expend_token(), 在word2idx_ch\weights原数据基础上扩充特殊标识符 
def expend_token(embedding_path, expend_list=['</pad>', '</unk>', '</num>']):
    """
    IN:
       texts: 带转换的目标文本(已由分词函数tokenizer()预处理)
       embedding_path: 用于读取已保存的word2idx和weights文件
       expend_list:  用于在word2idx中扩展特殊的标识符
    OUT:
       word2idx_ch: 增加特殊符号标记的word2idx文件(用于将单词转化到对应数字ID的map)
       weights_ch: 增加特殊符号扩充的weights文件(np.array类型保存单词对应的词向量)
    """
    # 从embedding_path中读取已保存的word2idx和weights文件
    word2idx_ch, weights_ch = load_embedding(embedding_path)
    
    # expend_len为需要扩充的特殊标识符的数量
    expend_len = len(expend_list)
    
    # 对word2idx_ch进行扩充
    word2idx_ch = {k:(v+expend_len) for k, v in word2idx_ch.items()}
    for i, token in enumerate(expend_list):
        word2idx_ch[token] = i
        
    # 对weights_ch进行扩充, 新增特殊标识符的词向量统一为零向量
    weights_ch = np.vstack((np.zeros((expend_len, 50), dtype=np.float64), weights_ch))
    return word2idx_ch, weights_ch

word2idx_ch, weights_ch = expend_token(embedding_path)

In [9]:
# 定义函数word_list2idx(), 将目标文本word_list逐词转化为数字ID
def word_list2idx(word_list, word2idx_ch):
    """
    IN:
       list2idx: 将目标文本中的所有单词替换为对应的数字ID
       word2idx_ch: 增加特殊符号标记的word2idx文件(用于将单词转化到对应数字ID的map)
    OUT:
       list2idx: 将目标文本中的所有单词替换为对应的数字ID
    """
    # 将word_list中的单词逐一转化为对应的数字ID
    list2idx = []
    for word in word_list:
        if word in word2idx_ch:
            idx = word2idx_ch[word]
        else:
            # 若不存在于word2idx_ch中，按照是否出现数字划分为</num>和</unk>
            idx = word2idx_ch['</num>'] if re.match(r'\d+', word) else word2idx_ch['</unk>']
        list2idx.append(idx)
    return list2idx

In [87]:
def query_answer_split(knowledge_path, filename, stopwords, word2idx_ch, sim_idx):
    """
    IN:
       knowledge_path: 知识库文件保存路径
       filename: 待加载的训练集或测试集文件路径
       stopwords: 加载好的停用词数据
       word2id_ch: 增加特殊符号标记的word2idx文件(用于将单词转化到对应数字ID的map)
       weights_ch: 增加特殊符号扩充的weights文件(np.array类型保存单词对应的词向量)
       sim_idx: 在知识库中提取与目标文件背景/问题最相关的k条知识所生成的矩阵
    OUT:
       quries_num: 目标文件中的Question/Answer pairs的数量
       queries: 已转化为数字ID的题目+背景信息矩阵 SIZE(40760,)
       answers: 已转化为数字ID的正确答案矩阵 SIZE(40760,)
       labels: 记录正确答案和错误答案的位置信息 SIZE(40760,)
    """
    # 读取、预处理知识库文本和目标文本
    knowledge_texts = tokenizer(filename, stopwords)
    texts = tokenizer(filename, stopwords)
    
    # 初始化相关参数和数据
    queries_num = 0
    tmp = []
    queries, answers, labels = [], [], []
    
    # 生成queries, answers, labels
    for i, line in enumerate(open(filename)):
        if i%6 == 0:
            queries_num += 1
            # 在题目+背景信息中合并与其最相似的k条知识
            for j in sim_idx[i//6]:
                tmp.extend(knowledge_texts[j])
            tmp.extend(texts[i])
        elif i%6 == 1:
            tmp.extend(texts[i])
            queries.append(word_list2idx(tmp, word2idx_ch))
        else:
            """
               由于一道完整的题目由背景(B)、问题(Q)、一个正确答案(R)和三个错误答案(W1, W2, W3)组成
               因此一道问题生成的样本为: query = (B+Q), answer = (R, W1, W2, W3), label(正确答案所做四个选项中的位置)
            """
            # 统计、更新正确答案标签
            if line[0] == 'R':
                labels.append(i%4)
                
            answers.append(word_list2idx(texts[i], word2idx_ch))
                
            # 清空、初始化tmp
            if i%6 == 5:
                tmp.clear()
    return queries_num, queries, answers, labels

train_query_num, train_query, train_answer, train_label = query_answer_split(knowledge_path, 
                                                           train_path, stopwords, word2idx_ch, sim_train)
test_query_num, test_query, test_answer, test_label = query_answer_split(knowledge_path, 
                                                           test_path, stopwords, word2idx_ch, sim_test)

In [11]:
# 导入tensorflow, keras深度学习框架
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model
from tensorflow.keras import optimizers, losses
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [88]:
# 设置问题的最大长度和答案的最大长度(超出最大长度的部分截断,未达到最大长度时用Padding进行填充)
max_query_len = 100 # max([len(line) for line in queries])
max_ans_len = 30 # max(max([len(line) for line in true_answers]), max([len(line) for line in false_answers]))

# 对训练集的问题和答案进行处理,统一长度
train_query = pad_sequences(np.array(train_query), value = 0, padding = 'post', maxlen=max_query_len) # SIZE(, 100)
train_answer = pad_sequences(np.array(train_answer), value = 0, padding = 'post', maxlen=max_ans_len)
train_answer = train_answer.reshape([train_query_num, 4, max_ans_len]) # SIZE(, 4, 30)

# 对测试集的问题和答案进行处理,统一长度
test_query = pad_sequences(np.array(test_query), value = 0, padding = 'post', maxlen=max_query_len) # SIZE(, 100)
test_answer = pad_sequences(np.array(test_answer), value = 0, padding = 'post', maxlen=max_ans_len)
test_answer = test_answer.reshape([test_query_num, 4, max_ans_len]) # SIZE(, 4, 30)

In [125]:
# 继承keras中的Model模块,DIY深度神经网络
class Model(keras.Model):
    def __init__(self, weights=weights_ch, units=64):
        super(Model, self).__init__()
        self.embedding = Embedding(weights.shape[0], weights.shape[1], weights=[weights], trainable=False)
        self.lstm1 = Bidirectional(LSTM(units//2, return_sequences=True))
        self.lstm2 = Bidirectional(LSTM(units//2))
        self.dropout = Dropout(0.4)
        self.normalization = BatchNormalization()
        self.dense = Dense(4, activation='softmax')
    
    # 定义biLSTM层,结构: 嵌入层+biLSTM1+随机失活层+biLSTM2+批标准化层
    def biLSTM(self, tmp):
        res = self.embedding(tmp)
        res = self.lstm1(res)
        res = self.dropout(res)
        res = self.lstm2(res)
        res = self.normalization(res)
        return res
    
    # 定义问题和答案选项的相似度计算(余弦相似度)
    def get_cosSim(self, q, a):
        q_norm = tf.sqrt(tf.reduce_sum(q*q, 1))
        a_norm = tf.sqrt(tf.reduce_sum(a*a, 1))
        mul = tf.reduce_sum(q*a, 1)
        # 考虑epsilon防止计算余弦相似度时分母除0
        epsilon = tf.fill(mul.get_shape(), 1.0)
        cosSim = tf.divide(mul, q_norm*a_norm+epsilon)
        cosSim = tf.expand_dims(cosSim, axis=1)
        return cosSim
    
    # 定义深度网络的前馈计算机制
    def call(self, query, answer, training=None):
        # 输出问题经过biLSTM层作用后的隐变量
        query = self.biLSTM(query)
        cosSim = []
        # 针对answer中四个选项(A, B, C, D),计算选项与问题的余弦相似度
        for option in tf.unstack(answer, axis=1):
            # 输出答案选项经过biLSTM层作用后的隐变量
            option = self.biLSTM(option)
            cosSim.append(self.get_cosSim(query, option))
        # 合并 SIZE(, 100)
        merged = tf.concat(cosSim, axis=1)
        # 加入一个全连接层,并选择激活函数为softmax函数
        output = self.dense(merged)
        return output

In [152]:
import random

lr = 0.01 # 优化器学习率
epochs = 1 # 迭代次数
batch_size = 64 # 批量
perm = [i for i in range(train_query_num)] # 随机抽样

train_label = np.array(train_label)
test_label = np.array(test_label)

# model = Model()
optimizer = optimizers.Adam(lr) # 优化器:Adam

for epoch in range(epochs):
    random.shuffle(perm)
    for i in range(train_query_num//batch_size):
        
        # 将训练集随机打乱之后分批次进行训练
        sample = perm[i*batch_size:(i+1)*batch_size]
        
        with tf.GradientTape() as tape:
            # ouput为biLSTM深度网络的输出结果
            output = model(train_query[sample], train_answer[sample]) 
            one_hot = tf.one_hot(train_label[sample], depth = 4)
            # 计算交叉熵损失值
            loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=one_hot, logits=output)
            loss = tf.reduce_sum(loss)/batch_size
            
        # 每十个批次统计模型在训练集上的损失值和准确率
        if i%10==0:
            acc = tf.equal(tf.argmax(output, axis=1), tf.argmax(one_hot, axis=1))
            acc = tf.reduce_mean(tf.cast(acc, tf.float32))
            print('Iteration {} --- Batch ID: {}, Loss: {:.4f}, Acc: {:.4f} ---'.format(epoch+1, i+1, loss, acc))
            
        # 根据损失和BP算法更新model中的训练参数
        grads = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))
        
    # 每一轮迭代统计模型在验证集上的损失值和准确率
    sample = np.random.randint(0, test_query_num, batch_size)
    output = model(test_query[sample], test_answer[sample]) 
    one_hot = tf.one_hot(test_label[sample], depth = 4)
    loss_val = tf.nn.sigmoid_cross_entropy_with_logits(labels=one_hot, logits=output)
    loss_val = tf.reduce_sum(loss_val)/batch_size
    acc_val = tf.equal(tf.argmax(output, axis=1), tf.argmax(one_hot, axis=1))
    acc_val = tf.reduce_mean(tf.cast(acc_val, tf.float32))
    print('--- Iteration ID: {}, Loss_Val: {:.4f}, Acc_Val: {:.4f} ---'.format(epoch+1, loss_val, acc_val))

Iteration 1 --- Batch ID: 1, Loss: 2.8219, Acc: 0.5938 ---
Iteration 1 --- Batch ID: 11, Loss: 2.9576, Acc: 0.4375 ---
Iteration 1 --- Batch ID: 21, Loss: 2.8838, Acc: 0.5469 ---
Iteration 1 --- Batch ID: 31, Loss: 2.8925, Acc: 0.4688 ---
Iteration 1 --- Batch ID: 41, Loss: 2.9231, Acc: 0.4531 ---
Iteration 1 --- Batch ID: 51, Loss: 2.8609, Acc: 0.5156 ---
Iteration 1 --- Batch ID: 61, Loss: 2.9883, Acc: 0.3281 ---
Iteration 1 --- Batch ID: 71, Loss: 2.9201, Acc: 0.4688 ---
Iteration 1 --- Batch ID: 81, Loss: 2.8574, Acc: 0.5625 ---
Iteration 1 --- Batch ID: 91, Loss: 2.8998, Acc: 0.4844 ---
Iteration 1 --- Batch ID: 101, Loss: 2.8942, Acc: 0.4688 ---
Iteration 1 --- Batch ID: 111, Loss: 2.8608, Acc: 0.5469 ---
Iteration 1 --- Batch ID: 121, Loss: 2.9181, Acc: 0.4844 ---
Iteration 1 --- Batch ID: 131, Loss: 2.8820, Acc: 0.5000 ---
Iteration 1 --- Batch ID: 141, Loss: 2.8384, Acc: 0.5625 ---
Iteration 1 --- Batch ID: 151, Loss: 3.0166, Acc: 0.3281 ---
--- Iteration ID: 1, Loss_Val: 3.10

In [153]:
# 保存模型参数
weight_path = os.path.join(cwd_path, 'model_weight_v0.h5')
model.save_weights(weight_path)