# Read file

In [1]:
pip install openpyxl

Collecting openpyxl
  Downloading openpyxl-3.0.10-py2.py3-none-any.whl (242 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m242.1/242.1 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting et-xmlfile
  Downloading et_xmlfile-1.1.0-py3-none-any.whl (4.7 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-1.1.0 openpyxl-3.0.10
[0mNote: you may need to restart the kernel to use updated packages.


In [2]:
# import libaray
import re
import torch
import jieba
from jieba import analyse
import pandas as pd
import numpy as np
from gensim import corpora, models, similarities
from termcolor import colored
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertModel

In [3]:
def read_and_split_the_excel(path):
    """
    :func: 根据xlsx文件获取问题list和答案list（需要更新openyxl）
    :param path: 文件路径
    :return: 问题list，答案list
    """
    # 读取文件
    df1 = pd.read_excel(path)
    # 分开
    question_list = df1.iloc[:,0].tolist()
    answer_list = df1.iloc[:,1].tolist()
    # 返回
    return question_list,answer_list

In [4]:
# 测试read_and_split_the_excel
question_list,answer_list = read_and_split_the_excel("../input/uic-cn-admission/CN_QA_dataset_all.xlsx")
display(question_list[:3])

['UIC的办学性质是什么？', '学校现在有多少在校生？', 'UIC的全称是什么？']

# TF-IDF-CN

In [5]:
# 导入停用词表
def obtain_stop_word(path):
    """
    :func: 获取stop_word
    :param path: 文件路径
    :return: 返回stop_word list
    """
    stop_words = [line.strip() for line in open(path).readlines()]
    stop_words.extend([""," "])
    return stop_words

In [6]:
# obtain stop word 代码测试
# 使用的是cn_stopwords，在kaggle搜索哈工大第一个
path = '../input/english-and-chinese-stopwords/cn_stopwords.txt'
stop_words = obtain_stop_word(path)
stop_words[:25]

['$',
 '0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 '?',
 '_',
 '“',
 '”',
 '、',
 '。',
 '《',
 '》',
 '一',
 '一些',
 '一何',
 '一切',
 '一则',
 '一方面']

In [7]:
def cn_stop_word_rm(sentence,stop_words):
    """
    :func: 将输入的句子分词并且移除stopword，返回list
    :param stop_words: 需要移除的stopword（用的是cn_stopwords）
        eg:
                ['$', '0', '1', '2', '3', '4', '5', '6', '7', '8',
                '9', '?', '_', '“', '”', '、', '。','《', '》', '一',
                '一些', '一何', '一切', '一则', '一方面', '一旦', '一来']
    :param sentence: 句子
        eg:
            "今天我想摆烂，你能拿我咋办，摸鱼我说了算"
    :return: 返回分词后的token list
    """
    # split the sentence
    word_tokens = list(jieba.cut_for_search(sentence))
    
    # remove stop words
    query = [w.lower() for w in word_tokens if not w in stop_words] 
#     print(query)
#     question_list[index] = ' '.join(line for line in query)
    return query

In [8]:
# 测试cn_stop_word_rm
sentence = "我们认为，关键问题就是所谓问题的关键，所以问题的关键在于我们如何把握关键问题，这个是我们任务的关键"
query = cn_stop_word_rm(sentence,stop_words)
query

Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
Loading model cost 1.044 seconds.
Prefix dict has been built successfully.


['认为',
 '关键',
 '问题',
 '关键问题',
 '所谓',
 '问题',
 '关键',
 '问题',
 '关键',
 '关键在于',
 '把握',
 '关键',
 '问题',
 '关键问题',
 '任务',
 '关键']

In [9]:
def generate_question_t_list(question_list, stop_words):
    """
    :func: 将输入的问句分词逐个转为token list
    
    :param question_list: 句子列表
        eg:
            ["今天我想摆烂"，
            "你能拿我咋办"，
            "摸鱼我说了算"]
            
    :param stop_words: 需要移除的stopword（用的是cn_stopwords）
        eg:
                ['$', '0', '1', '2', '3', '4', '5', '6', '7', '8',
                '9', '?', '_', '“', '”', '、', '。','《', '》', '一',
                '一些', '一何', '一切', '一则', '一方面', '一旦', '一来']
    
    :return: 返回question_list每句分词后的token list
    """
    
    # transfer the question list into the token form
    question_token_list = ['' for i in range(len(question_list))]
    
    for index in range(len(question_list)):
        # split the sentence
        question_token_list[index] = cn_stop_word_rm(question_list[index],stop_words)
    # return the token list
    return question_token_list

In [10]:
# 测试generate_question_t_list
question_token_list = generate_question_t_list(question_list, stop_words)
question_token_list[:4]

[['uic', '办学', '性质'],
 ['学校', '现在', '在校', '在校生'],
 ['uic', '全称'],
 ['北师', '北师港', '浸大', '全称']]

In [11]:
def similarity_cn(Corp, query,topk = 3, threshold = 0.7,all_score_without_rank=0):
    """
    :func: 计算问题与知识库中问题的相似度
    :param Corp: 分词后的问题
        eg:
                [['UIC', '学校', '办学', '性质'],
                 ['学校', '现在', '在校', '在校生'],
                 ['UIC', '全称'],
                 ['北师', '北师港', '浸大', '全称']]
    :param query: 分词后的问题
        eg:
                ['UIC', '全称', '名字']
    :return: 返回满足阈值要求的问题所在行索引——对应答案所在的行索引
    """

    # 建立词典
    dictionary = corpora.Dictionary(Corp)

    # 基于词典，将分词列表集转换成稀疏向量集，即语料库
    corpus = [dictionary.doc2bow(text) for text in Corp]
    
    # 训练TF-IDF模型，传入语料库进行训练
    tfidf = models.TfidfModel(corpus)

    # 用训练好的TF-IDF模型处理被检索文本，即语料库
    corpus_tfidf = tfidf[corpus]
    
#     print(corpus_tfidf)
    # # 得到TF-IDF值
#     for temp in corpus_tfidf:
#         print(temp)

    vec_bow = dictionary.doc2bow(query)
    vec_tfidf = tfidf[vec_bow]
#     print(vec_tfidf)
    
    index = similarities.MatrixSimilarity(corpus_tfidf)
#     print(index)
    
    sims = index[vec_tfidf]
#     print(sims)


    if all_score_without_rank:
        return sims
    else:
        max_loc = np.argsort(sims)[::-1][:topk]
    #     print(np.argsort(sims)[::-1])

    #     top_max_sim = sims[max_loc]
    #     print(top_max_sim)

        # if the score is larger than the threshold
        if sims[max_loc[0]] < threshold:
            if_vaild = 0
        else:
            if_vaild = 1

        return if_vaild, max_loc, sims

# 未来可能的改进
# row_index默认为-1，即未匹配到满足相似度阈值的问题

In [12]:
# 测试
Corp = question_token_list
query = ['uic', '全称', '名字']
if_vaild, max_loc, top_max_sim = similarity_cn(Corp, query)
display(if_vaild)
display(max_loc)
display(top_max_sim[:3])

1

array([2, 3, 5])

array([0.10827172, 0.        , 1.        ], dtype=float32)

In [13]:
# 单轮测试的Q&A
# TF-IDF的测试代码
def cn_main_single(question, data_file_path, stopword_file_path, topk = 3, threshold = 0.95):
    # read the file
    question_list,answer_list = read_and_split_the_excel(data_file_path)

    # stop words list
    stop_words = obtain_stop_word(stopword_file_path)
    
    # genetate question token list
    question_token_list = generate_question_t_list(question_list, stop_words)


    # can be modify as recurrence
    

    # 对查询的问题进行处理
    query = cn_stop_word_rm(question, stop_words)

    # 得到问题（答案）所对应的行索引
    if_vaild, topk_idx, score = similarity_cn(question_token_list, query, topk, threshold)
    
    # 返回最相似的问题
    print('top %d questions similar to "%s"' % (topk, colored(question, 'green')))
    for idx in topk_idx:
        print('TF-IDF; %s\t%s' % (colored('%.4f' % score[idx], 'cyan'), colored(question_list[idx], 'yellow')))
    print("The best similarity is:", score[topk_idx[0]])
    
    # get the answer
    if if_vaild:
        print(answer_list[topk_idx[0]])
    else:
        print("Sorry, I don't know what you say")
    return 0

In [14]:
# 测试
data_file_path = "../input/uic-cn-admission/CN_QA_dataset_all.xlsx"
stopword_file_path = '../input/english-and-chinese-stopwords/cn_stopwords.txt'
quest_in = '我想知道，UIC的全称叫什么名字'
cn_main_single(quest_in, data_file_path, stopword_file_path)

top 3 questions similar to "[32m我想知道，UIC的全称叫什么名字[0m"
TF-IDF; [36m1.0000[0m	[33mUIC的全称是什么？[0m
TF-IDF; [36m0.4206[0m	[33m北师港浸大的全称是什么？[0m
TF-IDF; [36m0.3891[0m	[33mUIC是什么？[0m
The best similarity is: 1.0
UIC的全称是北京师范大学-香港浸会大学联合国际学院。


0

In [15]:
# 最终的CN_main函数
# TF-IDF的测试代码
def cn_main(data_file_path, stopword_file_path):
    # read the file
    question_list,answer_list = read_and_split_the_excel(data_file_path)

    # stop words list
    stop_words = obtain_stop_word(stopword_file_path)
    
    # genetate question token list
    question_token_list = generate_question_t_list(question_list, stop_words)
    
    while True:
        question = input('Your question: ')
        if question == "quit":
            break
        
        # 对查询的问题进行处理
        query = cn_stop_word_rm(question, stop_words)

        # 得到问题（答案）所对应的行索引
        row_index = similarity_cn(question_token_list, query)

        # 判断是否有结果
        if row_index != -2:
            question, answer = get_qestion_answer(question_list, answer_list, row_index)
            print(question)
            print(answer)
        else:
            print("Sorry, I don't know what you say")   

    return 0

In [16]:
# 测试
data_file_path = "../input/uic-cn-admission/CN_QA_dataset_all.xlsx"
stopword_file_path = '../input/english-and-chinese-stopwords/cn_stopwords.txt'
# cn_main(data_file_path, stopword_file_path)

# BERT_Embedding CN

In [17]:
def transfer_sentence_vector(sentence,tokenizer,model):
    """
    :func: 把句子embedding成向量
    :param sentence: 句子
    :param tokenizer: 分词器
    :param model: 模型
    :return: 转成的向量
    """
    # generate question vector
    encoded_input = tokenizer(sentence, return_tensors='pt')
    output = model(**encoded_input)[1].detach().numpy()
    return output.tolist()[0]

In [18]:
# 测试
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
model = BertModel.from_pretrained('bert-base-chinese')
sentence = '学校现在有多少在校生？'
print(transfer_sentence_vector(sentence,tokenizer,model))

Downloading:   0%|          | 0.00/107k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/624 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/393M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[0.9995291829109192, 0.999877393245697, 0.9812514185905457, 0.9879932999610901, 0.9224331974983215, 0.6182746887207031, -0.6998699903488159, 0.9265468120574951, 0.9752839803695679, -0.9983439445495605, 0.9999923706054688, 0.9989728331565857, 0.9653666019439697, -0.791135311126709, 0.9990866780281067, -0.9998182654380798, 0.8282744288444519, 0.9996429681777954, 0.9712729454040527, 0.28822869062423706, 0.9996145963668823, -0.9999863505363464, -0.9714978933334351, -0.9338252544403076, -0.08185579627752304, 0.9993001222610474, 0.9836791753768921, 0.2707619369029999, -0.999713659286499, 0.997393012046814, 0.9350379109382629, 0.9993559122085571, 0.9562469720840454, -0.999920666217804, -0.9999659657478333, 0.42838069796562195, -0.9091864228248596, 0.7876582145690918, -0.6098946928977966, -0.9389352798461914, -0.9700639247894287, -0.1435333788394928, 0.8450793027877808, -0.9975762367248535, -0.9844191670417786, 0.4589720070362091, -0.9999994039535522, -0.9997289776802063, 0.8935610055923462, 0

In [19]:
def transfer_all_q2v(sentence_list,tokenizer,model):
    """
    :func: 把句子list都embedding成向量
    :param sentence: 句子的list
    :param tokenizer: 分词器
    :param model: 模型
    :return: 转成的向量list
    """
    doc_vecs=[]
    for sentence in sentence_list:
        doc_vecs.append(transfer_sentence_vector(sentence,tokenizer,model))
    doc_vecs = np.array(doc_vecs)
    return doc_vecs

In [20]:
# 测试
doc_vecs = transfer_all_q2v(question_list,tokenizer,model)
doc_vecs

array([[ 0.99958241,  0.99990588,  0.81502736, ..., -0.996768  ,
        -0.99450332,  0.95210838],
       [ 0.99952918,  0.99987739,  0.98125142, ..., -0.99081439,
        -0.99445051,  0.97859687],
       [ 0.99907655,  0.99989069,  0.9948076 , ..., -0.99802631,
        -0.97939253,  0.93087041],
       ...,
       [ 0.99916857,  0.99923247,  0.99998689, ..., -0.99986476,
        -0.99519753,  0.66582525],
       [ 0.99823868,  0.99970043,  0.9967646 , ..., -0.99984264,
        -0.99581432,  0.73592114],
       [ 0.99856842,  0.9996742 ,  0.99790406, ..., -0.99984467,
        -0.99580306,  0.72018516]])

In [21]:
def get_similar_q_id(query_vec,doc_vecs,tokenizer,model,topk = 5,threshold = 0.95, all_score_without_rank=0):
    """
    :func: 通过cosine similarity找到相似句子
    :param sentence: 转为向量的句子
    :param doc_vecs: 已经转换为向量的句子列表
    :param topk: 显示前topk个最相似的句子
    :param threshold: 认为是匹配的问句的有效阈值
    :param tokenizer: 分词器
    :param model: 模型
    :return: 是否达到要求，返回满足阈值要求的问题所在行索引——对应答案所在的行索引的np.array，相似度分数
    """
    # compute normalized dot product as score
    score = np.sum(query_vec * doc_vecs, axis=1) / np.linalg.norm(doc_vecs, axis=1) / np.linalg.norm(query_vec)
    
    if all_score_without_rank:
        return score
    else:
        # get the top "topk" score's id
        topk_idx = np.argsort(score)[::-1][:topk]

        # if the score is larger than the threshold
        if score[topk_idx[0]] < threshold:
            if_vaild = 0
        else:
            if_vaild = 1

        return if_vaild, topk_idx, score

In [22]:
# 测试
sentence = '学校现在有多少在校生？'
sentence_vec = transfer_sentence_vector(sentence,tokenizer,model)
topk = 5
threshold = 0.95
if_vaild, topk_idx, score = get_similar_q_id(sentence_vec,doc_vecs,tokenizer,model,topk,threshold)
display(if_vaild)
display(topk_idx)
display(score[:5])

1

array([  1,  39, 503,  29,  10])

array([0.96110737, 1.        , 0.94652379, 0.92392588, 0.93067066])

In [23]:
# BERT的测试代码
def main_single(query,data_path, bert_model_name, topk = 5, threshold = 0.95):
    """
    :func: 一遍的匹配
    :param data_path: 数据集的路径xlsx
    :param bert_model_name: bert模型名字
    :param topk: 显示前topk个最相似的句子
    :param threshold: 认为是匹配的问句的有效阈值
    :备注: 每次会返回前topk个最相似的问题和答案
    :return: 0 程序执行成功的话
    """
    # initial things
    tokenizer = BertTokenizer.from_pretrained(bert_model_name)
    model = BertModel.from_pretrained(bert_model_name)
    question_list,answer_list = read_and_split_the_excel(path)
    doc_vecs = transfer_all_q2v(question_list,tokenizer,model)
    
    # 这里开始循环
    
    query_vec = transfer_sentence_vector(query,tokenizer,model)
    
    # 匹配
    if_vaild, topk_idx, score = get_similar_q_id(query_vec,doc_vecs,tokenizer,model,topk,threshold)
    
    # 返回最相似的问题
    print('top %d questions similar to "%s"' % (topk, colored(query, 'green')))
    for idx in topk_idx:
        print('&gt; %s\t%s' % (colored('%.4f' % score[idx], 'cyan'), colored(question_list[idx], 'yellow')))
    print("The best similarity is:", score[topk_idx[0]])
    
    # get the answer
    if if_vaild:
        print(answer_list[topk_idx[0]])
    else:
        print("Sorry, I don't know what you say")
    return 0

In [24]:
# 测试
bert_model_name = 'bert-base-chinese'
path = "../input/uic-cn-admission/CN_QA_dataset_all.xlsx"
topk = 5
threshold = 0.95
query = '学校现在有多少在校生？'
main_single(query, path, bert_model_name,topk, threshold)

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


top 5 questions similar to "[32m学校现在有多少在校生？[0m"
&gt; [36m1.0000[0m	[33m学校现在有多少在校生？[0m
&gt; [36m0.9716[0m	[33m学校招生办的通讯地址是什么？[0m
&gt; [36m0.9700[0m	[33m提供哪些小语种可以学习？[0m
&gt; [36m0.9698[0m	[33m学校毕业生的深造就业情况如何？[0m
&gt; [36m0.9677[0m	[33mUIC的教学质量怎么保证？[0m
The best similarity is: 1.0000000000000002
截至2021年10月底，北师港浸大现有本科、硕士、博士在校生共8100余人。


0

In [25]:
# BERT的测试代码
def main(data_path, bert_model_name, topk = 5, threshold = 0.95):
    """
    :func: 多遍的匹配
    :param data_path: 数据集的路径xlsx
    :param bert_model_name: bert模型名字
    :param topk: 显示前topk个最相似的句子
    :param threshold: 认为是匹配的问句的有效阈值
    :备注: 每次会返回前topk个最相似的问题和答案
    :return: 0 程序执行成功的话
    """
    
    # initial things
    tokenizer = BertTokenizer.from_pretrained(bert_model_name)
    model = BertModel.from_pretrained(bert_model_name)
    question_list,answer_list = read_and_split_the_excel(path)
    doc_vecs = transfer_all_q2v(question_list,tokenizer,model)
    
    # 这里开始循环
    while True:
        # 读取数据
        query = input('Your question: ')
        if query == "quit":
            break
        
        # 转向量
        query_vec = transfer_sentence_vector(query,tokenizer,model)

        # 匹配
        if_vaild, topk_idx, score = get_similar_q_id(query_vec,doc_vecs,tokenizer,model,topk,threshold)
        
        # 返回最相似的问题
        print('top %d questions similar to "%s"' % (topk, colored(query, 'green')))
        for idx in topk_idx:
            print('&gt; %s\t%s' % (colored('%.4f' % score[idx], 'cyan'), colored(question_list[idx], 'yellow')))
        print("The best similarity is:", score[topk_idx[0]])

        # get the answer
        if if_vaild:
            print(answer_list[topk_idx[0]])
        else:
            print("抱歉，请换种方式来回答")
        
    return 0

In [26]:
# 测试
bert_model_name = 'bert-base-chinese'
path = "../input/uic-cn-admission/CN_QA_dataset_all.xlsx"
topk = 5
threshold = 0.995
# main(path, bert_model_name,topk, threshold)

# Merge togerther

In [27]:
# 单轮测试的Q&A
# BERT+TF-IDF的测试代码
def cn_main_mix_single(question, data_file_path, stopword_file_path, bert_model_name, topk_TFIDF = 3, threshold_TFIDF = 0.7, topk_BERT = 5, threshold_BERT = 0.95):
    """
    :func: 一遍的匹配
    :param question: 你提出来的问题
    :param data_path: 数据集的路径xlsx
    :param bert_model_name: bert模型名字
    :param topk_TFIDF: TF-IDF中显示前topk个最相似的句子
    :param threshold_TFIDF: TF-IDF中认为是匹配的问句的有效阈值
    :param topk_BERT: BERT中显示前topk个最相似的句子
    :param threshold_BERT: BERT中认为是匹配的问句的有效阈值
    :备注: 每次会返回前topk个最相似的问题和答案
    :return: 0 程序执行成功的话
    """
    # read the file
    question_list,answer_list = read_and_split_the_excel(data_file_path)
    
    # preparing for the TF-IDF
    # stop words list
    stop_words = obtain_stop_word(stopword_file_path)
    # genetate question token list
    question_token_list = generate_question_t_list(question_list, stop_words)
    
    # preparing for the Bert
    tokenizer = BertTokenizer.from_pretrained(bert_model_name)
    model = BertModel.from_pretrained(bert_model_name)
    doc_vecs = transfer_all_q2v(question_list,tokenizer,model)

    
    
    
    # 这里开始循环 
    # can be modify as recurrence
    
        
    # get result
    # from TF-IDF
    # 对查询的问题进行处理
    query = cn_stop_word_rm(question, stop_words)
    # 得到问题（答案）所对应的行索引
    if_vaild_TF, topk_idx_TF, score_TF = similarity_cn(question_token_list, query, topk_TFIDF, threshold_TFIDF)
    # from embedding by BERT
    query_vec = transfer_sentence_vector(question,tokenizer,model)
    # 匹配
    if_vaild_BERT, topk_idx_BERT, score_BERT = get_similar_q_id(query_vec,doc_vecs,tokenizer,model,topk_BERT,threshold_BERT)
    
    
#     # print the result
#     print('The questions similar to "%s"' % (colored(question, 'green')))
#     if(if_vaild_TF):
#         for idx in topk_idx_TF:
#             print('TF-IDF; %s\t%s' % (colored('%.4f' % score_TF[idx], 'cyan'), colored(question_list[idx], 'yellow')))
#         for idx in topk_idx_BERT[:2]:
#             print('BERT_emb; %s\t%s' % (colored('%.4f' % score_BERT[idx], 'cyan'), colored(question_list[idx], 'yellow')))
#         print(answer_list[topk_idx_TF[0]])
#     elif(if_vaild_BERT):
#         for idx in topk_idx_BERT[:3]:
#             print('BERT_emb; %s\t%s' % (colored('%.4f' % score_BERT[idx], 'cyan'), colored(question_list[idx], 'yellow')))
#         for idx in topk_idx_TF[:2]:
#             print('TF-IDF; %s\t%s' % (colored('%.4f' % score_TF[idx], 'cyan'), colored(question_list[idx], 'yellow')))
#         print(answer_list[topk_idx_BERT[0]])
#     else:
#         for idx in topk_idx_BERT[:5]:
#             print('BERT_emb; %s\t%s' % (colored('%.4f' % score_BERT[idx], 'cyan'), colored(question_list[idx], 'yellow')))
#         print("Sorry, I don't know what you say")
    
    # print the result version 2
    # TF-IDF
    # 返回最相似的问题
    print('top few questions(TFIDF: %d, BERT: %d) similar to "%s"' % (topk_TFIDF, topk_BERT, colored(question, 'green')))
    print("The best similarity is:", score_TF[topk_idx_TF[0]])
    print("The best similarity is:", score_BERT[topk_idx_BERT[0]],"\n")
    
    for idx in topk_idx_TF:
        print('TF-IDF; %s\t%s' % (colored('%.4f' % score_TF[idx], 'cyan'), colored(question_list[idx], 'yellow')))
    
    #Bert
    for idx in topk_idx_BERT:
        print('BERT_emb; %s\t%s' % (colored('%.4f' % score_BERT[idx], 'cyan'), colored(question_list[idx], 'yellow')))
    # get the best answer
    if(if_vaild_TF):
        print(answer_list[topk_idx_TF[0]])
    elif(if_vaild_BERT):
        print(answer_list[topk_idx_BERT[0]])
    else:
        print("Sorry, I don't know what you say")
    
    
    return 0

In [28]:
# 测试
data_file_path = "../input/uic-cn-admission/CN_QA_dataset_all.xlsx"
stopword_file_path = '../input/english-and-chinese-stopwords/cn_stopwords.txt'
quest_in = '我想知道，UIC的全称叫什么名字'

bert_model_name = 'bert-base-chinese'
topk_TFIDF = 3
threshold_TFIDF = 0.7
topk_BERT = 5
threshold_BERT = 0.95

cn_main_mix_single(quest_in, data_file_path, stopword_file_path, bert_model_name, topk_TFIDF, threshold_TFIDF, topk_BERT, threshold_BERT)

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


top few questions(TFIDF: 3, BERT: 5) similar to "[32m我想知道，UIC的全称叫什么名字[0m"
The best similarity is: 1.0
The best similarity is: 0.9732797151692254 

TF-IDF; [36m1.0000[0m	[33mUIC的全称是什么？[0m
TF-IDF; [36m0.4206[0m	[33m北师港浸大的全称是什么？[0m
TF-IDF; [36m0.3891[0m	[33mUIC是什么？[0m
BERT_emb; [36m0.9733[0m	[33mUIC是什么？[0m
BERT_emb; [36m0.9665[0m	[33mUIC的目标是什么？[0m
BERT_emb; [36m0.9642[0m	[33mUIC的使命是什么？[0m
BERT_emb; [36m0.9624[0m	[33m招生咨询邮箱是什么？[0m
BERT_emb; [36m0.9584[0m	[33mUIC招生信息网是什么[0m
UIC的全称是北京师范大学-香港浸会大学联合国际学院。


0

In [29]:
# 多轮
# 单轮测试的Q&A
# BERT+TF-IDF的测试代码
def cn_main_mix(data_file_path, stopword_file_path, bert_model_name, topk_TFIDF = 3, threshold_TFIDF = 0.7, topk_BERT = 5, threshold_BERT = 0.95):
    """
    :func: 多遍的匹配
    :param data_path: 数据集的路径xlsx
    :param bert_model_name: bert模型名字
    :param topk_TFIDF: TF-IDF中显示前topk个最相似的句子
    :param threshold_TFIDF: TF-IDF中认为是匹配的问句的有效阈值
    :param topk_BERT: BERT中显示前topk个最相似的句子
    :param threshold_BERT: BERT中认为是匹配的问句的有效阈值
    :备注: 每次会返回前topk个最相似的问题和答案
    :return: 0 程序执行成功的话
    """
    # read the file
    question_list,answer_list = read_and_split_the_excel(data_file_path)
    
    # preparing for the TF-IDF
    # stop words list
    stop_words = obtain_stop_word(stopword_file_path)
    # genetate question token list
    question_token_list = generate_question_t_list(question_list, stop_words)
    
    # preparing for the Bert
    tokenizer = BertTokenizer.from_pretrained(bert_model_name)
    model = BertModel.from_pretrained(bert_model_name)
    doc_vecs = transfer_all_q2v(question_list,tokenizer,model)

    
    
    
    # 这里开始循环 
    # can be modify as recurrence
    
    while True:
        # 读取数据
        question = input('Your question: ')
        if question == "quit":
            break
            
        # get result
        # from TF-IDF
        # 对查询的问题进行处理
        query = cn_stop_word_rm(question, stop_words)
        # 得到问题（答案）所对应的行索引
        if_vaild_TF, topk_idx_TF, score_TF = similarity_cn(question_token_list, query, topk_TFIDF, threshold_TFIDF)
        # from embedding by BERT
        query_vec = transfer_sentence_vector(question,tokenizer,model)
        # 匹配
        if_vaild_BERT, topk_idx_BERT, score_BERT = get_similar_q_id(query_vec,doc_vecs,tokenizer,model,topk_BERT,threshold_BERT)


#         # print the result
#         print('The questions similar to "%s"' % (colored(question, 'green')))
#         if(if_vaild_TF):
#             for idx in topk_idx_TF:
#                 print('TF-IDF; %s\t%s' % (colored('%.4f' % score_TF[idx], 'cyan'), colored(question_list[idx], 'yellow')))
#             for idx in topk_idx_BERT[:2]:
#                 print('BERT_emb; %s\t%s' % (colored('%.4f' % score_BERT[idx], 'cyan'), colored(question_list[idx], 'yellow')))
#             print(answer_list[topk_idx_TF[0]])
#         elif(if_vaild_BERT):
#             for idx in topk_idx_BERT[:3]:
#                 print('BERT_emb; %s\t%s' % (colored('%.4f' % score_BERT[idx], 'cyan'), colored(question_list[idx], 'yellow')))
#             for idx in topk_idx_TF[:2]:
#                 print('TF-IDF; %s\t%s' % (colored('%.4f' % score_TF[idx], 'cyan'), colored(question_list[idx], 'yellow')))
#             print(answer_list[topk_idx_BERT[0]])
#         else:
#             for idx in topk_idx_BERT[:5]:
#                 print('BERT_emb; %s\t%s' % (colored('%.4f' % score_BERT[idx], 'cyan'), colored(question_list[idx], 'yellow')))
#             print("Sorry, I don't know what you say")


        # print the result version 2
        # TF-IDF
        # 返回最相似的问题
        print('top few questions(TFIDF: %d, BERT: %d) similar to "%s"' % (topk_TFIDF, topk_BERT, colored(question, 'green')))
        print("The best similarity is:", score_TF[topk_idx_TF[0]])
        print("The best similarity is:", score_BERT[topk_idx_BERT[0]],"\n")

        for idx in topk_idx_TF:
            print('TF-IDF; %s\t%s' % (colored('%.4f' % score_TF[idx], 'cyan'), colored(question_list[idx], 'yellow')))

        #Bert
        for idx in topk_idx_BERT:
            print('BERT_emb; %s\t%s' % (colored('%.4f' % score_BERT[idx], 'cyan'), colored(question_list[idx], 'yellow')))
        # get the best answer
        if(if_vaild_TF):
            print(answer_list[topk_idx_TF[0]])
        elif(if_vaild_BERT):
            print(answer_list[topk_idx_BERT[0]])
        else:
            print("Sorry, I don't know what you say")
    
    
    return 0

In [30]:
# 测试
data_file_path = "../input/uic-cn-admission/CN_QA_dataset_all.xlsx"
stopword_file_path = '../input/english-and-chinese-stopwords/cn_stopwords.txt'

bert_model_name = 'bert-base-chinese'
topk_TFIDF = 3
threshold_TFIDF = 0.7
topk_BERT = 5
threshold_BERT = 0.95

# cn_main_mix(data_file_path, stopword_file_path, bert_model_name, topk_TFIDF, threshold_TFIDF, topk_BERT, threshold_BERT)

# For the HTML

In [31]:
def prepared(data_file_path, stopword_file_path, bert_model_name, topk_TFIDF = 3, threshold_TFIDF = 0.7, topk_BERT = 5, threshold_BERT = 0.95):
    # 开机启动
    # read the file
    question_list,answer_list = read_and_split_the_excel(data_file_path)

    # preparing for the TF-IDF
    # stop words list
    stop_words = obtain_stop_word(stopword_file_path)
    # genetate question token list
    question_token_list = generate_question_t_list(question_list, stop_words)

    # preparing for the Bert
    tokenizer = BertTokenizer.from_pretrained(bert_model_name)
    model = BertModel.from_pretrained(bert_model_name)
    doc_vecs = transfer_all_q2v(question_list,tokenizer,model)
    return stop_words,question_token_list,doc_vecs,topk_TFIDF,threshold_TFIDF,topk_BERT,threshold_BERT,tokenizer,model,question_list,answer_list

In [32]:
# 输出函数
# get result
def get_respond(question,inputlist):
    stop_words,question_token_list,doc_vecs,topk_TFIDF,threshold_TFIDF,topk_BERT,threshold_BERT,tokenizer,model,question_list,answer_list = inputlist
    # from TF-IDF
    # 对查询的问题进行处理
    query = cn_stop_word_rm(question, stop_words)
    # 得到问题（答案）所对应的行索引
    if_vaild_TF, topk_idx_TF, score_TF = similarity_cn(question_token_list, query, topk_TFIDF, threshold_TFIDF)
    # from embedding by BERT
    query_vec = transfer_sentence_vector(question,tokenizer,model)
    # 匹配
    if_vaild_BERT, topk_idx_BERT, score_BERT = get_similar_q_id(query_vec,doc_vecs,tokenizer,model,topk_BERT,threshold_BERT)

    # get the best answer
    if(if_vaild_TF):
        return(answer_list[topk_idx_TF[0]])
    elif(if_vaild_BERT):
        return(answer_list[topk_idx_BERT[0]])
    else:
        return("Sorry, I don't know what you say")

In [33]:
# 提前声明
data_file_path = "../input/uic-cn-admission/CN_QA_dataset_all.xlsx"
stopword_file_path = '../input/english-and-chinese-stopwords/cn_stopwords.txt'
bert_model_name = 'bert-base-chinese'
topk_TFIDF = 3
threshold_TFIDF = 0.7
topk_BERT = 5
threshold_BERT = 0.95

inputlist = prepared(data_file_path, stopword_file_path, bert_model_name, topk_TFIDF = 3, threshold_TFIDF = 0.7, topk_BERT = 5, threshold_BERT = 0.95)

# 读取数据
# while True:
#     # 读取数据
#     question = input('Your question: ')
#     if question == "quit":
#         break
#     respond = get_respond(question,inputlist)
#     print(respond)

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


# For the 01 dataset to validate the accuarcy

In [34]:
import plotly.express as px
from datasets import list_metrics,load_metric
from sklearn.metrics import confusion_matrix

def performance(y_ture,y_pred):
    f1_metric = load_metric("f1")
    re_metric = load_metric("recall")
    pre_metric = load_metric("precision")
    type_c_int = list(set(np.concatenate([y_ture, y_pred])))
    type_c = [str(i) for i in type_c_int]
    
    f1_m_list = []
    re_m_list = []
    pre_m_list = []
    
    for i in type_c_int:
        bi_ture = list(y_ture == i)
        bi_pred = list(y_pred == i)
        f1_m_results = f1_metric.compute(predictions=bi_pred, references=bi_ture, average="macro")
        re_m_results = re_metric.compute(predictions=bi_pred, references=bi_ture, average="macro")
        pre_m_results = pre_metric.compute(predictions=bi_pred, references=bi_ture, average="macro")
        
        f1_m_list.append(f1_m_results["f1"])
        re_m_list.append(re_m_results["recall"])
        pre_m_list.append(pre_m_results["precision"])
        
    data = {'Class_type':type_c_int,'F1-macro':f1_m_list,'Recall-macro':re_m_list,'Precision-macro':pre_m_list}
    df = pd.DataFrame(data)
    display(df)
    
    
    z = confusion_matrix(y_ture, y_pred)
    x_lab = type_c

    fig = px.imshow(z, 
                    text_auto=True,
                    labels=dict(x="True label", y="Predicted label", color="times"),
                    x=x_lab,
                    y=x_lab)
    fig.show()
    
    return z

In [35]:
performance(np.array([1,3,1,4,2,1]),np.array([2,3,1,3,3,2]))

Downloading builder script:   0%|          | 0.00/2.06k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/2.21k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Class_type,F1-macro,Recall-macro,Precision-macro
0,1,0.625,0.666667,0.8
1,2,0.333333,0.3,0.375
2,3,0.625,0.8,0.666667
3,4,0.454545,0.5,0.416667


array([[1, 2, 0, 0],
       [0, 0, 1, 0],
       [0, 0, 1, 0],
       [0, 0, 1, 0]])

In [36]:
# 读取测试数据集
def read_01_csv(path):
    """
    :func: 根据xlsx文件获取提出问题list和预计匹配问题list
    :param path: 文件路径
    :return: 问题list，答案list
    """
    # 读取文件
    df1 = pd.read_csv(path)
    # 分开
    ori_list = df1.iloc[:,0].tolist()
    test_list = df1.iloc[:,1].tolist()
    label_list = df1.iloc[:,2].tolist()
    # 返回
    return ori_list,test_list, label_list

In [37]:
# 测试
path = "../input/01-uic-rm-dup/01_all_rm_dup.csv"
ori_list,test_list, label_list = read_01_csv(path)
display(ori_list[:5])
display(test_list[:5])
display(label_list[:5])

['UIC的办学性质是什么？', '学校现在有多少在校生？', '北师港浸大的全称是什么？', '简单介绍一下UIC', 'UIC办学定位是什么？']

['UIC的业务性质是什么？', '目前有多少学生在学校就读？', '北师大香港浸会大学的全称是什么？', 'UIC的简要介绍', 'UIC的定位是什么？']

[1, 1, 1, 1, 1]

In [38]:
# 用read_and_split_the_excel创建字典
question_list,answer_list = read_and_split_the_excel("../input/uic-cn-admission/CN_QA_dataset_all.xlsx")
display(question_list[:3])

dict_index = dict(zip(question_list,range(len(question_list))))
display(list(dict_index.keys())[:4])
display(list(dict_index.values())[:4])
display(type(dict_index))

['UIC的办学性质是什么？', '学校现在有多少在校生？', 'UIC的全称是什么？']

['UIC的办学性质是什么？', '学校现在有多少在校生？', 'UIC的全称是什么？', '北师港浸大的全称是什么？']

[0, 1, 2, 3]

dict

In [39]:
q_index = []
for each in ori_list:
    try:
        q_index.append(dict_index[each])
    except:
        q_index.append("NaN")
        print("This sentence can not found in list:\n",each)
q_index[:5]

[0, 1, 3, 4, 6]

In [40]:
def test_TF_IDF(test_path, data_file_path, stopword_file_path, theresthold):
    # read the file
    ori_list,test_list, label_list = read_01_csv(test_path)
    
    # read the file
    question_list,answer_list = read_and_split_the_excel(data_file_path)
    
    # make sentence dict
    dict_index = dict(zip(question_list,range(len(question_list))))
    
    # make preprocessing for the ori_list
    test_q_index = []
    for each in ori_list:
        try:
            test_q_index.append(dict_index[each])
        except:
            test_q_index.append("NaN")
            print("This sentence can not found in list:\n",each)
    
    #这个是测试所对应的预期匹配问句的index,均是数据库出现的
    ##########################test_q_index
    
    # stop words list
    stop_words = obtain_stop_word(stopword_file_path)
    
    # make preprocessing for the question token list
    question_token_list = generate_question_t_list(question_list, stop_words)
    
    # start to make a test
    test_result_score_list =[]
    
    topk = 3
    
    # 对于数据集的每个问题进行查询
    for question_index, question in enumerate(test_list):
        # 对查询的问题进行处理
        query = cn_stop_word_rm(question, stop_words)
        
        # 得到问题（答案）所对应的行索引
        score = similarity_cn(question_token_list, query, all_score_without_rank=1)
        
        need_score_index = test_q_index[question_index]
        
        test_result_score_list.append(score[need_score_index])
#         if question_index > 20:
#             break
            
    test_ressult_list = np.array(test_result_score_list) > theresthold
    
    test_ressult_list = test_ressult_list+0
    
    return test_ressult_list,label_list

In [41]:
# confirm the test
test_path = "../input/01-uic-rm-dup/01_all_rm_dup.csv"
data_file_path = "../input/uic-cn-admission/CN_QA_dataset_all.xlsx"
stopword_file_path = '../input/english-and-chinese-stopwords/cn_stopwords.txt'
theresthold = 0.7

tf_idf_result,label_list = test_TF_IDF(test_path, data_file_path, stopword_file_path, theresthold)

In [42]:
performance(tf_idf_result,label_list)

Unnamed: 0,Class_type,F1-macro,Recall-macro,Precision-macro
0,0,0.70471,0.711265,0.699171
1,1,0.70471,0.711265,0.699171


array([[ 237,  185],
       [ 221, 1368]])

In [43]:
def test_BERT(test_path, data_file_path, stopword_file_path, theresthold, bert_model_name):
    # read the file
    ori_list,test_list, label_list = read_01_csv(test_path)
    
    # read the file
    question_list,answer_list = read_and_split_the_excel(data_file_path)
    
     # make sentence dict
    dict_index = dict(zip(question_list,range(len(question_list))))
    
    # make preprocessing for the ori_list
    test_q_index = []
    for each in ori_list:
        try:
            test_q_index.append(dict_index[each])
        except:
            test_q_index.append("NaN")
            print("This sentence can not found in list:\n",each)
    
    #这个是测试所对应的预期匹配问句的index,均是数据库出现的
    # score 直接去这个index就得到了需要的
    ##########################test_q_index
    
    # initial things
    tokenizer = BertTokenizer.from_pretrained(bert_model_name)
    model = BertModel.from_pretrained(bert_model_name)
    
    doc_vecs = transfer_all_q2v(question_list,tokenizer,model)
    
    # start to make a test
    test_result_score_list =[]
    
    # 这里开始循环
    for question_index,query in enumerate(test_list):
        query_vec = transfer_sentence_vector(query,tokenizer,model)
    
        # 匹配
        score = get_similar_q_id(query_vec,doc_vecs,tokenizer,model,all_score_without_rank=1)
        
        need_score_index = test_q_index[question_index]
        
        test_result_score_list.append(score[need_score_index])
    
#         if question_index > 5:
#             break
        
    test_ressult_list = np.array(test_result_score_list) > theresthold
    
    test_ressult_list = test_ressult_list+0
    
    return test_ressult_list,label_list

In [44]:
# confirm the test
test_path = "../input/01-uic-rm-dup/01_all_rm_dup.csv"
data_file_path = "../input/uic-cn-admission/CN_QA_dataset_all.xlsx"
stopword_file_path = '../input/english-and-chinese-stopwords/cn_stopwords.txt'
bert_model_name = 'bert-base-chinese'
theresthold = 0.99

BERT_ressult,label_list = test_BERT(test_path, data_file_path, stopword_file_path, theresthold, bert_model_name)

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [45]:
performance(BERT_ressult,label_list)

Unnamed: 0,Class_type,F1-macro,Recall-macro,Precision-macro
0,0,0.317304,0.624002,0.564266
1,1,0.317304,0.624002,0.564266


array([[ 457, 1350],
       [   1,  203]])