In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
from datasets import list_metrics,load_metric
from sklearn.metrics import confusion_matrix

## Evaluation function

In [2]:
def performance(y_ture,y_pred):
    f1_metric = load_metric("f1")
    re_metric = load_metric("recall")
    pre_metric = load_metric("precision")
    type_c_int = list(set(np.concatenate([y_ture, y_pred])))
    type_c = [str(i) for i in type_c_int]
    
    f1_m_list = []
    re_m_list = []
    pre_m_list = []
    
    for i in type_c_int:
        bi_ture = list(y_ture == i)
        bi_pred = list(y_pred == i)
        f1_m_results = f1_metric.compute(predictions=bi_pred, references=bi_ture, average="macro")
        re_m_results = re_metric.compute(predictions=bi_pred, references=bi_ture, average="macro")
        pre_m_results = pre_metric.compute(predictions=bi_pred, references=bi_ture, average="macro")
        
        f1_m_list.append(f1_m_results["f1"])
        re_m_list.append(re_m_results["recall"])
        pre_m_list.append(pre_m_results["precision"])
        
    data = {'Class_type':type_c_int,'F1-macro':f1_m_list,'Recall-macro':re_m_list,'Precision-macro':pre_m_list}
    df = pd.DataFrame(data)
    display(df)
    
    
    z = confusion_matrix(y_ture, y_pred)
    x_lab = type_c

    fig = px.imshow(z, 
                    text_auto=True,
                    labels=dict(x="True label", y="Predicted label", color="times"),
                    x=x_lab,
                    y=x_lab)
#     fig.show()
    
    return z,fig

In [3]:
# testing
cf_matrix_test,figure_test = performance([1,3,1,2,2,1],[2,3,1,3,3,2])
figure_test.show()

Downloading builder script:   0%|          | 0.00/2.06k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/2.21k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Unnamed: 0,Class_type,F1-macro,Recall-macro,Precision-macro
0,1,0.625,0.666667,0.8
1,2,0.25,0.25,0.25
2,3,0.625,0.8,0.666667


# Loading the data for use

In [4]:
!pip install -U sentence-transformers
!pip install openpyxl

Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m334.8 kB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l- done
Building wheels for collected packages: sentence-transformers
  Building wheel for sentence-transformers (setup.py) ... [?25l- \ | done
[?25h  Created wheel for sentence-transformers: filename=sentence_transformers-2.2.2-py3-none-any.whl size=125938 sha256=c9dc03b515c9275ca900d7797a3acef298a5cafd8e96b4e06bef1a5bc5825fce
  Stored in directory: /root/.cache/pip/wheels/bf/06/fb/d59c1e5bd1dac7f6cf61ec0036cc3a10ab8fecaa6b2c3d3ee9
Successfully built sentence-transformers
Installing collected packages: sentence-transformers
Successfully installed sentence-transformers-2.2.2
[0m

In [5]:
from copy import deepcopy
from random import randint,shuffle

In [6]:
def read_and_split_the_excel(QA_path):
    """
    :func: 根据xlsx文件获取问题list和答案list（需要更新openyxl）
    :param path: 文件路径
    :return: 问题list，答案list
    """
    # 读取文件
    df1 = pd.read_excel(QA_path)
    # 分开
    question_list = df1.iloc[:,0].tolist()
    answer_list = df1.iloc[:,1].tolist()
    # 返回
    return question_list,answer_list

In [7]:
# 测试read_and_split_the_excel
question_list,answer_list = read_and_split_the_excel("../input/uic-cn-admission/CN_QA_dataset_all.xlsx")
display(question_list[:3])

['UIC的办学性质是什么？', '学校现在有多少在校生？', 'UIC的全称是什么？']

In [8]:
def read_and_split_the_01(zero_one_path):
    """
    :func: 根据xlsx文件获取原始list和测试list和label
    :param path: 文件路径
    :return: 问题list，答案list
    """
    # 读取文件
    df1 = pd.read_csv(zero_one_path)
    # 分开
    Sen1_list = df1.iloc[:,0].tolist()
    Sen2_list = df1.iloc[:,1].tolist()
    label_list = df1.iloc[:,2].tolist()
    # 返回
    return Sen1_list,Sen2_list,label_list

In [9]:
# 测试read_and_split_the_excel
Sen1_list, Sen2_list, label_list = read_and_split_the_01("../input/01-uic-rm-dup/01_all_rm_dup.csv")
display(Sen1_list[:3])
display(Sen2_list[:3])
display(label_list[:3])

['UIC的办学性质是什么？', '学校现在有多少在校生？', '北师港浸大的全称是什么？']

['UIC的业务性质是什么？', '目前有多少学生在学校就读？', '北师大香港浸会大学的全称是什么？']

[1, 1, 1]

In [10]:
def shuffle_without_repeated(list_):
    temp_list = deepcopy(list_)
    m = len(temp_list)
    m = m-1
    for i_current in range(m,1,-1):
        rest = i_current - 1
        i_replace = randint(0, rest)
#         print(i_current)
#         print(i_replace)
        temp_list[i_current], temp_list[i_replace] = temp_list[i_replace], temp_list[i_current]
    return temp_list
    
def obtain_shuffle_01(ori_list):
    shuffle_q_list = shuffle_without_repeated(ori_list)
    
    shuffle_label_list = [0]*len(shuffle_q_list)
    
    return ori_list,shuffle_q_list,shuffle_label_list

In [11]:
# Test the shuffle
question_list = ['The cat sits outside',
      'A man is playing guitar',
      'The new movie is awesome',
      'The new opera is nice']
obtain_shuffle_01(question_list)

(['The cat sits outside',
  'A man is playing guitar',
  'The new movie is awesome',
  'The new opera is nice'],
 ['The cat sits outside',
  'The new opera is nice',
  'A man is playing guitar',
  'The new movie is awesome'],
 [0, 0, 0, 0])

In [12]:
def read_qa_and_expand_training_set(QA_path, zero_one_path):
    # get the qa_data
    question_list,answer_list = read_and_split_the_excel(QA_path)
    # get the 01_data
    Sen1_list, Sen2_list, label_list = read_and_split_the_01(zero_one_path)
    # get expand 01 data
    ori_list,shuffle_q_list,shuffle_label_list = obtain_shuffle_01(question_list)
    Sen1_list.extend(ori_list)
    Sen2_list.extend(shuffle_q_list)
    label_list.extend(shuffle_label_list)
    
    # get the index of Sen1_list corresponding to the question_list
    question_list_index = range(len(question_list))
    question_index_dict = dict(zip(question_list,question_list_index))
    
    Sen1_list_index = []
    for i in Sen1_list:
        Sen1_list_index.append(question_index_dict[i])
        
#     print(Sen1_list_index)
    
    return question_list,answer_list,Sen1_list,Sen1_list_index,Sen2_list,label_list

In [13]:
# Test for getting all the QA data and the zero_one data
QA_path = "../input/uic-cn-admission/CN_QA_dataset_all.xlsx"
zero_one_path = "../input/01-uic-rm-dup/01_all_rm_dup.csv"
question_list,answer_list,Sen1_list,Sen1_list_index,Sen2_list,label_list = read_qa_and_expand_training_set(QA_path, zero_one_path)

# display the sample result
display(question_list[:3])
display(answer_list[:3])
display(Sen1_list[:3])
display(Sen1_list_index[:10])
display(Sen2_list[:3])
display(label_list[:3])

['UIC的办学性质是什么？', '学校现在有多少在校生？', 'UIC的全称是什么？']

['UIC是北京师范大学与香港浸会大学合作办学的一所博雅大学。',
 '截至2021年10月底，北师港浸大现有本科、硕士、博士在校生共8100余人。',
 'UIC的全称是北京师范大学-香港浸会大学联合国际学院。']

['UIC的办学性质是什么？', '学校现在有多少在校生？', '北师港浸大的全称是什么？']

[0, 1, 3, 4, 6, 9, 10, 11, 12, 13]

['UIC的业务性质是什么？', '目前有多少学生在学校就读？', '北师大香港浸会大学的全称是什么？']

[1, 1, 1]

In [14]:
import time
from tqdm import tqdm

# test
with tqdm(total=200) as pbar:
    pbar.set_description('Processing')
    # total表示总的项目, 循环的次数20*10(每次更新数目) = 200(total)
    for i in range(20):
        # 进行动作, 这里是过0.1s
        time.sleep(0.1)
        # 进行进度更新, 这里设置10个
        pbar.update(10)

Processing: 100%|██████████| 200/200 [00:02<00:00, 98.39it/s]


# TF-IDF-test

In [15]:
import jieba
import pandas as pd
import numpy as np
from gensim import corpora, models, similarities
from termcolor import colored

In [16]:
def read_and_split_the_excel(path):
    """
    :func: 根据xlsx文件获取问题list和答案list（需要更新openyxl）
    :param path: 文件路径
    :return: 问题list，答案list
    """
    # 读取文件
    df1 = pd.read_excel(path)
    # 分开
    question_list = df1.iloc[:,0].tolist()
    answer_list = df1.iloc[:,1].tolist()
    # 返回
    return question_list,answer_list

# # 测试read_and_split_the_excel
# question_list,answer_list = read_and_split_the_excel("./dataset/CN_QA_dataset_all.xlsx")
# print(question_list[:3])

# 导入停用词表
def obtain_stop_word(path):
    """
    :func: 获取stop_word
    :param path: 文件路径
    :return: 返回stop_word list
    """
    stop_words = [line.strip() for line in open(path).readlines()]
    stop_words.extend([""," "])
    return stop_words
# # obtain stop word 代码测试
# # 使用的是cn_stopwords，在kaggle搜索哈工大第一个
# path = './dataset/cn_stopwords.txt'
# stop_words = obtain_stop_word(path)
# print(stop_words[:25])

In [17]:
def cn_stop_word_rm(sentence, stop_words):
    """
    :func: 将输入的句子分词并且移除stopword，返回list
    :param stop_words: 需要移除的stopword（用的是cn_stopwords）
        eg:
                ['$', '0', '1', '2', '3', '4', '5', '6', '7', '8',
                '9', '?', '_', '“', '”', '、', '。','《', '》', '一',
                '一些', '一何', '一切', '一则', '一方面', '一旦', '一来']
    :param sentence: 句子
        eg:
            "今天我想摆烂，你能拿我咋办，摸鱼我说了算"
    :return: 返回分词后的token list
    """
    # split the sentence
    word_tokens = list(jieba.cut_for_search(sentence))

    # remove stop words
    query = [w.lower() for w in word_tokens if not w in stop_words]
    #     print(query)
    #     question_list[index] = ' '.join(line for line in query)
    return query

# # 测试cn_stop_word_rm
# sentence = "我们认为，一些关键问题就是所谓问题的关键，所以问题的关键在于我们如何把握关键问题，这个是我们任务的关键"
# path = './dataset/cn_stopwords.txt'
# stop_words = obtain_stop_word(path)
# query = cn_stop_word_rm(sentence,stop_words)
# print(query)

def generate_question_t_list(question_list, stop_words):
    """
    :func: 将输入的问句分词逐个转为token list

    :param question_list: 句子列表
        eg:
            ["今天我想摆烂"，
            "你能拿我咋办"，
            "摸鱼我说了算"]

    :param stop_words: 需要移除的stopword（用的是cn_stopwords）
        eg:
                ['$', '0', '1', '2', '3', '4', '5', '6', '7', '8',
                '9', '?', '_', '“', '”', '、', '。','《', '》', '一',
                '一些', '一何', '一切', '一则', '一方面', '一旦', '一来']

    :return: 返回question_list每句分词后的token list
    """

    # transfer the question list into the token form
    question_token_list = ['' for i in range(len(question_list))]

    for index in range(len(question_list)):
        # split the sentence
        question_token_list[index] = cn_stop_word_rm(question_list[index], stop_words)
    # return the token list
    return question_token_list

# # 测试generate_question_t_list
# question_list,answer_list = read_and_split_the_excel("./dataset/CN_QA_dataset_all.xlsx")
# path = './dataset/cn_stopwords.txt'
# stop_words = obtain_stop_word(path)
# question_token_list = generate_question_t_list(question_list, stop_words)
# print(question_token_list[:4])

In [18]:
def similarity_cn(query, dictionary, tfidf, corpus_tfidf, topk=3, threshold=0.7, all_score_without_rank=0):
    """
    :func: 计算问题与知识库中问题的相似度
    :param Corp: 分词后的问题
        eg:
                [['UIC', '学校', '办学', '性质'],
                 ['学校', '现在', '在校', '在校生'],
                 ['UIC', '全称'],
                 ['北师', '北师港', '浸大', '全称']]
    :param query: 分词后的问题
        eg:
                ['UIC', '全称', '名字']
    :return: 返回
        if_valid: 最匹配的答案的相似度是否超过了threshold
        max_loc: 前topk个最匹配的答案所在的index
        sims: 每一个问题与查找问题的相似度，依据index来的
    """

    #     print(corpus_tfidf)
    # # 得到TF-IDF值
    #     for temp in corpus_tfidf:
    #         print(temp)

    vec_bow = dictionary.doc2bow(query)
    vec_tfidf = tfidf[vec_bow]
    #     print(vec_tfidf)

    index = similarities.MatrixSimilarity(corpus_tfidf)
    #     print(index)

    sims = index[vec_tfidf]
    #     print(sims)

    if all_score_without_rank:
        return sims
    else:
        max_loc = np.argsort(sims)[::-1][:topk]
        #     print(np.argsort(sims)[::-1])

        #     top_max_sim = sims[max_loc]
        #     print(top_max_sim)

        # if the score is larger than the threshold
        if sims[max_loc[0]] < threshold:
            if_valid = 0
        else:
            if_valid = 1

        return if_valid, max_loc, sims



# # 测试，用query的问句去问系统，这里面返回前三个的相似度
# # generate question_t_list
# question_list,answer_list = read_and_split_the_excel("./dataset/CN_QA_dataset_all.xlsx")
# path = './dataset/cn_stopwords.txt'
# stop_words = obtain_stop_word(path)
# question_token_list = generate_question_t_list(question_list, stop_words)
#
# Corp = question_token_list
# query = ['uic', '全称', '名字']
# if_vaild, max_loc, top_max_sim = similarity_cn(Corp, query)
# print(if_vaild)
# print(max_loc)
# print(top_max_sim[:3])

def TF_IDF_prepared(data_file_path, stopword_file_path):
    # 准备TF-IDF回答过程中需要的材料
    # read the file
    question_list, answer_list = read_and_split_the_excel(data_file_path)

    # stop words list
    stop_words = obtain_stop_word(stopword_file_path)

    # generate question token list
    question_token_list = generate_question_t_list(question_list, stop_words)

    # 建立词典
    dictionary = corpora.Dictionary(question_token_list)

    # 基于词典，将分词列表集转换成稀疏向量集，即语料库
    corpus = [dictionary.doc2bow(text) for text in question_token_list]

    # 训练TF-IDF模型，传入语料库进行训练
    tfidf = models.TfidfModel(corpus)

    # 用训练好的TF-IDF模型处理被检索文本，即语料库
    corpus_tfidf = tfidf[corpus]

    return question_list, question_token_list, answer_list, stop_words,dictionary, tfidf, corpus_tfidf

def TF_IDF_reply(query, question_list, answer_list, stop_words, topk_TFIDF,threshold_TFIDF, dictionary, tfidf, corpus_tfidf):
    # 对查询的问题进行处理
    query_processed = cn_stop_word_rm(query, stop_words)

    # 得到问题（答案）所对应的行索引
    if_valid, topk_idx_TF, score_TF = similarity_cn(query_processed, dictionary, tfidf, corpus_tfidf, topk_TFIDF, threshold_TFIDF, all_score_without_rank=0)

    #回答答案，可以注释
#     print('top few questions(TFIDF: %d) similar to "%s"' % (topk_TFIDF, colored(query, 'green')))
#     print("The best similarity for TF-IDF is:", score_TF[topk_idx_TF[0]])

#     for idx in topk_idx_TF:
#         print('TF-IDF; %s\t%s' % (colored('%.4f' % score_TF[idx], 'cyan'), colored(question_list[idx], 'yellow')))

    # get the answer
#     if if_valid:
#         print(answer_list[topk_idx_TF[0]])

    return if_valid, topk_idx_TF, score_TF
# 下面为主函数
def TF_IDF_QA_test():
    # file storing the data
    print("数据准备中")
    data_file_path = "../input/uic-cn-admission/CN_QA_dataset_all.xlsx"
    stopword_file_path = '../input/english-and-chinese-stopwords/cn_stopwords.txt'
    # 显示前几个答案
    topk_TFIDF = 3
    # 超过多少相似度才算合格
    threshold_TFIDF = 0.7
    # data prepared
    question_list, question_token_list, answer_list, stop_words, dictionary, \
        tfidf, corpus_tfidf = TF_IDF_prepared(data_file_path, stopword_file_path)
    
    # prepared testing data
    question_list,answer_list,Sen1_list,Sen1_list_index,Sen2_list,label_list = read_qa_and_expand_training_set(QA_path, zero_one_path)
    
    print("准备完毕")
    
    # obtain the question
    
    predict_result = []
    
    with tqdm(total=len(Sen1_list)) as pbar:
        pbar.set_description('正在测试')
        
        for index, test_query in enumerate(Sen2_list):

            if_valid, topk_idx_TF, score_TF = TF_IDF_reply(test_query, question_list, answer_list, stop_words, topk_TFIDF,
                         threshold_TFIDF, dictionary, tfidf, corpus_tfidf)

            if topk_idx_TF[0] == Sen1_list_index[index]:
                prediction = 1
            else:
                prediction = 0
            
            predict_result.append(prediction)
            
#             print(question_list[topk_idx_TF[0]])
#             print(Sen2_list[index])
#             print(Sen1_list[index])
            
            pbar.update(1)
            
#     print(label_list[:10])
#     print(predict_result)

    cf_matrix_test,figure_test = performance(label_list,predict_result)
    figure_test.show()

In [19]:
TF_IDF_QA_test()

Building prefix dict from the default dictionary ...


数据准备中


Dumping model to file cache /tmp/jieba.cache
Loading model cost 1.031 seconds.
Prefix dict has been built successfully.


准备完毕


正在测试: 100%|██████████| 2968/2968 [07:14<00:00,  6.83it/s]


Unnamed: 0,Class_type,F1-macro,Recall-macro,Precision-macro
0,0,0.959484,0.96037,0.959061
1,1,0.959484,0.96037,0.959061
