In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
from datasets import list_metrics,load_metric
from sklearn.metrics import confusion_matrix

## Evaluation function

In [2]:
def performance(y_ture,y_pred):
    f1_metric = load_metric("f1")
    re_metric = load_metric("recall")
    pre_metric = load_metric("precision")
    type_c_int = list(set(np.concatenate([y_ture, y_pred])))
    type_c = [str(i) for i in type_c_int]
    
    f1_m_list = []
    re_m_list = []
    pre_m_list = []
    
    for i in type_c_int:
        bi_ture = list(y_ture == i)
        bi_pred = list(y_pred == i)
        f1_m_results = f1_metric.compute(predictions=bi_pred, references=bi_ture, average="macro")
        re_m_results = re_metric.compute(predictions=bi_pred, references=bi_ture, average="macro")
        pre_m_results = pre_metric.compute(predictions=bi_pred, references=bi_ture, average="macro")
        
        f1_m_list.append(f1_m_results["f1"])
        re_m_list.append(re_m_results["recall"])
        pre_m_list.append(pre_m_results["precision"])
        
    data = {'Class_type':type_c_int,'F1-macro':f1_m_list,'Recall-macro':re_m_list,'Precision-macro':pre_m_list}
    df = pd.DataFrame(data)
    display(df)
    
    z = confusion_matrix(y_ture, y_pred)
    x_lab = type_c

    fig = px.imshow(z, 
                    text_auto=True,
                    labels=dict(y="True label", x="Predicted label", color="times"),
                    x=x_lab,
                    y=x_lab)
#     fig.show()
    
    return z,fig

In [3]:
# testing
cf_matrix_test,figure_test = performance([1,3,1,2,2,1],[2,3,1,3,3,2])
figure_test.show()

Downloading builder script:   0%|          | 0.00/2.06k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/2.21k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Unnamed: 0,Class_type,F1-macro,Recall-macro,Precision-macro
0,1,0.625,0.666667,0.8
1,2,0.25,0.25,0.25
2,3,0.625,0.8,0.666667


# Loading the data for use

In [4]:
!pip install -U sentence-transformers
!pip install openpyxl

Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m904.0 kB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l- done
Building wheels for collected packages: sentence-transformers
  Building wheel for sentence-transformers (setup.py) ... [?25l- \ | done
[?25h  Created wheel for sentence-transformers: filename=sentence_transformers-2.2.2-py3-none-any.whl size=125938 sha256=4e6439a5039dd147511225a522cf18afc4eb98ad2b6d9feeffce100ad9bf6340
  Stored in directory: /root/.cache/pip/wheels/bf/06/fb/d59c1e5bd1dac7f6cf61ec0036cc3a10ab8fecaa6b2c3d3ee9
Successfully built sentence-transformers
Installing collected packages: sentence-transformers
Successfully installed sentence-transformers-2.2.2
[0m

In [5]:
from copy import deepcopy
from random import randint,shuffle

In [6]:
def read_and_split_the_excel(QA_path):
    """
    :func: 根据xlsx文件获取问题list和答案list（需要更新openyxl）
    :param path: 文件路径
    :return: 问题list，答案list
    """
    # 读取文件
    df1 = pd.read_excel(QA_path)
    # 分开
    question_list = df1.iloc[:,0].tolist()
    answer_list = df1.iloc[:,1].tolist()
    # 返回
    return question_list,answer_list

In [7]:
# 测试read_and_split_the_excel
question_list,answer_list = read_and_split_the_excel("../input/uic-cn-admission/CN_QA_dataset_all.xlsx")
display(question_list[:3])

['UIC的办学性质是什么？', '学校现在有多少在校生？', 'UIC的全称是什么？']

In [8]:
def read_and_split_the_01(zero_one_path):
    """
    :func: 根据xlsx文件获取原始list和测试list和label
    :param path: 文件路径
    :return: 问题list，答案list
    """
    # 读取文件
    df1 = pd.read_csv(zero_one_path)
    # 分开
    Sen1_list = df1.iloc[:,0].tolist()
    Sen2_list = df1.iloc[:,1].tolist()
    label_list = df1.iloc[:,2].tolist()
    # 返回
    return Sen1_list,Sen2_list,label_list

In [9]:
# 测试read_and_split_the_excel
Sen1_list, Sen2_list, label_list = read_and_split_the_01("../input/01-uic-rm-dup/01_all_rm_dup.csv")
display(Sen1_list[:3])
display(Sen2_list[:3])
display(label_list[:3])

['UIC的办学性质是什么？', '学校现在有多少在校生？', '北师港浸大的全称是什么？']

['UIC的业务性质是什么？', '目前有多少学生在学校就读？', '北师大香港浸会大学的全称是什么？']

[1, 1, 1]

In [10]:
def shuffle_without_repeated(list_):
    temp_list = deepcopy(list_)
    m = len(temp_list)
    m = m-1
    for i_current in range(m,1,-1):
        rest = i_current - 1
        i_replace = randint(0, rest)
#         print(i_current)
#         print(i_replace)
        temp_list[i_current], temp_list[i_replace] = temp_list[i_replace], temp_list[i_current]
    return temp_list
    
def obtain_shuffle_01(ori_list):
    shuffle_q_list = shuffle_without_repeated(ori_list)
    
    shuffle_label_list = [0]*len(shuffle_q_list)
    
    return ori_list,shuffle_q_list,shuffle_label_list

In [11]:
# Test the shuffle
question_list = ['The cat sits outside',
      'A man is playing guitar',
      'The new movie is awesome',
      'The new opera is nice']
obtain_shuffle_01(question_list)

(['The cat sits outside',
  'A man is playing guitar',
  'The new movie is awesome',
  'The new opera is nice'],
 ['The new movie is awesome',
  'The new opera is nice',
  'The cat sits outside',
  'A man is playing guitar'],
 [0, 0, 0, 0])

In [12]:
def read_qa_and_expand_training_set(QA_path, zero_one_path):
    # get the qa_data
    question_list,answer_list = read_and_split_the_excel(QA_path)
    # get the 01_data
    Sen1_list, Sen2_list, label_list = read_and_split_the_01(zero_one_path)
    # get expand 01 data
    ori_list,shuffle_q_list,shuffle_label_list = obtain_shuffle_01(question_list)
    Sen1_list.extend(ori_list)
    Sen2_list.extend(shuffle_q_list)
    label_list.extend(shuffle_label_list)
    
    # get the index of Sen1_list corresponding to the question_list
    question_list_index = range(len(question_list))
    question_index_dict = dict(zip(question_list,question_list_index))
    
    Sen1_list_index = []
    for i in Sen1_list:
        Sen1_list_index.append(question_index_dict[i])
        
#     print(Sen1_list_index)
    
    return question_list,answer_list,Sen1_list,Sen1_list_index,Sen2_list,label_list

In [13]:
# # Test for getting all the QA data and the zero_one data
# QA_path = "../input/uic-cn-admission/CN_QA_dataset_all.xlsx"
# zero_one_path = "../input/01-uic-rm-dup/01_all_rm_dup.csv"
# question_list,answer_list,Sen1_list,Sen1_list_index,Sen2_list,label_list = read_qa_and_expand_training_set(QA_path, zero_one_path)

# # display the sample result
# display(question_list[:3])
# display(answer_list[:3])
# display(Sen1_list[:3])
# display(Sen1_list_index[:10])
# display(Sen2_list[:3])
# display(label_list[:3])

In [14]:
import time
from tqdm import tqdm

# test
with tqdm(total=200) as pbar:
    pbar.set_description('Processing')
    # total表示总的项目, 循环的次数20*10(每次更新数目) = 200(total)
    for i in range(20):
        # 进行动作, 这里是过0.1s
        time.sleep(0.1)
        # 进行进度更新, 这里设置10个
        pbar.update(10)

Processing: 100%|██████████| 200/200 [00:02<00:00, 98.69it/s]


# Word_embedding

In [15]:
import pandas as pd
import numpy as np
from termcolor import colored
from transformers import BertTokenizer, BertModel
import warnings
warnings.filterwarnings("ignore")

def transfer_sentence_vector(sentence,tokenizer,model):
    """
    :func: 把句子embedding成向量
    :param sentence: 句子
    :param tokenizer: 分词器
    :param model: 模型
    :return: 转成的向量
    """
    # generate question vector
    encoded_input = tokenizer(sentence, return_tensors='pt')
    output = model(**encoded_input)[1].detach().numpy()
    #第零个表示的是整个句子的信息
    return output.tolist()[0]

# # 测试
# tokenizer = BertTokenizer.from_pretrained('./bert_base_chinese')
# model = BertModel.from_pretrained('./bert_base_chinese')
# sentence = '学校现在有多少在校生？'
# print(transfer_sentence_vector(sentence,tokenizer,model))

def transfer_all_q2v(sentence_list,tokenizer,model):
    """
    :func: 把句子list都embedding成向量
    :param sentence: 句子的list
    :param tokenizer: 分词器
    :param model: 模型
    :return: 转成的向量list
    """
    doc_vecs=[]
    for sentence in sentence_list:
        doc_vecs.append(transfer_sentence_vector(sentence,tokenizer,model))
    doc_vecs = np.array(doc_vecs)
    return doc_vecs

# # 测试
# tokenizer = BertTokenizer.from_pretrained('./bert_base_chinese')
# model = BertModel.from_pretrained('./bert_base_chinese')
# question_list,answer_list = read_and_split_the_excel("./dataset/CN_QA_dataset_all.xlsx")
# doc_vecs = transfer_all_q2v(question_list,tokenizer,model)
# print(doc_vecs)


def get_similar_q_id(query_vec, doc_vecs, tokenizer, model, topk=5, threshold=0.95, all_score_without_rank=0):
    """
    :func: 通过cosine similarity找到相似句子
    :param sentence: 转为向量的句子
    :param doc_vecs: 已经转换为向量的句子列表
    :param topk: 显示前topk个最相似的句子
    :param threshold: 认为是匹配的问句的有效阈值
    :param tokenizer: 分词器
    :param model: 模型
    :return: 是否达到要求，返回满足阈值要求的问题所在行索引——对应答案所在的行索引的np.array，相似度分数
    """
    # compute normalized dot product as score
    score = np.sum(query_vec * doc_vecs, axis=1) / np.linalg.norm(doc_vecs, axis=1) / np.linalg.norm(query_vec)

    if all_score_without_rank:
        return score
    else:
        # get the top "topk" score's id
        topk_idx = np.argsort(score)[::-1][:topk]

        # if the score is larger than the threshold
        if score[topk_idx[0]] < threshold:
            if_vaild = 0
        else:
            if_vaild = 1

        return if_vaild, topk_idx, score

# 测试
# tokenizer = BertTokenizer.from_pretrained('./bert_base_chinese')
# model = BertModel.from_pretrained('./bert_base_chinese')
# question_list,answer_list = read_and_split_the_excel("./dataset/CN_QA_dataset_all.xlsx")
# sentence = '学校现在有多少在校生？'
# sentence_vec = transfer_sentence_vector(sentence,tokenizer,model)
# topk = 5
# threshold = 0.95
#
# doc_vecs = transfer_all_q2v(question_list,tokenizer,model)
# if_vaild, topk_idx, score = get_similar_q_id(sentence_vec,doc_vecs,tokenizer,model,topk,threshold)
# print(if_vaild)
# print(topk_idx)
# print(score[:5])

def Bert_em_prepared(data_path, model_path):
    # initial things
    tokenizer = BertTokenizer.from_pretrained(model_path)
    model = BertModel.from_pretrained(model_path)
    question_list, answer_list = read_and_split_the_excel(data_path)
    doc_vecs = transfer_all_q2v(question_list, tokenizer, model)

    return tokenizer, model, question_list,answer_list, doc_vecs

def Bert_em_reply(query,tokenizer, model, question_list,answer_list, doc_vecs, topk=5, threshold=0.95):

    query_vec = transfer_sentence_vector(query, tokenizer, model)
#     print(query_vec)
    # 匹配
    if_vaild, Bert_emb_topk_idx, Bert_emb_each_score = get_similar_q_id(query_vec, doc_vecs, tokenizer, model, topk, threshold)

#     # 返回最相似的问题
#     print('top %d questions similar to "%s"' % (topk, colored(query, 'green')))
#     for idx in Bert_emb_topk_idx:
#         print('&gt; %s\t%s' % (colored('%.4f' % Bert_emb_each_score[idx], 'cyan'), colored(question_list[idx], 'yellow')))
#     print("The best similarity is:", Bert_emb_each_score[Bert_emb_topk_idx[0]])

#     # get the answer
#     if if_vaild:
#         print(answer_list[Bert_emb_topk_idx[0]])
    return if_vaild, Bert_emb_topk_idx, Bert_emb_each_score

def Bert_em_QA_test(model_path):
    # preparing
    print("数据准备中")

    data_path = "../input/uic-cn-admission/CN_QA_dataset_all.xlsx"

    
#     model_path = './distilbert_base_multilingual_cased'

    zero_one_path = "../input/01-uic-rm-dup/01_all_rm_dup.csv"

    tokenizer, model, question_list, answer_list, doc_vecs = Bert_em_prepared(data_path, model_path)

    Bert_emb_topk = 5
    Bert_emb_threshold = 0.95
    
    # prepared testing data
    question_list,answer_list,Sen1_list,Sen1_list_index,Sen2_list,label_list = read_qa_and_expand_training_set(data_path, zero_one_path)
    
    print("准备完毕")
    
    predict_result = []
    
    # get the query
    
    with tqdm(total=len(Sen1_list)) as pbar:
        pbar.set_description('正在测试')
        
        for index, test_query in enumerate(Sen2_list):

            # get the reply
            if_vaild, Bert_emb_topk_idx, Bert_emb_each_score = Bert_em_reply(test_query, tokenizer, model, question_list, answer_list, doc_vecs, Bert_emb_topk, Bert_emb_threshold)
            
            if Bert_emb_topk_idx[0] == Sen1_list_index[index]:
                prediction = 1
            else:
                prediction = 0
            
            predict_result.append(prediction)
            
#             print(question_list[topk_idx_TF[0]])
#             print(Sen2_list[index])
#             print(Sen1_list[index])
            
            pbar.update(1)
                
#     print(label_list[:10])
#     print(predict_result)

    cf_matrix_test,figure_test = performance(label_list,predict_result)
    figure_test.show()

In [16]:
model_path = 'bert-base-chinese'
Bert_em_QA_test(model_path)

数据准备中


Downloading:   0%|          | 0.00/107k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/624 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/393M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


准备完毕


正在测试: 100%|██████████| 2968/2968 [03:34<00:00, 13.81it/s]


Unnamed: 0,Class_type,F1-macro,Recall-macro,Precision-macro
0,0,0.52783,0.609717,0.761411
1,1,0.52783,0.609717,0.761411


In [17]:
model_path = 'clue/roberta_chinese_base'
Bert_em_QA_test(model_path)

数据准备中


Downloading:   0%|          | 0.00/107k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/621 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RobertaTokenizer'. 
The class this function is called from is 'BertTokenizer'.
You are using a model of type roberta to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.


Downloading:   0%|          | 0.00/393M [00:00<?, ?B/s]

Some weights of the model checkpoint at clue/roberta_chinese_base were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


准备完毕


正在测试: 100%|██████████| 2968/2968 [03:35<00:00, 13.79it/s]


Unnamed: 0,Class_type,F1-macro,Recall-macro,Precision-macro
0,0,0.854406,0.862663,0.876586
1,1,0.854406,0.862663,0.876586
