In [1]:
!pip install -U sentence-transformers
!pip install openpyxl

Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l- done
Building wheels for collected packages: sentence-transformers
  Building wheel for sentence-transformers (setup.py) ... [?25l- \ | done
[?25h  Created wheel for sentence-transformers: filename=sentence_transformers-2.2.2-py3-none-any.whl size=125938 sha256=454b61dfa8de29619e98ce6c71508bf120016553e6c53c281774c881bef776fb
  Stored in directory: /root/.cache/pip/wheels/bf/06/fb/d59c1e5bd1dac7f6cf61ec0036cc3a10ab8fecaa6b2c3d3ee9
Successfully built sentence-transformers
Installing collected packages: sentence-transformers
Successfully installed sentence-transformers-2.2.2
[0m

# Directly use the Sentence bert model

In [2]:
from sentence_transformers import SentenceTransformer, util
import pandas as pd
from copy import deepcopy
from random import randint,shuffle
from termcolor import colored
# model = SentenceTransformer('embedding-data/deberta-sentence-transformer')
# ,device='cuda'

## Read the data

In [3]:
def read_and_split_the_excel(QA_path):
    """
    :func: 根据xlsx文件获取问题list和答案list（需要更新openyxl）
    :param path: 文件路径
    :return: 问题list，答案list
    """
    # 读取文件
    df1 = pd.read_excel(QA_path)
    # 分开
    question_list = df1.iloc[:,0].tolist()
    answer_list = df1.iloc[:,1].tolist()
    # 返回
    return question_list,answer_list

# 测试read_and_split_the_excel
question_list,answer_list = read_and_split_the_excel("../input/uic-cn-admission/CN_QA_dataset_all.xlsx")
display(question_list[:3])

['UIC的办学性质是什么？', '学校现在有多少在校生？', 'UIC的全称是什么？']

In [4]:
def read_and_split_the_01(zero_one_path):
    """
    :func: 根据xlsx文件获取原始list和测试list和label
    :param path: 文件路径
    :return: 问题list，答案list
    """
    # 读取文件
    df1 = pd.read_csv(zero_one_path)
    # 分开
    Sen1_list = df1.iloc[:,0].tolist()
    Sen2_list = df1.iloc[:,1].tolist()
    label_list = df1.iloc[:,2].tolist()
    # 返回
    return Sen1_list,Sen2_list,label_list

# 测试read_and_split_the_excel
Sen1_list, Sen2_list, label_list = read_and_split_the_01("../input/01-uic-rm-dup/01_all_rm_dup.csv")
display(Sen1_list[:3])
display(Sen2_list[:3])
display(label_list[:3])

['UIC的办学性质是什么？', '学校现在有多少在校生？', '北师港浸大的全称是什么？']

['UIC的业务性质是什么？', '目前有多少学生在学校就读？', '北师大香港浸会大学的全称是什么？']

[1, 1, 1]

In [5]:
def shuffle_without_repeated(list_):
    temp_list = deepcopy(list_)
    m = len(temp_list)
    m = m-1
    for i_current in range(m,1,-1):
        rest = i_current - 1
        i_replace = randint(0, rest)
#         print(i_current)
#         print(i_replace)
        temp_list[i_current], temp_list[i_replace] = temp_list[i_replace], temp_list[i_current]
    return temp_list
    
def obtain_shuffle_01(ori_list):
    shuffle_q_list = shuffle_without_repeated(ori_list)
    
    shuffle_label_list = [0]*len(shuffle_q_list)
    
    return ori_list,shuffle_q_list,shuffle_label_list

In [6]:
# Test the shuffle
question_list = ['The cat sits outside',
      'A man is playing guitar',
      'The new movie is awesome',
      'The new opera is nice']
obtain_shuffle_01(question_list)

(['The cat sits outside',
  'A man is playing guitar',
  'The new movie is awesome',
  'The new opera is nice'],
 ['The cat sits outside',
  'The new movie is awesome',
  'The new opera is nice',
  'A man is playing guitar'],
 [0, 0, 0, 0])

In [7]:
def read_qa_and_expand_training_set(QA_path, zero_one_path):
    # get the qa_data
    question_list,answer_list = read_and_split_the_excel(QA_path)
    # get the 01_data
    Sen1_list, Sen2_list, label_list = read_and_split_the_01(zero_one_path)
    # get expand 01 data
    ori_list,shuffle_q_list,shuffle_label_list = obtain_shuffle_01(question_list)
    Sen1_list.extend(ori_list)
    Sen2_list.extend(shuffle_q_list)
    label_list.extend(shuffle_label_list)
    
    Sen1_list
    Sen2_list
    
    return question_list,answer_list,Sen1_list,Sen2_list,label_list

## Use for the QA 

In [8]:
def SBERT_get_reply(model, query, question_list, answer_list, question_list_emb, topk_SBERT,threshold_SBERT):
    
    # prepared for queries
    queries = [query]
    query_embeddings = model.encode(queries,convert_to_tensor=True)
    if_valid = 0
        
    index_ranked = []
    tensor_scores = []
    
    
    # search the best answer
#     for query, query_embedding in zip(queries, query_embeddings):
    cosine_scores = util.pytorch_cos_sim(query_embeddings,question_list_emb)[0]
    results = zip(range(len(cosine_scores)), cosine_scores)
    # 第一个是按照score排序的index，第二个为对应的score但是是tensor格式
    results = sorted(results, key=lambda x: x[1],reverse=True)
    
        
    for index,tensor_score in results:
        index_ranked.append(index)
        tensor_scores.append(tensor_score)
        
    if  tensor_scores[0] > threshold_SBERT:
        if_valid = 1
    
    # 回答答案
    print('top few questions(TFIDF: %d) similar to "%s"' % (topk_SBERT, colored(query, 'green')))
    print("The best similarity for TF-IDF is:", tensor_scores[0])
    
    # 得到前几个的index
    topk_idx_SBERT = index_ranked[:topk_SBERT]
    
    for idx in topk_idx_SBERT:
        print('SBERT; %s\t%s' % (colored('%.4f' % tensor_scores[idx], 'cyan'), colored(question_list[idx], 'yellow')))
    
    if if_valid:
        print(answer_list[index_ranked[0]])
        
    return if_valid, topk_idx_SBERT, tensor_scores

In [9]:
def use_model_qa(model_path,QA_path):
    print("数据准备中")
    model = SentenceTransformer(model_path)
    
    topk_SBERT = 3
    threshold_SBERT = 0.6
    
    # data embedding
    question_list,answer_list = read_and_split_the_excel(QA_path)
    question_embeddings = model.encode(question_list,convert_to_tensor=True)
    print("准备完毕")
    while(1):
        # 获得问题
        query = input("请输入问题（输入quit退出）:")
        if query == "quit":
            break
        
    
        SBERT_get_reply(model, query, question_list, answer_list, question_embeddings, topk_SBERT, threshold_SBERT)

In [10]:
# # Test for all
# print("数据准备中")
# model = SentenceTransformer(model_path)

# topk_SBERT = 3
# threshold_SBERT = 0.7

# # data embedding
# question_list,answer_list = read_and_split_the_excel(QA_path)
# question_embeddings = model.encode(question_list,convert_to_tensor=True)
# print("准备完毕")
# # 获得问题
# q = "UIC是"
# SBERT_get_reply(model, q, question_list, answer_list, question_embeddings, topk_SBERT, threshold_SBERT)

In [11]:
model_path = 'embedding-data/deberta-sentence-transformer'
QA_path = "../input/uic-cn-admission/CN_QA_dataset_all.xlsx"
# use_model_qa(model_path,QA_path)

# Make fine-tune for the embedding model

In [12]:
#Define the model. Either from scratch of by loading a pre-trained model
model = SentenceTransformer('jamescalam/deberta-v3-base-qa',device='cuda')

Downloading:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.81k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/860 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/126 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/735M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/173 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/8.66M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/399 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

In [13]:
from sentence_transformers import SentenceTransformer, SentencesDataset, InputExample, evaluation, losses
from torch.utils.data import DataLoader

QA_path = "../input/uic-cn-admission/CN_QA_dataset_all.xlsx"
zero_one_path = "../input/01-uic-rm-dup/01_all_rm_dup.csv"

# perpared_data
question_list,answer_list,Sen1_list,Sen2_list,label_list = read_qa_and_expand_training_set(QA_path, zero_one_path)


train_size = int(len(question_list) * 0.8)
eval_size = len(question_list) - train_size

# Define your train examples.
train_data = []
for idx in range(train_size):
#     print(Sen1_list[idx])
#     print(Sen2_list[idx])
#     print(label_list[idx])

    train_data.append(InputExample(texts=[Sen1_list[idx], Sen2_list[idx]], label=float(label_list[idx])))

evaluator = evaluation.EmbeddingSimilarityEvaluator(Sen1_list,Sen2_list, label_list)


# Define your train dataset, the dataloader and the train loss
train_dataset = SentencesDataset(train_data, model)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=32)
train_loss = losses.CosineSimilarityLoss(model)

print("Ready for train")

Ready for train


In [14]:
# Tune the model
model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=100, warmup_steps=100, evaluator=evaluator, evaluation_steps=100, output_path='./deberta_fine_tune')

Epoch:   0%|          | 0/100 [00:00<?, ?it/s]

Iteration:   0%|          | 0/13 [00:00<?, ?it/s]

Iteration:   0%|          | 0/13 [00:00<?, ?it/s]

Iteration:   0%|          | 0/13 [00:00<?, ?it/s]

Iteration:   0%|          | 0/13 [00:00<?, ?it/s]

Iteration:   0%|          | 0/13 [00:00<?, ?it/s]

Iteration:   0%|          | 0/13 [00:00<?, ?it/s]

Iteration:   0%|          | 0/13 [00:00<?, ?it/s]

Iteration:   0%|          | 0/13 [00:00<?, ?it/s]

Iteration:   0%|          | 0/13 [00:00<?, ?it/s]

Iteration:   0%|          | 0/13 [00:00<?, ?it/s]

Iteration:   0%|          | 0/13 [00:00<?, ?it/s]

Iteration:   0%|          | 0/13 [00:00<?, ?it/s]

Iteration:   0%|          | 0/13 [00:00<?, ?it/s]

Iteration:   0%|          | 0/13 [00:00<?, ?it/s]

Iteration:   0%|          | 0/13 [00:00<?, ?it/s]

Iteration:   0%|          | 0/13 [00:00<?, ?it/s]

Iteration:   0%|          | 0/13 [00:00<?, ?it/s]

Iteration:   0%|          | 0/13 [00:00<?, ?it/s]

Iteration:   0%|          | 0/13 [00:00<?, ?it/s]

Iteration:   0%|          | 0/13 [00:00<?, ?it/s]

Iteration:   0%|          | 0/13 [00:00<?, ?it/s]

Iteration:   0%|          | 0/13 [00:00<?, ?it/s]

Iteration:   0%|          | 0/13 [00:00<?, ?it/s]

Iteration:   0%|          | 0/13 [00:00<?, ?it/s]

Iteration:   0%|          | 0/13 [00:00<?, ?it/s]

Iteration:   0%|          | 0/13 [00:00<?, ?it/s]

Iteration:   0%|          | 0/13 [00:00<?, ?it/s]

Iteration:   0%|          | 0/13 [00:00<?, ?it/s]

Iteration:   0%|          | 0/13 [00:00<?, ?it/s]

Iteration:   0%|          | 0/13 [00:00<?, ?it/s]

Iteration:   0%|          | 0/13 [00:00<?, ?it/s]

Iteration:   0%|          | 0/13 [00:00<?, ?it/s]

Iteration:   0%|          | 0/13 [00:00<?, ?it/s]

Iteration:   0%|          | 0/13 [00:00<?, ?it/s]

Iteration:   0%|          | 0/13 [00:00<?, ?it/s]

Iteration:   0%|          | 0/13 [00:00<?, ?it/s]

Iteration:   0%|          | 0/13 [00:00<?, ?it/s]

Iteration:   0%|          | 0/13 [00:00<?, ?it/s]

Iteration:   0%|          | 0/13 [00:00<?, ?it/s]

Iteration:   0%|          | 0/13 [00:00<?, ?it/s]

Iteration:   0%|          | 0/13 [00:00<?, ?it/s]

Iteration:   0%|          | 0/13 [00:00<?, ?it/s]

Iteration:   0%|          | 0/13 [00:00<?, ?it/s]

Iteration:   0%|          | 0/13 [00:00<?, ?it/s]

Iteration:   0%|          | 0/13 [00:00<?, ?it/s]

Iteration:   0%|          | 0/13 [00:00<?, ?it/s]

Iteration:   0%|          | 0/13 [00:00<?, ?it/s]

Iteration:   0%|          | 0/13 [00:00<?, ?it/s]

Iteration:   0%|          | 0/13 [00:00<?, ?it/s]

Iteration:   0%|          | 0/13 [00:00<?, ?it/s]

Iteration:   0%|          | 0/13 [00:00<?, ?it/s]

Iteration:   0%|          | 0/13 [00:00<?, ?it/s]

Iteration:   0%|          | 0/13 [00:00<?, ?it/s]

Iteration:   0%|          | 0/13 [00:00<?, ?it/s]

Iteration:   0%|          | 0/13 [00:00<?, ?it/s]

Iteration:   0%|          | 0/13 [00:00<?, ?it/s]

Iteration:   0%|          | 0/13 [00:00<?, ?it/s]

Iteration:   0%|          | 0/13 [00:00<?, ?it/s]

Iteration:   0%|          | 0/13 [00:00<?, ?it/s]

Iteration:   0%|          | 0/13 [00:00<?, ?it/s]

Iteration:   0%|          | 0/13 [00:00<?, ?it/s]

Iteration:   0%|          | 0/13 [00:00<?, ?it/s]

Iteration:   0%|          | 0/13 [00:00<?, ?it/s]

Iteration:   0%|          | 0/13 [00:00<?, ?it/s]

Iteration:   0%|          | 0/13 [00:00<?, ?it/s]

Iteration:   0%|          | 0/13 [00:00<?, ?it/s]

Iteration:   0%|          | 0/13 [00:00<?, ?it/s]

Iteration:   0%|          | 0/13 [00:00<?, ?it/s]

Iteration:   0%|          | 0/13 [00:00<?, ?it/s]

Iteration:   0%|          | 0/13 [00:00<?, ?it/s]

Iteration:   0%|          | 0/13 [00:00<?, ?it/s]

Iteration:   0%|          | 0/13 [00:00<?, ?it/s]

Iteration:   0%|          | 0/13 [00:00<?, ?it/s]

Iteration:   0%|          | 0/13 [00:00<?, ?it/s]

Iteration:   0%|          | 0/13 [00:00<?, ?it/s]

Iteration:   0%|          | 0/13 [00:00<?, ?it/s]

Iteration:   0%|          | 0/13 [00:00<?, ?it/s]

Iteration:   0%|          | 0/13 [00:00<?, ?it/s]

Iteration:   0%|          | 0/13 [00:00<?, ?it/s]

Iteration:   0%|          | 0/13 [00:00<?, ?it/s]

Iteration:   0%|          | 0/13 [00:00<?, ?it/s]

Iteration:   0%|          | 0/13 [00:00<?, ?it/s]

Iteration:   0%|          | 0/13 [00:00<?, ?it/s]

Iteration:   0%|          | 0/13 [00:00<?, ?it/s]

Iteration:   0%|          | 0/13 [00:00<?, ?it/s]

Iteration:   0%|          | 0/13 [00:00<?, ?it/s]

Iteration:   0%|          | 0/13 [00:00<?, ?it/s]

Iteration:   0%|          | 0/13 [00:00<?, ?it/s]

Iteration:   0%|          | 0/13 [00:00<?, ?it/s]

Iteration:   0%|          | 0/13 [00:00<?, ?it/s]

Iteration:   0%|          | 0/13 [00:00<?, ?it/s]

Iteration:   0%|          | 0/13 [00:00<?, ?it/s]

Iteration:   0%|          | 0/13 [00:00<?, ?it/s]

Iteration:   0%|          | 0/13 [00:00<?, ?it/s]

Iteration:   0%|          | 0/13 [00:00<?, ?it/s]

Iteration:   0%|          | 0/13 [00:00<?, ?it/s]

Iteration:   0%|          | 0/13 [00:00<?, ?it/s]

Iteration:   0%|          | 0/13 [00:00<?, ?it/s]

Iteration:   0%|          | 0/13 [00:00<?, ?it/s]

Iteration:   0%|          | 0/13 [00:00<?, ?it/s]