In [4]:
import pandas as pd
from tqdm import tqdm
from transformers import pipeline
from transformers import AutoModel, AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForSequenceClassification

In [5]:
def find_insert_text(str1, str2):
    """Return the words of ``str2`` that were inserted relative to ``str1``.

    Both strings are split on single spaces and ``str2`` is walked left to
    right with a cursor into ``str1``: a word equal to the word under the
    cursor is treated as shared (the cursor advances); every other word is
    treated as inserted.

    Words that follow the last matched word of ``str1`` are also reported as
    inserted. (Previously the scan stopped as soon as ``str1`` was exhausted,
    so text appended at the end of ``str2`` was silently dropped.)

    Args:
        str1: The base sentence.
        str2: The sentence expected to contain ``str1``'s words, in order,
            plus extra words.

    Returns:
        The inserted words in their original order, each followed by a single
        space (so a non-empty result ends with a trailing space). Returns an
        empty string when nothing was inserted.
    """
    str1_list = str1.split(' ')
    str2_list = str2.split(' ')
    cursor = 0  # position of the next not-yet-matched word of str1
    inserted = []
    for word in str2_list:
        if cursor < len(str1_list) and str1_list[cursor] == word:
            cursor += 1
        else:
            inserted.append(word)
    # Keep the historical "word + space" output format.
    return ''.join(word + ' ' for word in inserted)

In [9]:
# Which fine-tuned QQP classifier to evaluate: 't5' or 'deberta'.
model = 'deberta'
if model == 't5':
    # model_name is not read by any visible cell; the weights and tokenizer
    # are loaded from the local ./t5 directory.
    model_name = "PavanNeerudu/t5-base-finetuned-qqp"
    MODEL = AutoModelForSeq2SeqLM.from_pretrained("./t5")
    tokenizer = AutoTokenizer.from_pretrained("./t5")
    # device=0 is passed straight to the pipeline (first CUDA device).
    nlp = pipeline('text2text-generation', model=MODEL, tokenizer=tokenizer, device=0)
elif model == 'deberta':
    # model_name is not read by any visible cell; the weights and tokenizer
    # are loaded from the local ./deberta directory.
    model_name = "Tomor0720/deberta-large-finetuned-qqp"
    MODEL = AutoModelForSequenceClassification.from_pretrained("./deberta")
    tokenizer = AutoTokenizer.from_pretrained("./deberta")
    nlp = pipeline('text-classification', model=MODEL, tokenizer=tokenizer, device=0)
else:
    # Fail with a message that names the bad value instead of a bare Exception.
    raise ValueError(f"Unsupported model {model!r}; expected 't5' or 'deberta'")
df = pd.read_csv('./qqp_lego.tsv', sep='\t')
# Positional row number (0..n-1) stored as a column so a row can be located
# again after it has been pulled out of the frame.
df['index'] = range(len(df))

In [None]:
def t5_get_label(question1, question2, nlp):
    """Classify a question pair with a text2text pipeline.

    Args:
        question1: First question text.
        question2: Second question text.
        nlp: Callable taking the prompt string and returning a sequence whose
            first element is a mapping.

    Returns:
        The ``'generated_text'`` entry of the first result, or ``''`` when
        that key is absent.
    """
    # NOTE(review): there is no space between question1 and "question2: ".
    # Kept as-is; confirm it matches the prompt format the model was trained on.
    prompt = "qqp question1: " + question1 + "question2: " + question2
    first_result = nlp(prompt)[0]
    return first_result['generated_text'] if 'generated_text' in first_result else ''

def deberta_get_label(question1, question2, nlp):
    """Classify a question pair with a text-classification pipeline.

    The two questions are joined with a single space and sent as one string.

    Args:
        question1: First question text.
        question2: Second question text.
        nlp: Callable taking the joined string and returning a sequence whose
            first element is a mapping with a ``'label'`` key.

    Returns:
        ``'not_duplicate'`` for ``LABEL_0``, ``'duplicate'`` for ``LABEL_1``,
        and ``"ERROR"`` for any other label.
    """
    # NOTE(review): the pair is sent as one concatenated string rather than as
    # a text / text_pair input - confirm this matches how the model was tuned.
    readable_labels = {
        'LABEL_0': 'not_duplicate',
        'LABEL_1': 'duplicate',
    }
    prediction = nlp(question1 + " " + question2)
    return readable_labels.get(prediction[0]['label'], "ERROR")

def get_label(question1, question2, nlp):
    """Dispatch to the labelling helper that matches the global ``model``.

    Args:
        question1: First question text.
        question2: Second question text.
        nlp: Pipeline callable forwarded to the selected helper.

    Returns:
        The selected helper's result, or ``''`` when ``model`` is neither
        ``'t5'`` nor ``'deberta'``.
    """
    if model not in ('t5', 'deberta'):
        return ''
    labeler = t5_get_label if model == 't5' else deberta_get_label
    return labeler(question1, question2, nlp)

In [None]:
# Scan df for runs of consecutive rows that share the same text_a, then use
# the classifier to flag inconsistent predictions inside each run.
#
# Outputs (values of the 'index' column):
#   index_set       - rows that take part in an inconsistent right/wrong pair
#   alone_index_set - rows whose text_a run contains only that single row
index_set = set()
alone_index_set = set()

# Phase 1: split the frame into runs of consecutive rows with equal text_a.
# Collecting the runs first guarantees that the final run is evaluated too;
# a "process when text_a changes" loop never reaches it.
former_text = ''
groups = []
for index, row in df.iterrows():
    if groups and row['text_a'] == former_text:
        groups[-1].append(row)
    else:
        groups.append([row])
        former_text = row['text_a']

# Phase 2: evaluate each run with the classifier.
for row_list in tqdm(groups):
    if len(row_list) <= 1:
        # Record the lone row itself (not the first row of the next run).
        alone_index_set.add(int(row_list[0]['index']))
        continue

    # Rows predicted 'duplicate' go to wrong_row_list, all others to
    # right_row_list.
    right_row_list = []
    wrong_row_list = []
    for sub_row in row_list:
        if get_label(sub_row['text_a'], sub_row['text_b'], nlp) == 'duplicate':
            wrong_row_list.append(sub_row)
        else:
            right_row_list.append(sub_row)

    # The inserted text depends only on the row, so compute it once per row
    # instead of once per (right, wrong) pair.
    right_insert_texts = [find_insert_text(r['text_a'], r['text_b']) for r in right_row_list]
    wrong_insert_texts = [find_insert_text(w['text_a'], w['text_b']) for w in wrong_row_list]

    for i in range(len(right_row_list)):
        for j in range(len(wrong_row_list)):
            # Flag the pair when the two insertions are judged duplicates of
            # each other but the two full text_b sentences are not.
            if get_label(right_insert_texts[i], wrong_insert_texts[j], nlp) == 'duplicate':
                if get_label(right_row_list[i]['text_b'], wrong_row_list[j]['text_b'], nlp) != 'duplicate':
                    index_set.add(int(right_row_list[i]['index']))
                    index_set.add(int(wrong_row_list[j]['index']))

In [None]:
# Write the flagged rows to disk.
# index_set / alone_index_set hold values of the positional 'index' column,
# so they can be passed straight to .iloc. Sorting makes the output order
# deterministic (set iteration order is not).
# Selecting all rows in one .iloc call replaces the row-by-row
# DataFrame.append loop, whose return value was discarded (leaving the frames
# empty) and which no longer exists in pandas 2.x.
res_df = df.iloc[sorted(index_set)].reset_index(drop=True)
res_df.to_csv('result_' + model + '_meaning_likelihood.tsv', sep='\t')
alone_df = df.iloc[sorted(alone_index_set)].reset_index(drop=True)
alone_df.to_csv('alone_' + 'result_' + model + '_meaning_likelihood.tsv', sep='\t')