# CE Sentence Ranker 설명 
- 유니크한 IPC 기준 8:1:1로 골드 데이터셋 스플릿 후 cross encoder 학습 
- cross encoder는 DeBERTa-Large 기반, Doc Ranker에서 전이학습을 했고 loss는 RMSE를 사용 
- 10 에포크 학습. Validation loss는 0.1-0.2 RMSE 정도
- 한개의 쿼리 문장당 100개 이상의 후보 문장들 존재 
- A100-80GB 한장 기준, 배치 사이즈 없이 하나씩 처리할때 cross encoder로 계산하는데 3초가 걸린다.  

In [12]:
import numpy as np 
import pandas as pd 
import os 
from tqdm.auto import tqdm 
from transformers import (
    AdamW, 
    AutoConfig, 
    AutoModel, 
    AutoTokenizer, 
    get_linear_schedule_with_warmup
) 
import torch 
import torch.nn.functional as F 
import torch.nn as nn 
from torch.utils.data import Dataset, TensorDataset, DataLoader, RandomSampler, RandomSampler, SequentialSampler, IterableDataset
import math 
import time 
import datetime 
import re

In [13]:
files = os.listdir("../storage/FGH_spec_ind_claim_triplet_v1.4.1s") 
len(files)

2590216

In [14]:
df = pd.read_excel("0919_라벨링세트_9주차_병합.xlsx")

df = df.loc[df["라벨링"].notnull(), ["쿼리 번호", "IPC 분류", "쿼리 문장", "후보 문장", "쿼리 문서 번호", "Positive 문서 번호", "라벨링"]] 
df = df.dropna() 
labels_fixed = [] 
labels = df["라벨링"].values 

for i in range(len(labels)): 
    if labels[i] == 0.1:
        labels_fixed.append(1.0) 
    elif labels[i] not in [0,0.5,0.8,1.0]: 
        labels_fixed.append(None) 
    else: 
        labels_fixed.append(labels[i]) 
        
df["라벨링"] = labels_fixed
df = df.dropna() 
ipc_types = df["IPC 분류"].values 
unique_ipcs = np.unique(ipc_types) 

train_size = int(len(unique_ipcs) * 0.8) 
val_size = int(len(unique_ipcs) * 0.1) 

train_unique_ipcs = unique_ipcs[:train_size] 
val_unique_ipcs = unique_ipcs[train_size:train_size+val_size] 
test_unique_ipcs = unique_ipcs[train_size+val_size:] 

In [15]:
df

Unnamed: 0,쿼리 번호,IPC 분류,쿼리 문장,후보 문장,쿼리 문서 번호,Positive 문서 번호,라벨링
0,166,F21V504,an adjustable lens positioned so as to alter ...,Several mechanisms for altering the beam prod...,20080259600,6474837,0.8
1,166,F21V504,an adjustable lens positioned so as to alter ...,When a plurality of aperture plates are incor...,20080259600,6474837,0.0
2,166,F21V504,an adjustable lens positioned so as to alter ...,"By deforming the base member 20 in Bailey, th...",20080259600,6474837,0.5
3,166,F21V504,an adjustable lens positioned so as to alter ...,Beam modifying optics are used to alter the f...,20080259600,6474837,0.8
4,166,F21V504,an adjustable lens positioned so as to alter ...,For an aperture plate with light refractive o...,20080259600,6474837,1.0
...,...,...,...,...,...,...,...
33695,4207,G06F017/60,using a death benefit value of the policy for ...,As the investment in the life insurance policy...,20100000000,20000000000,0.5
33696,4207,G06F017/60,using a death benefit value of the policy for ...,This database provides historical or anticipat...,20100000000,20000000000,0.0
33697,4207,G06F017/60,using a death benefit value of the policy for ...,3 shows the process for determining the monthl...,20100000000,20000000000,0.0
33698,4207,G06F017/60,using a death benefit value of the policy for ...,For example the lender may require additional ...,20100000000,20000000000,0.0


In [17]:
# make sure to only test with samples with at least one 0.8 or 1.0 score. 
train_queries, train_candidates, train_labels = [], [], [] 
valid_queries, valid_candidates, valid_labels = [], [], [] 
test_queries, test_candidates, test_labels = [], [], [] 
test_query_nums, test_candidate_nums = [], [] 

ipcs = df["IPC 분류"].values 
queries = df["쿼리 문장"].values 
candidates = df["후보 문장"].values 
labels = df["라벨링"].values
query_nums = df["쿼리 문서 번호"].values 
positive_nums = df["Positive 문서 번호"].values 

for i in tqdm(range(len(queries)), position=0, leave=True): 
    if ipcs[i] in train_unique_ipcs: 
        train_queries.append(queries[i]) 
        train_candidates.append(candidates[i]) 
        train_labels.append(labels[i]) 
    elif ipcs[i] in val_unique_ipcs: 
        valid_queries.append(queries[i]) 
        valid_candidates.append(candidates[i]) 
        valid_labels.append(labels[i]) 
    elif ipcs[i] in test_unique_ipcs: 
        test_queries.append(queries[i]) 
        test_candidates.append(candidates[i]) 
        test_labels.append(labels[i])  
        test_query_nums.append(query_nums[i]) 
        test_candidate_nums.append(positive_nums[i])  
        
        
# for inference, we only use test data. Train and valid wree used for training our cross encoder. 

  0%|          | 0/33077 [00:00<?, ?it/s]

In [105]:
df.columns

Index(['쿼리 번호', 'IPC 분류', '쿼리 문장', '후보 문장', '쿼리 문서 번호', 'Positive 문서 번호',
       '라벨링'],
      dtype='object')

In [124]:
'''
First, we will get all the unique test_queries 
Second, for each test query we get all the corresponding candidate sentences, candidate labels and we also get the candidate document number 
Get all sentences from the candidate document and then merge with the obtained candidate sentences. Give zero labels for all candidate sentences that are not in the labeled dataframe. 
Store array of tuples (query, candidate, actual score, predicted score) and sort in descending order based on predicted score 
'''
tokenizer = AutoTokenizer.from_pretrained("tanapatentlm/patentdeberta_large_spec_128_pwi")
device = torch.device("cuda")

# define model 
class SentenceRanker(nn.Module): 
    def __init__(self, plm="tanapatentlm/patentdeberta_large_spec_128_pwi"): 
        super(SentenceRanker, self).__init__() 
        self.config = AutoConfig.from_pretrained(plm)  
        self.net = AutoModel.from_pretrained(plm) 
        self.tokenizer = AutoTokenizer.from_pretrained(plm) 
        self.tokenizer.add_special_tokens({"additional_special_tokens":["[IPC]", "[TTL]", "[CLMS]", "[ABST]"]}) 
        self.net.resize_token_embeddings(len(self.tokenizer))
        self.dropout = nn.Dropout(0.1) 
        self.fc = nn.Linear(self.config.hidden_size, 1) 
        
    def mean_pooling(self, model_output, attention_mask): 
        token_embeddings = model_output[0] 
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float() 
        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
    
    def forward(self, input_ids, attention_mask): 
        x = self.net(input_ids, attention_mask) 
        x = self.mean_pooling(x, attention_mask) 
        x = self.dropout(x) 
        x = self.fc(x) 
        return x 

print("Loading Model...") 
model = SentenceRanker() 
checkpoint = torch.load("DeBERTa_Cross_Encoder.pt") 
model.load_state_dict(checkpoint) 
model.cuda() 
model.eval() 

def split_sentences(doc_num, L=100): # L is the threshold for sentence length 
    with open("../storage/FGH_spec_ind_claim_triplet_v1.4.1s/" + str(doc_num) + ".txt", "r") as f: 
        doc = f.read() 
    positive_doc = doc.replace(".",";") 
    p_ttl = re.search("<TTL>([\s\S]*?)<IPC>", positive_doc).group(1)
    p_ipc = re.search("<IPC>([\s\S]*?)<ABST>", positive_doc).group(1)
    p_abst = re.search("<ABST>([\s\S]*?)<CLMS>", positive_doc).group(1)
    p_clms = re.search("<CLMS>([\s\S]*?)<DESC>", positive_doc).group(1)
    p_desc = re.search("<DESC>([\s\S]*)$", positive_doc).group(1)
    splitted_positives = [] 
    for split in re.split(r"wherein|[;\n]+", p_abst.replace(".",";")):
        if len(split) > L:
            splitted_positives.append(split) 
    for split in re.split(r"wherein|[;\n]+", p_clms.replace(".",";")):
        if len(split) > L:
            splitted_positives.append(split) 
    for split in re.split(r"wherein|[;\n]+", p_desc.replace(".",";")):
        if len(split) > L:
            splitted_positives.append(split) 
    splitted_positives = list(set(splitted_positives))
    return splitted_positives 


test_queries = np.array(test_queries) 
_, idx = np.unique(test_queries, return_index=True) 
unique_test_queries = test_queries[np.sort(idx)] 

saved_tuples = [] # for later analysis 
ranks = [] 

for test_query in tqdm(unique_test_queries, desc="Inference", position=0, leave=True):
    search_df = df[df["쿼리 문장"]==test_query] 
    candidates = search_df["후보 문장"].values 
    candidate_labels = search_df["라벨링"].values 
    candidate_doc_num = np.unique(search_df["Positive 문서 번호"]) 
    
    splitted_sentences = [] 
    for doc_num in candidate_doc_num: 
        split_batch = split_sentences(doc_num) 
        splitted_sentences.extend(split_batch) 
        
    # splitted_sentences = split_sentences(candidate_doc_num) 
    all_candidates = np.array(candidates.tolist() + splitted_sentences)
    _, idx = np.unique(all_candidates, return_index=True)
    all_candidates = all_candidates[np.sort(idx)]
    
    # give zero label for all sentences that are not in the gold dataset, regardless of whether they are really similar to the query or not. 
    all_labels = candidate_labels.tolist() + [0 for _ in range(len(all_candidates) - len(candidates))] 
    
    # inference 
    all_tuples = [] # query, candidate, gold dataset score, predicted score 
    ce_scores = [] 

    for i in range(len(all_candidates)):
        encoded_input = tokenizer(test_query, all_candidates[i], max_length=256, truncation=True, padding="max_length", return_tensors="pt").to(device) 
        input_ids = encoded_input["input_ids"] 
        attn_mask = encoded_input["attention_mask"] 
        with torch.no_grad(): 
            output = model(input_ids, attn_mask) 
        ce_scores.append(output.item()) 
    
    for i in range(len(all_candidates)): 
        all_tuples.append((test_query, 
                           all_candidates[i],
                           all_labels[i], 
                           ce_scores[i]))
    
    
    sorted_list = sorted(
        all_tuples,
        key=lambda t: t[3],
        reverse=True
    )

    rank = 0 
    for i in range(len(sorted_list)): 
        if sorted_list[i][2] >= 0.8: 
            rank = i+1
            break 
            
    ranks.append(rank)     
    saved_tuples.append(all_tuples) 


Loading Model...


Some weights of the model checkpoint at tanapatentlm/patentdeberta_large_spec_128_pwi were not used when initializing DebertaModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Inference:   0%|          | 0/196 [00:00<?, ?it/s]

FileNotFoundError: [Errno 2] No such file or directory: '../storage/FGH_spec_ind_claim_triplet_v1.4.1s/20090163784.txt'

In [127]:
for r in ranks: 
    if r <= 1000 and r > 0: 
        rr.append(1/r) 
    else:
        rr.append(0) 
        
np.mean(rr)

0.24421389252911227

In [128]:
s,cnt = 0,0  
for r in ranks: 
    if r > 0: 
        s += r 
        cnt += 1
        
s / cnt 

45.470588235294116