# Testing our Cross-Encoder Model

In [2]:
import numpy as np 
import pandas as pd 
import os 
from tqdm.auto import tqdm 
from transformers import (
    AdamW, 
    AutoConfig, 
    AutoModel, 
    AutoTokenizer, 
    get_linear_schedule_with_warmup
) 
import torch 
import torch.nn.functional as F 
import torch.nn as nn 
from torch.utils.data import Dataset, TensorDataset, DataLoader, RandomSampler, SequentialSampler, IterableDataset
import math 
import time 
import datetime 
import re

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


Moving 0 files to the new cache system


0it [00:00, ?it/s]

In [3]:
# Sanity check: count the entries in the directory that holds the document text files.
corpus_dir = "../storage/FGH_spec_ind_claim_triplet_v1.4.1s"
files = os.listdir(corpus_dir)
len(files)

2590216

In [4]:
# Read the labelling spreadsheet; keep only rows that carry a label and the
# columns used downstream, then drop rows with any missing value.
df = pd.read_excel("0919_라벨링세트_9주차_병합.xlsx")
df = df.loc[
    df["라벨링"].notnull(),
    ["쿼리 번호", "IPC 분류", "쿼리 문장", "후보 문장", "쿼리 문서 번호", "Positive 문서 번호", "라벨링"],
].dropna()

# Normalise the labels: 0.1 is rewritten to 1.0, the accepted values
# (0, 0.5, 0.8, 1.0) pass through unchanged, and anything else becomes None
# so that the row is removed by the dropna() below.
# NOTE(review): 0.1 -> 1.0 presumably corrects a data-entry slip — confirm with the labelling team.
labels = df["라벨링"].values
labels_fixed = [
    1.0 if label == 0.1 else (label if label in [0, 0.5, 0.8, 1.0] else None)
    for label in labels
]

df["라벨링"] = labels_fixed
df = df.dropna()

# np.unique returns the distinct query numbers in sorted order.
query_numbers = df["쿼리 번호"].values
unique_queries = np.unique(query_numbers)

In [5]:
# Split the unique query numbers 80% / 10% / remainder into train / validation / test.
# Splitting by query number keeps every row of one query inside a single split.
n_unique = len(unique_queries)
train_size = int(n_unique * 0.8)
val_size = int(n_unique * 0.1)
val_end = train_size + val_size

train_unique_queries = unique_queries[:train_size]
val_unique_queries = unique_queries[train_size:val_end]
test_unique_queries = unique_queries[val_end:]

In [6]:
# Per-split containers: query sentence, candidate sentence and gold label.
train_queries, train_candidates, train_labels = [], [], []
valid_queries, valid_candidates, valid_labels = [], [], []
test_queries, test_candidates, test_labels = [], [], []

# Document numbers are only recorded for the test split.
test_query_nums, test_candidate_nums = [], []

query_nums = df["쿼리 번호"].values
queries = df["쿼리 문장"].values
candidates = df["후보 문장"].values
labels = df["라벨링"].values
query_document_ids = df["쿼리 문서 번호"].values
candidate_document_ids = df["Positive 문서 번호"].values

# `x in ndarray` scans the whole array for every row; sets give the same
# membership answer with constant-time lookups.
train_ids = set(train_unique_queries)
val_ids = set(val_unique_queries)
test_ids = set(test_unique_queries)

for i in tqdm(range(len(queries)), position=0, leave=True):
    query_num = query_nums[i]
    if query_num in train_ids:
        train_queries.append(queries[i])
        train_candidates.append(candidates[i])
        train_labels.append(labels[i])
    elif query_num in val_ids:
        valid_queries.append(queries[i])
        valid_candidates.append(candidates[i])
        valid_labels.append(labels[i])
    elif query_num in test_ids:
        test_queries.append(queries[i])
        test_candidates.append(candidates[i])
        test_labels.append(labels[i])
        test_query_nums.append(query_document_ids[i])
        test_candidate_nums.append(candidate_document_ids[i])

  0%|          | 0/33077 [00:00<?, ?it/s]

In [12]:
# Tokenizer matching the pretrained checkpoint used by the ranker.
tokenizer = AutoTokenizer.from_pretrained("tanapatentlm/patentdeberta_large_spec_128_pwi")
# Prefer the GPU, but fall back to CPU so the notebook still runs on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# define model 
class WeightedLayerPooling(nn.Module):
    """Learnable weighted average over a range of hidden-state layers.

    Args:
        num_hidden_layers: Number of transformer layers; the pooled range
            assumes ``num_hidden_layers + 1`` hidden states are supplied.
        layer_start: Index of the first hidden state included in the average.
        layer_weights: Optional initial weights, one per pooled layer
            (``num_hidden_layers + 1 - layer_start`` values). When ``None``
            every pooled layer starts with weight 1.

    Raises:
        ValueError: If ``layer_weights`` does not have one value per pooled layer.
    """

    def __init__(self, num_hidden_layers, layer_start, layer_weights=None):
        super(WeightedLayerPooling, self).__init__()
        self.layer_start = layer_start
        self.num_hidden_layers = num_hidden_layers
        num_pooled = num_hidden_layers + 1 - layer_start
        if layer_weights is None:
            initial_weights = torch.ones(num_pooled, dtype=torch.float)
        else:
            # Previously this argument was accepted but silently ignored.
            initial_weights = torch.as_tensor(layer_weights, dtype=torch.float).detach().clone().reshape(-1)
            if initial_weights.numel() != num_pooled:
                raise ValueError(
                    f"layer_weights must contain {num_pooled} values, got {initial_weights.numel()}"
                )
        self.layer_weights = nn.Parameter(initial_weights)

    def forward(self, all_hidden_states):
        """Return the weighted average of the hidden states from ``layer_start`` onward.

        ``all_hidden_states`` is a sequence of same-shaped 3-D tensors (one per layer);
        the result has the shape of a single hidden state.
        """
        # Stack to (layers, ...) and drop the layers before layer_start.
        stacked = torch.stack(list(all_hidden_states), dim=0)[self.layer_start:]
        # Broadcast one weight per layer across the remaining three dimensions.
        weights = self.layer_weights.view(-1, 1, 1, 1)
        return (weights * stacked).sum(dim=0) / self.layer_weights.sum()

class SentenceRanker(nn.Module):
    """Cross-encoder that maps a tokenized (query, candidate) pair to one relevance score.

    The pretrained encoder's hidden states from layer index 6 onward are combined by
    ``WeightedLayerPooling``; the vector at the first token position is then projected
    to a single value by a linear layer.

    Args:
        plm: Name or path of the pretrained model to load.
    """

    def __init__(self, plm="tanapatentlm/patentdeberta_large_spec_128_pwi"):
        # The original line had a progress-bar fragment pasted into `self`, which was a syntax error.
        super(SentenceRanker, self).__init__()
        self.config = AutoConfig.from_pretrained(plm)
        # Set every dropout-related config field to 0 before building the encoder.
        self.config.hidden_dropout = 0
        self.config.hidden_dropout_prob = 0
        self.config.attention_dropout = 0
        self.config.attention_probs_dropout_prob = 0
        self.model = AutoModel.from_pretrained(plm, config=self.config)
        self.tokenizer = AutoTokenizer.from_pretrained(plm)
        self.weighted_layer_pooling = WeightedLayerPooling(self.config.num_hidden_layers, 6, None)
        self.fc = nn.Linear(self.config.hidden_size, 1)
        self._init_weights(self.fc)

    def _init_weights(self, module):
        """Initialise ``module`` in place, using ``config.initializer_range`` as the normal std.

        Linear: normal weights, zero bias. Embedding: normal weights, zeroed padding row.
        LayerNorm: zero bias, unit weight. Other module types are left untouched.
        """
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(self, input_ids, attn_masks):
        """Score a batch of encoded pairs.

        Args:
            input_ids: Token ids, passed to the encoder as its first positional argument.
            attn_masks: Attention mask, passed as the second positional argument.

        Returns:
            The output of the final linear layer: one value per row of the batch.
        """
        x = self.model(input_ids, attn_masks, output_hidden_states=True)
        x = self.weighted_layer_pooling(x.hidden_states)
        x = x[:, 0]  # keep only the first token position of each sequence
        x = self.fc(x)
        return x
    
    
# Load the weights from a previous checkpoint.
checkpoint_path = "M_DeBERTa_Cross_Encoder_0.033700802607199876.pt"
model = SentenceRanker()
# map_location remaps the stored tensors onto `device`, so a checkpoint saved
# on one GPU can also be loaded on another GPU index or on CPU.
checkpoint = torch.load(checkpoint_path, map_location=device)
model.load_state_dict(checkpoint)
model.to(device)
# Trailing semicolon suppresses the module repr that would otherwise be displayed.
model.eval();

Some weights of the model checkpoint at tanapatentlm/patentdeberta_large_spec_128_pwi were not used when initializing DebertaModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).





In [13]:
def split_sentences(doc_num, L=100, doc_dir="../storage/FGH_spec_ind_claim_triplet_v1.4.1s"):
    """Split one document's abstract, claims and description into candidate segments.

    The file ``<doc_dir>/<doc_num>.txt`` must contain the tags ``<TTL>``, ``<IPC>``,
    ``<ABST>``, ``<CLMS>`` and ``<DESC>`` in that order. Periods are converted to
    semicolons, then the abstract, claims and description are split on the word
    ``wherein`` and on runs of semicolons/newlines.

    Args:
        doc_num: Document number; used to build the file name.
        L: Only segments strictly longer than ``L`` characters are kept.
        doc_dir: Directory holding the document text files.

    Returns:
        The kept segments without duplicates, in order of first appearance
        (abstract, then claims, then description).

    Raises:
        FileNotFoundError: If the document file does not exist.
        ValueError: If one of the expected section tags cannot be found.
    """
    doc_path = os.path.join(doc_dir, str(doc_num) + ".txt")
    with open(doc_path, "r") as f:
        doc = f.read()
    positive_doc = doc.replace(".", ";")

    # Raw strings: "\s" inside a normal string literal is an invalid escape sequence.
    section_patterns = {
        "TTL": r"<TTL>([\s\S]*?)<IPC>",
        "IPC": r"<IPC>([\s\S]*?)<ABST>",
        "ABST": r"<ABST>([\s\S]*?)<CLMS>",
        "CLMS": r"<CLMS>([\s\S]*?)<DESC>",
        "DESC": r"<DESC>([\s\S]*)$",
    }
    sections = {}
    for tag, pattern in section_patterns.items():
        match = re.search(pattern, positive_doc)
        if match is None:
            # All five sections are required (title and IPC are validated but not split),
            # matching the documents the original implementation accepted.
            raise ValueError(f"document {doc_num}: could not locate the <{tag}> section")
        sections[tag] = match.group(1)

    kept_segments = []
    for tag in ("ABST", "CLMS", "DESC"):
        for segment in re.split(r"wherein|[;\n]+", sections[tag]):
            if len(segment) > L:
                kept_segments.append(segment)

    # dict.fromkeys de-duplicates while preserving first-seen order, so the result
    # is reproducible across runs (set ordering of strings is not).
    return list(dict.fromkeys(kept_segments))


In [14]:
# Keep each test query sentence once, in the order it first appears.
# np.unique sorts its output, so we take the first-occurrence indices it reports
# and re-sort those to recover the original order.
test_queries = np.asarray(test_queries)
first_occurrence = np.unique(test_queries, return_index=True)[1]
unique_test_queries = test_queries[np.sort(first_occurrence)]

In [30]:
# saved_tuples: one list per (query, document) of (query, candidate, gold label, score),
#               sorted by score in descending order.
# ranks:        1-based position of the best-scored candidate whose gold label is >= 0.8
#               (0 if no such candidate is in the pool).
saved_tuples = []
ranks = []

for test_query in tqdm(unique_test_queries, desc="Inference", position=0, leave=True):
    search_df = df[df["쿼리 문장"]==test_query]
    candidates = [str(c) for c in search_df["후보 문장"].values]
    candidate_labels = search_df["라벨링"].values.tolist()
    candidate_doc_num = np.unique(search_df["Positive 문서 번호"])

    # Ignore queries that have no candidate labelled at least 0.8.
    if not any(label >= 0.8 for label in candidate_labels):
        continue

    # Look gold labels up by sentence instead of by position: the pool is de-duplicated
    # below, so positional alignment breaks whenever a candidate sentence occurs twice.
    # A sentence labelled more than once keeps its highest label.
    gold_labels = {}
    for candidate, label in zip(candidates, candidate_labels):
        gold_labels[candidate] = max(label, gold_labels.get(candidate, label))

    # NOTE(review): the try block wraps the whole document loop, so a failure on one
    # document (e.g. a missing file) also skips the remaining documents of this query.
    # Kept as-is to leave the reported metrics comparable — confirm whether this is intended.
    try:
        for doc_num in candidate_doc_num:
            splitted_sentences = split_sentences(doc_num)

            # Labelled candidates first (so they are always in the pool), then the document's
            # own segments; dict.fromkeys drops duplicates and keeps first-seen order.
            all_candidates = list(dict.fromkeys(candidates + splitted_sentences))

            # Sentences absent from the gold dataset get label 0, regardless of their
            # similarity to the query.
            all_labels = [gold_labels.get(candidate, 0) for candidate in all_candidates]

            # Score one pair at a time without batching -> this part can be modified for potential speedups.
            sim_scores = []
            for candidate in all_candidates:
                encoded_input = tokenizer(test_query, candidate, max_length=256, truncation=True, padding="max_length", return_tensors="pt").to(device)
                input_ids = encoded_input["input_ids"]
                attn_masks = encoded_input["attention_mask"]
                with torch.no_grad():
                    output = model(input_ids, attn_masks)
                sim_scores.append(output.item())

            all_tuples = [
                (test_query, candidate, label, score)
                for candidate, label, score in zip(all_candidates, all_labels, sim_scores)
            ]
            # sorted() is stable, so tied scores keep their pool order.
            sorted_list = sorted(all_tuples, key=lambda t: t[3], reverse=True)

            rank = next(
                (position for position, entry in enumerate(sorted_list, start=1) if entry[2] >= 0.8),
                0,
            )

            ranks.append(rank)
            saved_tuples.append(sorted_list)
    except Exception as e:
        print(e)
        continue
        

Inference:   0%|          | 0/261 [00:00<?, ?it/s]

[Errno 2] No such file or directory: '../storage/FGH_spec_ind_claim_triplet_v1.4.1s/20040064166.txt'
[Errno 2] No such file or directory: '../storage/FGH_spec_ind_claim_triplet_v1.4.1s/20080200771.txt'
[Errno 2] No such file or directory: '../storage/FGH_spec_ind_claim_triplet_v1.4.1s/20030124221.txt'
[Errno 2] No such file or directory: '../storage/FGH_spec_ind_claim_triplet_v1.4.1s/20030124221.txt'


In [33]:
def reciprocal_ranks(ranks, cutoff):
    """Return 1/rank for each rank in (0, cutoff], and 0 for every other rank.

    A rank of 0 (the value recorded when no relevant candidate was found) contributes 0
    instead of raising ZeroDivisionError.
    """
    return [1 / r if 0 < r <= cutoff else 0 for r in ranks]


rr = reciprocal_ranks(ranks, 100)
rr_1000 = reciprocal_ranks(ranks, 1000)

print(f"average rank: {np.mean(ranks)} \n\n MRR@100: {np.mean(rr)} \n\n MRR@1000: {np.mean(rr_1000)}")

average rank: 14.0 

 MRR@100: 0.46898411543695884 

 MRR@1000: 0.4694485382038454


In [39]:
import pickle

# Persist the ranked tuples and the ranks so they can be analysed without re-running inference.
for out_path, payload in (
    ("saved_tuples_cross_encoder.pkl", saved_tuples),
    ("saved_ranks_cross_encoder.pkl", ranks),
):
    with open(out_path, "wb") as out_file:
        pickle.dump(payload, out_file)

In [41]:
# Spread of the ranks (population standard deviation, numpy's default ddof=0).
np.asarray(ranks).std()

27.065921424627362