In [5]:
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
import torch.nn as nn
from torch.utils.data import Dataset, TensorDataset, DataLoader, RandomSampler, SequentialSampler, IterableDataset
from pytorch_metric_learning import miners, losses
from pytorch_metric_learning.distances import CosineSimilarity
import sys
from pathlib import Path
import shutil
import pytorch_lightning as pl
from pytorch_lightning.strategies.ddp import DDPStrategy
from pytorch_lightning.callbacks import BasePredictionWriter
from pytorch_lightning.core.saving import load_hparams_from_yaml, update_hparams
import torch
from torch.utils.data import Dataset, DataLoader
from typing import List
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import re
from transformers import (
    AdamW,
    AutoConfig,
    AutoModel,
    AutoTokenizer,
    get_linear_schedule_with_warmup,
)
import addict
import argparse

# for calculating MRR 
from scipy.spatial import distance 
from functools import partial 
from tqdm.contrib.concurrent import process_map 

In [6]:
class TripletData(Dataset):
    """Dataset of (query, positive, negative) text-file paths read from a CSV file.

    The first line of the CSV is treated as a header and skipped. Every
    following line must contain exactly three comma-separated document ids;
    each id is turned into the path ``"<root>/<id>.txt"``.

    Args:
        path: Location of the triplet CSV file.
        root: Directory that holds the ``<id>.txt`` documents. The default
            reproduces the location that used to be hard-coded.

    Lines that do not split into exactly three fields (blank lines, rows with
    extra columns, ...) are skipped rather than raising.
    """

    def __init__(self, path, root="../storage/kr_triplet_v1.1"):
        super().__init__()
        self.data = []
        with Path(path).open("r", encoding="utf8") as f:
            next(f, None)  # skip the header row (no-op on an empty file)
            for triplet in f:
                fields = triplet.strip().split(",")
                # An explicit length check replaces the former bare `except:`,
                # which also swallowed KeyboardInterrupt/SystemExit.
                if len(fields) != 3:
                    continue
                self.data.append(
                    ["{}/{}.txt".format(root, doc_id) for doc_id in fields]
                )

    def __getitem__(self, index):
        """Return ``[query_path, positive_path, negative_path]`` for ``index``."""
        return self.data[index]

    def __len__(self):
        """Number of well-formed triplets that were loaded."""
        return len(self.data)

In [7]:
class custom_collate(object):
    """Collate function turning triplets of text-file paths into one tokenized batch.

    Each item of ``batch`` is a ``(query_path, positive_path, negative_path)``
    triple. Every file is read as UTF-8 and tokenized to exactly
    ``chunk_size`` tokens (padded / truncated).

    Args:
        plm: Name or path of the pretrained tokenizer to load.
        chunk_size: Fixed token length of every encoded document.

    Calling the instance returns ``(input_ids, attn_masks, labels)`` where the
    rows are ordered ``q, p, n, q, p, n, ...``. For the k-th successfully
    loaded triplet the query and positive share label ``2*k`` and the negative
    gets ``2*k + 1``.
    """

    def __init__(self, plm="tanapatentlm/patent-ko-deberta", chunk_size=1024):
        self.tokenizer = AutoTokenizer.from_pretrained(plm)
        self.tokenizer.add_special_tokens({"additional_special_tokens": ["[IPC]", "[TTL]", "[CLMS]", "[ABST]"]})
        self.chunk_size = chunk_size

    def _encode_file(self, txt_path):
        """Read one UTF-8 text file and tokenize it to ``chunk_size`` tokens."""
        with Path(txt_path).open("r", encoding="utf8") as f:
            text = f.read()
        return self.tokenizer(
            text,
            return_tensors="pt",
            max_length=self.chunk_size,
            padding="max_length",
            truncation=True,
        )

    def __call__(self, batch):
        """Build the batch tensors.

        Triplets that cannot be read or tokenized are reported on stdout and
        skipped (best effort).

        Raises:
            RuntimeError: if no triplet of the batch could be loaded.
        """
        input_ids, attn_masks, labels = [], [], []
        ids = 0  # counts only the triplets that were loaded successfully
        for triplet in batch:
            try:
                query_txt, positive_txt, negative_txt = triplet
                encoded = [
                    self._encode_file(txt_path)
                    for txt_path in (query_txt, positive_txt, negative_txt)
                ]
                triplet_ids = [enc["input_ids"] for enc in encoded]
                triplet_masks = [enc["attention_mask"] for enc in encoded]
            except Exception as e:
                print(e)
                print("==="*100)
                continue

            # Nothing is appended until the whole triplet succeeded, so the
            # q/p/n row pattern is never broken by a partial failure.
            input_ids.extend(triplet_ids)
            attn_masks.extend(triplet_masks)
            labels.extend([ids*2, ids*2, ids*2+1])
            ids += 1

        if not input_ids:
            # torch.stack([]) would fail with a message that hides the cause.
            raise RuntimeError(
                "custom_collate: no triplet in the batch could be loaded and tokenized"
            )

        # each encoding has a leading dimension of 1 (return_tensors="pt" on a
        # single text); it is squeezed away after stacking
        input_ids = torch.stack(input_ids, dim=0).squeeze(dim=1)
        attn_masks = torch.stack(attn_masks, dim=0).squeeze(dim=1)
        labels = torch.tensor(labels, dtype=torch.long)
        return input_ids, attn_masks, labels

In [8]:
class NeuralRanker(pl.LightningModule):
    """Transformer encoder trained with a multi-similarity metric-learning loss.

    Documents are encoded with a pretrained ``AutoModel``; the attention-masked
    mean of the token embeddings is used as the document embedding.

    Args:
        hparams: Mapping of hyper-parameters merged into ``self.hparams``.
            The visible code reads ``lr``, ``weight_decay``, ``adam_epsilon``,
            ``warmup_steps`` and, optionally, ``additional_special_tokens``.
        plm: Name or path of the pretrained language model.
    """

    def __init__(self, hparams=None, plm="tanapatentlm/patent-ko-deberta"):
        super(NeuralRanker, self).__init__()
        # `None` instead of a shared mutable `dict()` default argument.
        if hparams is None:
            hparams = {}
        self.hparams.update(hparams)
        self.save_hyperparameters(ignore="hparams")
        self.tokenizer = AutoTokenizer.from_pretrained(plm)
        self.config = AutoConfig.from_pretrained(plm)
        self.metric = losses.MultiSimilarityLoss()
        self.miner = miners.MultiSimilarityMiner()
        self.net = AutoModel.from_pretrained(plm)
        if "additional_special_tokens" in self.hparams and self.hparams["additional_special_tokens"]:
            additional_special_tokens = self.hparams["additional_special_tokens"]
            self.tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})
            # keep the embedding matrix in sync with the enlarged vocabulary
            self.net.resize_token_embeddings(len(self.tokenizer))

    def mean_pooling(self, model_output, attention_mask):
        """Average the token embeddings over the positions where the mask is non-zero.

        ``model_output[0]`` is taken as the per-token embeddings. The
        denominator is clamped so a fully masked row does not divide by zero.
        """
        token_embeddings = model_output[0]
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

    def forward(self, input_ids, attention_mask):
        """Encode a batch and return one mean-pooled embedding per row."""
        model_output = self.net(input_ids, attention_mask)
        model_output = self.mean_pooling(model_output, attention_mask)
        return model_output

    def configure_optimizers(self):
        """AdamW plus a linear warmup/decay schedule that is stepped every batch."""
        optimizer = torch.optim.AdamW(self.parameters(),
                                      lr=float(self.hparams.lr),
                                      weight_decay=float(self.hparams.weight_decay),
                                      eps=float(self.hparams.adam_epsilon))
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=self.hparams.warmup_steps,
            num_training_steps=self.trainer.estimated_stepping_batches,
        )
        scheduler = {"scheduler": scheduler, "interval": "step", "frequency": 1}
        return [optimizer], [scheduler]

    def training_step(self, batch, batch_idx):
        """Mine hard pairs within the batch and return the multi-similarity loss."""
        input_ids, attn_masks, labels = batch
        embeddings = self(input_ids, attn_masks)
        hard_pairs = self.miner(embeddings, labels)
        loss = self.metric(embeddings, labels, hard_pairs)
        # `batch` is a 3-tuple, so len(batch) is always 3; the number of rows
        # in the batch is the first dimension of the input tensor.
        self.log("train_loss", loss, batch_size=input_ids.size(0))
        return {"loss": loss}

    def validation_step(self, batch, batch_idx):
        """Compute the multi-similarity loss over all pairs (no mining)."""
        input_ids, attn_masks, labels = batch
        embeddings = self(input_ids, attn_masks)
        loss = self.metric(embeddings, labels)
        self.log("val_loss", loss, batch_size=input_ids.size(0))
        return {"val_loss": loss}

    def validation_epoch_end(self, outputs):
        """Print the mean validation loss of the epoch."""
        if not outputs:
            # torch.stack raises on an empty list (e.g. no validation batches)
            return
        avg_loss = torch.stack([x["val_loss"] for x in outputs]).mean()
        print(f"\nEpoch {self.current_epoch} | avg_loss: {avg_loss}\n")

    def predict_step(self, batch, batch_idx: int, dataloader_idx: int=0):
        """Embed the query side of a prediction batch.

        Expects ``batch["q"]`` to provide ``"input_ids"`` and
        ``"attention_mask"``. NOTE(review): this differs from the tuple
        batches that the training/validation steps unpack; confirm which
        collate function feeds prediction.
        """
        q_input_ids, q_attn_masks = batch["q"]["input_ids"], batch["q"]["attention_mask"]
        q_emb = self(q_input_ids, q_attn_masks)
        return q_emb

In [9]:
# Read the test triplet CSV into a DataFrame and preview its first rows.
test_triplets = pd.read_csv("kr_triplet_test.csv") 

test_triplets.head()

Unnamed: 0,queries,positives,negatives
0,1020130067084,1020130008422,1013294870000
1,1020130065171,1020120081496,1020200140972
2,1020130065171,1011789270000,1020140091466
3,1020130065755,1020130033621,1020150115926
4,1020130062194,1020130028620,1021316260000


In [11]:
# --- Evaluation data ------------------------------------------------------
# batch_size=1 means every batch holds exactly one (query, positive, negative) triplet.
test_set = TripletData("kr_triplet_test.csv")
collate = custom_collate()
test_dataloader = DataLoader(test_set, batch_size=1, collate_fn=collate, shuffle=False)

# --- Hyper-parameters -----------------------------------------------------
# An empty argv is parsed explicitly, so only the defaults declared here apply.
parser = argparse.ArgumentParser()
parser.add_argument("--setting", "-s", type=str, default="default.yaml", help="experiment setting")
args = parser.parse_args(args=[])
hparams = addict.Addict(dict(load_hparams_from_yaml(args.setting)))

# --- Model + trained weights ------------------------------------------------
model = NeuralRanker(hparams)
model_pt_path = Path("KR_Ranker_epoch_end_checkpoints-epoch=02-val_loss=0.26689678.ckpt")
# Fall back to the CPU when no GPU is visible instead of failing outright.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# The checkpoint is deserialized on the CPU: load_state_dict copies the values
# into the model's own parameters, so mapping the whole checkpoint onto the
# GPU first would only occupy device memory for as long as `checkpoint` lives.
checkpoint = torch.load(model_pt_path, map_location="cpu")
loaded_dict = checkpoint["state_dict"]
print(model.load_state_dict(loaded_dict))
model.eval()
model.freeze()

Some weights of the model checkpoint at tanapatentlm/patent-ko-deberta were not used when initializing DebertaV2Model: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


<All keys matched successfully>


In [7]:
# Move the model's parameters to the GPU.
model.cuda() 
# Presumably here so the cell ends with a statement instead of the module
# object returned by model.cuda(), which the notebook would otherwise echo.
print() 




In [10]:
# Embed every test triplet with the frozen model.
# The inputs follow whatever device the model currently lives on, so this cell
# no longer fails with a device mismatch when the model was left on the CPU.
device = next(model.parameters()).device

queries, positives, negatives = [], [], []

for batch in tqdm(test_dataloader, position=0, leave=True):
    input_ids, attn_masks, labels = batch
    input_ids = input_ids.to(device)
    attn_masks = attn_masks.to(device)

    with torch.no_grad():
        output = model(input_ids, attn_masks)

    # custom_collate orders the rows q, p, n, q, p, n, ...; striding by 3
    # keeps every triplet of the batch, not only the first one, so the cell
    # stays correct if the DataLoader's batch_size is raised above 1.
    # `extend` adds one 1-D embedding per triplet, as before.
    queries.extend(output[0::3])
    positives.extend(output[1::3])
    negatives.extend(output[2::3])

  0%|          | 0/4213 [00:00<?, ?it/s]

In [11]:
# Sanity check: the three embedding lists should have the same length.
len(queries), len(positives), len(negatives)

(4213, 4213, 4213)

In [19]:
import pickle


def _stack_embeddings(vectors, desc):
    """Convert a list of embedding tensors into one 2-D numpy array.

    Each tensor is detached, moved to the CPU and reshaped to
    ``(-1, embedding_dim)``, where ``embedding_dim`` is the tensor's own last
    dimension (previously hard-coded as 2048). The pieces are then
    concatenated along axis 0.
    """
    rows = [
        v.detach().cpu().numpy().copy().reshape((-1, v.shape[-1]))
        for v in tqdm(vectors, desc=desc)
    ]
    return np.concatenate(rows, axis=0)


q_v = _stack_embeddings(queries, "queries")
p_v = _stack_embeddings(positives, "positives")
n_v = _stack_embeddings(negatives, "negatives")

print(q_v.shape, p_v.shape, n_v.shape)

# The ranking step looks up the positive of query i at candidate row i, so the
# three arrays must stay row-aligned.
assert len(q_v) == len(p_v) == len(n_v), "query/positive/negative rows are misaligned"

# Candidate pool: all positives first (row i belongs to query i), then all negatives.
candidate = np.concatenate([p_v, n_v], axis=0)

print(candidate.shape)

embeddings = {
    "query": q_v,
    "candidate": candidate
}

print("saving embeddings...")
with open("kr_embeddings.pkl", "wb") as f:
    pickle.dump(embeddings, f)

  0%|          | 0/4213 [00:00<?, ?it/s]

  0%|          | 0/4213 [00:00<?, ?it/s]

  0%|          | 0/4213 [00:00<?, ?it/s]

(4213, 2048) (4213, 2048) (4213, 2048)
(8426, 2048)
saving embeddings...


In [30]:
import os 
# Sets the TOKENIZERS_PARALLELISM environment variable for this process (and
# for any child process started afterwards).
# NOTE(review): presumably meant to silence the Hugging Face tokenizers
# warning about forking after parallelism was used; confirm that "true" is
# the intended value here.
os.environ["TOKENIZERS_PARALLELISM"] = "true" 

def get_rank(inp, candidate):
    """Rank all candidates for one query by cosine distance.

    Args:
        inp: ``(i, q)`` pair, where ``q`` is the 1-D query vector and ``i`` is
            the row index in ``candidate`` of the document whose rank is wanted.
        candidate: 2-D array with one candidate vector per row.

    Returns:
        ``(top_index, target_rank)``: the row index of the closest candidate,
        and the 1-based position of row ``i`` in the distance ordering.
    """
    i, q = inp
    # cdist only reads its inputs, so the candidate matrix is passed as-is
    # instead of being copied in full for every single query.
    distances = distance.cdist([q], candidate, "cosine")[0]
    rank = np.argsort(distances)
    return rank[0], np.where(rank == i)[0][0] + 1

# Rank every query against the full candidate pool in worker processes.
MAX_WORKERS = 32
# With the default chunksize of 1 each query is dispatched as its own task,
# and the partial (which carries the whole `candidate` matrix) is serialized
# again for every task. Sending the queries in chunks amortizes that cost;
# about four chunks per worker keeps the load balanced.
RANK_CHUNKSIZE = max(1, len(q_v) // (MAX_WORKERS * 4))

ranks = process_map(partial(get_rank, candidate=candidate),
                    enumerate(q_v),
                    total=len(q_v),
                    max_workers=MAX_WORKERS,
                    chunksize=RANK_CHUNKSIZE)

# p1: index of the top-ranked candidate per query; rank: 1-based rank of each
# query's own positive.
p1, rank = zip(*ranks)


  0%|          | 0/4213 [00:00<?, ?it/s]

In [31]:
# Mean reciprocal rank with a cutoff: a positive ranked within the top 1000
# contributes 1/rank, anything ranked lower contributes 0.
rrank = []

for r in rank:
    rrank.append(1/r if r <= 1000 else 0)

print(f"MRR@1000: {np.mean(rrank)}")

MRR@1000: 0.0010815775511349701


In [33]:
# Mean 1-based rank of each query's positive within the candidate pool.
# NOTE(review): in the recorded run the mean rank is about 4031 with 8426
# candidates, i.e. close to what a random ordering would give. Verify that the
# checkpoint, tokenizer and triplet files belong together before trusting
# these metrics.
print(f"average rank: {np.mean(rank)}")

average rank: 4031.218609067173
