In [1]:
import numpy as np 
import pandas as pd
import pickle 
import os
import random 
import torch 
import torch.nn as nn 
from transformers import BertPreTrainedModel, BertModel, BertTokenizerFast, AdamW
from colbert.parameters import DEVICE 
from colbert.modeling.colbert import ColBERT
from tqdm import tqdm

In [2]:
test_collections = pd.read_csv("test_collections.tsv", sep="\t") 
test_collections.head(3)

Unnamed: 0,id,passages
0,0,A method for battery charger and diagnosis wi...
1,1,A non transitory computer readable medium car...
2,2,A method for measuring glutamyl transpeptidas...


In [3]:
test_collections.shape

(77886, 2)

In [4]:
small_test_queries = pd.read_csv("small_test_queries.tsv", sep="\t") 

small_test_queries

Unnamed: 0,id,queries
0,10,A sign language image input method used in a ...
1,11,canceled A method for quantitating the antibo...
2,12,A jacket seal comprising a pliable elastomeri...


In [5]:
with open('query_positive_dict.pkl', 'rb') as f: 
    loaded_dict = pickle.load(f) 


In [6]:
checkpoint = torch.load('experiments/dirty/train.py/2021-12-06_08.01.48/checkpoints/colbert-32000.dnn') 

colbert = ColBERT.from_pretrained("bert-base-uncased", 
                                  query_maxlen=512, 
                                  doc_maxlen=512, 
                                  dim=128,
                                  similarity_metric='cosine',
                                  mask_punctuation=True) 

colbert.load_state_dict(checkpoint['model_state_dict']) 

colbert = colbert.to(DEVICE) 

colbert.eval() 

Some weights of the model checkpoint at bert-base-uncased were not used when initializing ColBERT: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing ColBERT from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ColBERT from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ColBERT were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['linear.weight']
You should probably TRAI

ColBERT(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    

In [7]:
from colbert.modeling.inference import ModelInference

inference_bot = ModelInference(colbert=colbert)


In [8]:
def sort_tuple(tup): 
    tup.sort(key = lambda x: x[1]) 
    return tup[::-1] 

def calculate_score(Q, D): 
    score = Q@D.permute(0,2,1)
    score = score.max(1) 
    score = score.values.sum(-1).cpu() 
    return score 

# Example 1

쿼리 하나에 대한 inference 예시 

Using 20,000 candidate documents (out of ~80,000 total) 

In [9]:
queries = small_test_queries['queries'].values 
documents = test_collections['passages'].values 

In [17]:
# list of similar documents to query 1 
loaded_dict[queries[1]]

[11, 7265, 11730, 12266, 13031, 16549, 26519, 31740]

In [18]:
queries = small_test_queries['queries'].values 
documents = test_collections['passages'].values 
for i in range(1,2):  
    scores = [] # array of tuples, stores (index, score) information 
    q = [queries[i]] 
    q_embedding = inference_bot.queryFromText(q) 
    for j in tqdm(range(0,20000)): 
        d = [documents[j]] 
        d_embedding = inference_bot.docFromText(d) 
        score = calculate_score(q_embedding, d_embedding) 
        scores.append((j, score)) 
    
    ranks = sort_tuple(scores) 

100%|██████████| 20000/20000 [16:19<00:00, 20.42it/s] 


In [20]:
for i,tup in enumerate(ranks):
    if tup[0] in loaded_dict[queries[1]]: 
        print(i)

0
1
2
3
8
18764


유사한 문서가 제일 처음에 랭크된걸 볼 수 있다 (rank 0, 1/20000위). 

# Example 2

또 다른 쿼리 하나에 대한 inference 예시 

Using 1000 candidate documents (out of ~80,000 total) 

In [22]:
# list of similar documents to query 1 
loaded_dict[queries[0]]

[10]

In [23]:
for i in range(1):  
    scores = [] # array of tuples, stores (index, score) information 
    q = [queries[i]] 
    q_embedding = inference_bot.queryFromText(q) 
    for j in tqdm(range(0,1000)): 
        d = [documents[j]] 
        d_embedding = inference_bot.docFromText(d) 
        score = calculate_score(q_embedding, d_embedding) 
        scores.append((j, score)) 
    
    ranks = sort_tuple(scores) 

100%|██████████| 1000/1000 [00:49<00:00, 20.21it/s]


In [25]:
for i,tup in enumerate(ranks):
    if tup[0] in loaded_dict[queries[0]]: 
        print(i)  

2


1000개만 사용했을때 유사한 문서가 3/1000번째에 랭크된걸 볼 수 있다 (rank 2). 

Candidate document의 수를 10배로 늘리면 순위가 더 낮아지겠지만 어느정도로 변할까?

In [26]:
for i in range(1):  
    scores = [] # array of tuples, stores (index, score) information 
    q = [queries[i]] 
    q_embedding = inference_bot.queryFromText(q) 
    for j in tqdm(range(0,10000)): 
        d = [documents[j]] 
        d_embedding = inference_bot.docFromText(d) 
        score = calculate_score(q_embedding, d_embedding) 
        scores.append((j, score)) 
    
    ranks = sort_tuple(scores) 
    
for i,tup in enumerate(ranks):
    if tup[0] in loaded_dict[queries[0]]: 
        print(i)  

100%|██████████| 10000/10000 [08:11<00:00, 20.35it/s]


24


랭크 25/10000위에 등장한다 (rank 24). 

많은 샘플들을 테스트한건 아니지만 여러모로 이전 (데이터 10,000개정도로만 학습했을때)보다는 훨씬 나아보인다. 

물론 매번 쿼리가 들어왔을때마다 이런식으로 처리한다면 곤란하다. 20,000개정도의 candidate document로도 15분이 넘게 걸리니, 100만개의 문서가 db에 쌓여있다면 계산 시간이 너무 오래 걸려서 상용화하는게 불가능할것이다. 따라서 candidate document들의 임베딩을 offline으로 미리 뽑아두고, 인덱싱을 한후에 FAISS를 사용해서 임베딩끼리의 similarity score가 빨리 계산되도록 설계해주는게 필수적이다. 