In [1]:
import faiss
import numpy as np
import torch
from transformers import DPRContextEncoderTokenizer
from transformers import DPRContextEncoder
from transformers import DPRQuestionEncoder, DPRQuestionEncoderTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def read_split(filename):
    """Read a UTF-8 text file and return its non-blank lines.

    The file content is split on '\n'; each piece is stripped of leading and
    trailing whitespace, and pieces that are empty after stripping are dropped.

    Args:
        filename: Path of the text file to read.

    Returns:
        A list of stripped, non-empty strings in file order.
    """
    with open(filename, 'r', encoding='utf-8') as handle:
        raw_lines = handle.read().split('\n')
    # Strip once per line, then keep only the lines that still have content.
    stripped_lines = (line.strip() for line in raw_lines)
    return [line for line in stripped_lines if line]

In [3]:
# Load the non-blank, stripped lines of the source text file as the retrieval corpus.
paragraphs = read_split('crrao_about.txt')

In [4]:
# Show the loaded paragraphs as the cell output.
paragraphs

['The institute is named after the most celebrated statistician of our times, Padma Vibushan Prof. C. R. Rao, FRS(1920-2023). The institute stands testimony to the ideology of Prof. Rao in promoting research from grass root levels and his vision to strengthen the research community in India. His zeal and passion, aspires us in working towards building an institute of excellence, to promote research in Mathematics, Statistics and their applications to varied fields.',
 "CR Rao AIMSCS is engaged in cutting edge research in areas of Mathematics, Statistics, Computer Science and interdisciplinary fields, and provides a forum for national and international experts from different fields to meet and address problems of mutual interest. This is done through Centers of Excellence in (1) Mathematical Sciences, (2) Statistics (3) Computer Science and its allied areas such as Data Science, Cyber Security, Artificial Intelligence, Blockchain technology, IoT security, and Quantum & post-Quantum tech

In [5]:
# The context tokenizer and the context encoder are both loaded from the same
# DPR checkpoint, so the checkpoint name is defined once and reused.
model_name = "facebook/dpr-ctx_encoder-single-nq-base"
context_tokenizer = DPRContextEncoderTokenizer.from_pretrained(model_name)
context_encoder = DPRContextEncoder.from_pretrained(model_name)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokenizer'.
Some weights of the model checkpoint at facebook/dpr-ctx_encoder-single-nq-base were not used when initializing DPRContextEncoder: ['ctx_encoder.bert_model.pooler.dense.bias', 'ctx_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRContextEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRContextEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification mod

In [6]:
def encode_contexts(text_list):
    """Encode each text with the module-level context tokenizer and encoder.

    Every text is tokenized on its own (truncated to at most 256 tokens) and
    passed through `context_encoder`; the `pooler_output` of each forward pass
    is collected and concatenated along the first axis.

    The forward passes run under `torch.no_grad()`: this is inference only, so
    no autograd graph needs to be recorded for the encoder activations.

    Args:
        text_list: Iterable of strings to encode. Must yield at least one
            text, because `torch.cat` cannot concatenate an empty list.

    Returns:
        A NumPy array with one row per input text.
    """
    embeddings = []
    with torch.no_grad():
        for text in text_list:
            inputs = context_tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=256)
            embeddings.append(context_encoder(**inputs).pooler_output)
        # Concatenate inside the no-grad scope so the result never tracks gradients.
        stacked = torch.cat(embeddings)
    return stacked.numpy()

In [7]:
# Embed every paragraph of the corpus with the context encoder.
context_embeddings = encode_contexts(paragraphs)

In [8]:
# Show the paragraph embedding matrix as the cell output.
context_embeddings

array([[-6.21099100e-02,  1.58208027e-01, -1.11808985e-01, ...,
        -4.12264317e-01,  2.66007543e-01,  9.66586359e-03],
       [-3.74277122e-04,  4.04670835e-01,  3.87263000e-01, ...,
        -4.87996668e-01, -2.83834264e-02, -1.43022329e-01],
       [ 2.76853710e-01,  6.12155557e-01, -1.26120001e-01, ...,
        -4.53348905e-01,  3.39126348e-01, -1.64205596e-01]], dtype=float32)

In [9]:
# Prepare the embeddings for FAISS as a C-contiguous float32 matrix.
context_embeddings_np = np.ascontiguousarray(context_embeddings, dtype=np.float32)
# Take the dimensionality from the data instead of hard-coding 768, so the
# index size always matches whatever the encoder actually produced.
embedding_dim = context_embeddings_np.shape[1]

<img src='./images/IMG_0360.jpg' width="800">

In [10]:
# Create a flat L2 FAISS index sized to the embedding dimensionality,
# then add the paragraph embeddings to it.
index = faiss.IndexFlatL2(embedding_dim)
index.add(context_embeddings_np)

In [11]:
# Show the index object's repr as the cell output.
index

<faiss.swigfaiss_avx2.IndexFlatL2; proxy of <Swig Object of type 'faiss::IndexFlatL2 *' at 0x000001669A3C4E10> >

In [12]:
# The question tokenizer and the question encoder use the same checkpoint name.
tokenizer_model = encoder_model = 'facebook/dpr-question_encoder-single-nq-base'

In [13]:
# Load the question-side tokenizer and encoder from the checkpoint names defined above.
question_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(tokenizer_model)
question_encoder = DPRQuestionEncoder.from_pretrained(encoder_model)

Some weights of the model checkpoint at facebook/dpr-question_encoder-single-nq-base were not used when initializing DPRQuestionEncoder: ['question_encoder.bert_model.pooler.dense.bias', 'question_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRQuestionEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRQuestionEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [14]:
# Encode the query with the question encoder to obtain its embedding as a NumPy array.
question = 'research area of institute'
question_inputs = question_tokenizer(question, return_tensors='pt')
# Inference only: run the forward pass without recording an autograd graph.
with torch.no_grad():
    question_embedding = question_encoder(**question_inputs).pooler_output.numpy()

In [15]:
# Query the index for the single nearest paragraph (k=1).
# D is later printed as the distances, I is later used to index into `paragraphs`.
D, I = index.search(question_embedding, k=1) 

In [16]:
# Print the raw arrays returned by the search.
print("D",D,"\nI",I)

D [[96.837036]] 
I [[0]]


In [17]:
# Print each retrieved paragraph with its rank and its distance to the question.
for i, (idx, dist) in enumerate(zip(I[0], D[0])):
    if idx < 0:
        # FAISS fills missing neighbours with label -1 when fewer than k vectors
        # are available; without this guard, paragraphs[-1] would silently
        # print the last paragraph as if it were a real hit.
        continue
    print(f"{i+1}: {paragraphs[idx]}")
    print(f"distance {dist}")

1: The institute is named after the most celebrated statistician of our times, Padma Vibushan Prof. C. R. Rao, FRS(1920-2023). The institute stands testimony to the ideology of Prof. Rao in promoting research from grass root levels and his vision to strengthen the research community in India. His zeal and passion, aspires us in working towards building an institute of excellence, to promote research in Mathematics, Statistics and their applications to varied fields.
distance 96.8370361328125
