In [1]:
# !pip install datasets
# !pip install faiss-cpu

In [7]:
import faiss
import numpy as np
from transformers import AutoTokenizer
from app.biencoder.sentence_bert import SentenceBert, encode
from app.reranker.reranker import CrossEncoderBert, get_1st_rank

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Crate dataset

In [8]:
device = "cpu"

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = SentenceBert(device=device)
model_location = "/content/drive/MyDrive/weights/sentence_bert_biencoder"
model.bert_model = model.bert_model.from_pretrained(model_location)
model.to(device)

base = np.load("/content/drive/MyDrive/weights/faiss_base_tokens.npy")
homer_vocab = np.load("/content/drive/MyDrive/weights/faiss_base_originals.npy")
index = faiss.IndexFlatL2(base.shape[1])
index.add(base)

ce_model = CrossEncoderBert().to(device)
ce_model.bert_model.from_pretrained("/content/drive/MyDrive/weights/ce_bert")


DistilBertModel(
  (embeddings): Embeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (layer): ModuleList(
      (0-5): 6 x TransformerBlock(
        (attention): MultiHeadSelfAttention(
          (dropout): Dropout(p=0.1, inplace=False)
          (q_lin): Linear(in_features=768, out_features=768, bias=True)
          (k_lin): Linear(in_features=768, out_features=768, bias=True)
          (v_lin): Linear(in_features=768, out_features=768, bias=True)
          (out_lin): Linear(in_features=768, out_features=768, bias=True)
        )
        (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (ffn): FFN(
          (dropout): Dropout(p=0.1, inplace=False)
          (lin1): Linear(in_features=768, out_features=3072, bias=True)
          (lin2): Li

In [9]:
query = "Hello, what's your name?"
pooled_embeds = encode(query, model.bert_tokenizer, model.bert_model, device)
pooled_embeds = pooled_embeds.cpu().detach().numpy()
D, I = index.search(pooled_embeds, 10)
candidates = [homer_vocab[i] for i in I[0]]
get_1st_rank(tokenizer, ce_model, query, candidates, device=device)

'Hello? Hello? Hello taste... where are you?'

In [10]:
query = "Welcome to Springfield"
pooled_embeds = encode(query, model.bert_tokenizer, model.bert_model, device)
pooled_embeds = pooled_embeds.cpu().detach().numpy()
D, I = index.search(pooled_embeds, 10)
candidates = [homer_vocab[i] for i in I[0]]
get_1st_rank(tokenizer, ce_model, query, candidates, device=device)

'New Springfield rocks!'

In [11]:
query = "I am here"
pooled_embeds = encode(query, model.bert_tokenizer, model.bert_model, device)
pooled_embeds = pooled_embeds.cpu().detach().numpy()
D, I = index.search(pooled_embeds, 10)
candidates = [homer_vocab[i] for i in I[0]]
get_1st_rank(tokenizer, ce_model, query, candidates, device=device)

'Now it begins.'

In [12]:
query = "What did you do on a weekend?"
pooled_embeds = encode(query, model.bert_tokenizer, model.bert_model, device)
pooled_embeds = pooled_embeds.cpu().detach().numpy()
D, I = index.search(pooled_embeds, 10)
candidates = [homer_vocab[i] for i in I[0]]
get_1st_rank(tokenizer, ce_model, query, candidates, device=device)

'Did you say life story?'

In [13]:
query = "No, I didn't. Did you?"
pooled_embeds = encode(query, model.bert_tokenizer, model.bert_model, device)
pooled_embeds = pooled_embeds.cpu().detach().numpy()
D, I = index.search(pooled_embeds, 10)
candidates = [homer_vocab[i] for i in I[0]]
get_1st_rank(tokenizer, ce_model, query, candidates, device=device)

"You... didn't?"

In [14]:
query = "Yes, I did't. Why are you so surprised?"
pooled_embeds = encode(query, model.bert_tokenizer, model.bert_model, device)
pooled_embeds = pooled_embeds.cpu().detach().numpy()
D, I = index.search(pooled_embeds, 10)
candidates = [homer_vocab[i] for i in I[0]]
get_1st_rank(tokenizer, ce_model, query, candidates, device=device)

'Then, yes.'