In [102]:
import warnings
warnings.filterwarnings('ignore')

In [103]:
import torch
import numpy as np
import pandas as pd
from transformers import AutoTokenizer

In [104]:
class Encoder(torch.nn.Module):
    def __init__(self, vocab_size, embed_dim, output_embed_dim):
        super().__init__()
        self.embedding_layer = torch.nn.Embedding(vocab_size, embed_dim) # each token in the vocabulary has an embedding vector
        self.encoder = torch.nn.TransformerEncoder(
            torch.nn.TransformerEncoderLayer(embed_dim, nhead=8, batch_first=True),
            num_layers = 3,
            norm = torch.nn.LayerNorm([embed_dim]),
            enable_nested_tensor=False
        )
        self.projection = torch.nn.Linear(embed_dim, output_embed_dim)
        
    def forward(self, tokenizer_output):
        x = self.embedding_layer(tokenizer_output['input_ids'])
        x = self.encoder(x, src_key_padding_mask= tokenizer_output['attention_mask'].logical_not())
        cls_embed = x[:,0,:]
        return self.projection(cls_embed)

In [105]:
import torch.utils
import torch.utils.data


def train_loop(dataset, num_epochs = 1, verbose = False):
    embed_size = 512
    output_embed_size = 128
    max_seq_len = 64
    batch_size = 32
    
    # number of iterations
    n_iters = (len(dataset)//batch_size) + 1
    
    tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
    
    # question encoder
    question_encoder = Encoder(tokenizer.vocab_size, embed_size, output_embed_size)
    answer_encoder = Encoder(tokenizer.vocab_size, embed_size, output_embed_size)
    
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)
    optimizer = torch.optim.Adam(list(question_encoder.parameters()) + list(answer_encoder.parameters()), lr=1e-5)
    loss_fn = torch.nn.CrossEntropyLoss()
    
    for epoch in range(num_epochs):
        epoch_loss = []
        for index, batch_data in enumerate(dataloader):
            
            # since out batchsize is 32, here we will get 32 question and 32 answers.
            questions, answers = batch_data
        
            # tokenization
            question_tok = tokenizer(questions, padding= True, truncation=True, return_tensors="pt", max_length=max_seq_len)
            answer_tok = tokenizer(answers, padding= True, truncation=True, return_tensors="pt", max_length=max_seq_len)
            
            # embeddings
            question_embd = question_encoder(question_tok) # dim = 32 x 128
            answer_embd = answer_encoder(answer_tok)       # dim = 32 x 128
            
            # dot product to calculate the similarity score
            similarity_score = question_embd @ answer_embd.T
            
            if index == 0 and epoch == 0:
                print(f"question tokens : {question_tok['input_ids'].shape}, answer tokens : {answer_tok['input_ids'].shape}")
                print(f"question embedding : {question_embd.shape}, answer embedding : {answer_embd.shape}")
                print(f"similarity score shape : {similarity_score.shape}")
            
            target = torch.arange(question_embd.shape[0], dtype=torch.long)
            loss = loss_fn(similarity_score, target)
            epoch_loss += [loss.item()]
            
            if (index == n_iters - 1) and (verbose == True):
                print(f"epoch : {epoch} , loss : {np.mean(epoch_loss)}")
                
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
    
    return question_encoder, answer_encoder

In [106]:
class get_dataset(torch.utils.data.Dataset):
    def __init__(self, path):
        self.data = pd.read_csv(path, sep = '\t')
        
    def __len__(self):
        return len(self.data)
        
    def __getitem__(self, index):
        return self.data.iloc[index]['questions'], self.data.iloc[index]['answers']
        

In [107]:
dataset = get_dataset('qa_dataset.tsv')
dataset.data.head(4)

Unnamed: 0,questions,answers
0,who played bubba in the tv series in the heat ...,Carlos Alan Autry Jr. (also known for a period...
1,where did the 2017 tour de france start,"The 3,540 km (2,200 mi)-long race commenced wi..."
2,who is the chess champion of the world,Current world champion Magnus Carlsen won the ...
3,who scored the most hat tricks in football,Cristiano Ronaldo and Messi have scored three ...


In [110]:
dataset.data['questions'][0]

'who played bubba in the tv series in the heat of the night'

In [111]:
dataset.data['answers'][0]

'Carlos Alan Autry Jr. (also known for a period of time as Carlos Brown; born July 31, 1952), is an American actor, politician, and former National Football League player.'

In [108]:
len(dataset)

1000

In [109]:
question_encoder, answer_encoder = train_loop(dataset,num_epochs=10, verbose= True)

question tokens : torch.Size([32, 14]), answer tokens : torch.Size([32, 64])
question embedding : torch.Size([32, 128]), answer embedding : torch.Size([32, 128])
similarity score shape : torch.Size([32, 32])
epoch : 0 , loss : 3.736607886850834
epoch : 1 , loss : 3.5064046904444695
epoch : 2 , loss : 3.4198911264538765
epoch : 3 , loss : 3.376646675169468
epoch : 4 , loss : 3.2931443825364113
epoch : 5 , loss : 3.228568986058235
epoch : 6 , loss : 3.133317343890667
epoch : 7 , loss : 3.043817799538374
epoch : 8 , loss : 2.9490405060350895
epoch : 9 , loss : 2.799636036157608


### Testing

In [117]:
question = "Who proposed the theory of general relativity?"
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

question_tok = tokenizer(question, padding=True, truncation=True, return_tensors="pt", max_length=64)
question_embedding = question_encoder(question_tok)[0]
question_embedding

tensor([-0.4559,  0.3104,  0.1083,  0.2180,  0.6370,  0.3144,  0.0525,  0.2199,
         0.1124, -0.0224, -0.2600,  0.4079,  0.4972,  0.3436,  0.1591,  0.3813,
         0.1897,  0.0105,  0.0283,  0.4334,  0.3882,  0.1251,  0.5354, -0.0250,
        -0.0951,  0.1878,  0.4431, -0.1647,  0.2198, -0.1959,  0.0997,  0.0196,
        -0.0581, -0.1162,  0.3346, -0.1372,  0.1587,  0.2087, -0.4258,  0.1915,
         0.3737, -0.4297, -0.1160,  0.5246, -0.2358,  0.0687,  0.3521, -0.7793,
        -0.0573, -0.5430, -0.1427,  0.0986,  0.0726,  0.0713,  0.2144,  0.2821,
        -0.0862,  0.4196,  0.1255,  0.2805,  0.0521, -0.5239, -0.2706,  0.3674,
        -0.3771, -0.3281, -0.0123,  0.1145, -0.1107,  0.1231,  0.2426,  0.2001,
         0.0658, -0.2434, -0.4554, -0.5927,  0.1864,  0.0623,  0.1784,  0.0910,
         0.0186, -0.2464, -0.2155, -0.1771,  0.4431, -0.0783,  0.3244,  0.3044,
         0.2192,  0.2269, -0.1557,  0.1104, -0.3379, -0.0858,  0.2079, -0.2956,
        -0.6909, -0.3882, -0.1762, -0.03

In [185]:
question_embedding.shape

torch.Size([128])

In [157]:
question_tok

{'input_ids': tensor([[  101,  2040,  3818,  1996,  3399,  1997,  2236, 20805,  1029,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [164]:
answers =[
    "General Relativity was established in 1915 by Albert Einstein",
    "The sum of 1 + 2 is equal to 3",
    "Who proposed the theory of general relativity?"
]

ans_tokens = []
ans_embeddings = []
similary_scores = []
for answer in answers:
    answer_tok = tokenizer(answer, padding=True, truncation=True, return_tensors="pt", max_length=64)
    ans_tokens.append(answer_tok)
    answer_embedding = answer_encoder(answer_tok)[0]
    ans_embeddings.append(answer_embedding)
    similarity_score = question_embedding @ answer_embedding.T
    similary_scores.append(similarity_score)
    

In [178]:
question_embedding@answer_embedding.T

tensor(1.5183, grad_fn=<DotBackward0>)

In [166]:
similary_scores

[tensor(1.9853, grad_fn=<DotBackward0>),
 tensor(1.2378, grad_fn=<DotBackward0>),
 tensor(1.5183, grad_fn=<DotBackward0>)]

In [167]:
ans_tokens

[{'input_ids': tensor([[  101,  2236, 20805,  2001,  2511,  1999,  4936,  2011,  4789, 15313,
            102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])},
 {'input_ids': tensor([[ 101, 1996, 7680, 1997, 1015, 1009, 1016, 2003, 5020, 2000, 1017,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])},
 {'input_ids': tensor([[  101,  2040,  3818,  1996,  3399,  1997,  2236, 20805,  1029,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}]

In [172]:
for token in ans_tokens:
    print(token['input_ids'][0])

tensor([  101,  2236, 20805,  2001,  2511,  1999,  4936,  2011,  4789, 15313,
          102])
tensor([ 101, 1996, 7680, 1997, 1015, 1009, 1016, 2003, 5020, 2000, 1017,  102])
tensor([  101,  2040,  3818,  1996,  3399,  1997,  2236, 20805,  1029,   102])


In [186]:
ans_embeddings[1].shape

torch.Size([128])

In [190]:
print(f"question embedding : {question_embedding[:10]}")
print("answer embedding :")
for embedding in ans_embeddings:
    print(embedding[:10])

question embedding : tensor([-0.4559,  0.3104,  0.1083,  0.2180,  0.6370,  0.3144,  0.0525,  0.2199,
         0.1124, -0.0224], grad_fn=<SliceBackward0>)
answer embedding :
tensor([ 0.0824,  0.4870, -0.1635,  0.7219,  0.3285, -1.0932,  0.3275, -0.0585,
        -0.0921, -0.3693], grad_fn=<SliceBackward0>)
tensor([-0.5028,  0.7946, -0.7614,  0.3862,  0.6956, -1.0700,  0.5501,  0.0275,
         0.0091, -0.1320], grad_fn=<SliceBackward0>)
tensor([-0.1641,  0.3631, -0.3865,  0.2011,  0.6216, -0.5416,  0.4523,  0.1810,
         0.3832, -0.2904], grad_fn=<SliceBackward0>)
