In [1]:
!mkdir -p /scratch/sagarsj42/torch-cache
!mkdir -p /scratch/sagarsj42/transformers
!mkdir -p /scratch/sagarsj42/hf-datasets

import os
os.chdir('/scratch/sagarsj42')
os.environ['TORCH_HOME'] = '/scratch/sagarsj42/torch-cache'
os.environ['TRANSFORMERS_CACHE'] = '/scratch/sagarsj42/transformers'
os.environ['HF_DATASETS_CACHE'] = '/scratch/sagarsj42/hf-datasets'

In [267]:
import time
import string
import random
import copy

import numpy as np
import datasets

import torch
from torch import nn, optim
from torch.utils.data import Dataset

In [3]:
# !scp sagarsj42@ada:/share1/sagarsj42/WikiQACorpus.zip .
# !unzip -o WikiQACorpus.zip

WikiQACorpus.zip                              100% 6928KB   6.8MB/s   00:00    
Archive:  WikiQACorpus.zip
  inflating: WikiQACorpus/emnlp-table/WikiQA.CNN.dev.rank  
  inflating: WikiQACorpus/emnlp-table/WikiQA.CNN.test.rank  
  inflating: WikiQACorpus/emnlp-table/WikiQA.CNN-Cnt.dev.rank  
  inflating: WikiQACorpus/emnlp-table/WikiQA.CNN-Cnt.test.rank  
  inflating: WikiQACorpus/eval.py    
  inflating: WikiQACorpus/Guidelines_Phase1.pdf  
  inflating: WikiQACorpus/Guidelines_Phase2.pdf  
  inflating: WikiQACorpus/WikiQA.tsv  
  inflating: WikiQACorpus/WikiQA-dev.ref  
  inflating: WikiQACorpus/WikiQA-dev.tsv  
  inflating: WikiQACorpus/WikiQA-dev.txt  
  inflating: WikiQACorpus/WikiQA-dev-filtered.ref  
  inflating: WikiQACorpus/WikiQASent.pos.ans.tsv  
  inflating: WikiQACorpus/WikiQA-test.ref  
  inflating: WikiQACorpus/WikiQA-test.tsv  
  inflating: WikiQACorpus/WikiQA-test.txt  
  inflating: WikiQACorpus/WikiQA-test-filtered.ref  
  inflating: WikiQACorpus/WikiQA-train.ref  
  in

In [4]:
# !wget https://nlp.stanford.edu/data/glove.6B.zip
# !unzip -o glove.6B.zip

--2021-11-29 12:30:45--  https://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2021-11-29 12:30:46--  http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... ^C
Archive:  glove.6B.zip
  inflating: glove.6B.50d.txt        
  inflating: glove.6B.100d.txt       
  inflating: glove.6B.200d.txt       
  inflating: glove.6B.300d.txt       


In [3]:
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
DEVICE

device(type='cuda')

In [4]:
GLOVE_FILE = 'glove.6B.300d.txt'

In [83]:
# Copied from explore_wikiqa.ipynb

def get_valid_questions(wikiqa):
    question_status = dict()

    for split in wikiqa:
        split_dataset = wikiqa[split]
        n_samples = len(split_dataset)

        for i in range(n_samples):
            qid = split_dataset[i]['question_id']
            label = split_dataset[i]['label']
            if qid not in question_status:
                question_status[qid] = label
            else:
                question_status[qid] = max(question_status[qid], label)

    valid_questions = set([qid for qid in question_status if question_status[qid] > 0])
    
    return valid_questions


def load_glove(filename):
    glove = dict()

    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            line_content = line.split()
            word = line_content[0].strip()
            vec = np.array(line_content[1:], dtype='float32')
            glove[word] = vec
            
    return glove


def get_tokens(sample):
    question = sample['question'].translate(str.maketrans('', '', string.punctuation))
    question = question.lower().split()
    
    answer = sample['answer'].translate(str.maketrans('', '', string.punctuation))
    answer = answer.lower().split()
    
    return question, answer


def get_embeddings(q_a_tokens, glove):
    embed_size = len(list(glove.values())[0])
    q_vecs = [glove[q_word] if q_word in glove else np.zeros(embed_size) for q_word in q_a_tokens[0]]
    a_vecs = [glove[a_word] if a_word in glove else np.zeros(embed_size) for a_word in q_a_tokens[1]]
    
    return q_vecs, a_vecs

In [84]:
wikiqa = datasets.load_dataset('wiki_qa')
valid_questions = get_valid_questions(wikiqa)
wikiqa_f = wikiqa.filter(lambda sample: sample['question_id'] in valid_questions)

wikiqa_f

Using custom data configuration default
Reusing dataset wiki_qa (/scratch/sagarsj42/hf-datasets/wiki_qa/default/0.1.0/d2d236b5cbdc6fbdab45d168b4d678a002e06ddea3525733a24558150585951c)
Loading cached processed dataset at /scratch/sagarsj42/hf-datasets/wiki_qa/default/0.1.0/d2d236b5cbdc6fbdab45d168b4d678a002e06ddea3525733a24558150585951c/cache-dfbc6880319836b4.arrow
Loading cached processed dataset at /scratch/sagarsj42/hf-datasets/wiki_qa/default/0.1.0/d2d236b5cbdc6fbdab45d168b4d678a002e06ddea3525733a24558150585951c/cache-f1e19b344b89867e.arrow
Loading cached processed dataset at /scratch/sagarsj42/hf-datasets/wiki_qa/default/0.1.0/d2d236b5cbdc6fbdab45d168b4d678a002e06ddea3525733a24558150585951c/cache-5aef7c39e6a2d38e.arrow


DatasetDict({
    test: Dataset({
        features: ['question_id', 'question', 'document_title', 'answer', 'label'],
        num_rows: 2351
    })
    validation: Dataset({
        features: ['question_id', 'question', 'document_title', 'answer', 'label'],
        num_rows: 1130
    })
    train: Dataset({
        features: ['question_id', 'question', 'document_title', 'answer', 'label'],
        num_rows: 8672
    })
})

In [85]:
glove = load_glove(GLOVE_FILE)

len(glove)

400000

In [234]:
class WikiqaDataset(Dataset):
    def __init__(self, wikiqa, glove):
        super(WikiqaDataset, self).__init__()
        self.wikiqa = wikiqa
        self.glove = glove
        
    def __len__(self):
        return len(self.wikiqa)
    
    def __getitem__(self, idx):
        sample = self.wikiqa[idx]
        question, sentence = get_embeddings(get_tokens(sample), self.glove)
        label = torch.tensor([sample['label']], dtype=torch.long)
        
        question = torch.cat([torch.Tensor(q_word).view(1, -1) for q_word in question], dim=0)
        sentence = torch.cat([torch.Tensor(s_word).view(1, -1) for s_word in sentence], dim=0)
        
        return question, sentence, label

In [297]:
class AttPoolLSTM(nn.Module):
    def __init__(self, embed_dim, hidden_dim, bidirectional=True):
        super(AttPoolLSTM, self).__init__()
        self.embed_dim = embed_dim
        self.hidden_dim = hidden_dim
        
        self.projection = nn.Linear(self.embed_dim, self.embed_dim)
        
        self.lstm = nn.LSTM(input_size=self.embed_dim, hidden_size=self.hidden_dim, num_layers=1, 
                            batch_first=True, bidirectional=bidirectional)
        
        self.dual_att_projection = nn.Parameter(torch.rand(2*self.hidden_dim, 2*self.hidden_dim))
        
        self.softmax = nn.Softmax(dim=-1)
        
        self.cosine = nn.CosineSimilarity(dim=-1)
        
        
    def forward(self, question, sentence):
        if len(question.shape) == 2:
            question = question.unsqueeze(dim=0)
            sentence = sentence.unsqueeze(dim=0)
        
        question = self.projection(question)
        question, _ = self.lstm(question)
        sentence = self.projection(sentence)
        sentence, _ = self.lstm(sentence)
        
        n = sentence.shape[0]
        l = sentence.shape[1]
        c = sentence.shape[2]
        qs_alignment = torch.matmul(torch.matmul(question, self.dual_att_projection), sentence.view(n, c, l))
        
        q_pool = torch.max(qs_alignment, dim=2)[0]
        q_pool = self.softmax(q_pool)
        s_pool = torch.max(qs_alignment, dim=1)[0]
        s_pool = self.softmax(s_pool)
        
        q_rep = torch.matmul(question.transpose(1, 2), q_pool.transpose(0, 1)).transpose(1, 2)
        s_rep = torch.matmul(sentence.transpose(1, 2), s_pool.transpose(0, 1)).transpose(1, 2)
        
        match_score = self.cosine(q_rep, s_rep)
        
        return match_score
    

model = AttPoolLSTM(300, 300)
list(model.named_parameters())

[('dual_att_projection',
  Parameter containing:
  tensor([[0.3619, 0.1466, 0.2737,  ..., 0.7473, 0.2578, 0.8925],
          [0.1477, 0.9535, 0.9919,  ..., 0.1017, 0.9293, 0.1048],
          [0.8272, 0.3024, 0.3761,  ..., 0.5615, 0.9536, 0.4489],
          ...,
          [0.5078, 0.9481, 0.3531,  ..., 0.4432, 0.2870, 0.0515],
          [0.9998, 0.7741, 0.5814,  ..., 0.8803, 0.3475, 0.0534],
          [0.6585, 0.4045, 0.5212,  ..., 0.8566, 0.2748, 0.7356]],
         requires_grad=True)),
 ('projection.weight',
  Parameter containing:
  tensor([[ 0.0235,  0.0055, -0.0002,  ...,  0.0337, -0.0019, -0.0013],
          [ 0.0347, -0.0337,  0.0566,  ..., -0.0352, -0.0195,  0.0127],
          [ 0.0450,  0.0121, -0.0424,  ...,  0.0342, -0.0419,  0.0478],
          ...,
          [-0.0345,  0.0021,  0.0122,  ...,  0.0164, -0.0144, -0.0035],
          [-0.0457,  0.0473, -0.0239,  ..., -0.0410, -0.0219, -0.0072],
          [-0.0238, -0.0521,  0.0518,  ..., -0.0030,  0.0164, -0.0180]],
         requ

In [298]:
question, sentence = get_embeddings(get_tokens(wikiqa_f['train'][100]), glove)
question = torch.cat([torch.Tensor(q_word).view(1, -1) for q_word in question], dim=0).unsqueeze(0)
sentence = torch.cat([torch.Tensor(s_word).view(1, -1) for s_word in sentence], dim=0).unsqueeze(0)

question.shape, sentence.shape

(torch.Size([1, 6, 300]), torch.Size([1, 35, 300]))

In [299]:
match_score = model(question, sentence)
match_score

tensor([[0.7626]], grad_fn=<DivBackward0>)

In [300]:
model = AttPoolLSTM(embed_dim=300, hidden_dim=400)
model.to(DEVICE)

AttPoolLSTM(
  (projection): Linear(in_features=300, out_features=300, bias=True)
  (lstm): LSTM(300, 400, batch_first=True, bidirectional=True)
  (softmax): Softmax(dim=-1)
  (cosine): CosineSimilarity()
)

In [301]:
train_dataset = WikiqaDataset(wikiqa_f['train'], glove)
dev_dataset = WikiqaDataset(wikiqa_f['validation'], glove)
test_dataset = WikiqaDataset(wikiqa_f['test'], glove)

len(train_dataset), len(dev_dataset), len(test_dataset)

(8672, 1130, 2351)

In [302]:
batch_size = 4
n_epochs = 20

criterion = nn.MarginRankingLoss(margin=0.1)
criterion

MarginRankingLoss()

In [303]:
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.6, weight_decay=0.001, dampening=0.0, nesterov=True)
optimizer

SGD (
Parameter Group 0
    dampening: 0.0
    lr: 0.001
    momentum: 0.6
    nesterov: True
    weight_decay: 0.001
)

In [325]:
def find_best_candidate(dataset, indices, model, select_from=20, device='cpu'):
    candidates = random.sample(indices, select_from)
    candidate_scores = list()
    for c_i in candidates:
        c_q, c_s, c_l = dataset[c_i]
        c_q = c_q.to(device)
        c_s = c_s.to(device)
        
        if c_l == 0:
            cand_match_score = model(c_q, c_s)
            candidate_scores.append(cand_match_score.item())
    if len(candidate_scores) > 0:
        neg_sample_idx = candidates[np.argmax(np.array(candidate_scores))]
    else:
        neg_sample_idx = 0
        
    neg_sample = dataset[neg_sample_idx]
    n_question, n_sentence, _ = neg_sample
    
    return n_question.to(device), n_sentence.to(device)

In [326]:
def train_epoch(dataset, model, optimizer, criterion, batch_size, device='cpu'):
    model.train()
    indices = list(range(len(dataset)))
    random.shuffle(indices)
    
    total_loss = 0.0
    batch_loss = 0.0
    batch_count = 0
    step_count = 0
    
    for i in indices:
        question, sentence, label = dataset[i]
        if label == 1:
            question = question.to(device)
            sentence = sentence.to(device)
            label = label.to(device)
            
            optimizer.zero_grad()
            pos_match_score = model(question, sentence)
            n_question, n_sentence = find_best_candidate(dataset, indices, model, 
                                                         select_from=20, device=device)
            neg_match_score = model(n_question, n_sentence)
            loss = criterion(pos_match_score, neg_match_score, label)
            batch_loss += loss
            batch_count += 1
            
            if batch_count % batch_size == 0:
                loss.backward()
                optimizer.step()
                batch_loss = 0.0
                batch_count = 0

            total_loss += loss.item()
            step_count += 1
            
            if step_count % 100 == 0:
                print(step_count, 'steps done')
            
    return total_loss / step_count

In [329]:
def evaluate(dataset, model, device='cpu'):
    model.eval()
    total_loss = 0.0
    step_count = 0
    indices = list(range(len(dataset)))
    
    for i in indices:
        question, sentence, label = dataset[i]
        if label == 1:
            question = question.to(device)
            sentence = sentence.to(device)
            label = label.to(device)
            
            optimizer.zero_grad()
            pos_match_score = model(question, sentence)
            n_question, n_sentence = find_best_candidate(dataset, indices, model, 
                                                         select_from=20, device=device)
            neg_match_score = model(n_question, n_sentence)
            loss = criterion(pos_match_score, neg_match_score, label)
            
            total_loss += loss.item()
            step_count += 1
    
    return total_loss / step_count

In [330]:
best_loss = float('inf')
best_model = None

for epoch in range(n_epochs):
    start = time.time()
    
    train_loss = train_epoch(train_dataset, model, optimizer, criterion, batch_size, device=DEVICE)
    dev_loss = evaluate(dev_dataset, model, device=DEVICE)
    
    if dev_loss < best_loss:
        best_loss = dev_loss
        best_model = copy.deepcopy(model)
    
    print(f'Epoch {epoch} complete.\nTrain loss:{train_loss:.4f}, Dev loss:{dev_loss:.4f}')

torch.save(best_model.state_dict(), 'best_attpool_lstm.pth')
print('Training complete.\nThe params of best model saved.')

Epoch 0 complete.
Train loss:0.1319, Dev loss:0.1394
Epoch 1 complete.
Train loss:0.1319, Dev loss:0.1551
Epoch 2 complete.
Train loss:0.1319, Dev loss:0.1422
Epoch 3 complete.
Train loss:0.1319, Dev loss:0.1567
Epoch 4 complete.
Train loss:0.1319, Dev loss:0.1416
Epoch 5 complete.
Train loss:0.1319, Dev loss:0.1421
Epoch 6 complete.
Train loss:0.1319, Dev loss:0.1519
Epoch 7 complete.
Train loss:0.1319, Dev loss:0.1491
Epoch 8 complete.
Train loss:0.1319, Dev loss:0.1491
Epoch 9 complete.
Train loss:0.1319, Dev loss:0.1482
Epoch 10 complete.
Train loss:0.1319, Dev loss:0.1402
Epoch 11 complete.
Train loss:0.1319, Dev loss:0.1492
Epoch 12 complete.
Train loss:0.1319, Dev loss:0.1459
Epoch 13 complete.
Train loss:0.1319, Dev loss:0.1502
Epoch 14 complete.
Train loss:0.1319, Dev loss:0.1503
Epoch 15 complete.
Train loss:0.1319, Dev loss:0.1413
Epoch 16 complete.
Train loss:0.1319, Dev loss:0.1566
Epoch 17 complete.
Train loss:0.1319, Dev loss:0.1533
Epoch 18 complete.
Train loss:0.1319, 

In [None]:
model = AttPoolLSTM(embed_dim=300, hidden_dim=400)
model.load_state_dict(torch.load('best_attpool_lstm.pth'))

test_loss = evaluate(test_dataset, model, device=DEVICE)
print(f'Test loss:{test_loss:.4f}')

In [351]:
def convert_to_questionwise_dataset(dataset):
    q_dataset = dict()
    
    for sample in dataset:
        qid = sample['question_id']
        question = sample['question']
        sentence = sample['answer']
        label = sample['label']
        
        if qid in q_dataset:
            q_dataset[qid][1].append((sentence, label))
        else:
            q_dataset[qid] = (question, [(sentence, label)])
            
    return q_dataset


def get_scores_for_sample(sample, model, glove, device='cpu'):
    question = sample[0]
    question = question.translate(str.maketrans('', '', string.punctuation))
    question = question.lower().split()
    
    embed_size = len(list(glove.values())[0])
    question = [glove[q_word] if q_word in glove else np.zeros(embed_size) for q_word in question]
    question = torch.cat([torch.Tensor(q_word).view(1, -1) for q_word in question], dim=0)
    question = question.to(device)
        
    scores = list()
    for sentence, label in sample:
        sentence = sentence.translate(str.maketrans('', '', string.punctuation))
        sentence = sentence.lower().split()
        s_vecs = [glove[s_word] if s_word in glove else np.zeros(embed_size) for s_word in sentence]
        
        sentence = sentence.to(device)
        score = model(question, sentence)
        
        scores.append((score, label))
        
    scores.sort(key=lambda s: s[0], reverse=True)
        
    return scores

In [352]:
qwise_dataset = convert_to_questionwise_dataset(wikiqa_f['train'])
len(qwise_dataset)

873

In [354]:
scores = get_scores_for_sample(list(qwise_dataset.values())[0], glove, best_model)
scores

TypeError: tuple indices must be integers or slices, not str