In [3]:
from time import time
from datetime import timedelta
from copy import deepcopy

import random
import numpy as np
import pandas as pd
from ml_metrics import mapk

import torch
from torch.optim import AdamW
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast, BertForMultipleChoice

# Random seed: seed Python's `random`, NumPy and PyTorch with the same value.
# `random` is used for negative sampling in preprocessing; PyTorch for the
# shuffling DataLoader and model initialisation.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# CUDA device: make GPU `use_cuda_device` the current device, so that the
# argument-less `.cuda()` calls in later cells place tensors/models on it.
# NOTE(review): the notebook assumes a CUDA-capable machine; there is no CPU fallback.
use_cuda_device = 0
torch.cuda.set_device(use_cuda_device)
print("Using CUDA device: %d" % torch.cuda.current_device())

In [4]:
# Input files (paths are relative to the notebook's working directory)
document_csv_path = './data/documents.csv'
training_csv_path = './data/train_queries.csv'
testing_csv_path = './data/test_queries.csv'

# Input limitation
max_query_length = 64  # max. query tokens kept (counted before [CLS]/[SEP] are added)
max_input_length = 512  # max. length of one query+document token sequence; longer inputs are cut off
num_negatives = 3    # num. of negative documents to pair with each positive document

# Model finetuning
model_name_or_path = "bert-base-uncased"
max_epochs = 1
learning_rate = 3e-5  # learning rate passed to AdamW
dev_set_ratio = 0.2   # fraction of the training queries held out as development set (dev loss + rescoring-weight search)
max_patience = 0      # early-stop once the dev loss has failed to improve for more than this many epochs
batch_size = 2        # num. of inputs = 8 requires ~9200 MB VRAM (num. of inputs = batch_size * (num_negatives + 1))
num_workers = 2       # num. of worker processes for the PyTorch DataLoader

# Save paths
save_model_path = "models/bert_base_uncased"  # assign `None` to skip saving the tokenizer and model
save_submission_path = "bm25_bert_rescoring.csv"
K = 1000   # cut-off for MAP@K and for the number of ranked documents kept per query

In [5]:
# Load the pretrained tokenizer; when saving is enabled, store a copy under the model directory
tokenizer = BertTokenizerFast.from_pretrained(model_name_or_path)
if save_model_path is not None:
    save_tokenizer_path = "%s/tokenizer" % (save_model_path)
    tokenizer.save_pretrained(save_tokenizer_path)

# Read all documents and index their text by document id
doc_df = pd.read_csv(document_csv_path)
doc_df.fillna("<Empty Document>", inplace=True)  # missing cells get a placeholder string
num_docs = len(doc_df)
doc_id_to_text = {}
for i, (doc_id, doc_text) in enumerate(zip(doc_df["doc_id"], doc_df["doc_text"]), start=1):
    doc_id_to_text[doc_id] = doc_text
    print("Progress: %d/%d\r" % (i, num_docs), end='')

doc_df.tail()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…


Progress: 100000/100000

Unnamed: 0,doc_id,doc_text
99995,LA123190-0105,CLERKS AT 13 STORES ARRESTED AFTER MINORS BUY ...
99996,LA123190-0108,LOOKING TO 1991; \n THE NEW YEAR PROMISES TRE...
99997,LA123190-0117,"LOCAL; \n GIRL, 14, DIES IN DRIVE-BY INCIDENT..."
99998,LA123190-0119,"GREECE, ISRAEL HIT BY EXODUS FROM ALBANIA \n ..."
99999,LA123190-0124,<Empty Document>


In [6]:
# Hold out the first `dev_set_ratio` of the training queries as development set
# and keep the remainder for training.
# Positional slicing replaces `np.split(DataFrame, ...)`, which goes through
# `DataFrame.swapaxes` (deprecated in pandas 2.x, emits a FutureWarning).
all_train_df = pd.read_csv(training_csv_path)
num_dev_queries = int(dev_set_ratio * len(all_train_df))
dev_df = all_train_df.iloc[:num_dev_queries].reset_index(drop=True)
train_df = all_train_df.iloc[num_dev_queries:].reset_index(drop=True)

print("train_df shape:", train_df.shape)
print("dev_df shape:", dev_df.shape)
train_df.tail()

train_df shape: (96, 5)
dev_df shape: (24, 5)


Unnamed: 0,query_id,query_text,pos_doc_ids,bm25_top1000,bm25_top1000_scores
91,641,Valdez wildlife marine life,FT911-1460 FT931-15213 FT931-16010 FT933-7162 ...,LA120989-0014 LA032390-0003 LA040889-0009 LA03...,34.16304495 32.97577181 31.31040724 30.8527172...
92,642,Tiananmen Square protesters,FBIS3-1941 FBIS3-2223 FBIS3-2224 FBIS3-26281 F...,FT922-10319 FT931-8730 FT942-5501 FBIS4-24379 ...,32.38429409 30.71831856 29.63771818 29.4676000...
93,648,family leave law,FBIS3-43072 FBIS3-61562 FBIS4-25261 FR940323-0...,FR941202-0-00181 FR941202-0-00176 FR941202-0-0...,24.51293307 23.98772391 23.42756181 23.0616218...
94,649,computer viruses,FBIS3-40468 FBIS3-42979 FBIS3-43017 FBIS4-5044...,FT944-9024 FBIS4-50440 FT921-5724 FT941-13624 ...,27.84369436 27.24267123 26.98326939 26.9108106...
95,650,tax evasion indicted,LA011689-0065 LA012589-0008 LA012889-0016 LA02...,LA040889-0060 LA053189-0041 LA092590-0146 LA06...,29.72207523 27.98961258 27.73561512 27.3372072...


In [7]:
%%time
doc_id_to_token_ids = {}   # cache shared across calls: doc_id -> token ids of the document text only (no special tokens)
def preprocess_df(df):
    ''' Preprocess DataFrame into training instances for BERT.

    For every relevant ("positive") document of every query, `num_negatives`
    non-relevant documents are drawn from the query's BM25 candidates and the
    positive is inserted among them at a random position; that position is
    the multiple-choice label.

    Reads the module-level `tokenizer`, `doc_id_to_text`, `num_negatives`,
    `max_query_length` and `max_input_length`, and fills the module-level
    `doc_id_to_token_ids` cache.

    Args:
        df: DataFrame with the columns `query_text`, `pos_doc_ids` and
            `bm25_top1000` (the latter two hold space-separated document ids).

    Returns:
        A list with one dict per (query, positive document) pair. The keys
        `input_ids`, `attention_mask` and `token_type_ids` map to tensors of
        shape (seq_len, num_negatives + 1) -- i.e. already transposed -- and
        `label` maps to a 0-dim LongTensor with the index of the positive.

    Raises:
        ValueError: from `random.sample` when a query has fewer than
            `num_negatives` non-relevant BM25 candidates.
        KeyError: when a document id is missing from `doc_id_to_text`.
    '''
    instances = []
    sep_token_id = tokenizer.sep_token_id

    # Parse CSV
    rows = zip(df["query_text"], df["pos_doc_ids"], df["bm25_top1000"])
    for row_num, (query_text, pos_doc_ids, bm25_top1000) in enumerate(rows, start=1):
        pos_doc_id_list = pos_doc_ids.split()
        pos_doc_id_set = set(pos_doc_id_list)

        # Candidate negatives: BM25 hits that are not labelled relevant.
        # Kept as a de-duplicated list in BM25 order (not a set) because
        # `random.sample` no longer accepts sets (deprecated in 3.9, TypeError
        # from 3.11) and because set iteration order of strings varies between
        # interpreter runs, which would defeat the fixed random seed.
        neg_doc_id_pool = [doc_id for doc_id in dict.fromkeys(bm25_top1000.split())
                           if doc_id not in pos_doc_id_set]

        # Pair BM25 neg. with pos. samples
        labeled_pos_neg_list = []
        for pos_doc_id in pos_doc_id_list:
            pos_neg_doc_ids = random.sample(neg_doc_id_pool, num_negatives)  # fresh list each time
            pos_position = random.randint(0, num_negatives)
            pos_neg_doc_ids.insert(pos_position, pos_doc_id)
            labeled_pos_neg_list.append((pos_neg_doc_ids, pos_position))

        # Make query tokens for BERT: [CLS] query [SEP]
        query_tokens = tokenizer.tokenize(query_text)[:max_query_length]  # truncation
        query_token_ids = tokenizer.convert_tokens_to_ids(query_tokens)
        query_token_ids.insert(0, tokenizer.cls_token_id)
        query_token_ids.append(sep_token_id)

        # Make input instances for all query/doc pairs
        for doc_ids, label in labeled_pos_neg_list:
            paired_input_ids = []
            paired_attention_mask = []
            paired_token_type_ids = []

            # Merge all pos/neg inputs as a single sample
            for doc_id in doc_ids:
                doc_token_ids = doc_id_to_token_ids.get(doc_id)
                if doc_token_ids is None:
                    doc_tokens = tokenizer.tokenize(doc_id_to_text[doc_id])
                    doc_token_ids = tokenizer.convert_tokens_to_ids(doc_tokens)
                    doc_id_to_token_ids[doc_id] = doc_token_ids

                # make input sequences for BERT
                # The trailing [SEP] goes onto a NEW list: appending to the
                # cached list would add one more [SEP] every time the same
                # document is reused.
                input_ids = query_token_ids + doc_token_ids + [sep_token_id]
                token_type_ids = [0] * len(query_token_ids) + [1] * (len(doc_token_ids) + 1)
                # NOTE(review): truncation also drops the trailing [SEP] of
                # over-long inputs; kept as-is to match the prediction code.
                if len(input_ids) > max_input_length:  # truncation
                    input_ids = input_ids[:max_input_length]
                    token_type_ids = token_type_ids[:max_input_length]
                attention_mask = [1] * len(input_ids)

                # convert and collect inputs as tensors
                paired_input_ids.append(torch.tensor(input_ids, dtype=torch.long))
                paired_attention_mask.append(torch.tensor(attention_mask, dtype=torch.float))
                paired_token_type_ids.append(torch.tensor(token_type_ids, dtype=torch.long))

            # Pre-pad tensor pairs for efficiency: (num_choices, longest_seq_len)
            paired_input_ids = pad_sequence(paired_input_ids, batch_first=True)
            paired_attention_mask = pad_sequence(paired_attention_mask, batch_first=True)
            paired_token_type_ids = pad_sequence(paired_token_type_ids, batch_first=True)

            # collect all inputs as a dictionary
            # Stored transposed (seq_len first) so that the collate function
            # can pad along the sequence axis with `pad_sequence` again.
            instances.append({
                'input_ids': paired_input_ids.T,
                'attention_mask': paired_attention_mask.T,
                'token_type_ids': paired_token_type_ids.T,
                'label': torch.tensor(label, dtype=torch.long),
            })

        print("Progress: %d/%d\r" % (row_num, len(df)), end='')
    print()
    return instances

# Build the multiple-choice instances for both splits. Negative sampling
# happens inside `preprocess_df`, so the result depends on the random state.
train_instances = preprocess_df(train_df)
dev_instances = preprocess_df(dev_df)

# Sanity check: instance counts and the shape of one instance. Instances are
# stored transposed, hence `.T` to view it as (num_choices, seq_len).
print("num. train_instances: %d" % len(train_instances))
print("num. dev_instances: %d" % len(dev_instances))
print("input_ids.T shape:", train_instances[0]['input_ids'].T.shape)
train_instances[0]['input_ids'].T

Token indices sequence length is longer than the specified maximum sequence length for this model (8849 > 512). Running this sequence through the model will result in indexing errors


Progress: 96/96
Progress: 24/24
num. train_instances: 7679
num. dev_instances: 1677
input_ids.T shape: torch.Size([4, 512])
Wall time: 1min 38s


tensor([[  101,  3199,  3036,  ...,  1012,  1019,  2213],
        [  101,  3199,  3036,  ...,  2000,  3477,  2051],
        [  101,  3199,  3036,  ...,     0,     0,     0],
        [  101,  3199,  3036,  ..., 24534,  9556,  1012]])

In [None]:
class TrainingDataset(Dataset):
    ''' Map-style dataset over a list of pre-built training instances.

    Every instance is a dict with the keys `input_ids`, `attention_mask`,
    `token_type_ids` and `label`; indexing returns those four values as a
    tuple, in that order.
    '''
    _FIELDS = ('input_ids', 'attention_mask', 'token_type_ids', 'label')

    def __init__(self, instances):
        self.instances = instances

    def __len__(self):
        return len(self.instances)

    def __getitem__(self, i):
        record = self.instances[i]
        return tuple(record[field] for field in self._FIELDS)
    
def get_train_dataloader(instances, batch_size=2, num_workers=4):
    ''' Wrap training instances in a shuffling DataLoader.

    Each batch is a tuple `(input_ids, attention_mask, token_type_ids, labels)`.
    The three input tensors are zero-padded to the longest item in the batch
    and have their last two axes swapped back after padding; `labels` is the
    stack of the per-instance labels.
    '''
    def _pad_and_restore(tensors):
        # pad along the leading axis of each item, then re-transpose
        padded = pad_sequence(tensors, batch_first=True)
        return padded.transpose(1, 2).contiguous()

    def collate_fn(batch):
        input_ids, attention_mask, token_type_ids, labels = zip(*batch)
        return (
            _pad_and_restore(input_ids),
            _pad_and_restore(attention_mask),
            _pad_and_restore(token_type_ids),
            torch.stack(labels),
        )

    return DataLoader(
        TrainingDataset(instances),
        batch_size=batch_size,
        shuffle=True,
        num_workers=num_workers,
        collate_fn=collate_fn,
    )

# Demo: draw a single batch from a training dataloader and inspect its input ids
dataloader = get_train_dataloader(train_instances)
batch = next(iter(dataloader))
input_ids, attention_mask, token_type_ids, labels = batch

print(input_ids.shape)
input_ids

In [None]:
# Pretrained BERT with a multiple-choice head, moved to the current CUDA device
model = BertForMultipleChoice.from_pretrained(model_name_or_path).cuda()

# AdamW over all model parameters; start from cleared gradients
optimizer = AdamW(params=model.parameters(), lr=learning_rate)
optimizer.zero_grad()

In [None]:
def validate(model, instances):
    ''' Compute the average cross-entropy loss of `model` over `instances`.

    Puts the model in eval mode (the caller is responsible for switching back
    to train mode) and runs without gradient tracking. Batches come from
    `get_train_dataloader` with the module-level `batch_size` and
    `num_workers`; that loader shuffles, which does not matter for a sum.

    Args:
        model: model whose call returns the loss as first output when
            `labels` is passed (here: BertForMultipleChoice on the GPU).
        instances: list of instances as produced by `preprocess_df`.

    Returns:
        float: mean loss per instance.
    '''
    model.eval()
    dataloader = get_train_dataloader(instances, batch_size=batch_size, num_workers=num_workers)

    total_loss = 0.0
    with torch.no_grad():
        for batch in dataloader:
            input_ids, attention_mask, token_type_ids, labels = (tensor.cuda() for tensor in batch)
            outputs = model(input_ids=input_ids,
                            token_type_ids=token_type_ids,
                            attention_mask=attention_mask,
                            labels=labels)
            loss = outputs[0]  # built-in loss, averaged over the batch

            # Undo the per-batch averaging so every instance carries the same
            # weight (the last batch may be smaller), and accumulate a Python
            # float instead of a CUDA tensor.
            total_loss += loss.item() * labels.size(0)

    return total_loss / len(instances)

In [None]:
patience, best_dev_loss = 0, float('inf')
# Snapshot of the starting weights. `state_dict()` returns tensors that share
# storage with the live parameters, so without the copy the "best" state would
# silently follow training and restoring it would be a no-op when no epoch
# ever improves (e.g. the dev loss is NaN).
best_state_dict = deepcopy(model.state_dict())

start_time = time()
dataloader = get_train_dataloader(train_instances, batch_size=batch_size, num_workers=num_workers)
for epoch in range(1, max_epochs+1):
    model.train()  # `validate` leaves the model in eval mode
    for i, batch in enumerate(dataloader, start=1):
        input_ids, attention_mask, token_type_ids, labels = (tensor.cuda() for tensor in batch)

        # Backpropagation: forward pass with labels returns the built-in
        # cross-entropy loss as first output.
        outputs = model(input_ids=input_ids,
                        token_type_ids=token_type_ids,
                        attention_mask=attention_mask,
                        labels=labels)
        loss = outputs[0]
        loss.backward()        # compute gradients
        optimizer.step()       # update parameters
        optimizer.zero_grad()  # clear gradients for the next batch

        # Progress bar with timer ;-)
        elapsed_time = time() - start_time
        elapsed_time = timedelta(seconds=int(elapsed_time))
        print("Epoch: %d/%d | Batch: %d/%d | loss=%.5f | %s      \r" \
              % (epoch, max_epochs, i, len(dataloader), loss.item(), elapsed_time), end='')

    # Save parameters of each epoch
    if save_model_path is not None:
        save_checkpoint_path = "%s/epoch_%d" % (save_model_path, epoch)
        model.save_pretrained(save_checkpoint_path)

    # Get avg. loss on development set
    print("Epoch: %d/%d | Validating...                           \r" % (epoch, max_epochs), end='')
    dev_loss = float(validate(model, dev_instances))  # plain float for comparison and logging
    elapsed_time = time() - start_time
    elapsed_time = timedelta(seconds=int(elapsed_time))
    print("Epoch: %d/%d | dev_loss=%.5f | %s                      " \
          % (epoch, max_epochs, dev_loss, elapsed_time))

    # Track best checkpoint and earlystop patience
    if dev_loss < best_dev_loss:
        patience = 0
        best_dev_loss = dev_loss
        best_state_dict = deepcopy(model.state_dict())
        if save_model_path is not None:
            model.save_pretrained(save_model_path)
    else:
        patience += 1

    if patience > max_patience:
        print('Earlystop at epoch %d' % epoch)
        break

# Restore parameters with best loss on development set
model.load_state_dict(best_state_dict)

# Test

In [None]:
class TestingDataset(Dataset):
    ''' Map-style dataset over unlabeled query/document instances.

    Every instance is a dict with the keys `input_ids`, `attention_mask` and
    `token_type_ids`. The values may be Python lists or tensors of any
    numeric dtype; indexing returns them as (LongTensor, FloatTensor,
    LongTensor).
    '''
    def __init__(self, instances):
        self.instances = instances

    def __len__(self):
        return len(self.instances)

    def __getitem__(self, i):
        instance = self.instances[i]
        # `torch.as_tensor` converts lists and re-types tensors as needed,
        # whereas the legacy `torch.LongTensor(x)` / `torch.FloatTensor(x)`
        # constructors raise a TypeError for a tensor of another dtype.
        input_ids = torch.as_tensor(instance['input_ids'], dtype=torch.long)
        attention_mask = torch.as_tensor(instance['attention_mask'], dtype=torch.float)
        token_type_ids = torch.as_tensor(instance['token_type_ids'], dtype=torch.long)
        return input_ids, attention_mask, token_type_ids
    
def get_test_dataloader(instances, batch_size=8, num_workers=4):
    ''' Wrap unlabeled instances in an order-preserving DataLoader.

    Each batch is a tuple `(input_ids, attention_mask, token_type_ids)`; every
    tensor is zero-padded to the longest item of the batch and gets an extra
    axis of size 1 at position 1, so that each input is scored as a
    single-choice sample.
    '''
    def collate_fn(batch):
        # one padded tensor per field, in the order the dataset yields them
        padded_fields = [pad_sequence(field, batch_first=True) for field in zip(*batch)]
        input_ids, attention_mask, token_type_ids = (
            tensor.unsqueeze(1) for tensor in padded_fields  # predict as single choice
        )
        return input_ids, attention_mask, token_type_ids

    return DataLoader(
        TestingDataset(instances),
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_workers,
        collate_fn=collate_fn,
    )

In [None]:
def predict_query_doc_scores(model, df):
    ''' Score every BM25 candidate document of every query with `model`.

    Each query/document pair is fed to the model as a single-choice input and
    the returned logit is used as relevance score. Reads the module-level
    `tokenizer`, `doc_id_to_text`, `max_query_length`, `max_input_length`,
    `batch_size`, `num_negatives` and `num_workers`, and fills the
    module-level `doc_id_to_token_ids` cache.

    Args:
        model: BertForMultipleChoice-style model located on the GPU.
        df: DataFrame with the columns `query_text` and `bm25_top1000`
            (space-separated document ids).

    Returns:
        (query_doc_scores, query_doc_ids): a float array with one row per
        query and one column per candidate document, and the matching list of
        document-id lists (same order as the scores). All queries must have
        the same number of candidates, otherwise stacking fails.
    '''
    model.eval()
    start_time = time()
    sep_token_id = tokenizer.sep_token_id

    # Treat {1 query, K documents} as a dataset for prediction
    query_doc_scores = []
    query_doc_ids = []
    rows = zip(df["query_text"], df["bm25_top1000"])
    for qi, (query_text, bm25_top1000) in enumerate(rows, start=1):
        bm25_doc_id_list = bm25_top1000.split()
        query_doc_ids.append(bm25_doc_id_list)

        #################################################
        #    Collect all instances of query/doc pairs
        #################################################
        query_instances = []

        # Make query tokens for BERT: [CLS] query [SEP]
        query_tokens = tokenizer.tokenize(query_text)[:max_query_length]  # truncation
        query_token_ids = tokenizer.convert_tokens_to_ids(query_tokens)
        query_token_ids.insert(0, tokenizer.cls_token_id)
        query_token_ids.append(sep_token_id)

        # Make input instances for all query/doc pairs
        for doc_id in bm25_doc_id_list:
            doc_token_ids = doc_id_to_token_ids.get(doc_id)
            if doc_token_ids is None:
                doc_tokens = tokenizer.tokenize(doc_id_to_text[doc_id])
                doc_token_ids = tokenizer.convert_tokens_to_ids(doc_tokens)
                doc_id_to_token_ids[doc_id] = doc_token_ids

            # make input sequences for BERT
            # The trailing [SEP] goes onto a NEW list: appending to the cached
            # list would add one more [SEP] each time the document is reused.
            input_ids = query_token_ids + doc_token_ids + [sep_token_id]
            token_type_ids = [0] * len(query_token_ids) + [1] * (len(doc_token_ids) + 1)
            if len(input_ids) > max_input_length:  # truncation
                input_ids = input_ids[:max_input_length]
                token_type_ids = token_type_ids[:max_input_length]
            attention_mask = [1] * len(input_ids)

            # convert and collect all inputs as a dictionary of tensors
            query_instances.append({
                'input_ids': torch.tensor(input_ids, dtype=torch.long),
                'attention_mask': torch.tensor(attention_mask, dtype=torch.float),
                'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            })

        #################################################################
        #    Predict relevance scores for all BM25-top-1000 documents
        #################################################################
        batch_scores = []

        # Predict scores for each document; same number of inputs per step as in training
        dataloader = get_test_dataloader(query_instances, batch_size=batch_size*(num_negatives+1), num_workers=num_workers)
        for di, batch in enumerate(dataloader, start=1):
            input_ids, attention_mask, token_type_ids = (tensor.cuda() for tensor in batch)

            with torch.no_grad():
                outputs = model(input_ids=input_ids,
                                token_type_ids=token_type_ids,
                                attention_mask=attention_mask)
                scores = outputs[0]  # logits, one per input (single choice)

            # move off the GPU and keep for one stack after the loop
            batch_scores.append(scores.cpu().numpy())

            # Progress bar with timer ;-)
            elapsed_time = time() - start_time
            elapsed_time = timedelta(seconds=int(elapsed_time))
            print("Query: %d/%d | Progress: %d/%d | %s      \r" \
                  % (qi, len(df), di, len(dataloader), elapsed_time), end='')

        # merge all scores of this query into one (num_docs, 1) array; the
        # leading empty float64 array keeps the dtype and covers the case of
        # a query without candidates
        doc_scores = np.vstack([np.empty((0, 1))] + batch_scores)
        query_doc_scores.append(doc_scores)

    # (num_docs, num_queries) -> (num_queries, num_docs)
    query_doc_scores = np.hstack(query_doc_scores).T

    print()
    return query_doc_scores, query_doc_ids

In [None]:
# Score every BM25 candidate of the dev-set queries with the finetuned model
dev_query_doc_scores, dev_query_doc_ids = predict_query_doc_scores(model, dev_df)

print('---- Grid search weight for "BM25 + weight * BERT" ----')
best_map_score, best_bert_weight = -100, 0.0
bert_scores = dev_query_doc_scores
n_query = dev_query_doc_scores.shape[0]

# Get MAP@K of BM25 baseline (weight 0 keeps the original BM25 order)
query_pos_doc_ids = dev_df['pos_doc_ids'].values.tolist()
actual = [doc_ids.split() for doc_ids in query_pos_doc_ids]
bm25_predicted = [doc_id_list[:K] for doc_id_list in dev_query_doc_ids]
map_score = mapk(actual, bm25_predicted, k=K)
best_map_score = map_score
print("weight=%.1f: %.5f  (BM25 baseline)" % (0, 100*map_score))

# Collect BM25 scores into same format of BERT scores: one row per query, one
# column per candidate, in the same order as `dev_query_doc_ids`.
# `np.float` was removed in NumPy 1.24; use the explicit `np.float64`.
bm25_scores = np.array([scores.split() for scores in dev_df['bm25_top1000_scores']]).astype(np.float64)
assert bm25_scores.shape == bert_scores.shape, "BM25 and BERT score matrices must align"

# Grid search for BM25 + BERT rescoring over weight = 0.001, 0.002, ..., 5.000
low_bound, high_bound, scale = 0, 5, 1000
grid_steps = range(low_bound * scale+1, high_bound * scale+1)
grids = [step / scale for step in grid_steps]
report_every = scale // 5  # print one line per 0.2 of weight
for step, weight in zip(grid_steps, grids):
    weighted_scores = bm25_scores + weight * bert_scores

    # sort index (descending score) and map to document ids as output
    rescore_argsort = np.flip(weighted_scores.argsort(), axis=1)
    predicted = []
    for i in range(n_query):  # num. of queries
        predicted.append([dev_query_doc_ids[i][idx] for idx in rescore_argsort[i]][:K])
    map_score = mapk(actual, predicted, k=K)

    # show part of results for human evaluation
    # Tested on the integer step: the float test `weight * 10 % 2 == 0`
    # misses weights such as 2.2, where 2.2 * 10 is not exactly 22.
    if step % report_every == 0:
        print("weight=%.1f: %.5f" % (weight, 100*map_score))

    # track weight with best MAP@K
    if map_score > best_map_score:
        best_map_score = map_score
        best_bert_weight = weight
print("\nHighest MAP@%d = %.5f found at weight=%.3f" % (K, 100*best_map_score, best_bert_weight))

In [None]:
# Predict BERT scores for testing set
# NOTE: `n_query` and `bert_scores` are re-bound here; they held the dev-set
# values during the weight grid search and now describe the test set, which
# is what the rescoring cell below consumes.
test_df = pd.read_csv(testing_csv_path)
query_id_list = test_df["query_id"]
n_query = len(query_id_list)
test_query_doc_scores, test_query_doc_ids = predict_query_doc_scores(model, test_df)
bert_scores = test_query_doc_scores

In [None]:
# Rescore query/document score with BM25 + BERT
# Parse the space-separated BM25 scores of every test query into a float matrix
bm25_scores = np.array([
    [float(score) for score in scores.split()]
    for scores in test_df["bm25_top1000_scores"]
])

# Weighted combination, using the weight selected on the development set
weighted_scores = bm25_scores + best_bert_weight * bert_scores

# Rerank document ids with new scores (argsort is ascending, so flip each row)
rescore_argsort = np.flip(weighted_scores.argsort(), axis=1)
ranked_doc_id_list = [
    [test_query_doc_ids[i][idx] for idx in rescore_argsort[i]][:K]
    for i in range(n_query)  # num. of queries
]
ranked_doc_ids = list(map(' '.join, ranked_doc_id_list))

# Save reranked results for submission
data = {'query_id': query_id_list, 'ranked_doc_ids': ranked_doc_ids}
submission_df = pd.DataFrame(data).reset_index(drop=True)
submission_df.to_csv(save_submission_path, index=False)
print("Saved submission file as `%s`" % save_submission_path)