# Import the libraries

In [None]:
import pickle
from tqdm import tqdm
from nltk.util import ngrams
import torch
import gzip
import csv
import torch
import torch.nn as nn
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import random
import numpy as np
import time
import matplotlib.pyplot as plt

# Building the dataloader 

This Dataset class fetches the training data in mini-batches. The queries file is assumed to be a dictionary `{k: v}` in which `k` is the query_id and `v` is the query sequence, represented as a list of lists. Each inner list describes one word of the query, so a query of 10 words is a list of 10 lists. Each inner list stores the indices of the letter trigrams present in the corresponding word.
      
### Example

Let the query be `"clean cities"`. Since there are only two words, we'd have a list of two lists, which can be described as follows:

`clean cities` ===> `['clean', 'cities']` ===> `[['#cl', 'cle', 'lea', 'ean', 'an#'], ['#ci', 'cit', 'iti', 'tie', 'ies', 'es#']]`. We just replace the letter trigrams with their indices in the above list. 

`doc_titles` follows the same format. The variable `ds` represents the training data. I have assumed pairwise training here: we train on (query, document) pairs, where each pair consists of a query and its relevant document (a document that can answer the given query).


In [None]:
class Dataset(Dataset):
    """Pairwise training set yielding (query, positive doc, negative docs).

    Queries and document titles are dictionaries mapping an id to a list of
    words, where each word is a list of letter-trigram indices. Every item is
    turned into a multi-hot tensor of shape ``(VOCAB_SIZE, width)``: column
    ``k + 1`` has a 1 at each trigram index of word ``k``; column 0 is always
    left empty. Words that do not fit into the fixed width are dropped.

    NOTE: the class intentionally keeps the name ``Dataset`` (shadowing the
    torch base class it derives from) because the rest of the file
    instantiates it under that name.
    """

    VOCAB_SIZE = 30000      # number of distinct letter trigrams
    SEQ_LEN = 20            # columns of a query / positive-document tensor
    NEG_SEQ_LEN = 10        # columns of a negative-document tensor
    NEG_POOL_SIZE = 1000    # random documents kept in the negative pool
    NEGS_PER_SAMPLE = 4     # negatives returned with every training pair
    CACHE_LIMIT = 1000      # cached query tensors before cache + pool reset

    def __init__(self):
        # SECURITY: pickle.load executes arbitrary code embedded in the file;
        # only point these paths at trusted, locally produced data.
        self.queries = self._load_pickle('./DS/train_queries.p')
        self.doc_titles = self._load_pickle('./DS/doc_titles.p')

        self.doc_map = list(self.doc_titles.keys())
        self.coln_len = len(self.doc_map)

        self.ds = self._load_pickle('./P2-datasets/qrel_train.p')
        self.ds = list(reversed(self.ds))

        # Caches query tensors. This helps when several consecutive pairs
        # share a query (e.g. top-k documents with k=5): the query tensor is
        # built once instead of once per (Q, D) tuple.
        self.local_cache = {}
        self.init_negs_()

    @staticmethod
    def _load_pickle(path):
        """Load one pickle file, closing the file handle afterwards."""
        with open(path, 'rb') as fh:
            return pickle.load(fh)

    @staticmethod
    def _mark_trigrams(tensor, seq):
        """Set the trigram indices of each word of ``seq`` in ``tensor``.

        ``tensor`` is a 2-D ``(vocab, width)`` tensor modified in place and
        returned. Word ``k`` goes to column ``k + 1``; ``zip`` stops at the
        last column, so words beyond ``width - 1`` are silently truncated
        instead of raising ``IndexError``.
        """
        for col, trigram_ids in zip(range(1, tensor.shape[1]), seq):
            tensor[trigram_ids, col] = 1
        return tensor

    # For negative docs, we periodically build a local pool of randomly
    # selected documents. The pool is rebuilt each time the query cache is
    # cleared.
    def init_negs_(self):
        """(Re)build ``self.negs`` from NEG_POOL_SIZE random documents."""
        idx = np.random.randint(0, self.coln_len, (self.NEG_POOL_SIZE,))
        self.negs = torch.zeros(
            self.NEG_POOL_SIZE, self.VOCAB_SIZE, self.NEG_SEQ_LEN
        )
        for row, doc_pos in enumerate(idx):
            # self.negs[row] is a view, so the fill writes into the pool.
            self._mark_trigrams(
                self.negs[row], self.doc_titles[self.doc_map[doc_pos]]
            )

    def __len__(self):
        """Return the number of (query, document) training pairs."""
        return len(self.ds)

    def __getitem__(self, index):
        """Return ``(query_tensor, pos_tensor, neg_tensor)`` for one pair.

        ``query_tensor`` and ``pos_tensor`` have shape
        ``(VOCAB_SIZE, SEQ_LEN)``; ``neg_tensor`` has shape
        ``(NEGS_PER_SAMPLE, VOCAB_SIZE, NEG_SEQ_LEN)`` and is drawn at random
        from the current negative pool.
        """
        qid = self.ds[index][0]
        did = self.ds[index][1]

        query_tensor = self.local_cache.get(qid)
        if query_tensor is None:
            if len(self.local_cache) > self.CACHE_LIMIT:
                # Bound memory use and refresh the negatives at the same time.
                self.local_cache = {}
                self.init_negs_()

            query_tensor = self._mark_trigrams(
                torch.zeros(self.VOCAB_SIZE, self.SEQ_LEN), self.queries[qid]
            )
            self.local_cache[qid] = query_tensor

        pos_tensor = self._mark_trigrams(
            torch.zeros(self.VOCAB_SIZE, self.SEQ_LEN), self.doc_titles[did]
        )

        idx = np.random.randint(0, len(self.negs), (self.NEGS_PER_SAMPLE,))
        neg_tensor = self.negs[idx]

        return query_tensor, pos_tensor, neg_tensor

In [None]:
# Build the training set once; its constructor loads the pickled queries,
# document titles and relevance pairs from disk.
ds = Dataset()
# Mini-batches of 512 pairs, served in the dataset's own order (no shuffling).
dl = DataLoader(dataset=ds, batch_size=512, shuffle=False)

# CDSSM model description

CDSSM runs the 1-d convolution over the input sequence in order to extract the local contextual features from different parts of input sentence. Then the max-pooling layer builds the sentence level feature vector from the local contextual feature vectors. We then project this global sentence level feature map to a lower dimensional latent semantic space using the fully connected semantic layer. 

### 1-d Convolution : 
    We run 1-d convolution over the input sequence. Since the input sequence is sequence of letter tri-gram based word hashes, each word in the input sequence is represented as a 30000 dimensional vector based on letter-trigrams present in that word. We use a filter of length 3 here. Each convolutional operation produces the output vector of size 300, which represents the local feature map at different positions in the sentence. 
    
### Max-pooling : 

    Max-pooling operation performs the max operation over the output of convolution layer over entire input sequence. We have fixed the query and document length to be 20 here, so, the max-pooling layer has fixed input size of 18 in our case. Likewise, the size of negative documents has been chosen to be 10, and so, the max pooling layer for negative docs is of input size 8. 
    
### Semantic layer : 

    This is a simple fully connected layer that projects the global feature map into the latent semantic space.

In [None]:
class CDSSM(nn.Module):
    """Convolutional DSSM scoring a query against one positive and K negatives.

    Every input is a ``(batch, vocab_size, words)`` tensor. The shared encoder
    applies conv -> dropout -> tanh -> max-pool over all positions -> linear
    -> dropout -> tanh, producing one ``sem_dim`` vector per input.

    Args:
        vocab_size: number of input channels (letter-trigram vocabulary).
        conv_channels: output channels of the convolution.
        kernel_size: width of the convolution window, in words.
        sem_dim: dimensionality of the latent semantic space.
        seq_len: word length of query and positive-document inputs.
        neg_seq_len: word length of negative-document inputs.
        dropout: dropout probability used after the conv and semantic layers.
        gamma: temperature multiplied into the cosine similarities.
        device: where to place the parameters; ``None`` selects CUDA when it
            is available and falls back to the CPU otherwise.
    """

    def __init__(self, vocab_size=30000, conv_channels=300, kernel_size=3,
                 sem_dim=128, seq_len=20, neg_seq_len=10, dropout=0.15,
                 gamma=10, device=None):
        super(CDSSM, self).__init__()

        # Convolutional layer
        self.conv = nn.Conv1d(vocab_size, conv_channels, kernel_size,
                              bias=False)
        self.drop1 = nn.Dropout(dropout)

        # Max-pooling layers. An un-padded convolution leaves
        # ``length - kernel_size + 1`` positions; pooling over all of them
        # yields a single vector per sequence (18 and 8 with the defaults).
        self.max_pool = nn.MaxPool1d(seq_len - kernel_size + 1)
        self.neg_max_pool = nn.MaxPool1d(neg_seq_len - kernel_size + 1)

        # Semantic layer
        self.sem = nn.Linear(conv_channels, sem_dim, bias=False)
        self.drop2 = nn.Dropout(dropout)

        # Temperature parameter of softmax function.
        self.gamma = gamma

        if device is None:
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.to(device)

    def _encode(self, x, pool):
        """Map ``(N, vocab, words)`` inputs to ``(N, sem_dim)`` vectors."""
        local = torch.tanh(self.drop1(self.conv(x)))
        pooled = pool(local).squeeze(2)
        return torch.tanh(self.drop2(self.sem(pooled)))

    def forward(self, q, p, n):
        """Return temperature-scaled cosine scores of shape ``(B, 1 + K)``.

        Args:
            q: queries, ``(B, vocab, seq_len)``.
            p: positive documents, ``(B, vocab, seq_len)``.
            n: negative documents flattened to ``(B * K, vocab, neg_seq_len)``
                with the K negatives of each query stored contiguously.

        Column 0 of the result is the positive document's score; columns
        1..K are the negatives' scores.
        """
        qs = self._encode(q, self.max_pool).unsqueeze(1)
        ps = self._encode(p, self.max_pool).unsqueeze(1)

        # Negatives go through the full encoder (including the final
        # dropout + tanh), exactly like the positive document.
        ns = self._encode(n, self.neg_max_pool)
        ns = ns.view(len(q), -1, ns.size(-1))

        docs = torch.cat((ps, ns), dim=1)

        scores = torch.cosine_similarity(docs, qs, dim=2)
        return scores * self.gamma

In [None]:
# Network to be trained.
model = CDSSM()
# Cross-entropy over the per-document scores returned by the model.
criterion = nn.CrossEntropyLoss()
# Plain SGD with momentum over every model parameter.
optimizer = torch.optim.SGD(params=model.parameters(), lr=5e-4, momentum=0.9)

# Training : 

The following lines demonstrate the training of CDSSM. We can also fine tune the temperature parameter gamma based on validation set. Various values of gamma like [0.1, 10, 25, 40] should be tried. 

## After training : 

In my case, the model was trained for about 25 epochs. We can also perform additional evaluation of the model on a validation set at the end of each epoch; in fact, we did evaluate the model on the validation set at each epoch. Once the model has been trained, we can encode each document into the latent semantic space using the trained model. This gives us a big tensor of size `[|C|, 128]`. Here `|C|` is the size of the collection (number of documents in the corpus) and `128` is the dimensionality of the latent semantic space. Retrieval of relevant documents can be done based on the cosine similarity between the query vector (encoded in the same space) and this huge collection tensor. We can then retrieve the top documents based on the cosine similarity scores. We were able to reach an MRR@10 of 0.215 on the MS-MARCO dataset using this model.

In [None]:
import os

num_epochs = 100
PATH = "model_checkpoints/mod5e-4_"

# torch.save does not create missing directories; make sure the checkpoint
# folder exists before spending a whole epoch of training.
os.makedirs(os.path.dirname(PATH), exist_ok=True)

# Per-mini-batch and per-epoch loss history, stored as plain Python floats so
# the lists (and the checkpoints that embed them) hold no device tensors.
mini_batch_losses = []
epoch_losses = []

# Feed the batches to whichever device the model's parameters live on.
device = next(model.parameters()).device

for epoch in range(num_epochs):
    epoch_loss = 0.0
    for batch in tqdm(dl):

        q = batch[0].to(device)
        p = batch[1].to(device)

        # Merge the first two dimensions of the 4-D negatives tensor so the
        # model receives one 3-D tensor holding every negative of the batch.
        n = batch[2].flatten(0, 1).to(device)

        # The positive document is always column 0 of the model output, so
        # the target class is 0 for every row. Sized per batch because the
        # last batch may be smaller than the others.
        y = torch.zeros(len(q), dtype=torch.long, device=device)

        y_hat = model(q, p, n)
        loss = criterion(y_hat, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        mini_batch_losses.append(batch_loss)
        epoch_loss += batch_loss
    epoch_losses.append(epoch_loss)

    print("EPOCH : ", epoch, " LOSS : ", epoch_loss)

    # One checkpoint per epoch, carrying everything needed to resume.
    model_path = PATH + str(epoch) + '.pt'
    torch.save({'epoch' : epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': epoch_loss,
                'mini_batch_losses' : mini_batch_losses,
                'epoch_losses' : epoch_losses}, model_path)
           