In [1]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F
import nltk
import random
import numpy as np
from collections import Counter, OrderedDict
import re
from copy import deepcopy
import pandas as pd
import numpy as np

In [2]:
# Choose the tensor constructors once: CUDA variants when a GPU is available,
# CPU variants otherwise. The rest of the notebook builds tensors through
# these aliases.
USE_CUDA = torch.cuda.is_available()
if USE_CUDA:
    FloatTensor = torch.cuda.FloatTensor
    LongTensor = torch.cuda.LongTensor
    ByteTensor = torch.cuda.ByteTensor
else:
    FloatTensor = torch.FloatTensor
    LongTensor = torch.LongTensor
    ByteTensor = torch.ByteTensor

In [None]:
def prepare_sequence(seq, to_index):
    """Map each token of ``seq`` to its integer id and pack the ids into a LongTensor.

    Tokens missing from ``to_index`` are replaced by the id of the unknown
    token. Both spellings are accepted: ``"<unk>"`` is preferred when present,
    otherwise ``"<UNK>"`` is used.

    Args:
        seq: iterable of tokens (iterating a ``str`` yields single characters).
        to_index: dict mapping token -> integer id.

    Returns:
        A 1-D ``LongTensor`` holding one id per token, in input order.

    Raises:
        KeyError: if a token is missing from ``to_index`` and the mapping has
            neither an ``"<unk>"`` nor an ``"<UNK>"`` entry.
    """
    # Resolve the fallback id once instead of on every out-of-vocabulary token.
    unk_index = to_index.get("<unk>", to_index.get("<UNK>"))

    idxs = []
    for w in seq:
        if w in to_index:
            idxs.append(to_index[w])
        elif unk_index is not None:
            idxs.append(unk_index)
        else:
            # Same exception type the plain dict lookup would have produced.
            raise KeyError("<unk>")
    return LongTensor(idxs)

In [None]:
def getBatch(batch_size,train_data):
    """Shuffle ``train_data`` in place and yield it in consecutive mini-batches.

    Args:
        batch_size: positive number of examples per batch.
        train_data: list of examples; it is shuffled IN PLACE on the first
            iteration of the generator.

    Yields:
        Slices of ``train_data`` of length ``batch_size``; the final slice may
        be shorter. Nothing is yielded when ``train_data`` is empty.

    Raises:
        ValueError: if ``batch_size`` is not positive (a zero step would
            otherwise never advance through the data).
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    random.shuffle(train_data)
    # Stepping by batch_size covers the full batches and the (possibly
    # shorter) tail, and produces no batch at all for empty data.
    for sindex in range(0, len(train_data), batch_size):
        yield train_data[sindex:sindex + batch_size]

In [None]:
def pad_to_batch(batch,x_to_ix):
    """Right-pad the inputs of a batch to a common length and stack inputs and targets.

    Args:
        batch: list of ``(x, y)`` pairs. Each ``x`` is measured with
            ``.size(1)`` and concatenated along dim 1, so it must be at least
            2-D (presumably ``(1, seq_len)``, as produced by ``.view(1, -1)``).
            NOTE(review): ``train_data`` in this notebook zips three items
            (x1, x2, label) while this function unpacks pairs - confirm the
            intended batch layout.
        x_to_ix: input vocabulary; ``x_to_ix['<PAD>']`` is the padding id.

    Returns:
        ``(input_var, target_var, input_len, target_len)`` where
        ``input_var`` stacks the padded inputs (longest first),
        ``target_var`` is ``torch.cat`` of the targets in the same order
        (targets are NOT padded here, so they must already be compatible),
        ``input_len`` counts the non-padding ids per input row, and
        ``target_len`` counts the non-zero ids per target row.

    Raises:
        KeyError: if padding is required and ``x_to_ix`` has no ``'<PAD>'``.
    """
    # Longest input first.
    sorted_batch = sorted(batch, key=lambda b: b[0].size(1), reverse=True) # sort by len
    x, y = list(zip(*sorted_batch))
    max_x = max(s.size(1) for s in x)

    x_p = []
    for seq in x:
        missing = max_x - seq.size(1)
        if missing > 0:
            filler = Variable(LongTensor([x_to_ix['<PAD>']] * missing)).view(1, -1)
            x_p.append(torch.cat([seq, filler], 1))
        else:
            x_p.append(seq)

    input_var = torch.cat(x_p)
    target_var = torch.cat(y)

    # Count real tokens against the SAME id that was used for padding; fall
    # back to 0 only when the vocabulary has no '<PAD>' entry (nothing was
    # padded in that case).
    pad_id = x_to_ix.get('<PAD>', 0)
    input_len = [int((row.data != pad_id).sum()) for row in input_var]
    # No target vocabulary is passed in, so id 0 is treated as target padding.
    target_len = [int((row.data != 0).sum()) for row in target_var]

    return input_var, target_var, input_len, target_len

In [None]:
def normalize_string(s):
    """Collapse every run of whitespace in ``s`` to one space and trim both ends."""
    pieces = re.split(r"\s+", s)
    return " ".join(pieces).strip()

In [None]:
# Load the tab-separated sentence-pair file, naming its four columns explicitly.
df = pd.read_csv(
    'data/atec_nlp_sim_train.csv',
    sep='\t',
    names=['number', "sen1", "sen2", "label"],
)

In [None]:
%%time

# Whitespace-normalize both sentence columns and pull out the labels.
X1_r = [normalize_string(sentence) for sentence in df.sen1.tolist()]
X2_r = [normalize_string(sentence) for sentence in df.sen2.tolist()]
y_r = df.label.tolist()

# Quick look: sizes of the three lists, then the first sentence pair and its label.
print(len(X1_r), len(X2_r), len(y_r))
print(X1_r[0], "@@@@", X2_r[0], "@@@@",y_r[0])

In [None]:
# Vocabulary = every distinct element obtained by iterating the sentences
# (single characters, since each sentence is a str). Built with a set
# comprehension because no `flatten` helper is defined or imported in this
# notebook, and sorted so the ids derived from it are identical across kernel
# restarts (set iteration order of strings is not stable between runs).
vocab = sorted({token for sentence in X1_r + X2_r for token in sentence})

In [None]:
# Ids 0-3 are reserved for the special tokens; every vocabulary entry not
# already present gets the next free id, in iteration order of `vocab`.
source2index = {'<PAD>':0,'<UNK>':1,'<s>':2,'</s>':3}
for vo in vocab:
    source2index.setdefault(vo, len(source2index))

# Reverse lookup: id -> token.
index2source = dict((idx, token) for token, idx in source2index.items())

In [None]:
%%time
# Encode every sentence pair as two (1, seq_len) id tensors and collect the
# labels alongside them.
X1_p, X2_p = [],[]
ta_p = []

for s1, s2, ta in zip(X1_r, X2_r, y_r):
    X1_p.append(prepare_sequence(s1,source2index).view(1,-1))
    X2_p.append(prepare_sequence(s2,source2index).view(1,-1))
    # Append the loop's own label variable (`ta`); no name `label` exists here.
    ta_p.append(ta)


In [None]:
# Bundle the two encoded sentences and the label of each example into one tuple.
train_data = [example for example in zip(X1_p, X2_p, ta_p)]
print(train_data[0])

In [3]:
# Model hyper-parameters. Section/equation numbers refer to the paper this
# model follows.
LETTER_GRAM_SIZE = 3          # Section 3.2.
WINDOW_SIZE = 3               # Section 3.2.
TOTAL_LETTER_GRAMS = 30000    # Determined from data. Section 3.2.
WORD_DEPTH = TOTAL_LETTER_GRAMS * WINDOW_SIZE  # Equation (1).
# For quick tests, uncomment the smaller depth below.
# WORD_DEPTH = 1000
K = 300                       # Dimensionality of the max-pooling layer. Section 3.4.
L = 128                       # Dimensionality of the latent semantic space. Section 3.5.
J = 4                         # Random unclicked documents used as negatives per query. Section 4.
FILTER_LENGTH = 1             # Convolutions look at a single time step only.


In [4]:
def kmax_pooling(x, dim, k):
    index = x.topk(k, dim = dim)[1].sort(dim = dim)[0]
    return x.gather(dim, index)

class CDSSM(nn.Module):
    """Convolutional semantic matcher with separate query and document towers.

    Each tower applies a width-``FILTER_LENGTH`` 1-D convolution over the word
    axis, max-pools over time, and projects to an ``L``-dimensional semantic
    vector. ``forward`` scores the query against the positive document (and
    any negatives) by cosine similarity scaled through ``learn_gamma``.

    NOTE(review): the scoring head in ``forward`` is a reconstruction. The
    previous body ended in an undefined name, so the intended output could not
    be read from the code. ``fc`` is declared but is not used by ``forward``.
    """

    def __init__(self):
        super(CDSSM, self).__init__()
        # layers for query
        self.query_conv = nn.Conv1d(WORD_DEPTH, K, FILTER_LENGTH)
        self.query_sem = nn.Linear(K, L)
        # layers for docs
        self.doc_conv = nn.Conv1d(WORD_DEPTH, K, FILTER_LENGTH)
        self.doc_sem = nn.Linear(K, L)
        # learning gamma: a 1x1 convolution, i.e. one learned scale and one bias
        self.learn_gamma = nn.Conv1d(1, 1, 1)

        # Maps a concatenated (query, doc) vector of size 2*L to L.
        # Currently unused by forward(); kept so the parameter set is unchanged.
        self.fc = nn.Linear(2*L, L)

    def _encode(self, x, conv, sem):
        """Encode ``x`` of shape (batch, seq_len, WORD_DEPTH) into (batch, L)."""
        # Conv1d expects channels first: (batch, WORD_DEPTH, seq_len).
        x = x.transpose(1, 2)
        # h = tanh(W_c . l + b_c): every word vector is mapped to K features.
        # Result: (batch, K, seq_len) because FILTER_LENGTH is 1.
        h = torch.tanh(conv(x))
        # Max over the time axis -> (batch, K, 1), then (batch, 1, K) for Linear.
        v = kmax_pooling(h, 2, 1).transpose(1, 2)
        # y = tanh(W_s . v + b_s): semantic vector, (batch, 1, L).
        y = torch.tanh(sem(v))
        # Drop the singleton axis; unlike a fixed reshape to L this keeps the
        # batch dimension, so batches larger than one work.
        return y.squeeze(1)

    def forward(self, q, pos, negs=None):
        """Score a query against a positive document and optional negatives.

        Args:
            q: query tensor, (batch, query_len, WORD_DEPTH).
            pos: positive document tensor, (batch, doc_len, WORD_DEPTH).
            negs: optional iterable of negative document tensors, each
                (batch, doc_len_i, WORD_DEPTH). ``None`` means no negatives.

        Returns:
            ``prob`` of shape (batch, 1 + number_of_negatives); column 0
            belongs to the positive document. With negatives the row is a
            softmax over all candidates; without negatives it is the sigmoid
            of the single scaled similarity.
        """
        # The query and the documents use separate towers (the paper's
        # section 5.2 setup, per the original comments in this notebook).
        q_s = self._encode(q, self.query_conv, self.query_sem)

        docs = [pos]
        if negs is not None:
            docs.extend(negs)
        doc_vecs = [self._encode(d, self.doc_conv, self.doc_sem) for d in docs]

        # R(Q, D): cosine similarity per candidate -> (batch, n_candidates).
        sims = torch.stack(
            [F.cosine_similarity(q_s, d_s, dim=1) for d_s in doc_vecs], dim=1
        )
        # Learned smoothing factor applied to every similarity.
        scaled = self.learn_gamma(sims.unsqueeze(1)).squeeze(1)

        if len(doc_vecs) == 1:
            # A softmax over a single candidate is always 1, so use a sigmoid.
            prob = torch.sigmoid(scaled)
        else:
            prob = F.softmax(scaled, dim=1)
        return prob

In [5]:
model = CDSSM()

# Build a random data set: `sample_size` (query, document) pairs with random
# lengths. A dedicated seeded generator makes the lengths and values
# identical on every run.
# Memory: each array holds length * WORD_DEPTH float64 values, so with the
# full WORD_DEPTH a single document can take a few hundred MB.
sample_size = 10
rng = np.random.RandomState(0)
l_Qs = []
pos_l_Ds = []

for i in range(sample_size):
    # Query: between 1 and 9 words.
    query_len = rng.randint(1, 10)
    l_Q = rng.rand(1, query_len, WORD_DEPTH)
    l_Qs.append(l_Q)

    # Document: between 50 and 499 words.
    doc_len = rng.randint(50, 500)
    l_D = rng.rand(1, doc_len, WORD_DEPTH)
    pos_l_Ds.append(l_D)

In [7]:
# Shape of the first random query matrix: (1, query_len, WORD_DEPTH).
l_Qs[0].shape

(1, 7, 90000)

In [8]:
# Shape of the second random query matrix: (1, query_len, WORD_DEPTH).
l_Qs[1].shape

(1, 5, 90000)