In [12]:
import pandas as pd
from torch.utils.data import DataLoader
import numpy as np, torch, re
from torch.nn.utils.rnn import pad_sequence, pad_packed_sequence, pack_padded_sequence
from torch.nn import Embedding
import torch.nn as nn
import torch.nn.functional as F


In [13]:
df=pd.read_csv('/Users/srishtysuman/PycharmProjects/NaturalLanguageProcessing/quora_question_pair_data/train.csv')
df=df.iloc[:2000]
df
def clean(text):
    if text=='' or text is pd.isnull(text) or type(text)!=str:
        return ''
    text=text.lower()
    text=re.sub(r"([.!?])", r" \1", text)
    text=re.sub(r"[^a-zA-Z]+", r" ", text)

    return text
df=df[["question1", "question2", "is_duplicate"]]
df["question1"]=df["question1"].apply(lambda x: clean(x))
df["question2"]=df["question2"].apply(lambda x: clean(x))
df

class VocabularyNMapSentence:
    def __init__(self):
        self.word2index={}
        self.index2word={'SOS':1, 'EOS':2}
        self.n_word=2
        self.word2count={}
    def addSentence(self, q):
        q_map=[]
        for word in q.split(' '):
            if word in self.word2index:
                q_map.append(self.word2index[word])
            else:
                self.word2index[word]=self.n_word
                self.index2word[self.n_word]=word
                self.n_word+=1
                self.word2count[word]=0
            self.word2count[word]+=1
        return q_map
    
build_vocab_and_map=VocabularyNMapSentence()
df["q1_map"]=df["question1"].apply(lambda x: build_vocab_and_map.addSentence(x))
df["q2_map"]=df["question2"].apply(lambda x: build_vocab_and_map.addSentence(x))
df = df[df['q1_map'].apply(lambda x: len(x) > 0)].reset_index(drop=True)
df = df[df['q2_map'].apply(lambda x: len(x) > 0)].reset_index(drop=True)
df
ques1=df["q1_map"].tolist()
ques2=df["q2_map"].tolist()
labels=df["is_duplicate"].tolist()

class Dataset:
    def __init__(self, ques1, ques2, label):
        self.ques1=ques1
        self.ques2=ques2
        self.labels=label
    def __len__(self):
        return len(self.labels)
    def __getitem__(self, index):
        return {
            "ques1": self.ques1[index],
            "ques2": self.ques2[index],
            "label": self.labels[index]
        }
    
def collate_fn(batch):
    ques1_tensor=[]
    ques2_tensor=[]
    label_tensor=[]

    for i in range(len(batch)):
        ques1_tensor.append(batch[i]["ques1"])
        ques2_tensor.append(batch[i]["ques2"])
        label_tensor.append(batch[i]["label"])
    q1_length=[len(q) for q in ques1_tensor]
    q2_length=[len(q) for q in ques2_tensor]



    return {
        "q1":ques1_tensor,
        "q2":ques2_tensor,
        "q1_length":q1_length,
        "q2_length":q2_length,
        "label":label_tensor
    }

train_data=Dataset(ques1, ques2, labels)
for data in train_data:
    print(data)
    break
train_dataloader=DataLoader(train_data, batch_size=32, collate_fn=collate_fn)
for sample in train_dataloader:
    pass


{'ques1': [5, 10], 'ques2': [2, 3, 4, 5, 6, 5, 7, 8, 9, 10, 11, 12, 14], 'label': 0}


In [20]:
class EncoderLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size):
        super(EncoderLSTM, self).__init__()
        self.embedding=Embedding(vocab_size, embedding_dim)
        self.lstm=nn.LSTM(embedding_dim, hidden_size)
        self.linear1=nn.Linear(hidden_size, hidden_size)
        self.dropout=nn.Dropout(0.1)
        self.linear2=nn.Linear(hidden_size, hidden_size)
        self.relu=nn.ReLU()
        self.final=nn.Sigmoid()
    
    def encode_sentence(self, questions, lengths):
        sorted_indices=np.flipud(np.argsort(lengths))
        lengths=np.flipud(np.sort(lengths))
        lengths=lengths.copy()

        questions=[torch.LongTensor(questions[i]).to('cpu') for i in sorted_indices]        
        questions=pad_sequence(questions, batch_first=True)

        embedded=self.embedding(questions).to('cpu')
        embedded=self.dropout(embedded)

        packed_sequence=pack_padded_sequence(embedded, lengths, batch_first=True)
        out, (hn, cn)=self.lstm(packed_sequence)       
        unpacked, _=pad_packed_sequence(out, batch_first=True, total_length=int(lengths[0]))

        last_token_representation=torch.zeros(hn.size()).permute(1,0,2)

        for i in range(hn.size()[1]):
            last_token_representation[i]=unpacked[i, lengths[i]-1, :].unsqueeze(0)

        out=self.linear2(self.relu(self.linear1(last_token_representation)))

        unsorted_output=torch.zeros(out.size())
        for i, encoded in enumerate(out):
            unsorted_output[sorted_indices[i]]=encoded

        return unsorted_output
    
    def cosine_similarity(self, question1, question2):
        return F.cosine_similarity(question1, question2)
    def manhattan_distance(self, question1, question2):
        return torch.exp(-torch.sum(torch.abs(question1 - question2), dim=0)).to('cpu')

    def forward(self, question1, question2, q1_length, q2_length):
        encoded_q1=self.encode_sentence(question1, q1_length)
        encoded_q2=self.encode_sentence(question2, q2_length)

        similarity_score=torch.zeros(encoded_q1.size()[0]).to('cpu')

        for i in range(encoded_q1.size()[0]):  
            similarity_score[i]=self.manhattan_distance(encoded_q1[i][0], encoded_q2[i][0])
            # similarity_score[i] = torch.dot(encoded_q1[i][0], encoded_q2[i][0])
        # prediction=self.final(similarity_score)
        return similarity_score



hidden_size=128
embedding_dim=256  
loss_history=[]

def cosine_loss(cos, target):
    loss=0
    margin=0
    # print(cos, target)
    for i in range(len(target)):
        if target[i]==1:
            loss+=(1-cos[i])
        else:
            loss+=max(0, cos[i]-margin)
    return loss/len(target)


model=EncoderLSTM(build_vocab_and_map.n_word, embedding_dim, hidden_size)
for i in range(50):
    print(i)
    criterion=nn.MSELoss()
    optimizer=torch.optim.Adam(model.parameters())
    losses=[]
    for batch in train_dataloader:    
        optimizer.zero_grad()
        y=torch.Tensor(batch['label']).to('cpu')
        # y[y==0]=-1
        similarity_score=model(batch['q1'], batch['q2'], batch['q1_length'], batch['q2_length'])
        print(similarity_score, y)
        loss=criterion(similarity_score, y)
        loss.backward()
        optimizer.step()
        losses.append(loss.item())    
    loss_history.append(sum(losses)/len(losses))
    print(loss_history)

import matplotlib.pyplot as plt
plt.plot(np.arange(len(loss_history)), loss_history)
plt.show()


0
tensor([0.0026, 0.0421, 0.0335, 0.0177, 0.0627, 0.0347, 0.0357, 0.0535, 0.0502,
        0.0241, 0.0256, 0.0446, 0.0760, 0.0257, 0.1102, 0.0361, 0.0430, 0.0459,
        0.0303, 0.0401, 0.0451, 0.0290, 0.1626, 0.0385, 0.0763, 0.0311, 0.0557,
        0.0478, 0.0210, 0.0291, 0.0623, 0.0397], grad_fn=<CopySlices>) tensor([0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 1., 1., 1., 0., 1., 1., 0.,
        1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1.])
tensor([0.0703, 0.0559, 0.0674, 0.0914, 0.0940, 0.0374, 0.0662, 0.0574, 0.1340,
        0.0657, 0.1076, 0.0873, 0.0890, 0.0638, 0.0561, 0.0943, 0.0621, 0.2254,
        0.0611, 0.1287, 0.1958, 0.1402, 0.0384, 0.0344, 0.0820, 0.0942, 0.0320,
        0.0584, 0.0545, 0.0777, 0.0786, 0.0675], grad_fn=<CopySlices>) tensor([1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1.,
        1., 1., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0.])
tensor([0.0797, 0.1066, 0.1509, 0.1263, 0.1308, 0.1240, 0.1273, 0.0954, 0.0934,
      

KeyboardInterrupt: 

In [169]:
class EncoderLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size):
        super(EncoderLSTM, self).__init__()
        self.embedding=Embedding(vocab_size, embedding_dim)
        self.lstm=nn.LSTM(embedding_dim, hidden_size)
        self.linear1=nn.Linear(hidden_size, hidden_size)
        self.dropout=nn.Dropout(0.1)
        self.linear2=nn.Linear(hidden_size, hidden_size)
        self.relu=nn.ReLU()
        self.final=nn.Sigmoid()
    
    def encode_sentence(self, questions, lengths):
        sorted_indices=np.flipud(np.argsort(lengths))
        lengths=np.flipud(np.sort(lengths))
        lengths=lengths.copy()

        questions=[torch.LongTensor(questions[i]).to('cpu') for i in sorted_indices]        
        questions=pad_sequence(questions, batch_first=True)

        embedded=self.embedding(questions).to('cpu')
        embedded=self.dropout(embedded)

        packed_sequence=pack_padded_sequence(embedded, lengths, batch_first=True)
        out, (hn, cn)=self.lstm(packed_sequence)       
        unpacked, _=pad_packed_sequence(out, batch_first=True, total_length=int(lengths[0]))

        last_token_representation=torch.zeros(hn.size()).permute(1,0,2)

        for i in range(hn.size()[1]):
            last_token_representation[i]=unpacked[i, lengths[i]-1, :].unsqueeze(0)

        out=self.linear2(self.relu(self.linear1(last_token_representation)))

        unsorted_output=torch.zeros(out.size())
        for i, encoded in enumerate(out):
            unsorted_output[sorted_indices[i]]=encoded

        return unsorted_output
    
    def cosine_similarity(self, question1, question2):
        return F.cosine_similarity(question1, question2)
    def manhattan_distance(self, question1, question2):
        return torch.exp(-torch.sum(torch.abs(question1 - question2), dim=0)).to('cpu')

    def forward(self, question1, question2, q1_length, q2_length):
        encoded_q1=self.encode_sentence(question1, q1_length)
        encoded_q2=self.encode_sentence(question2, q2_length)

        similarity_score=torch.zeros(encoded_q1.size()[0]).to('cpu')

        for i in range(encoded_q1.size()[0]):  
            similarity_score[i]=self.manhattan_distance(encoded_q1[i][0], encoded_q2[i][0])
            # similarity_score[i] = torch.dot(encoded_q1[i][0], encoded_q2[i][0])
        # prediction=self.final(similarity_score)
        return similarity_score



hidden_size=128
embedding_dim=256  
loss_history=[]

def cosine_loss(cos, target):
    loss=0
    margin=0
    # print(cos, target)
    for i in range(len(target)):
        if target[i]==1:
            loss+=(1-cos[i])
        else:
            loss+=max(0, cos[i]-margin)
    return loss/len(target)


model=EncoderLSTM(build_vocab_and_map.n_word, embedding_dim, hidden_size)
for i in range(50):
    print(i)
    criterion=nn.MSELoss()
    optimizer=torch.optim.Adam(model.parameters())
    losses=[]
    for batch in train_dataloader:    
        optimizer.zero_grad()
        y=torch.Tensor(batch['label']).to('cpu')
        # y[y==0]=-1
        similarity_score=model(batch['q1'], batch['q2'], batch['q1_length'], batch['q2_length'])
        print(similarity_score, y)
        loss=criterion(similarity_score, y)
        loss.backward()
        optimizer.step()
        losses.append(loss.item())    
    loss_history.append(sum(losses)/len(losses))
    print(loss_history)

import matplotlib.pyplot as plt
plt.plot(np.arange(len(loss_history)), loss_history)
plt.show()


[tensor(466.2456, grad_fn=<AddBackward0>),
 tensor(465.6753, grad_fn=<AddBackward0>),
 tensor(465.9121, grad_fn=<AddBackward0>),
 tensor(465.5562, grad_fn=<AddBackward0>),
 tensor(466.4379, grad_fn=<AddBackward0>),
 tensor(465.9668, grad_fn=<AddBackward0>),
 tensor(466.1316, grad_fn=<AddBackward0>),
 tensor(466.0212, grad_fn=<AddBackward0>),
 tensor(465.9786, grad_fn=<AddBackward0>),
 tensor(467.0018, grad_fn=<AddBackward0>)]

In [157]:
m = nn.Sigmoid()
loss = nn.BCELoss()
input = torch.randn(3, 2, requires_grad=True)
target = torch.rand(3, 2, requires_grad=False)
target

tensor([[0.3765, 0.6911],
        [0.8399, 0.8280],
        [0.5222, 0.4767]])

In [158]:
m=nn.Sigmoid()
input = torch.randn(2)
output = m(input)
print(input, output)

tensor([-0.4543,  0.0879]) tensor([0.3883, 0.5220])


In [None]:
output = loss(m(input), target)
