In [1]:
# from google.colab import drive
# drive.mount('/content/drive')
# %cd /content/drive/MyDrive/Colab Notebooks/Question_Pairs_new/Question_Pairs_BiLSTM

In [1]:
import torch
import torch.nn as nn
from torch.utils import data
from tokenizers import BertWordPieceTokenizer
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import random
from datetime import datetime
device = "cuda:0" if torch.cuda.is_available() else "cpu"
# !nvidia-smi

In [2]:
df = pd.read_csv('dataset/train.csv')

In [110]:
data_train, data_val = train_test_split(df, test_size = 0.2, random_state = 42)

In [183]:
class CustomDataset(data.Dataset):
    def __init__(self, docs_1, docs_2, labels, tokenizer_path):
        self.docs_1 = docs_1
        self.docs_2 = docs_2
        self.labels = labels
        self.tokenizer = BertWordPieceTokenizer(tokenizer_path)

    def __len__(self):
        return len(self.docs_1)

    def __getitem__(self, item):
        docs_1 = str(self.docs_1[item])
        docs_2 = str(self.docs_2[item])
        labels = int(self.labels[item])
        encoded_docs_1 = self.tokenizer.encode(docs_1)
        encoded_docs_2 = self.tokenizer.encode(docs_2)
        return dict(
            docs_1 = docs_1, 
            docs_2 = docs_2, 
            labels = labels,
            tokenized_doc_1 = encoded_docs_1.tokens,
            tokenized_doc_2 = encoded_docs_2.tokens,
            input_ids_1 = torch.tensor(encoded_docs_1.ids),
            input_ids_2 = torch.tensor(encoded_docs_2.ids)
            ) 

In [189]:
train_data = CustomDataset(
    data_train.question1.to_numpy(),
    data_train.question2.to_numpy(), 
    data_train.is_duplicate.to_numpy(),
    'BPE/vocab.txt'
    )
validation_data = CustomDataset(
    data_val.question1.to_numpy(),
    data_val.question2.to_numpy(), 
    data_val.is_duplicate.to_numpy(),
    'BPE/vocab.txt'
    )

In [190]:
from torch.nn.utils.rnn import pad_sequence
def padding(data):
    q1 = []
    q2 = []
    t = []
    for i in data:
        q1.append(i['input_ids_1'])
        q2.append(i['input_ids_2'])
        t.append(i['labels'])
    q1_pad = pad_sequence(q1, batch_first=True)
    q2_pad = pad_sequence(q2, batch_first=True)
    # t_pad = pad_sequence(t, batch_first=True)
    return q1_pad, q2_pad, torch.tensor(t)

In [191]:
train_loader = data.DataLoader(train_data, batch_size = 64, collate_fn=padding)
validation_loader = data.DataLoader(validation_data, batch_size = 128, collate_fn=padding)

In [192]:
for q1, q2, t in train_loader:
    print('questions_1: {}, shape: {}'.format(q1, q1.shape))
    print('questions_2: {}, shape: {}'.format(q2, q2.shape))
    print('lables: {}, shape: {}'.format(t, t.shape))
    break

questions_1: tensor([[   2, 1394, 1414,  ...,    0,    0,    0],
        [   2, 1399, 1669,  ...,    0,    0,    0],
        [   2, 1394, 1399,  ...,    0,    0,    0],
        ...,
        [   2, 1406, 1683,  ...,    0,    0,    0],
        [   2, 1555, 1399,  ...,    0,    0,    0],
        [   2, 1394, 1399,  ...,    0,    0,    0]]), shape: torch.Size([64, 33])
questions_2: tensor([[   2, 1394, 1414,  ...,    0,    0,    0],
        [   2, 1414, 2575,  ...,    0,    0,    0],
        [   2, 1394, 1414,  ...,    0,    0,    0],
        ...,
        [   2, 1406, 1460,  ...,    0,    0,    0],
        [   2, 1659, 1388,  ...,    0,    0,    0],
        [   2, 1394, 1399,  ...,    0,    0,    0]]), shape: torch.Size([64, 35])
lables: tensor([0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0,
        0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0]), shape: torch.Size([64])


In [196]:
for q1, q2, t in validation_loader:
    print('questions_1: {}, shape: {}'.format(q1, q1.shape))
    print('questions_2: {}, shape: {}'.format(q2, q2.shape))
    print('lables: {}, shape: {}'.format(t, t.shape))
    break

questions_1: tensor([[   2, 1406, 1401,  ...,    0,    0,    0],
        [   2, 1394, 1414,  ...,    0,    0,    0],
        [   2, 1491, 1399,  ...,    0,    0,    0],
        ...,
        [   2, 1394, 1399,  ...,    0,    0,    0],
        [   2, 1406, 1420,  ...,    0,    0,    0],
        [   2, 1406, 1420,  ...,    0,    0,    0]]), shape: torch.Size([128, 51])
questions_2: tensor([[   2, 1406, 1401,  ...,    0,    0,    0],
        [   2, 1394, 1414,  ...,    0,    0,    0],
        [   2, 1394, 1399,  ...,    0,    0,    0],
        ...,
        [   2, 1394, 3328,  ...,    0,    0,    0],
        [   2, 1420, 9800,  ...,    0,    0,    0],
        [   2, 1406, 1420,  ...,    0,    0,    0]]), shape: torch.Size([128, 62])
lables: tensor([0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
        1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0,
        0, 0, 

In [161]:
# itos
batch_train = next(iter(train_loader))
print(f"q1 itos : {tokenizer.convert_ids_to_tokens(batch_train['input_ids_1'][0])}")
print('===================')
print(f"q2 itos : {tokenizer.convert_ids_to_tokens(batch_train['input_ids_2'][0])}")
print('===================')
print(f"label : {batch_train['labels'][0]}")

q1 itos : ['India', ':', 'What', 'are', 'job', 'options', 'and', 'future', 'options', 'for', 'low', 'C', '##GP', '##A', 'or', 'graduation', 'percentage', 'engineering', 'students', 'in', 'India', '(', '5', '-', '6', ')', '?', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]']
q2 itos : ['Job', '##s', 'and', 'Career', '##s', 'in', 'India', ':', 'I', 'am', 'currently', 'in', 'my', 'third', 'year', 'of', 'engineering', 'and', 'I', 'have', 'a', 'C', '##GP', '##A', 'of', '7', '.', '6', 'with', 'one', 'back', '##log', '(', 'after', '4', 'semester', '##s', ')', '.', 'I', 'have', 'decided', 'that', 'I', 'want', 'to', 'seek', 'a', 'job', 'in', 'a', 'firm', 'like', 'Mu', 'Sigma', 'or', 'do', 'MBA', '.', 'Which', 'optio

In [164]:
data_train.tokenizer.get_vocab_size()

AttributeError: 'DataFrame' object has no attribute 'tokenizer'

In [35]:
class BiLSTM(nn.Module):
    def __init__(self, n_vocabs, embed_dims, n_lstm_units, n_lstm_layers, n_output_classes):
        super(BiLSTM, self).__init__()
        self.v = n_vocabs
        self.e = embed_dims
        self.u = n_lstm_units
        self.l = n_lstm_layers
        self.o = n_output_classes

        self.embed = nn.Embedding(
            self.v,
            self.e
            )
        self.bilstm = nn.LSTM(
            input_size = self.e,
            hidden_size = self.u,
            num_layers = self.l,
            batch_first = True,
            bidirectional = True,
            dropout = 0.5
        )
        self.linear = nn.Linear(
            self.u * 4,
            self.o
        )

    def forward(self, X1, X2):
        h0 = torch.zeros(self.l * 2, X1.size(0), self.u).to(gpu)
        c0 = torch.zeros(self.l * 2, X1.size(0), self.u).to(gpu)
        out1 = self.embed(X1)
        out2 = self.embed(X2)
        # NxTxh, lxNxh
        out1, _ = self.bilstm(out1, (h0, c0))
        out2, _ = self.bilstm(out2, (h0, c0))
        out1 = out1[:, -1, :]
        out2 = out2[:, -1, :]
        # concatenate out1&2
        out = torch.cat((out1, out2), 1)
        out = self.linear(out)
        iout = torch.max(out, 1)[1]
        
        return iout, out

In [36]:
torch.manual_seed(42)
model = BiLSTM(len(vocab), 512, 512, 2, 2).to(gpu)
criterion = nn.CrossEntropyLoss().to(gpu)
optimizer = torch.optim.Adam(
    model.parameters(), 
    lr = 0.001
    )

In [16]:
print(model)

BiLSTM(
  (embed): Embedding(65494, 512)
  (bilstm): LSTM(512, 512, num_layers=2, batch_first=True, dropout=0.5, bidirectional=True)
  (linear): Linear(in_features=2048, out_features=2, bias=True)
)


In [17]:
import sys
num_epochs = 20
losses = []
accuracies  = []
val_losses = []
val_accuracies = []
val_loss_min = np.inf

for epoch in range(num_epochs):
    print('------------------------------------------------------------------------------------------')
    print('epoch: {}/{}:'.format(epoch + 1, num_epochs))   
    print('------------------------------------------------------------------------------------------')
    t0 = time.time()

    train_tqdm_bar = tqdm(enumerate(train_iter), total = (len(train_iter)), leave = False, position = 0, file = sys.stdout, dynamic_ncols = True)
    val_tqdm_bar = tqdm(enumerate(valid_iter), total = (len(valid_iter)),  leave = False, position = 0, file = sys.stdout, dynamic_ncols = True)

    running_loss = 0.0
    running_corrects = 0.0
    val_running_loss = 0.0
    val_running_corrects = 0.0

    model.train()
    for idx, (questions, labels) in train_tqdm_bar:
        iout, out = model(questions[0], questions[1])
        loss = criterion(out, labels)
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm = 1)
        optimizer.step()
        running_loss += loss
        running_corrects += torch.sum(iout == labels)
        train_tqdm_bar.set_description(desc = 'train   '.format(epoch + 1, num_epochs))
        batch_idx = (idx + 1) * 64

        train_tqdm_bar.set_postfix(
            loss = running_loss.item() / batch_idx if idx + 1 < len(train_iter) else running_loss.item() / len(train_iter.dataset)
            ,acc = running_corrects.item() / batch_idx if idx + 1 < len(train_iter) else running_corrects.item() / len(train_iter.dataset)
            )
    
    model.eval()
    with torch.no_grad():
        for val_idx, (v_questions, v_labels) in val_tqdm_bar:
            v_iout, v_out = model(v_questions[0], v_questions[1])
            v_loss = criterion(v_out, v_labels)
            val_running_loss += v_loss
            val_running_corrects += torch.sum(v_iout == v_labels)
            val_tqdm_bar.set_description('validate'.format(epoch + 1, num_epochs))
            val_batch_idx = (val_idx + 1) * 128

            val_tqdm_bar.set_postfix(
            val_loss = val_running_loss.item() / val_batch_idx if val_idx + 1 < len(valid_iter) else val_running_loss.item() / len(valid_iter.dataset)
            ,val_acc = val_running_corrects.item() / val_batch_idx if val_idx + 1 < len(valid_iter) else val_running_corrects.item() / len(valid_iter.dataset)
            )
    
    epoch_loss = running_loss/len(train_iter.dataset)
    losses.append(epoch_loss)
    epoch_accuracy = running_corrects/len(train_iter.dataset)
    accuracies.append(epoch_accuracy)
    val_epoch_loss = val_running_loss/len(valid_iter.dataset)
    val_losses.append(val_epoch_loss)
    val_epoch_accuracy = val_running_corrects/len(valid_iter.dataset)
    val_accuracies.append(val_epoch_accuracy)

    checkpoint = {
            'epoch': epoch + 1
            ,'state_dict': model.state_dict()
            ,'optimizer' : optimizer.state_dict()
            ,'val_loss_min' : val_epoch_loss
        }
    
    print('training loss: {:.4f}, acc: {:.2f}'.format(epoch_loss, epoch_accuracy))
    print('validation loss: {:.4f}, acc: {:.2f}'.format(val_epoch_loss, val_epoch_accuracy))
    print('epoch time: {:.2f} seconds'.format(time.time() - t0))

    if val_epoch_loss <= val_loss_min:
        print('validation loss decreased from {:.4f} to {:.4f}, saving model...'.format(val_loss_min, val_epoch_loss))
        torch.save(checkpoint, 'checkpoint/question_pairs_lowest_val_loss_epoch_{}.pth'.format(epoch + 1))
        val_loss_min = val_epoch_loss

------------------------------------------------------------------------------------------
epoch: 1/20:
------------------------------------------------------------------------------------------
training loss: 0.0087, acc: 0.71
validation loss: 0.0041, acc: 0.74
epoch time: 1631.84 seconds
validation loss decreased from inf to 0.0041, saving model...
------------------------------------------------------------------------------------------
epoch: 2/20:
------------------------------------------------------------------------------------------
training loss: 0.0072, acc: 0.78
validation loss: 0.0040, acc: 0.75
epoch time: 1739.33 seconds
validation loss decreased from 0.0041 to 0.0040, saving model...
------------------------------------------------------------------------------------------
epoch: 3/20:
------------------------------------------------------------------------------------------
training loss: 0.0056, acc: 0.84
validation loss: 0.0044, acc: 0.76
epoch time: 1735.03 seconds


KeyboardInterrupt: 