In [1]:
import torch
import torch.nn as nn
import torchtext
import spacy
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import random
import time

gpu = ('cuda:0')

In [2]:
question_1 = torchtext.data.Field(
    sequential = True,
    lower = True,
    batch_first = True,
    tokenize = 'spacy',
    pad_first = True,
    use_vocab = True
)

question_2 = torchtext.data.Field(
    sequential = True,
    lower = True,
    batch_first = True,
    tokenize = 'spacy',
    pad_first = True,
    use_vocab = True
)

label = torchtext.data.Field(
    sequential = False,
    use_vocab = False,
    is_target = True
)

In [3]:
dataset = torchtext.data.TabularDataset(
    path = 'dataset/train_dataset.csv', 
    format = 'csv',
    skip_header = True,
    fields = [
        ('q1', question_1),
        ('q2', question_2),
        ('label', label)
        ] 
)

In [4]:
print(dataset[0].q1)
print(dataset[0].q2)
print(dataset[0].label)

['how', 'hard', 'is', 'it', 'to', 'learn', 'russian', '(', 'compared', 'to', 'japanese', ')', '?']
['if', 'you', 'can', 'read', 'chinese', ',', 'can', 'you', 'also', 'read', 'japanese', '?']
0


In [5]:
train_dataset, validation_dataset = dataset.split([0.7, 0.3], random_state = random.seed(42))

In [6]:
len(train_dataset), len(validation_dataset)

(208968, 89558)

In [18]:
question_1.build_vocab(train_dataset)
question_2.build_vocab(train_dataset)
question_1.vocab.extend(question_2.vocab)
question_2.vocab = question_1.vocab
vocab = question_1.vocab
len(vocab)
# vocab.stoi

65494

In [8]:
train_iter, valid_iter = torchtext.data.BucketIterator.splits(
    (train_dataset, validation_dataset), 
    sort_key = lambda x: len(x.q1)+len(x.q2),
    batch_sizes = (64,128),
    device = gpu
)

In [9]:
for inputs, labels in train_iter:
    print('questions_1: {}, shape: {}'.format(inputs[0], inputs[0].shape))
    print('questions_2: {}, shape: {}'.format(inputs[1], inputs[1].shape))
    print('lables: {}, shape: {}'.format(labels, labels.shape))
    break

questions_1: tensor([[   1,    1,    1,  ...,    3,  328,    2],
        [   1,    1,    1,  ...,   14, 1086,    2],
        [   1,    1,    1,  ...,   85,  151,    2],
        ...,
        [   1,    1,    1,  ...,  714,  135,    2],
        [   1,    1,    1,  ...,   10,   41,    2],
        [   1,    1,    1,  ...,   69,  223,    2]], device='cuda:0'), shape: torch.Size([64, 30])
questions_2: tensor([[   1,    1,    1,  ...,    3,  375,    2],
        [   1,    1,    1,  ...,   14, 1114,    2],
        [   1,    1,    1,  ...,    9,  149,    2],
        ...,
        [   1,    1,    1,  ...,  708,  140,    2],
        [   1,    1,    1,  ..., 1359, 2319,    2],
        [   1,    1,    1,  ...,   72,  220,    2]], device='cuda:0'), shape: torch.Size([64, 36])
lables: tensor([1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0,
        0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1,
        0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0], devic

In [10]:
for inputs, labels in valid_iter:
    print('questions_1: {}, shape: {}'.format(inputs[0], inputs[0].shape))
    print('questions_2: {}, shape: {}'.format(inputs[1], inputs[1].shape))
    print('lables: {}, shape: {}'.format(labels, labels.shape))
    break

questions_1: tensor([[    1,    24,    74,  2064,     2],
        [    4,     5,     9, 43922,     2],
        [    4,     5, 48634,  2372,     2],
        [    4,     5,     0,  1353,     2],
        [    4,     5,  3846,   132,     2],
        [    1,     4,     5, 36762,     2],
        [    1, 27714,     5,   145,     2],
        [    1,     5, 58341,  6804,     2],
        [    1,    68,     5,  4100,     2],
        [    1,     4,     5, 19964,     2],
        [    1,    15,   881,  2071,     2],
        [    1,     4,     5,     0,     2],
        [    1,    68,     5,  7816,     2],
        [    1,     4,     5,     0,     2],
        [    1,     4,     5,     0,     2],
        [    1,    13, 10352,   251,     2],
        [    1,     4,     5,     0,     2],
        [    1,     1,     4,  9729,     2],
        [    1,     5, 59958,  2267,     2],
        [    1,     4,     5, 52622,     2],
        [    1,    45,     5, 21550,     2],
        [    1,   178,     0,   167,     2

In [11]:
## itos
# batch_train = next(iter(train_iter))
# doc_train, label_train = batch_train
# batch_test = next(iter(valid_iter))
# doc_valid, label_valid = batch_test

# def itos_(iterator):
#     docs_ = []
#     for i in range (len(iterator)):
#         x = iterator[i].item()
#         doc = str(question1.vocab.itos[x])
#         docs_.append(doc)
#     docs = ' '.join(docs_)
#     return docs, docs_

In [12]:
# print(itos_(doc_train[0][0])[0])
# print(itos_(doc_train[1][0])[0])
# print('label: {}'.format(label_train[0]))

In [14]:
class BiLSTM(nn.Module):
    def __init__(self, n_vocabs, embed_dims, n_lstm_units, n_lstm_layers, n_output_classes):
        super(BiLSTM, self).__init__()
        self.v = n_vocabs
        self.e = embed_dims
        self.u = n_lstm_units
        self.l = n_lstm_layers
        self.o = n_output_classes

        self.embed = nn.Embedding(
            self.v,
            self.e
            )
        self.bilstm = nn.LSTM(
            input_size = self.e,
            hidden_size = self.u,
            num_layers = self.l,
            batch_first = True,
            bidirectional = True,
            dropout = 0.5
        )
        self.linear = nn.Linear(
            self.u * 4,
            self.o
        )

    def forward(self, X1, X2):
        # h0 = torch.zeros(self.l * 2, X.size(0), self.u).to(gpu)
        # c0 = torch.zeros(self.l * 2, X.size(0), self.u).to(gpu)

        out1 = self.embed(X1)
        out2 = self.embed(X2)
        # NxTxh, lxNxh
        out1, _ = self.bilstm(out1)
        out2, _ = self.bilstm(out2)
        out1 = out1[:, -1, :]
        out2 = out2[:, -1, :]
        # concatenate out1&2
        out = torch.cat((out1, out2), 1)
        out = self.linear(out)
        iout = torch.max(out, 1)[1]

        return iout, out

In [15]:
torch.manual_seed(42)
model = BiLSTM(len(vocab), 512, 512, 2, 2).to(gpu)
criterion = nn.CrossEntropyLoss().to(gpu)
optimizer = torch.optim.Adam(
    model.parameters(), 
    lr = 0.001
    )

In [16]:
print(model)

BiLSTM(
  (embed): Embedding(65494, 512)
  (bilstm): LSTM(512, 512, num_layers=2, batch_first=True, dropout=0.5, bidirectional=True)
  (linear): Linear(in_features=2048, out_features=2, bias=True)
)


In [17]:
import sys
num_epochs = 20
losses = []
accuracies  = []
val_losses = []
val_accuracies = []
val_loss_min = np.inf

for epoch in range(num_epochs):
    print('------------------------------------------------------------------------------------------')
    print('epoch: {}/{}:'.format(epoch + 1, num_epochs))   
    print('------------------------------------------------------------------------------------------')
    t0 = time.time()

    train_tqdm_bar = tqdm(enumerate(train_iter), total = (len(train_iter)), leave = False, position = 0, file = sys.stdout, dynamic_ncols = True)
    val_tqdm_bar = tqdm(enumerate(valid_iter), total = (len(valid_iter)),  leave = False, position = 0, file = sys.stdout, dynamic_ncols = True)

    running_loss = 0.0
    running_corrects = 0.0
    val_running_loss = 0.0
    val_running_corrects = 0.0

    model.train()
    for idx, (questions, labels) in train_tqdm_bar:
        iout, out = model(questions[0], questions[1])
        loss = criterion(out, labels)
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm = 1)
        optimizer.step()
        running_loss += loss
        running_corrects += torch.sum(iout == labels)
        train_tqdm_bar.set_description(desc = 'train   '.format(epoch + 1, num_epochs))
        batch_idx = (idx + 1) * 64

        train_tqdm_bar.set_postfix(
            loss = running_loss.item() / batch_idx if idx + 1 < len(train_iter) else running_loss.item() / len(train_iter.dataset)
            ,acc = running_corrects.item() / batch_idx if idx + 1 < len(train_iter) else running_corrects.item() / len(train_iter.dataset)
            )
    
    model.eval()
    with torch.no_grad():
        for val_idx, (v_questions, v_labels) in val_tqdm_bar:
            v_iout, v_out = model(v_questions[0], v_questions[1])
            v_loss = criterion(v_out, v_labels)
            val_running_loss += v_loss
            val_running_corrects += torch.sum(v_iout == v_labels)
            val_tqdm_bar.set_description('validate'.format(epoch + 1, num_epochs))
            val_batch_idx = (val_idx + 1) * 128

            val_tqdm_bar.set_postfix(
            val_loss = val_running_loss.item() / val_batch_idx if val_idx + 1 < len(valid_iter) else val_running_loss.item() / len(valid_iter.dataset)
            ,val_acc = val_running_corrects.item() / val_batch_idx if val_idx + 1 < len(valid_iter) else val_running_corrects.item() / len(valid_iter.dataset)
            )
    
    epoch_loss = running_loss/len(train_iter.dataset)
    losses.append(epoch_loss)
    epoch_accuracy = running_corrects/len(train_iter.dataset)
    accuracies.append(epoch_accuracy)
    val_epoch_loss = val_running_loss/len(valid_iter.dataset)
    val_losses.append(val_epoch_loss)
    val_epoch_accuracy = val_running_corrects/len(valid_iter.dataset)
    val_accuracies.append(val_epoch_accuracy)

    checkpoint = {
            'epoch': epoch + 1
            ,'state_dict': model.state_dict()
            ,'optimizer' : optimizer.state_dict()
            ,'val_loss_min' : val_epoch_loss
        }
    
    print('training loss: {:.4f}, acc: {:.2f}'.format(epoch_loss, epoch_accuracy))
    print('validation loss: {:.4f}, acc: {:.2f}'.format(val_epoch_loss, val_epoch_accuracy))
    print('epoch time: {:.2f} seconds'.format(time.time() - t0))

    if val_epoch_loss <= val_loss_min:
        print('validation loss decreased from {:.4f} to {:.4f}, saving model...'.format(val_loss_min, val_epoch_loss))
        torch.save(checkpoint, 'checkpoint/question_pairs_lowest_val_loss_epoch_{}.pth'.format(epoch + 1))
        val_loss_min = val_epoch_loss

------------------------------------------------------------------------------------------
epoch: 1/20:
------------------------------------------------------------------------------------------
training loss: 0.0087, acc: 0.71
validation loss: 0.0041, acc: 0.74
epoch time: 1631.84 seconds
validation loss decreased from inf to 0.0041, saving model...
------------------------------------------------------------------------------------------
epoch: 2/20:
------------------------------------------------------------------------------------------
training loss: 0.0072, acc: 0.78
validation loss: 0.0040, acc: 0.75
epoch time: 1739.33 seconds
validation loss decreased from 0.0041 to 0.0040, saving model...
------------------------------------------------------------------------------------------
epoch: 3/20:
------------------------------------------------------------------------------------------
training loss: 0.0056, acc: 0.84
validation loss: 0.0044, acc: 0.76
epoch time: 1735.03 seconds


KeyboardInterrupt: 