In [1]:
import torch
import torchtext
from torchtext import data
import spacy
from torch import nn
from torch import optim
from tqdm import tqdm
import torch.nn.functional as F
import numpy as np

In [3]:
### Data tools
spacy_en = spacy.load('en')

def tokenizer(text):
    return [tok.text for tok in spacy_en.tokenizer(text)]

TEXT = data.Field(sequential=True, lower=True, tokenize=tokenizer)
LABEL = data.Field(sequential=False, use_vocab=False)

In [5]:
### Define datasets
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_val_fields = [('Label', LABEL),('Text', TEXT)]
train_set, val_set, test_set = data.TabularDataset.splits(path='../data', 
    format='tsv', 
    train='train.tsv', 
    validation='dev.tsv',
    test='test.tsv',
    fields=train_val_fields, 
    skip_header=True)

unlabelled_fields = [('id', None),('Text', TEXT)]
unlabelled_set = data.TabularDataset(path='../data/unlabelled.tsv', 
    format='tsv', 
    fields=unlabelled_fields, 
    skip_header=True)

In [6]:
### Build vocab
TEXT.build_vocab(train_set, max_size=100000, vectors='glove.6B.100d')
LABEL.build_vocab(train_set)

In [13]:
### Get data iterators
train_it, val_it = data.Iterator.splits(
        (train_set, val_set), sort_key=lambda x: len(x.Text),
        batch_size=64, device=device)
test_it = data.BucketIterator(
    dataset=test_set,
    batch_size=1,
    device=device,
    sort_key=lambda x: len(x.Text),
    shuffle=False)
unlabelled_it = data.BucketIterator(
    dataset=unlabelled_set,
    batch_size=1,
    device=device,
    sort_key=lambda x: len(x.Text),
    shuffle=False)

dataloaders = {
    'train': train_it,
    'val': val_it,
    'test': test_it,
    'unlabelled': unlabelled_it
}

In [8]:
### Classifiers
class EmbeddingClassifier(nn.Module):

    # Initialize the classifier
    def __init__(self, emb_dim, num_labels, vocab_size, pretrained_vocab=None):
        super(EmbeddingClassifier, self).__init__()

        self.embedding = nn.Embedding(vocab_size, emb_dim)
        self.linear = nn.Linear(emb_dim, num_labels)
        
        if pretrained_vocab:
            self.embedding.weight.data.copy_(pretrained_vocab.vectors)
        
    def forward(self, inputs):        
        z1 = self.embedding(inputs).permute(1,0,2)
        z2 = F.avg_pool2d(z1, (z1.shape[1], 1)).squeeze(1) 
        out = self.linear(z2)
        return torch.sigmoid(out)
    
class RNNClassifier(nn.Module):
    def __init__(self, emb_dim, num_labels, hidden_dim, vocab_size, pretrained_vocab=None):
        super(RNNClassifier, self).__init__()
        
        self.embedding = nn.Embedding(vocab_size, emb_dim)
        self.rnn = nn.RNN(emb_dim, hidden_dim)
        self.linear = nn.Linear(hidden_dim, num_labels)
        
        if pretrained_vocab:
            self.embedding.weight.data.copy_(pretrained_vocab.vectors)
        
    def forward(self, x):
        
        z1 = self.embedding(x)        
        z2, h2 = self.rnn(z1)
        z3 = self.linear(h2.squeeze(0))
        
        return z3
    
class LSTMClassifier(nn.Module):
    def __init__(self, emb_dim, num_labels, hidden_dim, vocab_size, pretrained_vocab=None):
        super(LSTMClassifier, self).__init__()
        
        self.embedding = nn.Embedding(vocab_size, emb_dim)
        self.lstm = nn.LSTM(emb_dim, hidden_dim)
        self.linear = nn.Linear(hidden_dim, num_labels)
        
        if pretrained_vocab:
            self.embedding.weight.data.copy_(pretrained_vocab.vectors)

    def forward(self, x, batch_size=None):
        z1 = self.embedding(x)
        z2, (h2, c2) = self.lstm(z1)
        z3 = self.linear(h2[-1])

        return z3

In [15]:
### Helpers
def forward(model, batch, criterion, multiclass=True):
    if multiclass:
            outputs = model(batch.Text).squeeze(1)
            _,preds = torch.max(outputs,1)
            loss = criterion(outputs, batch.Label)

            correct = (preds == batch.Label).float()
            acc = correct.sum()/len(correct)
    else:
        outputs = model(batch.Text).squeeze(1)
        loss = criterion(outputs, batch.Label.float())

        preds = torch.round(torch.sigmoid(outputs))
        correct = (preds == batch.Label.float()).float()
        acc = correct.sum()/len(correct)
    return loss, acc

def train(model, iterator, optimizer, criterion, multiclass=True):
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:
        optimizer.zero_grad()
                
        loss, acc = forward(model, batch, criterion, multiclass=multiclass)
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

def evaluate(model, iterator, criterion, multiclass=True):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
        for batch in iterator:
            loss, acc = forward(model, batch, criterion, multiclass=multiclass)
            
            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [16]:
def run_model(device, dataloaders, model, optimizer, criterion, num_epochs, multiclass=True):
    model = model.to(device)
    
    train_it, val_it, test_it = dataloaders['train'], dataloaders['val'], dataloaders['test']
        
    for epoch in range(num_epochs):
        train_loss, train_acc = train(model, train_it, optimizer, criterion, multiclass=multiclass)
        valid_loss, valid_acc = evaluate(model, val_it, criterion, multiclass=multiclass)

        print(f'| Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}% | Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}% |')

    test_loss, test_acc = evaluate(model, test_it, criterion, multiclass=multiclass)
    print('Test loss:', test_loss)
    print('Test accuracy:', test_acc)

    return model

In [17]:
### Train Embedding Model (no pretrain)
model = EmbeddingClassifier(100, 2, len(TEXT.vocab))
optimizer = optim.SGD(model.parameters(), lr=0.1)
criterion = nn.CrossEntropyLoss()

run_model(device, dataloaders, model, optimizer, criterion, 25, multiclass=True)

| Epoch: 01 | Train Loss: 0.667 | Train Acc: 61.93% | Val. Loss: 0.644 | Val. Acc: 62.51% |
| Epoch: 02 | Train Loss: 0.635 | Train Acc: 68.96% | Val. Loss: 0.619 | Val. Acc: 66.26% |
| Epoch: 03 | Train Loss: 0.616 | Train Acc: 71.05% | Val. Loss: 0.602 | Val. Acc: 68.80% |
| Epoch: 04 | Train Loss: 0.603 | Train Acc: 72.33% | Val. Loss: 0.588 | Val. Acc: 70.71% |
| Epoch: 05 | Train Loss: 0.592 | Train Acc: 73.03% | Val. Loss: 0.579 | Val. Acc: 71.93% |
| Epoch: 06 | Train Loss: 0.584 | Train Acc: 73.86% | Val. Loss: 0.570 | Val. Acc: 72.75% |
| Epoch: 07 | Train Loss: 0.577 | Train Acc: 74.36% | Val. Loss: 0.563 | Val. Acc: 73.59% |
| Epoch: 08 | Train Loss: 0.570 | Train Acc: 74.90% | Val. Loss: 0.557 | Val. Acc: 74.09% |
| Epoch: 09 | Train Loss: 0.564 | Train Acc: 75.22% | Val. Loss: 0.551 | Val. Acc: 74.81% |
| Epoch: 10 | Train Loss: 0.559 | Train Acc: 75.73% | Val. Loss: 0.546 | Val. Acc: 75.57% |
| Epoch: 11 | Train Loss: 0.554 | Train Acc: 76.19% | Val. Loss: 0.541 | Val. Ac

EmbeddingClassifier(
  (embedding): Embedding(7507, 100)
  (linear): Linear(in_features=100, out_features=2, bias=True)
)

In [18]:
### Train Embedding Model (with pretrain)
model = EmbeddingClassifier(100, 2, len(TEXT.vocab), pretrained_vocab=TEXT.vocab)
optimizer = optim.SGD(model.parameters(), lr=0.1)
criterion = nn.CrossEntropyLoss()

run_model(device, dataloaders, model, optimizer, criterion, 25, multiclass=True)

| Epoch: 01 | Train Loss: 0.674 | Train Acc: 62.26% | Val. Loss: 0.657 | Val. Acc: 56.47% |
| Epoch: 02 | Train Loss: 0.644 | Train Acc: 68.72% | Val. Loss: 0.637 | Val. Acc: 61.39% |
| Epoch: 03 | Train Loss: 0.623 | Train Acc: 70.16% | Val. Loss: 0.615 | Val. Acc: 65.98% |
| Epoch: 04 | Train Loss: 0.606 | Train Acc: 73.15% | Val. Loss: 0.579 | Val. Acc: 73.89% |
| Epoch: 05 | Train Loss: 0.590 | Train Acc: 75.80% | Val. Loss: 0.556 | Val. Acc: 76.83% |
| Epoch: 06 | Train Loss: 0.576 | Train Acc: 77.67% | Val. Loss: 0.540 | Val. Acc: 78.34% |
| Epoch: 07 | Train Loss: 0.562 | Train Acc: 79.45% | Val. Loss: 0.522 | Val. Acc: 80.47% |
| Epoch: 08 | Train Loss: 0.550 | Train Acc: 80.76% | Val. Loss: 0.510 | Val. Acc: 81.64% |
| Epoch: 09 | Train Loss: 0.540 | Train Acc: 81.84% | Val. Loss: 0.502 | Val. Acc: 82.10% |
| Epoch: 10 | Train Loss: 0.531 | Train Acc: 82.64% | Val. Loss: 0.494 | Val. Acc: 82.79% |
| Epoch: 11 | Train Loss: 0.522 | Train Acc: 83.29% | Val. Loss: 0.488 | Val. Ac

EmbeddingClassifier(
  (embedding): Embedding(7507, 100)
  (linear): Linear(in_features=100, out_features=2, bias=True)
)

In [19]:
### Train RNN Model (with pretrain)
model = RNNClassifier(100, 1, 300, len(TEXT.vocab), pretrained_vocab=TEXT.vocab)
optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), 1e-3)
criterion = nn.BCEWithLogitsLoss()

run_model(device, dataloaders, model, optimizer, criterion, 5, multiclass=False)

| Epoch: 01 | Train Loss: 0.258 | Train Acc: 89.75% | Val. Loss: 0.165 | Val. Acc: 95.13% |
| Epoch: 02 | Train Loss: 0.144 | Train Acc: 95.15% | Val. Loss: 0.127 | Val. Acc: 95.50% |
| Epoch: 03 | Train Loss: 0.121 | Train Acc: 96.14% | Val. Loss: 0.121 | Val. Acc: 95.91% |
| Epoch: 04 | Train Loss: 0.107 | Train Acc: 96.52% | Val. Loss: 0.129 | Val. Acc: 95.48% |
| Epoch: 05 | Train Loss: 0.097 | Train Acc: 96.87% | Val. Loss: 0.131 | Val. Acc: 95.47% |
Test loss: 0.1405945674168499
Test accuracy: 0.951


RNNClassifier(
  (embedding): Embedding(7507, 100)
  (rnn): RNN(100, 300)
  (linear): Linear(in_features=300, out_features=1, bias=True)
)

In [20]:
### Train LSTM Model (with pretrain)
model = LSTMClassifier(100, 1, 300, len(TEXT.vocab), pretrained_vocab=TEXT.vocab)
optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), 1e-3)
criterion = nn.BCEWithLogitsLoss()

run_model(device, dataloaders, model, optimizer, criterion, 5, multiclass=False)

| Epoch: 01 | Train Loss: 0.219 | Train Acc: 90.83% | Val. Loss: 0.181 | Val. Acc: 95.28% |
| Epoch: 02 | Train Loss: 0.100 | Train Acc: 96.53% | Val. Loss: 0.124 | Val. Acc: 96.66% |
| Epoch: 03 | Train Loss: 0.072 | Train Acc: 97.61% | Val. Loss: 0.102 | Val. Acc: 96.91% |
| Epoch: 04 | Train Loss: 0.053 | Train Acc: 98.20% | Val. Loss: 0.106 | Val. Acc: 96.77% |
| Epoch: 05 | Train Loss: 0.041 | Train Acc: 98.72% | Val. Loss: 0.104 | Val. Acc: 96.60% |
Test loss: 0.11096419042692068
Test accuracy: 0.9634


LSTMClassifier(
  (embedding): Embedding(7507, 100)
  (lstm): LSTM(100, 300)
  (linear): Linear(in_features=300, out_features=1, bias=True)
)