In [2]:
import torch
import torchtext
from torchtext import data
import spacy
from torch import optim
from cnn import CNN
from torch import nn

In [3]:
### Data tools
spacy_en = spacy.load('en_core_web_sm')

def tokenizer(text):
    return [tok.text for tok in spacy_en.tokenizer(text)]

TEXT = data.Field(sequential=True, lower=True, tokenize=tokenizer, fix_length=30)
LABEL = data.Field(sequential=False, use_vocab=False)

In [4]:
### Define datasets
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_val_fields = [('Label', LABEL),('Text', TEXT)]
train_set, val_set, test_set = data.TabularDataset.splits(path='data', 
    format='tsv', 
    train='train.tsv', 
    validation='dev.tsv',
    test='test.tsv',
    fields=train_val_fields, 
    skip_header=True)

unlabelled_fields = [('id', None),('Text', TEXT)]
unlabelled_set = data.TabularDataset(path='data/unlabelled.tsv', 
    format='tsv', 
    fields=unlabelled_fields, 
    skip_header=True)

In [5]:
### Build vocab
TEXT.build_vocab(train_set, max_size=100000, vectors='glove.6B.100d')
LABEL.build_vocab(train_set)

In [13]:
import torch
from torch import nn
import torch.nn.functional as F

class CNN(nn.Module):
    def __init__(self, vocab_size, H, conv_size, out_channels, pretrained_vocab=None, maxPool=False):
        super(CNN, self).__init__()
                
        self.embedding = nn.Embedding(vocab_size,H)
        self.conv1d = nn.Conv1d(H, out_channels, conv_size)
        if maxPool:
            self.pool = nn.AdaptiveMaxPool2d((out_channels, 1))
        else:
            self.pool = nn.AdaptiveAvgPool2d((out_channels, 1))
        self.linear = nn.Linear(out_channels, 2)
        
        if pretrained_vocab:
            self.embedding.weight.data.copy_(pretrained_vocab.vectors)

    def forward(self, x):
        h_embedding = self.embedding(x).permute(1,2,0)
        h_conv1d = self.conv1d(h_embedding)
        h_pool = self.pool(h_conv1d).clamp(min=0).squeeze(2)
        h_linear = self.linear(h_pool)
        logits = F.softmax(h_linear, dim=1)
        return logits

In [7]:
### Get data iterators
train_it_64, val_it_64 = data.Iterator.splits(
        (train_set, val_set), sort_key=lambda x: len(x.Text),
        batch_size=64, device=device)

train_it, val_it = data.Iterator.splits(
        (train_set, val_set), sort_key=lambda x: len(x.Text),
        batch_size=4, device=device)
test_it = data.BucketIterator(
    dataset=test_set,
    batch_size=1,
    device=device,
    sort_key=lambda x: len(x.Text),
    shuffle=False)
unlabelled_it = data.BucketIterator(
    dataset=unlabelled_set,
    batch_size=1,
    device=device,
    sort_key=lambda x: len(x.Text),
    shuffle=False)

In [8]:
dataloaders = {
    'train': train_it,
    'val': val_it,
    'test': test_it,
    'unlabelled': unlabelled_it
}

In [9]:
### Helpers
def forward(model, batch, criterion, multiclass=True):
    if multiclass:
            outputs = model(batch.Text)
            _,preds = torch.max(outputs,1)
            loss = criterion(outputs, batch.Label)

            correct = (preds == batch.Label).float()
            acc = correct.sum()/len(correct)
    else:
        outputs = model(batch.Text).squeeze(1)
        loss = criterion(outputs, batch.Label.float())

        preds = torch.round(torch.sigmoid(outputs))
        correct = (preds == batch.Label.float()).float()
        acc = correct.sum()/len(correct)
    return loss, acc

def train(model, iterator, optimizer, criterion, multiclass=True):
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:
        optimizer.zero_grad()
                
        loss, acc = forward(model, batch, criterion, multiclass=multiclass)
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

def evaluate(model, iterator, criterion, multiclass=True):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
        for batch in iterator:
            loss, acc = forward(model, batch, criterion, multiclass=multiclass)
            
            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

def run_model(device, dataloaders, model, optimizer, criterion, num_epochs, multiclass=True):
    model = model.to(device)
    best_val_acc = 0
    best_model_params = None
    
    train_it, val_it, test_it = dataloaders['train'], dataloaders['val'], dataloaders['test']
        
    for epoch in range(num_epochs):
        train_loss, train_acc = train(model, train_it, optimizer, criterion, multiclass=multiclass)
        valid_loss, valid_acc = evaluate(model, val_it, criterion, multiclass=multiclass)
        
        if valid_acc > best_val_acc:
            best_val_acc = valid_acc
            best_model_params = model.parameters()

        print(f'| Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}% | Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}% |')

    model
    test_loss, test_acc = evaluate(model, test_it, criterion, multiclass=multiclass)
    print('Test loss:', test_loss)
    print('Test accuracy:', test_acc)

    return model

In [27]:
### Train CNN Model kernel size 5 (with pretrain)
model = CNN(len(TEXT.vocab), 100, 5, 128, pretrained_vocab=TEXT.vocab)
optimizer = optim.SGD(model.parameters(), lr=0.3)
criterion = nn.CrossEntropyLoss()

cnn_5_model = run_model(device, dataloaders, model, optimizer, criterion, 25, multiclass=True)

| Epoch: 01 | Train Loss: 0.502 | Train Acc: 79.84% | Val. Loss: 0.435 | Val. Acc: 87.28% |
| Epoch: 02 | Train Loss: 0.430 | Train Acc: 87.81% | Val. Loss: 0.397 | Val. Acc: 91.23% |
| Epoch: 03 | Train Loss: 0.410 | Train Acc: 89.95% | Val. Loss: 0.387 | Val. Acc: 92.32% |
| Epoch: 04 | Train Loss: 0.402 | Train Acc: 90.63% | Val. Loss: 0.405 | Val. Acc: 90.61% |
| Epoch: 05 | Train Loss: 0.394 | Train Acc: 91.62% | Val. Loss: 0.380 | Val. Acc: 93.14% |
| Epoch: 06 | Train Loss: 0.389 | Train Acc: 92.12% | Val. Loss: 0.383 | Val. Acc: 92.70% |
| Epoch: 07 | Train Loss: 0.386 | Train Acc: 92.42% | Val. Loss: 0.380 | Val. Acc: 93.06% |
| Epoch: 08 | Train Loss: 0.383 | Train Acc: 92.73% | Val. Loss: 0.377 | Val. Acc: 93.31% |
| Epoch: 09 | Train Loss: 0.380 | Train Acc: 93.09% | Val. Loss: 0.382 | Val. Acc: 92.87% |
| Epoch: 10 | Train Loss: 0.377 | Train Acc: 93.44% | Val. Loss: 0.394 | Val. Acc: 91.63% |
| Epoch: 11 | Train Loss: 0.375 | Train Acc: 93.58% | Val. Loss: 0.393 | Val. Ac

In [10]:
### Train CNN Model kernel size 7 (with pretrain)
model = CNN(len(TEXT.vocab), 100, 7, 128, pretrained_vocab=TEXT.vocab)
optimizer = optim.SGD(model.parameters(), lr=0.3)
criterion = nn.CrossEntropyLoss()

cnn_model = run_model(device, dataloaders, model, optimizer, criterion, 20, multiclass=True)

| Epoch: 01 | Train Loss: 0.477 | Train Acc: 82.47% | Val. Loss: 0.388 | Val. Acc: 92.30% |
| Epoch: 02 | Train Loss: 0.407 | Train Acc: 90.20% | Val. Loss: 0.392 | Val. Acc: 91.83% |
| Epoch: 03 | Train Loss: 0.391 | Train Acc: 91.84% | Val. Loss: 0.377 | Val. Acc: 93.37% |
| Epoch: 04 | Train Loss: 0.384 | Train Acc: 92.64% | Val. Loss: 0.378 | Val. Acc: 93.35% |
| Epoch: 05 | Train Loss: 0.379 | Train Acc: 93.16% | Val. Loss: 0.376 | Val. Acc: 93.38% |
| Epoch: 06 | Train Loss: 0.375 | Train Acc: 93.56% | Val. Loss: 0.413 | Val. Acc: 89.61% |
| Epoch: 07 | Train Loss: 0.372 | Train Acc: 93.88% | Val. Loss: 0.366 | Val. Acc: 94.56% |
| Epoch: 08 | Train Loss: 0.369 | Train Acc: 94.28% | Val. Loss: 0.372 | Val. Acc: 93.84% |
| Epoch: 09 | Train Loss: 0.367 | Train Acc: 94.44% | Val. Loss: 0.367 | Val. Acc: 94.44% |
| Epoch: 10 | Train Loss: 0.364 | Train Acc: 94.72% | Val. Loss: 0.378 | Val. Acc: 93.38% |
| Epoch: 11 | Train Loss: 0.363 | Train Acc: 94.88% | Val. Loss: 0.365 | Val. Ac

In [21]:
### Train CNN Model Max Pool kernel size 5 (with pretrain)
model = CNN(len(TEXT.vocab), 100, 5, 128, pretrained_vocab=TEXT.vocab, maxPool=True)
optimizer = optim.SGD(model.parameters(), lr=0.1)
criterion = nn.CrossEntropyLoss()

cnn_model_max_5 = run_model(device, dataloaders, model, optimizer, criterion, 10, multiclass=True)

| Epoch: 01 | Train Loss: 0.414 | Train Acc: 89.48% | Val. Loss: 0.387 | Val. Acc: 92.33% |
| Epoch: 02 | Train Loss: 0.379 | Train Acc: 93.23% | Val. Loss: 0.395 | Val. Acc: 91.52% |
| Epoch: 03 | Train Loss: 0.367 | Train Acc: 94.44% | Val. Loss: 0.362 | Val. Acc: 95.00% |
| Epoch: 04 | Train Loss: 0.363 | Train Acc: 94.82% | Val. Loss: 0.361 | Val. Acc: 95.15% |
| Epoch: 05 | Train Loss: 0.361 | Train Acc: 95.13% | Val. Loss: 0.383 | Val. Acc: 92.81% |
| Epoch: 06 | Train Loss: 0.357 | Train Acc: 95.53% | Val. Loss: 0.360 | Val. Acc: 95.28% |
| Epoch: 07 | Train Loss: 0.355 | Train Acc: 95.74% | Val. Loss: 0.360 | Val. Acc: 95.27% |
| Epoch: 08 | Train Loss: 0.354 | Train Acc: 95.86% | Val. Loss: 0.359 | Val. Acc: 95.32% |
| Epoch: 09 | Train Loss: 0.351 | Train Acc: 96.16% | Val. Loss: 0.358 | Val. Acc: 95.39% |
| Epoch: 10 | Train Loss: 0.350 | Train Acc: 96.22% | Val. Loss: 0.359 | Val. Acc: 95.34% |
Test loss: 0.3615973283946514
Test accuracy: 0.9512


In [19]:
### Train CNN Model Max Pool kernel size 5 (with pretrain)
model = CNN(len(TEXT.vocab), 100, 7, 128, pretrained_vocab=TEXT.vocab, maxPool=True)
optimizer = optim.SGD(model.parameters(), lr=0.1)
criterion = nn.CrossEntropyLoss()

cnn_model_max_7 = run_model(device, dataloaders, model, optimizer, criterion, 10, multiclass=True)

| Epoch: 01 | Train Loss: 0.419 | Train Acc: 88.96% | Val. Loss: 0.411 | Val. Acc: 89.83% |
| Epoch: 02 | Train Loss: 0.381 | Train Acc: 92.99% | Val. Loss: 0.373 | Val. Acc: 93.83% |
| Epoch: 03 | Train Loss: 0.371 | Train Acc: 94.10% | Val. Loss: 0.379 | Val. Acc: 93.28% |
| Epoch: 04 | Train Loss: 0.366 | Train Acc: 94.59% | Val. Loss: 0.364 | Val. Acc: 94.71% |
| Epoch: 05 | Train Loss: 0.366 | Train Acc: 94.55% | Val. Loss: 0.373 | Val. Acc: 93.91% |
| Epoch: 06 | Train Loss: 0.360 | Train Acc: 95.25% | Val. Loss: 0.362 | Val. Acc: 95.06% |
| Epoch: 07 | Train Loss: 0.358 | Train Acc: 95.41% | Val. Loss: 0.361 | Val. Acc: 95.16% |
| Epoch: 08 | Train Loss: 0.357 | Train Acc: 95.55% | Val. Loss: 0.362 | Val. Acc: 95.01% |
| Epoch: 09 | Train Loss: 0.354 | Train Acc: 95.81% | Val. Loss: 0.358 | Val. Acc: 95.52% |
| Epoch: 10 | Train Loss: 0.354 | Train Acc: 95.85% | Val. Loss: 0.363 | Val. Acc: 94.97% |
Test loss: 0.36499896264672277
Test accuracy: 0.9473


In [20]:
Test loss: 0.38745442863106727
Test accuracy: 0.9254

SyntaxError: invalid syntax (<ipython-input-20-611d9d8e9920>, line 1)