In [10]:
import torch
import torchtext
from torchtext import data
import spacy
from torch import optim
from cnn import CNN
from torch import nn

In [50]:
### Data tools
spacy_en = spacy.load('en')

def tokenizer(text):
    return [tok.text for tok in spacy_en.tokenizer(text)]

TEXT = data.Field(sequential=True, lower=True, tokenize=tokenizer, fix_length=30)
LABEL = data.Field(sequential=False, use_vocab=False)

In [51]:
### Define datasets
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_val_fields = [('Label', LABEL),('Text', TEXT)]
train_set, val_set, test_set = data.TabularDataset.splits(path='data', 
    format='tsv', 
    train='train.tsv', 
    validation='dev.tsv',
    test='test.tsv',
    fields=train_val_fields, 
    skip_header=True)

unlabelled_fields = [('id', None),('Text', TEXT)]
unlabelled_set = data.TabularDataset(path='data/unlabelled.tsv', 
    format='tsv', 
    fields=unlabelled_fields, 
    skip_header=True)

In [52]:
### Build vocab
TEXT.build_vocab(train_set, max_size=100000, vectors='glove.6B.100d')
LABEL.build_vocab(train_set)

In [80]:
import torch
from torch import nn
import torch.nn.functional as F

class CNN(nn.Module):
    def __init__(self, vocab_size, H, conv_size, out_channels, pretrained_vocab=None):
        super(CNN, self).__init__()
                
        self.embedding = nn.Embedding(vocab_size,H)
        self.conv1d = nn.Conv1d(H, out_channels, conv_size)
        self.avgPool = nn.AdaptiveAvgPool2d((out_channels, 1))
        self.linear = nn.Linear(out_channels, 2)
        
        if pretrained_vocab:
            self.embedding.weight.data.copy_(pretrained_vocab.vectors)

    def forward(self, x):
        h_embedding = self.embedding(x).permute(1,2,0)
        h_conv1d = self.conv1d(h_embedding)
        h_pool = self.avgPool(h_conv1d).clamp(min=0).squeeze()
        h_linear = self.linear(h_pool)
        logits = F.softmax(h_linear, dim=1)
        return logits

In [54]:
### Get data iterators
train_it_64, val_it_64 = data.Iterator.splits(
        (train_set, val_set), sort_key=lambda x: len(x.Text),
        batch_size=64, device=device)

train_it, val_it = data.Iterator.splits(
        (train_set, val_set), sort_key=lambda x: len(x.Text),
        batch_size=4, device=device)
test_it = data.BucketIterator(
    dataset=test_set,
    batch_size=1,
    device=device,
    sort_key=lambda x: len(x.Text),
    shuffle=False)
unlabelled_it = data.BucketIterator(
    dataset=unlabelled_set,
    batch_size=1,
    device=device,
    sort_key=lambda x: len(x.Text),
    shuffle=False)

In [83]:
dataloaders = {
    'train': train_it,
    'val': val_it,
    'test': test_it,
    'unlabelled': unlabelled_it
}

In [55]:
### Helpers
def forward(model, batch, criterion, multiclass=True):
    if multiclass:
            outputs = model(batch.Text).squeeze(1)
            _,preds = torch.max(outputs,1)
            loss = criterion(outputs, batch.Label)

            correct = (preds == batch.Label).float()
            acc = correct.sum()/len(correct)
    else:
        outputs = model(batch.Text).squeeze(1)
        loss = criterion(outputs, batch.Label.float())

        preds = torch.round(torch.sigmoid(outputs))
        correct = (preds == batch.Label.float()).float()
        acc = correct.sum()/len(correct)
    return loss, acc

def train(model, iterator, optimizer, criterion, multiclass=True):
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:
        optimizer.zero_grad()
                
        loss, acc = forward(model, batch, criterion, multiclass=multiclass)
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

def evaluate(model, iterator, criterion, multiclass=True):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
        for batch in iterator:
            loss, acc = forward(model, batch, criterion, multiclass=multiclass)
            
            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

def run_model(device, dataloaders, model, optimizer, criterion, num_epochs, multiclass=True):
    model = model.to(device)
    
    train_it, val_it, test_it = dataloaders['train'], dataloaders['val'], dataloaders['test']
        
    for epoch in range(num_epochs):
        train_loss, train_acc = train(model, train_it, optimizer, criterion, multiclass=multiclass)
        valid_loss, valid_acc = evaluate(model, val_it, criterion, multiclass=multiclass)

        print(f'| Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}% | Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}% |')

    test_loss, test_acc = evaluate(model, test_it, criterion, multiclass=multiclass)
    print('Test loss:', test_loss)
    print('Test accuracy:', test_acc)

    return model

In [None]:
### Train CNN Model (with pretrain)
model = CNN(len(TEXT.vocab), 100, , 128, pretrained_vocab=TEXT.vocab)
optimizer = optim.SGD(model.parameters(), lr=0.3)
criterion = nn.CrossEntropyLoss()

cnn_model = run_model(device, dataloaders, model, optimizer, criterion, 25, multiclass=True)

In [85]:
### Train CNN Model (with pretrain)
model = CNN(len(TEXT.vocab), 100, 5, 128, pretrained_vocab=TEXT.vocab)
optimizer = optim.SGD(model.parameters(), lr=0.3)
criterion = nn.CrossEntropyLoss()

cnn_model = run_model(device, dataloaders, model, optimizer, criterion, 20, multiclass=True)

| Epoch: 01 | Train Loss: 0.504 | Train Acc: 79.35% | Val. Loss: 0.445 | Val. Acc: 86.11% |
| Epoch: 02 | Train Loss: 0.429 | Train Acc: 87.91% | Val. Loss: 0.574 | Val. Acc: 72.99% |
| Epoch: 03 | Train Loss: 0.410 | Train Acc: 89.85% | Val. Loss: 0.388 | Val. Acc: 92.07% |
| Epoch: 04 | Train Loss: 0.400 | Train Acc: 90.97% | Val. Loss: 0.599 | Val. Acc: 70.63% |
| Epoch: 05 | Train Loss: 0.393 | Train Acc: 91.68% | Val. Loss: 0.409 | Val. Acc: 90.06% |


KeyboardInterrupt: 