In [1]:
import torch
import torchtext
from torchtext import data
import spacy
from torch import nn
from torch import optim
from tqdm import tqdm
import torch.nn.functional as F

In [2]:
spacy_en = spacy.load('en')

In [3]:
def tokenizer(text):
    return [tok.text for tok in spacy_en.tokenizer(text)]

In [5]:
TEXT = data.Field(sequential=True, lower=True, tokenize=tokenizer)
LABEL = data.Field(sequential=False, use_vocab=False)
train_val_fields = [('Label', LABEL),('Text', TEXT)]
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [6]:
train_set, val_set, test_set = data.TabularDataset.splits(path='../data', 
    format='tsv', 
    train='train.tsv', 
    validation='dev.tsv',
    test='test.tsv',
    fields=train_val_fields, 
    skip_header=True)

In [9]:
unlabelled_fields = [('id', None),('Text', TEXT)]
unlabelled_set = data.TabularDataset(path='../data/unlabelled.tsv', 
    format='tsv', 
    fields=unlabelled_fields, 
    skip_header=True,
    shuffle=False)

In [12]:
TEXT.build_vocab(train_set, max_size=100000, vectors='glove.6B.100d')
LABEL.build_vocab(train_set)

In [14]:
train_iter, val_iter, test_iter = data.Iterator.splits(
        (train_set, val_set, test_set), sort_key=lambda x: len(x.Text),
        batch_size=64, device=device)

In [15]:
unlabelled_it = data.BucketIterator(
    dataset=unlabelled_set,
    batch_size=1,
    device=device,
    sort_key=lambda x: len(x.Text),
    shuffle=False)

In [None]:
vocab = TEXT.vocab
embed = nn.Embedding(len(vocab), 100)
embed.weight.data.copy_(vocab.vectors)

In [17]:
class EmbeddingClassifier(nn.Module):

    # Initialize the classifier
    def __init__(self, emb_dim, num_labels, vocab_size, pretrained_vocab=None):
        super(EmbeddingClassifier, self).__init__()

        self.embedding = nn.Embedding(vocab_size, emb_dim)
        self.linear = nn.Linear(emb_dim, num_labels)
        
        if pretrained_vocab:
            self.embedding.weight.data.copy_(pretrained_vocab.vectors)
        
    def forward(self, inputs):        
        z1 = self.embedding(inputs).permute(1,0,2)
        z2 = F.avg_pool2d(z1, (z1.shape[1], 1)).squeeze(1) 
        out = self.linear(z2)
        return torch.sigmoid(out)

In [18]:
def binary_accuracy_rnn(preds, y):
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float()
    acc = correct.sum()/len(correct)
    return acc

In [19]:
def train_rnn(model, iterator, optimizer, criterion):
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:
        
        optimizer.zero_grad()
                
        outputs = model(batch.Text).squeeze(1)
        loss = criterion(outputs, batch.Label.float())
        acc = binary_accuracy_rnn(outputs, batch.Label.float())
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [20]:
def binary_accuracy(preds, y):
    correct = (preds == y).float()
    acc = correct.sum()/len(correct)
    return acc

In [26]:
def train(model, iterator, optimizer, criterion):
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:
        
        optimizer.zero_grad()
                
        outputs = model(batch.Text).squeeze(1)
        _,preds = torch.max(outputs,1)
        loss = criterion(outputs, batch.Label)
        acc = binary_accuracy(preds, batch.Label)
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [22]:
def evaluate_rnn(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:

            outputs = model(batch.Text).squeeze(1)
            loss = criterion(outputs, batch.Label.float())
            
            acc = binary_accuracy_rnn(outputs, batch.Label.float())

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [23]:
def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:

            outputs = model(batch.Text).squeeze(1)
            _,preds = torch.max(outputs, 1)
            loss = criterion(outputs, batch.Label)
            
            acc = binary_accuracy(preds, batch.Label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [27]:
def run_model_2():
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    
    train_it, val_it, test_it = data.BucketIterator.splits(
            datasets=(train_set, val_set, test_set),
            batch_size=32, device=device,
            sort_key=lambda x: len(x.Text),
            repeat=False,
            shuffle=True)
    
    ## define model
    model = EmbeddingClassifier(100, 2, len(TEXT.vocab))
#     optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), 1e-3)
    optimizer = optim.SGD(model.parameters(), lr=0.1)
    # criterion = nn.BCEWithLogitsLoss()
    criterion = nn.CrossEntropyLoss()
    model = model.to(device)
    
    for epoch in range(25):
        train_loss, train_acc = train(model, train_it, optimizer, criterion)
        valid_loss, valid_acc = evaluate(model, val_it, criterion)

        print(f'| Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}% | Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}% |')

    test_loss, test_acc = evaluate(model, test_it, criterion)
    print('Test loss:', test_loss)
    print('Test accuracy:', test_acc)

    return model

In [28]:
model_2 = run_model_2()

| Epoch: 01 | Train Loss: 0.654 | Train Acc: 63.99% | Val. Loss: 0.639 | Val. Acc: 63.46% |
| Epoch: 02 | Train Loss: 0.616 | Train Acc: 70.01% | Val. Loss: 0.610 | Val. Acc: 68.01% |
| Epoch: 03 | Train Loss: 0.596 | Train Acc: 72.14% | Val. Loss: 0.590 | Val. Acc: 70.55% |
| Epoch: 04 | Train Loss: 0.581 | Train Acc: 73.52% | Val. Loss: 0.575 | Val. Acc: 72.50% |
| Epoch: 05 | Train Loss: 0.569 | Train Acc: 74.55% | Val. Loss: 0.565 | Val. Acc: 73.64% |
| Epoch: 06 | Train Loss: 0.559 | Train Acc: 75.48% | Val. Loss: 0.555 | Val. Acc: 74.52% |
| Epoch: 07 | Train Loss: 0.550 | Train Acc: 76.19% | Val. Loss: 0.546 | Val. Acc: 75.36% |
| Epoch: 08 | Train Loss: 0.543 | Train Acc: 76.93% | Val. Loss: 0.538 | Val. Acc: 76.16% |
| Epoch: 09 | Train Loss: 0.536 | Train Acc: 77.71% | Val. Loss: 0.531 | Val. Acc: 76.61% |
| Epoch: 10 | Train Loss: 0.529 | Train Acc: 78.40% | Val. Loss: 0.525 | Val. Acc: 77.60% |
| Epoch: 11 | Train Loss: 0.523 | Train Acc: 78.90% | Val. Loss: 0.518 | Val. Ac

In [None]:
def pred

In [None]:
def predict_unlabelled(model, it, name):
    model.eval()
    with torch.no_grad():
        f = open(name, 'w')
        for batch in it:
            outputs = model(batch.Text).squeeze(1)
            _,preds = torch.max(outputs,1)
            output_string = '\n'.join(str(s) for s in preds.numpy())
            f.write(output_string+'\n')

In [None]:
def predict_unlabelled_rnn(model, it, name):
    model.eval()
    with torch.no_grad():
        f = open(name, 'w')
        for batch in it:
            outputs = model(batch.Text).squeeze(1)
            s = torch.sigmoid(outputs)
            preds = torch.round(s)
            output_string = '\n'.join(str(s) for s in preds.numpy())
            f.write(output_string+'\n')

In [None]:
predict_unlabelled(model_2, unlabelled_it, 'predictions_q2.txt')

In [None]:
def run_model_3():
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    
    train_it, val_it, test_it = data.BucketIterator.splits(
            datasets=(train_set, val_set, test_set),
            batch_size=4, device=device,
            sort_key=lambda x: len(x.Text),
            repeat=False,
            shuffle=True)
    
    ## define model
    model = EmbeddingClassifier(100, 2, len(TEXT.vocab), pretrained_vocab=TEXT.vocab)
#     optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), 1e-3)
    optimizer = optim.SGD(model.parameters(), lr=0.1)
    # criterion = nn.BCEWithLogitsLoss()
    criterion = nn.CrossEntropyLoss()
    model = model.to(device)
    
    for epoch in range(25):
        train_loss, train_acc = train(model, train_it, optimizer, criterion)
        valid_loss, valid_acc = evaluate(model, val_it, criterion)

        print(f'| Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}% | Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}% |')

    test_loss, test_acc = evaluate(model, test_it, criterion)
    
    return model

In [None]:
model_3 = run_model_3()

In [None]:
predict_unlabelled(model_3, unlabelled_it, 'predictions_q3.txt')

In [None]:
class RNNClassifier(nn.Module):
    def __init__(self, emb_dim, num_labels, hidden_dim, vocab_size, pretrained_vocab=None):
        super(RNNClassifier, self).__init__()
        
        self.embedding = nn.Embedding(vocab_size, emb_dim)
        self.rnn = nn.RNN(emb_dim, hidden_dim)
        self.linear = nn.Linear(hidden_dim, num_labels)
        
        if pretrained_vocab:
            self.embedding.weight.data.copy_(pretrained_vocab.vectors)
        
    def forward(self, x):
        
        z1 = self.embedding(x)        
        z2, h2 = self.rnn(z1)
        z3 = self.linear(h2.squeeze(0))
        
        return z3

In [None]:
def run_model_4():
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    
    train_it, val_it, test_it = data.BucketIterator.splits(
            datasets=(train_set, val_set, test_set),
            batch_size=64, device=device,
            sort_key=lambda x: len(x.Text),
            repeat=False,
            shuffle=True)
    
    ## define model
    model = RNNClassifier(100, 1, 300, len(TEXT.vocab), pretrained_vocab=TEXT.vocab)
    optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), 1e-3)
#     optimizer = optim.SGD(model.parameters(), lr=0.1)
    criterion = nn.BCEWithLogitsLoss()
#     criterion = nn.CrossEntropyLoss()
    model = model.to(device)
        
    for epoch in range(5):
        train_loss, train_acc = train_rnn(model, train_it, optimizer, criterion)
        valid_loss, valid_acc = evaluate_rnn(model, val_it, criterion)

        print(f'| Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}% | Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}% |')

    test_loss, test_acc = evaluate_rnn(model, test_it, criterion)
    print('Test info:', test_loss, test_acc)
    
    return model

In [None]:
model_4 = run_model_4()

In [None]:
predict_unlabelled_rnn(model_4, unlabelled_it, 'predictions_q4_t.txt')

In [None]:
class LSTMClassifier(nn.Module):
    def __init__(self, emb_dim, num_labels, hidden_dim, vocab_size, pretrained_vocab=None):
        super(LSTMClassifier, self).__init__()
        
        self.embedding = nn.Embedding(vocab_size, emb_dim)
        self.lstm = nn.LSTM(emb_dim, hidden_dim)
        self.linear = nn.Linear(hidden_dim, num_labels)
        
        if pretrained_vocab:
            self.embedding.weight.data.copy_(pretrained_vocab.vectors)

    def forward(self, x, batch_size=None):
        z1 = self.embedding(x)
        z2, (h2, c2) = self.lstm(z1)
        z3 = self.linear(h2[-1])

        return z3

In [None]:
def run_model_5():
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    
    train_it, val_it, test_it = data.BucketIterator.splits(
            datasets=(train_set, val_set, test_set),
            batch_size=64, device=device,
            sort_key=lambda x: len(x.Text),
            repeat=False,
            shuffle=True)
    
    ## define model
    model = LSTMClassifier(100, 1, 300, len(TEXT.vocab), pretrained_vocab=TEXT.vocab)
    optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), 1e-3)
#     optimizer = optim.SGD(model.parameters(), lr=0.1)
    criterion = nn.BCEWithLogitsLoss()
#     criterion = nn.CrossEntropyLoss()
    model = model.to(device)
        
    for epoch in range(5):
        train_loss, train_acc = train_rnn(model, train_it, optimizer, criterion)
        valid_loss, valid_acc = evaluate_rnn(model, val_it, criterion)

        print(f'| Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}% | Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}% |')

    test_loss, test_acc = evaluate_rnn(model, test_it, criterion)
    print('Test info:', test_loss, test_acc)
    
    return model

In [None]:
model_5 = run_model_5()

In [None]:
m = model_5