In [35]:
import torch
import torchtext
from torchtext import data
import spacy
from torch import nn
from torch import optim
from tqdm import tqdm
import torch.nn.functional as F
import numpy as np

In [2]:
spacy_en = spacy.load('en')

In [3]:
def tokenizer(text):
    return [tok.text for tok in spacy_en.tokenizer(text)]

In [5]:
TEXT = data.Field(sequential=True, lower=True, tokenize=tokenizer)
LABEL = data.Field(sequential=False, use_vocab=False)
train_val_fields = [('Label', LABEL),('Text', TEXT)]
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [6]:
train_set, val_set, test_set = data.TabularDataset.splits(path='../data', 
    format='tsv', 
    train='train.tsv', 
    validation='dev.tsv',
    test='test.tsv',
    fields=train_val_fields, 
    skip_header=True)

In [9]:
unlabelled_fields = [('id', None),('Text', TEXT)]
unlabelled_set = data.TabularDataset(path='../data/unlabelled.tsv', 
    format='tsv', 
    fields=unlabelled_fields, 
    skip_header=True,
    shuffle=False)

In [12]:
TEXT.build_vocab(train_set, max_size=100000, vectors='glove.6B.100d')
LABEL.build_vocab(train_set)

In [14]:
train_iter, val_iter, test_iter = data.Iterator.splits(
        (train_set, val_set, test_set), sort_key=lambda x: len(x.Text),
        batch_size=64, device=device)

In [15]:
unlabelled_it = data.BucketIterator(
    dataset=unlabelled_set,
    batch_size=1,
    device=device,
    sort_key=lambda x: len(x.Text),
    shuffle=False)

In [None]:
vocab = TEXT.vocab
embed = nn.Embedding(len(vocab), 100)
embed.weight.data.copy_(vocab.vectors)

In [17]:
class EmbeddingClassifier(nn.Module):

    # Initialize the classifier
    def __init__(self, emb_dim, num_labels, vocab_size, pretrained_vocab=None):
        super(EmbeddingClassifier, self).__init__()

        self.embedding = nn.Embedding(vocab_size, emb_dim)
        self.linear = nn.Linear(emb_dim, num_labels)
        
        if pretrained_vocab:
            self.embedding.weight.data.copy_(pretrained_vocab.vectors)
        
    def forward(self, inputs):        
        z1 = self.embedding(inputs).permute(1,0,2)
        z2 = F.avg_pool2d(z1, (z1.shape[1], 1)).squeeze(1) 
        out = self.linear(z2)
        return torch.sigmoid(out)

In [18]:
def binary_accuracy_rnn(preds, y):
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float()
    acc = correct.sum()/len(correct)
    return acc

In [19]:
def train_rnn(model, iterator, optimizer, criterion):
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:
        
        optimizer.zero_grad()
                
        outputs = model(batch.Text).squeeze(1)
        loss = criterion(outputs, batch.Label.float())
        acc = binary_accuracy_rnn(outputs, batch.Label.float())
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [20]:
def binary_accuracy(preds, y):
    correct = (preds == y).float()
    acc = correct.sum()/len(correct)
    return acc

In [26]:
def train(model, iterator, optimizer, criterion):
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:
        
        optimizer.zero_grad()
                
        outputs = model(batch.Text).squeeze(1)
        _,preds = torch.max(outputs,1)
        loss = criterion(outputs, batch.Label)
        acc = binary_accuracy(preds, batch.Label)
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [22]:
def evaluate_rnn(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:

            outputs = model(batch.Text).squeeze(1)
            loss = criterion(outputs, batch.Label.float())
            
            acc = binary_accuracy_rnn(outputs, batch.Label.float())

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [23]:
def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:

            outputs = model(batch.Text).squeeze(1)
            _,preds = torch.max(outputs, 1)
            loss = criterion(outputs, batch.Label)
            
            acc = binary_accuracy(preds, batch.Label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [27]:
def run_model_2():
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    
    train_it, val_it, test_it = data.BucketIterator.splits(
            datasets=(train_set, val_set, test_set),
            batch_size=32, device=device,
            sort_key=lambda x: len(x.Text),
            repeat=False,
            shuffle=True)
    
    ## define model
    model = EmbeddingClassifier(100, 2, len(TEXT.vocab))
#     optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), 1e-3)
    optimizer = optim.SGD(model.parameters(), lr=0.1)
    # criterion = nn.BCEWithLogitsLoss()
    criterion = nn.CrossEntropyLoss()
    model = model.to(device)
    
    for epoch in range(25):
        train_loss, train_acc = train(model, train_it, optimizer, criterion)
        valid_loss, valid_acc = evaluate(model, val_it, criterion)

        print(f'| Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}% | Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}% |')

    test_loss, test_acc = evaluate(model, test_it, criterion)
    print('Test loss:', test_loss)
    print('Test accuracy:', test_acc)

    return model

In [28]:
model_2 = run_model_2()

| Epoch: 01 | Train Loss: 0.654 | Train Acc: 63.99% | Val. Loss: 0.639 | Val. Acc: 63.46% |
| Epoch: 02 | Train Loss: 0.616 | Train Acc: 70.01% | Val. Loss: 0.610 | Val. Acc: 68.01% |
| Epoch: 03 | Train Loss: 0.596 | Train Acc: 72.14% | Val. Loss: 0.590 | Val. Acc: 70.55% |
| Epoch: 04 | Train Loss: 0.581 | Train Acc: 73.52% | Val. Loss: 0.575 | Val. Acc: 72.50% |
| Epoch: 05 | Train Loss: 0.569 | Train Acc: 74.55% | Val. Loss: 0.565 | Val. Acc: 73.64% |
| Epoch: 06 | Train Loss: 0.559 | Train Acc: 75.48% | Val. Loss: 0.555 | Val. Acc: 74.52% |
| Epoch: 07 | Train Loss: 0.550 | Train Acc: 76.19% | Val. Loss: 0.546 | Val. Acc: 75.36% |
| Epoch: 08 | Train Loss: 0.543 | Train Acc: 76.93% | Val. Loss: 0.538 | Val. Acc: 76.16% |
| Epoch: 09 | Train Loss: 0.536 | Train Acc: 77.71% | Val. Loss: 0.531 | Val. Acc: 76.61% |
| Epoch: 10 | Train Loss: 0.529 | Train Acc: 78.40% | Val. Loss: 0.525 | Val. Acc: 77.60% |
| Epoch: 11 | Train Loss: 0.523 | Train Acc: 78.90% | Val. Loss: 0.518 | Val. Ac

In [29]:
test_test_set = data.TabularDataset(path='../data/test.tsv', 
    format='tsv', 
    fields=train_val_fields, 
    skip_header=True)
test_test_it = data.BucketIterator(
    dataset=test_test_set,
    batch_size=1,
    device=device,
    sort_key=lambda x: len(x.Text),
    shuffle=False)

In [31]:
def predict_unlabelled(model, it, name):
    model.eval()
    with torch.no_grad():
        f = open(name, 'w')
        for batch in it:
            outputs = model(batch.Text).squeeze(1)
            _,preds = torch.max(outputs,1)
            output_string = '\n'.join(str(s) for s in preds.numpy())
            f.write(output_string+'\n')

In [32]:
predict_unlabelled(model_2, test_test_it, 'TEST_OUT.txt')

In [33]:
def read_data(path, labelled=True):
    f = open(path,'r')
    text = f.read()
    examples = [example.split(' ') for example in text.split('\n')[:-1]]
    if labelled:
        labels = [int(line[0]) for line in examples]
        data = [line[1:] for line in examples]
        return data,np.array(labels)
    else:
        return examples

In [39]:
predict_unlabelled(model_2, unlabelled_it, 'predictions_twoFINAL.txt')

In [50]:
def predict_unlabelled_rnn(model, it, name):
    model.eval()
    with torch.no_grad():
        f = open(name, 'w')
        for batch in it:
            outputs = model(batch.Text).squeeze(1)
            s = torch.sigmoid(outputs)
            preds = torch.round(s)
            output_string = '\n'.join(str(int(s)) for s in preds.numpy())
            f.write(output_string+'\n')

In [None]:
predict_unlabelled(model_2, unlabelled_it, 'predictions_q2.txt')

In [40]:
def run_model_3():
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    
    train_it, val_it, test_it = data.BucketIterator.splits(
            datasets=(train_set, val_set, test_set),
            batch_size=4, device=device,
            sort_key=lambda x: len(x.Text),
            repeat=False,
            shuffle=True)
    
    ## define model
    model = EmbeddingClassifier(100, 2, len(TEXT.vocab), pretrained_vocab=TEXT.vocab)
#     optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), 1e-3)
    optimizer = optim.SGD(model.parameters(), lr=0.1)
    # criterion = nn.BCEWithLogitsLoss()
    criterion = nn.CrossEntropyLoss()
    model = model.to(device)
    
    for epoch in range(25):
        train_loss, train_acc = train(model, train_it, optimizer, criterion)
        valid_loss, valid_acc = evaluate(model, val_it, criterion)

        print(f'| Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}% | Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}% |')

    test_loss, test_acc = evaluate(model, test_it, criterion)
    
    return model

In [41]:
model_3 = run_model_3()

| Epoch: 01 | Train Loss: 0.537 | Train Acc: 80.76% | Val. Loss: 0.444 | Val. Acc: 88.53% |
| Epoch: 02 | Train Loss: 0.444 | Train Acc: 89.55% | Val. Loss: 0.409 | Val. Acc: 90.94% |
| Epoch: 03 | Train Loss: 0.415 | Train Acc: 91.71% | Val. Loss: 0.393 | Val. Acc: 92.44% |
| Epoch: 04 | Train Loss: 0.401 | Train Acc: 92.62% | Val. Loss: 0.385 | Val. Acc: 93.14% |
| Epoch: 05 | Train Loss: 0.391 | Train Acc: 93.39% | Val. Loss: 0.378 | Val. Acc: 93.78% |
| Epoch: 06 | Train Loss: 0.385 | Train Acc: 93.90% | Val. Loss: 0.374 | Val. Acc: 94.26% |
| Epoch: 07 | Train Loss: 0.380 | Train Acc: 94.33% | Val. Loss: 0.371 | Val. Acc: 94.38% |
| Epoch: 08 | Train Loss: 0.376 | Train Acc: 94.62% | Val. Loss: 0.369 | Val. Acc: 94.61% |
| Epoch: 09 | Train Loss: 0.373 | Train Acc: 94.92% | Val. Loss: 0.367 | Val. Acc: 94.79% |
| Epoch: 10 | Train Loss: 0.370 | Train Acc: 95.14% | Val. Loss: 0.366 | Val. Acc: 94.80% |
| Epoch: 11 | Train Loss: 0.367 | Train Acc: 95.34% | Val. Loss: 0.364 | Val. Ac

In [42]:
predict_unlabelled(model_3, test_test_it, 'TEST_OUT.txt')
_,test_labels = read_data('../data/test.txt')
f = open('TEST_OUT.txt','r')
text = f.read()
saved_data = [int(c) for c in text.split('\n')[:-1]]
correct = np.array(saved_data) == test_labels
print(np.sum(correct)/correct.size)

0.9518


In [43]:
predict_unlabelled(model_3, unlabelled_it, 'predictions_qTHREE_FINAL.txt')

In [44]:
class RNNClassifier(nn.Module):
    def __init__(self, emb_dim, num_labels, hidden_dim, vocab_size, pretrained_vocab=None):
        super(RNNClassifier, self).__init__()
        
        self.embedding = nn.Embedding(vocab_size, emb_dim)
        self.rnn = nn.RNN(emb_dim, hidden_dim)
        self.linear = nn.Linear(hidden_dim, num_labels)
        
        if pretrained_vocab:
            self.embedding.weight.data.copy_(pretrained_vocab.vectors)
        
    def forward(self, x):
        
        z1 = self.embedding(x)        
        z2, h2 = self.rnn(z1)
        z3 = self.linear(h2.squeeze(0))
        
        return z3

In [45]:
def run_model_4():
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    
    train_it, val_it, test_it = data.BucketIterator.splits(
            datasets=(train_set, val_set, test_set),
            batch_size=64, device=device,
            sort_key=lambda x: len(x.Text),
            repeat=False,
            shuffle=True)
    
    ## define model
    model = RNNClassifier(100, 1, 300, len(TEXT.vocab), pretrained_vocab=TEXT.vocab)
    optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), 1e-3)
#     optimizer = optim.SGD(model.parameters(), lr=0.1)
    criterion = nn.BCEWithLogitsLoss()
#     criterion = nn.CrossEntropyLoss()
    model = model.to(device)
        
    for epoch in range(5):
        train_loss, train_acc = train_rnn(model, train_it, optimizer, criterion)
        valid_loss, valid_acc = evaluate_rnn(model, val_it, criterion)

        print(f'| Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}% | Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}% |')

    test_loss, test_acc = evaluate_rnn(model, test_it, criterion)
    print('Test info:', test_loss, test_acc)
    
    return model

In [46]:
model_4 = run_model_4()

| Epoch: 01 | Train Loss: 0.300 | Train Acc: 87.02% | Val. Loss: 0.154 | Val. Acc: 94.76% |
| Epoch: 02 | Train Loss: 0.149 | Train Acc: 94.98% | Val. Loss: 0.159 | Val. Acc: 94.31% |
| Epoch: 03 | Train Loss: 0.120 | Train Acc: 96.10% | Val. Loss: 0.124 | Val. Acc: 95.66% |
| Epoch: 04 | Train Loss: 0.117 | Train Acc: 96.20% | Val. Loss: 0.187 | Val. Acc: 93.30% |
| Epoch: 05 | Train Loss: 0.100 | Train Acc: 96.83% | Val. Loss: 0.144 | Val. Acc: 95.23% |
Test info: 0.14661894242759724 0.9522292993630573


In [51]:
predict_unlabelled_rnn(model_4, test_test_it, 'TEST_OUT.txt')
_,test_labels = read_data('../data/test.txt')
f = open('TEST_OUT.txt','r')
text = f.read()
saved_data = [int(c) for c in text.split('\n')[:-1]]
correct = np.array(saved_data) == test_labels
print(np.sum(correct)/correct.size)

0.9518


In [52]:
predict_unlabelled_rnn(model_4, unlabelled_it, 'predictions_qFOUR_FINAL.txt')

In [53]:
class LSTMClassifier(nn.Module):
    def __init__(self, emb_dim, num_labels, hidden_dim, vocab_size, pretrained_vocab=None):
        super(LSTMClassifier, self).__init__()
        
        self.embedding = nn.Embedding(vocab_size, emb_dim)
        self.lstm = nn.LSTM(emb_dim, hidden_dim)
        self.linear = nn.Linear(hidden_dim, num_labels)
        
        if pretrained_vocab:
            self.embedding.weight.data.copy_(pretrained_vocab.vectors)

    def forward(self, x, batch_size=None):
        z1 = self.embedding(x)
        z2, (h2, c2) = self.lstm(z1)
        z3 = self.linear(h2[-1])

        return z3

In [54]:
def run_model_5():
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    
    train_it, val_it, test_it = data.BucketIterator.splits(
            datasets=(train_set, val_set, test_set),
            batch_size=64, device=device,
            sort_key=lambda x: len(x.Text),
            repeat=False,
            shuffle=True)
    
    ## define model
    model = LSTMClassifier(100, 1, 300, len(TEXT.vocab), pretrained_vocab=TEXT.vocab)
    optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), 1e-3)
#     optimizer = optim.SGD(model.parameters(), lr=0.1)
    criterion = nn.BCEWithLogitsLoss()
#     criterion = nn.CrossEntropyLoss()
    model = model.to(device)
        
    for epoch in range(5):
        train_loss, train_acc = train_rnn(model, train_it, optimizer, criterion)
        valid_loss, valid_acc = evaluate_rnn(model, val_it, criterion)

        print(f'| Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}% | Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}% |')

    test_loss, test_acc = evaluate_rnn(model, test_it, criterion)
    print('Test info:', test_loss, test_acc)
    
    return model

In [55]:
model_5 = run_model_5()

| Epoch: 01 | Train Loss: 0.227 | Train Acc: 90.08% | Val. Loss: 0.180 | Val. Acc: 95.94% |
| Epoch: 02 | Train Loss: 0.098 | Train Acc: 96.66% | Val. Loss: 0.118 | Val. Acc: 96.44% |
| Epoch: 03 | Train Loss: 0.070 | Train Acc: 97.65% | Val. Loss: 0.111 | Val. Acc: 96.32% |
| Epoch: 04 | Train Loss: 0.050 | Train Acc: 98.37% | Val. Loss: 0.105 | Val. Acc: 96.57% |
| Epoch: 05 | Train Loss: 0.040 | Train Acc: 98.71% | Val. Loss: 0.101 | Val. Acc: 96.88% |
Test info: 0.11581412699239649 0.9619824840764332


In [58]:
predict_unlabelled_rnn(model_5, test_test_it, 'TEST_OUT.txt')
_,test_labels = read_data('../data/test.txt')
f = open('TEST_OUT.txt','r')
text = f.read()
saved_data = [int(c) for c in text.split('\n')[:-1]]
correct = np.array(saved_data) == test_labels
print(np.sum(correct)/correct.size)

0.9622


In [59]:
predict_unlabelled_rnn(model_5, unlabelled_it, 'predictions_qFIVE_FINAL.txt')