In [6]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torchtext import datasets
from torchtext import data
from torchtext.data import Field, LabelField
from torchtext.data import BucketIterator
import numpy as np
from tqdm import tqdm

The substitute classifier is a Convolutional Neural Network (CNN) for Sentence Classification proposed in [18]. In our experiments, the CNN consists of multiple convolution layers and max pooling layers. The CNN has one convolution layer for each of the n-gram filter sizes. Each convolution operation gives out a vector of size num_filters. We use [3, 5] n-gram filter sizes with num_filters = 8. The size of the learned embedding matrix E remains the same e = 100. The dropout ratio during training is set to 0.1. For this classifier the accuracy values provided in the main paper are also high.

#Build vocab 

In [10]:
TEXT = data.Field(tokenize='spacy', batch_first=True)
LABEL = data.LabelField(dtype = torch.float)
train_data, test_data = datasets.TREC.splits(TEXT, LABEL)
train_data, valid_data = train_data.split()
MAX_VOCAB_SIZE = 100000
TEXT.build_vocab(train_data, 
                 max_size = MAX_VOCAB_SIZE, 
                 vectors = "glove.6B.100d", 
                 unk_init = torch.Tensor.normal_)
LABEL.build_vocab(train_data)

downloading train_5500.label



train_5500.label:   0%|          | 0.00/336k [00:00<?, ?B/s][A
train_5500.label:  20%|█▉        | 65.5k/336k [00:00<00:00, 517kB/s][A
train_5500.label: 100%|██████████| 336k/336k [00:00<00:00, 1.27MB/s]


downloading TREC_10.label



TREC_10.label: 100%|██████████| 23.4k/23.4k [00:00<00:00, 393kB/s]


# Build model 

In [160]:
class TextCNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, 
                 dropout, pad_idx):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.convs = nn.ModuleList([nn.Conv1d(in_channels=embedding_dim, out_channels=n_filters, kernel_size=fs) for fs in filter_sizes])
        self.fc = nn.Linear(len(filter_sizes)*n_filters, output_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text):
        
        #text = [batch size, sent len]
        embs = self.embedding(text)
        #embedded = [batch size, sent len, emb dim]
        embs = embs.permute(0, 2, 1)
        #embedded = [batch size, emb dim, sent len]
        out = [F.relu(c(embs)) for c in self.convs]
        #conved_n = [batch size, n_filters, sent len - filter_sizes[n] + 1]
        out_pool = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in out]
        #pooled_n = [batch size, n_filters]
        cat = self.dropout(torch.cat(out_pool, dim=1))
        #cat = [batch size, n_filters * len(filter_sizes)]
        final = self.fc(cat)
        return final

In [161]:
BATCH_SIZE = 64
LEARNING_RATE = 0.001

model = TextCNN(vocab_size=len(TEXT.vocab),
                embedding_dim=100,
                n_filters=8,
                filter_sizes=[3,4,5],
                output_dim=6,
                dropout=0.1,
                pad_idx=1)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [178]:
def trec_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """
    correct = []
    #round predictions to the closest integer
    rounded_preds = torch.round(torch.softmax(preds, dim=1))
    for x, yy in zip(rounded_preds, y): 
        correct.append((torch.argmax(x) == yy).float()) #convert into float for division
    acc = np.sum(correct) / len(correct)
    return acc

In [179]:
def train_model(model, iterator, optimizer, criterion):
    epoch_loss = 0
    epoch_acc = 0
    model.train()
    for batch in iterator:
        optimizer.zero_grad()
        predictions = model(batch.text).squeeze(1)
        loss = criterion(predictions, batch.label.type(torch.LongTensor).to(device))
        acc = trec_accuracy(predictions, batch.label)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        epoch_acc += acc.item()
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [184]:
def evaluate(model, iterator, criterion):
    epoch_loss = 0
    epoch_acc = 0
    model.eval()
    with torch.no_grad():
        for batch in iterator:
            predictions = model(batch.text).squeeze(1)
            loss = criterion(predictions, batch.label.type(torch.LongTensor).to(device))
            acc = trec_accuracy(predictions, batch.label)
            epoch_loss += loss.item()
            epoch_acc += acc.item()
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [185]:
import time
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [186]:
BATCH_SIZE = 64
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = BATCH_SIZE, 
    device = device)
criterion = nn.CrossEntropyLoss()
model = model.to(device)
criterion = criterion.to(device)

pretrained_embeddings = TEXT.vocab.vectors
model.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[-0.7840,  0.5547, -1.9164,  ...,  1.0417,  0.9624, -0.7122],
        [ 0.0989, -0.5916,  0.7481,  ...,  0.4283, -1.3365, -0.4723],
        [ 0.1638,  0.6046,  1.0789,  ..., -0.3140,  0.1844,  0.3624],
        ...,
        [ 1.1097,  0.3746, -0.3882,  ..., -0.4966,  0.2572, -0.9995],
        [ 0.0091,  0.2810,  0.7356,  ..., -0.7508,  0.8967, -0.7631],
        [ 0.2269, -0.7989, -2.0088,  ...,  1.0667,  0.5483,  1.4923]],
       device='cuda:0')

In [187]:
N_EPOCHS = 50
best_valid_loss = float('inf')
for epoch in range(N_EPOCHS):
    start_time = time.time()
    train_loss, train_acc = train_model(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'textcnn_trec.pt')
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 0m 0s
	Train Loss: 1.045 | Train Acc: 50.70%
	 Val. Loss: 0.949 |  Val. Acc: 53.89%
Epoch: 02 | Epoch Time: 0m 0s
	Train Loss: 0.814 | Train Acc: 63.83%
	 Val. Loss: 0.796 |  Val. Acc: 66.11%
Epoch: 03 | Epoch Time: 0m 0s
	Train Loss: 0.647 | Train Acc: 73.30%
	 Val. Loss: 0.707 |  Val. Acc: 70.16%
Epoch: 04 | Epoch Time: 0m 0s
	Train Loss: 0.519 | Train Acc: 79.08%
	 Val. Loss: 0.646 |  Val. Acc: 74.04%
Epoch: 05 | Epoch Time: 0m 0s
	Train Loss: 0.423 | Train Acc: 83.20%
	 Val. Loss: 0.612 |  Val. Acc: 76.38%
Epoch: 06 | Epoch Time: 0m 0s
	Train Loss: 0.334 | Train Acc: 88.04%
	 Val. Loss: 0.578 |  Val. Acc: 77.70%
Epoch: 07 | Epoch Time: 0m 0s
	Train Loss: 0.280 | Train Acc: 90.17%
	 Val. Loss: 0.579 |  Val. Acc: 78.12%
Epoch: 08 | Epoch Time: 0m 0s
	Train Loss: 0.226 | Train Acc: 92.92%
	 Val. Loss: 0.561 |  Val. Acc: 78.37%
Epoch: 09 | Epoch Time: 0m 0s
	Train Loss: 0.178 | Train Acc: 94.37%
	 Val. Loss: 0.580 |  Val. Acc: 78.85%
Epoch: 10 | Epoch Time: 0m 0