In [1]:
import spacy
import torchtext
import pandas as pd

import torch
import torch.nn            as nn
import torch.nn.functional as F
import torch.optim         as optim
from torch.autograd import Variable

import warnings

warnings.filterwarnings('ignore')

In [2]:
# Read the training CSV into a DataFrame and show five randomly drawn rows as
# a quick sanity check (no seed is passed, so the rows differ between runs).
df_train = pd.read_csv(filepath_or_buffer = '../data/training_data.csv')
df_train.sample(n = 5)

Unnamed: 0,Intent,Sentence
774,0,I definately would not discuss this with Petro...
1953,1,"Check out the most popular, trending, and like..."
3440,1,"Let us know how we can help you, your co-worke..."
1575,0,The new HomeRun by Deem offer for $10 for Two ...
1937,0,"• Portland, ME: January 31, 2013"


In [3]:
# Shared resources and hyper-parameters used by the cells below.
spacy_en = spacy.load('en')  # spaCy English pipeline; its tokenizer is used by en_tokenizer
embedding_dim = 100  # selects the GloVe file and the width of the embedding layer
fix_length = 128  # handed to the text Field as its fixed sequence length
epochs = 20  # number of passes of the training loop
print_every = 30  # the running loss is printed every `print_every` mini-batches

In [4]:
def en_tokenizer(text):
    """Tokenize ``text`` with the spaCy English tokenizer.

    Returns a list holding the ``.text`` of every token, in order.
    """
    token_strings = []
    for token in spacy_en.tokenizer(text):
        token_strings.append(token.text)
    return token_strings

In [5]:
# Field for the sentence column: sequential text, tokenized with spaCy,
# lower-cased, and brought to a fixed length.
TEXT = torchtext.data.Field(
    sequential=True,
    tokenize=en_tokenizer,
    lower=True,
    fix_length=fix_length,  # the network below needs fixed-length inputs
)

In [6]:
# Field for the intent column: a single non-sequential value per example,
# used as-is without building a vocabulary.
LABEL = torchtext.data.Field(
    sequential=False,
    use_vocab=False,
)

In [7]:
# Build the train / test datasets from the two CSV files under ../data/.
# The header row is skipped and the columns are bound to (name, Field) pairs.
train_data, test_data = torchtext.data.TabularDataset.splits(
    path='../data/',
    train='training_data.csv',
    test='test_data.csv',
    format='csv',
    skip_header=True,
    fields=[('Intent', LABEL), ('Sentence', TEXT)],
)

In [8]:
# Batch iterators over both datasets: 32 examples per training batch and 256
# per test batch, single pass per loop (repeat = False).
train_iter, test_iter = torchtext.data.Iterator.splits(
    (train_data, test_data),
    batch_sizes=(32, 256),
    sort_key=lambda example: len(example.Sentence),
    repeat=False,
    device=-1,  # -1 for CPU, 0 for GPU
)

In [9]:
def init_unk_emb(vocab, init = 'randn', num_special_toks = 2):
    """Randomly initialise embeddings of words that have no pre-trained vector.

    A row of ``vocab.vectors`` that is entirely zero is treated as a word
    without a pre-trained embedding. Such rows are re-initialised in place;
    all other rows are left untouched and contribute to the reported average
    norm. A one-line coverage summary is printed at the end.

    Args:
        vocab: object exposing ``vectors`` (a 2-D tensor with one row per
            vocabulary entry) and supporting ``len()``.
        init: ``'randn'`` draws all-zero rows from a normal distribution with
            mean 0 and std 0.05. Any other value leaves those rows as zeros.
        num_special_toks: number of leading rows to skip entirely
            (presumably the special tokens such as unk / pad -- confirm
            against the vocabulary in use).

    Returns:
        None. ``vocab.vectors`` is modified in place.
    """
    emb_vectors  = vocab.vectors
    running_norm = 0.
    num_non_zero = 0
    total_words  = 0

    for i in range(num_special_toks, len(vocab)):
        row = emb_vectors[i]  # a view: in-place ops write through to emb_vectors
        if len(row.nonzero()) == 0:
            if init == 'randn':
                # Tensor.normal_ is the in-place primitive behind the
                # (deprecated) torch.nn.init.normal helper.
                row.normal_(mean = 0, std = 0.05)
        else:
            num_non_zero += 1
            # float() keeps the accumulator a plain Python number regardless of
            # whether torch.norm returns a float or a 0-dim tensor.
            running_norm += float(torch.norm(row))
        total_words += 1

    # With no known word there is nothing to average; report NaN instead of
    # raising ZeroDivisionError.
    average_norm = running_norm / num_non_zero if num_non_zero else float('nan')
    print(f'average GloVE norm {average_norm}, '
          f'known words {num_non_zero}, '
          f'total words {total_words}')

In [10]:
# Build the vocabulary from the training set and attach the pre-trained GloVe
# vectors whose width matches `embedding_dim`.
embedding_fn = f'glove.6B.{embedding_dim}d'
TEXT.build_vocab(train_data, vectors=embedding_fn)

# Words without a pre-trained vector get a small random embedding.
vocab = TEXT.vocab
init_unk_emb(vocab = vocab)

average GloVE norm 5.217196088951167, known words 6145, total words 6874


In [11]:
def evaluate_model(model, criterion, train_iter, test_iter):
    """Compute accuracy and average loss of ``model`` on two batch iterables.

    Args:
        model: callable mapping a batch's ``Sentence`` to per-class scores of
            shape (batch, classes); the prediction is the arg-max over dim 1.
        criterion: callable ``criterion(y_pred, y)`` returning a scalar loss.
        train_iter: sized iterable of batches exposing ``Sentence`` and
            ``Intent`` attributes.
        test_iter: same as ``train_iter``, for the held-out data.

    Returns:
        ``((train_acc, train_loss), (test_acc, test_loss))``. Accuracy is the
        fraction of correct predictions; loss is the mean of the per-batch
        losses (not weighted by batch size). An iterable that yields no
        examples gives ``(nan, nan)`` instead of raising ZeroDivisionError.
    """
    def to_number(value):
        """Return the Python scalar held by a one-element tensor."""
        # `.item()` is the supported accessor; indexing a 0-dim tensor with
        # `.data[0]` raises on current PyTorch and is kept only as a fallback
        # for releases that predate `.item()`.
        return value.item() if hasattr(value, 'item') else value.data[0]

    def compute_loss_and_accuracy(dataset):
        batch_number      = len(dataset)
        total_loss        = 0
        total_predictions = 0
        total_correct     = 0
        for batch_data in dataset:
            X, y               = batch_data.Sentence, batch_data.Intent
            y_pred             = model(X)
            total_loss        += to_number(criterion(y_pred, y))
            total_correct     += to_number((y_pred.max(dim = 1)[1] == y).sum())
            total_predictions += len(y)

        if batch_number == 0 or total_predictions == 0:
            return float('nan'), float('nan')

        accuracy     = total_correct / total_predictions
        average_loss = total_loss / batch_number

        return accuracy, average_loss

    # Evaluate in inference mode (matters for dropout / batch-norm layers) and
    # put the model back into training mode afterwards if it was in it.
    was_training = getattr(model, 'training', False)
    if was_training:
        model.eval()
    try:
        return compute_loss_and_accuracy(train_iter), \
                compute_loss_and_accuracy(test_iter)
    finally:
        if was_training:
            model.train()

In [12]:
class CNN(nn.Module):
    """1-D convolutional sentence classifier on top of pre-trained embeddings.

    Architecture: embedding -> 2 x conv(32) -> max-pool(2) -> 2 x conv(64)
    -> max-pool(2) -> flatten -> linear. All convolutions use kernel size 3
    with padding 1, so only the two pooling steps shorten the sequence.

    Args:
        vocab: object supporting ``len()`` and exposing ``vectors``, a 2-D
            tensor of shape (len(vocab), embedding width). The embedding layer
            is sized from it and initialised with a copy of it.
        seq_len: length of the (fixed-length) input sequences. Defaults to the
            notebook-level ``fix_length`` setting.
        num_classes: number of output scores per example (default 2).
    """
    def __init__(self, vocab, seq_len = None, num_classes = 2):
        super(CNN, self).__init__()
        if seq_len is None:
            seq_len = fix_length
        # Take the width from the vectors that are copied in below, so the
        # layer always matches them.
        emb_dim = vocab.vectors.size(1)
        # Each max_pool1d(x, 2) halves the length, rounding down.
        pooled_len = (seq_len // 2) // 2

        self.embedding = nn.Embedding(len(vocab), emb_dim)
        self.conv1     = nn.Conv1d(emb_dim, 32, 3, padding = 1)
        self.conv2     = nn.Conv1d(32, 32, 3, padding = 1)
        self.conv3     = nn.Conv1d(32, 64, 3, padding = 1)
        self.conv4     = nn.Conv1d(64, 64, 3, padding = 1)
        self.linear1   = nn.Linear(64 * pooled_len, num_classes)

        self.embedding.weight.data.copy_(vocab.vectors)

    def forward(self, x):
        """Map token indices of shape (seq_len, batch) to (batch, num_classes) scores."""
        x = x.transpose(0, 1)          # -> (batch, seq_len)
        x = self.embedding(x)          # -> (batch, seq_len, emb_dim)
        x = x.transpose(1, 2)          # -> (batch, emb_dim, seq_len), as Conv1d expects
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.max_pool1d(x, 2)
        x = F.relu(self.conv3(x))
        x = F.relu(self.conv4(x))
        x = F.max_pool1d(x, 2)
        # Flatten per example, keeping the batch dimension fixed so that a
        # length mismatch fails loudly in the linear layer instead of being
        # folded into the batch dimension.
        x = x.view(x.size(0), -1)
        x = self.linear1(x)

        return x

In [13]:
# Network, loss and optimiser: cross-entropy over the two intent classes,
# optimised with Adam at its default settings.
model = CNN(vocab)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters())

In [14]:
# Train for `epochs` passes over the training iterator. The running loss is
# printed every `print_every` mini-batches, and accuracy / loss on both the
# training and the test set are printed after every epoch.
for epoch in range(epochs):
    running_loss = 0
    for i, batch_data in enumerate(train_iter):
        optimizer.zero_grad()
        X, y          = batch_data.Sentence, batch_data.Intent
        y_pred        = model(X)
        loss          = criterion(y_pred, y)
        # `.item()` is the supported way to read a scalar loss; `.data[0]`
        # raises on current PyTorch and is only a fallback for releases that
        # predate `.item()`.
        running_loss += loss.item() if hasattr(loss, 'item') else loss.data[0]
        loss.backward()
        optimizer.step()

        # Report the mean loss of the last `print_every` batches, then reset.
        if i % print_every == print_every - 1:
            print(f'\t[{i + 1:5}] Running loss: {running_loss / print_every : .5f}')
            running_loss = 0

    (train_acc, train_loss), (test_acc, test_loss) = evaluate_model(
        model,
        criterion, 
        train_iter, 
        test_iter
    )
    print(f'Epoch {epoch}: train acc {train_acc * 100 : .3f}% '
          f'train loss {train_loss : .5f} '
          f'test acc {test_acc * 100 : .3f}% '
          f'test loss {test_loss : .5f}')

	[   30] Running loss:  0.69154
	[   60] Running loss:  0.68119
	[   90] Running loss:  0.66584
Epoch 0: train acc  66.667% train loss  0.61131 test acc  77.319% test loss  0.48951
	[   30] Running loss:  0.54985
	[   60] Running loss:  0.51814
	[   90] Running loss:  0.50798
Epoch 1: train acc  82.199% train loss  0.40671 test acc  77.621% test loss  0.45421
	[   30] Running loss:  0.38908
	[   60] Running loss:  0.39661
	[   90] Running loss:  0.37190
Epoch 2: train acc  89.035% train loss  0.26782 test acc  83.669% test loss  0.36207
	[   30] Running loss:  0.29105
	[   60] Running loss:  0.28615
	[   90] Running loss:  0.27473
Epoch 3: train acc  93.465% train loss  0.17577 test acc  81.956% test loss  0.40869
	[   30] Running loss:  0.19449
	[   60] Running loss:  0.17599
	[   90] Running loss:  0.18593
Epoch 4: train acc  96.445% train loss  0.09818 test acc  80.746% test loss  0.52517
	[   30] Running loss:  0.10937
	[   60] Running loss:  0.11662
	[   90] Running loss:  0.10987