In [1]:
import spacy
import torchtext
import pandas as pd

import torch
import torch.nn            as nn
import torch.nn.functional as F
import torch.optim         as optim
from torch.autograd import Variable

import warnings

warnings.filterwarnings('ignore')

In [2]:
# Read the labelled training sentences into a DataFrame and show a random
# handful of rows as a quick sanity check of the columns.
df_train = pd.read_csv(filepath_or_buffer = '../data/training_data.csv')
df_train.sample(n = 5)

Unnamed: 0,Intent,Sentence
2596,1,Maybe on Ben's layover we can have a discussio...
3543,1,"To begin the online administration process, yo..."
242,0,I am going to get much closer to the model thi...
3373,1,"If you need additional help, visit Apple Support."
3609,1,What would you change about the Senate bill?


In [3]:
# spaCy pipeline whose tokenizer is used by `en_tokenizer`.
spacy_en = spacy.load('en')

# Settings read by the cells below.
embedding_dim = 100  # vector width; also selects the `glove.6B.<dim>d` file
fix_length = 50      # passed to the TEXT field as `fix_length`; the MLP flattens this many tokens
epochs = 20          # number of passes of the training loop
print_every = 30     # the running loss is printed every `print_every` batches

In [4]:
def en_tokenizer(text):
    """Tokenize `text` with the spaCy tokenizer and return the token strings as a list."""
    token_strings = []
    for token in spacy_en.tokenizer(text):
        token_strings.append(token.text)
    return token_strings

In [5]:
# Field describing the `Sentence` column: tokenized with `en_tokenizer`,
# lower-cased, and given a fixed length because the network below needs
# fixed-length inputs.
TEXT = torchtext.data.Field(
    fix_length = fix_length,
    lower = True,
    tokenize = en_tokenizer,
    sequential = True,
)

In [6]:
# Field describing the `Intent` column: a single value per example
# (not a sequence) for which no vocabulary is built.
LABEL = torchtext.data.Field(use_vocab = False, sequential = False)

In [7]:
# Build the train / test datasets from the two CSV files under ../data/.
# The header row is skipped and the columns are bound to the LABEL and TEXT
# fields in the order listed.
# NOTE(review): presumably fields are matched to CSV columns by position, so
# the files must hold exactly (Intent, Sentence) in that order -- confirm.
train_data, test_data = torchtext.data.TabularDataset.splits(
    path = '../data/',
    format = 'csv',
    skip_header = True,
    train = 'training_data.csv',
    test = 'test_data.csv',
    fields = [('Intent', LABEL), ('Sentence', TEXT)],
)

In [8]:
# Batch iterators over both datasets: 32 examples per training batch and
# 256 per test batch. `repeat = False` is what lets the training loop move on
# to the next epoch after one pass.
train_iter, test_iter = torchtext.data.Iterator.splits(
    (train_data, test_data),
    batch_sizes = (32, 256),
    sort_key = lambda example: len(example.Sentence),
    repeat = False,
    device = -1,  # -1 for CPU, 0 for GPU
)

In [9]:
def init_unk_emb(vocab, init = 'randn', num_special_toks = 2):
    """Randomly initialise the all-zero rows of ``vocab.vectors`` in place.

    Rows ``num_special_toks .. len(vocab) - 1`` are scanned. A row that is
    entirely zero is treated as a word without a pretrained vector; every
    other row counts as a known word and contributes to the reported norm.
    A one-line summary is printed at the end.

    Args:
        vocab: object exposing ``vectors`` (a 2-D tensor indexed by word id)
            and ``__len__``.
        init: initialisation scheme for all-zero rows. Only ``'randn'`` is
            implemented (normal draw, mean 0, std 0.05); any other value
            leaves those rows untouched.
        num_special_toks: number of leading rows to skip entirely.

    Returns:
        None. ``vocab.vectors`` is modified in place.
    """
    emb_vectors = vocab.vectors
    running_norm = 0.
    num_non_zero = 0
    total_words = 0

    for i in range(num_special_toks, len(vocab)):
        row = emb_vectors[i]  # a view: in-place ops write into vocab.vectors
        if len(row.nonzero()) == 0:
            if init == 'randn':
                # Tensor.normal_ is the in-place draw the deprecated
                # torch.nn.init.normal wrapper delegated to.
                row.normal_(mean = 0, std = 0.05)
        else:
            num_non_zero += 1
            # float() keeps the accumulator a plain Python number whether
            # torch.norm returns a float or a 0-dim tensor.
            running_norm += float(torch.norm(row))
        total_words += 1

    # Guard the average: a vocabulary without any pretrained vector used to
    # raise ZeroDivisionError here.
    average_norm = running_norm / num_non_zero if num_non_zero else float('nan')
    print(f'average GloVE norm {average_norm}, '
          f'known words {num_non_zero}, '
          f'total words {total_words}')

In [10]:
# Build the vocabulary from the training sentences with the pretrained
# vectors named by `embedding_fn` attached, then give words that have no
# pretrained vector a random initialisation.
embedding_fn = 'glove.6B.{}d'.format(embedding_dim)
TEXT.build_vocab(train_data, vectors = embedding_fn)

vocab = TEXT.vocab
init_unk_emb(vocab)

average GloVE norm 5.217196088951167, known words 6145, total words 6874


In [11]:
def evaluate_model(model, criterion, train_iter, test_iter):
    """Compute accuracy and loss of ``model`` on the train and test iterators.

    Args:
        model: callable mapping a batch's ``Sentence`` tensor to per-class
            scores (classes along dimension 1).
        criterion: loss callable invoked as ``criterion(scores, labels)``.
        train_iter: sized iterable of batches exposing ``Sentence`` and
            ``Intent`` attributes.
        test_iter: same as ``train_iter``, for the test data.

    Returns:
        ``((train_accuracy, train_loss), (test_accuracy, test_loss))``.
        Accuracy is correct predictions / number of examples; loss is the
        mean of the per-batch losses. Either value is NaN when it is
        undefined (no batches / no examples).
    """
    def as_python_number(value):
        # `.data[0]` raises IndexError on the 0-dim tensors that newer
        # PyTorch releases return for scalar results; `.item()` is the
        # replacement. Fall back to `.data[0]` for objects without `.item()`.
        if hasattr(value, 'item'):
            return value.item()
        return value.data[0]

    def compute_loss_and_accuracy(dataset):
        batch_number      = len(dataset)
        total_loss        = 0
        total_predictions = 0
        total_correct     = 0
        for batch_data in dataset:
            X, y               = batch_data.Sentence, batch_data.Intent
            y_pred             = model(X)
            total_loss        += as_python_number(criterion(y_pred, y))
            # index of the highest score along dim 1 is the predicted class
            predicted_class    = y_pred.max(dim = 1)[1]
            total_correct     += as_python_number((predicted_class == y).sum())
            total_predictions += len(y)

        # An empty iterator used to raise ZeroDivisionError; report NaN instead.
        undefined    = float('nan')
        accuracy     = total_correct / total_predictions if total_predictions else undefined
        average_loss = total_loss / batch_number if batch_number else undefined

        return accuracy, average_loss

    return compute_loss_and_accuracy(train_iter), \
            compute_loss_and_accuracy(test_iter)

In [12]:
class MLP(nn.Module):
    """Three-layer perceptron over the concatenated token embeddings of a sentence.

    The embedding table is sized from ``vocab`` and initialised with a copy of
    ``vocab.vectors``. The layer widths depend on the notebook-level
    ``fix_length`` and ``embedding_dim`` settings; the output has 2 scores.
    """

    def __init__(self, vocab):
        super().__init__()
        flattened_width = fix_length * embedding_dim
        hidden_width = 256

        self.embedding = nn.Embedding(len(vocab), embedding_dim)
        self.linear1 = nn.Linear(flattened_width, hidden_width)
        self.linear2 = nn.Linear(hidden_width, hidden_width)
        self.linear3 = nn.Linear(hidden_width, 2)

        # Overwrite the randomly initialised table with the vocabulary's vectors.
        self.embedding.weight.data.copy_(vocab.vectors)

    def forward(self, x):
        """Return the 2 class scores for each sentence in ``x``.

        Axes 0 and 1 of ``x`` are swapped before the embedding lookup.
        NOTE(review): assumes ``x`` arrives as (fix_length, batch) token ids -- confirm.
        """
        batch_first = x.transpose(0, 1)
        embedded = self.embedding(batch_first)
        # One row per sentence: all token embeddings laid side by side.
        flattened = embedded.view(-1, fix_length * embedding_dim)
        hidden = F.relu(self.linear1(flattened))
        hidden = F.relu(self.linear2(hidden))
        return self.linear3(hidden)

In [13]:
# Network, loss function and optimizer used by the training loop below.
# Adam runs with its default settings over all model parameters.
model = MLP(vocab)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params = model.parameters())

In [14]:
# Training loop: one optimizer step per mini-batch, the running loss printed
# every `print_every` batches, and a full train / test evaluation at the end
# of each epoch.
for epoch in range(epochs):
    running_loss = 0
    for i, batch_data in enumerate(train_iter):
        optimizer.zero_grad()
        X, y          = batch_data.Sentence, batch_data.Intent
        y_pred        = model(X)
        loss          = criterion(y_pred, y)
        # `.data[0]` raises IndexError on the 0-dim loss tensors returned by
        # newer PyTorch releases; use `.item()` when it exists and fall back
        # to `.data[0]` otherwise.
        running_loss += loss.item() if hasattr(loss, 'item') else loss.data[0]
        loss.backward()
        optimizer.step()

        # Report the mean loss of the last `print_every` batches, then reset.
        if i % print_every == print_every - 1:
            print(f'\t[{i + 1:5}] Running loss: {running_loss / print_every : .5f}')
            running_loss = 0

    (train_acc, train_loss), (test_acc, test_loss) = evaluate_model(
        model,
        criterion, 
        train_iter, 
        test_iter
    )
    print(f'Epoch {epoch}: train acc {train_acc * 100 : .3f}% '
          f'train loss {train_loss : .5f} '
          f'test acc {test_acc * 100 : .3f}% '
          f'test loss {test_loss : .5f}')

	[   30] Running loss:  0.66395
	[   60] Running loss:  0.58485
	[   90] Running loss:  0.56589
Epoch 0: train acc  73.476% train loss  0.49094 test acc  79.032% test loss  0.46268
	[   30] Running loss:  0.40897
	[   60] Running loss:  0.39595
	[   90] Running loss:  0.39875
Epoch 1: train acc  91.660% train loss  0.23547 test acc  78.427% test loss  0.44900
	[   30] Running loss:  0.22364
	[   60] Running loss:  0.24112
	[   90] Running loss:  0.20674
Epoch 2: train acc  94.777% train loss  0.15136 test acc  80.141% test loss  0.51741
	[   30] Running loss:  0.12051
	[   60] Running loss:  0.10543
	[   90] Running loss:  0.10840
Epoch 3: train acc  98.578% train loss  0.04514 test acc  76.714% test loss  0.84310
	[   30] Running loss:  0.04512
	[   60] Running loss:  0.05547
	[   90] Running loss:  0.08874
Epoch 4: train acc  99.617% train loss  0.01760 test acc  76.411% test loss  1.01915
	[   30] Running loss:  0.04770
	[   60] Running loss:  0.02606
	[   90] Running loss:  0.05916