* Using the same basic architecture as in the first task, except adapted to do per-token multiclass classification
* Calculations in train/eval loops needed to be modified to account for the extra channel and compute joint accuracy, mostly technical
* Threw away the max/avg pooling layers - was just too lazy to update them
* In preprocessing, discard all tokens without a pretrained embedding from the vocabulary - so that at runtime they'd get mapped to `[UNK]`, which by itself is a useful signal in this task (many unknown tokens are named entities)
* TODO: spatial dropout on embedding layer, also in the first task

In [1]:
import json
import os
import pickle
import numpy as np
import pandas as pd
import sklearn.model_selection

### Split training data

In [2]:
# Cross-validation configuration for splitting the training data.
CV_FOLDS = 10  # number of K-Fold splits
CV_SEED = 42  # random_state for the shuffled K-Fold, keeps the folds reproducible
# Output path template; `fold` is the fold index, `split` is 'train' or 'eval'.
CV_PATH_FMT = 'cache/slot/cv{fold}/{split}.json'

In [3]:
# Load the three provided splits of the slot-tagging data.
train_df = pd.read_json('data/slot/train.json')
eval_df = pd.read_json('data/slot/eval.json')
test_df = pd.read_json('data/slot/test.json')

# Re-split the training set into CV_FOLDS shuffled folds and write each
# fold's train/eval part as a JSON list of records.
#TODO: use stratified K-Fold instead
cv = sklearn.model_selection.KFold(n_splits=CV_FOLDS, shuffle=True, random_state=CV_SEED)
for fold_idx, (train_idx, eval_idx) in enumerate(cv.split(train_df.index)):
    # Pair each split name with the positional row indices that belong to it.
    for split, row_idx in (('train', train_idx), ('eval', eval_idx)):
        filename = CV_PATH_FMT.format(fold=fold_idx, split=split)
        os.makedirs(os.path.dirname(filename), exist_ok=True)
        train_df.iloc[row_idx].to_json(filename, orient='records', indent=2)
        print(filename)

cache/slot/cv0/train.json
cache/slot/cv0/eval.json
cache/slot/cv1/train.json
cache/slot/cv1/eval.json
cache/slot/cv2/train.json
cache/slot/cv2/eval.json
cache/slot/cv3/train.json
cache/slot/cv3/eval.json
cache/slot/cv4/train.json
cache/slot/cv4/eval.json
cache/slot/cv5/train.json
cache/slot/cv5/eval.json
cache/slot/cv6/train.json
cache/slot/cv6/eval.json
cache/slot/cv7/train.json
cache/slot/cv7/eval.json
cache/slot/cv8/train.json
cache/slot/cv8/eval.json
cache/slot/cv9/train.json
cache/slot/cv9/eval.json


In [4]:
# Collect every distinct tag seen in the training data; '[PAD]' goes first
# so it receives index 0 in the mapping below.
intents = ['[PAD]'] + sorted({tag for tags in train_df.tags for tag in tags})

# Tag -> integer id, persisted as JSON.
intent2idx = dict(zip(intents, range(len(intents))))
with open('cache/slot/intent2idx.json', 'w') as fp:
    json.dump(intent2idx, fp, indent=2)

print(intents)

['[PAD]', 'B-date', 'B-first_name', 'B-last_name', 'B-people', 'B-time', 'I-date', 'I-people', 'I-time', 'O']


### Download and parse pre-trained embeddings

In [8]:
# reusing data from nb-intent.ipynb

# parse .txt with fasttext/glove embeddings
def parse_embedding_txt(path):
    """Parse a text file of word embeddings (fastText ``.vec`` / GloVe ``.txt``).

    Every data line is expected to look like ``token v1 v2 ... vD``, separated
    by whitespace. The dimensionality ``D`` is taken from the first data line;
    any later line with a different number of fields is skipped (this is how
    tokens that themselves contain whitespace get dropped).

    Lines with fewer than three fields are ignored: blank lines, token-only
    lines and the two-field fastText header (``<count> <dim>``).

    Args:
        path: path to the embedding text file (read as UTF-8).

    Returns:
        dict mapping each token (str) to a 1-D ``np.float32`` array of length D.
        If a token occurs more than once, the last occurrence wins.
    """
    vectors = {}
    dim = 0
    # Read explicitly as UTF-8 rather than the platform default encoding:
    # the vocabularies contain non-ASCII tokens.
    with open(path, encoding='utf-8') as fp:
        for line in fp:
            fields = line.split()
            if len(fields) < 3:
                # blank line, token without values, or the fastText header
                continue
            if dim == 0:
                dim = len(fields) - 1
            elif dim != len(fields) - 1:
                # malformed line, e.g. a token containing whitespace
                continue
            vectors[fields[0]] = np.array(fields[1:], dtype=np.float32)  # will parse strings
    print('Parsed %d x %dd vectors from %s' % (len(vectors), dim, path))
    return vectors

# Parse both pretrained tables, fastText first and then GloVe.
fasttext_vec, glove_vec = map(
    parse_embedding_txt,
    ('cache/crawl-300d-2M.vec', 'cache/glove.840B.300d.txt'),
)
# Quick fix for one fastText/GloVe discrepancy: make 't an alias of n't in GloVe.
glove_vec["'t"] = glove_vec["n't"]

Parsed 1999995 x 300d vectors from cache/crawl-300d-2M.vec
Parsed 2195875 x 300d vectors from cache/glove.840B.300d.txt


### Generate vocab and embedding matrix

In [5]:
import torch

from dataset import basic_tokenizer
from utils import Vocab

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Gather the token sequences of the train and eval splits (test is not used here).
token_lists = [*train_df.tokens, *eval_df.tokens]

# Per-example sequence lengths, only used for the summary below.
lens = [len(toks) for toks in token_lists]

# Full vocabulary: every distinct token, in sorted order.
vocab = Vocab(sorted({tok for toks in token_lists for tok in toks}))
print(f'Full vocab size {len(vocab.tokens)}, max len {max(lens)}')

Full vocab size 4117, max len 35


In [11]:
# task has a large number of tokens for which we don't have any embeddings
# missing pretrained embedding for a token alone is likely a useful signal in this task
# exclude all such tokens from vocabulary so they'd all get mapped to [UNK]

import re

def impute(token, pretrained):
    """Look up `token` in `pretrained` after smoothing over tokenization differences.

    Two normalisations are applied in order before the lookup:
      1. a trailing 's / 'm / 'd / 't is removed (XXX's, i'm, i'd, can't);
      2. every run of the listed punctuation characters is removed.

    Args:
        token: the raw token string.
        pretrained: mapping supporting ``.get`` from token to its vector.

    Returns:
        ``pretrained.get(normalised_token)``: the stored value, or None if absent.
    """
    normalised = token
    for pattern in (
        r"('(s|m|d|t)|)$",  # clitic suffix at the end of the token
        r"[`~!@#$%^&*(){}\[\]\":;,.<>/?+’`]+",  # punctuation anywhere in the token
    ):
        normalised = re.sub(pattern, "", normalised)
    return pretrained.get(normalised)

# Keep a token if at least one of the pretrained tables has a vector for it.
pretrained_tables = (fasttext_vec, glove_vec)
imputed_vocab = {
    token
    for token in vocab.tokens[2:]  # exclude PAD, UNK
    if any(impute(token, table) is not None for table in pretrained_tables)
}

imputed_vocab = Vocab(sorted(imputed_vocab))
print(f'Found embeddings for {len(imputed_vocab.tokens)} of {len(vocab.tokens)} tokens')

Found embeddings for 3167 of 4117 tokens


In [158]:
# From here on, `vocab` is the filtered vocabulary (tokens with a pretrained vector).
vocab = imputed_vocab

# Persist it twice: the pickled Vocab object and a JSON dump of its token list.
with open('cache/slot/vocab.pkl', 'wb') as pkl_fp, open('cache/slot/vocab.json', 'w') as json_fp:
    pickle.dump(vocab, pkl_fp)
    json.dump(vocab.tokens, json_fp, indent=2)

In [13]:
# Build the (vocab size x 600) embedding matrix:
#   columns [0, 300)   <- fastText vector
#   columns [300, 600) <- GloVe vector
# Any half without a pretrained vector keeps its random Gaussian initialisation.
EMB_SEED = 42  # fixed seed so the random fallback values (and embeddings.pt) are reproducible
rng = np.random.default_rng(EMB_SEED)

emb = rng.normal(loc=0.0, scale=0.2, size=(len(vocab.tokens), 600))
# NOTE(review): impute() strips square brackets, so a bracketed special token
# such as "[UNK]" is looked up as "UNK" and may pick up a pretrained vector
# instead of keeping its random init - confirm this is intended.
for token in vocab.tokens:
    i = vocab.token_to_id(token)
    vec = impute(token, fasttext_vec)
    if vec is not None:
        emb[i, :300] = vec
    vec = impute(token, glove_vec)
    if vec is not None:
        emb[i, 300:] = vec

emb[0, :] = 0.  # zero init the padding token

# Stored as float32 for use on the torch side.
emb = torch.tensor(emb, dtype=torch.float32)
torch.save(emb, 'cache/slot/embeddings.pt')
print(emb.shape)

torch.Size([4117, 600])
