In [1]:
import IPython as ipy
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.utils
import torch.optim as optim
import torch.utils.data as data
from tqdm.auto import tqdm

# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print('device:', device)

import itertools as it
import time

device: cuda


In [15]:
def load_emb(path, total=None):
    """Load word embeddings from a whitespace-separated text file.

    Every non-blank line is parsed as ``token v1 v2 ... vd``. Two all-zero
    rows are appended after the file's rows for the special tokens
    ``PAD_TOK`` and ``UNK_TOK``.

    Args:
        path: Path of the embedding text file.
        total: Optional line count, only forwarded to the progress bar.

    Returns:
        A pair ``(tok_to_id, emb)``: a dict mapping each token to its row
        index, and a float tensor with one row per token (file rows first,
        then ``PAD_TOK``, then ``UNK_TOK``).

    Raises:
        ValueError: If the file holds no embeddings or a row's dimension
            differs from the first row's.
        AssertionError: If the file already contains ``PAD_TOK``/``UNK_TOK``.
    """
    toks = []
    embs = []
    dim = None
    # Pin the encoding so parsing does not depend on the platform default.
    with open(path, 'r', encoding='utf-8') as f:
        for l in tqdm(f, path, total=total):
            fields = l.split()
            if not fields:
                # Blank (e.g. trailing) lines carry no embedding.
                continue
            tok, *emb = fields
            emb = [float(x) for x in emb]
            if dim is None:
                dim = len(emb)
            elif len(emb) != dim:
                raise ValueError(
                    f'embedding for {tok!r} has {len(emb)} values, expected {dim}')
            toks.append(tok)
            embs.append(emb)
    if dim is None:
        raise ValueError(f'no embeddings found in {path!r}')
    assert('PAD_TOK' not in toks and 'UNK_TOK' not in toks)
    toks += ['PAD_TOK', 'UNK_TOK']
    # The dimension comes from the first parsed row, not from a leaked loop variable.
    embs += [[0.] * dim, [0.] * dim]
    tok_to_id = dict(zip(toks, it.count()))
    emb = torch.tensor(embs)
    return tok_to_id, emb

In [12]:
# Load the cached GloVe vocabulary and embedding matrix; rebuild the cache from
# the raw GloVe text file when it is missing, so the cell also runs on a fresh
# checkout instead of depending on a previously executed (commented-out) line.
# NOTE: torch.load unpickles the file -- only load caches you created yourself.
# Assumes the data/pt-cache/ directory already exists when the cache is written.
try:
    tok_to_id, glv_emb = torch.load('data/pt-cache/tok_to_id__glv_emb.pt')
except FileNotFoundError:
    tok_to_id, glv_emb = load_emb('data/glove/glove.6B.100d.txt', int(4e5))
    torch.save((tok_to_id, glv_emb), 'data/pt-cache/tok_to_id__glv_emb.pt')

In [77]:
def load_chars(path, total=None):
    """Build a character vocabulary from the third column of a text file.

    Each line is split on whitespace; lines with fewer than three fields
    (e.g. blank sentence separators) are skipped. Every character of the
    third field is added to the vocabulary.

    Ids are assigned deterministically: characters in sorted order, followed
    by the special entries ``PAD_CHR`` and ``UNK_CHR``. (Iterating a set
    directly would give an order that can change between interpreter runs.)

    Args:
        path: Path of the whitespace-separated data file.
        total: Optional line count, only forwarded to the progress bar.

    Returns:
        Dict mapping each character (and the two special entries) to its id.
    """
    chars = set()
    # Pin the encoding so parsing does not depend on the platform default.
    with open(path, 'r', encoding='utf-8') as f:
        for l in tqdm(f, path, total=total):
            fields = l.split()
            # Only a too-short line is expected here; anything else should surface.
            if len(fields) > 2:
                chars.update(fields[2])
    assert('PAD_CHR' not in chars and 'UNK_CHR' not in chars)
    vocab = sorted(chars) + ['PAD_CHR', 'UNK_CHR']
    return dict(zip(vocab, it.count()))

In [79]:
# Load the cached character vocabulary; rebuild it from the raw data when the
# cache is missing, so the cell also runs on a fresh checkout.
# NOTE: torch.load unpickles the file -- only load caches you created yourself.
# Assumes the data/pt-cache/ directory already exists when the cache is written.
try:
    chr_to_id = torch.load('data/pt-cache/chr_to_id.pt')
except FileNotFoundError:
    chr_to_id = load_chars('data/ner-gmb/train+dev.txt')
    torch.save(chr_to_id, 'data/pt-cache/chr_to_id.pt')

In [21]:
def load_classes(path, total=None):
    """Collect the label set from the fourth column of a text file.

    Each line is split on whitespace; lines with fewer than four fields
    (e.g. blank sentence separators) are skipped.

    Ids are assigned deterministically: labels in sorted order, with the
    special ``PAD_LBL`` entry always last (id ``len(lbl_to_id) - 1``).
    (Iterating a set directly would give an order that can change between
    interpreter runs and would not pin ``PAD_LBL`` to any position.)

    Args:
        path: Path of the whitespace-separated data file.
        total: Optional line count, only forwarded to the progress bar.

    Returns:
        A pair ``(lbl_to_id, id_to_lbl)``: dict label -> id and the list of
        labels indexed by id.

    Raises:
        AssertionError: If the data already uses ``PAD_LBL`` as a label.
    """
    labels = set()
    # Pin the encoding so parsing does not depend on the platform default.
    with open(path, 'r', encoding='utf-8') as f:
        for l in tqdm(f, path, total=total):
            fields = l.split()
            # Only a too-short line is expected here; anything else should surface.
            if len(fields) > 3:
                labels.add(fields[3])
    assert('PAD_LBL' not in labels)
    id_to_lbl = sorted(labels) + ['PAD_LBL']
    lbl_to_id = {k: v for v, k in enumerate(id_to_lbl)}
    return lbl_to_id, id_to_lbl

In [24]:
# Load the cached label vocabulary; rebuild it from the training data when the
# cache is missing, so the cell also runs on a fresh checkout.
# NOTE: torch.load unpickles the file -- only load caches you created yourself.
# Assumes the data/pt-cache/ directory already exists when the cache is written.
try:
    lbl_to_id, id_to_lbl = torch.load('data/pt-cache/lbl_to_id__id_to_lbl')
except FileNotFoundError:
    lbl_to_id, id_to_lbl = load_classes('data/ner-gmb/train.txt')
    torch.save((lbl_to_id, id_to_lbl), 'data/pt-cache/lbl_to_id__id_to_lbl')

In [74]:
def load_data(path, tok_to_id, lbl_to_id, chr_to_id, seq_len=128, word_len=64):
    """Read a tagged corpus into padded id tensors.

    The file holds sequences separated by blank lines. Each line of a
    sequence has exactly four single-space-separated fields: the token used
    for the embedding lookup, an ignored field, the word whose characters are
    encoded, and the label.

    Leading/trailing newlines (and a leading byte-order mark) are tolerated,
    so the file does not need to start with a newline or end with a blank
    line for the first/last sequence to be read intact.

    Args:
        path: Path of the corpus file.
        tok_to_id: Token -> id map; must contain ``PAD_TOK`` and ``UNK_TOK``.
        lbl_to_id: Label -> id map; must contain ``PAD_LBL``.
        chr_to_id: Character -> id map; must contain ``PAD_CHR`` and ``UNK_CHR``.
        seq_len: Padded number of tokens per sequence.
        word_len: Padded number of characters per word; longer words are
            truncated to this length.

    Returns:
        ``(X, Y, W)`` long tensors of shape ``(n, seq_len)``, ``(n, seq_len)``
        and ``(n, seq_len, word_len)`` holding token, label and character ids.

    Raises:
        AssertionError: If a sequence has more than ``seq_len`` lines.
        KeyError: If a label is missing from ``lbl_to_id``.
        ValueError: If a line does not have exactly four fields.
    """
    with open(path, 'r', encoding='utf-8') as f:
        text = f.read()
    # Split on blank lines, then drop stray newlines and empty chunks rather
    # than unconditionally discarding the last chunk and the first character.
    chunks = (chunk.strip('\n') for chunk in text.lstrip('\ufeff').split('\n\n'))
    seqs = [chunk for chunk in chunks if chunk]

    unk_tok = tok_to_id['UNK_TOK']
    unk_chr = chr_to_id['UNK_CHR']
    n = len(seqs)
    X = torch.full((n, seq_len), tok_to_id['PAD_TOK'], dtype=torch.long)
    Y = torch.full((n, seq_len), lbl_to_id['PAD_LBL'], dtype=torch.long)
    W = torch.full((n, seq_len, word_len), chr_to_id['PAD_CHR'], dtype=torch.long)

    for i, seq in enumerate(tqdm(seqs, 'sequences')):
        lines = seq.split('\n')
        assert len(lines) <= seq_len, (
            f'sequence {i} has {len(lines)} tokens, more than seq_len={seq_len}')
        tok_ids = []
        lbl_ids = []
        for j, l in enumerate(lines):
            tok, _, wrd, lbl = l.split(' ')
            tok_ids.append(tok_to_id.get(tok, unk_tok))
            lbl_ids.append(lbl_to_id[lbl])
            # Truncate over-long words instead of indexing past word_len.
            chr_ids = [chr_to_id.get(ch, unk_chr) for ch in wrd[:word_len]]
            if chr_ids:
                W[i, j, :len(chr_ids)] = torch.tensor(chr_ids, dtype=torch.long)
        # One slice assignment per sequence instead of one tensor write per token.
        X[i, :len(tok_ids)] = torch.tensor(tok_ids, dtype=torch.long)
        Y[i, :len(lbl_ids)] = torch.tensor(lbl_ids, dtype=torch.long)
    return X, Y, W

In [83]:
# Load the cached training tensors; rebuild them from the raw corpus when the
# cache is missing, so the cell also runs on a fresh checkout.
# NOTE: torch.load unpickles the file -- only load caches you created yourself.
# Assumes the data/pt-cache/ directory already exists when the cache is written.
try:
    train_X, train_Y, train_W = torch.load('data/pt-cache/train_X__train_Y__train_W.pt')
except FileNotFoundError:
    train_X, train_Y, train_W = load_data('data/ner-gmb/train.txt', tok_to_id, lbl_to_id, chr_to_id)
    torch.save((train_X, train_Y, train_W), 'data/pt-cache/train_X__train_Y__train_W.pt')

In [84]:
# Load the cached dev tensors; rebuild them from the raw corpus when the cache
# is missing, so the cell also runs on a fresh checkout.
# NOTE: torch.load unpickles the file -- only load caches you created yourself.
# Assumes the data/pt-cache/ directory already exists when the cache is written.
try:
    dev_X, dev_Y, dev_W = torch.load('data/pt-cache/dev_X__dev_Y__dev_W.pt')
except FileNotFoundError:
    dev_X, dev_Y, dev_W = load_data('data/ner-gmb/dev.txt', tok_to_id, lbl_to_id, chr_to_id)
    torch.save((dev_X, dev_Y, dev_W), 'data/pt-cache/dev_X__dev_Y__dev_W.pt')

In [86]:
# Load the cached test tensors; rebuild them from the raw corpus when the cache
# is missing, so the cell also runs on a fresh checkout.
# NOTE: torch.load unpickles the file -- only load caches you created yourself.
# Assumes the data/pt-cache/ directory already exists when the cache is written.
try:
    test_X, test_Y, test_W = torch.load('data/pt-cache/test_X__test_Y__test_W.pt')
except FileNotFoundError:
    test_X, test_Y, test_W = load_data('data/ner-gmb/test.txt', tok_to_id, lbl_to_id, chr_to_id)
    torch.save((test_X, test_Y, test_W), 'data/pt-cache/test_X__test_Y__test_W.pt')

In [88]:
class NERModel(nn.Module):
    """Sequence tagger composed of an embedding model and a tagging model.

    Args:
        embed_model: Module turning the raw input(s) into per-token features.
        seq_tag_model: Module turning those features into per-token class scores.
        pad_lbl_id: Label id that the loss ignores (padding positions).
    """

    def __init__(self, embed_model, seq_tag_model, pad_lbl_id):
        super().__init__()
        self.embed_model = embed_model
        self.seq_tag_model = seq_tag_model
        self.cross_entropy_loss = nn.CrossEntropyLoss(ignore_index=pad_lbl_id)

    def forward(self, X, *extra_inputs):
        """Return class scores for the input(s).

        All positional inputs are forwarded to ``embed_model`` so that
        embedding models taking more than one tensor can be used as well.
        """
        return self.seq_tag_model(self.embed_model(X, *extra_inputs))

    def predict(self, X, *extra_inputs):
        """Return the arg-max class id along the last axis, without gradients.

        The model is evaluated in eval mode; every submodule's previous
        train/eval flag is restored afterwards so that calling ``predict``
        in the middle of training does not silently disable dropout.
        """
        previous_modes = [(module, module.training) for module in self.modules()]
        self.eval()
        try:
            with torch.no_grad():
                return torch.argmax(self(X, *extra_inputs), dim=-1)
        finally:
            for module, was_training in previous_modes:
                module.training = was_training

    def criterion(self, Y, Y_hat):
        """Cross-entropy between targets ``Y`` and scores ``Y_hat``.

        Axes 1 and 2 of ``Y_hat`` are swapped before the loss is applied;
        assumes ``Y_hat`` is (batch, seq, classes) so that the class axis ends
        up at position 1 as ``nn.CrossEntropyLoss`` expects -- TODO confirm.
        """
        return self.cross_entropy_loss(Y_hat.transpose(1,2), Y)

In [89]:
class SeqTagModel(nn.Module):
    """Dropout -> bidirectional LSTM -> linear layer producing per-step scores.

    The initial hidden and cell states are learned parameters (one vector per
    direction) shared by every sequence in the batch.

    Args:
        input_size: Feature size of each input step.
        hidden_size: Hidden size of each LSTM direction.
        output_size: Number of scores produced per step.
        dropout_prob: Dropout probability applied to the input.
    """

    def __init__(self, input_size, hidden_size, output_size, dropout_prob):
        super().__init__()
        self.dropout = nn.Dropout(dropout_prob)
        self.h0 = nn.Parameter(torch.zeros(2, hidden_size))
        self.c0 = nn.Parameter(torch.zeros(2, hidden_size))
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True, bidirectional=True)
        self.linear = nn.Linear(2*hidden_size, output_size)

    def forward(self, X):
        """Map ``X`` of shape (batch, seq, input_size) to (batch, seq, output_size)."""
        D = self.dropout(X)
        batch_size = D.shape[0]
        # nn.LSTM wants initial states shaped (directions, batch, hidden) even
        # with batch_first=True, so the batch axis is inserted at position 1.
        h0 = self.h0.unsqueeze(1).expand(-1, batch_size, -1).contiguous()
        c0 = self.c0.unsqueeze(1).expand(-1, batch_size, -1).contiguous()
        H, _ = self.lstm(D, (h0, c0))
        return self.linear(H)

In [90]:
class ChrEmbModel(nn.Module):
    """Character-level word encoder: embedding followed by a bidirectional LSTM.

    Each word (a row of character ids) is embedded and run through the LSTM;
    the final hidden states of both directions are concatenated into one
    feature vector of size ``2 * hidden_size`` per word.

    Args:
        n_embs: Number of character ids.
        pad_chr_id: Id of the padding character (its embedding is not trained).
        input_size: Size of each character embedding (the LSTM input size).
        hidden_size: Hidden size of each LSTM direction.
        output_size: Currently unused; kept so existing call sites still work.
            NOTE(review): the produced feature size is ``2 * hidden_size``,
            not ``output_size`` -- confirm whether a projection was intended.
    """

    def __init__(self, n_embs, pad_chr_id, input_size, hidden_size, output_size):
        super().__init__()
        self.embedding = nn.Embedding(n_embs, input_size, padding_idx=pad_chr_id)
        self.h0 = nn.Parameter(torch.zeros(2, hidden_size))
        self.c0 = nn.Parameter(torch.zeros(2, hidden_size))
        # The LSTM consumes the character embeddings, so its input size is input_size.
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True, bidirectional=True)

    def forward(self, W):
        """Encode character ids ``W`` of shape (..., word_len) into (..., 2 * hidden_size)."""
        X = W.reshape(-1,W.shape[-1])
        E = self.embedding(X)
        n_words = E.shape[0]
        # nn.LSTM wants initial states shaped (directions, batch, hidden) even
        # with batch_first=True, so the batch axis is inserted at position 1.
        h0 = self.h0.unsqueeze(1).expand(-1, n_words, -1).contiguous()
        c0 = self.c0.unsqueeze(1).expand(-1, n_words, -1).contiguous()
        _, (H, _) = self.lstm(E, (h0, c0))
        # H is (directions, n_words, hidden): bring the word axis first so each
        # word keeps its own forward/backward states, then merge the two.
        features = H.transpose(0, 1).reshape(n_words, 2 * self.lstm.hidden_size)
        # Padding positions are fed through the LSTM like any other character.
        return features.reshape(*W.shape[:-1], features.shape[-1])

In [91]:
class ChrTokEmbModel(nn.Module):
    """Concatenate character-level and token-level features along the last axis.

    Args:
        chr_emb_model: Module applied to the character input ``W``.
        tok_emb_model: Module applied to the token input ``X``.
    """

    def __init__(self, chr_emb_model, tok_emb_model):
        super().__init__()
        self.chr_emb_model = chr_emb_model
        self.tok_emb_model = tok_emb_model

    def forward(self, W, X):
        """Return ``chr_emb_model(W)`` and ``tok_emb_model(X)`` joined on dim -1."""
        char_features = self.chr_emb_model(W)
        token_features = self.tok_emb_model(X)
        return torch.cat([char_features, token_features], dim=-1)

In [42]:
# NERModel takes (embed_model, seq_tag_model, pad_lbl_id); the previous call
# passed four scalars (sizes and a dropout probability) and raised a TypeError.
# Those scalars match SeqTagModel's signature, so they are routed there.
# NOTE(review): the embedding choice below is a reconstruction -- pretrained
# GloVe vectors, frozen (from_pretrained's default); confirm the intended setup.
tok_emb_model = nn.Embedding.from_pretrained(glv_emb, padding_idx=tok_to_id['PAD_TOK'])
# One output score per entry of lbl_to_id, so every label id is a valid class
# index wherever PAD_LBL sits in the mapping; PAD_LBL targets are ignored by the loss.
seq_tag_model = SeqTagModel(glv_emb.shape[1], 100, len(lbl_to_id), 0.5)
net = NERModel(tok_emb_model, seq_tag_model, lbl_to_id['PAD_LBL'])

In [43]:
# Plain SGD with a fixed learning rate over all parameters of `net`.
opt = optim.SGD(params=net.parameters(), lr=0.01)