In [139]:
import nltk
from sklearn.metrics import classification_report, r2_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
import torch
import torch.nn as nn

#from torch_model_base import TorchModelBase
#from torch_shallow_neural_classifier import TorchShallowNeuralClassifier
from torch_rnn_classifier import TorchRNNDataset, TorchRNNClassifier, TorchRNNModel
import utils

import pandas as pd
from collections import Counter

In [140]:
# refresh torch rnn classifier:
import importlib
import torch_rnn_classifier
importlib.reload(torch_rnn_classifier)
from torch_rnn_classifier import TorchRNNDataset

In [3]:
import json

In [4]:
with open('annotations2.jsonl') as jsonl_file:
    # note: after running data-preprocessing.ipynb this file already has token-level labels
    lines = jsonl_file.readlines()
annot = [json.loads(line) for line in lines]

In [5]:
# now get data into format that TorchRNN expects:
X=[] 
y=[]
for j in range(0,len(annot)):
    a = annot[j]['tokens']
    auxX = []
    auxy = []
    if annot[j]['spans']!=[]: # are there annot for this example?
        for i in range(0,len(a)):
            #token_element = (a[i]['text'],a[i]['label'])
            auxX.append(a[i]['text'])
            auxy.append(a[i]['label'])
        X.append(auxX)
        y.append(auxy)
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
X_train, X_test, y_train, y_test = X[:120], X[120:], y[:120], y[120:]
vocab = sorted({w for seq in X_train for w in seq}) + ["$UNK"]

In [675]:
# reload vsm module
import torch_rnn_classifier, torch_model_base
import importlib
importlib.reload(torch_model_base)
importlib.reload(torch_rnn_classifier)
from torch_model_base import TorchModelBase
from torch_rnn_classifier import TorchRNNClassifier, TorchRNNModel, TorchRNNDataset

In [676]:
class TorchRNNSequenceLabeler(TorchRNNClassifier):

    def build_graph(self): # uses this build_graph instead of TorchRNNClassifier.build_graph
        print("here0")
        rnn = TorchRNNModel(
            vocab_size=len(self.vocab),
            embedding=self.embedding,
            use_embedding=self.use_embedding,
            embed_dim=self.embed_dim,
            rnn_cell_class=self.rnn_cell_class,
            hidden_dim=self.hidden_dim,
            bidirectional=self.bidirectional,
            freeze_embedding=self.freeze_embedding)
        print("here02")
        model = TorchSequenceLabeler( # this defines self.model
            rnn=rnn,
            output_dim=self.n_classes_)
        self.embed_dim = rnn.embed_dim
        return model

    def build_dataset(self, X, y=None):
        START_TAG = "<START>"
        STOP_TAG = "<STOP>"
        X, seq_lengths = self._prepare_sequences(X) # converts tokens into tokenIds
        if y is None:
            return TorchRNNDataset(X, seq_lengths)
        else:
            # These are the changes from a regular classifier. All
            # concern the fact that our labels are sequences of labels.
            self.classes_ = sorted({x for seq in y for x in seq})
            self.n_classes_ = len(self.classes_)
            class2index = dict(zip(self.classes_, range(self.n_classes_)))
            # `y` is a list of tensors of different length. Our Dataset
            # class will turn it into a padding tensor for processing.
            y = [torch.tensor([class2index[label] for label in seq])
                 for seq in y] # converts labels to indices
            return TorchRNNDataset(X, seq_lengths, y)

    def predict_proba(self, X):
        seq_lengths = [len(ex) for ex in X]
        # The base class does the heavy lifting:
        preds = self._predict(X)
        # Trim to the actual sequence lengths:
        preds = [p[: l] for p, l in zip(preds, seq_lengths)]
        # Use `softmax`; the model doesn't do this because the loss
        # function does it internally.
        probs = [torch.softmax(seq, dim=1) for seq in preds]
        return probs

    def predict(self, X): #out: list of lists with text labels of predictions
        probs = self.predict_proba(X)
        return [[self.classes_[i] for i in seq.argmax(axis=1)] for seq in probs] # seq.argmax(axis=1) gives index of col that maximizes softmax prob
        # see difference vs TorchRNNClassifier.predict

    def score(self, X, y):
        preds = self.predict(X)
        flat_preds = [x for seq in preds for x in seq]
        flat_y = [x for seq in y for x in seq]
        return utils.safe_macro_f1(flat_y, flat_preds)
    

    
    
class TorchSequenceLabeler(nn.Module): # no self.hidden_layer or self.classifier_activation as TorchRNNClassifierModel
    def __init__(self, rnn, output_dim):
        print("here021")
        super().__init__()
        self.rnn = rnn
        self.output_dim = output_dim
        if self.rnn.bidirectional:
            self.classifier_dim = self.rnn.hidden_dim * 2
        else:
            self.classifier_dim = self.rnn.hidden_dim
        self.classifier_layer = nn.Linear(
            self.classifier_dim, self.output_dim)

    def forward(self, X, seq_lengths): # X is (noExsInBatch,MaxLen)=(108,117), seq_lengths is the number of tokens in each example in each batch
        # Out are logits - probs of each token for each class; logits are (108,117,12) or (1,11,5) = (batchSize,MaxLen of examples in batch,noLabelClasses) noLabelClasses include Start + End
        # this is the forward method of self.model
        print("here2")
        outputs, state = self.rnn(X, seq_lengths) # X is (batchSize, maxLen of exs in batch); outputs is (noTokensInEx,hiddDim), state is ((batch_size,1,hiddDim),(batch_size,1,hiddDim)) = (finalHiddState,finalCellState) 
       # print("out1")
        #print(state[0].data.shape)
        #print(state[1].data.shape)
        outputs, seq_length = torch.nn.utils.rnn.pad_packed_sequence(
            outputs, batch_first=True) # outputs is (batchSize,MaxLen of examples in batch,hidden_dim); seq_length is noTokenInEx for each ex in batch
       # print("out2")
        print(outputs.data.shape)
        #print(seq_length)
        logits = self.classifier_layer(outputs) # this is an FCL from hidden_dim to output_dim (NoLabelClasses)
        # During training, we need to swap the dimensions of logits
        # to accommodate `nn.CrossEntropyLoss`:
        if self.training:
            return logits.transpose(1, 2) # transpose dimensions 1 and 2 w/ each other (3d array) # outputs (108,12,117) or (1,5,11)
        else:
            return logits

In [6]:
X_train = ["the wall street journal reported today that apple corporation made money".split(),"georgia tech is a university in georgia".split()]
y_train = ["B I I I O O O B I O O".split(),"B I O O O O B".split()]
vocab = sorted({w for seq in X_train for w in seq}) + ["$UNK"]

In [677]:
seq_mod = TorchRNNSequenceLabeler(
    vocab,
    early_stopping=True,
    eta=0.001)

In [187]:
graph0 = seq_mod.build_graph()
print(graph0)

here0
here01
here02
here021
TorchSequenceLabeler(
  (rnn): TorchRNNModel(
    (embedding): Embedding(1049, 50)
    (rnn): LSTM(50, 50, batch_first=True)
  )
  (classifier_layer): Linear(in_features=50, out_features=12, bias=True)
)


In [678]:
%time _ = seq_mod.fit(X_train, y_train)

Finished epoch 3 of 1000; error is 2.481832504272461

here00
here0
here02
here021
batch1
here-model
here2
torch.Size([108, 92, 50])
err
tensor(2.5070, device='cuda:0', grad_fn=<NllLoss2DBackward>)
here2
torch.Size([12, 117, 50])
batch1
here-model
here2
torch.Size([108, 92, 50])
err
tensor(2.4944, device='cuda:0', grad_fn=<NllLoss2DBackward>)
here2
torch.Size([12, 117, 50])
batch1
here-model
here2
torch.Size([108, 92, 50])
err
tensor(2.4818, device='cuda:0', grad_fn=<NllLoss2DBackward>)
here2
torch.Size([12, 117, 50])


Finished epoch 7 of 1000; error is 2.4303898811340336

batch1
here-model
here2
torch.Size([108, 92, 50])
err
tensor(2.4692, device='cuda:0', grad_fn=<NllLoss2DBackward>)
here2
torch.Size([12, 117, 50])
batch1
here-model
here2
torch.Size([108, 92, 50])
err
tensor(2.4564, device='cuda:0', grad_fn=<NllLoss2DBackward>)
here2
torch.Size([12, 117, 50])
batch1
here-model
here2
torch.Size([108, 92, 50])
err
tensor(2.4435, device='cuda:0', grad_fn=<NllLoss2DBackward>)
here2
torch.Size([12, 117, 50])
batch1
here-model
here2
torch.Size([108, 92, 50])
err
tensor(2.4304, device='cuda:0', grad_fn=<NllLoss2DBackward>)
here2
torch.Size([12, 117, 50])


Finished epoch 10 of 1000; error is 2.389024257659912

batch1
here-model
here2
torch.Size([108, 92, 50])
err
tensor(2.4170, device='cuda:0', grad_fn=<NllLoss2DBackward>)
here2
torch.Size([12, 117, 50])
batch1
here-model
here2
torch.Size([108, 92, 50])
err
tensor(2.4032, device='cuda:0', grad_fn=<NllLoss2DBackward>)
here2
torch.Size([12, 117, 50])
batch1
here-model
here2
torch.Size([108, 92, 50])
err
tensor(2.3890, device='cuda:0', grad_fn=<NllLoss2DBackward>)
here2
torch.Size([12, 117, 50])
batch1
here-model
here2
torch.Size([108, 92, 50])
err
tensor(2.3743, device='cuda:0', grad_fn=<NllLoss2DBackward>)
here2
torch.Size([12, 117, 50])


Finished epoch 15 of 1000; error is 2.3074116706848145

batch1
here-model
here2
torch.Size([108, 92, 50])
err
tensor(2.3589, device='cuda:0', grad_fn=<NllLoss2DBackward>)
here2
torch.Size([12, 117, 50])
batch1
here-model
here2
torch.Size([108, 92, 50])
err
tensor(2.3427, device='cuda:0', grad_fn=<NllLoss2DBackward>)
here2
torch.Size([12, 117, 50])
batch1
here-model
here2
torch.Size([108, 92, 50])
err
tensor(2.3256, device='cuda:0', grad_fn=<NllLoss2DBackward>)
here2
torch.Size([12, 117, 50])
batch1
here-model
here2
torch.Size([108, 92, 50])
err
tensor(2.3074, device='cuda:0', grad_fn=<NllLoss2DBackward>)
here2
torch.Size([12, 117, 50])


Finished epoch 19 of 1000; error is 2.2204887866973877

batch1
here-model
here2
torch.Size([108, 92, 50])
err
tensor(2.2880, device='cuda:0', grad_fn=<NllLoss2DBackward>)
here2
torch.Size([12, 117, 50])
batch1
here-model
here2
torch.Size([108, 92, 50])
err
tensor(2.2672, device='cuda:0', grad_fn=<NllLoss2DBackward>)
here2
torch.Size([12, 117, 50])
batch1
here-model
here2
torch.Size([108, 92, 50])
err
tensor(2.2448, device='cuda:0', grad_fn=<NllLoss2DBackward>)
here2
torch.Size([12, 117, 50])
batch1
here-model
here2
torch.Size([108, 92, 50])
err
tensor(2.2205, device='cuda:0', grad_fn=<NllLoss2DBackward>)
here2
torch.Size([12, 117, 50])


Stopping after epoch 23. Validation score did not improve by tol=1e-05 for more than 10 epochs. Final error is 2.1011080741882324

batch1
here-model
here2
torch.Size([108, 92, 50])
err
tensor(2.1942, device='cuda:0', grad_fn=<NllLoss2DBackward>)
here2
torch.Size([12, 117, 50])
batch1
here-model
here2
torch.Size([108, 92, 50])
err
tensor(2.1656, device='cuda:0', grad_fn=<NllLoss2DBackward>)
here2
torch.Size([12, 117, 50])
batch1
here-model
here2
torch.Size([108, 92, 50])
err
tensor(2.1347, device='cuda:0', grad_fn=<NllLoss2DBackward>)
here2
torch.Size([12, 117, 50])
batch1
here-model
here2
torch.Size([108, 92, 50])
err
tensor(2.1011, device='cuda:0', grad_fn=<NllLoss2DBackward>)
here2
torch.Size([12, 117, 50])
Wall time: 1.19 s


In [3]:
# reload vsm module
import torch_rnn_classifier, torch_model_base
import importlib
importlib.reload(torch_model_base)
importlib.reload(torch_rnn_classifier)
from torch_model_base import TorchModelBase
from torch_rnn_classifier import TorchRNNClassifier, TorchRNNModel, TorchRNNDataset

In [102]:
from torchcrf import CRF

class TorchCRFSequenceLabeler_1(TorchRNNClassifier):

    def build_graph(self): # uses this build_graph instead of TorchRNNClassifier.build_graph
        print("here0")
        rnn = TorchRNNModel(
            vocab_size=len(self.vocab),
            embedding=self.embedding,
            use_embedding=self.use_embedding,
            embed_dim=self.embed_dim,
            rnn_cell_class=self.rnn_cell_class,
            hidden_dim=self.hidden_dim,
            bidirectional=self.bidirectional,
            freeze_embedding=self.freeze_embedding)
        print("here02")
        model = TorchSequenceLabeler_forCRF_1( # this defines self.model
            rnn=rnn,
            output_dim=self.n_classes_)
        self.embed_dim = rnn.embed_dim
        return model

    def build_dataset(self, X, y=None):
        START_TAG = "<START>"
        STOP_TAG = "<STOP>"
        X, seq_lengths = self._prepare_sequences(X) # converts tokens into tokenIds
        if y is None:
            return TorchRNNDataset(X, seq_lengths)
        else:
            # These are the changes from a regular classifier. All
            # concern the fact that our labels are sequences of labels.
            self.classes_ = sorted({x for seq in y for x in seq})
            self.n_classes_ = len(self.classes_)
            class2index = dict(zip(self.classes_, range(self.n_classes_)))
            #class2index = dict(zip(self.classes_, range(2,2+self.n_classes_)))
            #class2index[STOP_TAG]=0    # add start and stop tags (note: stop needs to be 0 as that is default for padding in collate_fn)
            #class2index[START_TAG]=1 
            # `y` is a list of tensors of different length. Our Dataset
            # class will turn it into a padding tensor for processing.
            y = [torch.tensor([class2index[label] for label in seq])
                 for seq in y] # converts labels to indices
            return TorchRNNDataset(X, seq_lengths, y)

    def predict_proba(self, X):
        seq_lengths = [len(ex) for ex in X]
        # The base class does the heavy lifting:
        preds = self._predict(X)
        # Trim to the actual sequence lengths:
        preds = [p[: l] for p, l in zip(preds, seq_lengths)]
        # Use `softmax`; the model doesn't do this because the loss
        # function does it internally.
        probs = [torch.softmax(seq, dim=1) for seq in preds]
        return probs

    def predict(self, X):
        probs = self.predict_proba(X)
        return [[self.classes_[i] for i in seq.argmax(axis=1)] for seq in probs] # seq.argmax(axis=1) gives index of col that maximizes softmax prob
        # see difference vs TorchRNNClassifier.predict

    def score(self, X, y):
        preds = self.predict(X)
        flat_preds = [x for seq in preds for x in seq]
        flat_y = [x for seq in y for x in seq]
        return utils.safe_macro_f1(flat_y, flat_preds)
    
    def nClasses(self):
        return len(self.classes_)


In [103]:
class TorchSequenceLabeler_forCRF_1(nn.Module): # no self.hidden_layer or self.classifier_activation as TorchRNNClassifierModel
    def __init__(self, rnn, output_dim):
        print("here021")
        super().__init__()
        self.rnn = rnn
        self.output_dim = output_dim
        if self.rnn.bidirectional:
            self.classifier_dim = self.rnn.hidden_dim * 2
        else:
            self.classifier_dim = self.rnn.hidden_dim
        self.classifier_layer = nn.Linear(
            self.classifier_dim, self.output_dim)

    def forward(self, X, seq_lengths): # X is (noExsInBatch,MaxLen)=(108,117), seq_lengths is the number of tokens in each example in each batch
        # this is the forward method of self.model
        print("here2")
        outputs, state = self.rnn(X, seq_lengths) # X is (batchSize, maxLen of exs in batch); outputs is (noTokensInEx,hiddDim), state is ((batch_size,1,hiddDim),(batch_size,1,hiddDim)) = (finalHiddState,finalCellState) 
        outputs, seq_length = torch.nn.utils.rnn.pad_packed_sequence(
            outputs, batch_first=True) # outputs is (batchSize,MaxLen of examples in batch,hidden_dim); seq_length is noTokenInEx for each ex in batch
        logits = self.classifier_layer(outputs) # this is an FCL from hidden_dim to output_dim (NoLabelClasses)
        # logits are (108,117,12) or (1,11,5) = (batchSize,MaxLen of examples in batch,noLabelClasses) noLabelClasses include Start + End
        # During training, we need to swap the dimensions of logits
        # to accommodate `nn.CrossEntropyLoss`:
        if self.training:
            return logits.transpose(1, 2) # transpose dimensions 1 and 2 w/ each other (3d array) # outputs (108,12,117) or (1,5,11)
        else:
            return logits

In [234]:
# reload vsm module
import torch_rnn_classifier, torch_model_base
import importlib
importlib.reload(torch_model_base)
importlib.reload(torch_rnn_classifier)
from torch_model_base import TorchModelBase
from torch_rnn_classifier import TorchRNNClassifier, TorchRNNModel, TorchRNNDataset

In [239]:
from torchcrf import CRF

class TorchCRFSequenceLabeler_2(TorchRNNClassifier):

    def build_graph(self): # uses this build_graph instead of TorchRNNClassifier.build_graph
        print("here0")
        rnn = TorchRNNModel(
            vocab_size=len(self.vocab),
            embedding=self.embedding,
            use_embedding=self.use_embedding,
            embed_dim=self.embed_dim,
            rnn_cell_class=self.rnn_cell_class,
            hidden_dim=self.hidden_dim,
            bidirectional=self.bidirectional,
            freeze_embedding=self.freeze_embedding)
        self.crf = CRF(self.n_classes_,batch_first=True)
        print("here02")
        model = TorchSequenceLabeler_forCRF_2( # this defines self.model
            rnn=rnn,
            output_dim=self.n_classes_,
            crf=self.crf)
        print("here002")
        self.embed_dim = rnn.embed_dim
        return model

    def build_dataset(self, X, y=None):
        X, seq_lengths = self._prepare_sequences(X) # converts tokens into tokenIds
        if y is None:
            return TorchRNNDataset(X, seq_lengths)
        else:
            # These are the changes from a regular classifier. All
            # concern the fact that our labels are sequences of labels.
            self.classes_ = sorted({x for seq in y for x in seq})
            self.n_classes_ = len(self.classes_)
            class2index = dict(zip(self.classes_, range(self.n_classes_)))
            #class2index = dict(zip(self.classes_, range(2,2+self.n_classes_)))
            #class2index[STOP_TAG]=0    # add start and stop tags (note: stop needs to be 0 as that is default for padding in collate_fn)
            #class2index[START_TAG]=1 
            # `y` is a list of tensors of different length. Our Dataset
            # class will turn it into a padding tensor for processing.
            y = [torch.tensor([class2index[label] for label in seq])
                 for seq in y] # converts labels to indices
            return TorchRNNDataset(X, seq_lengths, y)

    def predict_proba(self, X):
        seq_lengths = [len(ex) for ex in X]
        # The base class does the heavy lifting:
        preds = self._predict(X)
        # Trim to the actual sequence lengths:
        preds = [p[: l] for p, l in zip(preds, seq_lengths)]
        # Use `softmax`; the model doesn't do this because the loss
        # function does it internally.
        probs = [torch.softmax(seq, dim=1) for seq in preds]
        return probs

    def predict(self, X): # for CRF-RNN X are logits from RNN
       # probs = self.predict_proba(X)
       # return [[self.classes_[i] for i in seq.argmax(axis=1)] for seq in probs] # seq.argmax(axis=1) gives index of col that maximizes softmax prob
        seq_lengths = [len(ex) for ex in X]
        preds = self._predict(X)
        # Trim to the actual sequence lengths:
        preds = [p[: l] for p, l in zip(preds, seq_lengths)]        
        mask=self.create_mask(seq_lengths) # creates mask matrix (1s are obs used in CRF; 0s are discarded)  
        print("pred")
        print(X.shape)
        print(mask.shape)
       # tag_seq = self.crf.decode(X,mask=mask) # note: X is (nExs,maxTokLen) and here input must be (nExs,maxTokLen,nDistinctTags)
        # [[self.classes_[i] for i in seq] for seq in tag_seq]
        return 0
        # see difference vs TorchRNNClassifier.predict

    def score(self, X, y):
       # preds = self.predict(X)
       # flat_preds = [x for seq in preds for x in seq]
       # flat_y = [x for seq in y for x in seq]
       # return utils.safe_macro_f1(flat_y, flat_preds)
        seq_lengths = [len(ex) for ex in X]
        mask=self.create_mask(seq_lengths) # creates mask matrix (1s are obs used in CRF; 0s are discarded)
        return self.crf(logits, y, mask=mask) # no negative sign here as we want to max likelihood     
    
    def nClasses(self):
        return len(self.classes_)
    
    def create_mask(self, seq_length):
        maxLen=max(seq_length)
        auxLen=len(seq_length)
        auxOne = torch.ones(maxLen)
        auxZero = torch.zeros(maxLen)
        auxOne_l=[1]*maxLen
        auxZero_l=[0]*maxLen
        auxMatrix=[]
        for i in range(auxLen):
            auxRow=auxOne_l[:seq_length[i]]+auxZero_l[seq_length[i]:]
            auxMatrix.append(auxRow)
        return torch.tensor(auxMatrix,dtype=torch.uint8)  

In [240]:
class TorchSequenceLabeler_forCRF_2(nn.Module): # no self.hidden_layer or self.classifier_activation as TorchRNNClassifierModel
    def __init__(self, rnn, output_dim, crf):
        print("here021")
        super().__init__()
        self.rnn = rnn
        self.output_dim = output_dim
        if self.rnn.bidirectional:
            self.classifier_dim = self.rnn.hidden_dim * 2
        else:
            self.classifier_dim = self.rnn.hidden_dim
        self.classifier_layer = nn.Linear(
            self.classifier_dim, self.output_dim)
        self.crf = crf

    def forward(self, X, seq_lengths): # X is (noExsInBatch,MaxLen)=(108,117), seq_lengths is the number of tokens in each example in each batch
        # this is the forward method of self.model
        print("here2")
        outputs, state = self.rnn(X, seq_lengths) # X is (batchSize, maxLen of exs in batch); outputs is (noTokensInEx,hiddDim), state is ((batch_size,1,hiddDim),(batch_size,1,hiddDim)) = (finalHiddState,finalCellState) 
        outputs, seq_length = torch.nn.utils.rnn.pad_packed_sequence(
            outputs, batch_first=True) # outputs is (batchSize,MaxLen of examples in batch,hidden_dim); seq_length is noTokenInEx for each ex in batch
        logits = self.classifier_layer(outputs) # this is an FCL from hidden_dim to output_dim (NoLabelClasses)
       # print(logits.shape)
        # logits are (108,117,12) or (1,11,5) = (batchSize,MaxLen of examples in batch,noLabelClasses) noLabelClasses include Start + End
        # During training, we need to swap the dimensions of logits
        # to accommodate `nn.CrossEntropyLoss`:
       # if self.training:
       #     return logits.transpose(1, 2) # transpose dimensions 1 and 2 w/ each other (3d array) # outputs (108,12,117) or (1,5,11)
       # else:
       #     return logits
        device = "cuda" if torch.cuda.is_available() else "cpu"
        mask = (self.create_mask(seq_lengths)).to(device, non_blocking=True)
        print("fwd")
        print(logits.shape)
        print(mask.shape)
        #tag_seqs = self.crf.decode(logits, mask=mask) # most likely tag sequences
        return self.crf.forward(logits, mask=mask)
    
    def create_mask(self, seq_length):
        maxLen=max(seq_length)
        auxLen=len(seq_length)
        auxOne = torch.ones(maxLen)
        auxZero = torch.zeros(maxLen)
        auxOne_l=[1]*maxLen
        auxZero_l=[0]*maxLen
        auxMatrix=[]
        for i in range(auxLen):
            auxRow=auxOne_l[:seq_length[i]]+auxZero_l[seq_length[i]:]
            auxMatrix.append(auxRow)
        return torch.tensor(auxMatrix,dtype=torch.uint8)      

In [175]:
# Following converts words to indices and pads sequences
seq_mod1 = TorchCRFSequenceLabeler_1(
    vocab,
    early_stopping=True,
    eta=0.001)

In [None]:
%time _ = seq_mod1.fit(X_train, y_train)

In [109]:
def create_mask(seq_length):
    maxLen=max(seq_length)
    auxLen=len(seq_length)
    auxOne = torch.ones(maxLen)
    auxZero = torch.zeros(maxLen)
    auxOne_l=[1]*maxLen
    auxZero_l=[0]*maxLen
    auxMatrix=[]
    for i in range(auxLen):
        auxRow=auxOne_l[:seq_length[i]]+auxZero_l[seq_length[i]:]
        auxMatrix.append(auxRow)
    return torch.tensor(auxMatrix,dtype=torch.uint8)
#check:
#out=create_mask(seq_length)
#print(out)
#print(torch.sum(out,dim=1))

#Note: before was doing this but can't use this as not guaranteed that label[STOP]=0
    # a=torch.full_like(y,0,dtype=torch.uint8)
    # b=torch.full_like(y,1,dtype=torch.uint8)
    # mask=torch.where(y==0,a,b) 

In [110]:
########## SAMPLE CRF ON REAL DATA
import torch.optim as optim

torch.manual_seed(1)
dataset = seq_mod1.build_dataset(X_train, y_train) 
dataloader = seq_mod1._build_dataloader(dataset, shuffle=False) 
graph = seq_mod1.build_graph()
num_tags = seq_mod1.nClasses()
model_CRF = CRF(num_tags,batch_first=True)

optimizer = optim.SGD(model_CRF.parameters(), lr=0.01, weight_decay=1e-4)

for batch_num, batch in enumerate(dataloader, start=1):
    x=batch[0]   
    seq_length=batch[1]
    y=batch[2] 
    logits = graph.forward(x,seq_length).transpose(1,2)
    print(logits.shape)
    
    # CRF piece:
    mask=create_mask(seq_length) # creates mask matrix (1s are obs used in CRF; 0s are discarded)
    loss=-model_CRF(logits, y, mask=mask) # in this way minimizing loss means max log likelihood (i.e. we're converting to NLL)
    print(loss)
    loss.backward()
    optimizer.step()
    print(model_CRF.decode(logits,mask=mask))


here0
here01
here02
here021
here2
here21
torch.Size([120, 117, 12])
tensor(16929.6426, grad_fn=<NegBackward>)
[[6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6], [6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6], [6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6], [6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6], [6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6

In [111]:
print(graph)

TorchSequenceLabeler_forCRF_1(
  (rnn): TorchRNNModel(
    (embedding): Embedding(1049, 50)
    (rnn): LSTM(50, 50, batch_first=True)
  )
  (classifier_layer): Linear(in_features=50, out_features=12, bias=True)
)


In [237]:
# Following converts words to indices and pads sequences
seq_mod2 = TorchCRFSequenceLabeler_2(
    vocab,
    early_stopping=True,
    eta=0.001)

In [245]:
%time _ = seq_mod2.fit(X_train, y_train)

here00
here0
here01
here02
here021
here002
batch1
here-model
here2
here21
fwd
torch.Size([108, 117, 12])
torch.Size([108, 117])


TypeError: forward() missing 1 required positional argument: 'tags'

In [244]:
#############################################################
################### I AM WORKING HERE #######################

import torch.optim as optim

torch.manual_seed(1)
dataset = seq_mod2.build_dataset(X_train, y_train) 
dataloader = seq_mod2._build_dataloader(dataset, shuffle=False) 
graph = seq_mod2.build_graph()
num_tags = seq_mod2.nClasses()
#model_CRF = CRF(num_tags,batch_first=True)

#optimizer = optim.SGD(model_CRF.parameters(), lr=0.01, weight_decay=1e-4)

for batch_num, batch in enumerate(dataloader, start=1):
    x=batch[0]   
    seq_length=batch[1]
    y=batch[2] 
  #  print(x.shape)
  #  print(seq_length.shape)
    logits = graph.forward(x,seq_length)
   # print(logits)
    print(seq_mod2.predict(x))
    
    # CRF piece:
   # mask=create_mask(seq_length) # creates mask matrix (1s are obs used in CRF; 0s are discarded)
   # loss=-model_CRF(logits, y, mask=mask) # in this way minimizing loss means max log likelihood (i.e. we're converting to NLL)
   # print(loss)
   # loss.backward()
   # optimizer.step()
   # print(model_CRF.decode(logits,mask=mask))

here0
here01
here02
here021
here002
here2
here21
fwd
torch.Size([120, 117, 12])
torch.Size([120, 117])


TypeError: forward() missing 1 required positional argument: 'tags'

In [188]:
graph = seq_mod2.build_graph()
print(graph)

here0
here01
here02
here021
here002
TorchSequenceLabeler_forCRF_2(
  (rnn): TorchRNNModel(
    (embedding): Embedding(1049, 50)
    (rnn): LSTM(50, 50, batch_first=True)
  )
  (classifier_layer): Linear(in_features=50, out_features=12, bias=True)
  (crf): CRF(num_tags=12)
)


In [679]:
# reload vsm module
import torch_rnn_classifier, torch_model_base
import importlib
importlib.reload(torch_model_base)
importlib.reload(torch_rnn_classifier)
from torch_model_base import TorchModelBase
from torch_rnn_classifier import TorchRNNClassifier, TorchRNNModel, TorchRNNDataset

In [692]:
from torchcrf import CRF
import copy
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import utils

class TorchCRFSequenceLabeler_3(TorchRNNClassifier):

    def __init__(self,             
            vocab,
            hidden_dim=50,
            embedding=None,
            use_embedding=True,
            embed_dim=50,
            rnn_cell_class=nn.LSTM,
            bidirectional=False,
            freeze_embedding=False,
            classifier_activation=nn.ReLU(),
            **base_kwargs):   
        self.vocab = vocab
        self.hidden_dim = hidden_dim
        self.embedding = embedding
        self.use_embedding = use_embedding
        self.embed_dim = embed_dim
        self.rnn_cell_class = rnn_cell_class
        self.bidirectional = bidirectional
        self.freeze_embedding = freeze_embedding
        self.classifier_activation = classifier_activation
        super().__init__(vocab,**base_kwargs)
        self.params += [
            'hidden_dim',
            'embed_dim',
            'embedding',
            'use_embedding',
            'rnn_cell_class',
            'bidirectional',
            'freeze_embedding',
            'classifier_activation']
        self.loss = lambda x:x
        if self.bidirectional:
            self.classifier_dim = self.hidden_dim * 2
        else:
            self.classifier_dim = self.hidden_dim
       # self.classifier_layer = nn.Linear(
       #     self.classifier_dim, self.n_classes_)

       
    def build_graph(self): # uses this build_graph instead of TorchRNNClassifier.build_graph
       # print("here0")
        rnn = TorchRNNModel(
            vocab_size=len(self.vocab),
            embedding=self.embedding,
            use_embedding=self.use_embedding,
            embed_dim=self.embed_dim,
            rnn_cell_class=self.rnn_cell_class,
            hidden_dim=self.hidden_dim,
            bidirectional=self.bidirectional,
            freeze_embedding=self.freeze_embedding)
      #  print("here02")
        model = TorchSequenceLabeler_forCRF_3( # this defines self.model
            rnn=rnn,
            output_dim=self.n_classes_)
      #  print("here002")
        self.embed_dim = rnn.embed_dim
        self.rnn = rnn
        return model

    def build_dataset(self, X, y=None):
        X, seq_lengths = self._prepare_sequences(X) # converts tokens into tokenIds
        if y is None:
            return TorchRNNDataset(X, seq_lengths)
        else:
            # These are the changes from a regular classifier. All
            # concern the fact that our labels are sequences of labels.
            self.classes_ = sorted({x for seq in y for x in seq})
            self.n_classes_ = len(self.classes_)
            class2index = dict(zip(self.classes_, range(self.n_classes_)))
            print(class2index)
            #class2index = dict(zip(self.classes_, range(2,2+self.n_classes_)))
            #class2index[STOP_TAG]=0    # add start and stop tags (note: stop needs to be 0 as that is default for padding in collate_fn)
            #class2index[START_TAG]=1 
            # `y` is a list of tensors of different length. Our Dataset
            # class will turn it into a padding tensor for processing.
            y = [torch.tensor([class2index[label] for label in seq])
                 for seq in y] # converts labels to indices
            return TorchRNNDataset(X, seq_lengths, y)

#    def predict_proba(self, X):
#        seq_lengths = [len(ex) for ex in X]
#        # The base class does the heavy lifting:
#        preds = self._predict(X)
#        # Trim to the actual sequence lengths:
#        preds = [p[: l] for p, l in zip(preds, seq_lengths)]
#        # Use `softmax`; the model doesn't do this because the loss
#        # function does it internally.
#        probs = [torch.softmax(seq, dim=1) for seq in preds]
#        return probs

    def predict(self, X): # for CRF-RNN X are logits from RNN
       # probs = self.predict_proba(X)
       # return [[self.classes_[i] for i in seq.argmax(axis=1)] for seq in probs] # seq.argmax(axis=1) gives index of col that maximizes softmax prob
        device = "cuda" if torch.cuda.is_available() else "cpu"
        seq_lengths = [len(ex) for ex in X]
        preds = self._predict(X)     
        mask=self.create_mask(seq_lengths).to(device, non_blocking=True) # creates mask matrix (1s are obs used in CRF; 0s are discarded)  
        tag_seq = self.crf.decode(preds,mask=mask) # note: X is (nExs,maxTokLen) and here input must be (nExs,maxTokLen,nDistinctTags); out is optimal seq of tagIds
        return [[self.classes_[i] for i in seq] for seq in tag_seq] 
        # see difference vs TorchRNNClassifier.predict

#    def score(self, X, y):
#       # preds = self.predict(X)
#       # flat_preds = [x for seq in preds for x in seq]
#       # flat_y = [x for seq in y for x in seq]
#       # return utils.safe_macro_f1(flat_y, flat_preds)
#        device = "cuda" if torch.cuda.is_available() else "cpu"
#        seq_lengths = [len(ex) for ex in X]
#        mask=self.create_mask(seq_lengths).to(device, non_blocking=True) # creates mask matrix (1s are obs used in CRF; 0s are discarded)
#        outputs, state = self.rnn(X, torch.tensor(seq_lengths)) # X is (batchSize, maxLen of exs in batch); outputs is (noTokensInEx,hiddDim), state is ((batch_size,1,hiddDim),(batch_size,1,hiddDim)) = (finalHiddState,finalCellState) 
#        outputs, seq_length = torch.nn.utils.rnn.pad_packed_sequence(outputs, batch_first=True) # outputs is (batchSize,MaxLen of examples in batch,hidden_dim); seq_length is noTokenInEx for each ex in batch
#        fcl = nn.Linear(self.classifier_dim, self.n_classes_).to(device, non_blocking=True)
#        logits = fcl(outputs) # this is an FCL from hidden_dim to output_dim (NoLabelClasses)
#        # score
#        print(self.crf(logits, y, mask=mask))
#        return self.crf(logits, y, mask=mask)
    
    def score(self, X, y):
        preds = self.predict(X)
        flat_preds = [x for seq in preds for x in seq]
        flat_y = [x for seq in y for x in seq]
        return utils.safe_macro_f1(flat_y, flat_preds)  
    
    def nClasses(self):
        return len(self.classes_)
    
    def classes(self):
        return self.classes_
    
    def create_mask(self, seq_length):
        maxLen=max(seq_length)
        auxLen=len(seq_length)
        auxOne = torch.ones(maxLen)
        auxZero = torch.zeros(maxLen)
        auxOne_l=[1]*maxLen
        auxZero_l=[0]*maxLen
        auxMatrix=[]
        for i in range(auxLen):
            auxRow=auxOne_l[:seq_length[i]]+auxZero_l[seq_length[i]:]
            auxMatrix.append(auxRow)
        return torch.tensor(auxMatrix,dtype=torch.uint8)  

    
    def fit(self, *args):
        """
        Generic optimization method.

        Parameters
        ----------
        *args: list of objects
            We assume that the final element of args give the labels
            and all the preceding elements give the system inputs.
            For regular supervised learning, this is like (X, y), but
            we allow for models that might use multiple data structures
            for their inputs.

        Attributes
        ----------
        model: nn.Module or subclass thereof
            Set by `build_graph`. If `warm_start=True`, then this is
            initialized only by the first call to `fit`.

        optimizer: torch.optimizer.Optimizer
            Set by `build_optimizer`. If `warm_start=True`, then this is
            initialized only by the first call to `fit`.

        errors: list of float
            List of errors. If `warm_start=True`, then this is
            initialized only by the first call to `fit`. Thus, where
            `max_iter=5`, if we call `fit` twice with `warm_start=True`,
            then `errors` will end up with 10 floats in it.

        validation_scores: list
            List of scores. This is filled only if `early_stopping=True`.
            If `warm_start=True`, then this is initialized only by the
            first call to `fit`. Thus, where `max_iter=5`, if we call
            `fit` twice with `warm_start=True`, then `validation_scores`
            will end up with 10 floats in it.

        no_improvement_count: int
            Used to control early stopping and convergence. These values
            are controlled by `_update_no_improvement_count_early_stopping`
            or `_update_no_improvement_count_errors`.  If `warm_start=True`,
            then this is initialized only by the first call to `fit`. Thus,
            in that situation, the values could accumulate across calls to
            `fit`.

        best_error: float
           Used to control convergence. Smaller is assumed to be better.
           If `warm_start=True`, then this is initialized only by the first
           call to `fit`. It will be reset by
           `_update_no_improvement_count_errors` depending on how the
           optimization is proceeding.

        best_score: float
           Used to control early stopping. If `warm_start=True`, then this
           is initialized only by the first call to `fit`. It will be reset
           by `_update_no_improvement_count_early_stopping` depending on how
           the optimization is proceeding. Important: we currently assume
           that larger scores are better. As a result, we will not get the
           correct results for, e.g., a scoring function based in
           `mean_squared_error`. See `self.score` for additional details.

        best_parameters: dict
            This is a PyTorch state dict. It is used if and only if
            `early_stopping=True`. In that case, it is updated whenever
            `best_score` is improved numerically. If the early stopping
            criteria are met, then `self.model` is reset to contain these
            parameters before `fit` exits.

        Returns
        -------
        self

        """
      #  print("here00")
        if self.early_stopping:
            args, dev = self._build_validation_split(
                *args, validation_fraction=self.validation_fraction)
            

        # Dataset:
        dataset = self.build_dataset(*args)
        dataloader = self._build_dataloader(dataset, shuffle=True)

        # Graph:
        if not self.warm_start or not hasattr(self, "model"):
            self.model = self.build_graph()
            # This device move has to happen before the optimizer is built:
            # https://pytorch.org/docs/master/optim.html#constructing-it
            self.model.to(self.device)
            self.optimizer = self.build_optimizer()
            self.errors = []
            self.validation_scores = []
            self.no_improvement_count = 0
            self.best_error = np.inf
            self.best_score = -np.inf
            self.best_parameters = None

        # Make sure the model is where we want it:
        self.model.to(self.device)

        self.model.train()
        self.optimizer.zero_grad()
        
        self.crf = CRF(self.n_classes_,batch_first=True).to(self.device, non_blocking=True)

        for iteration in range(1, self.max_iter+1):

            epoch_error = 0.0

            for batch_num, batch in enumerate(dataloader, start=1):
               # print("batch"+str(batch_num)) 

               # print(batch)
                batch = [x.to(self.device, non_blocking=True) for x in batch]

                X_batch = batch[: -1] # list w/ 2 els: 1st el is tensor (108xmaxLen) w/ tokens for each example in batch; 2nd el is (108x1) with lengths of each example
                y_batch = batch[-1] # list with each element of this batch (108 el in list) with tensor (maxLen x 1) labels converted to ints and w/ len = maxLen of all example sequences # print(y_batch[0].shape)
               # print(X_batch[1].shape)
               # print(y_batch[0])
               
                batch_preds = self.model(*X_batch) # produces logits outputs of lstm
               # print("batch_preds")

               # print("here-model2")
                mask = (self.create_mask(X_batch[1])).to(self.device, non_blocking=True)
                #err = self.loss(batch_preds, y_batch) # batch_preds = (108,12,117); y_batch = (108,117)
                err = -self.crf(batch_preds,y_batch,mask=mask,reduction='mean') 
                # NOTE: self.crf outputs log likelihood so we multiply by (-1) so as to minimize this result

                if self.gradient_accumulation_steps > 1 and \
                  self.loss.reduction == "mean":
                    err /= self.gradient_accumulation_steps

                err.backward()

                epoch_error += err.item()

                if batch_num % self.gradient_accumulation_steps == 0 or \
                  batch_num == len(dataloader):
                    if self.max_grad_norm is not None:
                        torch.nn.utils.clip_grad_norm_(
                            self.model.parameters(), self.max_grad_norm)
                    self.optimizer.step()
                    self.optimizer.zero_grad()

            # Stopping criteria:

            if self.early_stopping:
                self._update_no_improvement_count_early_stopping(*dev) # here we max macro avg f1 score (on dev = validation set)
                if self.no_improvement_count > self.n_iter_no_change:
                    utils.progress_bar(
                        "Stopping after epoch {}. Validation score did "
                        "not improve by tol={} for more than {} epochs. "
                        "Final error is {}".format(iteration, self.tol,
                            self.n_iter_no_change, epoch_error),
                        verbose=self.display_progress)
                    break

            else:
                self._update_no_improvement_count_errors(epoch_error)
                if self.no_improvement_count > self.n_iter_no_change:
                    utils.progress_bar(
                        "Stopping after epoch {}. Training loss did "
                        "not improve more than tol={}. Final error "
                        "is {}.".format(iteration, self.tol, epoch_error),
                        verbose=self.display_progress)
                    break

            utils.progress_bar(
                "Finished epoch {} of {}; error is {}".format(
                    iteration, self.max_iter, epoch_error),
                verbose=self.display_progress)

        if self.early_stopping:
            self.model.load_state_dict(self.best_parameters)

        return self

In [693]:
class TorchSequenceLabeler_forCRF_3(nn.Module): # no self.hidden_layer or self.classifier_activation as TorchRNNClassifierModel
    def __init__(self, rnn, output_dim):
       # print("here021")
        super().__init__()
        self.rnn = rnn
        self.output_dim = output_dim
        if self.rnn.bidirectional:
            self.classifier_dim = self.rnn.hidden_dim * 2
        else:
            self.classifier_dim = self.rnn.hidden_dim
        self.classifier_layer = nn.Linear(
            self.classifier_dim, self.output_dim)
        

    def forward(self, X, seq_lengths): # X is (noExsInBatch,MaxLen)=(108,117), seq_lengths is the number of tokens in each example in each batch
        # this is the forward method of self.model
       # print("here2")
        outputs, state = self.rnn(X, seq_lengths) # X is (batchSize, maxLen of exs in batch); outputs is (noTokensInEx,hiddDim), state is ((batch_size,1,hiddDim),(batch_size,1,hiddDim)) = (finalHiddState,finalCellState) 
        outputs, seq_length = torch.nn.utils.rnn.pad_packed_sequence(
            outputs, batch_first=True) # outputs is (batchSize,MaxLen of examples in batch,hidden_dim); seq_length is noTokenInEx for each ex in batch
        logits = self.classifier_layer(outputs) # this is an FCL from hidden_dim to output_dim (NoLabelClasses)
       # print(logits.shape)
        # logits are (108,117,12) or (1,11,5) = (batchSize,MaxLen of examples in batch,noLabelClasses) noLabelClasses include Start + End
        return logits  

In [694]:
# Following converts words to indices and pads sequences
seq_mod3 = TorchCRFSequenceLabeler_3(
    vocab,
    early_stopping=True,
    eta=0.001)

In [688]:
dataset = seq_mod3.build_dataset(X_train, y_train) 
graph = seq_mod3.build_graph()
print(graph)

{'DATUM_VERBUECHERUNG': 0, 'DATUM_VERTRAG': 1, 'FLAECHE': 2, 'GESAMTPREIS': 3, 'IMMO_TYP': 4, 'KAEUFER': 5, 'O': 6, 'ORT': 7, 'QMPREIS': 8, 'STRASSE': 9, 'TERRASSENGROESSE': 10, 'VERKAEUFER': 11}
TorchSequenceLabeler_forCRF_3(
  (rnn): TorchRNNModel(
    (embedding): Embedding(1049, 50)
    (rnn): LSTM(50, 50, batch_first=True)
  )
  (classifier_layer): Linear(in_features=50, out_features=12, bias=True)
)


In [695]:
%time _ = seq_mod3.fit(X_train, y_train)

{'DATUM_VERBUECHERUNG': 0, 'DATUM_VERTRAG': 1, 'FLAECHE': 2, 'GESAMTPREIS': 3, 'IMMO_TYP': 4, 'KAEUFER': 5, 'O': 6, 'ORT': 7, 'QMPREIS': 8, 'STRASSE': 9, 'TERRASSENGROESSE': 10, 'VERKAEUFER': 11}


Stopping after epoch 23. Validation score did not improve by tol=1e-05 for more than 10 epochs. Final error is 93.69168853759766

Wall time: 4.86 s


In [610]:
out = seq_mod3.predict(X_train)

pred
torch.Size([120, 117, 12])
torch.Size([120, 117])


In [696]:
y_pred = seq_mod3.predict(X_test)
print(y_pred)

[['O', 'QMPREIS', 'O', 'GESAMTPREIS', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'DATUM_VERBUECHERUNG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'QMPREIS', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['STRASSE', 'O', 'GESAMTPREIS', 'O', 'GESAMTPREIS', 'O', 'O', 'O', 'O', 'STRASSE', 'O', 'DATUM_VERBUECHERUNG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'QMPREIS', 'O', 'O', 'STRASSE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'QMPREIS', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'QMPREIS', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 

In [697]:
from sklearn_crfsuite import metrics
classes = seq_mod3.classes()
print(metrics.flat_f1_score(y_test, y_pred,
                      average='weighted', labels=classes))
sorted_labels = sorted(
    classes,
    key=lambda name: (name[1:], name[0])
)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))

0.6605704077682553
                     precision    recall  f1-score   support

                  O      0.785     0.930     0.851       643
            KAEUFER      0.000     0.000     0.000        18
DATUM_VERBUECHERUNG      0.000     0.000     0.000        25
      DATUM_VERTRAG      0.000     0.000     0.000        27
         VERKAEUFER      0.000     0.000     0.000        24
   TERRASSENGROESSE      0.000     0.000     0.000         5
        GESAMTPREIS      0.000     0.000     0.000        11
            FLAECHE      0.000     0.000     0.000        15
           IMMO_TYP      0.000     0.000     0.000        19
            QMPREIS      0.000     0.000     0.000        10
                ORT      1.000     0.115     0.207        26
            STRASSE      0.074     0.125     0.093        16

           accuracy                          0.719       839
          macro avg      0.155     0.098     0.096       839
       weighted avg      0.634     0.719     0.661       839



In [149]:
########## SAMPLE CRF ON TOY DATA
# this is where I matched my first model score (i.e. log likelihood of crf)
torch.manual_seed(1)
seq_length = 1  # maximum sequence length in a batch
batch_size = 1  # number of samples in the batch
num_tags=2
model = CRF(num_tags,batch_first=True)
#emissions = torch.randn(batch_size, seq_length, num_tags)
#tags = torch.tensor([[0, 2, 3], [1, 4, 1]], dtype=torch.long)  # (batch_size, seq_length)
emissions = torch.randn(1, seq_length, num_tags)

tags = torch.tensor([[1]], dtype=torch.long)  
print(model(emissions, tags))
print(emissions)
print(tags)
print(model.transitions)
print(model.start_transitions)
print(model.end_transitions)

tensor(-0.5731, grad_fn=<SumBackward0>)
tensor([[[-0.4519, -0.1661]]])
tensor([[1]])
Parameter containing:
tensor([[-0.0941,  0.0600],
        [-0.0206,  0.0509]], requires_grad=True)
Parameter containing:
tensor([ 0.0515, -0.0441], requires_grad=True)
Parameter containing:
tensor([-0.0194,  0.0469], requires_grad=True)


In [371]:
########## SAMPLE CRF ON TOY DATA
# add mask vector so as to not to consider padding part of sequences
# e.g. want to mask last zero of 2nd obs
torch.manual_seed(1)
seq_length = 3  # maximum sequence length in a batch
batch_size = 2  # number of samples in the batch
num_tags=5
model = CRF(num_tags,batch_first=True)
emissions = torch.randn(batch_size, seq_length, num_tags)
tags = torch.tensor([[1, 2, 3], [1, 4, 0]], dtype=torch.long)  # (batch_size, seq_length)
mask = torch.tensor([[1, 1, 1], [1, 1, 0]], dtype=torch.uint8) # i.e. mask 3rd token of 2nd example in batch
print(emissions.shape)
print(tags.shape)
print(mask.shape)
# 1.
print(model(emissions, tags, mask=mask)) # model log likelihood
# 2.
print(model.decode(emissions,mask=mask)) # most likely tag sequences
# 3. inference:
tags_test = torch.tensor([[1, 2, 3]], dtype=torch.long)  # (batch_size, seq_length)
print(model.forward(emissions[0].unsqueeze(dim=0),tags_test)) # returns log likelihood of test tag sequence (larger no. means more likely)
# note: need to use torch array w/ same no. of examples as tags_test

torch.Size([2, 3, 5])
torch.Size([2, 3])
torch.Size([2, 3])
tensor(-8.3568, grad_fn=<SumBackward0>)
[[3, 4, 3], [4, 1]]
tensor(-2.7487, grad_fn=<SumBackward0>)


In [377]:
# note: this isn't exactly correct as training examples are shuffled (esp. max len of the smaller 12 ex batch is != 92)
auxMax=0
x_max_idx=108
for i in range(0,min(len(X_train),x_max_idx)):
    if len(X_train[i])>auxMax:
        auxMax=len(X_train[i])
print(auxMax)
auxMax2=0
x_min_idx=109
for i in range(max(0,x_min_idx),len(X_train)):
    if len(X_train[i])>auxMax2:
        auxMax2=len(X_train[i])
print(auxMax2)

117
92


In [107]:
y_pred = seq_mod.predict(X_test)
print(y_test[0])
print(y_pred[0])

['ORT', 'O', 'O', 'STRASSE', 'STRASSE', 'O', 'ORT', 'O', 'O', 'FLAECHE', 'O', 'O', 'IMMO_TYP', 'O', 'O', 'O', 'O', 'QMPREIS', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'VERKAEUFER', 'O', 'O', 'O', 'O', 'O', 'GESAMTPREIS', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'DATUM_VERTRAG', 'DATUM_VERTRAG', 'O', 'O', 'O', 'O', 'O', 'DATUM_VERBUECHERUNG', 'DATUM_VERBUECHERUNG', 'O']
['KAEUFER', 'KAEUFER', 'O', 'O', 'KAEUFER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'IMMO_TYP', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [108]:
labels=seq_mod.classes_
sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)

In [109]:
# unfold all our data - NOTE: this means we don't care about per sentence results. 
# i.e. each classification is worth same regardless of sentence in which it occurs
y_test_unfold = [y for element in y_test for y in element]
y_pred_unfold = [y for element in y_pred for y in element]

In [126]:
print(y_test_unfold[:10])
print(y_pred_unfold[:10])

['ORT', 'O', 'O', 'STRASSE', 'STRASSE', 'O', 'ORT', 'O', 'O', 'FLAECHE']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [82]:
# convert y_test and y_pred into binary formats
#from sklearn.preprocessing import MultiLabelBinarizer

In [110]:
print(classification_report(
    y_test_unfold, y_pred_unfold, labels=sorted_labels, digits=3
))

                     precision    recall  f1-score   support

                  O      0.808     0.888     0.846       643
            KAEUFER      0.070     0.278     0.112        18
DATUM_VERBUECHERUNG      0.000     0.000     0.000        25
      DATUM_VERTRAG      1.000     0.037     0.071        27
         VERKAEUFER      0.333     0.042     0.074        24
   TERRASSENGROESSE      0.000     0.000     0.000         5
        GESAMTPREIS      0.000     0.000     0.000        11
            FLAECHE      0.000     0.000     0.000        15
           IMMO_TYP      0.000     0.000     0.000        19
            QMPREIS      0.000     0.000     0.000        10
                ORT      0.000     0.000     0.000        26
            STRASSE      0.118     0.125     0.121        16

           accuracy                          0.691       839
          macro avg      0.194     0.114     0.102       839
       weighted avg      0.664     0.691     0.657       839



Now try with leading "B-" and "I-"

In [83]:
########## ONLY RUN IF WE WANT TO ADD LEADING "B-" / "I-" TO CLASS LABEL
# now use above code and loop through all items of annot list:
# addLeading=1 for "Yes" (i.e. add leading "B-","I-" to annot); 0 for "No" (i.e. add labels to annot simply as they are)
addLeading = 1

if addLeading == 1:
    for j in range(0,len(annot)):
        a = annot[j]
        # select list of dict of tokens w/ annnotations and add column w/ no. of words to each dict:
        b = a['spans']
        # add noWords to b dict. note: b is list of dicts w/ annotations; tokens not on this list don't have annotations
        if b!=[]: #i.e. only try to add annotations to tokens if there are annotations to begin with
            #print(b)
            for i in range(0,len(annot[j]['tokens'])):
                    # now break-up label into 1st occurrence (leading "B-") and subsequent occurrences (leading "I-") (only for non "O"'s)
                    if annot[j]['tokens'][i]['label'] != "O":
                        if i==0:
                            annot[j]['tokens'][i]['label'] = "B-" + annot[j]['tokens'][i]['label']
                        else: 
                            if annot[j]['tokens'][i]['label'] == annot[j]['tokens'][i-1]['label'][2:]: # need to remove the leading "B-" that we had already been added to c[i-1]
                                annot[j]['tokens'][i]['label'] = "I-" + annot[j]['tokens'][i]['label']
                            else:
                                annot[j]['tokens'][i]['label'] = "B-" + annot[j]['tokens'][i]['label'] 

In [84]:
# now get data into format that TorchRNN expects:
X=[] 
y=[]
for j in range(0,len(annot)):
    a = annot[j]['tokens']
    auxX = []
    auxy = []
    if annot[j]['spans']!=[]: # are there annot for this example?
        for i in range(0,len(a)):
            #token_element = (a[i]['text'],a[i]['label'])
            auxX.append(a[i]['text'])
            auxy.append(a[i]['label'])
        X.append(auxX)
        y.append(auxy)
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
X_train, X_test, y_train, y_test = X[:120], X[120:], y[:120], y[120:]
vocab = sorted({w for seq in X_train for w in seq}) + ["$UNK"]

In [89]:
print(X_train[0])

['DORNBIRN', 'In', 'der', 'Schulgasse', 'in', 'Dornbirn', 'hat', 'eine', '71,93', 'Quadratmeter', 'große', 'Wohnung', 'für', 'einen', 'Quadratmeterpreis', 'von', '5533,71', 'Euro', 'den', 'Besitzer', 'gewechselt', '.', 'Dieser', 'beinhaltet', 'auch', 'einen', 'Pkw-Abstellplatz', '.', 'Käufer', 'der', 'Wohnung', 'mit', '9,86', 'Quadratmetern', 'Terrasse', 'ist', 'die', 'ValLiLean', 'Beteiligungs-', 'und', 'Immobilienverwaltungs', 'GmbH', 'Beim', 'Verkäufer', 'handelt', 'es', 'sich', 'um', 'die', 'Karrenblick', 'Projekt', 'GmbH', ' ', 'Der', 'Kaufpreis', 'liegt', 'bei', '398.040', 'Euro', '.', 'Unterzeichnet', 'wurde', 'der', 'Kaufvertrag', 'am', '18.', 'September', '.', 'Die', 'Verbücherung', 'datiert', 'mit', 'Oktober', '2020', '.', '.', '.']


In [85]:
%time _ = seq_mod.fit(X_train, y_train)
y_pred = seq_mod.predict(X_test)
print(y_test[0])
print(y_pred[0])

Stopping after epoch 35. Validation score did not improve by tol=1e-05 for more than 10 epochs. Final error is 2.249159574508667

Wall time: 1.78 s
['B-ORT', 'O', 'O', 'B-STRASSE', 'I-STRASSE', 'O', 'B-ORT', 'O', 'O', 'B-FLAECHE', 'O', 'O', 'B-IMMO_TYP', 'O', 'O', 'O', 'O', 'B-QMPREIS', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-VERKAEUFER', 'O', 'O', 'O', 'O', 'O', 'B-GESAMTPREIS', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-DATUM_VERTRAG', 'I-DATUM_VERTRAG', 'O', 'O', 'O', 'O', 'O', 'B-DATUM_VERBUECHERUNG', 'I-DATUM_VERBUECHERUNG', 'O']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [90]:
labels=seq_mod.classes_
sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)

In [91]:
# unfold all our data - NOTE: this means we don't care about per sentence results. 
# i.e. each classification is worth same regardless of sentence in which it occurs
y_test_unfold = [y for element in y_test for y in element]
y_pred_unfold = [y for element in y_pred for y in element]

In [92]:
print(classification_report(
    y_test_unfold, y_pred_unfold, labels=sorted_labels, digits=3
))

                       precision    recall  f1-score   support

                    O      0.773     0.988     0.867       643
B-DATUM_VERBUECHERUNG      0.000     0.000     0.000        13
I-DATUM_VERBUECHERUNG      0.000     0.000     0.000        12
      B-DATUM_VERTRAG      0.000     0.000     0.000        13
      I-DATUM_VERTRAG      0.000     0.000     0.000        14
            B-FLAECHE      0.000     0.000     0.000        15
            I-FLAECHE      0.000     0.000     0.000         0
        B-GESAMTPREIS      0.000     0.000     0.000        11
        I-GESAMTPREIS      0.000     0.000     0.000         0
           B-IMMO_TYP      0.000     0.000     0.000        19
           I-IMMO_TYP      0.000     0.000     0.000         0
            B-KAEUFER      0.000     0.000     0.000        10
            I-KAEUFER      0.000     0.000     0.000         8
                B-ORT      0.300     0.115     0.167        26
            B-QMPREIS      0.000     0.000     0.000  

  _warn_prf(average, modifier, msg_start, len(result))


Remove "B-" and "I-" (in case they are present in labels)

In [119]:
for j in range(0,len(annot)):
    a = annot[j]
    b = a['spans']
    if b!=[]: #i.e. only try to add annotations to tokens if there are annotations to begin with
        for i in range(0,len(annot[j]['tokens'])):
                if annot[j]['tokens'][i]['label'] != "O":
                    if annot[j]['tokens'][i]['label'][:2]=="B-" or annot[j]['tokens'][i]['label'][:2]=="I-":
                        annot[j]['tokens'][i]['label']=annot[j]['tokens'][i]['label'][2:]

Try bi-directional LSTM

In [149]:
seq_mod = TorchRNNSequenceLabeler(
    vocab,
    early_stopping=True,
    eta=0.001,
    bidirectional=True)

In [132]:
print(y_train[0])

['ORT', 'O', 'O', 'STRASSE', 'O', 'ORT', 'O', 'O', 'FLAECHE', 'O', 'O', 'IMMO_TYP', 'O', 'O', 'O', 'O', 'QMPREIS', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'TERRASSENGROESSE', 'O', 'O', 'O', 'O', 'KAEUFER', 'KAEUFER', 'KAEUFER', 'KAEUFER', 'KAEUFER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'VERKAEUFER', 'VERKAEUFER', 'VERKAEUFER', 'O', 'O', 'O', 'O', 'O', 'GESAMTPREIS', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'DATUM_VERTRAG', 'DATUM_VERTRAG', 'O', 'O', 'O', 'O', 'O', 'DATUM_VERBUECHERUNG', 'DATUM_VERBUECHERUNG', 'O', 'O', 'O']


In [150]:
%time _ = seq_mod.fit(X_train, y_train)

None


Stopping after epoch 18. Validation score did not improve by tol=1e-05 for more than 10 epochs. Final error is 2.1157665252685547

Wall time: 2.12 s


In [123]:
y_pred = seq_mod.predict(X_test)
print(y_test[0])
print(y_pred[0])

labels=seq_mod.classes_
sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)

# unfold all our data - NOTE: this means we don't care about per sentence results. 
# i.e. each classification is worth same regardless of sentence in which it occurs
y_test_unfold = [y for element in y_test for y in element]
y_pred_unfold = [y for element in y_pred for y in element]

print(classification_report(
    y_test_unfold, y_pred_unfold, labels=sorted_labels, digits=3
))

['ORT', 'O', 'O', 'STRASSE', 'STRASSE', 'O', 'ORT', 'O', 'O', 'FLAECHE', 'O', 'O', 'IMMO_TYP', 'O', 'O', 'O', 'O', 'QMPREIS', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'VERKAEUFER', 'O', 'O', 'O', 'O', 'O', 'GESAMTPREIS', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'DATUM_VERTRAG', 'DATUM_VERTRAG', 'O', 'O', 'O', 'O', 'O', 'DATUM_VERBUECHERUNG', 'DATUM_VERBUECHERUNG', 'O']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'IMMO_TYP', 'O', 'O', 'TERRASSENGROESSE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'TERRASSENGROESSE', 'TERRASSENGROESSE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
                     precision    recall  f1-score   support

                  O      0.760     0.893     0.821       643
            KAEUFER      0.000     0.000     0.000        18
DATUM_VERBUECHERUN