In [1]:
import sys
sys.path.append("/work/multi_doc_analyzer")
sys.path.append("/work/relation_extraction/Bert_model/baseline/data/")

import torch as T
import numpy as np
import torch.nn as nn
import torch.optim as optim
import torch.cuda
from allennlp.nn import util as nn_util
from multi_doc_analyzer.structure.structure import *
from multi_doc_analyzer.tokenization.tokenizer import MDATokenizer
from tqdm import tqdm

from allennlp.data.dataset_readers import DatasetReader
from allennlp.data.tokenizers import Token, Tokenizer, WordTokenizer
from allennlp.data.token_indexers import TokenIndexer

from allennlp.data.instance import Instance
from allennlp.data.fields import TextField, LabelField, ArrayField

from ace05_set_reader import ACE05Reader

from allennlp.data.vocabulary import Vocabulary
from allennlp.data.iterators import BucketIterator, DataIterator, BasicIterator
from allennlp.nn.util import get_text_field_mask
from allennlp.models import Model
from allennlp.modules.text_field_embedders import TextFieldEmbedder
import random

from allennlp.data.token_indexers import PretrainedBertIndexer
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
from allennlp.modules.token_embedders.bert_token_embedder import PretrainedBertEmbedder

from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import csv



In [2]:
train_path = "/work/LDC2006T06/dataset/train/"
test_path = "/work/LDC2006T06/dataset/test/"
model_folder = "/work/model_checkpoint/bert_model_checkpoint/bert_modify_seq/"
output_path = "/work/relation_extraction/Bert_model/bert_modify_seq/analysis/"

In [3]:
class Config(dict):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        for k, v in kwargs.items():
            setattr(self, k, v)
    
    def set(self, key, val):
        self[key] = val
        setattr(self, key, val)
        
config = Config(
    seed=1,
    batch_size=64,
    lr=5e-5,                # learning rate
    epochs=50,
    MLP_hidden_sz=300,
    lstm_hidden_sz=768,
    arg_sz=20,              # position embedding size
    max_seq_len=200
)

In [4]:
USE_GPU = T.cuda.is_available()
USE_GPU

True

In [5]:
# set seed for both CPU and CUDA
T.manual_seed(config.seed)

<torch._C.Generator at 0x7f58300a89b0>

In [6]:
# from ace05_set_reader import ACE05Reader
# train_path = "/work/LDC2006T06/dataset/train/"
# reader = ACE05Reader(lang='en')
# doc_dicts = reader.read(train_path)
# tokenizer = MDATokenizer('bert-en')
# for doc in doc_dicts.values():
#     tokenizer.annotate_document(doc)
#     for s in doc.sentences: 
        

In [7]:
e_type2idx = {'X':0, 'O': 1, 'PER': 2, 'ORG': 3, 'LOC': 4, 'GPE': 5, 'FAC': 6, 'VEH': 7, 'WEA': 8}

r_label2idx = {'PHYS-lr': 1, 'PART-WHOLE-lr': 2, 'PER-SOC-lr': 3, 'ORG-AFF-lr': 4, 'ART-lr': 5, 'GEN-AFF-lr': 6,
               'PHYS-rl': 7, 'PART-WHOLE-rl': 8, 'PER-SOC-rl': 9, 'ORG-AFF-rl': 10, 'ART-rl': 11, 'GEN-AFF-rl': 12,
               'NONE': 0}

# r_label2idx = {'PHYS': 1, 'PART-WHOLE': 2, 'PER-SOC': 3, 'ORG-AFF': 4, 'ART': 5, 'GEN-AFF': 6, 'NONE': 0}

r_idx2label = {v: k for k, v in r_label2idx.items()}

class RelationDatasetReader(DatasetReader):
    """
    Reads Structure object formatted datasets files, and creates AllenNLP instances.
    """
    def __init__(self, tokenizer: Tokenizer=None, token_indexers: Dict[str, TokenIndexer]=None, 
                 MAX_WORDPIECES: int=config.max_seq_len, 
                 is_training = False, ace05_reader: ACE05Reader=None):
        # make sure results may be reproduced when sampling...
        super().__init__(lazy=False)
        random.seed(0)
        self.is_training = is_training
        self.ace05_reader = ace05_reader
        
        # NOTE AllenNLP automatically adds [CLS] and [SEP] word peices in the begining and end of the context,
        # therefore we need to subtract 2
        self.MAX_WORDPIECES = MAX_WORDPIECES - 2
        
        self.tokenizer = tokenizer or WordTokenizer()
        
        # BERT specific init
        self._token_indexers = token_indexers

    def text_to_instance(self, sentence: Sentence) -> Instance:

        e_tuple_check_dicts = {} # {(train_arg_l.id, train_arg_r.id):true_label, ...}
        if self.is_training: 
            for r in sentence.relation_mentions:
                train_arg_l, train_arg_r, true_label = r.get_left_right_args()
                e_tuple_check_dicts[(train_arg_l.id, train_arg_r.id)] = true_label

        # construct pair entities
        for arg1_idx in range(len(sentence.entity_mentions)-1):
            for arg2_idx in range(arg1_idx+1, len(sentence.entity_mentions)):
                field = {}
                sentence_tokens = []
    
                arg1 = sentence.entity_mentions[arg1_idx]
                arg2 = sentence.entity_mentions[arg2_idx]
                
                if arg1.char_b >= arg2.char_b:
                    entity_l = arg2
                    entity_r = arg1
                else:
                    entity_l = arg1
                    entity_r = arg2                    
                
                ent = [[] for i in range(2)]             # index 0 : 0 for ent_l, 1 for ent_r
                    
                for i,t in enumerate(sentence.tokens):
                    if i >= entity_l.token_b and i < entity_l.token_e:
                        ent[0].append(t.text)
                        sentence_tokens.append(Token(text="[" + entity_l.type + "]"))
                    elif i >= entity_r.token_b and i < entity_r.token_e:
                        ent[1].append(t.text)
                        sentence_tokens.append(Token(text="[" + entity_r.type + "]"))
                    else:
                        sentence_tokens.append(Token(text=t.text))
                sentence_tokens.append(Token(text="[SEP]"))
                for i in range(len(ent[0])):
                    sentence_tokens.append(Token(text=ent[0][i]))
                sentence_tokens.append(Token(text="[SEP]"))
                for i in range(len(ent[1])):
                    sentence_tokens.append(Token(text=ent[1][i]))
                
                sentence_field = TextField(sentence_tokens, self._token_indexers)
                fields = {"tokens": sentence_field}

                arg_vec = T.tensor([[0, 0] for i in range(len(sentence.tokens) + 2)], dtype=T.int)
                
                # +1 because the first token is [CLS]
                pos = lambda t, b, e: 0 if t >= b and t < e else ( (t-b) if t < b else (t-e+1) ) 
                for i in range(len(sentence.tokens) + 2):
                    arg_vec[i][0] = pos(i-1, entity_l.token_b, entity_l.token_e)      # arg_l position, i-1 for [CLS]
                    arg_vec[i][1] = pos(i-1, entity_r.token_b, entity_r.token_e)    # arg_r position, i-1 for [CLS]
                fields["arg_idx"] = ArrayField(arg_vec)

#                 fields["arg_left"] = SpanField(arg_left.char_b, arg_left.char_e, char_list_field)
#                 fields["arg_right"] = SpanField(arg_right.char_b, arg_right.char_e, char_list_field)
                if self.is_training:
                    if (arg1.id, arg2.id) in e_tuple_check_dicts.keys():
                        fields["label"] = LabelField(r_label2idx[e_tuple_check_dicts[(arg1.id, arg2.id)]], skip_indexing=True)
                    else:
                        fields["label"] = LabelField(r_label2idx['NONE'], skip_indexing=True)
                yield Instance(fields)
    
    def _read(self, file_path: str)->Iterator: 
        doc_dicts = self.ace05_reader.read(file_path)
        tokenizer = MDATokenizer('bert-en')
        for doc in doc_dicts.values():
            tokenizer.annotate_document(doc)
            for s in doc.sentences: 
                if len(s.tokens) <= config.max_seq_len:
                    for instance in self.text_to_instance(s):
                        yield instance

In [8]:
# ace05_reader = ACE05Reader(lang='en')

# token_indexer = PretrainedBertIndexer(
#     pretrained_model="bert-base-uncased",
# #         max_pieces=config.max_seq_len,
# #         do_lowercase=False               # for cased condition
# )

# # AllenNLP DatasetReader
# reader = RelationDatasetReader(
#     is_training=True, 
#     ace05_reader=ace05_reader, 
#     tokenizer=lambda s: token_indexer.wordpiece_tokenizer(s),
#     token_indexers={"tokens": token_indexer}
# )

# train_ds = reader.read(train_path)

In [9]:
class BERT(Model):
    def __init__(self, word_embeddings: TextFieldEmbedder,
                out_sz: int=len(r_label2idx)):
        super().__init__(vocab)
        self.word_embeddings = word_embeddings
        self._entity_embeddings = T.nn.Embedding(num_embeddings=len(e_type2idx), embedding_dim=config.arg_sz, padding_idx=0)
        self.gru = T.nn.GRU(word_embeddings.get_output_dim()+config.arg_sz, config.hidden_sz, batch_first=True)
        self.projection = nn.Linear(config.hidden_sz, out_sz)
        self.loss = nn.CrossEntropyLoss()
        
    def forward(self, tokens: Dict[str, T.tensor], arg_idx: T.tensor, label: T.tensor = None) -> Dict[str, T.tensor]:
        mask = get_text_field_mask(tokens)
        
        embeddings = self.word_embeddings(tokens)
        pad_len = embeddings.shape[-2]
#         print(pad_len)
        
        arg_idx = arg_idx[:,:pad_len]
        arg_idx = arg_idx.type(T.long)
        
        arg_emb = self._entity_embeddings(arg_idx)

        concat = T.cat((embeddings, arg_emb), -1)
        ot, ht = self.gru(concat, None) # revise this "None"
        ot = ot[:,-1,:]    
        class_logits = self.projection(ot)
        
        output = {"class_logits": class_logits}
        output["loss"] = self.loss(class_logits, label)

        return output

In [10]:
from scipy.special import expit # the sigmoid function
def tonp(tsr): return tsr.detach().cpu().numpy()

In [11]:
# Predict
class Predictor:
    def __init__(self, model: Model, iterator: DataIterator,
                 cuda_device: int=-1) -> None:
        self.model = model
        self.iterator = iterator
        self.cuda_device = cuda_device
        
    def _extract_data(self, batch) -> np.ndarray:
        out_dict = self.model(**batch)
        return expit(tonp(out_dict["class_logits"]))
    
    def predict(self, ds: Iterable[Instance]) -> np.ndarray:
        pred_generator = self.iterator(ds, num_epochs=1, shuffle=False)
        self.model.eval()
        pred_generator_tqdm = tqdm(pred_generator, total=self.iterator.get_num_batches(ds))
        preds = []
        with T.no_grad():
            for batch in pred_generator_tqdm:
                batch = nn_util.move_to_device(batch, self.cuda_device)
                preds.append(self._extract_data(batch))
        return np.concatenate(preds, axis=0)

In [12]:
def plot_comfusion_matrix(label_classes, predict_classes, out_folder, file_name):
    label_types = list(r_idx2label.values())

    cm = confusion_matrix(label_classes, predict_classes, label_types)
    print(cm)
    fig = plt.figure()
    ax = fig.add_subplot(111)
    cax = ax.matshow(cm)
    for (i, j), z in np.ndenumerate(cm):
        ax.text(j, i, '{:0.0f}'.format(z), ha='center', va='center', color='white')
    fig.colorbar(cax)
    ax.set_xticklabels([''] + label_types)
    ax.set_yticklabels([''] + label_types)
    plt.xlabel('Predicted')
    plt.ylabel('True')

    plt.savefig(out_folder + 'confusion_matrix_' + file_name + '.png')
    plt.show()

    f1 = f1_score(label_classes, predict_classes, average='macro')*100
    
    print("Accuracy:", sum(cm[i][i] for i in range(len(cm))) / len(label_classes))
    print("F1 score:", f1)

In [13]:
def err_analyze(ds, true, pred, opt):
    
    # classify different kinds of error
    detail = [[[] for j in range(len(r_label2idx))] for i in range(len(r_label2idx))]
    for i in range(len(ds)):
         if true[i] != pred[i]:
            detail[r_label2idx[true[i]]][r_label2idx[pred[i]]].append(i)
    
    # print into a csv file
    with open(output_path + "error_detail_" + opt + ".csv", "w", newline='') as f:
        writer = csv.writer(f)
        writer.writerow(["Sentence", "Two_Entity", "Predict", "Label", "idx"])
        for j in range(len(detail)):
            for k in range(len(detail)):
                with_element = 0
                if k == j:
                    continue
                for i in detail[j][k]:
                    with_element = 1
                    ent1 = []
                    ent2 = []
                    seq = 0
                    prev = 0
                    for g in range(len(vars(ds[i].fields['arg_idx'])['array'])):
                        if int(vars(ds[i].fields['arg_idx'])['array'][g]) != 1 and int(vars(ds[i].fields['arg_idx'])['array'][g]) != 0:
                            if ent1 == [] or (seq == 1 and prev == vars(ds[i].fields['arg_idx'])['array'][g]):
                                prev = vars(ds[i].fields['arg_idx'])['array'][g]
                                seq = 1
                                ent1.append(vars(ds[i].fields['tokens'])['tokens'][g-1])
                            else:
                                seq = 2
                                ent2.append(vars(ds[i].fields['tokens'])['tokens'][g-1])
                        else: 
                            seq = 0
    
                    tostr = lambda a: [str(a[i]) for i in range(len(a))] 
                    writer.writerow([" ".join(tostr(vars(ds[i].fields['tokens'])['tokens'])), [ent1, ent2], pred[i], true[i], i])
                if with_element == 1:
                    writer.writerow("")

In [15]:

if __name__ == '__main__':

    ace05_reader = ACE05Reader(lang='en')
    
    token_indexer = PretrainedBertIndexer(
        pretrained_model="bert-base-uncased",
#         max_pieces=config.max_seq_len,
#         do_lowercase=False               # for cased condition
    )
 
	# AllenNLP DatasetReader
    reader = RelationDatasetReader(
        is_training=True, 
        ace05_reader=ace05_reader, 
        tokenizer=lambda s: token_indexer.wordpiece_tokenizer(s),
        token_indexers={"tokens": token_indexer}
    )

    train_ds = reader.read(train_path)
#     print(len(train_ds))
#     for e in range(20):
#         print(len(vars(train_ds[e].fields['tokens'])['tokens']))
#         print(vars(train_ds[e].fields['tokens']))
#         print(len(vars(train_ds[0].fields['arg_idx'])['array']))
#         print(vars(train_ds[e].fields['arg_idx']))
#         print(vars(train_ds[e].fields['label']))
    
    vocab = Vocabulary()
    iterator = BucketIterator(batch_size=config.batch_size, sorting_keys=[("tokens", "num_tokens")])
    iterator.index_with(vocab)

    bert_embedder = PretrainedBertEmbedder(
        pretrained_model="bert-base-uncased",
        top_layer_only=True, # conserve memory   
    )
    word_embeddings: TextFieldEmbedder = BasicTextFieldEmbedder({"tokens": bert_embedder},
                                                                # we'll be ignoring masks so we'll need to set this to True
                                                               allow_unmatched_keys = True)
    model = BERT(word_embeddings)
    if USE_GPU:
        model.cuda()

    optimizer = optim.Adam(model.parameters(), lr=config.lr)

0it [00:00, ?it/s]
  0%|          | 0/351 [00:00<?, ?it/s][A
  4%|▍         | 14/351 [00:00<00:02, 139.17it/s][A
  9%|▉         | 32/351 [00:00<00:02, 148.65it/s][A
 14%|█▎        | 48/351 [00:00<00:02, 149.94it/s][A
 17%|█▋        | 61/351 [00:00<00:02, 137.20it/s][A
 23%|██▎       | 81/351 [00:00<00:01, 147.92it/s][A
 28%|██▊       | 97/351 [00:00<00:01, 150.80it/s][A
 32%|███▏      | 112/351 [00:00<00:01, 148.89it/s][A
 36%|███▌      | 126/351 [00:00<00:01, 139.36it/s][A
 40%|███▉      | 140/351 [00:00<00:01, 135.82it/s][A
 46%|████▌     | 161/351 [00:01<00:01, 150.69it/s][A
 50%|█████     | 177/351 [00:01<00:01, 144.25it/s][A
 56%|█████▌    | 195/351 [00:01<00:01, 151.78it/s][A
 61%|██████▏   | 215/351 [00:01<00:00, 160.59it/s][A
 66%|██████▌   | 232/351 [00:01<00:00, 143.50it/s][A
 71%|███████   | 248/351 [00:01<00:00, 141.63it/s][A
 75%|███████▍  | 263/351 [00:01<00:00, 137.91it/s][A
 81%|████████▏ | 286/351 [00:01<00:00, 156.25it/s][A
 86%|████████▋ | 303/351 [

ORG-AFF
FBI
FBI
error! relation argument positions error!
ORG-AFF
Department
Department
error! relation argument positions error!
ORG-AFF
CIA
CIA
error! relation argument positions error!


67082it [01:46, 630.57it/s] 


67082
23
{'tokens': [wood, ##ruff, a, reminder, ,, [PER], [PER], coming, back, to, the, [GPE], [GPE], within, the, hour, ., [SEP], jessica, lynch, [SEP], united, states], '_token_indexers': {'tokens': <allennlp.data.token_indexers.wordpiece_indexer.PretrainedBertIndexer object at 0x7f576adf6c18>}, '_indexed_tokens': None, '_indexer_name_to_indexed_token': None, '_token_index_to_indexer_name': None}
19
{'array': tensor([[ -6, -12],
        [ -5, -11],
        [ -4, -10],
        [ -3,  -9],
        [ -2,  -8],
        [ -1,  -7],
        [  0,  -6],
        [  0,  -5],
        [  1,  -4],
        [  2,  -3],
        [  3,  -2],
        [  4,  -1],
        [  5,   0],
        [  6,   0],
        [  7,   1],
        [  8,   2],
        [  9,   3],
        [ 10,   4],
        [ 11,   5]], dtype=torch.int32), 'padding_value': 0, 'dtype': <class 'numpy.float32'>}
{'label': 0, '_label_namespace': 'labels', '_label_id': 0}
23
{'tokens': [[PER], [PER], a, reminder, ,, jessica, lynch, coming, ba

AttributeError: 'Config' object has no attribute 'hidden_sz'

In [None]:
    # training
    from allennlp.training.trainer import Trainer

    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        iterator=iterator,
        train_dataset=train_ds,
        cuda_device=0 if USE_GPU else -1,
        num_epochs=config.epochs,
    )

In [None]:
    # train the model 
#     metrics = trainer.train()

In [None]:
    # load model
    model.load_state_dict(T.load(model_folder + "/model.th"))

In [None]:
    # save 
    with open(model_folder+'model.th', 'wb') as f:
        T.save(model.state_dict(), f)

In [None]:
    # training data analysis
    seq_iterator = BasicIterator(batch_size=config.batch_size)
    seq_iterator.index_with(vocab)
    
    predictor = Predictor(model, seq_iterator, cuda_device=0 if USE_GPU else -1)
    train_preds = predictor.predict(train_ds) 
    
    label_types = [r_idx2label.get(i.fields['label'].label) for i in train_ds]
    predict_types = [r_idx2label.get(i) for i in np.argmax(train_preds, axis=-1)]
    err_analyze(train_ds, label_types, predict_types, "train")

In [None]:
    plot_comfusion_matrix(label_types, predict_types, output_path, "train_full")

In [None]:
    # testing data analysis
    
    # AllenNLP DatasetReader
    reader = RelationDatasetReader(
        is_training=True, 
        ace05_reader=ace05_reader, 
        tokenizer=lambda s: token_indexer.wordpiece_tokenizer(s),
        token_indexers={"tokens": token_indexer}
    )
    
    test_ds = reader.read(test_path)
    print(len(test_ds))
    seq_iterator = BasicIterator(batch_size=config.batch_size)
    seq_iterator.index_with(vocab)
    
    predictor = Predictor(model, seq_iterator, cuda_device=0 if USE_GPU else -1)
    test_preds = predictor.predict(test_ds) 
    
    label_types = [r_idx2label.get(i.fields['label'].label) for i in test_ds]
    predict_types = [r_idx2label.get(i) for i in np.argmax(test_preds, axis=-1)]  
    
    err_analyze(test_ds, label_types, predict_types, "test")

In [None]:
    plot_comfusion_matrix(label_types, predict_types, output_path, "test_full")