In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import sys
sys.path.append('../../../')

In [3]:
import argparse
import logging
import os
import numpy as np
import torch
from src.ner import utils
from src.booster.progressive_encoder import CharEncoder, EntityEncoder, WordEncoder
from src.booster.progNN.net import LSTMCRF
from src.ner.model.data_loader import DataLoader

In [22]:
from src.ner.data import StringIterator
from collections import defaultdict

In [46]:
from spacy import displacy

In [47]:
from nltk.tokenize import TreebankWordTokenizer as twt

In [5]:
# 1. set the device to train on
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [6]:
# 2. Load the parameters from json file
model_dir = 'C:\\Users\\sgupta\\Thesis\\thesis-sarthak\\src\\ner\\experiments\\disease\\st_fracs\\germeval\\st_germeval_100'
network_params = os.path.join(model_dir, 'params.json')
assert os.path.isfile(network_params), "No json configuration file found at {}".format(network_params)
params = utils.Params(network_params)
# use GPU if available
params.cuda = torch.cuda.is_available()

In [7]:
# 3. Set the random seed for reproducible experiments
cuda = True
torch.manual_seed(230)
if cuda: torch.cuda.manual_seed(230)
np.random.seed(0)

In [8]:
# 4. Set the logger
utils.set_logger(os.path.join(model_dir, 'train.log'))

In [9]:
from collections import OrderedDict

data_encoder = utils.load_obj(os.path.join(model_dir, 'data_encoder.pkl'))
label_encoder = utils.load_obj(os.path.join(model_dir, 'label_encoder.pkl'))

In [135]:
def encode(input_string):
    sentences = []
    data = {}
    input_string = StringIterator(input_string)
    sentences_string = input_string.list_of_sentences
    spans = input_string.spans
    for sentence in input_string.get_next_X():
        sentences_string.append(sentence)
        s = OrderedDict()
        for feature_name, encoder in data_encoder.items():
            s[feature_name] = encoder.encode(sentence)
        s['timesteps'] = len(sentence)
        s['word_length'] = [len(list(w)) for w in sentence]
        sentences.append(s)
    data['data'] = sentences
    data['labels'] = None
    data['size'] = len(sentences)
    
    
    # sort the length in descending order of sentence_length
    sorted_data = [b for b in sorted(enumerate(sentences),
                                     key=lambda k: k[1]['timesteps'],
                                     reverse=True)]
    sorted_index = [i[0] for i in sorted_data]
    sentences_string = [sentences_string[i] for i in sorted_index]
    spans = [spans[i] for i in sorted_index]
    sentences = [i[1] for i in sorted_data]
    # compute length of longest sentence in batch
    batch_max_len = max([s['timesteps'] for s in sentences])

    # PAD
    for s in sentences:
        s['word_length'].extend([-1]*(batch_max_len-len(s['word_length'])))
        for feature_name, encoded_value in s.items():
            for encoder_name, encoder in data_encoder.items():
                if encoder_name == feature_name:
                    encoder.pad(encoded_value, batch_max_len)
                    break
    
    super_data_dict = defaultdict(list)
    for s in sentences:
        for k, v in s.items():
            super_data_dict[k].append(v)
    super_label_dict = defaultdict(list)

    # convert the lists to torch.tensor
    for encoder_name, encoded_value in super_data_dict.items():
        super_data_dict[encoder_name] = torch.tensor(encoded_value).cuda() if params.cuda else torch.tensor(encoded_value)
    
    return super_data_dict, spans, sentences_string
                                                                                                    

In [136]:
txt = "Munich Re Group or Munich Reinsurance Company is a reinsurance company based in Munich, Germany. It is one of the world’s leading reinsurers. ERGO, a Munich Re subsidiary, is the Group’s primary insurance arm."

In [137]:
input_X, spans, strings = encode(txt)

In [138]:
strings, spans

(['Munich Re Group or Munich Reinsurance Company is a reinsurance company based in Munich, Germany.',
  'ERGO, a Munich Re subsidiary, is the Group’s primary insurance arm.',
  'It is one of the world’s leading reinsurers.'],
 [[(0, 6),
   (7, 9),
   (10, 15),
   (16, 18),
   (19, 25),
   (26, 37),
   (38, 45),
   (46, 48),
   (49, 50),
   (51, 62),
   (63, 70),
   (71, 76),
   (77, 79),
   (80, 86),
   (86, 87),
   (88, 95),
   (95, 96)],
  [(0, 4),
   (4, 5),
   (6, 7),
   (8, 14),
   (15, 17),
   (18, 28),
   (28, 29),
   (30, 32),
   (33, 36),
   (37, 42),
   (42, 43),
   (43, 44),
   (45, 52),
   (53, 62),
   (63, 66),
   (66, 67)],
  [(0, 2),
   (3, 5),
   (6, 9),
   (10, 12),
   (13, 16),
   (17, 22),
   (22, 23),
   (23, 24),
   (25, 32),
   (33, 43),
   (43, 44)]])

In [33]:
# 6. Modeling
# 6.1 Define the model
model = LSTMCRF(params=params,
                char_vocab_length=data_encoder[CharEncoder.FEATURE_NAME].vocab_length,
                num_tags=label_encoder[EntityEncoder.FEATURE_NAME].num_tags,
                pretrained_word_vecs=torch.from_numpy(data_encoder[WordEncoder.FEATURE_NAME].vectors),
                dropout=params.dropout,
                decoder_type=params.decoder,
                bidirectional=True,
                freeze_embeddings=False).to(device).float()

# 6.2 fetch loss function and metrics
from src.ner.evaluation import accuracy_score, f1_score, precision_score, recall_score

metrics = {'accuracy': accuracy_score,
           'f1_score': f1_score,
           'precision_score': precision_score,
           'recall_score': recall_score}

utils.load_checkpoint(os.path.join(model_dir, 'best' + '.pth'), model)

  "num_layers={}".format(dropout, num_layers))


{'epoch': 66,
 'state_dict': OrderedDict([('char_embedding.weight',
               tensor([[ 0.0733, -0.0193,  0.0601,  ...,  0.0652,  0.1896, -0.1670],
                       [-0.1489,  0.1734, -0.0277,  ..., -0.1560, -0.0213, -0.1616],
                       [-0.1689, -0.2559, -0.2886,  ...,  0.1633, -0.2752,  0.2653],
                       ...,
                       [-0.0584, -0.1693,  0.1844,  ..., -0.2901, -0.0645,  0.2280],
                       [-0.0342, -0.2627, -0.0259,  ..., -0.0306, -0.0787, -0.2843],
                       [ 0.1847,  0.1095, -0.0385,  ..., -0.1445,  0.0617,  0.0976]],
                      device='cuda:0')),
              ('char_cnn.weight', tensor([[[ 0.0413, -0.3038, -0.1553],
                        [ 0.1633,  0.3140,  0.0161],
                        [ 0.0537,  0.1553,  0.1492],
                        ...,
                        [ 0.2429,  0.0310,  0.1487],
                        [-0.2343, -0.2076, -0.4132],
                        [-0.1333, -0.20

In [95]:
preds = model.predict(X=input_X)
preds = [label_encoder[EntityEncoder.FEATURE_NAME].decode(l) for l in preds]
preds

  sequence_length = torch.tensor(sequence_length, dtype=torch.int)


[['B-ORG',
  'I-ORG',
  'E-ORG',
  'O',
  'B-ORG',
  'I-ORG',
  'E-ORG',
  'O',
  'O',
  'B-ORG',
  'I-ORG',
  'E-ORG',
  'O',
  'S-LOC',
  'O',
  'S-LOC',
  'O'],
 ['S-ORG',
  'O',
  'O',
  'B-ORG',
  'I-ORG',
  'E-ORG',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']]

In [139]:
all_ents = []
for span, pred in zip(spans, preds):
    sent_ents = []
    for s, p in zip(span, pred):
        sent_ents.append({'start':s[0],
                        'end':s[1],
                        'label':p.split('-')[-1]})
    all_ents.append(sent_ents)
all_ents

[[{'start': 0, 'end': 6, 'label': 'ORG'},
  {'start': 7, 'end': 9, 'label': 'ORG'},
  {'start': 10, 'end': 15, 'label': 'ORG'},
  {'start': 16, 'end': 18, 'label': 'O'},
  {'start': 19, 'end': 25, 'label': 'ORG'},
  {'start': 26, 'end': 37, 'label': 'ORG'},
  {'start': 38, 'end': 45, 'label': 'ORG'},
  {'start': 46, 'end': 48, 'label': 'O'},
  {'start': 49, 'end': 50, 'label': 'O'},
  {'start': 51, 'end': 62, 'label': 'ORG'},
  {'start': 63, 'end': 70, 'label': 'ORG'},
  {'start': 71, 'end': 76, 'label': 'ORG'},
  {'start': 77, 'end': 79, 'label': 'O'},
  {'start': 80, 'end': 86, 'label': 'LOC'},
  {'start': 86, 'end': 87, 'label': 'O'},
  {'start': 88, 'end': 95, 'label': 'LOC'},
  {'start': 95, 'end': 96, 'label': 'O'}],
 [{'start': 0, 'end': 4, 'label': 'ORG'},
  {'start': 4, 'end': 5, 'label': 'O'},
  {'start': 6, 'end': 7, 'label': 'O'},
  {'start': 8, 'end': 14, 'label': 'ORG'},
  {'start': 15, 'end': 17, 'label': 'ORG'},
  {'start': 18, 'end': 28, 'label': 'ORG'},
  {'start': 28

In [141]:
ex = [{"text": strings[1],
       "ents": all_ents[1],
       "title": None,
      "settings":{}}]
html = displacy.render(ex, style="ent", manual=True)