In [1]:
import sys
from pathlib import Path
main_path = Path().absolute().parent
data_path = main_path / 'data'
setting_path = main_path / 'setting_files'

sys.path.append(str(main_path / 'src'))

import torch
import yaml
import json
import pytorch_lightning as pl
from pytorch_lightning import seed_everything

from nlu_models import NLUModel
from nlu_utils import NLUDataModule

# Read the YAML training configuration once, then expose each top-level
# section under its own name for the cells below.
settings_file = setting_path / 'train_settings.yml'
with settings_file.open('r') as settings_stream:
    # NOTE(review): FullLoader can build more than plain data types; if this
    # file could ever come from an untrusted source, prefer yaml.safe_load.
    settings = yaml.load(settings_stream, Loader=yaml.FullLoader)

data_module_settings = settings['data_module']
model_settings = settings['model']
trainer_settings = settings['trainer']

# Each configured file name is resolved against data_path and handed to the
# data module as `<kind>_path`.
file_kwargs = {
    f'{kind}_path': data_path / data_module_settings[f'{kind}_file']
    for kind in ('train', 'valid', 'test', 'labels')
}
# These settings are forwarded unchanged under the same keyword names.
loader_kwargs = {
    key: data_module_settings[key]
    for key in ('batch_size', 'max_len', 'num_workers')
}
data_module = NLUDataModule(**file_kwargs, **loader_kwargs, seed=settings['seed'])

# Optional JSON file named by the model settings; weight_dict stays None when
# no 'weight_file' entry is configured.
weight_dict = None
if model_settings.get('weight_file') is not None:
    weight_path = data_path / model_settings['weight_file']
    with weight_path.open('r', encoding='utf-8') as weight_stream:
        weight_dict = json.load(weight_stream)

# Seed every RNG (including dataloader workers) only when a deterministic run
# was requested in the trainer settings.
deterministic = trainer_settings['deterministic']
if deterministic:
    seed_everything(seed=settings['seed'], workers=True)

# Trainer arguments are taken from the 'trainer' section of the settings.
trainer_kwargs = dict(
    gpus=trainer_settings['n_gpus'],
    max_epochs=trainer_settings['n_epochs'],
    num_sanity_val_steps=trainer_settings['num_sanity_val_steps'],
    log_every_n_steps=trainer_settings['log_every_n_steps'],
    deterministic=deterministic,
)
trainer = pl.Trainer(**trainer_kwargs)

# trainer.test(ckpt_path='best', datamodule=data_module)

# checkpoint_path = str(main_path / 'checkpoints' / 'nlu_simple' / 'best_model.ckpt')

# seed_everything(seed=settings['seed'])
# trainer = pl.Trainer(
#     gpus=trainer_settings['n_gpus'], 
#     max_epochs=trainer_settings['n_epochs'], 
#     num_sanity_val_steps=trainer_settings['num_sanity_val_steps'],
#     deterministic=True,
# )
# # checkpoint = torch.load(checkpoint_path)
# model = NLUModel.load_from_checkpoint(checkpoint_path)
# # model.load_state_dict(checkpoint['state_dict'])
# model.eval()
# print()

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


In [2]:
from nlu_utils import NLUTokenizer

# Project tokenizer, built with its defaults. Later cells call it directly to
# encode text and also use its bert_decode, spacy_tokenize and spacy_nlp members.
tokenizer = NLUTokenizer()

In [21]:
# Known (run directory, checkpoint file) pairs, keyed by position; model_idx
# selects one when the lookup line below is re-enabled.
model_idx = 8
model_dict = dict(enumerate([
    ('ce_l7_tk3_0', 'epoch=11-step=167-val_loss=1.74.ckpt'),     # 0
    ('ce_l7_tk3_1', 'epoch=8-step=125-val_loss=1.57.ckpt'),      # 1
    ('ce_l7_tk3_3', 'epoch=6-step=97-val_loss=1.82.ckpt'),       # 2
    ('focal_l7_tk3_0', 'epoch=17-step=251-val_loss=0.43.ckpt'),  # 3
    ('focal_l7_tk3_1', 'epoch=19-step=279-val_loss=0.35.ckpt'),  # 4
    ('focal_l7_tk3_3', 'epoch=18-step=265-val_loss=0.44.ckpt'),  # 5
    ('focal2_l7_tk3_0', 'epoch=18-step=265-val_loss=0.25.ckpt'), # 6
    ('focal2_l7_tk3_1', 'epoch=19-step=279-val_loss=0.17.ckpt'), # 7
    ('focal2_l7_tk3_3', 'epoch=18-step=265-val_loss=0.30.ckpt'), # 8
]))


# The table lookup is currently overridden by a hard-coded checkpoint.
# s = model_dict[model_idx]
s = ('ce_l7_tk3_3', 'best_model.ckpt')
print(s)
checkpoint_path = str(main_path / 'logs' / 'nlu_simple' / s[0] / 'checkpoints' / s[1])
model = NLUModel.load_from_checkpoint(checkpoint_path)
# Inference only: put the module in eval mode so train-time layers such as
# dropout are disabled and repeated runs give the same prediction.
model.eval()

data = {"text": "if the noncurrent assets reduced by 62 percent in this fiscal year, what will be the effect to assets?"}
text = data['text']
bert_encodes = tokenizer(
    text,
    add_special_tokens=True,
    truncation=True,
    max_length=64,
    return_tensors='pt'
)
# Forward pass only, so gradient tracking is not needed.
with torch.no_grad():
    o = model(**bert_encodes)
o

('ce_l7_tk3_3', 'best_model.ckpt')


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


{'tags_loss': None,
 'intent_loss': None,
 'tags_pred': tensor([ 0,  5,  5,  8,  9,  9,  9,  9,  6,  5, 12, 13,  5,  5, 14, 15,  5,  5,
          5,  5,  5,  5,  5,  8,  5,  0]),
 'intent_pred': tensor([1])}

In [37]:
# Load the label vocabularies and derive the reverse tag lookup.
labels_file = data_path / 'labels_simple.json'
with labels_file.open('r', encoding='utf-8') as labels_stream:
    ls = json.load(labels_stream)
tags2id, intent2id = ls['tags'], ls['intent']
# Invert tag -> id so predicted ids can be mapped back to tag strings.
id2tags = {tag_id: tag for tag, tag_id in tags2id.items()}

In [38]:
# Display the id -> tag mapping built from tags2id.
id2tags

{100: '[UNK]',
 102: '[SEP]',
 0: '[PAD]',
 101: '[CLS]',
 103: '[MASK]',
 5: 'O',
 6: 'B-APPLY',
 7: 'I-APPLY',
 8: 'B-BS',
 9: 'I-BS',
 10: 'B-IS',
 11: 'I-IS',
 12: 'B-PERCENT',
 13: 'I-PERCENT',
 14: 'B-TIME',
 15: 'I-TIME'}

In [48]:
# Display the token ids the tokenizer produced for `text` (one row for the single sentence).
bert_encodes['input_ids']

tensor([[  101,  2065,  1996,  2512, 10841, 14343,  3372,  7045,  4359,  2011,
          5786,  3867,  1999,  2023, 10807,  2095,  1010,  2054,  2097,  2022,
          1996,  3466,  2000,  7045,  1029,   102]])

In [51]:
from spacy.training import biluo_tags_to_spans, iob_to_biluo
from tokenizations import get_alignments 

In [56]:
# Tokenize the same sentence with the tokenizer's spacy_tokenize method
# (presumably spaCy-backed); the resulting list is aligned against the BERT
# word pieces with get_alignments.
spacy_tkns = tokenizer.spacy_tokenize(text)
print(spacy_tkns)

['if', 'the', 'noncurrent', 'assets', 'reduced', 'by', '62', 'percent', 'in', 'this', 'fiscal', 'year', ',', 'what', 'will', 'be', 'the', 'effect', 'to', 'assets', '?']


In [61]:
# Word-piece strings for the encoded sentence without the leading [CLS] and
# trailing [SEP]. Built here with the same expression the tagging loop further
# down uses, so this cell does not depend on a cell that appears later.
tkns = [tokenizer.bert_decode([tkn_id]) for tkn_id in bert_encodes['input_ids'][0, 1:-1]]
# a2b maps each word piece to spaCy token indices; b2a is the reverse mapping.
a2b, b2a = get_alignments(tkns, spacy_tkns)

In [64]:
# Display the word-piece -> spaCy-token alignment: entry i lists the indices in
# spacy_tkns that correspond to tkns[i].
a2b

[[0],
 [1],
 [2],
 [2],
 [2],
 [2],
 [3],
 [4],
 [5],
 [6],
 [7],
 [8],
 [9],
 [10],
 [11],
 [12],
 [13],
 [14],
 [15],
 [16],
 [17],
 [18],
 [19],
 [20]]

In [59]:
# entities and cur_ent are only referenced by the commented-out draft that follows this loop.
entities = []
cur_ent = ''

# Drop the first and last positions ([CLS]/[SEP]) from both sequences before pairing them.
piece_ids = bert_encodes['input_ids'][0, 1:-1]
pred_tag_ids = o['tags_pred'][1:-1].tolist()

tkns = []
tags = []
for tkn_id, tag_id in zip(piece_ids, pred_tag_ids):
    tkn = tokenizer.bert_decode([tkn_id])
    # .get leaves the tag as None when an id is missing from id2tags.
    tag = id2tags.get(tag_id)
    tkns.append(tkn)
    tags.append(tag)
    print(tkn, tag)
    # if tag in ['O', '[PAD]']:
    #     continue
    
    # scheme, ent = tag.split('-')
    # if scheme == 'B' and cur_ent != ent:
    #     txt = tkn
    #     cur_ent = ent
    # elif scheme == 'I':
    #     if tkn.startswith('##'):
    #         txt += tkn[2:]
    #     else:
    #         txt += f' {tkn}'
    # else:
    #     cur_ent = ''

    # print(tag, txt)
    # entities.append()


if O
the O
non B-BS
##cu I-BS
##rre I-BS
##nt I-BS
assets I-BS
reduced B-APPLY
by O
62 B-PERCENT
percent I-PERCENT
in O
this O
fiscal B-TIME
year I-TIME
, O
what O
will O
be O
the O
effect O
to O
assets B-BS
? O


In [34]:
# Display the tag -> id mapping loaded from the labels file.
tags2id

{'[UNK]': 100,
 '[SEP]': 102,
 '[PAD]': 0,
 '[CLS]': 101,
 '[MASK]': 103,
 'O': 5,
 'B-APPLY': 6,
 'I-APPLY': 7,
 'B-BS': 8,
 'I-BS': 9,
 'B-IS': 10,
 'I-IS': 11,
 'B-PERCENT': 12,
 'I-PERCENT': 13,
 'B-TIME': 14,
 'I-TIME': 15}

In [27]:
# Run the tokenizer's spacy_nlp pipeline on the same sentence and print each
# token with its entity type (empty string when the token has none),
# presumably for comparison with the model's predicted tags.
doc = tokenizer.spacy_nlp(text)
for x in doc:
    print(x, x.ent_type_)

if 
the 
noncurrent 
assets 
reduced 
by 
62 PERCENT
percent PERCENT
in 
this DATE
fiscal DATE
year DATE
, 
what 
will 
be 
the 
effect 
to 
assets 
? 


In [24]:
# NOTE(review): relies on `x` left over from a `for x in doc` loop in another
# cell; on a fresh kernel this fails unless that loop has run first.
x.ent_type_

''