In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import defaultdict
import torch
import re
from tqdm.notebook import tqdm
import sys

print(torch.__version__)
print(torch.cuda.is_available())
src_path = Path('.').absolute().parent
data_path = src_path / 'data'
sys.path.append(str(src_path))

import yaml
import networkx as nx
from src.ontology import OntologySystem

with (src_path / 'setting_files' / 'app_settings.yml').open('r') as file:
    settings = yaml.load(file, Loader=yaml.FullLoader)

onto = OntologySystem(
    acc_name_path=data_path / 'AccountName.csv', 
    rdf_path=data_path / 'AccountRDF.xml',
    model_path=data_path / settings['ontology']['model']['model_name'],
    kwargs_graph_drawer=settings['ontology']['graph_drawer']
)
ACC_DICT = onto.ACC_DICT

1.10.1
True


# Test for guessing masking tokens

In [None]:
from transformers import BertForMaskedLM, BertTokenizerFast

model_path = 'bert-base-uncased'
tokenizer = BertTokenizerFast.from_pretrained(model_path)
model = BertForMaskedLM.from_pretrained(model_path)

## Question 1

Asking information based on fact and knowledge

In [None]:
# Question 1
# what is the Cost of sales ratio in last year?
threshold = 0.01
exceptions = ['BalanceSheet', 'IncomeStatement', 'CalendarOneYear']
times = ['year', 'quarter']
sentence_format = "[MASK] is the {} in the [MASK] {}?"
n_top = 15

predicted_tokens_dict = defaultdict(set)
progress_bar = tqdm(total=((len(ACC_DICT) - len(exceptions)) * len(times)))

for acc, dic in ACC_DICT.items():
    if acc in exceptions:
        continue
    account_name = dic['eng_name'].lower()
    for t in times:
        s = sentence_format.format(account_name, t.lower())

        inputs = tokenizer(s, padding=True, truncation=True, return_token_type_ids=True, return_tensors='pt')
        inputs_tensors = inputs['input_ids']
        masked = inputs_tensors.eq(tokenizer.mask_token_id)
        outputs = model(**inputs).logits[masked]
        logits_top = outputs.argsort(descending=True)[:, :n_top]
        probs_top = outputs.softmax(1).gather(1, logits_top)

        for i, m in enumerate(probs_top >= threshold):
            # tkns.append([k.item() for k in logits_top[i, m]])
            for k in logits_top[i, m]:
                tkn = tokenizer.decode(k)
                if len(re.findall(r'(\")', tkn)) == 0:
                    predicted_tokens_dict[f'[MASK]-{i}-{t}'].add(tkn)
        
        progress_bar.update(1)

In [None]:
with (data_path / 'tkns.csv').open('w') as file:
    for k, v in predicted_tokens_dict.items():
        print(','.join([k] + list(v)), file=file)

## Question 2

What if: Analysis based on fact

In [40]:
knowledge = 'BS'
knowledge_query = onto.sparql.get_predefined_knowledge(knowledge=knowledge+'R')
results = onto.sparql.query(knowledge_query)
nx_graph = onto.get_nx_graph(results)
sub_tree = nx.bfs_successors(nx_graph, source='BalanceSheet')
sub_tree = dict(sub_tree)

In [41]:
# Question 2
# what happens to the operating income when the cost of sales increases by 10% this year?

threshold = 0.01
exceptions = ['BalanceSheet', 'IncomeStatement', 'CalendarOneYear']
times = ['year', 'quarter']
# sentence_format = "what [MASK] to the {} when the {} [MASK] by {} {} in the [MASK] {}?"
sentence_format = "what will be the effect to {} if the {} [MASK] by {} {} in the [MASK] {}?"
n_top = 15
successors = []
predicted_tokens_dict = defaultdict(set)
progress_bar = tqdm()

for sub_acc, accs in sub_tree.items():
    if sub_acc in exceptions:
        continue
    sub_acc_name = ACC_DICT[sub_acc]['eng_name'].lower()
    successors.extend(accs)
    for acc in successors:
        account_name = ACC_DICT[acc]['eng_name'].lower()
        for t in times:
            s = sentence_format.format(
                account_name, sub_acc_name, 
                np.random.randint(1, 50, (1,))[0], np.random.choice(['percent', '%']),
                t.lower())

            inputs = tokenizer(s, padding=True, truncation=True, return_token_type_ids=True, return_tensors='pt')
            inputs_tensors = inputs['input_ids']
            masked = inputs_tensors.eq(tokenizer.mask_token_id)
            outputs = model(**inputs).logits[masked]
            logits_top = outputs.argsort(descending=True)[:, :n_top]
            probs_top = outputs.softmax(1).gather(1, logits_top)
            for i, m in enumerate(probs_top >= threshold):
                # tkns.append([k.item() for k in logits_top[i, m]])
                for k in logits_top[i, m]:
                    tkn = tokenizer.decode(k)
                    if len(re.findall(r'(\")', tkn)) == 0:
                        predicted_tokens_dict[f'[MASK]-{i}-{t}'].add(tkn)

            progress_bar.update(1)

0it [00:00, ?it/s]

In [42]:
with (data_path / 'tkns.csv').open('w') as file:
    for k, v in predicted_tokens_dict.items():
        print(','.join([k] + list(v)), file=file)

## Question 3

What if: Forecasting with embedded ML

In [44]:
# Question 3
# what will be our revenue in the 4th quarter?

threshold = 0.01
exceptions = ['BalanceSheet', 'IncomeStatement', 'CalendarOneYear']
times = ['year', 'quarter']
sentence_format = "[MASK] will be the {} in the [MASK] {}?"
# sentence_format = "how is the {} going to be in the [MASK] {}?"
n_top = 15

predicted_tokens_dict = defaultdict(set)
progress_bar = tqdm(total=((len(ACC_DICT) - len(exceptions)) * len(times)))

for acc, dic in ACC_DICT.items():
    if acc in exceptions:
        continue
    account_name = dic['eng_name'].lower()
    for t in times:
        s = sentence_format.format(account_name, t.lower())

        inputs = tokenizer(s, padding=True, truncation=True, return_token_type_ids=True, return_tensors='pt')
        inputs_tensors = inputs['input_ids']
        masked = inputs_tensors.eq(tokenizer.mask_token_id)
        outputs = model(**inputs).logits[masked]
        logits_top = outputs.argsort(descending=True)[:, :n_top]
        probs_top = outputs.softmax(1).gather(1, logits_top)

        for i, m in enumerate(probs_top >= threshold):
            # tkns.append([k.item() for k in logits_top[i, m]])
            for k in logits_top[i, m]:
                tkn = tokenizer.decode(k)
                if len(re.findall(r'(\")', tkn)) == 0:
                    predicted_tokens_dict[f'[MASK]-{i}-{t}'].add(tkn)
        
        progress_bar.update(1)

with (data_path / 'tkns.csv').open('w') as file:
    for k, v in predicted_tokens_dict.items():
        print(','.join([k] + list(v)), file=file)

  0%|          | 0/78 [00:00<?, ?it/s]

---

# Create dataset

In [5]:
# TODO: position 만들기
# ("I was driving a BMW", {"entities": [(16,19, "PRODUCT")]})

def get_entity(s, x, tag):
    idx = s.index(x)
    return (idx, idx+len(x), tag)

def random_sampling(x_dict, x_key):
    idx_range = np.arange(len(x_dict[x_key]))
    idx = np.random.choice(idx_range, replace=False, p=np.ones(len(idx_range)) / len(idx_range))
    word, tag, desc = x_dict[x_key][idx]
    return word, tag, desc

def get_words_filtered(words, text):
    words_filtered = defaultdict(list)
    for k, v in words.items():
        for word, tag, desc in v:
            if desc != text:
                words_filtered[k].append((word, tag, desc))
    return words_filtered

df = pd.read_csv(data_path / 'AccountWords.csv', encoding='utf-8')

format_dict = {
    0: ['help'],
    1: [
        # what/how, target_account, [MASK] + year/quarter
        "{} is the {} in the {} ?",
        # [MASK] + year/quarter, what/how, target_account
        "In the {}, {} is the value of the {} ?"
    ], 
    2: [
        # target_account, subject_account, [MASK], random_number + percent/%, [MASK] + year/quarter
        "what happens to the {} when the {} {} by {} in the {} ?",
        # target_account, subject_account, [MASK], random_number + percent/%, [MASK] + year/quarter
        "what will be the effect to {} if the {} {} by {} in the {} ?",
        # reverse the relation
        # subject_account, [MASK], random_number + percent/%, [MASK] + year/quarter, target_account
        "when the {} {} by {} in the {}, what will happen to the {} ?",
        # subject_account, [MASK], random_number + percent/%, [MASK] + year/quarter, target_account
        "if the {} {} by {} in the {}, what will be the effect to {} ?"
    ],
    3: [
        # what/how, target_account, [MASK] + year/quarter
        "{} will be the {} in the {} ?"
    ]
}

# TODO: maybe add the today's information after [SEP]?
context = ['HELP', 'PAST', 'FUTURE']
words = defaultdict(list)
for typ in ['year', 'quarter', 'words']:
    df_temp = df.loc[:, [typ, f'{typ}_tag', f'{typ}_desc']]
    df_temp = df_temp.loc[~df_temp[typ].isna(), :]
    for i, (w, t, desc) in df_temp.iterrows():
        words[typ].append((w, t, desc))

exceptions = ['BalanceSheet', 'IncomeStatement', 'Ratios', 'CalendarOneYear']
times = ['year', 'quarter']

all_data = []
s_ENT = '[E]'
e_ENT = '[/E]'
f_ENT = lambda x: f'{s_ENT}{x}{e_ENT}'


## Question 1

```python
# what/how, target_account, [MASK] + year/quarter
"{} is the {} in the {}?",
```

In [104]:
data1 = []
trg_scenario = 1
progress_bar = tqdm()
words_filtered = get_words_filtered(words, text='FUTURE')
for idx_fmt, fmt in enumerate(format_dict[trg_scenario]):
    
    for acc, dic in ACC_DICT.items():
        if acc in exceptions:
            continue
        target_account = dic['eng_name'].lower()
        knowledge, acc_type, _ = dic['group'].split('-')

        for t in ['year', 'quarter']:
            for t_word, t_tag, _ in words_filtered[t]:
                entities = []
                pre_token = np.random.choice(['what', 'how'], replace=False, p=np.ones(2)/2)
                if idx_fmt == 0:
                    # what/how, target_account, [MASK] + year/quarter
                    # "{} is the {} in the {}?",
                    s = fmt.format(
                        pre_token,
                        f_ENT(target_account), 
                        f_ENT(f'{t_word} {t}')
                        )
                else:
                    # [MASK] + year/quarter, what/how, target_account
                    # "In the {}, {} is the value of the {}"
                    s = fmt.format(
                        f_ENT(f'{t_word} {t}'),
                        pre_token,
                        f_ENT(target_account)
                    )
                # relation = [0, 0, 0]  # no_relation, order1, order2
                # entities
                ## target_account
                entities.append(get_entity(s, f_ENT(target_account), f'{knowledge}.{acc_type}'))
                ## MASK year/quarter
                entities.append(get_entity(s, f_ENT(f'{t_word} {t}'), t_tag))
                
                data1.append(
                    {'question': s, 'entities': sorted(entities, key=lambda x: x[0]), 'intent': 'PAST.value'} #, 'relation': relation}
                )
            
                progress_bar.update(1)

0it [00:00, ?it/s]

## Question 2

```python
# target_account, subject_account, [MASK], random_number + percent/%, [MASK] + year/quarter
"what happens to the {} when the {} {} by {} in the {}?"
# target_account, subject_account, [MASK], random_number + percent/%, [MASK] + year/quarter
"what will be the effect to {} if the {} {} by {} in the {}?"
# reverse the relation
# subject_account, [MASK], random_number + percent/%, [MASK] + year/quarter, target_account
"when the {} {} by {} in the {}, what will happen to the {}?"
# subject_account, [MASK], random_number + percent/%, [MASK] + year/quarter, target_account
"if the {} {} by {} in the {}, what will be the effect to {}?"
```

In [106]:
def get_role_dict(onto, knowledge):
    knowledge_query = onto.sparql.get_predefined_knowledge(knowledge=knowledge)
    sparql_results = onto.sparql.query(knowledge_query)
    role_dict = defaultdict(list)
    for s, p, o in sparql_results:
        s, p, o = map(onto.graph_drawer.convert_to_string, [s, p, o])
        if s == 'CalendarOneYear' or o == 'CalendarOneYear':
            continue
        if s not in role_dict[o]:
            role_dict[o].append(s)
        
    return role_dict

def process_successor(successors, role_dict, trg_acc, acc):
    if role_dict.get(acc) is None:
        # successors[trg_acc].extend(successors[acc])
        return None
    else:
        accs = role_dict.get(acc)
        if accs is not None:
            successors[trg_acc].extend(accs)
            for acc in accs:
                process_successor(successors, role_dict, trg_acc, acc)

def get_successor(onto, knowledge, exceptions=None):
    role_dict = get_role_dict(onto, knowledge=knowledge)
    successors = defaultdict(list)
    for trg_acc in role_dict.keys():
        if (exceptions is not None) and (trg_acc in exceptions):
            continue
        process_successor(successors, role_dict, trg_acc, trg_acc)
    return successors

trg_scenario = 2
bs_successors = get_successor(sparql, 'BS', exceptions)
is_successors = get_successor(sparql, 'IS', exceptions)
data2 = []
n_sample = 5
progress_bar = tqdm()
words_filtered = get_words_filtered(words, text='FUTURE')

for idx_fmt, fmt in enumerate(format_dict[trg_scenario]):
    for sub_tree in [bs_successors, is_successors]:
        for trg_acc, successors in sub_tree.items():
            if trg_acc in exceptions:
                continue
            target_account = ACC_DICT[trg_acc]['eng_name'].lower()
            target_knowledge, target_acc_type, _ = ACC_DICT[trg_acc]['group'].split('-')
            for sub_acc in successors:
                subject_account = ACC_DICT[sub_acc]['eng_name'].lower()
                subject_knowledge, subject_acc_type, _ = ACC_DICT[trg_acc]['group'].split('-')
                n = 0
                while n < n_sample:
                    entities = []

                    apply_word, apply_tag, _ = random_sampling(x_dict=words_filtered, x_key='words')
                    t = np.random.choice(times, replace=False, p=np.ones(len(times))/len(times))
                    t_word, t_tag, _ = random_sampling(x_dict=words_filtered, x_key=t)
                    
                    number = np.random.randint(1, 99)
                    percent = np.random.choice(['percent', '%'], replace=False, p=np.ones(2)/2)
                    
                    if idx_fmt in [0, 1]:
                        # target_account, subject_account, [MASK], random_number + percent/%, [MASK] + year/quarter
                        s = fmt.format(
                            f_ENT(target_account),
                            f_ENT(subject_account), 
                            f_ENT(apply_word), 
                            f_ENT(f'{number} {percent}'),
                            f_ENT(f'{t_word} {t}')
                            )
                        # relation = [1, 1, 2]
                    else:
                        # subject_account, [MASK], random_number + percent/%, [MASK] + year/quarter, target_account
                        s = fmt.format(
                            f_ENT(subject_account), 
                            f_ENT(apply_word), 
                            f_ENT(f'{number} {percent}'),
                            f_ENT(f'{t_word} {t}'),
                            f_ENT(target_account)
                            )
                        # relation = [1, 2, 1]
                    # entities
                    ## target_account
                    entities.append(get_entity(s, f_ENT(target_account), f'{target_knowledge}.{target_acc_type}'))
                    ## subject_account
                    entities.append(get_entity(s, f_ENT(subject_account), f'{subject_knowledge}.{subject_acc_type}'))
                    ## MASK apply words
                    entities.append(get_entity(s, f_ENT(apply_word), apply_tag))
                    ## percentages
                    entities.append(get_entity(s, f_ENT(f'{number} {percent}'), 'PERCENT'))
                    ## MASK year/quarter
                    entities.append(get_entity(s, f_ENT(f'{t_word} {t}'), t_tag))

                    d = {'question': s, 'entities': sorted(entities, key=lambda x: x[0]), 'intent': 'IF.fact'} #, 'relation': relation}
                    if d not in data2:
                        data2.append(
                            d
                        )
                    
                    progress_bar.update(1)
                    n += 1

## Question 3

```python
# what/how, target_account, [MASK] + year/quarter
"{} will be the {} in the {}?"
```

In [109]:
data3 = []
trg_scenario = 3
progress_bar = tqdm()
words_filtered = get_words_filtered(words, text='PAST')

for fmt in format_dict[trg_scenario]:
    for acc, dic in ACC_DICT.items():
        if acc in exceptions:
            continue
        target_account = dic['eng_name'].lower()
        knowledge, acc_type, _ = dic['group'].split('-')
        for t in ['year', 'quarter']:
            for t_word, t_tag, _ in words_filtered[t]:
                entities = []
                s = fmt.format(
                    np.random.choice(['what', 'how']), 
                    f_ENT(target_account), 
                    f_ENT(f'{t_word} {t}')
                    )
                # relation = [0, 0, 0]
                # entities
                ## target_account
                entities.append(get_entity(s, f_ENT(target_account), f'{knowledge}.{acc_type}'))
                ## MASK year/quarter
                entities.append(get_entity(s, f_ENT(f'{t_word} {t}'), t_tag))
                
                data3.append(
                    {'question': s, 'entities': entities, 'intent': 'IF.forecast'} #, 'relation': relation}
                )
                
                progress_bar.update(1)

all_data = data1 + data2 + data3

0it [00:00, ?it/s]

1014


---

# Post-process for entities

In [None]:
import json

special_len = len(s_ENT)+len(e_ENT)

for k, x in tqdm(enumerate(all_data), total=len(all_data)):
    all_data[k]['question'] = x['question'].replace(s_ENT, '').replace(e_ENT, '')
    for i, (s, e, ent) in enumerate(x['entities']):
        new_s = s-i*special_len
        new_e = new_s+(e-s)-special_len
        all_data[k]['entities'][i] = (new_s, new_e, ent)

with (data_path / 'all_data.jsonl').open('w', encoding='utf-8') as file:
    for line in tqdm(all_data, total=len(all_data), desc='saving'):
        file.write(json.dumps(line) + '\n')

---

# [Debugging]

In [5]:
import json

with (data_path / 'labels.json').open('r', encoding='utf-8') as file:
    labels = json.load(file)

tags2id = {v: k for k, v in enumerate(labels['tags'])}
intents2id = {v: k for k, v in enumerate(labels['intent'])}

In [18]:
index = 2314
data = train_data[index]
text = data['text']
ents = data['entities']
intent = data['intent']

In [19]:
tokens = nlu_tokenizer.tokenize(text)
offset_mapping = nlu_tokenizer.str_to_offset_mapping(text)

In [35]:
tags = nlu_tokenizer.get_biluo_tags(tokens, offset_mapping, ents)
tags = biluo_to_iob(tags)
print(tokens)
print(tags)

['when', 'the', 'advances', 'customers', 'rise', 'by', '94', 'percent', 'in', 'the', 'calendar', 'year', ',', 'what', 'will', 'happen', 'to', 'the', 'lia', '##bilities', 'and', 'e', '##qui', '##ties', '?']
['O', 'O', 'B-BS.Value', 'I-BS.Value', 'B-APPLY', 'O', 'B-PERCENT', 'I-PERCENT', 'O', 'O', 'B-TIME', 'I-TIME', 'O', 'O', 'O', 'O', 'O', 'O', 'B-BS.Value', 'I-BS.Value', 'I-BS.Value', 'I-BS.Value', 'I-BS.Value', 'I-BS.Value', 'O']


In [23]:
e = nlu_tokenizer(
    text, 
    add_special_tokens=True, 
    padding='max_length', 
    truncation=True, 
    max_length=64
)

In [38]:
def pad_tags(tags, input_ids, pad_idx):
    padded_tags = [pad_idx] + tags + [pad_idx] + ([pad_idx] * (len(input_ids) - len(tags)))
    return padded_tags

In [146]:
tokens = nlu_tokenizer.tokenize(text)
offset_mapping = nlu_tokenizer.spacy_encode(text, pad_offset=False)['offset_mapping']

combine conll dataset and custom dataset into together

In [53]:
from transformers import BertForTokenClassification, BertTokenizerFast, BertConfig

model_path = 'bert-base-uncased'# 'dslim/bert-base-NER'
bert_tokenizer = BertTokenizerFast.from_pretrained(model_path)
bert_ner = BertForTokenClassification.from_pretrained(model_path)
# cfg = BertConfig.from_pretrained(model_path, label2id=, id2label=)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-u

# Traning

- Entities
- Entities Relation (subject, target)

## Dataset

In [14]:
from src.nlu_utils import NLUDataModule, NLUTokenizer, NLUDataset

main_path = Path().absolute().parent
data_path = main_path / 'data'
setting_path = main_path / 'setting_files'

with (setting_path / 'train_settings.yml').open('r') as file:
    settings = yaml.load(file, Loader=yaml.FullLoader)

nlu_tokenizer = NLUTokenizer()

data_module = NLUDataModule(
    train_path=data_path / settings['train_file'], 
    valid_path=data_path / settings['valid_file'],
    test_path=data_path / settings['test_file'],
    labels_path=data_path / settings['labels_file'],
    batch_size=settings['batch_size'], 
    max_len=settings['max_len'],
    num_workers=settings['num_workers'],
    seed=settings['seed']
)
data_module.prepare_data()

loading: 100%|██████████| 15220/15220 [00:00<00:00, 73096.93it/s]
loading: 100%|██████████| 3060/3060 [00:00<00:00, 204245.30it/s]
loading: 100%|██████████| 2996/2996 [00:00<00:00, 150195.84it/s]


---

In [15]:
train_data = data_module.train_data
train_dataset = NLUDataset(
    train_data, 
    tags2id=data_module.tags2id, 
    intents2id=data_module.intents2id,
)
train_dataloader = data_module.train_dataloader()

In [4]:
from datasets import load_dataset
conll = load_dataset('conll2003')

Reusing dataset conll2003 (C:\Users\simon\.cache\huggingface\datasets\conll2003\conll2003\1.0.0\40e7cb6bcc374f7c349c83acd1e9352a4f09474eb691f64f364ee62eb65d0ca6)
100%|██████████| 3/3 [00:00<00:00, 430.13it/s]


In [5]:
dataset = conll['train']
feature = dataset.features['ner_tags'].feature
errors = 0
x = dataset[1343]
text = ' '.join(x['tokens']).lower()
doc = nlu_tokenizer.spacy_nlp(text)

In [6]:
doc
spacy_tokens = list(map(str, doc))
tags = list(map(feature.int2str, x['ner_tags']))
bert_tokens = nlu_tokenizer.bert_tokenize(text)
original_tokens = list(map(str.lower, x['tokens']))

print(original_tokens)
print(tags)
print(spacy_tokens)
print(bert_tokens)
print(list(map(len, [original_tokens, tags, spacy_tokens, bert_tokens])))

['a', 'west', 'bank', 'bookseller', 'charged', 'on', 'thursday', 'that', 'the', 'palestinian', 'information', 'ministry', 'has', 'forced', 'him', 'to', 'sign', 'an', 'undertaking', 'not', 'to', 'distribute', 'books', 'written', 'by', 'critics', 'of', 'israeli-plo', 'self-rule', 'deals', '.']
['O', 'B-LOC', 'I-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'I-ORG', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-MISC', 'O', 'O', 'O']
['a', 'west', 'bank', 'bookseller', 'charged', 'on', 'thursday', 'that', 'the', 'palestinian', 'information', 'ministry', 'has', 'forced', 'him', 'to', 'sign', 'an', 'undertaking', 'not', 'to', 'distribute', 'books', 'written', 'by', 'critics', 'of', 'israeli', '-', 'plo', 'self', '-', 'rule', 'deals', '.']
['a', 'west', 'bank', 'books', '##eller', 'charged', 'on', 'thursday', 'that', 'the', 'palestinian', 'information', 'ministry', 'has', 'forced', 'him', 'to', 'sign', 'an', 'undertaking', 'not', 'to', 'distribute', 'bo

In [9]:
from tokenizations import get_alignments
from spacy.training import biluo_tags_to_offsets, iob_to_biluo, biluo_to_iob

In [60]:
print(mapped_tags)

['O', 'B-LOC', 'L-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'I-ORG', 'L-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-MISC', 'I-MISC', 'L-MISC', 'O', 'O', 'O', 'O', 'O']


In [62]:
entities = biluo_tags_to_offsets(doc, mapped_tags)
entities

[(2, 11, 'LOC'), (52, 84, 'ORG'), (169, 180, 'MISC')]

In [5]:
typ = 'train'
dataset = conll[typ]
feature = dataset.features['ner_tags'].feature
errors = 0
for x in tqdm(dataset, total=len(dataset), desc=f'{typ}set'):
    text = ' '.join(x['tokens']).lower()
    doc = nlu_tokenizer.spacy_nlp(text)

    tags = list(map(feature.int2str, x['ner_tags']))
    spacy_tokens = list(map(str, doc))
    original_tokens = list(map(str.lower, x['tokens']))
    mapped_tags = nlu_tokenizer.fix_tags_alignment(
        longer_tokens=spacy_tokens, shorter_tokens=original_tokens, tags=tags
    )

    entities = biluo_tags_to_offsets(doc, mapped_tags)
    if not entities:
        errors += 1
        continue

    d = {'text': text, 'entities': entities, 'intent': 'None'}

trainset:   0%|          | 0/14041 [00:00<?, ?it/s]

ValueError: [E067] Invalid BILUO tag sequence: Got a tag starting with L without a preceding 'B' (beginning of an entity). Tag sequence:
['O', 'B-PER', 'L-PER', 'L-PER']

In [7]:
print(original_tokens)
print(tags)
print(spacy_tokens)
print(mapped_tags)

['a', 'west', 'bank', 'bookseller', 'charged', 'on', 'thursday', 'that', 'the', 'palestinian', 'information', 'ministry', 'has', 'forced', 'him', 'to', 'sign', 'an', 'undertaking', 'not', 'to', 'distribute', 'books', 'written', 'by', 'critics', 'of', 'israeli-plo', 'self-rule', 'deals', '.']
['O', 'B-LOC', 'I-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'I-ORG', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-MISC', 'O', 'O', 'O']
['a', 'west', 'bank', 'bookseller', 'charged', 'on', 'thursday', 'that', 'the', 'palestinian', 'information', 'ministry', 'has', 'forced', 'him', 'to', 'sign', 'an', 'undertaking', 'not', 'to', 'distribute', 'books', 'written', 'by', 'critics', 'of', 'israeli', '-', 'plo', 'self', '-', 'rule', 'deals', '.']


NameError: name 'mapped_tags' is not defined

In [13]:
biluo_tags

['O',
 'B-PER',
 'L-PER',
 'O',
 'B-PER',
 'L-PER',
 'O',
 'B-PER',
 'L-PER',
 'O',
 'U-LOC',
 'O',
 'O']

In [10]:
a2b, b2a = get_alignments(a=original_tokens, b=spacy_tokens)

biluo_tags = iob_to_biluo(tags)
mapped_tags = ['-'] * len(spacy_tokens)
for i, tag in enumerate(biluo_tags):
    if tag == 'O':
        for k in a2b[i]:
            mapped_tags[k] = tag
        continue

    prefix, label = tag.split('-')
    if prefix == 'B':
        for j, k in enumerate(a2b[i]):
            if j == 0:
                mapped_tags[k] = tag
            else:
                mapped_tags[k] = f'I-{label}'
    elif prefix == 'L':
        for j, k in enumerate(a2b[i]):
            if j == len(a2b[i])-1:
                mapped_tags[k] = tag
            else:
                mapped_tags[k] = f'I-{label}'
    elif prefix == 'U':
        if len(a2b[i]) == 1:
            k = a2b[i][0]
            mapped_tags[k] = tag
        elif len(a2b[i]) == 2:
            b, l = a2b[i]
            mapped_tags[b] = f'B-{label}'
            mapped_tags[l] = f'L-{label}'
        else:
            for j, k in enumerate(a2b[i]):
                if j == 0:
                    mapped_tags[k] = f'B-{label}'
                elif j == len(a2b[i])-1:
                    mapped_tags[k] = f'L-{label}'
                else:
                    mapped_tags[k] = f'I-{label}'
    else:
        for j, k in enumerate(a2b[i]):
            mapped_tags[k] = tag

# TODO: augmentation 구현하기

In [54]:
from transformers import pipeline
from spacy.training import biluo_tags_to_offsets, iob_to_biluo, offsets_to_biluo_tags

s_ENT = '[E]'
e_ENT = '[/E]'
f_ENT = lambda x: f'{s_ENT}{x}{e_ENT}'
def get_entity(s, x, tag):
    idx = s.index(x)
    return (idx, idx+len(x), tag)

idx = 1468
text = train_data[idx]['text']
ents = train_data[idx]['entities']
intent = train_data[idx]['intent']
print(text)
print(ents)

what happens to the sales and selling general administrative ratio when the cost of sales ratio decreases by 17 % in the first quarter?
[[20, 66, 'IS.Ratio'], [76, 95, 'IS.Ratio'], [96, 105, 'APPLY'], [109, 113, 'PERCENT'], [121, 134, 'TIME']]


In [64]:
def remapping(text, ents)
    splitted = []
    text_copy = text[:]
    s_prev, e_prev = 0, 0
    for s, e, ent in ents:
        new_s = s-e_prev
        new_e = e-e_prev
        splitted.append(text_copy[:new_s])
        splitted.append(f_ENT(text_copy[new_s:new_e]))
        text_copy = text_copy[new_e:]
        s_prev = s
        e_prev = e
    new_text = ''.join(splitted)
    new_text

'what happens to the [E]sales and selling general administrative ratio[/E] when the [E]cost of sales ratio[/E] [E]decreases[/E] by [E]17 %[/E] in the [E]first quarter[/E]'

In [13]:
fillmask = pipeline('fill-mask', model="distilroberta-base")
mask_token = fillmask.tokenizer.mask_token

In [66]:
bert_tokens = nlu_tokenizer.bert_tokenize(text)

bert_offset_mapping = nlu_tokenizer.bert(text, add_special_tokens=False, return_offsets_mapping=True)['offset_mapping']
tags = nlu_tokenizer.offset_mapping_to_tags(offset_mapping=bert_offset_mapping, ents=ents)
tags = biluo_to_iob(tags)

In [67]:
masks = [i for i, t in enumerate(tags) if t == 'O']

In [74]:
K = np.random.choice(masks)
print(K)
# K = masks[-1]
masked_sentence = " ".join(bert_tokens[:K]  + [mask_token] + bert_tokens[K+1:])
predictions = fillmask(masked_sentence)
augmented_sequences = [predictions[i]['sequence'] for i in range(3) if predictions[i]['sequence'] != text]
for aug_s in augmented_sequences:
    nlu_tokenizer.bert_tokenize(aug_s)
augmented_sequences

2


['what happens with the sales and selling general administrative ratio when the cost of sales ratio decreases by 17 % in the first quarter?',
 'what happens about the sales and selling general administrative ratio when the cost of sales ratio decreases by 17 % in the first quarter?']

In [72]:
bert_tokens[11]

'the'

In [71]:
# offset mapping 통해서 다시 entities 매핑하기!
aug_s

'what happens to the sales and selling general administrative ratio when gross cost of sales ratio decreases by 17 % in the first quarter?'

In [47]:
aug_s

'what happens to the sales and selling general administrative ratio when the cost of sales ratio decreases by 17 % in the first quarter?"'

In [43]:
aug_s

'what happens to the sales and selling general administrative ratio when the cost of sales ratio decreases by 17 % in the first quarter?'

In [45]:
nlu_tokenizer.bert_tokenize(augmented_sequences[1])

['what',
 'happens',
 'to',
 'the',
 'sales',
 'and',
 'selling',
 'general',
 'administrative',
 'ratio',
 'when',
 'the',
 'cost',
 'of',
 'sales',
 'ratio',
 'decreases',
 'by',
 '17',
 '%',
 'in',
 'the',
 'first',
 'quarter',
 '?',
 '"']

In [35]:
nlu_tokenizer.bert_tokenize(augmented_sequences[0])

['what',
 'happens',
 'to',
 'the',
 'sales',
 'and',
 'selling',
 'general',
 'administrative',
 'ratio',
 'when',
 'the',
 'cost',
 'of',
 'sales',
 'ratio',
 'decreases',
 'by',
 '17',
 '%',
 'since',
 'the',
 'first',
 'quarter',
 '?']

In [28]:
augmented_sequences

['what happens to the sales and selling general administrative ratio when the cost of sales ratio decreases by 17 % since the first quarter?',
 'what happens to the sales and selling general administrative ratio when the cost of sales ratio decreases by 17 % over the first quarter?',
 'what happens to the sales and selling general administrative ratio when the cost of sales ratio decreases by 17 % in the first quarter?']

In [76]:
tags = nlu_tokenizer.get_tags(text, ents, tag_type='iob')
spacy_tokens = nlu_tokenizer.spacy_tokenize(text)
bert_tokens = nlu_tokenizer.bert_tokenize(text)
spanned_tags = nlu_tokenizer.map_spanned_tokens(
    longer_tokens=bert_tokens, shorter_token=spacy_tokens, tags=tags
)
bert_encodes = nlu_tokenizer(
    text, 
    add_special_tokens=True, 
    truncation=True, 
    max_length=256
)


In [77]:
outputs = []
for sentence in spanned_tags:
    words = sentence.split(' ')
    K = np.random.randint(1, len(words)-1)
    masked_sentence = " ".join(words[:K]  + [mask_token] + words[K+1:])
    predictions = fillmask(masked_sentence)
    augmented_sequences = [predictions[i]['sequence'] for i in range(3)]
    outputs += [sentence] + augmented_sequences

['O',
 'O',
 'O',
 'O',
 'B-IS.Ratio',
 'I-IS.Ratio',
 'O',
 'O',
 'B-IS.Ratio',
 'I-IS.Ratio',
 'I-IS.Ratio',
 'B-APPLY',
 'O',
 'B-PERCENT',
 'I-PERCENT',
 'O',
 'O',
 'B-TIME',
 'I-TIME',
 'O']

In [55]:
bert_offset_mapping = nlu_tokenizer.bert(text, add_special_tokens=False, return_offsets_mapping=True)['offset_mapping']
bert_tokens = nlu_tokenizer.bert_tokenize(text) 

In [24]:
spacy_tokens = nlu_tokenizer.spacy_tokenize(text)
bert_encodes = nlu_tokenizer.bert(text, add_special_tokens=False, return_offsets_mapping=True)
bert_offset_mapping = bert_encodes['offset_mapping']
# bert_tokens = [nlu_tokenizer.bert_decode([i]) for i in bert_encodes['input_ids']]
bert_tokens = nlu_tokenizer.bert_tokenize(text) 

In [70]:
spacy_tokens, bert_tokens

(['was', "n't"], ['wasn', "'", 't'])

In [71]:
index = 2314
data = train_data[index]
text = data['text']
ents = data['entities']
intent = data['intent']

encodes = nlu_tokenizer.bert.encode_plus(
    text, 
    add_special_tokens=True, 
    truncation=True, 
    max_length=256
)

In [72]:
train_dataset[0]

{'input_ids': [101,
  2129,
  2003,
  1996,
  2783,
  7045,
  1999,
  1996,
  2034,
  2095,
  1029,
  102],
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'intent': 3,
 'tags': [0, 0, 0, 0, 3, 4, 0, 0, 13, 14, 0, 0]}

# dataloader validation check

In [9]:
import json
from collections import Counter

In [11]:
a = Counter(all_tags)
a.most_common(10)

[(0, 273734),
 (16, 13833),
 (18, 9045),
 (19, 7560),
 (4, 6781),
 (15, 6763),
 (17, 6539),
 (13, 4088),
 (14, 4088),
 (21, 3766)]

In [8]:
train_dataset = data_module.create_dataset(data_module.train_data)

all_tags = []
with (data_path / 'trainset.jsonl').open('w', encoding='utf-8') as file:

    restart_idx = 0
    for i in tqdm(range(restart_idx, len(train_dataset)), total=len(train_dataset)-restart_idx):
        item = train_dataset.__getitem__(i)
        file.write(json.dumps(item) + '\n')
        all_tags.extend(item['tags'])

  0%|          | 0/18129 [00:00<?, ?it/s]

In [42]:
valid_dataset = data_module.create_dataset(data_module.valid_data)
restart_idx = 0
for i in tqdm(range(restart_idx, len(valid_dataset)), total=len(valid_dataset)-restart_idx):
    valid_dataset.__getitem__(i)
    assert item['tags'].size(0) == 256, f"tags_size={item['tags'].size()}"
    assert isinstance(item['intent'].tolist(), int), f"intent_size={item['intent']}"

  0%|          | 0/3705 [00:00<?, ?it/s]

In [43]:
test_dataset = data_module.create_dataset(data_module.test_data)
restart_idx = 0
for i in tqdm(range(restart_idx, len(test_dataset)), total=len(test_dataset)-restart_idx):
    test_dataset.__getitem__(i)
    assert item['tags'].size(0) == 256, f"tags_size={item['tags'].size()}"
    assert isinstance(item['intent'].tolist(), int), f"intent_size={item['intent']}"

  0%|          | 0/3693 [00:00<?, ?it/s]

In [20]:
for x in data_module.train_dataloader():
    break

## [Debugging]

In [7]:
from spacy.training import biluo_to_iob, offsets_to_biluo_tags, biluo_tags_to_offsets, iob_to_biluo
from src.nlu_utils import NLUTokenizer

nlu_tokenizer = NLUTokenizer(hugg_path='bert-base-uncased', spacy_path='en_core_web_sm')

In [8]:
i = 5972 
x = train_dataset.data[i]
text = x['text']
ents = x['entities']
intent = x['intent']
print(text)
print(ents)
print(intent)

for x in ents:
    print(text[x[0]:x[1]], x[2])

11. johnny herbert ( britain ) sauber 1:56.318
[[4, 18, 'PER'], [21, 28, 'LOC'], [31, 37, 'ORG']]
None
johnny herbert PER
britain LOC
sauber ORG


In [9]:
print(tags)

['O', 'O', 'B-PER', 'I-PER', 'O', 'B-LOC', 'O', 'B-ORG', 'O']


In [10]:
spacy_tokens = nlu_tokenizer.spacy_tokenize(text)
tags = nlu_tokenizer.get_tags(text, ents)
bert_tokens = nlu_tokenizer.bert_tokenize(text)

print(spacy_tokens)
print(tags)
print(bert_tokens)

['11', '.', 'johnny', 'herbert', '(', 'britain', ')', 'sauber', '1:56.318']
['O', 'O', 'B-PER', 'I-PER', 'O', 'B-LOC', 'O', 'B-ORG', 'O']
['11', '.', 'johnny', 'herbert', '(', 'britain', ')', 'sa', '##uber', '1', ':', '56', '.', '318']


In [11]:
token_mappings = nlu_tokenizer.get_token_mappings(bert_tokens, spacy_tokens)

In [12]:
token_mappings

defaultdict(list,
            {0: [0],
             1: [1],
             2: [2],
             3: [3],
             4: [4],
             5: [5],
             6: [6],
             7: [7, 8],
             8: [9, 10, 11, 12, 13]})

In [60]:
conll = load_dataset('conll2003')
dataset = conll['train']
feature = dataset.features['ner_tags'].feature

for d in tqdm(dataset, total=len(dataset)):
    if ' '.join(d['tokens']).lower() == text:
        break

Reusing dataset conll2003 (C:\Users\simon\.cache\huggingface\datasets\conll2003\conll2003\1.0.0\40e7cb6bcc374f7c349c83acd1e9352a4f09474eb691f64f364ee62eb65d0ca6)
100%|██████████| 3/3 [00:00<00:00, 429.73it/s]


In [94]:
tags = list(map(feature.int2str, d['ner_tags']))
tags

['O', 'B-PER', 'I-PER', 'O', 'B-LOC', 'O', 'B-ORG', 'O']

In [95]:
biluo_tags_to_offsets(nlu_tokenizer.spacy_nlp(text), iob_to_biluo(tags))

[(2, 10, 'PER'), (19, 20, 'LOC'), (29, 30, 'ORG')]

In [96]:
spacy_tokens = list(map(str, nlu_tokenizer.spacy_nlp(text)))
original_tokens = d['tokens']

spacy_tokens, original_tokens

['11', '.', 'johnny', 'herbert', '(', 'britain', ')', 'sauber', '1:56.318']

In [71]:
original_tokens

['11.', 'Johnny', 'Herbert', '(', 'Britain', ')', 'Sauber', '1:56.318']

In [84]:
tags

['O', 'B-PER', 'I-PER', 'O', 'B-LOC', 'O', 'B-ORG', 'O']

In [85]:
i, j = 0, 0
token_mappings = defaultdict(list) #{spacy_idx: [bert_idx]}
spanned = ''
while i < len(original_tokens) and j < len(spacy_tokens):
    origin_tkn = original_tokens[i]
    spacy_tkn = spacy_tokens[j]
    if origin_tkn == spacy_tkn:
        token_mappings[i].append(j)
        i += 1
        j += 1
        spanned = ''
    else:
        token_mappings[i].append(j)
        j += 1
        spanned += spacy_tkn[2:] if spacy_tkn.startswith('##') else spacy_tkn
        # see whether spanned is equal to current tokens
        if len(spanned) == len(origin_tkn):
            i += 1 
            spanned = ''
token_mappings

defaultdict(list,
            {0: [0, 1],
             1: [2],
             2: [3],
             3: [4],
             4: [5],
             5: [6],
             6: [7],
             7: [8]})

In [87]:
fixed_tags = ['-'] * len(spacy_tokens)
for i, t in enumerate(tags):
    for k in token_mappings[i]:
        fixed_tags[k] = t
fixed_tags

['O', 'O', 'B-PER', 'I-PER', 'O', 'B-LOC', 'O', 'B-ORG', 'O']

In [97]:
doc = nlu_tokenizer.spacy_nlp(text)
doc

11. johnny herbert ( britain ) sauber 1:56.318

In [98]:
entities = biluo_tags_to_offsets(doc, iob_to_biluo(fixed_tags))
entities

[(4, 18, 'PER'), (21, 28, 'LOC'), (31, 37, 'ORG')]

In [100]:
text[21:28]

'britain'

In [43]:
tags = nlu_tokenizer.get_tags(text, ents, tag_type='iob')
spacy_tokens = nlu_tokenizer.spacy_tokenize(text)
bert_tokens = nlu_tokenizer.bert_tokenize(text)
print(spacy_tokens)
print(bert_tokens)
print(len(spacy_tokens), len(bert_tokens))

['shrs', 'outstanding', 'after', 'ipo', '16,668,560']
['sh', '##rs', 'outstanding', 'after', 'ip', '##o', '16', ',', '66', '##8', ',', '560']
5 12


In [101]:
conll = load_dataset('conll2003')
typ = 'train'

def add_conll_data(conll, nlu_tokenizer, data_list, typ='train', delete_errors=False):
    dataset = conll[typ]
    feature = dataset.features['ner_tags'].feature
    errors = 0
    for x in tqdm(dataset, total=len(dataset), desc=f'{typ}set'):
        text = ' '.join(x['tokens']).lower()
        doc = nlu_tokenizer.spacy_nlp(text)
        if delete_errors and len(list(doc)) != len(x['tokens']):
            errors += 1
            continue
            
        tags = list(map(feature.int2str, x['ner_tags']))
        if len(list(doc)) != len(x['tokens']):
            spacy_tokens = list(map(str, doc))
            original_tokens = x['tokens']
            tags = nlu_tokenizer.map_spanned_tokens(
                longer_tokens=spacy_tokens, shorter_token=original_tokens, tags=tags
            )
        entities = biluo_tags_to_offsets(doc, iob_to_biluo(tags))

        d = {'text': text, 'entities': entities, 'intent': 'None'}
        data_list.append(d)

Reusing dataset conll2003 (C:\Users\simon\.cache\huggingface\datasets\conll2003\conll2003\1.0.0\40e7cb6bcc374f7c349c83acd1e9352a4f09474eb691f64f364ee62eb65d0ca6)
100%|██████████| 3/3 [00:00<00:00, 376.01it/s]


In [45]:
spanned_tags

NameError: name 'spanned_tags' is not defined

In [37]:
spacy_tokens[1] == bert_tokens[1]

False

In [49]:
i, j = 0, 0
token_mappings = defaultdict(list) #{spacy_idx: [bert_idx]}
spanned = ''
while i < len(spacy_tokens) and j < len(bert_tokens):
    spacy_tkn = spacy_tokens[i]
    bert_tkn = bert_tokens[j]
    print(f'| {i}: {spacy_tkn} | {j}: {bert_tkn} |', end='')
    if spacy_tkn == bert_tkn:
        token_mappings[i].append(j)
        i += 1
        j += 1
        spanned = ''
        print()
    else:
        token_mappings[i].append(j)
        j += 1
        spanned += bert_tkn[2:] if bert_tkn.startswith('##') else bert_tkn
        print(f' {spanned}')
        # see whether spanned is equal to current tokens
        if len(spanned) == len(spacy_tkn):
            i += 1 
            spanned = ''
token_mappings

| 0: shrs | 0: sh | sh
| 0: shrs | 1: ##rs | shrs
| 1: outstanding | 2: outstanding |
| 2: after | 3: after |
| 3: ipo | 4: ip | ip
| 3: ipo | 5: ##o | ipo
| 4: 16,668,560 | 6: 16 | 16
| 4: 16,668,560 | 7: , | 16,
| 4: 16,668,560 | 8: 66 | 16,66
| 4: 16,668,560 | 9: ##8 | 16,668
| 4: 16,668,560 | 10: , | 16,668,
| 4: 16,668,560 | 11: 560 | 16,668,560


defaultdict(list,
            {0: [0, 1], 1: [2], 2: [3], 3: [4, 5], 4: [6, 7, 8, 9, 10, 11]})

In [42]:
spanned_tags = ['-'] * len(bert_tokens)
for i, t in enumerate(tags):
    for k in token_mappings[i]:
        spanned_tags[k] = t

In [43]:
spanned_tags

['O',
 'O',
 'O',
 'O',
 'B-BS.Value',
 'B-BS.Value',
 'B-BS.Value',
 'B-BS.Value',
 'I-BS.Value',
 'I-BS.Value',
 'O',
 'O',
 'B-BS.Value',
 'I-BS.Value',
 'B-APPLY',
 'O',
 'B-PERCENT',
 'I-PERCENT',
 'O',
 'O',
 'B-TIME',
 'I-TIME',
 'O']

In [54]:
from transformers import BertForTokenClassification

bert_encodes = nlu_tokenizer.bert.encode_plus(text, return_offsets_mapping=False, return_tensors='pt')
model_path = 'bert-base-uncased'
model = BertForTokenClassification.from_pretrained(model_path)
o = model(**bert_encodes)
o.logits.shape

In [62]:
get_spanned_tags(bert_offset_mapping, spacy_offset_mapping, tags)

[None, None, None, None, None, None, None, None, None, None, None, None, None]

## Modeling

In [7]:
import torch
import torch.nn as nn
import torchmetrics
import pytorch_lightning as pl
from collections import defaultdict
from transformers import BertConfig, BertForTokenClassification

class BertPooler(nn.Module):
    def __init__(self, config):
        """from https://github.com/huggingface/transformers/blob/v4.15.0/src/transformers/models/bert/modeling_bert.py#L627"""
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.activation = nn.Tanh()

    def forward(self, hidden_states):
        # We "pool" the model by simply taking the hidden state corresponding
        # to the first token.
        first_token_tensor = hidden_states[:, 0]
        pooled_output = self.dense(first_token_tensor)
        pooled_output = self.activation(pooled_output)
        return pooled_output

class NLUModel(pl.LightningModule):
    def __init__(self, **kwargs):
        super().__init__()
        self.save_hyperparameters() 
        # self.hparams: model_path, intent_size, tags_size, max_len
        self.outputs_keys = ['tags', 'intent']
        # Networks
        cfg = BertConfig()
        self.bert_ner = BertForTokenClassification.from_pretrained(self.hparams.model_path, num_labels=self.hparams.tags_size)
        self.bert_pooler = BertPooler(cfg)
        self.intent_network = nn.Linear(cfg.hidden_size, self.hparams.intent_size)
        
        # losses
        if self.hparams.stage == 'train':
            self.loss = nn.CrossEntropyLoss()
            # metrics
            self.metrics = nn.ModuleDict({
                'train_': self.create_metrics(prefix='train_'),
                'val_': self.create_metrics(prefix='val_'),
                'test_': self.create_metrics(prefix='test_')
            })
            
    def contiguous(self, x):
        return x.squeeze(-1).contiguous().type_as(x)

    def create_metrics(self, prefix='train_'):
        m = nn.ModuleDict()
        metrics = torchmetrics.MetricCollection([torchmetrics.Accuracy(), torchmetrics.F1()])
        for k in self.outputs_keys:
            m[k] = metrics.clone(prefix+k+'_')
        return m

    def _forward_bert(self, input_ids, token_type_ids, attention_mask):
        outputs = self.bert_ner.bert(
            input_ids=input_ids,
            token_type_ids=token_type_ids,
            attention_mask=attention_mask,
        )
        return outputs.last_hidden_state

    def _forward_tags(self, last_hidden_state):
        tags_outputs = self.bert_ner.dropout(last_hidden_state)
        tags_logits = self.bert_ner.classifier(tags_outputs)
        return tags_logits.view(-1, self.hparams.tags_size)

    def _forward_intent(self, pooled_outputs):
        intent_logits = self.intent_network(pooled_outputs)
        return intent_logits

    def forward(self, input_ids, token_type_ids, attention_mask):
        # tags
        last_hidden_state = self._forward_bert(input_ids, token_type_ids, attention_mask)
        tags_logits = self._forward_tags(last_hidden_state)

        # intent
        pooled_outputs = self.bert_pooler(last_hidden_state)
        intent_logits = self._forward_intent(pooled_outputs)

        return {
            'tags': tags_logits,       # (B*max_len, tags_size)
            'intent': intent_logits,   # (B, intent_size)
        }

    def forward_all(self, batch, prefix='train_'):
        outputs = self.forward(
            input_ids=batch['input_ids'], 
            token_type_ids=batch['token_type_ids'], 
            attention_mask=batch['attention_mask'], 
        )

        targets = {
            'tags': batch['tags'].view(-1),    # (B*max_len, )
            'intent': batch['intent'],         # (B, )
        }
        loss = self.cal_loss(outputs, targets)
        self.log(f'{prefix}loss', loss, 
            on_step=True, on_epoch=True, sync_dist=self.hparams.multigpu)
        # logging
        self.cal_metrics(outputs, targets, prefix=prefix)
        return loss

    def cal_loss(self, outputs, targets):
        tags_loss = self.loss(outputs['tags'], targets['tags'])
        intent_loss = self.loss(outputs['intent'], targets['intent'])
        return tags_loss + intent_loss

    def cal_metrics(self, outputs, targets, prefix='train_'):
        outputs_metrics = defaultdict()
        for k in self.outputs_keys:
            for k_sub, v in self.metrics[prefix][k](outputs[k], targets[k]).items():
                outputs_metrics[k_sub] = v
        self.log_dict(outputs_metrics)

    def training_step(self, batch, batch_idx):
        loss = self.forward_all(batch, prefix='train_')
        return loss

    def validation_step(self, batch, batch_idx):
        loss = self.forward_all(batch, prefix='val_')

    def test_step(self, batch, batch_idx):   
        loss = self.forward_all(batch, prefix='test_')

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=self.hparams.lr)
        lr_schedulers = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer)
        
        return {'optimizer': optimizer, 'lr_scheduler': lr_schedulers, 'monitor': 'val_loss'}

    def predict(self, input_ids, token_type_ids, attention_mask):
        outputs = self.forward(input_ids, token_type_ids, attention_mask)
        predicts = self._predict_from_outputs(outputs)
        return predicts

    def _predict_from_outputs(self, outputs):
        predicts = {k: outputs[k].argmax(-1) for k in ['tags', 'intent']} 
        return predicts

In [8]:
hparams = {
    'stage': settings['stage'],
    'model_path': settings['model_path'], 
    'intent_size': len(data_module.intents2id), 
    'tags_size': len(data_module.tags2id), 
    'max_len': settings['max_len'],
    'lr': settings['lr'],
    'multigpu': True if settings['n_gpus'] > 1 else False
}

model = NLUModel(**hparams)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-u

In [12]:
train_dataloader = data_module.train_dataloader()

In [11]:
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks import ModelCheckpoint, TQDMProgressBar

log_path = src_path / 'logs'
checkpoint_path = src_path / 'checkpoints'

logger = TensorBoardLogger(save_dir=str(log_path), name="NLU")
checkpoint_callback = ModelCheckpoint(
    dirpath=str(checkpoint_path), 
    save_top_k=2,
    monitor='val_loss'
)
progress_callback = TQDMProgressBar(refresh_rate=20)
trainer = pl.Trainer(
    gpus=1, 
    max_epochs=3, 
    logger=logger, 
    callbacks=[checkpoint_callback, progress_callback]
)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


In [243]:
trainer.fit(
    model, datamodule=data_module
)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name             | Type                       | Params
----------------------------------------------------------------
0 | bert_ner         | BertForTokenClassification | 108 M 
1 | bert_pooler      | BertPooler                 | 590 K 
2 | intent_network   | Linear                     | 3.1 K 
3 | relation_network | RelationNetwork            | 789 K 
----------------------------------------------------------------
110 M     Trainable params
0         Non-trainable params
110 M     Total params
441.356   Total estimated model params size (MB)


Validation sanity check:   0%|          | 0/2 [00:00<?, ?it/s]