In [7]:
import torch
from pytorch_transformers import BertTokenizer, BertForMaskedLM

In [8]:
# Sentence pair fed to BERT, already wrapped in its special tokens.
S1 = '[CLS] the company has a fiduciary duty to its shareholders . [SEP]'
S2 = 'one of its many regulatory requirements . [SEP]'

# Word-piece positions (in the tokenised S1 + S2 sequence) to hide behind [MASK].
# They depend on how the tokenizer splits the text, so re-check them whenever
# S1 or S2 changes.
MI = [
    2,   # "company"
    12,  # "shareholders"
    19,  # "regulatory"
    20,  # "requirements"
]

In [9]:
# Build the masked model inputs: word-piece ids plus segment (sentence A/B) ids.
tokenizer      = BertTokenizer.from_pretrained('bert-base-uncased')
text           = f'{S1} {S2}'  # joined form, kept for reference / backward compatibility

# Tokenise each sentence exactly once. The token list and the segment ids are
# both derived from these two lists, so their lengths always agree (previously
# the segment lengths came from separate re-tokenisation of S1 and S2).
# NOTE(review): assumes tokenising S1 and S2 separately yields the same word
# pieces as tokenising the space-joined `text` -- confirm if inputs change.
tokens_s1      = tokenizer.tokenize(S1)
tokens_s2      = tokenizer.tokenize(S2)
tokenized_text = tokens_s1 + tokens_s2

# Segment ids: 0 for every S1 token, 1 for every S2 token.
segments_ids   = [0] * len(tokens_s1) + [1] * len(tokens_s2)

# Hide the target positions; the models are asked to predict these.
for i in MI:
    tokenized_text[i] = '[MASK]'

print(tokenized_text)

indexed_tokens   = tokenizer.convert_tokens_to_ids(tokenized_text)

# Wrap each list in an outer list so the tensors carry a leading batch dimension of 1.
tokens_tensor    = torch.tensor([indexed_tokens])
segments_tensors = torch.tensor([segments_ids])

['[CLS]', 'the', '[MASK]', 'has', 'a', 'fi', '##du', '##cia', '##ry', 'duty', 'to', 'its', '[MASK]', '.', '[SEP]', 'one', 'of', 'its', 'many', '[MASK]', '[MASK]', '.', '[SEP]']


In [10]:
# Display label -> checkpoint identifier handed to `from_pretrained`.
checkpoints = {
    'GooBERT': 'GooBERT',
    'FinBERT': 'FinBERT-Prime_128MSL-250K',
    'PreBERT': 'FinBERT-Pre2K_128MSL-250K',
    'ComBERT': 'FinBERT-Combo_128MSL-250K',
}

# Load every checkpoint in the order listed; `model` maps label -> loaded network.
model = {}
for label, checkpoint in checkpoints.items():
    model[label] = BertForMaskedLM.from_pretrained(checkpoint)

In [19]:
# Register one more checkpoint, keyed by its own checkpoint name.
# NOTE(review): this cell's execution count (19) is later than the cells that
# follow it, and the recorded prediction output lists only four models --
# re-run the notebook top to bottom to refresh the results.
checkpoint_name = 'FinBERT-Prime_128MSL-500K_512MSL-100K'
model[checkpoint_name] = BertForMaskedLM.from_pretrained(checkpoint_name)

In [11]:
# Feed the same masked input to every loaded model with gradient tracking
# disabled. Only element [0] of each model's output is kept; the display cell
# indexes it as [batch, position] and arg-maxes it into a vocabulary id.
preds = {}
with torch.no_grad():
    for name, net in model.items():
        preds[name] = net(tokens_tensor, token_type_ids=segments_tensors)[0]

In [12]:
# Print one row per model: its top-scoring token (and vocabulary id) for each
# masked position, in the order given by MI.
d = ' | '
for m, scores in preds.items():
    # Highest-scoring vocabulary id at every masked position (batch item 0).
    top_ids = [torch.argmax(scores[0, i]).item() for i in MI]
    # Convert all ids to word pieces in a single call.
    top_tokens = tokenizer.convert_ids_to_tokens(top_ids)
    tokens = [f'{tok:<12} [{idx}]' for tok, idx in zip(top_tokens, top_ids)]

    print(f'{m :<37} : {d.join(tokens)}')

GooBERT                               : state        [2110] | members      [2372] | important    [2590] | duties       [5704]
FinBERT                               : company      [2194] | shareholders [15337] | stock        [4518] | ##holders    [17794]
PreBERT                               : bank         [2924] | directors    [5501] | are          [2024] | ##rs         [2869]
ComBERT                               : company      [2194] | shareholders [15337] | directors    [5501] | is           [2003]


In [15]:
# Ad-hoc lookup: which word piece does vocabulary id 2015 correspond to?
lookup_id = 2015
(lookup_token,) = tokenizer.convert_ids_to_tokens([lookup_id])
lookup_token

'##s'

# Take Aways
- FinBERT shows a statistically significant improvement in the contextual understanding of financial statements.
- Financial terms such as "fiduciary" are missing from the vocabulary (it is split into the word pieces `fi ##du ##cia ##ry`). In hindsight, we should have generated the vocabulary for FinBERT from the SEC dataset.
- The impact of regulatory legislation introduced after the financial crisis shows up in the Prime and Combo results.

- Combo: the Pre2K learning rate (LR) was set low on purpose.