In [46]:
import torch
from pytorch_transformers import *
import sys, os
import pandas as pd
import torch.nn as nn
import torch.utils.data
import torch.nn.functional as F
from tqdm import tqdm, tqdm_notebook
import numpy as np

In [2]:
# Sentence pair for the masked-word demo; BERT's [CLS]/[SEP] markers are written
# directly into the strings, so no extra special tokens need to be added later.
S1 = '[CLS] the company has a fiduciary duty to its shareholders . [SEP]'
S2 = 'one of its many regulatory requirements . [SEP]'

# Positions in the tokenized "S1 S2" sequence that are overwritten with [MASK]
# (12 is 'shareholders', 19 is 'regulatory' in the token list printed below).
MI = [12, 19]

In [3]:
# Load the uncased WordPiece tokenizer and tokenize the sentence pair as one string.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
text = f'{S1} {S2}'
tokenized_text = tokenizer.tokenize(text)

# Hide the target words, then show exactly what the models will be fed.
for mask_pos in MI:
    tokenized_text[mask_pos] = '[MASK]'
print(tokenized_text)

indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

# Segment ids: 0 for every token of the first sentence, 1 for every token of the second.
# Each sentence is tokenized on its own purely to obtain its token count.
first_len = len(tokenizer.tokenize(S1))
second_len = len(tokenizer.tokenize(S2))
segments_ids = [0] * first_len + [1] * second_len

# The extra list nesting gives both tensors a leading dimension of size 1.
tokens_tensor = torch.tensor([indexed_tokens])
segments_tensors = torch.tensor([segments_ids])

100%|██████████| 231508/231508 [00:00<00:00, 1139719.65B/s]


['[CLS]', 'the', 'company', 'has', 'a', 'fi', '##du', '##cia', '##ry', 'duty', 'to', 'its', '[MASK]', '.', '[SEP]', 'one', 'of', 'its', 'many', '[MASK]', 'requirements', '.', '[SEP]']


In [4]:
# Display label -> checkpoint identifier handed to `from_pretrained`,
# listed in the order the models should be loaded and later reported.
checkpoints = {
    'GooBERT': 'GooBERT',
    'FinBERT': 'FinBERT-Prime_128MSL-250K',
    'PreBERT': 'FinBERT-Pre2K_128MSL-250K',
    'ComBERT': 'FinBERT-Combo_128MSL-250K',
}

# One masked-LM model per label, keyed by the short label.
model = {}
for label, checkpoint in checkpoints.items():
    model[label] = BertForMaskedLM.from_pretrained(checkpoint)

In [5]:
# Run every loaded model once on the masked sentence pair and keep element [0]
# of its output (the per-position scores that the next cell arg-maxes over).
preds = {}
with torch.no_grad():  # inference only: no gradients are needed
    for label, net in model.items():
        preds[label] = net(tokens_tensor, token_type_ids=segments_tensors)[0]

In [6]:
# For each model, pick the highest-scoring vocabulary id at every masked position,
# map it back to its token, and print all guesses for that model on one line.
for m, scores in preds.items():
    best_ids = [torch.argmax(scores[0, pos]).item() for pos in MI]
    tokens = [tokenizer.convert_ids_to_tokens([best_id])[0] for best_id in best_ids]
    print(f'{m} : {tokens}')

GooBERT : ['members', 'legal']
FinBERT : ['shareholders', 'regulatory']
PreBERT : ['shareholders', 'important']
ComBERT : ['shareholders', 'regulatory']


In [29]:
# Load the FiQA training questions (tab-separated) and display the record at position 1.
# A bare `fiqa_question.head()` used to sit between these two lines; a notebook cell only
# renders its final expression, so that call was computed and discarded. It has been removed.
fiqa_question = pd.read_csv('FiQA_train_question_final.tsv', sep='\t')
fiqa_question.iloc[1]

Unnamed: 0                                                    1
qid                                                           1
question      Claiming business expenses for a business with...
timestamp                                   May 13 '14 at 13:17
Name: 1, dtype: object

In [8]:
# Question-to-document mapping: each row links one `qid` to one `docid`
# (a qid can appear on several rows, as the preview below shows for qid 3).
question_doc_path = 'FiQA_train_question_doc_final.tsv'
fiqa_map = pd.read_csv(question_doc_path, sep='\t')
fiqa_map.head()

Unnamed: 0.1,Unnamed: 0,qid,docid
0,0,0,18850
1,1,1,14255
2,2,2,308938
3,3,3,296717
4,4,3,100764


In [9]:
# Answer documents: `docid`, the answer text in `doc`, and a `timestamp` column.
answers_path = 'FiQA_train_doc_final.tsv'
fiqa_answers = pd.read_csv(answers_path, sep='\t')
fiqa_answers.head()

Unnamed: 0.1,Unnamed: 0,docid,doc,timestamp
0,0,3,I'm not saying I don't like the idea of on-the...,Oct 03 '12 at 14:56
1,1,31,So nothing preventing false ratings besides ad...,Sep 01 '17 at 13:36
2,2,56,You can never use a health FSA for individual ...,Jun 9 '14 at 17:37
3,3,59,Samsung created the LCD and other flat screen ...,Dec 27 at 01:37
4,4,63,Here are the SEC requirements: The federal sec...,Jul 14 '14 at 8:10


In [55]:
# Step 1: attach every linked docid to its question, dropping the leftover index columns.
question_docs = fiqa_question.merge(fiqa_map, on='qid')
aux = question_docs[['qid', 'question', 'timestamp', 'docid']]

# Step 2: bring in the answer text for each docid and keep only the
# (question, doc) pair, with both columns cast to plain strings.
train = aux.merge(fiqa_answers, on='docid').loc[:, ['question', 'doc']].astype(str)

In [35]:
def _truncate_pair(tokens_q, tokens_a, budget):
    """Trim the longer of two token lists from its tail until both fit in `budget` tokens."""
    tokens_q, tokens_a = list(tokens_q), list(tokens_a)
    while len(tokens_q) + len(tokens_a) > budget:
        # Always shorten whichever side is currently longer (ties shorten the answer).
        victim = tokens_q if len(tokens_q) > len(tokens_a) else tokens_a
        victim.pop()
    return tokens_q, tokens_a


def convert_lines(question, answer, max_seq_length, tokenizer):
    """Encode (question, answer) text pairs as fixed-length rows of BERT token ids.

    Every pair is laid out as ``[CLS] question [SEP] answer [SEP]`` and then
    right-padded with id 0 so that all rows have exactly ``max_seq_length``
    entries (0 is the ``padding_idx`` of the BERT word embedding). Pairs that
    do not fit are shortened by repeatedly dropping the last token of the
    longer side.

    Args:
        question: iterable of question strings.
        answer: iterable of answer strings, aligned item-by-item with `question`.
        max_seq_length: total row length, special tokens and padding included.
        tokenizer: object providing ``tokenize(str) -> list[str]`` and
            ``convert_tokens_to_ids(list[str]) -> list[int]``.

    Returns:
        ``np.ndarray`` of int64 with shape ``(n_pairs, max_seq_length)``.
        The number of pairs that had to be truncated is printed as a side effect.

    Raises:
        ValueError: if `max_seq_length` is too small to hold the three special tokens.
    """
    budget = max_seq_length - 3  # room left after [CLS] and the two [SEP] markers
    if budget < 0:
        raise ValueError('max_seq_length must be at least 3 to hold [CLS] and two [SEP] tokens')

    # Materialise the pairs so the progress bar knows how many items to expect.
    pairs = list(zip(question, answer))

    all_tokens = []
    longer = 0
    for q, a in tqdm_notebook(pairs):
        tokens_q = tokenizer.tokenize(q)
        tokens_a = tokenizer.tokenize(a)
        if len(tokens_q) + len(tokens_a) > budget:
            tokens_q, tokens_a = _truncate_pair(tokens_q, tokens_a, budget)
            longer += 1
        ids = tokenizer.convert_tokens_to_ids(['[CLS]'] + tokens_q + ['[SEP]'] + tokens_a + ['[SEP]'])
        padding = [0] * (max_seq_length - len(ids))
        all_tokens.append(list(ids) + padding)
    print(longer)
    # reshape keeps the result two-dimensional even when there are no pairs at all.
    return np.array(all_tokens, dtype=np.int64).reshape(-1, max_seq_length)

In [47]:
%%time
# Re-create the uncased tokenizer used for encoding the training pairs.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case = True)
# convert_lines(question, answer, max_seq_length, tokenizer) takes the two text columns
# separately. Passing the whole DataFrame would leave one argument missing, and iterating
# a DataFrame yields its column labels rather than its rows.
sequences = convert_lines(train['question'], train['doc'], 128, tokenizer)

HBox(children=(IntProgress(value=0, max=17110), HTML(value='')))


11959
CPU times: user 1min 19s, sys: 428 ms, total: 1min 19s
Wall time: 1min 21s


In [52]:
# Load the question-answering head on top of the combined FinBERT checkpoint and
# switch it to evaluation mode; `eval()` returns the module itself, so the
# assignment still holds the model.
qa_checkpoint = 'FinBERT-Combo_128MSL-500K_512MSL-100K'
questionAnswering_model = BertForQuestionAnswering.from_pretrained(qa_checkpoint).eval()

# Last expression of the cell: renders the module structure.
questionAnswering_model

BertForQuestionAnswering(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): BertLayerNorm()
      (dropout): Dropout(p=0.1)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): BertLayerNorm()
              (dropout): Dropout(p=0.1)
            )
          )
          (intermediate): B

In [53]:
# --- Data: wrap the encoded sequences in a dataset of long tensors ---
sequence_tensor = torch.tensor(sequences, dtype=torch.long)
train_dataset = torch.utils.data.TensorDataset(sequence_tensor)
# NOTE(review): this rebinds `train`, which the merge cell earlier bound to the
# question/answer DataFrame; re-running earlier cells after this one will see the dataset.
train = train_dataset

# --- Hyper-parameters ---
output_model_file = 'qa_finbert.bin'
lr = 2e-5
batch_size = 32
seed = 300686

# Seed both NumPy and PyTorch from the same value.
np.random.seed(seed)
torch.manual_seed(seed)

# --- Parameter groups: biases and LayerNorm parameters are exempt from weight decay ---
param_optimizer = list(questionAnswering_model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
decayed_params = []
exempt_params = []
for param_name, param in param_optimizer:
    is_exempt = any(marker in param_name for marker in no_decay)
    if is_exempt:
        exempt_params.append(param)
    else:
        decayed_params.append(param)
optimizer_grouped_parameters = [
    {'params': decayed_params, 'weight_decay': 0.01},
    {'params': exempt_params, 'weight_decay': 0.0},
]

# --- Optimizer ---
# Step count for a single pass over the dataset (the leading 1 is the epoch count).
num_train_optimization_steps = int(1 * len(train) / batch_size)
# NOTE(review): `BertAdam` is not among the explicit imports visible in this notebook;
# it has to come from the star-import or from another cell — confirm it is in scope.
optimizer = BertAdam(
    optimizer_grouped_parameters,
    lr=lr,
    warmup=0.05,
    t_total=num_train_optimization_steps,
)

array([[  101,  2054,  2003, ...,  2030,  2717,   102],
       [  101,  6815,  2449, ..., 11372,  2000,   102],
       [  101, 14391,  2769, ...,  2000,  2113,   102],
       ...,
       [  101,  2064,  3007, ...,  1006,  1002,   102],
       [  101,  2064,  3007, ...,     0,     0,     0],
       [  101,  4855,  1037, ...,     0,     0,     0]])