In [104]:
import torch
from pytorch_transformers import *
from pytorch_pretrained_bert import BertTokenizer, BertForSequenceClassification,BertAdam
import sys, os
import pandas as pd
import torch.nn as nn
import torch.utils.data
import torch.nn.functional as F
from tqdm import tqdm, tqdm_notebook
import numpy as np

In [2]:
# Sentence pair for the masked-LM probe; BERT's special tokens are already
# written into the raw text.
S1 = '[CLS] the company has a fiduciary duty to its shareholders . [SEP]'
S2 = 'one of its many regulatory requirements . [SEP]'

# Positions in the tokenized "S1 S2" sequence that are overwritten with [MASK].
MI = [12, 19]

In [3]:
# Word-piece tokenizer used for the masked-LM probe.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the sentence pair as one string, then overwrite the positions
# listed in MI with the mask token.
text = ' '.join([S1, S2])
tokenized_text = tokenizer.tokenize(text)
for mask_pos in MI:
    tokenized_text[mask_pos] = '[MASK]'

print(tokenized_text)

# Vocabulary ids of the masked sequence.
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

# Segment ids: one 0 per token of S1 followed by one 1 per token of S2.
# Each sentence is tokenized on its own only to obtain its token count.
segments_ids = [
    segment
    for segment, sentence in enumerate((S1, S2))
    for _ in tokenizer.tokenize(sentence)
]

# The extra list nesting gives both tensors a leading dimension of size 1.
tokens_tensor = torch.tensor([indexed_tokens])
segments_tensors = torch.tensor([segments_ids])

100%|██████████| 231508/231508 [00:00<00:00, 1139719.65B/s]


['[CLS]', 'the', 'company', 'has', 'a', 'fi', '##du', '##cia', '##ry', 'duty', 'to', 'its', '[MASK]', '.', '[SEP]', 'one', 'of', 'its', 'many', '[MASK]', 'requirements', '.', '[SEP]']


In [4]:
# Masked-LM checkpoints to compare, keyed by a short display name.
model = {
    name: BertForMaskedLM.from_pretrained(checkpoint)
    for name, checkpoint in (
        ('GooBERT', 'GooBERT'),
        ('FinBERT', 'FinBERT-Prime_128MSL-250K'),
        ('PreBERT', 'FinBERT-Pre2K_128MSL-250K'),
        ('ComBERT', 'FinBERT-Combo_128MSL-250K'),
    )
}

In [5]:
# Run every model on the masked sequence with gradient tracking disabled and
# keep the first element of each model's output, keyed by model name.
with torch.no_grad():
    preds = {
        name: net(tokens_tensor, token_type_ids=segments_tensors)[0]
        for name, net in model.items()
    }

In [6]:
# For each model, pick the highest-scoring vocabulary id at every masked
# position, map it back to a token, and print the tokens per model.
for m, scores in preds.items():
    tokens = [
        tokenizer.convert_ids_to_tokens([torch.argmax(scores[0, i]).item()])[0]
        for i in MI
    ]
    print(f'{m} : {tokens}')

GooBERT : ['members', 'legal']
FinBERT : ['shareholders', 'regulatory']
PreBERT : ['shareholders', 'important']
ComBERT : ['shareholders', 'regulatory']


In [127]:
# Load the tab-separated training questions and preview the first rows.
fiqa_question = pd.read_csv(
    'FiQA_train_question_final.tsv',
    sep='\t',
)
fiqa_question.head(5)


Unnamed: 0.1,Unnamed: 0,qid,question,timestamp
0,0,0,What is considered a business expense on a bus...,Nov 8 '11 at 15:14
1,1,1,Claiming business expenses for a business with...,May 13 '14 at 13:17
2,2,2,Transferring money from One business checking ...,Jan 20 '16 at 20:31
3,3,3,Having a separate bank account for business/in...,Mar 1 at 0:24
4,4,4,Business Expense - Car Insurance Deductible Fo...,Mar 4 at 0:26


In [137]:
# Load the tab-separated question-id -> document-id mapping and preview it.
fiqa_map = pd.read_csv(
    'FiQA_train_question_doc_final.tsv',
    sep='\t',
)
fiqa_map.head(5)

Unnamed: 0.1,Unnamed: 0,qid,docid
0,0,0,18850
1,1,1,14255
2,2,2,308938
3,3,3,296717
4,4,3,100764


In [136]:
# Load the tab-separated training documents and preview the first rows.
fiqa_answers = pd.read_csv(
    'FiQA_train_doc_final.tsv',
    sep='\t',
)
fiqa_answers.head(5)

Unnamed: 0.1,Unnamed: 0,docid,doc,timestamp
0,0,3,I'm not saying I don't like the idea of on-the...,Oct 03 '12 at 14:56
1,1,31,So nothing preventing false ratings besides ad...,Sep 01 '17 at 13:36
2,2,56,You can never use a health FSA for individual ...,Jun 9 '14 at 17:37
3,3,59,Samsung created the LCD and other flat screen ...,Dec 27 at 01:37
4,4,63,Here are the SEC requirements: The federal sec...,Jul 14 '14 at 8:10


In [118]:
# Attach the mapped document ids to each question, keeping only the columns
# needed downstream.
aux = fiqa_question.merge(fiqa_map, on='qid')[['qid', 'question', 'timestamp', 'docid']]

# Pull in the document text for every (question, docid) pair, then reduce to
# an all-string, two-column frame whose document column is named 'context'.
train = (
    aux.merge(fiqa_answers, on='docid')[['question', 'doc']]
    .astype(str)
    .rename(columns={'doc': 'context'})
)

In [119]:
# Write the training pairs to disk as JSON in pandas' 'table' orientation.
train.to_json(
    '~/w266-final/pred/train.json',
    orient='table',
)

In [125]:
# Load the tab-separated test questions and preview the first rows.
fiqa_question_test = pd.read_csv(
    '~/w266-final/pred/FiQA_test_question_task2 .tsv',
    sep='\t',
)
fiqa_question_test.head(5)

Unnamed: 0.1,Unnamed: 0,qid,question
0,0,113,How can I calculate deductible percentage of b...
1,1,114,What to ask for on a business partnership?
2,2,115,Can I claim household services as business exp...
3,3,116,Sending more that 10K USD to UK Business Account
4,4,117,Any practical difference between “personal che...


In [135]:
# Load the tab-separated test documents and preview the first rows.
fiqa_answers_test = pd.read_csv(
    'FiQA_test_doc_task2.tsv',
    sep='\t',
)
fiqa_answers_test.head(5)

Unnamed: 0.1,Unnamed: 0,docid,doc,timestamp
0,0,3,I'm not saying I don't like the idea of on-the...,Oct 03 '12 at 14:56
1,1,31,So nothing preventing false ratings besides ad...,Sep 01 '17 at 13:36
2,2,56,You can never use a health FSA for individual ...,Jun 9 '14 at 17:37
3,3,59,Samsung created the LCD and other flat screen ...,Dec 27 at 01:37
4,4,63,Here are the SEC requirements: The federal sec...,Jul 14 '14 at 8:10


In [140]:
# Build the test frame by joining the test questions with the test documents.
# NOTE(review): no `on=` is passed, so pandas joins on every column name the
# two frames share. Going by the head() previews, the only shared names look
# like 'Unnamed: 0.1' and 'Unnamed: 0', which would pair rows by their saved
# index values rather than by any question/document relation -- confirm this
# is the intended join.
test = fiqa_question_test.merge(fiqa_answers_test)
# Keep the id and text columns, cast every column to str, and rename the
# document text column from 'doc' to 'context'.
test = test[['qid','question','docid', 'doc']]
test = test.astype(str)
test = test.rename(columns={'doc':'context'})

In [142]:
# Write the test pairs to disk as JSON in pandas' 'table' orientation.
test.to_json(
    '~/w266-final/pred/test.json',
    orient='table',
)

In [None]:
# Shell out to run_squad.py to train and predict with the flags below.
# NOTE(review): --bert_config_file points at GooBERT while --init_checkpoint
# points at $BIOBERT_DIR/biobert_model.ckpt -- confirm that this pairing is
# intended and that BIOBERT_DIR is set in the shell environment.
# NOTE(review): train.json / test.json are given as relative paths here, while
# the cells that write them use ~/w266-final/pred/ -- verify the working dir.
!python3 run_squad.py \
     --do_train=True \
     --do_predict=True \
     --vocab_file=bert.vocab \
     --bert_config_file=GooBERT/bert_config.json \
     --init_checkpoint=$BIOBERT_DIR/biobert_model.ckpt \
     --max_seq_length=128 \
     --train_batch_size=12 \
     --learning_rate=5e-6 \
     --doc_stride=128 \
     --num_train_epochs=1.0 \
     --do_lower_case=True \
     --train_file=train.json \
     --predict_file=test.json \
     --output_dir=/tmp/QA_output/

In [84]:
def convert_lines(question, answer, max_seq_length, tokenizer, show_progress=True):
    """Encode (question, answer) pairs as fixed-length BERT input rows.

    Every pair is laid out as ``[CLS] question [SEP] answer [SEP]`` and then
    right-padded with id 0 so that each row is exactly ``max_seq_length`` long.
    When a pair does not fit, the answer is truncated first; a question that
    alone exceeds the available room is truncated as well.

    Parameters
    ----------
    question, answer : iterable of str
        Texts paired positionally; iteration stops at the shorter one.
    max_seq_length : int
        Length of every output row, special tokens included. Must be >= 3.
    tokenizer : object
        Must provide ``tokenize(text) -> list`` and
        ``convert_tokens_to_ids(tokens) -> list of int``.
    show_progress : bool, default True
        Wrap the loop in a single ``tqdm_notebook`` progress bar.

    Returns
    -------
    tokens_ids : np.ndarray
        One row of vocabulary ids per pair, 0 in the padded positions.
    sequence_ids : np.ndarray
        Matching segment ids: 0 for ``[CLS] question [SEP]``, 1 for
        ``answer [SEP]``, 0 for padding.

    Raises
    ------
    ValueError
        If ``max_seq_length`` leaves no room for the three special tokens.
    """
    # Room left for real tokens once [CLS] and the two [SEP] are accounted for.
    budget = max_seq_length - 3
    if budget < 0:
        raise ValueError('max_seq_length must be at least 3')

    pairs = zip(question, answer)
    if show_progress:
        try:
            total = min(len(question), len(answer))
        except TypeError:
            # Inputs without a length (e.g. generators): bar without a total.
            total = None
        # One bar for the paired loop instead of one per input.
        pairs = tqdm_notebook(pairs, total=total)

    all_tokens = []
    sequences = []
    for q, a in pairs:
        # Clamp the question first so the remaining room can never go negative.
        tokens_q = tokenizer.tokenize(q)[:budget]
        tokens_a = tokenizer.tokenize(a)[:budget - len(tokens_q)]
        n_pad = budget - len(tokens_q) - len(tokens_a)

        # Only real tokens go through the vocabulary lookup. Padding is
        # appended afterwards as literal 0 ids, so it is never looked up as
        # if it were a token.
        ids = tokenizer.convert_tokens_to_ids(
            ['[CLS]'] + tokens_q + ['[SEP]'] + tokens_a + ['[SEP]']
        )
        all_tokens.append(list(ids) + [0] * n_pad)
        sequences.append(
            [0] * (2 + len(tokens_q)) + [1] * (1 + len(tokens_a)) + [0] * n_pad
        )

    sequence_ids = np.array(sequences)
    tokens_ids = np.array(all_tokens)
    return tokens_ids, sequence_ids

In [85]:
%%time
# Rebind `tokenizer` to the uncased BERT tokenizer with lower-casing enabled.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True,
)

CPU times: user 49.1 ms, sys: 7 µs, total: 49.2 ms
Wall time: 341 ms


In [86]:
%%time
# Encode every (question, context) pair as rows of length 128.
# The document text is read from the 'context' column: `train` was built with
# 'doc' renamed to 'context', so train['doc'] raises KeyError when the
# notebook is run top to bottom.
tokens, sequences = convert_lines(train['question'], train['context'], 128, tokenizer)

HBox(children=(IntProgress(value=0, max=17110), HTML(value='')))

HBox(children=(IntProgress(value=0, max=17110), HTML(value='')))

CPU times: user 1min 21s, sys: 518 ms, total: 1min 22s
Wall time: 1min 21s


In [76]:
%%time
# Question-answering checkpoints to compare, keyed by a short display name.
qamodel = {
    name: BertForQuestionAnswering.from_pretrained(checkpoint)
    for name, checkpoint in (
        ('GooBERT', 'GooBERT'),
        ('FinBERT', 'FinBERT-Prime_128MSL-500K_512MSL-200K'),
        ('PreBERT', 'FinBERT-Pre2K_128MSL-250K'),
        ('ComBERT', 'FinBERT-Combo_128MSL-500K_512MSL-100K'),
    )
}

CPU times: user 29.3 s, sys: 766 ms, total: 30.1 s
Wall time: 20.3 s


In [101]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Pack the encoded inputs into a dataset of (token ids, segment ids) pairs.
train_dataset = torch.utils.data.TensorDataset(
    torch.tensor(tokens, dtype=torch.long),
    torch.tensor(sequences, dtype=torch.long),
)
# NOTE: this rebinds `train` from the DataFrame built earlier to the tensor
# dataset; the training loop reads `train`, so the name is kept.
train = train_dataset
output_model_file = 'qa_finbert.bin'
lr = 2e-5
batch_size = 32
seed = 300686
np.random.seed(seed)
torch.manual_seed(seed)

# Move the model to `device` before building the optimizer. The training loop
# sends every batch to `device`, so a model left on the CPU would fail as soon
# as CUDA is available. `.to()` moves the module in place and returns it, so
# qamodel['GooBERT'] and `model` remain the same object.
model = qamodel['GooBERT'].to(device)
param_optimizer = list(model.named_parameters())

# Parameters whose names contain any of these substrings get no weight decay.
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0}
]

# Preparing the optimizer
# The leading factor 1 is presumably the number of epochs -- TODO confirm.
num_train_optimization_steps = int(1*len(train)/batch_size)
optimizer = BertAdam(optimizer_grouped_parameters,
                     lr=lr,
                     warmup=0.05,
                     t_total=num_train_optimization_steps)

In [102]:
# Put the model in training mode. The dangling `optimizer.` that followed was
# an incomplete expression (a syntax error) and has been removed.
model.train()

BertForQuestionAnswering(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): BertLayerNorm()
      (dropout): Dropout(p=0.1)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): BertLayerNorm()
              (dropout): Dropout(p=0.1)
            )
          )
          (intermediate): B

In [107]:
%%time
# Single-epoch training loop with running loss/accuracy shown on progress bars.
# NOTE(review): as recorded, this cell fails with
#   AttributeError: 'tuple' object has no attribute 'size'
# presumably because model(...) returns a tuple that is passed straight to the
# loss function -- confirm against the model's return type.
# NOTE(review): `amp` and `accumulation_steps` are not defined in any visible
# cell -- confirm where they are supposed to come from.
# NOTE(review): y_batch is the second tensor of `train`, which was built from
# `sequences` (the segment ids), not from labels -- verify the intended target.
tq = tqdm_notebook(range(1))
for epoch in tq:
    # Fresh shuffled loader and fresh running statistics for every epoch.
    train_loader = torch.utils.data.DataLoader(train, batch_size = batch_size, shuffle = True)
    avg_loss = 0.
    avg_accuracy = 0.
    lossf = None
    tk0 = tqdm_notebook(enumerate(train_loader), total = len(train_loader), leave = False)
    optimizer.zero_grad()
    for i, (x_batch, y_batch) in tk0:
        # Positions with a non-zero token id are attended to.
        y_pred = model(x_batch.to(device), attention_mask = (x_batch>0).to(device))
        loss = F.binary_cross_entropy_with_logits(y_pred, y_batch.to(device))
        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()
        # Step the optimizer only every `accumulation_steps` batches.
        if (i+1) % accumulation_steps == 0: 
            optimizer.step()
            optimizer.zero_grad()
        # Smoothed loss for the progress bar: 0.98 * previous + 0.02 * current.
        if lossf:
            lossf = 0.98*lossf + 0.02*loss.item()
        else:
            lossf = loss.item()
        tk0.set_postfix(loss = lossf)
        # Accumulate per-batch means so the totals end up as epoch averages.
        avg_loss += loss.item() / len(train_loader)
        # Accuracy compares sigmoid(y_pred[:, 0]) > 0.5 with y_batch[:, 0] > 0.5.
        avg_accuracy += torch.mean(((torch.sigmoid(y_pred[:, 0]) > 0.5) == (y_batch[:, 0] > 0.5).to(device)).to(torch.float)).item() / len(train_loader)
    tq.set_postfix(avg_loss = avg_loss, avg_accuracy = avg_accuracy)

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=535), HTML(value='')))

AttributeError: 'tuple' object has no attribute 'size'

In [108]:
# Number of rows in the encoded `tokens` array.
len(tokens)

17110

In [110]:
# Scratch arithmetic. NOTE(review): presumably a rough run-time estimate
# (17110 equals the len(tokens) value shown in the cell above); the meaning
# of 63 and 60 is not stated anywhere visible -- confirm or delete this cell.
17110/63/60

4.526455026455026

In [None]:
!