# FinBERT QA

### Data preprocessing
1. Loads and cleans the raw data
2. Prepares the data for the retriever
3. Pre-processes and tokenizes the cleaned data
4. Creates vocabulary from the corpus

In [None]:
!git clone https://github.com/sparsh9012/FinBERT-QA
%cd FinBERT-QA
from src.utils import *
from src.process_data import *

In [2]:
# Document dataset: load the raw FiQA documents into a DataFrame and preview it
raw_docs_path = "data/raw/FiQA_train_doc_final.tsv"
collection = load_answers_to_df(raw_docs_path)
collection.head(n=5)

Unnamed: 0,docid,doc
0,3,I'm not saying I don't like the idea of on-the...
1,31,So nothing preventing false ratings besides ad...
2,56,You can never use a health FSA for individual ...
3,59,Samsung created the LCD and other flat screen ...
4,63,Here are the SEC requirements: The federal sec...


In [3]:
# Question dataset: load the raw FiQA questions into a DataFrame and preview it
raw_questions_path = "data/raw/FiQA_train_question_final.tsv"
queries = load_questions_to_df(raw_questions_path)
queries.head(n=5)

Unnamed: 0,qid,question
0,0,What is considered a business expense on a bus...
1,1,Claiming business expenses for a business with...
2,2,Transferring money from One business checking ...
3,3,Having a separate bank account for business/in...
4,4,Business Expense - Car Insurance Deductible Fo...


In [5]:
# Question-to-document mapping: one row per (qid, docid) pair
raw_qid_docid_path = "data/raw/FiQA_train_question_doc_final.tsv"
qid_docid = load_qid_docid_to_df(raw_qid_docid_path)
# Dictionary form of the same pairs, built by the project helper
qid_rel = label_to_dict(qid_docid)
qid_docid.head(n=5)

Unnamed: 0,qid,docid
0,0,18850
1,1,14255
2,2,308938
3,3,296717
4,3,100764


In [None]:
# Cleaning data
# get_empty_docs returns two values: the first is used below as the docids to
# filter out of the QA pairs, the second as the index labels to drop from
# `collection`.
empty_docs, empty_id = get_empty_docs(collection)
# Remove empty answers from collection of answers
collection_cleaned = collection.drop(empty_id)
# Remove empty answers from qa pairs
qid_docid = qid_docid[~qid_docid['docid'].isin(empty_docs)]
# Persist the empty docids: the FinBERT-QA data-preparation cells reload them
# from this exact path, and nothing else in the notebook writes the file.
save_pickle("data/id_to_text/empty_docs.pickle", empty_docs)

In [None]:
# Paths are named once because the TSV written first is the input of the JSON step
collection_tsv_path = "retriever/collection_cleaned.tsv"
collection_json_path = "retriever/collection_json/docs.json"

# Write collection df to file
save_tsv(collection_tsv_path, collection_cleaned)

# Convert the TSV just written to a JSON file for the document indexer
collection_to_json(collection_json_path, collection_tsv_path)

In [9]:
# Pre-process and tokenize the questions
# (the preview below shows the added q_processed, tokenized_q and q_len columns)
processed_questions = process_questions(queries)
processed_questions.head(n=5)

Unnamed: 0,qid,question,q_processed,tokenized_q,q_len
0,0,What is considered a business expense on a bus...,what is considered a business expense on a bus...,"[what, is, considered, a, business, expense, o...",10
1,1,Claiming business expenses for a business with...,claiming business expenses for a business with...,"[claiming, business, expenses, for, a, busines...",9
2,2,Transferring money from One business checking ...,transferring money from one business checking ...,"[transferring, money, from, one, business, che...",10
3,3,Having a separate bank account for business/in...,having a separate bank account for business in...,"[having, a, separate, bank, account, for, busi...",13
4,4,Business Expense - Car Insurance Deductible Fo...,business expense car insurance deductible fo...,"[business, expense, car, insurance, deductible...",13


In [10]:
# Pre-process and tokenize the cleaned answers
# (the preview below shows the added doc_processed, tokenized_ans and ans_len columns)
processed_answers = process_answers(collection_cleaned)
processed_answers.head(n=5)

Unnamed: 0,docid,doc,doc_processed,tokenized_ans,ans_len
0,3,I'm not saying I don't like the idea of on-the...,im not saying i dont like the idea of on the j...,"[im, not, saying, i, dont, like, the, idea, of...",76
1,31,So nothing preventing false ratings besides ad...,so nothing preventing false ratings besides ad...,"[so, nothing, preventing, false, ratings, besi...",78
2,56,You can never use a health FSA for individual ...,you can never use a health fsa for individual ...,"[you, can, never, use, a, health, fsa, for, in...",74
3,59,Samsung created the LCD and other flat screen ...,samsung created the lcd and other flat screen ...,"[samsung, created, the, lcd, and, other, flat,...",54
4,63,Here are the SEC requirements: The federal sec...,here are the sec requirements the federal sec...,"[here, are, the, sec, requirements, the, feder...",222


In [11]:
# Summary statistics over the length columns of the processed frames
avg_ans_count, avg_q_count = (
    processed_answers['ans_len'].mean(),
    processed_questions['q_len'].mean(),
)

# Rows whose answer length exceeds 512
long_answers = processed_answers[processed_answers['ans_len'] > 512]

# (template, value) pairs, printed in the same order as the report lines
report_lines = [
    ("Average answer length: {}", round(avg_ans_count)),
    ("Average question length: {}", round(avg_q_count)),
    ("Total answers: {}", len(processed_answers)),
    ("Number of answers with length greater than 512: {}", len(long_answers)),
]
for template, value in report_lines:
    print(template.format(value))

Average answer length: 136
Average question length: 11
Total answers: 57600
Number of answers with length greater than 512: 1233


In [12]:
# Create vocabulary from the processed answers and questions
top_k = 35  # how many of the most frequent words to print
word2index, word2count = create_vocab(processed_answers, processed_questions)
print("Vocab size: {}".format(len(word2index)))
print("Top {} common words: {}".format(top_k, Counter(word2count).most_common(top_k)))

# id -> raw text (built from the uncleaned collection) and id -> tokenized text
qid_to_text, docid_to_text = id_to_text(collection, queries)
qid_to_tokenized_text, docid_to_tokenized_text = id_to_tokenized_text(processed_answers, processed_questions)

# Save objects to pickle, in this order: vocabulary, raw-text maps, tokenized-text maps
pickle_targets = [
    ("data/qa_lstm_tokenizer/word2index.pickle", word2index),
    ("data/qa_lstm_tokenizer/word2count.pickle", word2count),
    ("data/id_to_text/qid_to_text.pickle", qid_to_text),
    ("data/id_to_text/docid_to_text.pickle", docid_to_text),
    ("data/qa_lstm_tokenizer/qid_to_tokenized_text.pickle", qid_to_tokenized_text),
    ("data/qa_lstm_tokenizer/docid_to_tokenized_text.pickle", docid_to_tokenized_text),
]
for pickle_path, pickled_object in pickle_targets:
    save_pickle(pickle_path, pickled_object)

Vocab size: 85034
Top 35 common words: [('the', 371203), ('to', 233559), ('a', 201620), ('you', 166702), ('and', 163066), ('of', 157574), ('is', 129894), ('in', 120019), ('that', 111416), ('for', 89366), ('it', 83822), ('i', 74100), ('your', 68153), ('are', 67255), ('if', 60689), ('be', 59266), ('on', 58382), ('have', 55754), ('as', 50088), ('this', 49868), ('not', 49227), ('or', 46080), ('with', 45894), ('they', 44485), ('but', 41690), ('can', 38863), ('will', 36865), ('at', 35548), ('an', 31392), ('money', 31003), ('so', 29980), ('$', 29096), ('would', 28750), ('from', 28582), ('more', 27378)]


## **FinBERT-QA**

In [2]:
!git clone https://github.com/sparsh9012/FinBERT-QA
%cd FinBERT-QA
from src.utils import *

Cloning into 'FinBERT-QA'...
remote: Enumerating objects: 153, done.[K
remote: Counting objects:   0% (1/153)[Kremote: Counting objects:   1% (2/153)[Kremote: Counting objects:   2% (4/153)[Kremote: Counting objects:   3% (5/153)[Kremote: Counting objects:   4% (7/153)[Kremote: Counting objects:   5% (8/153)[Kremote: Counting objects:   6% (10/153)[Kremote: Counting objects:   7% (11/153)[Kremote: Counting objects:   8% (13/153)[Kremote: Counting objects:   9% (14/153)[Kremote: Counting objects:  10% (16/153)[Kremote: Counting objects:  11% (17/153)[Kremote: Counting objects:  12% (19/153)[Kremote: Counting objects:  13% (20/153)[Kremote: Counting objects:  14% (22/153)[Kremote: Counting objects:  15% (23/153)[Kremote: Counting objects:  16% (25/153)[Kremote: Counting objects:  17% (27/153)[Kremote: Counting objects:  18% (28/153)[Kremote: Counting objects:  19% (30/153)[Kremote: Counting objects:  20% (31/153)[Kremote: Counting objects:  21% 

In [None]:
!pip install transformers

In [None]:
import torch
import pickle
import csv
import regex as re
import pandas as pd

In [4]:
# Use the GPU when torch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print('Using device:', device)

# Reproducibility: request deterministic cuDNN algorithms and fix the torch RNG seed.
torch.backends.cudnn.deterministic = True
torch.manual_seed(1234)

Using device: cuda


<torch._C.Generator at 0x7f718636cbf0>

In [None]:
# Collection of answers - docid, text
# The TSV is read without a header row, so positional columns 0 and 1 are renamed.
collection = (
    pd.read_csv("retriever/collection_cleaned.tsv", sep="\t", header=None)
    .rename(columns={0: 'docid', 1: 'doc'})
)

# Questions - qid, text (only these two columns are kept in `queries`)
query_df = pd.read_csv("data/raw/FiQA_train_question_final.tsv", sep="\t")
queries = query_df.loc[:, ['qid', 'question']]

# List of empty docs
empty_docs = load_pickle('data/id_to_text/empty_docs.pickle')

# id -> raw text lookups for questions and documents
qid_to_text = load_pickle('data/id_to_text/qid_to_text.pickle')
docid_to_text = load_pickle('data/id_to_text/docid_to_text.pickle')

In [None]:
# Load the (qid, docid) pairs, drop pairs whose document is in `empty_docs`,
# then attach the raw question and answer text by id lookup.
# The chain keeps the original row labels of the TSV.
dataset = (
    pd.read_csv("data/raw/FiQA_train_question_doc_final.tsv", sep="\t")
    .loc[:, ["qid", "docid"]]
    .loc[lambda pairs: ~pairs['docid'].isin(empty_docs)]
    .assign(
        question=lambda pairs: pairs['qid'].apply(lambda qid: qid_to_text[qid]),
        answer=lambda pairs: pairs['docid'].apply(lambda docid: docid_to_text[docid]),
    )
)

In [8]:
# Preview the first five question/answer pairs
dataset.head(n=5)

Unnamed: 0,qid,docid,question,answer
0,0,18850,What is considered a business expense on a bus...,The IRS Guidance pertaining to the subject. I...
1,1,14255,Claiming business expenses for a business with...,Yes you can claim your business deductions if ...
2,2,308938,Transferring money from One business checking ...,You should have separate files for each of the...
3,3,296717,Having a separate bank account for business/in...,Having a separate checking account for the bus...
4,3,100764,Having a separate bank account for business/in...,"You don't specify which country you are in, so..."


In [None]:
def add_ques_token(string):
    """Return ``string`` with the `` [SEP] `` separator appended.

    The separator carries one space on each side, so the result can be
    concatenated directly with the text that follows it.
    """
    return string + " [SEP] "

In [10]:
# Build "<question> [SEP] <answer>" sequences and keep only that column
dataset = dataset.assign(
    seq=dataset['question'].apply(add_ques_token) + dataset['answer']
)[['seq']]

# Show one concatenated example (looked up by its row label)
dataset.at[17081, "seq"]

"Is it wise to switch investment strategy frequently? [SEP] My super fund and I would say many other funds give you one free switch of strategies per year.  Some suggest you should change from high growth option to a more balance option once you are say about 10 to 15 years from retirement, and then change to a more capital guaranteed option a few years from retirement. This is a more passive approach and has benefits as well as disadvantages. The benefit is that there is not much work involved, you just change your investment option based on your life stage, 2 to 3 times during your lifetime. This allows you to take more risk when you are young to aim for higher returns, take a balanced approach with moderate risk and returns during the middle part of your working life, and take less risk with lower returns (above inflation) during the latter part of your working life. A possible disadvantage of this strategy is you may be in the higher risk/ higher growth option during a market corre

In [None]:
# Write data to file: one `seq` value per row, without index or header.
# NOTE(review): with QUOTE_NONE and no escapechar, a field containing the
# delimiter cannot be written — confirm the text never contains tabs.
dataset.to_csv(
    'data/data.txt',
    sep="\t",
    index=False,
    header=False,
    quoting=csv.QUOTE_NONE,
)