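# Installation required

Install the libraries used below before running the notebook: `pip install pdfplumber docx2pdf gensim torch transformers`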

# Importing various libraries

In [1]:
import pdfplumber
import re
import gensim
from gensim.parsing.preprocessing import remove_stopwords
import torch
from transformers import BertForQuestionAnswering

In [2]:
# Convert the Word version of the policy document to PDF.
# NOTE(review): presumably this writes sample_policy_document.pdf next to the
# .docx (the next cell opens a file of that name) — confirm against docx2pdf.
from docx2pdf import convert
convert("sample_policy_document.docx")

  0%|          | 0/1 [00:00<?, ?it/s]

In [3]:
# Read the text of the first two pages of the converted policy document.
# The `with` block closes the PDF even if text extraction raises, so the file
# handle is never leaked.
with pdfplumber.open('sample_policy_document.pdf') as pdf:
    page = pdf.pages[0]
    page1 = pdf.pages[1]
    # extract_text() can yield None for a page with no text layer; treat that as
    # an empty page. Joining with a newline keeps the last word of the first
    # page from fusing with the first word of the second page.
    pdf_txt = '\n'.join((p.extract_text() or '') for p in (page, page1))

In [4]:
# Show the raw extracted text so the extraction can be checked before cleaning.
print(pdf_txt)

 
 
D.2.1 Total disability benefit 
 
If the person insured is totally disabled, we will pay you the total disability benefit.  
The total disability benefit is calculated monthly and we will pay you half a month in arrears and half a month in advance.  
The definition of totally disabled depends on the person insured’s occupation category.  
 
Where the Schedule specifies the occupation category as MP, AA, A, B or C  
The person insured is totally disabled if, because of an injury or sickness, he or she is:  
•  not capable of doing the important duties of his or her occupation  
•  not working in any occupation (whether paid or unpaid), and  
•  under medical care.  
However, if immediately preceding a claim the person insured has been unemployed for 15 months or on leave 
without pay for 12 months, he or she is totally disabled if, because of an injury or sickness, he or she is:  
•  not capable of performing any occupation (whether paid or unpaid) for which he or she is reasonably 

In [5]:
# Drop the bullet glyphs left over from the PDF's lists, then show the result.
pdf_txt = pdf_txt.translate(str.maketrans('', '', '•'))
print(pdf_txt)

 
 
D.2.1 Total disability benefit 
 
If the person insured is totally disabled, we will pay you the total disability benefit.  
The total disability benefit is calculated monthly and we will pay you half a month in arrears and half a month in advance.  
The definition of totally disabled depends on the person insured’s occupation category.  
 
Where the Schedule specifies the occupation category as MP, AA, A, B or C  
The person insured is totally disabled if, because of an injury or sickness, he or she is:  
  not capable of doing the important duties of his or her occupation  
  not working in any occupation (whether paid or unpaid), and  
  under medical care.  
However, if immediately preceding a claim the person insured has been unemployed for 15 months or on leave 
without pay for 12 months, he or she is totally disabled if, because of an injury or sickness, he or she is:  
  not capable of performing any occupation (whether paid or unpaid) for which he or she is reasonably suit

# Question answering with the `BertForQuestionAnswering` class from the transformers library
- This class supports fine-tuning, but to keep things simple this example loads a BERT model that has already been fine-tuned on the SQuAD (Stanford Question Answering Dataset) benchmark.

In [6]:
# NOTE(review): BertForQuestionAnswering is already imported in the first cell,
# so this import is redundant; it is harmless but could be dropped.
from transformers import BertForQuestionAnswering

# Load the BERT-large checkpoint that has already been fine-tuned on SQuAD.
model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')


In [7]:
from transformers import BertTokenizer

# Load the tokenizer under the same checkpoint name as the model, so the token
# IDs it produces come from the vocabulary that checkpoint was trained with.
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

In [8]:
# The question to ask, and the document text in which to look for its answer.
question, answer_text = "How often total disability is calculated?", pdf_txt

#### We'll need to run the BERT tokenizer against both the question and the answer_text. To feed these into BERT, we actually concatenate them together and place the special [SEP] token in between.

In [9]:
# Encode the question and the document together as one text pair.
input_ids = tokenizer.encode(text=question, text_pair=answer_text)

# Report the length of the combined sequence.
print('The input has a total of {:} tokens.'.format(len(input_ids)))

The input has a total of 454 tokens.


#### Just to see exactly what the tokenizer is doing, let's print out the tokens with their IDs.

In [10]:
# BERT only needs the token IDs, but to inspect the tokenizer's behaviour we
# also recover the token strings and print both side by side.
tokens = tokenizer.convert_ids_to_tokens(input_ids)

# The loop variable is `token_id`, not `id`: a loop variable in a notebook cell
# is a global, so calling it `id` would shadow the builtin `id()` for every
# cell that runs afterwards.
for token, token_id in zip(tokens, input_ids):
    is_sep = token_id == tokenizer.sep_token_id

    # Put a blank line before and after each [SEP] token to make it stand out.
    if is_sep:
        print('')

    # Print the token string and its ID in two columns.
    print('{:<12} {:>6,}'.format(token, token_id))

    if is_sep:
        print('')

[CLS]           101
how           2,129
often         2,411
total         2,561
disability   11,980
is            2,003
calculated   10,174
?             1,029

[SEP]           102

d             1,040
.             1,012
2             1,016
.             1,012
1             1,015
total         2,561
disability   11,980
benefit       5,770
if            2,065
the           1,996
person        2,711
ins          16,021
##ured       12,165
is            2,003
totally       6,135
disabled      9,776
,             1,010
we            2,057
will          2,097
pay           3,477
you           2,017
the           1,996
total         2,561
disability   11,980
benefit       5,770
.             1,012
the           1,996
total         2,561
disability   11,980
benefit       5,770
is            2,003
calculated   10,174
monthly       7,058
and           1,998
we            2,057
will          2,097
pay           3,477
you           2,017
half          2,431
a             1,037
month         3,20

- We've concatenated the question and answer_text together, but BERT still needs a way to distinguish them. BERT has two special "Segment" embeddings, one for segment "A" and one for segment "B". Before the word embeddings go into the BERT layers, the segment A embedding needs to be added to the question tokens, and the segment B embedding needs to be added to each of the answer_text tokens.

- These additions are handled for us by the `transformers` library; all we need to do is specify a '0' or '1' for each token.

In [11]:
# Everything up to and including the first [SEP] token is segment A (the
# question); every token after it is segment B (the answer_text).
sep_index = input_ids.index(tokenizer.sep_token_id)
num_seg_a = sep_index + 1
num_seg_b = len(input_ids) - num_seg_a

# One segment id per token: 0 for segment A positions, 1 for segment B.
segment_ids = [0 if position < num_seg_a else 1
               for position in range(len(input_ids))]

# Sanity check: the two lists must line up token for token.
assert len(segment_ids) == len(input_ids)

- We're ready to feed our example into the model!

In [12]:
# Run our example through the model. This is inference only, so autograd is
# switched off: no computation graph is built or kept alive, which saves memory
# and time on a BERT-large forward pass.
with torch.no_grad():
    outputs = model(torch.tensor([input_ids]),  # The tokens representing our input text.
                    token_type_ids=torch.tensor([segment_ids]),  # The segment IDs to differentiate question from answer_text
                    return_dict=True)

# One score per input token for "the answer starts here" / "the answer ends here".
start_scores = outputs.start_logits
end_scores = outputs.end_logits


- Now we can highlight the answer just by looking at the most probable start and end words.

In [13]:
# The predicted span runs from the highest-scoring start position to the
# highest-scoring end position.
answer_start = start_scores.argmax()
answer_end = end_scores.argmax()

# Join the tokens of that span (end position inclusive) and show them.
answer = ' '.join(tokens[answer_start:answer_end + 1])

print('Answer: "' + answer + '"')

Answer: "monthly"


# Turn this QA process into a function so that we can easily try it on other examples.

In [14]:
def answer_question(question, answer_text):
    '''
    Find the span of `answer_text` that answers `question` and print it.

    Uses the notebook-level `tokenizer` and `model`.

    Args:
        question: The question, as a plain string.
        answer_text: The passage expected to contain the answer. If the
            question plus the passage exceed the model's maximum sequence
            length, the passage (never the question) is truncated, so an
            answer located past the cut-off cannot be found.

    Returns:
        None. The answer is printed as `Answer: "<text>"`.
    '''
    # ======== Tokenize ========
    # BERT has a fixed-size position-embedding table; feeding it a longer
    # sequence fails inside the model. Truncate only the passage so that the
    # question always survives intact.
    max_tokens = model.config.max_position_embeddings
    input_ids = tokenizer.encode(question, answer_text,
                                 max_length=max_tokens,
                                 truncation='only_second')

    # ======== Set Segment IDs ========
    # Segment A is the question up to and including the first [SEP] token;
    # the remainder is segment B (the passage).
    sep_index = input_ids.index(tokenizer.sep_token_id)
    num_seg_a = sep_index + 1
    num_seg_b = len(input_ids) - num_seg_a
    segment_ids = [0]*num_seg_a + [1]*num_seg_b

    # There should be a segment_id for every input token.
    assert len(segment_ids) == len(input_ids)

    # ======== Evaluate ========
    # Inference only: disable autograd so no computation graph is retained.
    with torch.no_grad():
        outputs = model(torch.tensor([input_ids]), # The tokens representing our input text.
                        token_type_ids=torch.tensor([segment_ids]), # The segment IDs to differentiate question from answer_text
                        return_dict=True)

    # Drop the batch dimension (the batch holds exactly one example).
    start_scores = outputs.start_logits[0]
    end_scores = outputs.end_logits[0]

    # ======== Reconstruct Answer ========
    # Take the most likely start, then the most likely end at or after it.
    # Picking the two independently can put the end before the start, which
    # would yield a meaningless one-token "answer".
    answer_start = int(torch.argmax(start_scores))
    answer_end = answer_start + int(torch.argmax(end_scores[answer_start:]))

    # Get the string versions of the input tokens.
    tokens = tokenizer.convert_ids_to_tokens(input_ids)

    # Start with the first token, then append the rest of the span.
    answer = tokens[answer_start]
    for token in tokens[answer_start + 1:answer_end + 1]:
        if token.startswith('##'):
            # WordPiece continuation: glue it onto the previous token.
            answer += token[2:]
        else:
            # A new word: separate it with a space.
            answer += ' ' + token

    print('Answer: "' + answer + '"')

In [15]:
import textwrap

# The passage that the questions below are asked against.
bert_abstract = pdf_txt

# Re-flow the passage to 80-character lines for display.
wrapper = textwrap.TextWrapper(width=80)
print(wrapper.fill(text=bert_abstract))

    D.2.1 Total disability benefit    If the person insured is totally disabled,
we will pay you the total disability benefit.   The total disability benefit is
calculated monthly and we will pay you half a month in arrears and half a month
in advance.   The definition of totally disabled depends on the person insured’s
occupation category.     Where the Schedule specifies the occupation category as
MP, AA, A, B or C   The person insured is totally disabled if, because of an
injury or sickness, he or she is:     not capable of doing the important duties
of his or her occupation     not working in any occupation (whether paid or
unpaid), and     under medical care.   However, if immediately preceding a claim
the person insured has been unemployed for 15 months or on leave  without pay
for 12 months, he or she is totally disabled if, because of an injury or
sickness, he or she is:     not capable of performing any occupation (whether
paid or unpaid) for which he or she is reasonably suit

# Let's ask BERT all the questions and get the answers

In [16]:
# Question 1: ask how often the benefit is calculated.
question1 = "How often total disability is calculated?"
answer_question(question=question1, answer_text=bert_abstract)

Answer: "monthly"


In [17]:
# Question 2: ask how the benefit is paid.
question2 = "How to pay the total disability?"
answer_question(question=question2, answer_text=bert_abstract)

Answer: "the total disability benefit is calculated monthly"


In [18]:
# Question 3: ask what the definition of total disability depends on.
question3 = "What the definition of total disability depends on?"
answer_question(question=question3, answer_text=bert_abstract)

Answer: "the person insured ’ s occupation category"


In [19]:
# Question 4: ask for the definition that applies to the listed occupation categories.
question4 = "The definition of total disability for occupation category as MP, AA, A, B or C?"
answer_question(question=question4, answer_text=bert_abstract)

Answer: "totally disabled"
