In [20]:
import docx2txt
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Extract the plain text of the policy document (path is relative to the working directory).
POLICY_DOCX = r"sample_policy_document.docx"
text = docx2txt.process(POLICY_DOCX)


In [21]:
# Dump the extracted text to check what docx2txt produced before tokenising it.
print(text)


D.2.1 Total disability benefit



If the person insured is totally disabled, we will pay you the total disability benefit. 

The total disability benefit is calculated monthly and we will pay you half a month in arrears and half a month in advance. 

The definition of totally disabled depends on the person insured’s occupation category. 



Where the Schedule specifies the occupation category as MP, AA, A, B or C 

The person insured is totally disabled if, because of an injury or sickness, he or she is: 

not capable of doing the important duties of his or her occupation 

not working in any occupation (whether paid or unpaid), and 

under medical care. 

However, if immediately preceding a claim the person insured has been unemployed for 15 months or on leave without pay for 12 months, he or she is totally disabled if, because of an injury or sickness, he or she is: 

not capable of performing any occupation (whether paid or unpaid) for which he or she is reasonably suited by educati

In [22]:
# Make sure the Punkt sentence-tokenizer data is available locally.
nltk.download('punkt')
# Lower-case and trim the document, then split it into sentences.
normalized_text = text.lower().strip()
sentences = nltk.sent_tokenize(normalized_text)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [23]:
# Inspect the sentence list produced by the tokenizer.
sentences

['d.2.1 total disability benefit\n\n\n\nif the person insured is totally disabled, we will pay you the total disability benefit.',
 'the total disability benefit is calculated monthly and we will pay you half a month in arrears and half a month in advance.',
 'the definition of totally disabled depends on the person insured’s occupation category.',
 'where the schedule specifies the occupation category as mp, aa, a, b or c \n\nthe person insured is totally disabled if, because of an injury or sickness, he or she is: \n\nnot capable of doing the important duties of his or her occupation \n\nnot working in any occupation (whether paid or unpaid), and \n\nunder medical care.',
 'however, if immediately preceding a claim the person insured has been unemployed for 15 months or on leave without pay for 12 months, he or she is totally disabled if, because of an injury or sickness, he or she is: \n\nnot capable of performing any occupation (whether paid or unpaid) for which he or she is reason

In [27]:
# The heading is whatever precedes the first run of three newlines in the first sentence.
heading = sentences[0].partition('\n\n\n')[0]

In [28]:
# remove heading from the document
# partition() keeps everything after the first separator and reports whether the
# separator was found, so re-running this cell once the heading is already gone is
# a no-op instead of an IndexError from split(...)[1].
_before, _separator, _after = sentences[0].partition('\n\n\n')
if _separator:
    sentences[0] = _after

In [29]:
# Re-display the sentences after the heading-removal step.
sentences

['\nif the person insured is totally disabled, we will pay you the total disability benefit.',
 'the total disability benefit is calculated monthly and we will pay you half a month in arrears and half a month in advance.',
 'the definition of totally disabled depends on the person insured’s occupation category.',
 'where the schedule specifies the occupation category as mp, aa, a, b or c \n\nthe person insured is totally disabled if, because of an injury or sickness, he or she is: \n\nnot capable of doing the important duties of his or her occupation \n\nnot working in any occupation (whether paid or unpaid), and \n\nunder medical care.',
 'however, if immediately preceding a claim the person insured has been unemployed for 15 months or on leave without pay for 12 months, he or she is totally disabled if, because of an injury or sickness, he or she is: \n\nnot capable of performing any occupation (whether paid or unpaid) for which he or she is reasonably suited by education, training o

In [30]:
# Sample user queries used to exercise the sentence-retrieval function.
questions = [
    "How often total disability is calculated? ",
    "How to pay the total disability?",
    "What the definition of total disability depends on",
    "The definition of total disability for occupation category as MP, AA, A, B or C",
]

In [34]:
# TF-IDF features over unigrams and bigrams (ngram_range=(1, 2)).
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2))

# Fit the vectorizer on the document sentences and build their TF-IDF matrix.
tfidf_matrix = tfidf_vectorizer.fit_transform(sentences)

# Function to get the response to a user's query
def get_response(query):
  """Return the document sentence most similar to ``query``.

  The whole query is vectorized as a single document with the fitted
  ``tfidf_vectorizer`` and compared to every row of ``tfidf_matrix`` by cosine
  similarity; the sentence at the highest-scoring row is returned.

  Args:
    query: The user's question as a string.

  Returns:
    The element of ``sentences`` with the highest similarity score. When every
    score is equal (e.g. all zero), ``argmax`` picks index 0, so the first
    sentence is returned.
  """
  # Vectorize the query as ONE document. Splitting it into sentences first yields
  # one score row per query sentence, and a flat argmax over that 2-D array is not
  # a valid index into `sentences`.
  query_vector = tfidf_vectorizer.transform([query])
  scores = cosine_similarity(query_vector, tfidf_matrix)[0]
  max_index = int(scores.argmax())
  return sentences[max_index]

# Example usage: ask a single question and show the best-matching sentence.
query = "How to pay the total disability?"
response = get_response(query)
print(f"question: {query}")
print(f"Answer: {response}")

question: How to pay the total disability?
Answer: 
if the person insured is totally disabled, we will pay you the total disability benefit.


In [48]:
# Run every sample question through get_response and print question/answer pairs.
# enumerate() supplies the running number, replacing the hand-maintained counter.
for index, query in enumerate(questions):
    # Normalise only the text passed to get_response; the question is printed as written.
    response = get_response(query.lower().strip())
    print(f"question-{index}:", query)
    print("Answer:", response.strip())

question-0: How often total disability is calculated? 
Answer: the total disability benefit is calculated monthly and we will pay you half a month in arrears and half a month in advance.
question-1: How to pay the total disability?
Answer: if the person insured is totally disabled, we will pay you the total disability benefit.
question-2: What the definition of total disability depends on
Answer: the definition of totally disabled depends on the person insured’s occupation category.
question-3: The definition of total disability for occupation category as MP, AA, A, B or C
Answer: where the schedule specifies the occupation category as mp, aa, a, b or c 

the person insured is totally disabled if, because of an injury or sickness, he or she is: 

not capable of doing the important duties of his or her occupation 

not working in any occupation (whether paid or unpaid), and 

under medical care.
