In [1]:
# importing the libraries
import numpy as np
import nltk
import re
import gensim
from gensim.parsing.preprocessing import remove_stopwords
from gensim import corpora
from sklearn.feature_extraction.text import TfidfVectorizer 
import heapq

In [2]:
!pip install PyPDF2

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[K     |████████████████████████████████| 232 kB 4.1 MB/s eta 0:00:01
[?25hCollecting typing_extensions>=3.10.0.0
  Downloading typing_extensions-4.7.0-py3-none-any.whl (33 kB)
Installing collected packages: typing-extensions, PyPDF2
  Attempting uninstall: typing-extensions
    Found existing installation: typing-extensions 3.7.4.3
    Uninstalling typing-extensions-3.7.4.3:
      Successfully uninstalled typing-extensions-3.7.4.3
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow 2.4.1 requires typing-extensions~=3.7.4, but you have typing-extensions 4.7.0 which is incompatible.
arviz 0.11.2 requires typing-extensions<4,>=3.7.4.3, but you have typing-extensions 4.7.0 which is incompatible.
aiobotocore 1.3.0 requires botocore<1.20.50,>=1.20.49, but you have botocore 1.20.53 w

In [3]:
!pip install docx2txt

Collecting docx2txt
  Downloading docx2txt-0.8.tar.gz (2.8 kB)
Building wheels for collected packages: docx2txt
  Building wheel for docx2txt (setup.py) ... [?25ldone
[?25h  Created wheel for docx2txt: filename=docx2txt-0.8-py3-none-any.whl size=3966 sha256=c5ec9f2eb8ea9b0250d1e40dce3b03c9d05503ba87d3dbd33a365260ebf21946
  Stored in directory: /root/.cache/pip/wheels/b7/20/b2/473e3aea9a0c0d3e7b2f7bd81d06d0794fec12752733d1f3a8
Successfully built docx2txt
Installing collected packages: docx2txt
Successfully installed docx2txt-0.8


In [4]:
import os
import docx2txt
from PyPDF2 import PdfReader
from transformers import pipeline

In [5]:
# Function to load and read a text document
def load_text_document(file_path, encoding="utf-8"):
    """Read a plain-text file and return its whole content as one string.

    Args:
        file_path: Path of the text file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's default encoding.

    Returns:
        The decoded file content as a ``str``.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid in ``encoding``.
    """
    with open(file_path, 'r', encoding=encoding) as file:
        return file.read()

# Function to load and read a Word document
def load_word_document(file_path):
    """Return whatever ``docx2txt.process`` extracts from the file at ``file_path``."""
    return docx2txt.process(file_path)

# Function to load and read a PDF document
def load_pdf_document(file_path):
    """Extract the text of every page of a PDF and return it as one string.

    Pages are concatenated in document order with no separator between them.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The concatenated text of all pages ("" for a PDF without pages).
    """
    with open(file_path, 'rb') as file:
        pdf = PdfReader(file)
        # Extraction must happen while the file is still open.
        # `or ""` keeps the join safe should a page yield no text (None).
        return "".join(page.extract_text() or "" for page in pdf.pages)


In [6]:
def process_document(file_path):
    """Load a document and return its text, choosing the loader by file extension.

    Supported extensions (matched case-insensitively): ``.txt``, ``.docx``, ``.pdf``.

    Args:
        file_path: Path of the document to load.

    Returns:
        The text returned by the matching loader.

    Raises:
        ValueError: If the extension is not one of the supported formats.
    """
    _, file_extension = os.path.splitext(file_path)
    # Normalise so that e.g. "REPORT.PDF" is handled like "report.pdf".
    file_extension = file_extension.lower()
    if file_extension == '.txt':
        return load_text_document(file_path)
    if file_extension == '.docx':
        return load_word_document(file_path)
    if file_extension == '.pdf':
        return load_pdf_document(file_path)
    # Fail with a clear message instead of falling through to an unbound variable.
    raise ValueError(
        f"Unsupported file format: {file_extension!r} (expected .txt, .docx or .pdf)"
    )

In [7]:
# Path of the document to analyse; process_document picks the loader from the file extension.
file_path = "../input/us-declaration-pdf-file/US_Declaration.pdf"  # Replace with the actual path to your document
# Full extracted text of the document as a single string.
content = process_document(file_path)

In [8]:
content

"Declaration of Independence\nIN CONGRESS, July 4, 1776.  \nThe unanimous Declaration of the thirteen united States of America,  \nWhen in the Course of human events, it becomes necessary for one people to dissolve thepolitical bands which have connected them with another, and to assume among the powers of theearth, the separate and equal station to which the Laws of Nature and of Nature's God entitlethem, a decent respect to the opinions of mankind requires that they should declare the causeswhich impel them to the separation. We hold these truths to be self-evident, that all men are created equal, that they are endowed bytheir Creator with certain unalienable Rights, that among these are Life, Liberty and the pursuit\nof Happiness.— \x14That to secure these rights, Governments are instituted among Men, derivingtheir just powers from the consent of the governed,—  \x14That whenever any Form of Government\nbecomes destructive of these ends, it is the Right of the People to alter or to 

In [9]:
#class for preprocessing and creating word embedding
class Preprocessing:
    """Split a text into sentences, clean them and build TF-IDF vectors.

    Attributes:
        tokens: Sentences produced by ``nltk.sent_tokenize`` from the input text.
        tfidfvectoriser: ``TfidfVectorizer`` that is fitted by ``TFIDF`` and
            reused by ``TFIDF_Q``.
    """

    #constructor
    def __init__(self,txt):
        """Tokenize ``txt`` into sentences and create an unfitted vectoriser."""
        # punkt is the nltk sentence tokenizer; only download it when it is
        # missing, so creating an instance does not hit the network every time.
        try:
            nltk.data.find('tokenizers/punkt')
        except LookupError:
            nltk.download('punkt')
        # breaking text to sentences
        self.tokens = nltk.sent_tokenize(txt)
        self.tfidfvectoriser=TfidfVectorizer()

    # Data Cleaning
    # remove extra spaces
    # convert sentences to lower case
    # remove stopword
    def clean_sentence(self, sentence, stopwords=False):
        """Return a normalised copy of ``sentence``.

        The sentence is lower-cased, every character that is not a-z, 0-9 or
        whitespace is dropped, and runs of whitespace (including line breaks)
        are collapsed to single spaces. With ``stopwords=True`` the result is
        additionally passed through gensim's ``remove_stopwords``.
        """
        sentence = sentence.lower().strip()
        sentence = re.sub(r'[^a-z0-9\s]', '', sentence)
        # Collapse inner whitespace so both the stopword and the non-stopword
        # path return single-spaced text.
        sentence = re.sub(r'\s+', ' ', sentence).strip()
        if stopwords:
            sentence = remove_stopwords(sentence)
        return sentence

    # store cleaned sentences to cleaned_sentences
    def get_cleaned_sentences(self,tokens, stopwords=False):
        """Apply ``clean_sentence`` to every item of ``tokens``; returns a list."""
        return [self.clean_sentence(line, stopwords) for line in tokens]

    #do all the cleaning
    def cleanall(self):
        """Clean ``self.tokens`` twice.

        Returns:
            ``[cleaned_sentences, cleaned_sentences_with_stopwords]`` where the
            first list had stopwords removed and the second kept them.
        """
        cleaned_sentences = self.get_cleaned_sentences(self.tokens, stopwords=True)
        cleaned_sentences_with_stopwords = self.get_cleaned_sentences(self.tokens, stopwords=False)
        return [cleaned_sentences,cleaned_sentences_with_stopwords]

    # TF-IDF Vectorizer
    def TFIDF(self,cleaned_sentences):
        """Fit the vectoriser on ``cleaned_sentences`` and return their TF-IDF vectors."""
        self.tfidfvectoriser.fit(cleaned_sentences)
        tfidf_vectors=self.tfidfvectoriser.transform(cleaned_sentences)
        return tfidf_vectors

    #tfidf for question
    def TFIDF_Q(self,question_to_be_cleaned):
        """Vectorise one question string with the vectoriser fitted by ``TFIDF``.

        ``TFIDF`` (or ``doall``) must have been called first so that the
        vectoriser is fitted.
        """
        tfidf_vectors=self.tfidfvectoriser.transform([question_to_be_cleaned])
        return tfidf_vectors

    # main call function
    def doall(self):
        """Run cleaning and TF-IDF fitting in one call.

        Returns:
            ``[cleaned_sentences, cleaned_sentences_with_stopwords, tfidf]``;
            ``tfidf`` is computed from the stopword-free sentences.
        """
        cleaned_sentences, cleaned_sentences_with_stopwords = self.cleanall()
        tfidf = self.TFIDF(cleaned_sentences)
        return [cleaned_sentences,cleaned_sentences_with_stopwords,tfidf]


In [10]:
#class for answering the question.
class AnswerMe:
    """Score how close a question vector is to a sentence vector.

    The methods use ``.T`` and ``.shape``, so inputs are expected to be numpy
    arrays (the caller in this notebook passes 2-D arrays from ``.toarray()``).
    """

    #cosine similarity
    def Cosine(self, question_vector, sentence_vector):
        """Return the cosine similarity of the two vectors.

        The result has the shape of ``np.dot(question_vector, sentence_vector.T)``.
        If either vector has zero norm the similarity is reported as 0 instead
        of dividing by zero (which would yield NaN).
        """
        dot_product = np.dot(question_vector, sentence_vector.T)
        denominator = (np.linalg.norm(question_vector) * np.linalg.norm(sentence_vector))
        if denominator == 0:
            # e.g. a question that shares no vocabulary with the fitted corpus
            return np.zeros_like(dot_product, dtype=float)
        return dot_product/denominator

    #Euclidean distance
    def Euclidean(self, question_vector, sentence_vector):
        """Return the Euclidean (Frobenius) norm of the difference of the vectors.

        If the shapes differ, the array with fewer rows is passed through
        ``np.resize`` to the other's 2-D shape before subtracting; for
        equally-shaped inputs that step changes nothing.
        """
        # No defensive copies needed: np.resize and the subtraction both
        # allocate new arrays, so the inputs are never modified.
        vec1, vec2 = question_vector, sentence_vector
        if len(vec1)<len(vec2): vec1,vec2 = vec2,vec1
        vec2 = np.resize(vec2,(vec1.shape[0],vec1.shape[1]))
        return np.linalg.norm(vec1-vec2)

    # main call function
    def answer(self, question_vector, sentence_vector, method):
        """Dispatch on ``method``: 1 -> Euclidean distance, anything else -> cosine similarity."""
        if method==1: return self.Euclidean(question_vector,sentence_vector)
        else: return self.Cosine(question_vector,sentence_vector)


# Trying HeapQ, TFIDF and Cosine

In [11]:
def RetrieveAnswer(question_embedding, tfidf_vectors,method=1):
    """Score every sentence vector against the question and return the scores as a heap.

    Args:
        question_embedding: Vector of the question; must provide ``.toarray()``.
        tfidf_vectors: Iterable of sentence vectors, each providing ``.toarray()``
            (presumably the sparse TF-IDF matrix from ``Preprocessing.TFIDF``,
            iterated row by row).
        method: 1 scores with Euclidean distance; any other value scores with
            cosine similarity.

    Returns:
        A list maintained with ``heapq`` holding ``(key, index)`` tuples. For
        method 1 the key is the distance; otherwise it is the negated
        similarity, so in both cases the smallest key is the best match.
    """
    scorer = AnswerMe()
    # Loop-invariant: densify the question once instead of once per sentence.
    question_array = question_embedding.toarray()
    similarity_heap = []

    for index, embedding in enumerate(tfidf_vectors):
        similarity = scorer.answer(question_array, embedding.toarray(), method).mean()
        # heapq is a min-heap, so similarities are negated to pop the largest first.
        key = similarity if method==1 else -similarity
        heapq.heappush(similarity_heap,(key,index))

    return similarity_heap


In [12]:
# Put your question here
user_question = "when was independence declared"
# Scoring method handed to RetrieveAnswer: 1 selects Euclidean distance,
# any other value selects cosine similarity (see AnswerMe.answer).
method = 1

In [13]:
# Split the document into sentences, clean them and fit the TF-IDF vectoriser.
preprocess = Preprocessing(content)
cleaned_sentences,cleaned_sentences_with_stopwords,tfidf_vectors = preprocess.doall()

# Clean the question with stopwords removed (as was done for the sentences the
# vectoriser was fitted on) and vectorise it with that fitted vectoriser.
question = preprocess.clean_sentence(user_question, stopwords=True)
question_embedding = preprocess.TFIDF_Q(question)

# One (score, sentence index) entry per sentence, arranged as a heap.
similarity_heap = RetrieveAnswer(question_embedding , tfidf_vectors ,method)

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [14]:
print("Question: ", user_question)

# number of most relevant sentences to print (here: 2)
number_of_sentences_to_print = 2

# similarity_heap is a heapq heap, so entries must be taken with heappop:
# it always removes the smallest remaining key and restores the heap order.
# list.pop(0) only returns the best entry on the first call.
while number_of_sentences_to_print>0 and len(similarity_heap)>0:
    _, sentence_index = heapq.heappop(similarity_heap)
    print(cleaned_sentences_with_stopwords[sentence_index])
    number_of_sentences_to_print-=1

Question:  when was independence declared
declaration of independence
in congress july 4 1776
he has combined with others to subject us to a jurisdiction foreign to our
constitution and unacknowledged by our laws giving his assent to their acts of
pretended legislation
for quartering  large bodies of  armed troops amon g us
for protecting them by a mock trial from punishment for any  murders which
they should c ommit on the inha bitants of these sta tes
for cutting off our trade with all parts of the world
for imposing taxe s on us without our con sent for deprivi ng us in many  cases
of the benefits of t rial by j ury
for transporting us beyond seas to be tried for pretended of fences
for abolishing the free system of english l aws in a neighbouring province
establishing therein an arbitrary government and e nlarging its boundaries so asto render it at onc e an example and fi t instrument for intr oducing the same
absolute rule into  these colonies
for taking away our charters abolish

# Hugging Face Transformer Pipeline

In [18]:
# Perform natural language understanding tasks using a pre-trained model
nlp = pipeline("question-answering")
summarizer = pipeline("summarization")

# Extract key information and provide summarization
summary = summarizer(content, min_length=5, max_length=20, truncation=True)
summary_text = summary[0]['summary_text']

candidate_summaries = []

# User interaction loop: answer questions about `content` until the user types 'exit'
while True:
    user_input = input("What would you like to know about the document? (Type 'exit' to quit): ").strip()
    if user_input.lower() == 'exit':
        break
    if not user_input:
        # The question-answering pipeline rejects an empty question, so
        # prompt again instead of letting the cell crash.
        continue
    result = nlp(question=user_input, context=content)
    answer = result['answer']
    print("Answer:", answer)

What would you like to know about the document? (Type 'exit' to quit):  when was independence declared?


Answer: July 4, 1776


What would you like to know about the document? (Type 'exit' to quit):  the representatives of united States of America appealed to whom?


Answer: their native justice and magnanimity


What would you like to know about the document? (Type 'exit' to quit):  who were the representatives of North Carolina?


Answer: William Hooper
   Joseph Hewes
   John Penn
 South Carolina


What would you like to know about the document? (Type 'exit' to quit):  the unanimous declaration of ?


Answer: thirteen united States of America


What would you like to know about the document? (Type 'exit' to quit):  exit


## Made question-answering a little interactive in the above example

# Checking my own CV

In [16]:
   
# Second document: a resume PDF. Note that file_path is reassigned here.
file_path = "../input/resume/Shubhangi.pdf"  # Replace with the actual path to your document
# Extracted resume text, kept separate from `content` (the first document).
content1 =process_document(file_path)
# NOTE(review): this prints the full extracted text; clear the output before sharing if it holds personal data.
print(content1)

SUMMARY
Meticulous & result-oriented Data Scientist with 10+ years experience, armed with a proven record of analytical acumen in 
developing complex machine learning and statistical modeling algorithms/techniques for identifying patterns and extracting 
valuable insights.
Eligible to work full-time in the United Kingdom without company support or sponsorship.
KEY SKILLS
  • Machine Learning Methodologies • Optimization Techniques   • Predictive & Statistical Modelling  
 • Hyperparameter Tuning  • Data Analysis • Deep Learning • AWS • Data Visualization • Predictive Modelling & Analytics
•  Selenium • Automation Testing • Big Data • Team Coordination & Leadership  • Data Manipulation
TECHNICAL SKILLS
Languages: Python, Java
Cloud Computing: AWS 
Machine/Deep Learning: ANN, CNN, LSTM, RNN,NLP
Database: MySQL, Oracle, Postgre, SQL Server
Statistics/ML: Linear/Logistic Regression, Ensemble Trees, Gradient Boosted trees, Time Series, Regularization
Data Visualization: Tableau, Power BI
ED

In [20]:
# Perform natural language understanding tasks using a pre-trained model
nlp = pipeline("question-answering")
summarizer = pipeline("summarization")

# Extract key information and provide summarization.
# Summarise the resume text (content1), the document this cell asks questions about.
summary = summarizer(content1, min_length=5, max_length=20, truncation=True)
summary_text = summary[0]['summary_text']

candidate_summaries = []
# User interaction loop: answer questions about `content1` until the user types 'exit'
while True:
    user_input = input("What would you like to know about the document? (Type 'exit' to quit): ").strip()
    if user_input.lower() == 'exit':
        break
    if not user_input:
        # The question-answering pipeline rejects an empty question, so
        # prompt again instead of letting the cell crash.
        continue
    result = nlp(question=user_input, context=content1)
    answer = result['answer']
    print("Answer:", answer)

What would you like to know about the document? (Type 'exit' to quit):  what college I attended from 2008-2012?


Answer: Harcourt Butler Technological Institute


What would you like to know about the document? (Type 'exit' to quit):  Key skills?


Answer: Increasing product performance and stability by implementing new test strategies


What would you like to know about the document? (Type 'exit' to quit):  organisation I worked for between 2020-2021?


Answer: Anti Money Laundering problem


What would you like to know about the document? (Type 'exit' to quit):  What is my email id?


Answer: gmail.com


What would you like to know about the document? (Type 'exit' to quit):  exit


**Not working well on unstructured documents**

# Trying with text that I used for another model to understand

In [21]:
pdf_txt = """Life insurance is a legally enforceable contract between two parties both of whom are legally qualified to contract. It is therefore, necessary that the terms and conditions of the agreement must be suitably documented in a manner that would make it clear that both parties to the contract are Ad- idem i.e., of the same mind. Ad-Idem means that both the parties understand the same thing in the same sense or are of the same mind on the same subject. There must be consensus or Ad-Idem between the parties to the contract.
This is possible provided all the terms and conditions, rights and duties - privileges and obligations are properly documented in terms which can be clearly interpreted in a court of law. Between two human beings sometime silence means an acceptance. But as the insurer is a legal personality entitled to contract verbal discussion between parties to the contract is not possible and hence there is a need for documentation.
Insurance is also a contract of utmost good faith and enforced only in the distant future. It is therefore necessary that the declarations made by both the parties should be put in black and white for future reference. Any suppression, willful and material shall make the contract void. The insured, therefore, has a duty to declare all that he knows about himself, his health, his financial status in answering questions contained in the proposal form and other ancillary documents which may be required by the insurer.
Age is an important factor in deciding the quantum of premium against a policy. The document proving the age, i.e. age proof must be reliable and the insured has to undertake as to its truthfulness.
Non-standard age proofs are those which are comparatively less reliable and therefore the insurer accepts them with a pinch of salt. In other words the insurer takes certain precautions before accepting such age proofs as final.
Proof of income is the document may become necessary whenever the sum proposed is very high. Normally a sum proposed which is seven to eight times of the declared income is acceptable for insurance. But proposals do come to the insurer when the known source of income of the proposer is much less compared to the amount of insurance desired. A service holder normally does not face this problem as his sources of income are verifiable.
In case of business people, the assessed income is at times much less compared to what is a desirable income for the amount of insurance desired. In such cases the insurer at times calls for assessed income tax returns, or Chartered Accountant’s certificate etc. Such precautions are necessary to eliminate the possibility of moral hazard.
Policy Contract is the policy document is a detailed document and it is the Evidence of the insurance contract which mentions all the terms and conditions of the insurance. The insured buys not the policy contract, but the right to the sum of money and its future delivery. The insurer on its part promises to pay a sum of money, provided of course the insured keeps its part of promise of paying the installments of premium as scheduled.
The pre-amble to the insurance contract makes the above statement clear and states that this policy is issued subject to the conditions and privileges printed on the back of the policy. The endorsements placed on the policy shall also be part of the policy and it also makes a reference to the proposal form saying that that the statements given in the proposal form are the basis of the contract.
The schedule which is printed on the policy document identifies the office which has issued the policy. It states the name of the policyholder, the date of commencement of the policy, an identification number of the policy called policy number. This number is extremely useful for making any reference to the insurer relating to this policy. This shall avoid needless delay.
Beneficiary’s name is also mentioned along with address.
It is necessary to check that it is correct and any mistake should be immediately pointed out for correction. A mistake in the address may misdirect the premium notices and any other future correspondence. It also states the name of the nominee and the date upto which premium has to be paid. The schedule goes on to mention, the type of policy, on the happening of which, the sum assured is payable and to whom it is payable. It of course also mentions when and how long the premium is to be paid.
The policy document is signed by an official of the insurer and dated and stamped as per the provision of the Stamp Act to make it a completely legally enforceable document.
An assignment of a policy in favour of another person or institution can be effected by an endorsement on the policy. Re-assignment can also be done by a subsequent endorsement on the same policy.As a nomination is automatically cancelled due to an assignment, after re-assignment, it is necessary to make a fresh nomination.
In Duplicate policy a policy document is a valuable document and can be used for mortgage etc. Loss of policy document does not absolve the insurer from the liability of payment of policy proceeds when the claim arises. The claim can be settled on the claimants, furnishing an indemnity bond jointly with one surety.
If a policy is irrevocably lost, a duplicate policy can be issued, after following a certain procedure like the insurer satisfies itself of the circumstances leading to loss and being so satisfied the insurer insists upon an advertisement in a news paper, production of an indemnity bond and payment of policy preparation charges and there after a duplicate policy is issued.
Generally nomination is made at the time of taking a policy. In case it is not done, it is possible to make nomination subsequently by an endorsement on the policy. It is also possible to change a nomination subsequently by an endorsement. After marriage, such change in nomination is normally required.
Life insurance being a legally enforceable contract, needs to be documented with details of the rights and obligations of the parties to the contract. Proposal form duly filled in and signed by the proposer is the first document which forms the basis of the contract.
Every time, the insured pays the premium, he receives a premium receipt. The premium needs to be paid in time, non- payment of premium leads to policy-lapses. Re-instatement of the cover is called revival of the policy.
If the policy is not revived, the policy can become a paid up policy for a reduced sum assured under certain conditions.
The policy document mentions in detail all the rights and obligations of the policyholder. The agent is advised to explain the various provisions of the policy to the policyholder.
The wordings in the policy document are of technical nature and hence the need for explaining. If there are certain endorsements on the policy, that need to be explained too.
It needs to be explained that the policy is a valuable document and needs to be kept in safe custody and in the knowledge of the close relatives.
"""

In [22]:
# Perform natural language understanding tasks using a pre-trained model
nlp = pipeline("question-answering")
summarizer = pipeline("summarization")

# Extract key information and provide summarization.
# Summarise the insurance text (pdf_txt), the document this cell asks questions about.
summary = summarizer(pdf_txt, min_length=5, max_length=20, truncation=True)
summary_text = summary[0]['summary_text']

candidate_summaries = []
# User interaction loop: answer questions about `pdf_txt` until the user types 'exit'
while True:
    user_input = input("What would you like to know about the document? (Type 'exit' to quit): ").strip()
    if user_input.lower() == 'exit':
        break
    if not user_input:
        # The question-answering pipeline rejects an empty question, so
        # prompt again instead of letting the cell crash.
        continue
    result = nlp(question=user_input, context=pdf_txt)
    answer = result['answer']
    print("Answer:", answer)

What would you like to know about the document? (Type 'exit' to quit):  When can a duplicate policy be issued?


Answer: If a policy is irrevocably lost


What would you like to know about the document? (Type 'exit' to quit):  exit
