In [11]:
#!pip install pdfplumber

In [12]:
import pdfplumber

pdf_path = "/content/drive/MyDrive/DATASETS/google_terms_of_service_en_in.pdf"

output_text_file = 'extracted_text.txt'

with pdfplumber.open(pdf_path) as pdf:
    extracted_text = ""
    for page in pdf.pages:
        extracted_text += page.extract_text()

with open(output_text_file, 'w') as text_file:
    text_file.write(extracted_text)

print(f"Text extracted and saved to {output_text_file}")

Text extracted and saved to extracted_text.txt


In [13]:
#reading pdf content
with open('/content/extracted_text.txt', 'r') as file:
  document_text = file.read()

#preview the document content
print(document_text[:500]) #preview the first 500 characters

GOOGLE TERMS OF SERVICE
Effective May 22, 2024 | Archived versions
What’s covered in these terms
We know it’s tempting to skip these Terms of
Service, but it’s important to establish what you
can expect from us as you use Google services,
and what we expect from you.
These Terms of Service re ect the way Google’s business works, the laws that apply to
our company, and certain things we’ve always believed to be true. As a result, these Terms
of Service help de ne Google’s relationship with you as


In [14]:
from transformers import pipeline

#load the summarization pipeline
summarizer = pipeline("summarization", model='t5-small')

# summarize the document text (you can summarize parts if the document is too large)
summary = summarizer(document_text[:1000], max_length=150, min_length=30, do_sample=False)
print("Summary:", summary[0]['summary_text'])

Summary: these Terms of Service reect the way Google’s business works, the laws that apply to our company, and certain things we’ve always believed to be true . these terms include: what you can expect from us, which describes how we provide and develop our services What we expect from you, which establishes certain rules for using our services Content in Google services .


In [15]:
import nltk
from nltk.tokenize import sent_tokenize

# Download NLTK's punkt tokenizer (if not already downloaded)
nltk.download('punkt_tab')

# Split text into sentences
sentences = sent_tokenize(document_text)

# Combine sentences into passages (adjust word limit as needed)
passages = []
current_passage = ""
for sentence in sentences:
    if len(current_passage.split()) + len(sentence.split()) < 200:
        current_passage += " " + sentence
    else:
        passages.append(current_passage.strip())
        current_passage = sentence
if current_passage:
    passages.append(current_passage.strip())

print(passages)



[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [17]:
# load the question generation pipeline
qg_pipeline = pipeline("text2text-generation", model="valhalla/t5-base-qg-hl")

# function to generate questions using the pipeline
def generate_questions_pipeline(passage, min_questions=5):
    input_text = f"generate questions: {passage}"
    results = qg_pipeline(input_text)
    questions = results[0]['generated_text'].split('<sep>')

    # ensure we have at least 3 questions
    questions = [q.strip() for q in questions if q.strip()]

    # if fewer than 3 questions, try to regenerate from smaller parts of the passage
    if len(questions) < min_questions:
        passage_sentences = passage.split('. ')
        for i in range(len(passage_sentences)):
            if len(questions) >= min_questions:
                break
            additional_input = ' '.join(passage_sentences[i:i+2])
            additional_results = qg_pipeline(f"generate questions: {additional_input}")
            additional_questions = additional_results[0]['generated_text'].split('<sep>')
            questions.extend([q.strip() for q in additional_questions if q.strip()])

    return questions[:min_questions]  # return only the top 3 questions

In [18]:
# generate questions from passages
for idx, passage in enumerate(passages):
    questions = generate_questions_pipeline(passage)
    print(f"Passage {idx+1}:\n{passage}\n")
    print("Generated Questions:")
    for q in questions:
        print(f"- {q}")
    print(f"\n{'-'*50}\n")

Passage 1:
GOOGLE TERMS OF SERVICE
Effective May 22, 2024 | Archived versions
What’s covered in these terms
We know it’s tempting to skip these Terms of
Service, but it’s important to establish what you
can expect from us as you use Google services,
and what we expect from you. These Terms of Service re ect the way Google’s business works, the laws that apply to
our company, and certain things we’ve always believed to be true. As a result, these Terms
of Service help de ne Google’s relationship with you as you interact with our services. For
example, these terms include the following topic headings:
What you can expect from us, which describes how we provide and develop our
services
What we expect from you, which establishes certain rules for using our services
Content in Google services, which describes the intellectual property rights to the
content you  nd in our services — whether that content belongs to you, Google, or
others
In case of problems or disagreements, which describes o

In [19]:
# load the QA pipeline
qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")

# function to track and answer only unique questions
def answer_unique_questions(passages, qa_pipeline):
    answered_questions = set()  # to store unique questions

    for idx, passage in enumerate(passages):
        questions = generate_questions_pipeline(passage)

        for question in questions:
            if question not in answered_questions:  # check if the question has already been answered
                answer = qa_pipeline({'question': question, 'context': passage})
                print(f"Q: {question}")
                print(f"A: {answer['answer']}\n")
                answered_questions.add(question)  # add the question to the set to avoid repetition
        print(f"{'='*50}\n")

answer_unique_questions(passages, qa_pipeline)

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/496M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/79.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]



Q: What is the meaning of the Terms of Service?
A: certain things we’ve always believed to be true

Q: What are the terms of service that govern how Google operates?
A: re ect the way Google’s business works, the laws

Q: What do these Terms of Service help define?
A: Google’s relationship with you as you interact with our services

Q: What are the terms of service that describe how Google provides and develops its services?
A: What you can expect from us

Q: What does the term "What you can expect from us" mean?
A: how we provide and develop our
services


Q: What is the name of the company that provides Google services?
A: Google LLC

Q: What is the name of the company that Google LLC provides services to?
A: 
Google LLC

Q: What are the terms of service for your child?
A: 
speci c additional terms and policies

Q: What is the age requirement for Google services?
A: service-
speci c additional terms and policies


Q: What are the terms that govern how Google earns money?
A: how Googl