# Question 3

# Model-1 gtp-2 & all-MiniLM-L6-v2

In [2]:
pip install PyPDF2

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/232.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m225.3/232.6 kB[0m [31m8.0 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [None]:
# Required Libraries
import PyPDF2
from sentence_transformers import SentenceTransformer
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import re
import torch

Chunking the Google Privacy Policy Document

In [None]:
# Extract text from the PDF
def extract_text_from_pdf(file_path):
    with open(file_path, 'rb') as f:
        reader = PyPDF2.PdfReader(f)
        text = ""
        for page in reader.pages:
            text += page.extract_text()
    return text

# Clean and preprocess the text
def clean_and_split_text(text):
    """
    Cleans and splits text into individual sentences for use as a corpus.
    """
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)
    text = re.sub(r'[^\w\s.,!?;]', '', text)
    sentences = re.split(r'(?<=[.!?]) +', text)
    return [sentence.strip() for sentence in sentences if sentence]

# Build the corpus
file_path = "google_privacy_policy_en.pdf"
raw_text = extract_text_from_pdf(file_path)
corpus = clean_and_split_text(raw_text)

Creating embedding using all-MiniLM-L6-v2

In [None]:
# Create embeddings for the corpus
retriever_model = SentenceTransformer('all-MiniLM-L6-v2')
corpus_embeddings = retriever_model.encode(corpus, convert_to_tensor=True)



Loading gpt2 and padding the tokens to resize it

In [None]:
# Load GPT-2 for answer generation
generator_model = GPT2LMHeadModel.from_pretrained("gpt2")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Add padding token and resize GPT-2 model
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
generator_model.resize_token_embeddings(len(tokenizer))
tokenizer.pad_token_id = tokenizer.eos_token_id

Question and answer generation using RAG

In [None]:
# Utility functions for retrieval and generation
def generate_response_with_rag(question, top_k=5, temperature=1.0):
    # Retrieve top-k relevant sentences
    question_embedding = retriever_model.encode(question, convert_to_tensor=True)
    similarities = cosine_similarity(
        question_embedding.cpu().numpy().reshape(1, -1),
        corpus_embeddings.cpu().numpy()
    )
    top_indices = np.argsort(similarities[0])[-top_k:][::-1]
    retrieved_sentences = [corpus[idx] for idx in top_indices]

    # Combine the question with the retrieved context
    combined_input = f"Question: {question} Context: {' '.join(retrieved_sentences)}"

    # Tokenize input
    inputs = tokenizer(
        combined_input,
        return_tensors="pt",
        truncation=True,
        max_length=1024,
        padding=True
    )

    # Generate response
    outputs = generator_model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=500,
        num_beams=4,
        temperature=temperature,
        early_stopping=True,
        pad_token_id=tokenizer.pad_token_id
    )

    return tokenizer.decode(outputs[0], skip_special_tokens=True), retrieved_sentences, combined_input


Question and answer generation using Non-RAG

In [None]:
def generate_response_without_rag(question, temperature=1.0):
    # Tokenize input question
    inputs = tokenizer(
        question,
        return_tensors="pt",
        truncation=True,
        max_length=1024,
        padding=True
    )

    # Generate response
    outputs = generator_model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=50,
        num_beams=4,
        temperature=temperature,
        early_stopping=True,
        pad_token_id=tokenizer.pad_token_id
    )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [None]:
# Define and process questions
questions = [
    "What data does Google collect from users?",
    "When does Google share user data externally?",
    "What privacy controls are available to users?",
    "How does Google manage cookies and tracking?"
]

print("\n--- Question-Answer Results ---\n")

In [None]:

for idx, question in enumerate(questions, 1):
    print(f"\nQuestion {idx}: {question}")
    print("=" * 50)

    # Generate Non-RAG response
    print("\n[Non-RAG Response]")
    response_without_rag = generate_response_without_rag(question, temperature=1.0)
    print(response_without_rag)

    # Generate RAG response
    response_with_rag, retrieved_sentences, combined_input = generate_response_with_rag(question, top_k=3, temperature=1.0)

    # Display RAG Retrieved Context
    print("\n[RAG Retrieved Context]")
    for i, sentence in enumerate(retrieved_sentences, 1):
        print(f"{i}. {sentence}")

    # Display RAG response
    print("\n[RAG Generated Response]")
    print(response_with_rag)
    print("\n" + "=" * 50)



--- Question-Answer Results ---


Question 1: What data does Google collect from users?

[Non-RAG Response]
What data does Google collect from users?

Google collects information about you, your habits, and how you interact with other people. It also collects information about you, your habits, and how you interact with other people. It also collects information about you, your habits, and how you

[RAG Retrieved Context]
1. collect information This includes information like your usage data and preferences, Gmail messages, G profile, photos, videos, browsing history, map searches, docs, or other Googlehosted content.
2. Information we collect when you are signed in to Google, in addition to information we obtain about you from partners, may be associated with your Google Account.
3. These products share information about your activity with Google and, depending on your account settings and the products in use for instance, when a partner uses Google Analytics in conjunction with our a

1. Source of Information
    1. RAG Output: Combines retrieved context from external sources (like a knowledge base or a document set) with the generative capabilities of GPT The retrieved context provides specific, relevant, and factual information, improving the quality and grounding of the response.
Example: The RAG response includes explicit details, like how Google collects "usage data, Gmail messages, browsing history," and data from "partners using Google Analytics."
    2. Non-RAG Output: Relies solely on the pre-trained language model's inherent knowledge without external retrieval. This can lead to generic or repetitive content, as seen in the example where it redundantly mentions "information about you, your habits."
2. Relevance and Specificity
    1. RAG Output: Tends to be more specific and contextually relevant because it uses real-time retrieved context to construct responses. Example: It discusses account association and partner data sharing, adding depth to the explanation.
    2. Non-RAG Output: Can be vague, repetitive, or miss key details due to the lack of retrieved context.
3. Accuracy
    1. RAG Output: Can enhance factual accuracy by grounding responses in retrieved information. This reduces hallucinations (incorrect or fabricated facts) commonly seen in generative models.
    2. Non-RAG Output: Relies entirely on the model's trained knowledge, which might be outdated or incomplete, potentially leading to less accurate responses.
4. Coherence and Redundancy
    1. RAG Output: Often more coherent and concise because the retrieved content provides a structured foundation for the generated text.
    2. Non-RAG Output: May exhibit redundancy or verbose phrasing, as seen in the repetitive mention of "you, your habits, and how you interact."

Example of Differences
Given the query:

1. Non-RAG Output:
    "Google collects information about you, your habits, and how you interact with other people. It also collects information about you, your habits, and how you interact with other people..."
    Repetitive, lacks specificity, and seems like a generic answer.
2. RAG Output:
"Google collects data such as your usage information, Gmail messages, profile information, browsing history, map searches, and data from Google Analytics, which may be associated with your Google Account depending on your settings."

Specific, factually grounded, and directly addresses the query.


1. RAG enhances specificity, relevance, and accuracy by integrating external context into the response generation process.
2. Non-RAG relies solely on pre-trained knowledge, which can lead to generic or less relevant answers.

# Model-2 GPT-Neo & all-MiniLM-L6-v2

In [None]:
import PyPDF2
from sentence_transformers import SentenceTransformer
from transformers import AutoModelForCausalLM, AutoTokenizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import re
import torch

Chunking the Google Privacy Policy Document

In [None]:
# Extract text from the PDF
def extract_text_from_pdf(file_path):
    with open(file_path, 'rb') as f:
        reader = PyPDF2.PdfReader(f)
        text = ""
        for page in reader.pages:
            text += page.extract_text()
    return text

# Clean and preprocess the text
def clean_and_split_text(text):
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)
    text = re.sub(r'[^\w\s.,!?;]', '', text)
    sentences = re.split(r'(?<=[.!?]) +', text)
    return [sentence.strip() for sentence in sentences if sentence]

In [None]:
# Build the corpus
file_path = "google_privacy_policy_en.pdf"
raw_text = extract_text_from_pdf(file_path)
corpus = clean_and_split_text(raw_text)

Creating embedding using all-MiniLM-L6-v2

In [None]:
# Create embeddings for the corpus
retriever_model = SentenceTransformer('all-MiniLM-L6-v2')
corpus_embeddings = retriever_model.encode(corpus, convert_to_tensor=True)

Loading gpt-neo-1.3B and padding the tokens to resize it

In [None]:
# Load the GPT-Neo model and tokenizer
model_name = "EleutherAI/gpt-neo-1.3B" 
generator_model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", torch_dtype=torch.float16)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Set a padding token if it doesn't exist
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})

# Resize the model embeddings to include the new padding token if added
generator_model.resize_token_embeddings(len(tokenizer))

Question and answer generation using RAG

In [None]:
# Utility functions for retrieval and generation
def generate_response_with_rag(question, top_k=5, temperature=1.5):
    # Retrieve top-k relevant sentences
    question_embedding = retriever_model.encode(question, convert_to_tensor=True)
    similarities = cosine_similarity(
        question_embedding.cpu().numpy().reshape(1, -1),
        corpus_embeddings.cpu().numpy()
    )
    top_indices = np.argsort(similarities[0])[-top_k:][::-1]
    retrieved_sentences = [corpus[idx] for idx in top_indices[:3]]  # Limit to top 3 sentences

    # Combine the question with the retrieved context
    combined_input = f"Question: {question} Context: {' '.join(retrieved_sentences)}"

    # Tokenize input
    inputs = tokenizer(
        combined_input,
        return_tensors="pt",
        truncation=True,
        max_length=1024,
        padding=True
    )

    # Generate response
    outputs = generator_model.generate(
        inputs["input_ids"].to("cuda"),
        attention_mask=inputs["attention_mask"].to("cuda"),
        max_new_tokens=200,
        num_beams=4,
        temperature=temperature,
        early_stopping=True,
        repetition_penalty=1.2  # Penalize repetitive tokens
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True), retrieved_sentences, combined_input

Question and answer generation using Non-RAG

In [None]:
def generate_response_without_rag(question, temperature=1.5):
    # Tokenize input question
    inputs = tokenizer(
        question,
        return_tensors="pt",
        truncation=True,
        max_length=1024,
        padding=True
    )

    # Generate response
    outputs = generator_model.generate(
        inputs["input_ids"].to("cuda"),
        attention_mask=inputs["attention_mask"].to("cuda"),
        max_new_tokens=200,
        num_beams=4,
        temperature=temperature,
        early_stopping=True,
        repetition_penalty=1.2  # Penalize repetitive tokens
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [None]:
# Define and process questions
questions = [
    "What data does Google collect from users?",
    "When does Google share user data externally?",
    "What privacy controls are available to users?",
    "How does Google manage cookies and tracking?"
]

print("\n--- Question-Answer Results ---\n")

In [6]:
for idx, question in enumerate(questions, 1):
    print(f"\nQuestion {idx}: {question}")
    print("=" * 50)

    # Generate Non-RAG response
    print("\n[Non-RAG Response]")
    response_without_rag = generate_response_without_rag(question, temperature=1.5)
    print(response_without_rag)

    # Generate RAG response
    response_with_rag, retrieved_sentences, combined_input = generate_response_with_rag(question, top_k=3, temperature=1.5)

    # Display RAG Retrieved Context
    print("\n[RAG Retrieved Context]")
    for i, sentence in enumerate(retrieved_sentences, 1):
        print(f"{i}. {sentence}")

    # Display RAG response
    print("\n[RAG Generated Response]")
    print(response_with_rag)
    print("\n" + "=" * 50)


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.



--- Question-Answer Results ---


Question 1: What data does Google collect from users?

[Non-RAG Response]


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


What data does Google collect from users?

Google collects a lot of data from its users. This data can be used to improve the user experience, and it can also be used to improve Google’s products and services.

Google collects a lot of data from its users. This data can be used to improve the user experience, and it can also be used to improve Google’s products and services.

Google collects a lot of data from its users. This data can be used to improve the user experience, and it can also be used to improve Google’s products and services.

Google collects a lot of data from its users. This data can be used to improve the user experience, and it can also be used to improve Google’s products and services.

Google collects a lot of data from its users. This data can be used to improve the user experience, and it can also be used to improve Google’s products and services.

Google collects a


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.



[RAG Retrieved Context]
1. collect information This includes information like your usage data and preferences, Gmail messages, G profile, photos, videos, browsing history, map searches, docs, or other Googlehosted content.
2. Information we collect when you are signed in to Google, in addition to information we obtain about you from partners, may be associated with your Google Account.
3. These products share information about your activity with Google and, depending on your account settings and the products in use for instance, when a partner uses Google Analytics in conjunction with our advertising services, this data may be associated with your personal information.

[RAG Generated Response]
Question: What data does Google collect from users? Context: collect information This includes information like your usage data and preferences, Gmail messages, G profile, photos, videos, browsing history, map searches, docs, or other Googlehosted content. Information we collect when you are si

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


When does Google share user data externally?

Google has a long history of sharing user data with third parties, but it’s only recently that the company has been transparent about how it does it.

In the past, Google has shared user data with third parties in order to improve its products and services. For example, it has shared user data with third parties in order to improve its search results. It has also shared user data with third parties in order to improve its ads.

Google has also shared user data with third parties in order to improve its products and services. For example, it has shared user data with third parties in order to improve its search results. It has also shared user data with third parties in order to improve its ads.

Google has also shared user data with third parties in order to improve its products and services. For example, it has shared user data with third parties in order to improve its search results. It has also shared user data with third parties in ord

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.



[RAG Retrieved Context]
1. Information we share We do not share personal information with companies, organizations and individuals outside of Google unless one of the following circumstances applies With your consent We will share personal information with companies, organizations or individuals outside of Google when we have your consent to do so.
2. For legal reasons We will share personal information with companies, organizations or individuals outside of Google if we have a good faith belief that access, use, preservation or disclosure of the information is reasonably necessary to meet any applicable law, regulation, legal process or enforceable governmental request .
3. These products share information about your activity with Google and, depending on your account settings and the products in use for instance, when a partner uses Google Analytics in conjunction with our advertising services, this data may be associated with your personal information.

[RAG Generated Response]
Que

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


What privacy controls are available to users?

A:

I'm not sure what you mean by "privacy controls", but here are some of the privacy controls that are available to users:

Privacy settings can be found in the Privacy tab of the Settings app.
Privacy settings can be found in the Privacy tab of the Settings app.
Privacy settings can be found in the Privacy tab of the Settings app.
Privacy settings can be found in the Privacy tab of the Settings app.
Privacy settings can be found in the Privacy tab of the Settings app.
Privacy settings can be found in the Privacy tab of the Settings app.




Setting `pad_token_id` to `eos_token_id`:None for open-end generation.



[RAG Retrieved Context]
1. As you use our services, we want you to be clear how we re using information and the ways in which you can protect your privacy.
2. restrict your ability to delete or edit information or privacy settings.
3. We also adhere to several self regulatory frameworks , including the EUUS and SwissUS Privacy Shield Frameworks.

[RAG Generated Response]
Question: What privacy controls are available to users? Context: As you use our services, we want you to be clear how we re using information and the ways in which you can protect your privacy. restrict your ability to delete or edit information or privacy settings. We also adhere to several self regulatory frameworks , including the EUUS and SwissUS Privacy Shield Frameworks.

Question: How do I know what information is being collected? Context: We collect information about you, such as your name, address, phone number, email address, and other contact information. We also collect information about your use of our se

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


How does Google manage cookies and tracking?

Google’s privacy policy states: “We do not sell or rent your personal information to third parties for marketing purposes.”

However, it’s worth noting that Google’s privacy policy also states: “We may share your personal information with third parties for the purposes described in this Privacy Policy.”

What is Google Analytics?

Google Analytics is a web analytics service provided by Google Inc. (“Google”). Google Analytics uses “cookies”, which are text files placed on your computer, to help the website analyze how users use the site. The information generated by the cookie about your use of the website (including your IP address) will be transmitted to and stored by Google on servers in the United States. Google will use this information for the purpose of evaluating your use of the website, compiling reports on website activity for website operators and providing other services relating to website activity and internet usage. Google

[

Non-RAG Response
1. Content:

    Highly repetitive and verbose, with no substantial details about what data is collected.
Focuses on general statements, such as "Google collects a lot of data from its users," and "improves user experience," without specificity.

2. Issues:

    Redundancy: Repeating the same statement multiple times reduces the effectiveness and clarity of the response.
Lack of context or depth: Does not specify the types of data collected (e.g., browsing history, Gmail messages).

RAG Retrieved Context
1. Content:

    Provides detailed information about the types of data Google collects, including "usage data, Gmail messages, browsing history, map searches, and other Google-hosted content."
Explains how this data is associated with Google Accounts and highlights partner-based data collection (e.g., Google Analytics).
Mentions privacy policy links for further information.

2. Advantages:

    Specificity: Details like "map searches, videos, and photos" make the response concrete and informative. Relevance: Context retrieval ensures alignment with the user's query.Credibility: References privacy policy URLs for further reading.