# GDPR chatbot

In [1]:
%pip install transformers datasets torch --quiet
%pip install scikit-learn --quiet
%pip install PyPDF2 --quiet
%pip install numpy --quiet
%pip install faiss-cpu --quiet

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


## Collect datasets

In [52]:
import PyPDF2
with open('gdpr.pdf', 'rb') as file:
    reader = PyPDF2.PdfReader(file)
    text = ''
    for page in reader.pages:
        text += page.extract_text()

gdpr_text = text
with open('gdpr.txt', 'w') as output_file:
    output_file.write(text)

In [53]:
# Create dataset from the gdpr.txt

# 1. Chunk the text
def read_and_chunk_text(filename, chunk_size=100):
    with open(filename, 'r') as file:
        text = file.read()

    # Tokenize the text into words
    words = text.split()

    # Split words into chunks
    chunks = [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]

    return chunks

chunks = read_and_chunk_text('gdpr.txt')

In [82]:
# 2. tokenize and format it
dpr_dataset = [{"id": f"{i}", "text": chunk, "title": f"Passage {i}", "embeddings": None} for i, chunk in enumerate(chunks)]

from transformers import DPRContextEncoder, DPRContextEncoderTokenizer

dpr_tokenizer = DPRContextEncoderTokenizer.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base')
dpr_model = DPRContextEncoder.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base')

for passage in dpr_dataset:
    input_ids = dpr_tokenizer(passage["text"], return_tensors="pt")["input_ids"]
    embeddings = dpr_model(input_ids).pooler_output
    passage["embeddings"] = embeddings.detach().numpy()


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokenizer'.
Some weights of the model checkpoint at facebook/dpr-ctx_encoder-single-nq-base were not used when initializing DPRContextEncoder: ['ctx_encoder.bert_model.pooler.dense.weight', 'ctx_encoder.bert_model.pooler.dense.bias']
- This IS expected if you are initializing DPRContextEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRContextEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification mod

In [6]:
import pandas as pd

df = pd.DataFrame.from_records(dpr_dataset)
df.head(2)

Unnamed: 0,id,text,title,embeddings
0,0,I (Legislativ e acts) REGUL ATIONS REGUL ATION...,Passage 0,"[[0.2896709, -0.004823858, 0.16512825, -0.0188..."
1,1,After transmission of the draf t legislative a...,Passage 1,"[[0.473614, -0.1082333, 0.68263125, 0.02870157..."


In [7]:
df.to_json(f"dpr_dataset.jsonl", orient="records", lines=True)

In [8]:
dpr_dataset = load_dataset("json", data_files="dpr_dataset.jsonl", split="train")
dpr_dataset

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['id', 'text', 'title', 'embeddings'],
    num_rows: 620
})

In [88]:
dpr_dataset[0]["embeddings"].shape

(1, 768)

In [16]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [18]:
dpr_dataset.push_to_hub("gpdr-dpr-dataset")

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading metadata:   0%|          | 0.00/567 [00:00<?, ?B/s]

In [16]:
dpr_dataset.save_to_disk("gpdr-dpr-dataset.hf")

Saving the dataset (0/1 shards):   0%|          | 0/620 [00:00<?, ? examples/s]

In [98]:
import faiss
import numpy as np

# Prepare embeddings
embeddings = [passage["embeddings"][0] for passage in dpr_dataset]
embeddings = np.asarray(embeddings)
# ensure embeddings are float32
embeddings_converted = np.float32(embeddings)

In [99]:
type(embeddings[0][0])

numpy.float32

In [100]:
type(embeddings_converted[0][0])

numpy.float32

In [101]:
embeddings.shape

(620, 768)

In [102]:
embeddings_converted.shape

(620, 768)

In [103]:
# Create and train a FAISS index
index = faiss.IndexFlatIP(embeddings_converted.shape[1])
index.add(embeddings_converted)

In [108]:
def retrieve_top_k(query, tokenizer, model, index, k=5):
    """
    Retrieve top-k passages for a given query.
    
    Parameters:
    - query (str): The query string.
    - tokenizer: The DPR tokenizer.
    - model: The DPR model.
    - index: The FAISS index with stored embeddings.
    - k (int): Number of top results to retrieve.

    Returns:
    - List of indices of the top-k most similar passages.
    """

    # Tokenize and encode the query
    input_ids = tokenizer(query, return_tensors="pt")["input_ids"]
    query_embedding = model(input_ids).pooler_output

    # Convert the query embedding to numpy array
    query_embedding = query_embedding.detach().numpy()

    # Search the index
    D, I = index.search(query_embedding, k)
    return I[0]  # Return the indices of the top-k passages

In [111]:
query = "Can I save personal data without consent"
top_k_indices = retrieve_top_k(query, dpr_tokenizer, dpr_model, index, k=5)

# Get the passages for these indices (assuming you have the passages in a list or another format)
top_k_passages = [dpr_dataset[i]["text"] for i in top_k_indices]
for passage in top_k_passages:
    print("/////////")
    print(passage + "\n|")

/////////
a clear imbalance between the data subject and the controller , in particular where the controller is a public author ity and it is theref ore unlikely that consent was freely given in all the circumstances of that specifi c situation. Consent is presumed not to be freely given if it does not allow separate consent to be given to differ ent personal data processing operations despit e it being appropr iate in the individual case, or if the perf ormance of a contract, including the provision of a service, is dependent on the consent despite such consent not being
|
/////////
the data subject with direct access to his or her personal data. That right should not adversely affect the rights or freedoms of others, including trade secrets or intellectual proper ty and in particular the copyright protecting the software. However , the result of those considerations should not be a refusal to provide all information to the data subject. Where the controller processes a large quantity

In [129]:
class CustomFAISSRetriever:
    def __init__(self, dpr_tokenizer, dpr_model, faiss_index, dpr_dataset):
        self.dpr_tokenizer = dpr_tokenizer
        self.dpr_model = dpr_model
        self.faiss_index = faiss_index
        self.dpr_dataset = dpr_dataset

    # def retrieve(self, query, k=5):
    #     top_k_indices = retrieve_top_k(query, self.dpr_tokenizer, self.dpr_model, self.faiss_index, k=k)
    #     return top_k_indices

    def retrieve(self, query, k=5):
        top_k_indices = retrieve_top_k(query, self.dpr_tokenizer, self.dpr_model, self.faiss_index, k=k)
        top_k_passages = [self.dpr_dataset[i]["text"] for i in top_k_indices]
        return top_k_passages

custom_retriever = CustomFAISSRetriever(dpr_tokenizer, dpr_model, index, dpr_dataset)


def rag_answer_question(query, retriever, rag_model, rag_tokenizer):
    # Retrieve top-k relevant passages using the custom retriever
    retrieved_passages = retriever.retrieve(query)
    
    # Prepare list of questions (repeated) and the retrieved passages
    questions = [query] * len(retrieved_passages)

    # Use the RAG tokenizer to tokenize the questions and retrieved passages
    input_dict = rag_tokenizer.prepare_seq2seq_batch(
        src_texts=retrieved_passages,  # These are the passages
        tgt_texts=[query] * len(retrieved_passages),  # This is the question
        padding='longest',
        return_tensors="pt"
    )

    print(input_dict["input_ids"].shape)
    print(input_dict["attention_mask"].shape)

    # Use the BART's bos_token_id for decoder_start_token_id
    bos_token_id = rag_model.config.generator.bos_token_id
    
    # Generate the answer using the RAG model
    generated = rag_model.generate(input_ids=input_dict["input_ids"], attention_mask=input_dict["attention_mask"], decoder_start_token_id=bos_token_id)
    answer = rag_tokenizer.decode(generated[0], skip_special_tokens=True)

    return answer


def _rag_answer_question(query, retriever, rag_model, rag_tokenizer):
    # Retrieve top-k relevant passage indices using the custom retriever
    retrieved_indices = retriever.retrieve(query)

    # Tokenize the question
    input_dict = rag_tokenizer.prepare_seq2seq_batch(
        src_texts=retrieved_indices,  # These are the passages
        tgt_texts=[query] * len(retrieved_indices),  # This is the question
        padding='longest',
        return_tensors="pt"
    )


    # Generate the answer using the RAG model
    # Pass the document indices with the key 'retrieved_doc_ids'
    generated = rag_model.generate(
        input_ids=input_dict["input_ids"], 
        attention_mask=input_dict["attention_mask"],
        retrieved_doc_ids=torch.tensor([retrieved_indices]).long(),  # The indices should be a tensor of shape (batch_size, num_docs)
        decoder_start_token_id=rag_model.config.generator.bos_token_id
    )
    answer = rag_tokenizer.decode(generated[0], skip_special_tokens=True)

    return answer

In [130]:
question = "What are the rules for local storage and storing user data?"
answer = rag_answer_question(question, custom_retriever, rag_model, rag_tokenizer)
print(answer)

torch.Size([5, 130])
torch.Size([5, 130])




RuntimeError: batch2 must be a 3D tensor

In [11]:
# Saving the index
faiss.write_index(index, "gdpr_index.faiss")

In [104]:
from transformers import RagTokenizer, RagRetriever, RagTokenForGeneration, TrainingArguments, Trainer

# 1. Initialize tokenizer and model
rag_tokenizer = RagTokenizer.from_pretrained("facebook/rag-token-nq")
rag_model = RagTokenForGeneration.from_pretrained("facebook/rag-token-nq")

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizerFast'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'BartTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called fr

Some weights of the model checkpoint at facebook/rag-token-nq were not used when initializing RagTokenForGeneration: ['rag.question_encoder.question_encoder.bert_model.pooler.dense.bias', 'rag.question_encoder.question_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing RagTokenForGeneration from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RagTokenForGeneration from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [105]:
# 2. Create retriever (this assumes you've created a custom dataset with the GDPR text)
dataset_path = "gpdr-dpr-dataset.hf"  # dataset saved via *dataset.save_to_disk(...)*
index_path = "gdpr_index.faiss"  # faiss index saved via *dataset.get_index("embeddings").save(...)*

rag_retriever = RagRetriever.from_pretrained(
    "facebook/rag-token-nq",
    index_name="custom",
    passages_path=dataset_path,
    index_path=index_path,
)
rag_model.set_retriever(rag_retriever)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizerFast'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'BartTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called fr

In [106]:
question = "Do I need consent to store personalised data in localstorage?"
input_dict = tokenizer(question, return_tensors="pt")
input_ids = input_dict["input_ids"]
attention_mask = input_dict["attention_mask"]
generated = rag_model.generate(input_ids=input_ids, attention_mask=attention_mask)
answer = rag_tokenizer.decode(generated[0], skip_special_tokens=True)[0]
logger.info("Q: " + question)
logger.info("A: " + answer)

RuntimeError: batch2 must be a 3D tensor

In [107]:
print("Input IDs Shape:", input_ids.shape)
print("Attention Mask Shape:", attention_mask.shape)

Input IDs Shape: torch.Size([1, 16])
Attention Mask Shape: torch.Size([1, 16])


In [2]:
from datasets import load_dataset

dataset = load_dataset("lislia/gdpr_train")

In [15]:
# 3. Prepare training data

from typing import Dict, Union
import torch

def tokenize_function(example: Dict[str, Union[str, int]]) -> Dict[str, torch.Tensor]:
    """Tokenizes a single example using a pre-trained tokenizer.

    Args:
        example: The example containing a sentence to tokenize.

    Returns:
        A dictionary containing tokenized input_ids and attention_mask, both as PyTorch tensors.
    """
    tokenized_example = rag_tokenizer(
        example["question"],
        padding=True,
        truncation=True,
        return_tensors="pt"
    )
    return tokenized_example

dataset_split = dataset["train"].train_test_split(test_size=0.3, shuffle=True)
train_tokenized_datasets = dataset_split["train"].map(tokenize_function, batched=True)
test_tokenized_datasets = dataset_split["test"].map(tokenize_function, batched=True)

Map:   0%|          | 0/399 [00:00<?, ? examples/s]

Map:   0%|          | 0/171 [00:00<?, ? examples/s]

In [22]:
# 4. Set training arguments and train


training_args = TrainingArguments(
    output_dir="gpdr-qa",
    per_device_train_batch_size=8,
    num_train_epochs=3,
    evaluation_strategy="steps",
    eval_steps=100,
    logging_dir='./logs',
    logging_steps=10,
)

trainer = Trainer(
    model=rag_model,
    args=training_args,
    train_dataset=train_tokenized_datasets,
)

trainer.train()


  0%|          | 0/150 [00:00<?, ?it/s]

RuntimeError: batch2 must be a 3D tensor

## Fine tune an existing BERT model on our annontated data

In [49]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("nlpaueb/legal-bert-base-uncased")

def encode(example):
    return tokenizer(example['context'], example['question'], truncation=True, padding='max_length', max_length=512)

dataset_split = dataset["train"].train_test_split(test_size=0.3, shuffle=True)
train_tokenized_datasets = dataset_split["train"].map(tokenize_function, batched=True)
test_tokenized_datasets = dataset_split["test"].map(tokenize_function, batched=True)

Downloading (…)okenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/222k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Map:   0%|          | 0/399 [00:00<?, ? examples/s]

Map:   0%|          | 0/171 [00:00<?, ? examples/s]

In [51]:
from transformers import AutoModel, Trainer, TrainingArguments, DataCollatorWithPadding

model = AutoModel.from_pretrained("nlpaueb/legal-bert-base-uncased")

training_args = TrainingArguments(
    output_dir="gpdr-qa",
    per_device_train_batch_size=8,
    num_train_epochs=3,
    logging_dir='./logs',
    logging_steps=10,
    do_train=True,
    evaluation_strategy="steps",
    save_steps=10,
    save_total_limit=2,
    learning_rate=3e-5,
    remove_unused_columns=False
)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized_datasets,
    data_collator=data_collator
)

trainer.train()

  0%|          | 0/150 [00:00<?, ?it/s]

ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`id` in this case) have excessive nesting (inputs type `list` where type `int` is expected).