In [1]:
import logging
# 25 sits between logging.INFO (20) and logging.WARNING (30): INFO messages are
# suppressed while WARNING and above still get through.
log_level = 25
logging.basicConfig(level=log_level) # root logger; has no effect if the root logger already has handlers


In [2]:
# Document-specific settings. Every path below is derived from `prefix`.
prefix = "forgotten"
class_name = "Forgotten"

# Inputs / final output.
# NOTE: despite its name, path_to_manual_as_csv_file points at a parquet file.
path_to_manual_as_csv_file = f"../inputs/documents/{prefix}.parquet"
index_file = f"../inputs/index/{prefix}.parquet"

# Work-in-progress CSV files for the summaries and the questions.
summary_file = f"../tmp/{prefix}_summary.csv"
question_file = f"../tmp/{prefix}_services_question.csv"



In [3]:
import re
import pandas as pd

import sys
# NOTE(review): machine-specific absolute path; the project packages imported
# below are only found this way where that directory exists.
sys.path.append('E:/Code/chat/gdpr')

import importlib
import gdpr_rag.documents.forgotten
# Reload the module so that edits to it are picked up without restarting the kernel.
importlib.reload(gdpr_rag.documents.forgotten)
from gdpr_rag.documents.forgotten import Forgotten

# Document object; later cells use its reference_checker and get_text().
doc = Forgotten(path_to_manual_as_csv_file)


In [4]:
from regulations_rag.regulation_table_of_content import StandardTableOfContent
# Build the table of contents from the same parquet file that was passed to Forgotten(),
# using the document's own reference checker.
reference_checker = doc.reference_checker
df = pd.read_parquet(path_to_manual_as_csv_file, engine='pyarrow')
toc = StandardTableOfContent(root_node_name = "forgotten", reference_checker = reference_checker, regulation_df = df)

In [5]:
# Show the section hierarchy held by the table of contents.
toc.print_tree()

forgotten []
|-- INTRODUCTION []
|-- 1 [THE GROUNDS OF THE RIGHT TO REQUEST DELISTING UNDER GDPR ]
|   |-- .1 [Ground 1: The Right to request delisting when the personal data are no longer necessary in relation to the search engine provider's processing (Article 17.1.a) ]
|   |-- .2 [Ground 2: The Right to request delisting when the data subject withdraws consent where the legal basis for the processing is pursuant to Article 6.1.a or Article 9.2.a GDPR and where there is no other legal basis for the processing (Article 17.1.b) ]
|   |-- .3 [Ground 3: The Right to request delisting when the data subject has exercised his or her Right to object to the processing of his or her personal data (Article 17.1.c) ]
|   |-- .4 [Ground 4: The Right to request delisting when the personal data have been unlawfully processed (Article 17.1.d) ]
|   |-- .5 [Ground 5: The Right to request delisting when the personal data have to be erased for ]
|   +-- .6 [Ground 6: The Right to request delisting when

In [10]:
from IPython.display import display
# Let wide DataFrames render without line wrapping or hidden columns.
pd.set_option('display.width', 1000)
pd.set_option('display.max_columns', None)

# NOTE(review): repeats the machine-specific sys.path entry already appended in an earlier cell.
sys.path.append('E:/Code/chat/gdpr')

from anytree import Node, PreOrderIter
from regulations_rag.embeddings import num_tokens_from_string

# Names of all table-of-contents nodes in pre-order; nodes with an empty name are skipped.
all_nodes = list(PreOrderIter(toc.root))
all_node_names = [node.full_node_name for node in all_nodes if node.full_node_name != ""]

# One record per section: its reference, its text and the token count of that text.
section_records = []
for node_name in all_node_names:
    node_text = doc.get_text(node_name)
    section_records.append(
        {
            "section_reference": node_name,
            "text": node_text,
            "token_count": num_tokens_from_string(node_text),
        }
    )

split_df = pd.DataFrame(section_records, columns = ["section_reference", "text", "token_count"])
split_df["token_count"] = split_df["token_count"].astype(int)

# Row 0 (introduction and summary) is left out of the index.
# Optional filter, currently disabled: keep only rows with token_count > 10.
split_df = split_df.drop([0]).reset_index(drop=True)

display(split_df)

Unnamed: 0,section_reference,text,token_count
0,1,# 1 THE GROUNDS OF THE RIGHT TO REQUEST DELIST...,496
1,1.1,# 1 THE GROUNDS OF THE RIGHT TO REQUEST DELIST...,457
2,1.2,# 1 THE GROUNDS OF THE RIGHT TO REQUEST DELIST...,549
3,1.3,# 1 THE GROUNDS OF THE RIGHT TO REQUEST DELIST...,1015
4,1.4,# 1 THE GROUNDS OF THE RIGHT TO REQUEST DELIST...,300
5,1.5,# 1 THE GROUNDS OF THE RIGHT TO REQUEST DELIST...,211
6,1.6,# 1 THE GROUNDS OF THE RIGHT TO REQUEST DELIST...,559
7,2,# 2 THE EXCEPTIONS TO THE RIGHT TO REQUEST DEL...,346
8,2.1,# 2 THE EXCEPTIONS TO THE RIGHT TO REQUEST DEL...,1395
9,2.2,# 2 THE EXCEPTIONS TO THE RIGHT TO REQUEST DEL...,124


In [11]:
import os
from regulations_rag.standard_regulation_index import load_parquet_data, save_parquet_data
# Key passed to load_parquet_data / save_parquet_data; os.getenv returns None when
# the environment variable is not set.
key = os.getenv('encryption_key_gdpr')
#index_df = load_parquet_data("../inputs/index_gdpr.parquet", key)

In [12]:
from openai import OpenAI
# The API key is read from the environment rather than written into the notebook.
openai_client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"),)

import sys
# NOTE(review): machine-specific absolute path, already appended in earlier cells.
sys.path.append('E:/Code/chat/gdpr')
import importlib
import src.summarise_and_question
# Reload so that edits to the helper module are picked up without restarting the kernel.
importlib.reload(src.summarise_and_question)
from src.summarise_and_question import get_summary_and_questions_for


In [13]:
def _load_or_init_progress(csv_path, source_label):
    """Return the work-in-progress frame stored at ``csv_path``, or a fresh one.

    If the file exists it is read back as a "|"-separated CSV. Otherwise a new
    frame is created with one row per section of ``split_df``, an empty
    ``text`` and ``embedding`` column, ``source`` set to ``source_label`` and
    ``document`` set to ``class_name``.
    """
    if os.path.exists(csv_path):
        return pd.read_csv(csv_path, sep="|", encoding = "utf-8")

    progress_df = pd.DataFrame([], columns = ["section_reference", "text", "source", "embedding", "document"])
    progress_df["section_reference"] = split_df["section_reference"]
    progress_df["source"] = source_label
    progress_df["document"] = class_name
    return progress_df


def _first_unfilled_row(progress_df):
    """Return the smallest index label whose ``text`` is missing, or 0 if none is missing."""
    first_empty = progress_df[progress_df['text'].isna()].index.min()
    # .min() of an empty index is NaN; the notebook then starts again from row 0.
    return 0 if pd.isna(first_empty) else first_empty


# `index` is the resume cursor used by the summarisation cells below.
df_summary = _load_or_init_progress(summary_file, "summary")
index = _first_unfilled_row(df_summary)

df_questions = _load_or_init_progress(question_file, "question")
first_empty_index = _first_unfilled_row(df_questions)

# Both files are filled in lock-step, so their first unfilled rows should agree.
if index != first_empty_index:
    print("The index in the summary and questions files do not match. This needs to be fixed")
else:
    print(f"Picking up at index {index}")


Picking up at index 0


In [54]:
model = "gpt-4o"

# Section at the resume cursor and its source text.
reference = df_summary.iloc[index]["section_reference"]
reg_text = doc.get_text(reference)

# Echo the source text between two marker lines so it can be compared with the model output.
print("##############", reg_text, "##############", sep="\n")

model_summary, model_questions = get_summary_and_questions_for(
    openai_client = openai_client,
    text = reg_text,
    model = model,
)

# Print the result as assignment statements, separated by blank lines, ready to be
# copied into the next cell and edited by hand.
print(
    f'# index: {index}, section_reference: {reference}',
    f'df_summary.loc[index, "text"] = "{model_summary}"',
    f'df_questions.loc[index, "text"] = "{model_questions}"',
    sep="\n\n",
)

##############
# 2 THE EXCEPTIONS TO THE RIGHT TO REQUEST DELISTING UNDER ARTICLE 17.3

## 2.5 Establishment, exercise or defence of legal claims

82. In principle, it is very unlikely that search engine providers can use this exemption to reject Article 17 GDPR delisting requests.

83. It must be further emphasised that a delisting request supposes the suppression of certain results from the search results page provided by the search engine provider when the name of a data subject is normally used as search criteria. The information remains accessible using other search terms.

(a) Union law; or

(b) Member State law to which the controller is subject (…)"
##############
# index: 14, section_reference: 2.5

df_summary.loc[index, "text"] = "82. Search engine providers are unlikely to use the exemption for the establishment, exercise, or defence of legal claims to reject delisting requests under Article 17.

83. A delisting request means that specific results are removed from the search

In [55]:
# Show the raw summary string with its escape sequences (e.g. \n) visible.
model_summary

"82. Search engine providers are unlikely to use the exemption for the establishment, exercise, or defence of legal claims to reject delisting requests under Article 17.\n\n83. A delisting request means that specific results are removed from the search results when the individual's name is used as the search term. The information is still accessible using other search terms.\n\n(a) Union law; or\n\n(b) Member State law to which you are subject."

In [56]:
# index: 14, section_reference: 2.5
# Hand-edited copy of the statements printed by the summarisation cell.
# NOTE(review): the trailing "(a) Union law; or ... (b) Member State law ..." sentences look like
# a stray fragment carried over from the source text rather than part of this section's
# summary — confirm before indexing.

df_summary.loc[index, "text"] = "Search engine providers are unlikely to use the exemption for the establishment, exercise, or defence of legal claims to reject delisting requests under Article 17.\n\nA delisting request means that specific results are removed from the search results when the individual's name is used as the search term. The information is still accessible using other search terms.\n\n(a) Union law; or\n\n(b) Member State law to which you are subject."

# Several questions for one section are stored in a single cell, separated by "|".
df_questions.loc[index, "text"] = "Could a search engine provider reject a request to remove listings based on the establishment, exercise, or defence of legal claims?|What happens to the information when a delisting request is granted?"


# Move the cursor to the next section. NOTE: re-running this cell advances it again.
index += 1
print(f"You have completed {(index / len(df_summary) * 100):.2f}% of the work")
from IPython.display import Markdown, display
if index >= len(df_summary):
    print("All done")
else:
    # Preview the text of the section that will be summarised next.
    print(f"The next section is:")
    display(Markdown(doc.get_text(df_summary.iloc[index]['section_reference'])))

You have completed 100.00% of the work
All done


In [62]:
from IPython.display import display

# Increase the display width and show every column (same options as set in an earlier cell).
pd.set_option('display.width', 1000)
pd.set_option('display.max_columns', None)

# Last expression of the cell, so the frame is rendered as the cell output.
df_questions

Unnamed: 0,section_reference,source,embedding,document,text
0,1,question,,Forgotten,What is the Right to request delisting?
1,1,question,,Forgotten,What is the Right to be forgotten?
2,1.1,question,,Forgotten,When can an individual request for their infor...
3,1.1,question,,Forgotten,What factors determine if personal data are no...
4,1.2,question,,Forgotten,When can you request delisting of your data?
5,1.2,question,,Forgotten,What happens if you withdraw consent for your ...
6,1.2,question,,Forgotten,How are search engine providers involved in th...
7,1.3,question,,Forgotten,What is the balance that needs to be struck wh...
8,1.3,question,,Forgotten,Who must demonstrate compelling legitimate gro...
9,1.3,question,,Forgotten,What factors are considered when assessing a d...


In [59]:
# Persist the work in progress as "|"-separated CSV; missing values are written as empty fields.
df_summary.to_csv(summary_file, sep = "|", encoding = "utf-8", index = False, na_rep="")

# The question text itself contains "|" separators; to_csv's default minimal quoting
# wraps such fields in quotes so they survive the round trip.
df_questions.to_csv(question_file, sep = "|", encoding = "utf-8", index = False, na_rep="")

In [60]:
import pandas as pd
# Read the saved work back with the same separator and encoding used when writing it.
df_summary = pd.read_csv(summary_file, sep = "|", encoding = "utf-8")

df_questions = pd.read_csv(question_file, sep = "|", encoding = "utf-8")

In [61]:
# Turn each "|"-separated list of questions into one row per question, keeping "text"
# as the last column. Series.explode is used instead of split(expand=True).stack():
# the latter depends on stack() discarding the NaN padding of shorter rows, which the
# newer pandas stack implementation no longer does.
df_questions = (
    df_questions
    .drop(columns="text")
    .join(df_questions["text"].str.split("|").explode())
)
# Drop empty pieces (e.g. from a trailing "|") and renumber the rows.
df_questions = df_questions[df_questions["text"] != ""].reset_index(drop=True)


In [63]:
# Stack summaries and questions into one frame with a fresh 0..n-1 index.
df_index = pd.concat([df_summary, df_questions], ignore_index = True)
df_index

Unnamed: 0,section_reference,text,source,embedding,document
0,1,Individuals have the right to request delistin...,summary,,Forgotten
1,1.1,An individual can request a search engine to r...,summary,,Forgotten
2,1.2,Individuals can request the erasure of persona...,summary,,Forgotten
3,1.3,An individual can request the removal of their...,summary,,Forgotten
4,1.4,An individual can request the erasure of their...,summary,,Forgotten
5,1.5,An individual can request a search engine prov...,summary,,Forgotten
6,1.6,An individual can request that a search engine...,summary,,Forgotten
7,2,"Processing can continue, overriding an individ...",summary,,Forgotten
8,2.1,The obligation to delete personal data without...,summary,,Forgotten
9,2.2,Processing data is exempt from delisting reque...,summary,,Forgotten


In [64]:
from regulations_rag.embeddings import get_ada_embedding
# Embedding configuration. NOTE: this rebinds `model`, which the summarisation cell
# sets to a chat model name.
model = "text-embedding-3-large"
dimensions = 1024

#df_index = df_questions
# Start from an all-missing object column so that each cell can hold a whole embedding.
df_index['embedding'] = pd.NA
df_index['embedding'] = df_index['embedding'].astype(object)

# Embed in chunks of `increment` rows so that progress is visible while the calls run.
increment = 10
for i in range(0, len(df_index), increment):
    chunk = df_index.iloc[i:i+increment].copy()
    chunk["embedding"] = chunk["text"].apply(lambda x: get_ada_embedding(openai_client, x, model, dimensions))
    df_index.loc[chunk.index, "embedding"] = chunk["embedding"]
    # The final chunk may be shorter than `increment`: report the rows actually done.
    print(f"Completed {min(i + increment, len(df_index))} lines")


Completed 10 lines
Completed 20 lines
Completed 30 lines
Completed 40 lines
Completed 50 lines


In [65]:
# Write the finished index once, through the project's save helper, as the other save
# cells in this notebook do. The former plain df_index.to_parquet(index_file) call wrote
# to the same path and was immediately overwritten by this call.
# NOTE(review): presumably save_parquet_data encrypts with `key` — confirm.
save_parquet_data(df_index, index_file, key)


In [66]:
import os
from regulations_rag.standard_regulation_index import load_parquet_data, save_parquet_data
key = os.getenv('encryption_key_gdpr')


gdpr_index = load_parquet_data("../inputs/index/gdpr.parquet", key)
forgotten_index = load_parquet_data("../inputs/index/forgotten.parquet", key)

# Print the text of every GDPR index entry whose section reference begins with "17"
# (a plain prefix match, so any reference starting with those characters is included).
# The loop variable is deliberately not called `index`: that name is the integer
# cursor used by the summarisation cells and must not be rebound here.
for entry_text in gdpr_index.loc[gdpr_index['section_reference'].str.startswith('17'), 'text']:
    print(f"-- {entry_text}")


# Alternative search, currently disabled: list every index text mentioning a keyword.
# for entry_text in gdpr_index['text']:
#     if 'location' in entry_text.lower():
#         print(f"* {entry_text}")

-- What is the right to erasure and when does it apply?
-- Under what conditions can an individual request the deletion of their personal data?
-- What obligations are there if personal data is made public and needs to be erased?
-- What are the exceptions to the right to erasure?
-- How does the right to erasure balance with the freedom of expression and information?In what scenarios is the right to erasure limited to protect legal claims?
-- Individuals have the right to request the deletion of their personal data from an entity (controller) holding it, without unnecessary delay, under certain conditions such as the data no longer being needed for its initial purpose, the withdrawal of consent, objection to processing, unlawful processing, compliance with legal obligations, or data collected from minors regarding information society services. Controllers are compelled to comply with this request promptly.
If the data has been made public, the controller must communicate the erasure r

In [7]:
# Cast the reference column to strings.
# NOTE(review): covid_index is not defined in any visible cell — it must be loaded before this runs.
covid_index['section_reference'] = covid_index['section_reference'].astype(str)

In [14]:
# Cast the reference column to strings, then save the index.
# NOTE(review): online_index is not defined in any visible cell.
online_index['section_reference'] = online_index['section_reference'].astype(str)
online_index  # not the last statement of the cell, so nothing is displayed
# NOTE(review): index_file is whatever the configuration cell last set; as configured in
# this notebook it is the "forgotten" index path — confirm this is the intended target.
save_parquet_data(online_index, index_file, key)

In [89]:
file = "../inputs/index/article_47_bcr.parquet"
#df_index.to_parquet(file, engine = "pyarrow")
# NOTE(review): index_df must already exist in the kernel; the visible cells bind it to
# article_47_index in cells that appear further down.
save_parquet_data(index_df, file, key)

In [54]:
from src.index_tools import update_text_in_index, add_to_index

# NOTE(review): article_47_index is not loaded in any visible cell.
index_df = article_47_index
#index_df = gdpr_index

# Text of the index entry to replace, and the text to replace it with.
text_to_change = "What points need to be addressed in the Binding Corporate Rules for Controllers (BCR-C)"
changed_text = "What points need to be addressed in the Binding Corporate Rules for controllers?"
# NOTE(review): `model` is "gpt-4o" after the summarisation cell and
# "text-embedding-3-large" after the embedding cell, so the value picked up here depends
# on which of those ran last — confirm it is the embedding model.
embedding_model = model
embedding_dimensions = dimensions

index_df = update_text_in_index(openai_client = openai_client, index_df = index_df, text_to_change = text_to_change, changed_text = changed_text, embedding_model = embedding_model, embedding_dimensions = embedding_dimensions)

Unnamed: 0,section_reference,text,source,embedding,document
0,1.1,You can use binding corporate rules (BCRs) for...,summary,"[-0.009783994406461716, -0.06066862493753433, ...",Article_47_BCR
1,1.5,Binding Corporate Rules for controllers (BCR-C...,summary,"[-0.017316676676273346, -0.021807707846164703,...",Article_47_BCR
2,1.6,You must comply with EU data protection legisl...,summary,"[-0.07271231710910797, -0.06814079731702805, -...",Article_47_BCR
3,1.7,You must use Binding Corporate Rules (BCR) as ...,summary,"[-0.017403494566679, -0.08751912415027618, -0....",Article_47_BCR
4,1.8,Binding Corporate Rules (BCRs) need approval f...,summary,"[-0.004610632546246052, -0.043068330734968185,...",Article_47_BCR
...,...,...,...,...,...
82,3.6,What agreements must be made regarding the han...,question,"[-0.04174955189228058, -0.0986546203494072, -0...",Article_47_BCR
83,3.7,What expectations are set for members of Bindi...,question,"[-0.04039411619305611, -0.08921048790216446, -...",Article_47_BCR
84,3.8,What obligations are there to keep Binding Cor...,question,"[-0.04588606581091881, -0.06260394304990768, -...",Article_47_BCR
85,3.9,What definitions must be included in the Bindi...,question,"[-0.010924087837338448, -0.03873256593942642, ...",Article_47_BCR


In [86]:
import importlib
import src.index_tools
# Reload so that edits to the helper module are picked up without restarting the kernel.
importlib.reload(src.index_tools)
from src.index_tools import add_to_index

# Details of the single entry to add to the Article 47 index.
index_df = article_47_index
text = "What happens if local laws conflict with the standards of binding corporate rules?"
section_reference = "3.5.4.1"
source = "question"
document = "Article_47_BCR"

# embedding_model and embedding_dimensions come from the cell that calls update_text_in_index.
index_df = add_to_index(openai_client, index_df, text, section_reference, source, document, embedding_model, embedding_dimensions)


In [84]:
from src.index_tools import remove_from_index

# Text identifying the entry to remove; the helper's result replaces article_47_index.
text_to_delete = "What happens if local laws conflict with the standards of binding corporate rules?"
article_47_index = remove_from_index(article_47_index, text_to_delete)

In [88]:
# Take over the edited frame, then show every entry for section 3.5.4.1 to check the change.
article_47_index = index_df
article_47_index[article_47_index["section_reference"] == "3.5.4.1"]

Unnamed: 0,section_reference,text,source,embedding,document
25,3.5.4.1,You must ensure that the Binding Corporate Rul...,summary,"[-0.018655141815543175, -0.05344023182988167, ...",Article_47_BCR
79,3.5.4.1,What should the Binding Corporate Rules-Contro...,question,"[-0.014255059882998466, -0.07780193537473679, ...",Article_47_BCR
87,3.5.4.1,What happens if local laws conflict with the s...,question,"[-0.02774774841964245, -0.06760360300540924, -...",Article_47_BCR


In [30]:
# NOTE(review): this writes whatever index_df currently holds to the GDPR index file. The
# visible cells bind index_df to article_47_index (the gdpr_index assignment is commented
# out), so confirm index_df really is the GDPR index before running.
save_parquet_data(index_df, "../inputs/index/gdpr.parquet", key)

In [31]:
# Show the entries whose document label is exactly "article_30_5" (the comparison is case-sensitive).
index_df[index_df['document'] == "article_30_5"]

Unnamed: 0,text,embedding,source,section_reference,document
147,Under what conditions could you be exempt from...,"[-0.03439036011695862, -0.0008122794097289443,...",question,all,article_30_5


In [9]:
import os
from regulations_rag.standard_regulation_index import load_parquet_data, save_parquet_data
key = os.getenv('encryption_key_gdpr')

path_to_file = "../inputs/index/article_30_5.parquet"
# NOTE: this rebinds `index`, which the summarise/question cells use as an integer row
# cursor; those cells need their state re-initialised after this one has run.
index = load_parquet_data(path_to_file, key)

In [10]:
# Disabled: would overwrite the document label of every row.
#index["document"] = "Article_30_5"
# NOTE(review): the displayed rows use source "questions", whereas the index built earlier
# in this notebook uses "question" — confirm which value the consumers expect.
index

Unnamed: 0,section_reference,text,source,embedding,document
0,all,"If you have fewer than 250 employees, you usua...",summary,"[-0.026057356968522072, 0.014008629135787487, ...",Article_30_5
1,all,What are the record-keeping exceptions for sma...,questions,"[0.007615841459482908, 0.02034611813724041, -0...",Article_30_5
2,all,When must smaller organisations maintain a rec...,questions,"[0.00046261242823675275, 0.006825227290391922,...",Article_30_5
3,all,What types of processing activities still requ...,questions,"[0.011152287945151329, 0.021851062774658203, -...",Article_30_5
4,all,Why is maintaining a record of processing acti...,questions,"[-0.011836612597107887, -0.011333354748785496,...",Article_30_5
5,all,What defines occasional processing?,questions,"[-0.0750560611486435, 0.04036270081996918, -0....",Article_30_5
6,all,What assistance is available for SMEs to manag...,questions,"[0.011198482476174831, 0.03537283092737198, -0...",Article_30_5
7,all,How does the principle of accountability relat...,questions,"[-0.045358121395111084, -0.04111029952764511, ...",Article_30_5


In [6]:
# Write the frame loaded from path_to_file back to the same file.
save_parquet_data(index, path_to_file, key)