In [1]:
import logging
# 25 lies between logging.INFO (20) and logging.WARNING (30): INFO and DEBUG
# records are filtered out, WARNING and above still reach the root logger.
log_level = 25
logging.basicConfig(level=log_level) # root logger


In [9]:
# Identity of the document being indexed: the wrapper class name and the
# file-name stem shared by all of its files.
prefix = "covid_location"
class_name = "CovidLocation"

# Inputs and final index, both parquet. Despite the "_csv_" in its name,
# path_to_manual_as_csv_file points at a parquet file.
path_to_manual_as_csv_file = f"../inputs/documents/{prefix}.parquet"
index_file = f"../inputs/index/{prefix}.parquet"

# Work-in-progress CSVs for the per-section summaries and questions.
summary_file = f"../tmp/{prefix}_summary.csv"
question_file = f"../tmp/{prefix}_services_question.csv"



In [3]:
import re
import pandas as pd

import sys

# Make the local project checkout importable. Guarded so that re-running this
# cell does not keep appending the same entry to sys.path.
project_root = 'E:/Code/chat/gdpr'
if project_root not in sys.path:
    sys.path.append(project_root)

import importlib
import gdpr_rag.documents.covid_location
# Reload so edits to the module on disk take effect without a kernel restart.
importlib.reload(gdpr_rag.documents.covid_location)
from gdpr_rag.documents.covid_location import CovidLocation

# Document wrapper built from the configured parquet file; later cells use its
# get_text() method and reference_checker attribute.
doc = CovidLocation(path_to_manual_as_csv_file)



In [4]:
from regulations_rag.regulation_table_of_content import StandardTableOfContent

# Read the document rows and build a table-of-contents tree over them, using
# the reference checker exposed by the document object.
df = pd.read_parquet(path_to_manual_as_csv_file, engine='pyarrow')
reference_checker = doc.reference_checker
toc = StandardTableOfContent(
    root_node_name="transparency",
    reference_checker=reference_checker,
    regulation_df=df,
)

In [5]:
# Print the table-of-contents tree built in the previous cell.
toc.print_tree()

transparency []
|-- 1 [INTRODUCTION & CONTEXT ]
|-- 2 [USE OF LOCATION DATA ]
|   |-- .1 [Sources of location data ]
|   +-- .2 [Focus on the use of anonymised location data ]
|-- 3 [CONTACT TRACING APPLICATIONS ]
|   |-- .1 [General legal analysis ]
|   +-- .2 [Recommendations and functional requirements ]
|-- 4 [CONCLUSION ]
+-- Annex [CONTACT TRACING APPLICATIONS ANALYSIS GUIDE ]


In [8]:
# Make the local project checkout importable (guarded so re-runs add no duplicates).
project_root = 'E:/Code/chat/gdpr'
if project_root not in sys.path:
    sys.path.append(project_root)

from anytree import Node, PreOrderIter
from regulations_rag.embeddings import num_tokens_from_string

# Walk the table-of-contents tree in pre-order and keep every node whose full
# name is non-empty.
all_nodes = list(PreOrderIter(toc.root))
all_node_names = [node.full_node_name for node in all_nodes if node.full_node_name != ""]

# One row per section reference. Text and token count are filled column-wise
# instead of cell-by-cell through iterrows(), which also works for an empty table.
split_df = pd.DataFrame(all_node_names, columns = ["section_reference"])
split_df["text"] = split_df["section_reference"].map(doc.get_text)
split_df["token_count"] = split_df["text"].map(num_tokens_from_string).astype(int)

# NOTE: these rows are dropped by position, so the list has to be revisited
# whenever the table of contents changes.
split_df = split_df.drop([0, 7, 8]) # Introduction and summary
# Discard entries of 10 tokens or fewer.
split_df = split_df[split_df["token_count"] > 10]
split_df = split_df.reset_index(drop=True)
split_df

Unnamed: 0,section_reference,text,token_count
0,2.1,2 USE OF LOCATION DATA \n\n2.1 Sources of loca...,628
1,2.2,2 USE OF LOCATION DATA \n\n2.2 Focus on the us...,732
2,3.1,3 CONTACT TRACING APPLICATIONS \n\n3.1 General...,1674
3,3.2,3 CONTACT TRACING APPLICATIONS \n\n3.2 Recomme...,588
4,4.0,4 CONCLUSION \n48. The world is facing a signi...,186


In [3]:
import os
from regulations_rag.standard_regulation_index import load_parquet_data, save_parquet_data
# Key passed to load_parquet_data / save_parquet_data in later cells.
# os.getenv returns None when the environment variable is not set.
key = os.getenv('encryption_key_gdpr')
#index_df = load_parquet_data("../inputs/index_gdpr.parquet", key)

In [10]:
from openai import OpenAI
# The API key is read from the environment rather than written into the notebook.
openai_client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"),)

import sys
# Make the local project checkout importable. Guarded so that re-running this
# cell does not keep appending the same entry to sys.path.
project_root = 'E:/Code/chat/gdpr'
if project_root not in sys.path:
    sys.path.append(project_root)
import importlib
import src.summarise_and_question
# Reload so edits to the helper module take effect without a kernel restart.
importlib.reload(src.summarise_and_question)
from src.summarise_and_question import get_summary_and_questions_for


In [11]:
def _load_or_start_progress(csv_path, source_label):
    """Return the work-in-progress table stored at ``csv_path``.

    If the file exists it is read back with ``section_reference`` kept as
    text, so references such as "4" or "3.10" are not converted to floats by
    type inference. Otherwise a new table is started with one row per section
    in ``split_df``, ``text`` left empty, ``source`` set to ``source_label``
    and ``document`` set to ``class_name``.
    """
    if os.path.exists(csv_path):
        return pd.read_csv(csv_path, sep="|", encoding = "utf-8", dtype = {"section_reference": str})

    progress_df = pd.DataFrame([], columns = ["section_reference", "text", "source", "embedding", "document"])
    progress_df["section_reference"] = split_df["section_reference"]
    progress_df["source"] = source_label
    progress_df["document"] = class_name
    return progress_df


def _first_row_without_text(progress_df):
    """Return the index label of the first row whose ``text`` is missing, or 0 if there is none."""
    first_empty = progress_df[progress_df['text'].isna()].index.min()
    # index.min() is NaN when no row is missing text; fall back to 0 in that case.
    return 0 if pd.isna(first_empty) else first_empty


df_summary = _load_or_start_progress(summary_file, "summary")
df_questions = _load_or_start_progress(question_file, "question")

# `index` is the row the following cells work on; it is taken from the summary
# table and cross-checked against the questions table.
index = _first_row_without_text(df_summary)
first_empty_index = _first_row_without_text(df_questions)

if index != first_empty_index:
    print("The index in the summary and questions files do not match. This needs to be fixed")
else:
    print(f"Picking up at index {index}")


Picking up at index 0


In [22]:
# Chat model used to draft the summary and questions. Note that the embedding
# cell further down rebinds the same name `model`.
model = "gpt-4o"

reference = df_summary.iloc[index]["section_reference"]

# Show the section text so the draft can be reviewed against it.
reg_text = doc.get_text(reference)
print("##############")
print(reg_text)
print("##############")

model_summary, model_questions = get_summary_and_questions_for(openai_client = openai_client, text = reg_text, model = model)

#format output
# The two assignments are printed as ready-to-paste statements. !r emits a
# valid Python literal even when the model text contains quotes or newlines.
print(f'# index: {index}, section_reference: {reference}')
print()
print(f'df_summary.loc[index, "text"] = {model_summary!r}')
print()
print(f'df_questions.loc[index, "text"] = {model_questions!r}')

##############
3 CONTACT TRACING APPLICATIONS 

3.2 Recommendations and functional requirements 
40. According to the principle of data minimization, among other measures of Data Protection by Design and by Default[^17], the data processed should be reduced to the strict minimum. The application should not collect unrelated or not needed information, which may include civil status, communication identifiers, equipment directory items, messages, call logs, location data, device identifiers, etc. 
41. Data broadcasted by applications must only include some unique and pseudonymous identifiers, generated by and specific to the application. Those identifiers must be renewed regularly, at a frequency compatible with the purpose of containing the spread of the virus, and sufficient to limit the risk of identification and of physical tracking of individuals. 
42. Implementations for contact tracing can follow a centralized or a decentralized approach[^18]. Both should be considered viable opti

In [20]:
# Display the raw summary string returned by the model.
model_summary

'Monitoring locations or contacts on a large scale is a significant privacy intrusion and must be adopted voluntarily. Individuals not using such applications should not face any disadvantages.\n\nThe controller of any contact tracing application should be clearly defined, and national health authorities or other entities could be designated as such. If multiple actors are involved, responsibilities must be clear from the start and explained to users.\n\nPurposes for data processing must be specific to the COVID-19 health crisis, excluding unrelated purposes like commercial or law enforcement. Data use must be adequate, necessary, and proportionate.\n\nContact tracing apps:\n- Should use proximity data instead of tracking user locations.\n- Must prevent re-identification through appropriate measures.\n- Must store data on the user’s device, collecting only necessary information.\n\nProcessing involving storage or access to information on user devices falls under the "ePrivacy" Directiv

In [23]:
# index: 3, section_reference: 3.2
# Summary and questions for this section, in the format printed by the
# generation cell. Questions are kept in one string separated by "|"; a later
# cell splits them into one row each.

df_summary.loc[index, "text"] = "Data processed should be minimised, avoiding the collection of unrelated information, including civil status, communication identifiers, etc. Application data must include only unique and pseudonymous identifiers, renewed regularly. Both centralized and decentralized approaches for contact tracing should be considered, ensuring proper security measures. Servers must collect only the contact history or pseudonymous identifiers of diagnosed users upon health authority assessment and user consent, or maintain pseudonymous identifiers only long enough to inform potentially exposed users. Any additional information needed for a global methodology should stay on the user terminal and be processed only when necessary with prior consent. Ensure state-of-the-art cryptographic techniques for data security and mutual authentication between the application and server. Reporting infected users must be authorized through a one-time code tied to the pseudonymous identity, linked to a test station or healthcare professional. Public authorities and you must clearly provide the official download link for national contact tracing apps to prevent the use of third-party apps."

df_questions.loc[index, "text"] = "What measures should be taken to minimise the data collected by the application? | What types of data should not be collected by contact tracing applications? | What are the approaches to implementing contact tracing and how should they be evaluated? "




# Move the cursor to the next section and report how far along we are.
index += 1
total_sections = len(df_summary)
percent_done = index / total_sections * 100
print(f"You have completed {percent_done:.2f}% of the work")
if index >= total_sections:
    print("All done")
else:
    next_reference = df_summary.iloc[index]['section_reference']
    print(f"The next section is:\n{doc.get_text(next_reference)}")

You have completed 80.00% of the work
The next section is:
4 CONCLUSION 
48. The world is facing a significant public health crisis that requires strong responses, which will have an impact beyond this emergency. Automated data processing and digital technologies can be key components in the fight against COVID-19. However, one should be wary of the "ratchet effect". It is our responsibility to ensure that every measure taken in these extraordinary circumstances are necessary, limited in time, of minimal extent and subject to periodic and genuine review as well as to scientific evaluation. 
49. The EDPB underlines that one should not have to choose between an efficient response to the current crisis and the protection of our fundamental rights: we can achieve both, and moreover data protection principles can play a very important role in the fight against the virus. European data protection law allows for the responsible use of personal data for health management purposes, while also e

In [26]:
# Remove the row labelled 4 from both progress tables and renumber the rest.
# Not idempotent: once label 4 is gone, running this again raises KeyError.
df_questions = df_questions.drop(index=[4]).reset_index(drop=True)
df_summary = df_summary.drop(index=[4]).reset_index(drop=True)


In [28]:
# Display the summaries collected so far.
df_summary

Unnamed: 0,section_reference,text,source,embedding,document
0,2.1,Location data can be sourced from electronic c...,summary,,CovidLocation
1,2.2,Consider anonymising location data instead of ...,summary,,CovidLocation
2,3.1,Monitoring locations or contacts on a large sc...,summary,,CovidLocation
3,3.2,"Data processed should be minimised, avoiding t...",summary,,CovidLocation


In [29]:
# Persist both progress tables as "|"-separated UTF-8 CSVs, without the index
# column and with missing values written as empty fields.
for frame, target in ((df_summary, summary_file), (df_questions, question_file)):
    frame.to_csv(target, sep="|", encoding="utf-8", index=False, na_rep="")

In [30]:
import pandas as pd

# Read the saved progress back. section_reference is forced to text: with
# type inference a reference such as "4" comes back as the float 4.0 and
# "3.10" as 3.1, which no longer matches the document's section names.
df_summary = pd.read_csv(summary_file, sep = "|", encoding = "utf-8", dtype = {"section_reference": str})

df_questions = pd.read_csv(question_file, sep = "|", encoding = "utf-8", dtype = {"section_reference": str})

In [31]:
# Each row holds several questions separated by "|". Turn them into one row
# per question, repeating the other columns. Pieces are stripped so that the
# spaces around the separator do not end up in the indexed text, and pieces
# that are empty after stripping are discarded.
question_parts = df_questions["text"].str.split("|").explode().str.strip()
df_questions = df_questions.drop(columns="text").join(question_parts)
df_questions = df_questions[df_questions["text"] != ""]
df_questions = df_questions.reset_index(drop=True)


In [32]:
# Stack summaries and questions into one table; ignore_index=True gives the
# combined table a fresh 0..n-1 index.
df_index = pd.concat([df_summary, df_questions], ignore_index = True)
df_index

Unnamed: 0,section_reference,text,source,embedding,document
0,2.1,Location data can be sourced from electronic c...,summary,,CovidLocation
1,2.2,Consider anonymising location data instead of ...,summary,,CovidLocation
2,3.1,Monitoring locations or contacts on a large sc...,summary,,CovidLocation
3,3.2,"Data processed should be minimised, avoiding t...",summary,,CovidLocation
4,2.1,Under what conditions can location data from c...,question,,CovidLocation
5,2.1,What are the rules for accessing information f...,question,,CovidLocation
6,2.1,When can exceptions to ePrivacy Directive obli...,question,,CovidLocation
7,2.2,Why should anonymised data be preferred over p...,question,,CovidLocation
8,2.2,What does anonymisation involve?,question,,CovidLocation
9,2.2,"How is the ""reasonability test"" for anonymisat...",question,,CovidLocation


In [33]:
from regulations_rag.embeddings import get_ada_embedding
# Embedding settings. `model` rebinds the name used for the chat model in the
# generation cell.
model = "text-embedding-3-large"
dimensions = 1024

#df_index = df_questions
df_index['embedding'] = pd.NA  # Initialize the column to hold NA values
df_index['embedding'] = df_index['embedding'].astype(object)  # Ensure the column type is object

# Embed the text in chunks of `increment` rows, writing each chunk back as it
# completes and reporting progress.
increment = 10
total_rows = len(df_index)
for i in range(0, total_rows, increment):
    chunk = df_index.iloc[i:i+increment].copy()
    chunk["embedding"] = chunk["text"].apply(lambda x: get_ada_embedding(openai_client, x, model, dimensions))
    df_index.loc[chunk.index, "embedding"] = chunk["embedding"]
    # The final chunk may be shorter than `increment`, so cap the reported count.
    print(f"Completed {min(i + increment, total_rows)} lines")


Completed 10 lines
Completed 20 lines
Completed 30 lines


In [8]:
# NOTE(review): both calls write to index_file, so the plain to_parquet output
# is presumably overwritten by save_parquet_data straight away. Confirm
# whether the first write is still needed.
df_index.to_parquet(index_file, engine = "pyarrow")
save_parquet_data(df_index, index_file, key)


NameError: name 'df_index' is not defined

In [10]:
# NOTE(review): covid_index is only assigned in a cell further down (loaded
# from covid_location.parquet), so this cell relies on that one having run first.
save_parquet_data(covid_index, index_file, key)

In [4]:
import os
from regulations_rag.standard_regulation_index import load_parquet_data, save_parquet_data
key = os.getenv('encryption_key_gdpr')


gdpr_index = load_parquet_data("../inputs/index/gdpr.parquet", key)
covid_index = load_parquet_data("../inputs/index/covid_location.parquet", key)

# for index, row in gdpr_index[gdpr_index['section_reference'].str.startswith('5')].iterrows():
#     print(f"-- {row['text']}")


# List every GDPR index entry whose text mentions "location" (case-insensitive).
# Iterating over the column directly avoids rebinding the notebook-level
# `index` variable, which the summary/question cells use as their resume cursor.
for entry_text in gdpr_index['text']:
    if 'location' in entry_text.lower():
        print(f"* {entry_text}")

* You can transfer personal data to a third country or international organisation without specific authorisation if the European Commission has recognised that location as providing an adequate level of protection. The Commission evaluates adequacy based on the rule of law, human rights respect, relevant legislation, data protection measures, the existence of supervisory authorities, and international commitments related to data protection.
After assessing the protection level, the Commission may declare that a third country, specific territory, or organisation ensures adequate protection, subject to a review at least every four years. This decision will outline its application scope and identify any supervisory authority involved.
The Commission continuously monitors changes that could impact the validity of its adequacy decisions, and if it finds a country or organisation no longer provides adequate protection, it can repeal, amend, or suspend the decision without retroactive effect.

In [7]:
# Convert section references to strings.
# NOTE(review): values that are already floats keep their float spelling
# (4.0 becomes "4.0"). Confirm that this is the intended form.
covid_index['section_reference'] = covid_index['section_reference'].astype(str)

In [14]:
# NOTE(review): online_index is not assigned in any visible cell, and the
# result is written to index_file, which the configuration cell points at the
# covid_location index. Confirm the target before running.
online_index['section_reference'] = online_index['section_reference'].astype(str)
# A bare name in the middle of a cell is not displayed; only a final expression is.
online_index
save_parquet_data(online_index, index_file, key)

In [89]:
# Destination for the Article 47 BCR index.
file = "../inputs/index/article_47_bcr.parquet"
#df_index.to_parquet(file, engine = "pyarrow")
# NOTE(review): index_df is assigned in other cells (from article_47_index);
# make sure it holds the Article 47 index when this runs.
save_parquet_data(index_df, file, key)

In [54]:
from src.index_tools import update_text_in_index, add_to_index

# Index being edited.
index_df = article_47_index
#index_df = gdpr_index

# Embedding settings are taken from whatever `model` / `dimensions` currently hold.
# NOTE(review): `model` is also set to the chat model in the generation cell,
# so the value here depends on which cell ran last.
embedding_model = model
embedding_dimensions = dimensions

# Existing entry text and its replacement.
text_to_change = "What points need to be addressed in the Binding Corporate Rules for Controllers (BCR-C)"
changed_text = "What points need to be addressed in the Binding Corporate Rules for controllers?"

index_df = update_text_in_index(
    openai_client=openai_client,
    index_df=index_df,
    text_to_change=text_to_change,
    changed_text=changed_text,
    embedding_model=embedding_model,
    embedding_dimensions=embedding_dimensions,
)

Unnamed: 0,section_reference,text,source,embedding,document
0,1.1,You can use binding corporate rules (BCRs) for...,summary,"[-0.009783994406461716, -0.06066862493753433, ...",Article_47_BCR
1,1.5,Binding Corporate Rules for controllers (BCR-C...,summary,"[-0.017316676676273346, -0.021807707846164703,...",Article_47_BCR
2,1.6,You must comply with EU data protection legisl...,summary,"[-0.07271231710910797, -0.06814079731702805, -...",Article_47_BCR
3,1.7,You must use Binding Corporate Rules (BCR) as ...,summary,"[-0.017403494566679, -0.08751912415027618, -0....",Article_47_BCR
4,1.8,Binding Corporate Rules (BCRs) need approval f...,summary,"[-0.004610632546246052, -0.043068330734968185,...",Article_47_BCR
...,...,...,...,...,...
82,3.6,What agreements must be made regarding the han...,question,"[-0.04174955189228058, -0.0986546203494072, -0...",Article_47_BCR
83,3.7,What expectations are set for members of Bindi...,question,"[-0.04039411619305611, -0.08921048790216446, -...",Article_47_BCR
84,3.8,What obligations are there to keep Binding Cor...,question,"[-0.04588606581091881, -0.06260394304990768, -...",Article_47_BCR
85,3.9,What definitions must be included in the Bindi...,question,"[-0.010924087837338448, -0.03873256593942642, ...",Article_47_BCR


In [86]:
import importlib
import src.index_tools
# Reload so edits to index_tools take effect without a kernel restart.
importlib.reload(src.index_tools)
from src.index_tools import add_to_index

# Index that receives the new entry.
index_df = article_47_index

# Fields of the new entry.
section_reference = "3.5.4.1"
source = "question"
document = "Article_47_BCR"
text = "What happens if local laws conflict with the standards of binding corporate rules?"

# embedding_model / embedding_dimensions come from the update-text cell.
index_df = add_to_index(
    openai_client,
    index_df,
    text,
    section_reference,
    source,
    document,
    embedding_model,
    embedding_dimensions,
)


In [84]:
from src.index_tools import remove_from_index

# Pass the entry text to remove_from_index and keep the index it returns.
# presumably the entry is matched on its text column; verify in index_tools.
text_to_delete = "What happens if local laws conflict with the standards of binding corporate rules?"
article_47_index = remove_from_index(article_47_index, text_to_delete)

In [88]:
# Point article_47_index at the edited index_df, then show its rows for
# section 3.5.4.1.
article_47_index = index_df
article_47_index[article_47_index["section_reference"] == "3.5.4.1"]

Unnamed: 0,section_reference,text,source,embedding,document
25,3.5.4.1,You must ensure that the Binding Corporate Rul...,summary,"[-0.018655141815543175, -0.05344023182988167, ...",Article_47_BCR
79,3.5.4.1,What should the Binding Corporate Rules-Contro...,question,"[-0.014255059882998466, -0.07780193537473679, ...",Article_47_BCR
87,3.5.4.1,What happens if local laws conflict with the s...,question,"[-0.02774774841964245, -0.06760360300540924, -...",Article_47_BCR


In [30]:
# NOTE(review): other cells bind index_df to article_47_index. Confirm it
# holds the GDPR index before writing to gdpr.parquet.
save_parquet_data(index_df, "../inputs/index/gdpr.parquet", key)

In [31]:
# Show the rows whose document label is exactly "article_30_5" (the comparison is case-sensitive).
index_df[index_df['document'] == "article_30_5"]

Unnamed: 0,text,embedding,source,section_reference,document
147,Under what conditions could you be exempt from...,"[-0.03439036011695862, -0.0008122794097289443,...",question,all,article_30_5


In [9]:
import os
from regulations_rag.standard_regulation_index import load_parquet_data, save_parquet_data
key = os.getenv('encryption_key_gdpr')

path_to_file = "../inputs/index/article_30_5.parquet"
# NOTE(review): `index` is also the name of the resume cursor used by the
# summary/question cells; loading a table into it replaces that value.
index = load_parquet_data(path_to_file, key)

In [10]:
#index["document"] = "Article_30_5"
# Display the table loaded from article_30_5.parquet.
index

Unnamed: 0,section_reference,text,source,embedding,document
0,all,"If you have fewer than 250 employees, you usua...",summary,"[-0.026057356968522072, 0.014008629135787487, ...",Article_30_5
1,all,What are the record-keeping exceptions for sma...,questions,"[0.007615841459482908, 0.02034611813724041, -0...",Article_30_5
2,all,When must smaller organisations maintain a rec...,questions,"[0.00046261242823675275, 0.006825227290391922,...",Article_30_5
3,all,What types of processing activities still requ...,questions,"[0.011152287945151329, 0.021851062774658203, -...",Article_30_5
4,all,Why is maintaining a record of processing acti...,questions,"[-0.011836612597107887, -0.011333354748785496,...",Article_30_5
5,all,What defines occasional processing?,questions,"[-0.0750560611486435, 0.04036270081996918, -0....",Article_30_5
6,all,What assistance is available for SMEs to manag...,questions,"[0.011198482476174831, 0.03537283092737198, -0...",Article_30_5
7,all,How does the principle of accountability relat...,questions,"[-0.045358121395111084, -0.04111029952764511, ...",Article_30_5


In [6]:
# Write the table back to the file it was loaded from.
save_parquet_data(index, path_to_file, key)