In [1]:
import logging
# 25 lies between the standard INFO (20) and WARNING (30) levels, so INFO
# records are filtered out while WARNING and above are still emitted.
log_level = 25
logging.basicConfig(level=log_level) # root logger


In [3]:
# Written into the "document" column of the summary and question frames.
class_name = "OnlineServices"
# Pipe-separated CSV working files used to save and resume progress.
summary_file = "../tmp/online_services_summary.csv"
question_file = "../tmp/online_services_question.csv"

# NOTE: despite the "_as_csv_file" suffix, this path points at a parquet file.
path_to_manual_as_csv_file = "../inputs/documents/online_services.parquet"
# Location the finished index is saved to in a later cell.
index_file = "../inputs/index/online_services.parquet"



In [1]:
import re
import pandas as pd

import sys
# NOTE(review): absolute, machine-specific path; the project imports below
# depend on it. Consider a relative or configurable location.
sys.path.append('E:/Code/chat/gdpr')

import importlib
import gdpr_rag.documents.online_services
# Reload so edits to the module are picked up without restarting the kernel.
importlib.reload(gdpr_rag.documents.online_services)
from gdpr_rag.documents.online_services import OnlineServices

# Same value as in the configuration cell; repeated here.
path_to_manual_as_csv_file = "../inputs/documents/online_services.parquet"

# Document object used later for get_text() and reference_checker.
doc = OnlineServices(path_to_manual_as_csv_file)


In [2]:
from regulations_rag.regulation_table_of_content import StandardTableOfContent

# The document's own reference checker plus its raw rows drive the tree.
reference_checker = doc.reference_checker
df = pd.read_parquet(path_to_manual_as_csv_file, engine='pyarrow')

# NOTE(review): the root is named "transparency" although this notebook
# processes the online-services document - confirm the name is intended.
toc = StandardTableOfContent(
    root_node_name="transparency",
    reference_checker=reference_checker,
    regulation_df=df,
)

In [3]:
# Print the table-of-contents tree so the parsed section hierarchy can be checked by eye.
toc.print_tree()

transparency []
|-- 1 [PART 1 - INTRODUCTION ]
|   |-- .1 [Background ]
|   +-- .2 [Scope of these guidelines ]
|-- 2 [PART 2 - ANALYSIS OF ARTICLE 6(1)(B)  ]
|   |-- .1 [General observations ]
|   |-- .2 [Interaction of Article 6(1)(b) with other lawful bases for processing ]
|   |-- .3 [Scope of Article 6(1)(b) ]
|   |-- .4 [Necessity ]
|   |-- .5 [Necessary for performance of a contract with the data subject ]
|   |-- .6 [Termination of contract ]
|   +-- .7 [Necessary for taking steps prior to entering into a contract ]
+-- 3 [PART 3 - APPLICABILITY OF ARTICLE 6(1)(B) IN SPECIFIC SITUATIONS ]
    |-- .1 [Processing for 'service improvement'[^25] 
[^25] Online services may also need to take into account Directive (EU) 2019/770 of the European Parliament and of the Council of 20 May 2019 on certain aspects concerning contracts for the supply of digital content and digital services (OJ L 136, 22.05.2019, p. 1), which will apply as from 1 January 2022.]
    |-- .2 [Processing for 'frau

In [7]:
sys.path.append('E:/Code/chat/gdpr')

from anytree import Node, PreOrderIter
from regulations_rag.embeddings import num_tokens_from_string

# Walk the table of contents in pre-order and keep every node that has a
# non-empty full name; each one becomes a candidate section row.
all_nodes = list(PreOrderIter(toc.root))
all_node_names = [node.full_node_name for node in all_nodes if node.full_node_name != ""]

# Build the text and token-count columns column-wise. This avoids filling the
# frame one cell at a time and no longer leaves a stray `index` loop variable
# behind (that name is used as the resume cursor in later cells).
split_df = pd.DataFrame(all_node_names, columns = ["section_reference"])
split_df["text"] = split_df["section_reference"].map(doc.get_text)
split_df["token_count"] = split_df["text"].map(num_tokens_from_string).astype(int)

split_df = split_df.drop([0, 1, 2]) # Introduction
# Sections of 25 tokens or fewer are left out of the summarisation work list.
split_df = split_df[split_df["token_count"] > 25]
split_df = split_df.reset_index(drop=True)
split_df

Unnamed: 0,section_reference,text,token_count
0,2.1,2 PART 2 - ANALYSIS OF ARTICLE 6(1)(B) \n\n2....,984
1,2.2,2 PART 2 - ANALYSIS OF ARTICLE 6(1)(B) \n\n2....,749
2,2.3,2 PART 2 - ANALYSIS OF ARTICLE 6(1)(B) \n\n2....,93
3,2.4,2 PART 2 - ANALYSIS OF ARTICLE 6(1)(B) \n\n2....,754
4,2.5,2 PART 2 - ANALYSIS OF ARTICLE 6(1)(B) \n\n2....,2180
5,2.6,2 PART 2 - ANALYSIS OF ARTICLE 6(1)(B) \n\n2....,746
6,2.7,2 PART 2 - ANALYSIS OF ARTICLE 6(1)(B) \n\n2....,421
7,3.1,3 PART 3 - APPLICABILITY OF ARTICLE 6(1)(B) IN...,305
8,3.2,3 PART 3 - APPLICABILITY OF ARTICLE 6(1)(B) IN...,231
9,3.3,3 PART 3 - APPLICABILITY OF ARTICLE 6(1)(B) IN...,841


In [8]:
import os
from regulations_rag.standard_regulation_index import load_parquet_data, save_parquet_data
# Key passed to load_parquet_data / save_parquet_data in later cells.
# os.getenv returns None when the environment variable is not set.
key = os.getenv('encryption_key_gdpr')
#index_df = load_parquet_data("../inputs/index_gdpr.parquet", key)

In [9]:
from openai import OpenAI
# The API key is read from the environment (None if OPENAI_API_KEY is unset).
openai_client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"),)

import sys
sys.path.append('E:/Code/chat/gdpr')
import importlib
import src.summarise_and_question
# Reload so local edits to the helper module take effect without a kernel restart.
importlib.reload(src.summarise_and_question)
from src.summarise_and_question import get_summary_and_questions_for


In [12]:
def _load_or_init_index_frame(path, source):
    """Return the frame saved at `path`, or a fresh one seeded from `split_df`.

    A fresh frame has one row per section in `split_df`, an empty "text"
    column, `source` in the "source" column and `class_name` in "document".
    """
    if os.path.exists(path):
        return pd.read_csv(path, sep="|", encoding = "utf-8")
    frame = pd.DataFrame([], columns = ["section_reference", "text", "source", "embedding", "document"])
    frame["section_reference"] = split_df["section_reference"]
    frame["source"] = source
    frame["document"] = class_name
    return frame


def _first_row_without_text(frame):
    """Return the smallest index label whose "text" is missing, or 0 if none is."""
    first_missing = frame[frame['text'].isna()].index.min()
    # NOTE(review): a frame with no missing text (i.e. fully completed work)
    # also yields 0, so a finished file restarts at the first section.
    return 0 if pd.isna(first_missing) else first_missing


df_summary = _load_or_init_index_frame(summary_file, "summary")
df_questions = _load_or_init_index_frame(question_file, "question")

# `index` is the resume cursor used by the following cells; it is taken from
# the summary frame and cross-checked against the question frame.
index = _first_row_without_text(df_summary)
first_empty_index = _first_row_without_text(df_questions)

if index != first_empty_index:
    print("The index in the summary and questions files do not match. This needs to be fixed")
else:
    print(f"Picking up at index {index}")


Picking up at index 0


In [50]:
# Chat model used to draft the summary and questions for one section.
model = "gpt-4o"

# Section currently pointed at by the resume cursor.
reference = df_summary.iloc[index]["section_reference"]

reg_text = doc.get_text(reference)
print("##############")
print(reg_text)
print("##############")

model_summary, model_questions = get_summary_and_questions_for(openai_client = openai_client, text = reg_text, model = model)

# Print ready-to-paste assignment statements for the manual review cell.
# The values are rendered with repr() so that newlines and quotes inside the
# generated text are escaped and the printed line is valid Python as-is.
# NOTE(review): assumes model_summary / model_questions are plain strings - confirm.
print(f'# index: {index}, section_reference: {reference}')
print()
print(f'df_summary.loc[index, "text"] = {model_summary!r}')
print()
print(f'df_questions.loc[index, "text"] = {model_questions!r}')

##############
3 PART 3 - APPLICABILITY OF ARTICLE 6(1)(B) IN SPECIFIC SITUATIONS 

3.4 Processing for personalisation of content[^34]
57. The EDPB acknowledges that personalisation of content may (but does not always) constitute an intrinsic and expected element of certain online services, and therefore may be regarded as necessary for the performance of the contract with the service user in some cases. Whether such processing can be regarded as an intrinsic aspect of an online service, will depend on the nature of the service provided, the expectations of the average data subject in light not only of the terms of service but also the way the service is promoted to users, and whether the service can be provided without personalisation. Where personalisation of content is not objectively necessary for the purpose of the underlying contract, for example where personalised content delivery is intended to increase user engagement with a service but is not an integral part of using the ser

In [51]:
# Echo the generated summary; the notebook shows its escaped repr.
model_summary

"Content personalisation can sometimes be considered an essential element of certain online services, making it necessary for fulfilling a contract with a user. This depends on the type of service, user expectations based on service terms and promotions, and whether the service can function without personalisation. If personalisation is not crucial for the contract, like increasing user engagement without being integral to the service, you should find an alternative legal basis for processing. \n\nExample 7: A hotel search engine using past bookings to recommend hotels to users does not qualify as necessary for contract performance since profiling user's past behaviour and financial data is not essential for providing search results.\n\nExample 8: An online marketplace suggesting products based on users' previous views to increase interactivity is not necessary for providing its services. Hence, processing personal data for this purpose cannot use Article 6(1)(b) as a legal basis."

In [52]:
# Assignments in the form printed by the generation cell, stored at the
# current cursor position.
# index: 10, section_reference: 3.4

df_summary.loc[index, "text"] = "Content personalisation can sometimes be considered an essential element of certain online services, making it necessary for fulfilling a contract with a user. This depends on the type of service, user expectations based on service terms and promotions, and whether the service can function without personalisation. If personalisation is not crucial for the contract, like increasing user engagement without being integral to the service, you should find an alternative legal basis for processing. \n\nExample: A hotel search engine using past bookings to recommend hotels to users does not qualify as necessary for contract performance since profiling user's past behaviour and financial data is not essential for providing search results.\n\nExample: An online marketplace suggesting products based on users' previous views to increase interactivity is not necessary for providing its services. Hence, processing personal data for this purpose cannot use Article 6(1)(b) as a legal basis."

# Individual questions are joined with "|"; a later cell splits them into rows.
df_questions.loc[index, "text"] = "When can content personalisation be considered necessary for a contract?|What factors determine if personalisation is an intrinsic part of an online service?"



# Move the cursor to the next section and report progress.
index += 1
total_sections = len(df_summary)
print(f"You have completed {(index / total_sections * 100):.2f}% of the work")
if index >= total_sections:
    print("All done")
else:
    next_reference = df_summary.iloc[index]['section_reference']
    print(f"The next section is:\n{doc.get_text(next_reference)}")

You have completed 100.00% of the work
All done


In [57]:
# Display the question frame for inspection.
df_questions

Unnamed: 0,section_reference,source,embedding,document,text
0,2.1,question,,OnlineServices,What are the key considerations for processing...
1,2.1,question,,OnlineServices,How do fairness and reasonable expectations of...
2,2.1,question,,OnlineServices,What must you consider when processing childre...
3,2.1,question,,OnlineServices,Why must the purposes for data collection be c...
4,2.2,question,,OnlineServices,When might consent be a more appropriate legal...
5,2.2,question,,OnlineServices,What are the key requirements for identifying ...
6,2.2,question,,OnlineServices,How does the principle of fairness affect the ...
7,2.2,question,,OnlineServices,What is the difference between accepting terms...
8,2.3,question,,OnlineServices,When is Article 6(1)(b) applicable?
9,2.3,question,,OnlineServices,When is processing considered necessary for pe...


In [54]:
# Persist both working frames as pipe-separated CSV so a later session can
# reload them; missing values are written as empty fields.
for frame, target in ((df_summary, summary_file), (df_questions, question_file)):
    frame.to_csv(target, sep = "|", encoding = "utf-8", index = False, na_rep="")

In [4]:
import pandas as pd
# Reload the working files with the same pipe-separated format they were saved in.
csv_options = {"sep": "|", "encoding": "utf-8"}
df_summary = pd.read_csv(summary_file, **csv_options)
df_questions = pd.read_csv(question_file, **csv_options)

In [5]:
# One row per question: split each "|"-joined text into its parts and give
# every part its own row (the other columns are repeated for each part).
question_parts = df_questions["text"].str.split("|")
df_questions = (
    df_questions
    .drop(columns="text")
    .assign(text=question_parts)
    .explode("text")
)
# Drop empty fragments and renumber the rows from 0.
df_questions = df_questions[df_questions["text"] != ""].reset_index(drop=True)


In [6]:
# Stack summaries and questions into one frame; ignore_index renumbers the rows from 0.
df_index = pd.concat([df_summary, df_questions], ignore_index = True)
df_index

Unnamed: 0,section_reference,text,source,embedding,document
0,2.1,Consider the lawful basis for processing under...,summary,,OnlineServices
1,2.2,When processing is not 'necessary for the perf...,summary,,OnlineServices
2,2.3,Article 6(1)(b) applies when the processing is...,summary,,OnlineServices
3,2.4,Processing must be necessary for both performa...,summary,,OnlineServices
4,2.5,To rely on Article 6(1)(b) for processing pers...,summary,,OnlineServices
5,2.6,Identify the appropriate legal basis for proce...,summary,,OnlineServices
6,2.7,Processing personal data is permissible if nec...,summary,,OnlineServices
7,3.1,Processing data for service improvement cannot...,summary,,OnlineServices
8,3.2,Processing for fraud prevention purposes may i...,summary,,OnlineServices
9,3.3,Processing personal data for online behavioura...,summary,,OnlineServices


In [59]:
from regulations_rag.embeddings import get_ada_embedding
# NOTE: `model` is rebound here from the chat model to the embedding model;
# later cells read `model` and `dimensions` as the embedding settings.
model = "text-embedding-3-large"
dimensions = 1024

#df_index = df_questions
df_index['embedding'] = pd.NA  # Initialize the column to hold NA values
df_index['embedding'] = df_index['embedding'].astype(object)  # Ensure the column type is object

# Embed the rows in chunks of `increment` so progress can be reported.
increment = 10
total_rows = len(df_index)
for i in range(0, total_rows, increment):
    chunk = df_index.iloc[i:i+increment].copy()
    chunk["embedding"] = chunk["text"].apply(lambda x: get_ada_embedding(openai_client, x, model, dimensions))
    df_index.loc[chunk.index, "embedding"] = chunk["embedding"]
    # Clamp to the real row count so the final partial chunk is not over-reported.
    print(f"Completed {min(i + increment, total_rows)} lines")


Completed 10 lines
Completed 20 lines
Completed 30 lines
Completed 40 lines
Completed 50 lines


In [60]:
# Both calls write to index_file.
# NOTE(review): save_parquet_data presumably overwrites the plain pyarrow file
# written on the line above, which would make the first write redundant
# (a later cell has the same to_parquet line commented out) - confirm.
df_index.to_parquet(index_file, engine = "pyarrow")
save_parquet_data(df_index, index_file, key)


In [1]:
# NOTE: depends on df_index from an earlier cell; the recorded NameError comes
# from running this cell first on a fresh kernel (execution count 1).
df_index
#df_index["document"] = "Article_47_BCR"

NameError: name 'df_index' is not defined

In [11]:
gdpr_index = load_parquet_data("../inputs/index/gdpr.parquet", key)
online_index = load_parquet_data("../inputs/index/online_services.parquet", key)

# List the index texts whose section reference begins with "5".
# NOTE(review): startswith('5') matches every reference beginning with that
# character (e.g. "50", "51"), not only section 5 - confirm that is intended.
# Iterating the "text" column directly avoids rebinding the notebook-level
# `index` cursor, which `for index, row in ...iterrows()` would overwrite.
starts_with_5 = gdpr_index['section_reference'].str.startswith('5')
for entry_text in gdpr_index.loc[starts_with_5, 'text']:
    print(f"-- {entry_text}")


# for index, row in index_df[index_df["section_reference"] == '30'].iterrows():
#     print(f"* {row['text']}")

-- What are the principles associated with personal data?
-- When can I collect personal data?
-- Once I have collected personal data, are there ongoing requirements?
-- What measures are required to secure personal data under GDPR?
-- Who is responsible for demonstrating compliance with GDPR's principles?
-- Can I use personal data that my company has already collected?
-- What steps must be taken to improve international cooperation in data protection?
-- How does international mutual assistance enhance data protection enforcement?
-- In what ways can relevant stakeholders be involved in international data protection efforts?
-- How is the exchange and documentation of data protection practices promoted internationally?
-- What actions are taken to address jurisdictional conflicts in data protection with third countries?
-- What is the role of a supervisory authority?
-- How do supervisory authorities contribute to GDPR?
-- Can a Member State have more than one supervisory authority?

In [14]:
# Cast the section references to strings, then save the index again.
online_index['section_reference'] = online_index['section_reference'].astype(str)
# This bare name is not the last expression in the cell, so it displays nothing.
online_index
save_parquet_data(online_index, index_file, key)

In [89]:
file = "../inputs/index/article_47_bcr.parquet"
#df_index.to_parquet(file, engine = "pyarrow")
# NOTE(review): index_df is bound to different indexes in different cells;
# check that it currently holds the Article 47 index before running this.
save_parquet_data(index_df, file, key)

In [54]:
from src.index_tools import update_text_in_index, add_to_index

# NOTE(review): article_47_index is not created in any visible cell; it must
# already exist in the kernel before this cell runs.
index_df = article_47_index
#index_df = gdpr_index

# Existing entry text to look up, and the wording that should replace it.
text_to_change = "What points need to be addressed in the Binding Corporate Rules for Controllers (BCR-C)"
changed_text = "What points need to be addressed in the Binding Corporate Rules for controllers?"
# Reuse the embedding settings (`model`, `dimensions`) defined in the embedding cell.
embedding_model = model
embedding_dimensions = dimensions

index_df = update_text_in_index(openai_client = openai_client, index_df = index_df, text_to_change = text_to_change, changed_text = changed_text, embedding_model = embedding_model, embedding_dimensions = embedding_dimensions)

Unnamed: 0,section_reference,text,source,embedding,document
0,1.1,You can use binding corporate rules (BCRs) for...,summary,"[-0.009783994406461716, -0.06066862493753433, ...",Article_47_BCR
1,1.5,Binding Corporate Rules for controllers (BCR-C...,summary,"[-0.017316676676273346, -0.021807707846164703,...",Article_47_BCR
2,1.6,You must comply with EU data protection legisl...,summary,"[-0.07271231710910797, -0.06814079731702805, -...",Article_47_BCR
3,1.7,You must use Binding Corporate Rules (BCR) as ...,summary,"[-0.017403494566679, -0.08751912415027618, -0....",Article_47_BCR
4,1.8,Binding Corporate Rules (BCRs) need approval f...,summary,"[-0.004610632546246052, -0.043068330734968185,...",Article_47_BCR
...,...,...,...,...,...
82,3.6,What agreements must be made regarding the han...,question,"[-0.04174955189228058, -0.0986546203494072, -0...",Article_47_BCR
83,3.7,What expectations are set for members of Bindi...,question,"[-0.04039411619305611, -0.08921048790216446, -...",Article_47_BCR
84,3.8,What obligations are there to keep Binding Cor...,question,"[-0.04588606581091881, -0.06260394304990768, -...",Article_47_BCR
85,3.9,What definitions must be included in the Bindi...,question,"[-0.010924087837338448, -0.03873256593942642, ...",Article_47_BCR


In [86]:
import importlib
import src.index_tools
# Reload so local edits to index_tools are picked up without a kernel restart.
importlib.reload(src.index_tools)
from src.index_tools import add_to_index

index_df = article_47_index
# Values for the new index entry.
text = "What happens if local laws conflict with the standards of binding corporate rules?"
section_reference = "3.5.4.1"
source = "question"
document = "Article_47_BCR"

# embedding_model and embedding_dimensions are not set in this cell; they are
# assigned in the cell that calls update_text_in_index.
index_df = add_to_index(openai_client, index_df, text, section_reference, source, document, embedding_model, embedding_dimensions)


In [84]:
from src.index_tools import remove_from_index

# Text identifying the entry to remove.
# NOTE(review): presumably matched against the "text" column - confirm in remove_from_index.
text_to_delete = "What happens if local laws conflict with the standards of binding corporate rules?"
article_47_index = remove_from_index(article_47_index, text_to_delete)

In [88]:
# Make the working frame the current Article 47 index, then show every entry
# filed under section 3.5.4.1.
article_47_index = index_df
article_47_index[article_47_index["section_reference"] == "3.5.4.1"]

Unnamed: 0,section_reference,text,source,embedding,document
25,3.5.4.1,You must ensure that the Binding Corporate Rul...,summary,"[-0.018655141815543175, -0.05344023182988167, ...",Article_47_BCR
79,3.5.4.1,What should the Binding Corporate Rules-Contro...,question,"[-0.014255059882998466, -0.07780193537473679, ...",Article_47_BCR
87,3.5.4.1,What happens if local laws conflict with the s...,question,"[-0.02774774841964245, -0.06760360300540924, -...",Article_47_BCR


In [30]:
# NOTE(review): index_df is rebound to different indexes across cells; make
# sure it holds the GDPR index before saving it to gdpr.parquet.
save_parquet_data(index_df, "../inputs/index/gdpr.parquet", key)

In [31]:
# Show the rows whose "document" column equals "article_30_5".
index_df[index_df['document'] == "article_30_5"]

Unnamed: 0,text,embedding,source,section_reference,document
147,Under what conditions could you be exempt from...,"[-0.03439036011695862, -0.0008122794097289443,...",question,all,article_30_5


In [9]:
import os
from regulations_rag.standard_regulation_index import load_parquet_data, save_parquet_data
# os.getenv returns None when the environment variable is not set.
key = os.getenv('encryption_key_gdpr')

path_to_file = "../inputs/index/article_30_5.parquet"
# NOTE: this rebinds `index`, which other cells use as the integer resume cursor.
index = load_parquet_data(path_to_file, key)

In [10]:
# NOTE(review): the recorded output shows source "questions" here, whereas the
# other index frames in this notebook use "question" - confirm which spelling
# downstream code expects.
#index["document"] = "Article_30_5"
index

Unnamed: 0,section_reference,text,source,embedding,document
0,all,"If you have fewer than 250 employees, you usua...",summary,"[-0.026057356968522072, 0.014008629135787487, ...",Article_30_5
1,all,What are the record-keeping exceptions for sma...,questions,"[0.007615841459482908, 0.02034611813724041, -0...",Article_30_5
2,all,When must smaller organisations maintain a rec...,questions,"[0.00046261242823675275, 0.006825227290391922,...",Article_30_5
3,all,What types of processing activities still requ...,questions,"[0.011152287945151329, 0.021851062774658203, -...",Article_30_5
4,all,Why is maintaining a record of processing acti...,questions,"[-0.011836612597107887, -0.011333354748785496,...",Article_30_5
5,all,What defines occasional processing?,questions,"[-0.0750560611486435, 0.04036270081996918, -0....",Article_30_5
6,all,What assistance is available for SMEs to manag...,questions,"[0.011198482476174831, 0.03537283092737198, -0...",Article_30_5
7,all,How does the principle of accountability relat...,questions,"[-0.045358121395111084, -0.04111029952764511, ...",Article_30_5


In [6]:
# Write the frame back to the path it was loaded from (path_to_file).
save_parquet_data(index, path_to_file, key)