In [2]:
import logging
# 25 sits between logging.INFO (20) and logging.WARNING (30): INFO and DEBUG
# records are filtered out, WARNING and above still get through.
log_level = 25
logging.basicConfig(level=log_level) # configures the root logger


In [1]:
# Notebook configuration: every file name below is derived from `prefix`.
class_name = "CovidHealth"
prefix = "covid_health"
# Pipe-separated CSV checkpoints for the summaries and questions built in this notebook.
summary_file = f"../tmp/{prefix}_summary.csv"
question_file = f"../tmp/{prefix}_services_question.csv"

# NOTE: despite the name, this path points at a .parquet file, not a CSV.
path_to_manual_as_csv_file = f"../inputs/documents/{prefix}.parquet"
# Destination of the finished index (summaries + questions + embeddings).
index_file = f"../inputs/index/{prefix}.parquet"



In [2]:
import re
import pandas as pd

import sys
# NOTE(review): absolute, machine-specific path; the notebook will not import
# the project packages on another machine unless this is adjusted.
sys.path.append('E:/Code/chat/gdpr')

import importlib
import gdpr_rag.documents.covid_health
# Reload so that edits to the module are picked up without restarting the kernel.
importlib.reload(gdpr_rag.documents.covid_health)
from gdpr_rag.documents.covid_health import CovidHealth

# The document object used below for `get_text` and `reference_checker`.
doc = CovidHealth(path_to_manual_as_csv_file)



In [3]:
from regulations_rag.regulation_table_of_content import StandardTableOfContent
# Build the table of contents from the document's parquet file, using the
# document's own reference checker.
# NOTE(review): the root node is named "transparency" although this notebook
# processes the covid_health document - confirm this is intended.
reference_checker = doc.reference_checker
df = pd.read_parquet(path_to_manual_as_csv_file, engine='pyarrow')
toc = StandardTableOfContent(
    root_node_name="transparency",
    reference_checker=reference_checker,
    regulation_df=df,
)

In [4]:
# Print the section hierarchy so the node references can be checked by eye.
toc.print_tree()

transparency []
|-- 1 [INTRODUCTION ]
|-- 2 [APPLICATION OF THE GDPR ]
|-- 3 [DEFINITIONS ]
|   |-- .1 ["Data concerning health" ]
|   |-- .2 ["Processing for the purpose of scientific research" ]
|   +-- .3 ["Further processing" ]
|-- 4 [LEGAL BASIS FOR THE PROCESSING ]
|   |-- .1 [Consent ]
|   +-- .2 [National legislations ]
|-- 5 [DATA PROTECTION PRINCIPLES ]
|   |-- .1 [Transparency and information to data subjects ]
|   |   |-- .1 [When must the data subject be informed? ]
|   |   +-- .2 [Exemptions ]
|   |-- .2 [Purpose limitation and presumption of compatibility  ]
|   |-- .3 [Data minimisation and storage limitation ]
|   +-- .4 [Integrity and confidentiality ]
|-- 6 [EXERCISE OF THE RIGHTS OF DATA SUBJECTS ]
|-- 7 [INTERNATIONAL DATA TRANSFERS FOR SCIENTIFIC RESEARCH PURPOSES ]
+-- 8 [SUMMARY ]


In [66]:
sys.path.append('E:/Code/chat/gdpr')

from anytree import Node, PreOrderIter
from regulations_rag.embeddings import num_tokens_from_string

# Sections that are never indexed. They are identified by section reference
# rather than by row position so the exclusion keeps working if the table of
# contents changes.
EXCLUDED_SECTION_REFERENCES = ["1", "8"]  # Introduction and summary
# Sections with this many tokens or fewer are skipped.
MIN_SECTION_TOKENS = 77

# One row per named node of the table of contents (the unnamed root is skipped).
all_nodes = list(PreOrderIter(toc.root))
all_node_names = [node.full_node_name for node in all_nodes if node.full_node_name != ""]
split_df = pd.DataFrame(all_node_names, columns=["section_reference"])

# Fetch each section's text and its token count column-wise instead of
# writing cell by cell inside an iterrows() loop.
split_df["text"] = split_df["section_reference"].map(doc.get_text)
split_df["token_count"] = split_df["text"].map(num_tokens_from_string).astype(int)

is_excluded = split_df["section_reference"].astype(str).isin(EXCLUDED_SECTION_REFERENCES)
is_long_enough = split_df["token_count"] > MIN_SECTION_TOKENS
split_df = split_df[~is_excluded & is_long_enough].reset_index(drop=True)
split_df

Unnamed: 0,section_reference,text,token_count
0,2,2 APPLICATION OF THE GDPR \n4. Data protection...,349
1,3.1,"3 DEFINITIONS \n\n3.1 ""Data concerning health""...",351
2,3.2,"3 DEFINITIONS \n\n3.2 ""Processing for the purp...",254
3,3.3,"3 DEFINITIONS \n\n3.3 ""Further processing"" \n1...",279
4,4,4 LEGAL BASIS FOR THE PROCESSING \n15. All pro...,178
5,4.1,4 LEGAL BASIS FOR THE PROCESSING \n\n4.1 Conse...,714
6,4.2,4 LEGAL BASIS FOR THE PROCESSING \n\n4.2 Natio...,454
7,5.1,5 DATA PROTECTION PRINCIPLES \n\n5.1 Transpare...,189
8,5.1.1,5 DATA PROTECTION PRINCIPLES \n5.1 Transparenc...,252
9,5.1.2,5 DATA PROTECTION PRINCIPLES \n5.1 Transparenc...,906


In [8]:
import os
from regulations_rag.standard_regulation_index import load_parquet_data, save_parquet_data
# Key passed to load_parquet_data / save_parquet_data further down.
# os.getenv returns None when the environment variable is not set.
key = os.getenv('encryption_key_gdpr')
#index_df = load_parquet_data("../inputs/index_gdpr.parquet", key)

In [9]:
from openai import OpenAI
# The API key is read from the environment; .get() yields None when it is missing.
openai_client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"),)

import sys
# NOTE(review): same absolute, machine-specific path as in the earlier setup cell.
sys.path.append('E:/Code/chat/gdpr')
import importlib
import src.summarise_and_question
# Reload so that edits to the helper module are picked up without a kernel restart.
importlib.reload(src.summarise_and_question)
from src.summarise_and_question import get_summary_and_questions_for


In [10]:
def _load_or_init_progress(csv_path, source):
    """Load a checkpoint CSV, or create an empty one, and find where to resume.

    Args:
        csv_path: Pipe-separated checkpoint file to read if it exists.
        source: Value written to the "source" column of a newly created frame.

    Returns:
        A ``(frame, first_empty_index)`` tuple. ``first_empty_index`` is the
        smallest index label whose "text" is missing, or 0 when no row has
        missing text.
    """
    if os.path.exists(csv_path):
        frame = pd.read_csv(csv_path, sep="|", encoding="utf-8")
    else:
        # Fresh start: one row per section in split_df, with no text yet.
        frame = pd.DataFrame([], columns=["section_reference", "text", "source", "embedding", "document"])
        frame["section_reference"] = split_df["section_reference"]
        frame["source"] = source
        frame["document"] = class_name

    first_empty = frame[frame["text"].isna()].index.min()
    if pd.isna(first_empty):
        # NOTE(review): a checkpoint with no missing text also lands here, so a
        # finished file restarts at 0 - confirm that this is the intended behaviour.
        first_empty = 0
    return frame, first_empty


# `index` is the row the generation cells below work on.
df_summary, index = _load_or_init_progress(summary_file, "summary")
df_questions, first_empty_index = _load_or_init_progress(question_file, "question")

# The two checkpoints must be filled in lock-step.
if index != first_empty_index:
    print("The index in the summary and questions files do not match. This needs to be fixed")
else:
    print(f"Picking up at index {index}")


Picking up at index 0


In [57]:
# Chat model used for summarising. NOTE: the name `model` is rebound to the
# embedding model in the embedding cell further down.
model = "gpt-4o"

reference = df_summary.iloc[index]["section_reference"]

# Show the source text so the generated summary can be checked against it.
reg_text = doc.get_text(reference)
print("##############")
print(reg_text)
print("##############")

model_summary, model_questions = get_summary_and_questions_for(openai_client = openai_client, text = reg_text, model = model)

# Emit ready-to-paste assignment statements. `!r` produces a valid Python
# literal even when the text contains quotes or newlines; wrapping the raw
# text in "..." would yield broken code in those cases.
# presumably model_questions is a single "|"-separated string - TODO confirm
print(f'# index: {index}, section_reference: {reference}')
print()
print(f'df_summary.loc[index, "text"] = {model_summary!r}')
print()
print(f'df_questions.loc[index, "text"] = {model_questions!r}')

##############
7 INTERNATIONAL DATA TRANSFERS FOR SCIENTIFIC RESEARCH PURPOSES 
58. Within the context of research and specifically in the context of the COVID-19 pandemic, there will probably be a need for international cooperation that may also imply international transfers of health data for the purpose of scientific research outside of the EEA. 
59. When personal data is transferred to a non-EEA country or international organisation, in addition to complying with the rules set out in GDPR,[^19] especially its Articles 5 (data protection principles), Article 6 (lawfulness) and Article 9 (special categories of data),[^20] the data exporter shall also comply with Chapter V (data transfers).[^21] 
60. In addition to the regular transparency requirement as mentioned on page 7 of the present guidelines,  a duty rests on the data exporter to inform data subjects that it intends to transfer personal data to a third country or international organisation. This includes information about the 

In [59]:
# Display the repr of the generated summary (escapes such as \n are visible).
model_summary

"When transferring health data for scientific research outside the EEA during the COVID-19 pandemic, you must comply with GDPR Articles 5, 6, 9, and Chapter V. Inform individuals about the transfer, including whether an adequacy decision by the European Commission exists or whether other safeguards or exceptions apply. Assess risks to individuals' rights and freedoms, favouring solutions that protect these rights continuously.\n\nWithout an adequacy decision or appropriate safeguards, you may use specific exceptions under Article 49, like important reasons of public interest or explicit consent. During the COVID-19 crisis, public interest in fighting the pandemic can justify transfers. Transfers by private entities for medical research may rely on explicit consent. Public and private entities can use these exceptions temporarily due to the urgent medical situation.\n\nFor repetitive data transfers as part of long-term research projects, appropriate safeguards are required under Article

In [91]:
# index: 14, section_reference: 7
# Hand-reviewed text for the row at `index`. The questions are stored as one
# "|"-separated string and split into separate rows later in the notebook.

df_summary.loc[index, "text"] = "When transferring health data for scientific research outside the EEA during the COVID-19 pandemic, you must comply with GDPR Articles 5, 6, 9, and Chapter V. Inform individuals about the transfer, including whether an adequacy decision by the European Commission exists or whether other safeguards or exceptions apply. Assess risks to individuals' rights and freedoms, favouring solutions that protect these rights continuously.\n\nWithout an adequacy decision or appropriate safeguards, you may use specific exceptions under Article 49, like important reasons of public interest or explicit consent. During the COVID-19 crisis, public interest in fighting the pandemic can justify transfers. Transfers by private entities for medical research may rely on explicit consent. Public and private entities can use these exceptions temporarily due to the urgent medical situation.\n\nFor repetitive data transfers as part of long-term research projects, appropriate safeguards are required under Article 46. Consider the roles and obligations (controller, processor, joint controller) of all involved parties to determine suitable measures for framing the transfer."

df_questions.loc[index, "text"] = "What is needed for international data transfers for scientific research during a pandemic?|What should you do when transferring personal data to a country outside the European Economic Area (EEA) for research purposes?"



# Move on to the next row and report progress. This cell is not idempotent:
# every run advances `index` by one.
index += 1
total_sections = len(df_summary)
print(f"You have completed {(index / total_sections * 100):.2f}% of the work")
if index >= total_sections:
    print("All done")
else:
    next_reference = df_summary.iloc[index]['section_reference']
    print(f"The next section is:\n{doc.get_text(next_reference)}")

You have completed 93.75% of the work
The next section is:
8 SUMMARY 
69. The key findings of these guidelines are: 
    1. The GDPR provides special rules for the processing of health data for the purpose of scientific research that are also applicable in the context of the COVID-19 pandemic.  
    2. The national legislator of each Member State may enact specific laws pursuant to Article (9) (2) (i) and (j) GDPR to enable the processing of health data for scientific research purposes. The processing of health data for the purpose of scientific research must also be covered by one of the legal bases in Article 6 (1) GDPR. Therefore, the conditions and the extent for such processing varies depending on the enacted laws of the particular member state. 
    3. All enacted laws based on Article (9) (2) (i) and (j) GDPR must be interpreted in the light of the principles pursuant to Article 5 GDPR and in consideration of the jurisprudence of the ECJ. In particular, derogations and limitatio

In [102]:
# Remove the document's closing "8 SUMMARY" section from both checkpoints.
# Matching on the section reference instead of the positional label 15 makes
# the cell safe to re-run and a no-op when the section is already absent.
# NOTE(review): row 15 held section 8 when this cell was last executed
# (see the preceding cell's output) - confirm that "8" is the intended target.
SUMMARY_SECTION_REFERENCE = "8"

df_questions = df_questions[
    df_questions["section_reference"].astype(str) != SUMMARY_SECTION_REFERENCE
].reset_index(drop=True)

df_summary = df_summary[
    df_summary["section_reference"].astype(str) != SUMMARY_SECTION_REFERENCE
].reset_index(drop=True)


In [107]:
# Display the questions checkpoint for a visual check.
df_questions

Unnamed: 0,section_reference,text,source,embedding,document
0,2,Are there examples of the application of GDPR ...,question,,CovidHealth
1,3.1,What is considered 'data concerning health'?|W...,question,,CovidHealth
2,3.2,"What is considered ""processing for the purpose...",question,,CovidHealth
3,3.3,"What is ""primary use"" of health data in scient...",question,,CovidHealth
4,4,What must all processing of health data comply...,question,,CovidHealth
5,4.1,How should consent for processing health data ...,question,,CovidHealth
6,4.2,What are the permissible legal bases for proce...,question,,CovidHealth
7,5.1,Why is it important to inform individuals abou...,question,,CovidHealth
8,5.1.1,"In a pandemic, can the timelines to notify ind...",question,,CovidHealth
9,5.1.2,When can you be exempt from providing informat...,question,,CovidHealth


In [105]:
# Persist both checkpoints with identical CSV settings.
csv_options = dict(sep="|", encoding="utf-8", index=False, na_rep="")
df_summary.to_csv(summary_file, **csv_options)
df_questions.to_csv(question_file, **csv_options)

In [106]:
import pandas as pd
# Re-read both checkpoints from disk, summaries first, then questions.
df_summary, df_questions = (
    pd.read_csv(csv_path, sep="|", encoding="utf-8")
    for csv_path in (summary_file, question_file)
)

In [108]:
# Turn each "|"-separated question string into one row per question. The
# "text" column is dropped and re-added so it ends up as the last column.
question_lists = df_questions["text"].str.split("|")
df_questions = (
    df_questions.drop(columns="text")
    .assign(text=question_lists)
    .explode("text")
)
# Discard empty fragments (e.g. from a trailing "|") and renumber the rows.
df_questions = df_questions.loc[df_questions["text"] != ""].reset_index(drop=True)


In [109]:
# Stack summaries on top of questions into a single frame with a fresh 0..n-1 index.
frames_to_index = (df_summary, df_questions)
df_index = pd.concat(frames_to_index, ignore_index=True)
df_index

Unnamed: 0,section_reference,text,source,embedding,document
0,2,Data protection rules do not hinder measures t...,summary,,CovidHealth
1,3.1,Data concerning health refers to personal data...,summary,,CovidHealth
2,3.2,"""Processing for the purpose of scientific rese...",summary,,CovidHealth
3,3.3,"""Further processing"" refers to using data init...",summary,,CovidHealth
4,4,Processing personal health data must adhere to...,summary,,CovidHealth
5,4.1,Consent of the individual can be a legal basis...,summary,,CovidHealth
6,4.2,Article 6 (1) e or 6 (1) f in combination with...,summary,,CovidHealth
7,5.1,Personal data must be processed fairly and tra...,summary,,CovidHealth
8,5.1.1,When personal data have not been obtained dire...,summary,,CovidHealth
9,5.1.2,Data Protection Principles: Transparency and I...,summary,,CovidHealth


In [110]:
from regulations_rag.embeddings import get_ada_embedding
# Embedding settings. NOTE: `model` was the chat model in the generation cell;
# from here on it names the embedding model.
model = "text-embedding-3-large"
dimensions = 1024

#df_index = df_questions
df_index['embedding'] = pd.NA  # Initialize the column to hold NA values
df_index['embedding'] = df_index['embedding'].astype(object)  # Ensure the column type is object

# Embed in chunks so progress is visible while the API calls run.
increment = 10
total_rows = len(df_index)
for i in range(0, total_rows, increment):
    chunk = df_index.iloc[i:i+increment].copy()
    chunk["embedding"] = chunk["text"].apply(lambda x: get_ada_embedding(openai_client, x, model, dimensions))
    df_index.loc[chunk.index, "embedding"] = chunk["embedding"]
    # Cap at total_rows so the final, possibly shorter, chunk is not over-reported.
    print(f"Completed {min(i + increment, total_rows)} lines")


Completed 10 lines
Completed 20 lines
Completed 30 lines
Completed 40 lines
Completed 50 lines


In [111]:
# Write the finished index through save_parquet_data only. The earlier plain
# df_index.to_parquet(index_file) call targeted the same path and was
# overwritten straight away, which matches how the other save cells in this
# notebook work.
# NOTE(review): presumably save_parquet_data encrypts with `key` - confirm.
save_parquet_data(df_index, index_file, key)


In [112]:
gdpr_index = load_parquet_data("../inputs/index/gdpr.parquet", key)
covid_index = load_parquet_data("../inputs/index/covid_health.parquet", key)

# List every GDPR index entry that mentions "health" (case-insensitive).
# A boolean mask avoids rebinding the notebook-level `index` variable as a
# loop target, and na=False skips rows whose text is missing instead of
# failing on them.
mentions_health = gdpr_index["text"].str.lower().str.contains("health", regex=False, na=False)
for entry_text in gdpr_index.loc[mentions_health, "text"]:
    print(f"* {entry_text}")

* What is data concerning health?
* What specific purposes allow healthcare data to be processed without explicit consent?
* Processing personal data related to racial or ethnic origin, political opinions, religious or philosophical beliefs, trade union membership, genetic data, biometric data for identification, health, sex life, or sexual orientation is generally prohibited. However, exceptions include: 
1. If the data subject gives explicit consent for specific purposes, unless law states otherwise.
2. When necessary for employment, social security, and social protection obligations/rights, as authorised by law or a collective agreement with safeguards.
3. To protect the vital interests of the data subject or another person when the data subject cannot give consent.
4. By not-for-profit bodies with political, philosophical, religious, or trade union aims, for their members or persons in regular contact, without disclosing outside without consent.
5. When the data subject has made th

In [14]:
# NOTE(review): `online_index` is not defined anywhere in the visible notebook;
# this cell relies on state from an earlier kernel session.
# Store section references as strings before saving.
online_index['section_reference'] = online_index['section_reference'].astype(str)
# This bare expression is not the cell's last statement, so it displays nothing.
online_index
save_parquet_data(online_index, index_file, key)

In [89]:
# Save `index_df` as the Article 47 BCR index.
# NOTE(review): `index_df` is bound in other cells of this notebook; make sure
# it currently holds the Article 47 index before running this cell.
file = "../inputs/index/article_47_bcr.parquet"
#df_index.to_parquet(file, engine = "pyarrow")
save_parquet_data(index_df, file, key)

In [54]:
from src.index_tools import update_text_in_index, add_to_index

# NOTE(review): `article_47_index` is not loaded anywhere in the visible
# notebook; this cell depends on earlier kernel state.
index_df = article_47_index
#index_df = gdpr_index

# Replace the wording of one index entry.
text_to_change = "What points need to be addressed in the Binding Corporate Rules for Controllers (BCR-C)"
changed_text = "What points need to be addressed in the Binding Corporate Rules for controllers?"
# Uses whatever `model` / `dimensions` are currently bound to in the kernel;
# the embedding cell sets them to the embedding model and its dimension count.
embedding_model = model
embedding_dimensions = dimensions

index_df = update_text_in_index(openai_client = openai_client, index_df = index_df, text_to_change = text_to_change, changed_text = changed_text, embedding_model = embedding_model, embedding_dimensions = embedding_dimensions)

Unnamed: 0,section_reference,text,source,embedding,document
0,1.1,You can use binding corporate rules (BCRs) for...,summary,"[-0.009783994406461716, -0.06066862493753433, ...",Article_47_BCR
1,1.5,Binding Corporate Rules for controllers (BCR-C...,summary,"[-0.017316676676273346, -0.021807707846164703,...",Article_47_BCR
2,1.6,You must comply with EU data protection legisl...,summary,"[-0.07271231710910797, -0.06814079731702805, -...",Article_47_BCR
3,1.7,You must use Binding Corporate Rules (BCR) as ...,summary,"[-0.017403494566679, -0.08751912415027618, -0....",Article_47_BCR
4,1.8,Binding Corporate Rules (BCRs) need approval f...,summary,"[-0.004610632546246052, -0.043068330734968185,...",Article_47_BCR
...,...,...,...,...,...
82,3.6,What agreements must be made regarding the han...,question,"[-0.04174955189228058, -0.0986546203494072, -0...",Article_47_BCR
83,3.7,What expectations are set for members of Bindi...,question,"[-0.04039411619305611, -0.08921048790216446, -...",Article_47_BCR
84,3.8,What obligations are there to keep Binding Cor...,question,"[-0.04588606581091881, -0.06260394304990768, -...",Article_47_BCR
85,3.9,What definitions must be included in the Bindi...,question,"[-0.010924087837338448, -0.03873256593942642, ...",Article_47_BCR


In [86]:
import importlib
import src.index_tools
importlib.reload(src.index_tools)
from src.index_tools import add_to_index

# Add one extra question to the Article 47 index.
# Plain assignment: `index_df` and `article_47_index` refer to the same object here.
index_df = article_47_index
text = "What happens if local laws conflict with the standards of binding corporate rules?"
section_reference = "3.5.4.1"
source = "question"
document = "Article_47_BCR"

# `embedding_model` and `embedding_dimensions` are set in the update-text cell.
index_df = add_to_index(openai_client, index_df, text, section_reference, source, document, embedding_model, embedding_dimensions)


In [84]:
from src.index_tools import remove_from_index

# Remove an entry from the Article 47 index, identified by its text.
text_to_delete = "What happens if local laws conflict with the standards of binding corporate rules?"
article_47_index = remove_from_index(article_47_index, text_to_delete)

In [88]:
# Rebind `article_47_index` to the current `index_df`, then show all entries for
# section 3.5.4.1.
article_47_index = index_df
article_47_index[article_47_index["section_reference"] == "3.5.4.1"]

Unnamed: 0,section_reference,text,source,embedding,document
25,3.5.4.1,You must ensure that the Binding Corporate Rul...,summary,"[-0.018655141815543175, -0.05344023182988167, ...",Article_47_BCR
79,3.5.4.1,What should the Binding Corporate Rules-Contro...,question,"[-0.014255059882998466, -0.07780193537473679, ...",Article_47_BCR
87,3.5.4.1,What happens if local laws conflict with the s...,question,"[-0.02774774841964245, -0.06760360300540924, -...",Article_47_BCR


In [30]:
# Write `index_df` to the GDPR index file.
# NOTE(review): other cells bind `index_df` to the Article 47 index; confirm it
# holds the GDPR index before running, otherwise gdpr.parquet gets the wrong data.
save_parquet_data(index_df, "../inputs/index/gdpr.parquet", key)

In [31]:
# Show the rows whose document label is exactly "article_30_5" (lower-case).
index_df[index_df['document'] == "article_30_5"]

Unnamed: 0,text,embedding,source,section_reference,document
147,Under what conditions could you be exempt from...,"[-0.03439036011695862, -0.0008122794097289443,...",question,all,article_30_5


In [9]:
import os
from regulations_rag.standard_regulation_index import load_parquet_data, save_parquet_data
# os.getenv returns None when the environment variable is not set.
key = os.getenv('encryption_key_gdpr')

path_to_file = "../inputs/index/article_30_5.parquet"
# NOTE: `index` is used as an integer row counter earlier in this notebook;
# here the same name is rebound to the loaded index data.
index = load_parquet_data(path_to_file, key)

In [10]:
# NOTE(review): the rows displayed for this index use source "questions", while
# the checkpoints built earlier in this notebook use "question" - confirm which
# label consumers of the index expect.
#index["document"] = "Article_30_5"
index

Unnamed: 0,section_reference,text,source,embedding,document
0,all,"If you have fewer than 250 employees, you usua...",summary,"[-0.026057356968522072, 0.014008629135787487, ...",Article_30_5
1,all,What are the record-keeping exceptions for sma...,questions,"[0.007615841459482908, 0.02034611813724041, -0...",Article_30_5
2,all,When must smaller organisations maintain a rec...,questions,"[0.00046261242823675275, 0.006825227290391922,...",Article_30_5
3,all,What types of processing activities still requ...,questions,"[0.011152287945151329, 0.021851062774658203, -...",Article_30_5
4,all,Why is maintaining a record of processing acti...,questions,"[-0.011836612597107887, -0.011333354748785496,...",Article_30_5
5,all,What defines occasional processing?,questions,"[-0.0750560611486435, 0.04036270081996918, -0....",Article_30_5
6,all,What assistance is available for SMEs to manag...,questions,"[0.011198482476174831, 0.03537283092737198, -0...",Article_30_5
7,all,How does the principle of accountability relat...,questions,"[-0.045358121395111084, -0.04111029952764511, ...",Article_30_5


In [6]:
# Write the Article 30(5) index back to the file it was loaded from.
save_parquet_data(index, path_to_file, key)