In [1]:
import logging

from abc import ABC, abstractmethod

from regulations_rag.corpus_index import DataFrameCorpusIndex
from regulations_rag.rerank import RerankAlgos, rerank

# Register a custom 'DEV' log level; 15 sits between the standard DEBUG (10) and INFO (20).
DEV_LEVEL = 15
logging.addLevelName(DEV_LEVEL, 'DEV')

# Module-level logger, named after the module it is created in.
logger = logging.getLogger(__name__)


import os
from regulations_rag.embeddings import get_closest_nodes, num_tokens_from_string
from regulations_rag.file_tools import load_parquet_data

from gdpr_rag.gdpr_corpus import GDPRCorpus
import pandas as pd

# Column names used to build the (initially empty) workflow DataFrame.
required_columns_workflow = ["workflow", "text", "embedding"]

# Key handed to load_parquet_data when reading the index files.
# Read from the environment; None when the variable is not set.
key = os.environ.get('encryption_key_gdpr')



In [8]:
# Load the GDPR corpus and the index files that accompany it.
corpus = GDPRCorpus("./gdpr_rag/documents/")
index_folder = "./inputs/index/"

# Read every parquet file in the index folder. Frames are collected in a list and
# concatenated once: growing a DataFrame with pd.concat inside the loop copies all
# previously loaded rows on every iteration.
# sorted(): os.listdir returns entries in arbitrary order, which would make the row
# order (and therefore the integer index labels) differ between runs/machines.
index_frames = []
for filename in sorted(os.listdir(index_folder)):
    if filename.endswith(".parquet"):
        filepath = os.path.join(index_folder, filename)
        index_frames.append(load_parquet_data(filepath, key))

# pd.concat raises ValueError on an empty list, so fall back to an empty frame
# (what the loop-based version produced when no parquet file was found).
index_df = pd.concat(index_frames, ignore_index=True) if index_frames else pd.DataFrame()

user_type = "a Controller"
corpus_description = "the General Data Protection Regulation (GDPR)"

# Split the combined index: rows whose source is 'definitions' vs. everything else.
# Deep copies so later column additions do not touch index_df.
is_definition = index_df['source'] == 'definitions'
definitions = index_df[is_definition].copy(deep=True)
index = index_df[~is_definition].copy(deep=True)

# Empty workflow table with the expected columns.
workflow = pd.DataFrame([], columns=required_columns_workflow)


In [12]:
# Fetch the GDPR document so each definition's text can be looked up by its section reference.
doc = corpus.get_document("GDPR")


def _definition_text(section_reference):
    """Return the text of one section, requested without markdown decorators or headings."""
    return doc.get_text(
        section_reference=section_reference,
        add_markdown_decorators=False,
        add_headings=False,
    )


raw_definition_text = definitions['section_reference'].apply(_definition_text)

# Strip the leading list marker: optional whitespace, digits, a full stop, then any whitespace.
definitions['definition'] = raw_definition_text.str.replace(r'^\s*\d+\.\s*', '', regex=True)

definitions



Unnamed: 0,section_reference,text,source,embedding,document,definition
1005,4(1),What is personal data?,definitions,"[-0.03953353688120842, 0.04642977565526962, -0...",GDPR,'personal data' means any information relating...
1006,4(2),What is processing?,definitions,"[-0.03899864852428436, 0.04007093608379364, -0...",GDPR,'processing' means any operation or set of ope...
1007,4(3),What is restriction of processing?,definitions,"[-0.052392441779375076, -0.03844074532389641, ...",GDPR,'restriction of processing' means the marking ...
1008,4(4),What is profiling?,definitions,"[-0.03238030523061752, -0.04825564846396446, -...",GDPR,'profiling' means any form of automated proces...
1009,4(5),What is pseudonymisation?,definitions,"[-0.04147166386246681, -0.031792495399713516, ...",GDPR,'pseudonymisation' means the processing of per...
1010,4(6),What is a filing system?,definitions,"[-0.035739850252866745, 0.048365894705057144, ...",GDPR,'filing system' means any structured set of pe...
1011,4(7),What is a controller?,definitions,"[-0.022157778963446617, 0.011915236711502075, ...",GDPR,'controller' means the natural or legal person...
1012,4(8),What is a processor?,definitions,"[-0.020444175228476524, 0.019522828981280327, ...",GDPR,"'processor' means a natural or legal person, p..."
1013,4(9),What is a recipient?,definitions,"[-0.021122010424733162, 0.010544460266828537, ...",GDPR,"'recipient' means a natural or legal person, p..."
1014,4(10),What is a third party?,definitions,"[-0.011242421343922615, -0.06827251613140106, ...",GDPR,"'third party' means a natural or legal person,..."


In [11]:
# Show the full text of the first definition (the table view truncates long strings).
definitions['definition'].iloc[0]

"    1. 'personal data' means any information relating to an identified or identifiable natural person ('data subject'); an identifiable natural person is one who can be identified, directly or indirectly, in particular by reference to an identifier such as a name, an identification number, location data, an online identifier or to one or more factors specific to the physical, physiological, genetic, mental, economic, cultural or social identity of that natural person;\n"

In [7]:
# Scratch check of the list-marker regex on toy data.
# A dedicated name is used on purpose: binding the toy frame to `definitions` would
# overwrite the real definitions table and break every later cell that reads it.
sample_definitions = pd.DataFrame(
    {'definition': ['1. This is a definition', '12. Another definition']}
)

# Remove the leading number, full stop, and space. The pattern also tolerates leading
# indentation and numbers of any length, so the toy check covers text shaped like the
# real GDPR definitions (e.g. "    1. 'personal data' means ...").
sample_definitions['definition'] = sample_definitions['definition'].str.replace(
    r'^\s*\d+\.\s*', '', regex=True
)

# Fail loudly if the pattern ever stops stripping the marker.
assert sample_definitions['definition'].tolist() == [
    'This is a definition',
    'Another definition',
]

print(sample_definitions)

             definition
0  This is a definition
1    Another definition


In [8]:
# Columns the definitions table is expected to contain.
columns_in_dfns = ["embedding", "document", "section_reference", "text"]

# Print the name of every expected column that is absent; prints nothing when all are present.
dfn_columns = definitions.columns
for column in columns_in_dfns:
    if column in dfn_columns:
        continue
    print(column)


In [12]:
# Columns the section index is expected to contain.
columns_in_sections = ["embedding", "document", "section_reference", "source", "text"]

# Print the name of every expected column that is absent; prints nothing when all are present.
index_columns = index.columns
for column in columns_in_sections:
    if column in index_columns:
        continue
    print(column)
