There are lots of documents about GDPR. I need
1) A place for the original (pdf / web text etc)
2) A notebook to 
    a) convert the original into a dataframe; 
    b) create summaries and questions that can be added to the document index (QUESTION: is the index one file / table for all documents, or one per document?) 
   These notebooks are to be saved in the folder ./conversion_notebooks/
3) A document.py wrapper for the dataframe version of the document ./gdpr_rag/documents/
4) A naming convention that allows a script to check that each original document has been converted into a dataframe, has a document.py wrapper and has been added to the document index. The script should also check that there are no additional entries in the document index etc.


This workbook does the reconciliation described in point 4.

In [1]:
# Folder for the original documents (pdf / web text etc).
original_documentation = "./original/"
# Folder for the dataframe version of each document.
df_version_of_document = "./inputs/documents/"
# Folder for the document index file(s).
index_for_document = "./inputs/index/"
# Folder for the document.py wrappers around the dataframe versions.
python_wrappers = "./gdpr_rag/documents/"

In [2]:
import os
import ast

def find_class_names_in_files(directory):
    """Map each Python module in *directory* to the first class it defines.

    Only regular files whose name ends in ``.py`` are inspected; the search
    does not descend into sub-folders. Files that define no class are left
    out of the result.

    Args:
        directory: Path of the folder to scan.

    Returns:
        dict mapping the file name without its extension to the name of the
        first ``class`` statement found by ``ast.walk`` in that file.

    Raises:
        SyntaxError: If one of the files is not valid Python source.
        OSError: If the folder or one of its files cannot be read.
    """
    class_dict = {}
    for filename in os.listdir(directory):
        filepath = os.path.join(directory, filename)
        if not filename.endswith(".py") or not os.path.isfile(filepath):
            continue

        # Read raw bytes so that ast.parse applies Python's own source
        # encoding rules (UTF-8 by default, BOM and PEP 263 cookies honoured)
        # rather than whatever the platform locale happens to be.
        with open(filepath, "rb") as file:
            source = file.read()
        tree = ast.parse(source, filename=filepath)

        # Assuming one class per file: keep only the first class encountered.
        first_class = next(
            (node for node in ast.walk(tree) if isinstance(node, ast.ClassDef)),
            None,
        )
        if first_class is not None:
            module_name = os.path.splitext(filename)[0]
            class_dict[module_name] = first_class.name
    return class_dict

# Usage
# Scan the wrapper folder and print the file-name -> class-name mapping found there.
directory = python_wrappers  # Specify your folder path
class_names_dict = find_class_names_in_files(directory)
print(class_names_dict)


{'article_30_5': 'Article_30_5', 'article_47_bcr': 'Article_47_BCR', 'article_49_intl_transfer': 'Article_49_Intl_Transfer', 'codes': 'Codes', 'consent': 'Consent', 'covid_health': 'CovidHealth', 'covid_location': 'CovidLocation', 'data_breach': 'DataBreach', 'data_portability': 'DataPortability', 'decision_making': 'DecisionMaking', 'dpia': 'DPIA', 'dpo': 'DPO', 'forgotten': 'Forgotten', 'gdpr': 'GDPR', 'lead_sa': 'Lead_SA', 'online_services': 'OnlineServices', 'protection': 'Protection', 'territorial_scope': 'TerritorialScope', 'transparency': 'Transparency', 'video': 'Video'}


In [3]:
import os
import pandas as pd

def extract_root_names(folder_path):
    """Return the file names in *folder_path* with their last extension removed.

    Sub-folders are ignored; only entries that are regular files contribute a
    name. The order of the returned list follows ``os.listdir``.
    """
    stems = []
    for entry in os.listdir(folder_path):
        if not os.path.isfile(os.path.join(folder_path, entry)):
            continue
        stem, _extension = os.path.splitext(entry)
        stems.append(stem)
    return stems

# Root names (file name minus extension) present in each artefact folder.
originals = set(extract_root_names(original_documentation))
df_versions = set(extract_root_names(df_version_of_document))
wrappers = set(extract_root_names(python_wrappers))
# Read the index folder itself. This was previously built from
# df_version_of_document, which made the originals-vs-indexes comparison a
# repeat of the originals-vs-df_versions one.
# NOTE(review): assumes one index file per document, named after the document - confirm.
indexes = set(extract_root_names(index_for_document))

# A dataframe without a wrapper class is reported rather than raising a
# KeyError, so the rest of the reconciliation can still run.
df_versions_without_class = df_versions.difference(class_names_dict)
if df_versions_without_class:
    print(f"Items in df_versions that have no wrapper class: {df_versions_without_class}")
classes_in_df_version = {
    class_names_dict[df_version_name]
    for df_version_name in df_versions - df_versions_without_class
}

import os
from regulations_rag.standard_regulation_index import load_parquet_data, save_parquet_data

# Collect every distinct value of the "document" column across all files in
# the index folder.
key = os.getenv('encryption_key_gdpr')
names_of_documents_in_index_data = set()
for file in os.listdir(index_for_document):
    df = load_parquet_data(os.path.join(index_for_document, file), key)
    documents_referenced = set(df["document"].to_list())
    # NOTE: a per-file check of documents_referenced against `originals` was
    # sketched here at one point; it is not active.
    names_of_documents_in_index_data.update(documents_referenced)

# Set to False to silence the dump of every collected set.
print_all = True
if print_all:
    summary_lines = [
        f"originals : {originals}",
        f"df_versions : {df_versions}",
        f"classes_in_df_version : {classes_in_df_version}",
        f"wrappers : {wrappers}",
        f"indexes : {indexes}",
        f"names_of_documents_in_index_data : {names_of_documents_in_index_data}",
    ]
    print("\n".join(summary_lines))



originals : {'video', 'online_services', 'territorial_scope', 'covid_health', 'lead_sa', 'article_30_5', 'data_portability', 'article_49_intl_transfer', 'transparency', 'dpo', 'covid_location', 'protection', 'codes', 'article_47_bcr', 'data_breach', 'gdpr', 'consent', 'forgotten', 'decision_making', 'dpia'}
df_versions : {'video', 'online_services', 'territorial_scope', 'covid_health', 'lead_sa', 'article_30_5', 'data_portability', 'article_49_intl_transfer', 'transparency', 'dpo', 'covid_location', 'protection', 'codes', 'article_47_bcr', 'data_breach', 'gdpr', 'consent', 'forgotten', 'decision_making', 'dpia'}
classes_in_df_version : {'OnlineServices', 'TerritorialScope', 'DPO', 'CovidLocation', 'Protection', 'GDPR', 'Video', 'DataPortability', 'Lead_SA', 'Article_49_Intl_Transfer', 'CovidHealth', 'Consent', 'DataBreach', 'Article_47_BCR', 'Forgotten', 'Transparency', 'Codes', 'Article_30_5', 'DPIA', 'DecisionMaking'}
wrappers : {'video', 'online_services', 'territorial_scope', 'covi

In [4]:
def _report_set_differences(left, right, only_left_message, only_right_message, match_message):
    """Print how two sets differ and return ``(left - right, right - left)``.

    Both directions are reported independently, so a mismatch on one side
    never hides a mismatch on the other. ``match_message`` is printed only
    when the two sets are identical.
    """
    only_left = left - right
    only_right = right - left
    if only_left:
        print(f"{only_left_message}: {only_left}")
    if only_right:
        print(f"{only_right_message}: {only_right}")
    if not only_left and not only_right:
        print(match_message)
    return only_left, only_right


# Every original must have a dataframe version, and vice versa.
only_in_originals, not_in_originals = _report_set_differences(
    originals,
    df_versions,
    "Items in original that are not in df_versions",
    "Items in df_versions that are not in original",
    "- originals and df_versions match",
)

# Every original must have a python wrapper, and vice versa.
only_in_originals, not_in_originals = _report_set_differences(
    originals,
    wrappers,
    "Items in original that are not in wrapper",
    "Items in wrapper that are not in original",
    "- originals and wrappers match",
)

# Every original must be indexed, and vice versa.
only_in_originals, not_in_originals = _report_set_differences(
    originals,
    indexes,
    "Items in original that are not indexed",
    "Items indexed that are not in original",
    "- originals and indexes match",
)

# The class names of the wrappers must be exactly the document names that
# appear inside the index data. The second message now prints the index-only
# names; it previously printed the stale not_in_originals set.
classes_only_in_files, names_only_in_index = _report_set_differences(
    classes_in_df_version,
    names_of_documents_in_index_data,
    "Class names in python documents that are not in names_of_documents_in_index_data",
    "Index classes that do not have python document wrappers",
    "- class names in python documents and index database match",
)


- originals and df_versions match
- originals and wrappers match
- originals and indexes match
- class names in python documents and index database match


### Now import the class into gdpr_rag/gdpr_corpus.py

In [5]:
file_path = '../gdpr/gdpr_rag/gdpr_corpus.py'


def add_import_to_gdpr_corpus(new_py_file_name, class_name, file_path=file_path):
    """Add ``from gdpr_rag.documents.<module> import <class>`` to a Python file.

    The statement is inserted directly after the last top-level line that
    starts with ``from gdpr_rag.documents``. If there is no such line it goes
    after the last top-level ``import``/``from`` line, and if the file has no
    imports at all it goes at the very top. Calling the function again with
    the same arguments leaves the file untouched.

    The scan is line based, so the existing imports are expected to be
    single-line statements.

    Args:
        new_py_file_name: Module name inside ``gdpr_rag.documents`` (no ``.py``).
        class_name: Name of the class to import from that module.
        file_path: File to edit; defaults to the module-level ``file_path``.

    Raises:
        OSError: If the file cannot be read or written.
    """
    new_import = f"from gdpr_rag.documents.{new_py_file_name} import {class_name}\n"

    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    # Re-running the notebook cell must not pile up duplicate imports.
    if any(line.strip() == new_import.strip() for line in lines):
        return

    # None means "not found", which is distinct from "found on the first line".
    last_documents_import = None
    last_any_import = None
    for i, line in enumerate(lines):
        if line.startswith('from gdpr_rag.documents'):
            last_documents_import = i
        elif line.startswith(('import ', 'from ')):
            last_any_import = i
    anchor = last_documents_import if last_documents_import is not None else last_any_import

    if anchor is None:
        insert_at = 0
    else:
        insert_at = anchor + 1
        # The anchor may be the final line of a file with no trailing newline;
        # terminate it so the two statements do not end up on one line.
        if not lines[anchor].endswith('\n'):
            lines[anchor] += '\n'

    lines.insert(insert_at, new_import)

    # Write the updated lines back to the file
    with open(file_path, 'w', encoding='utf-8') as file:
        file.writelines(lines)

# Usage
# Adds `from gdpr_rag.documents.protection import Protection` to the file at the default file_path.
add_import_to_gdpr_corpus('protection', 'Protection')