In [1]:
import os
import logging

from abc import ABC, abstractmethod

from regulations_rag.corpus_index import DataFrameCorpusIndex
#from regulations_rag.rerank import RerankAlgos, rerank

# Register a custom 'DEV' level (numeric value 15, i.e. between the standard
# DEBUG=10 and INFO=20) and then create the logger for this module.
DEV_LEVEL = 15
logging.addLevelName(DEV_LEVEL, 'DEV')
logger = logging.getLogger(__name__)
import pandas as pd
from regulations_rag.file_tools import load_parquet_data
from regulations_rag.file_tools import save_parquet_data

from cemad_rag.cemad_corpus import CEMADCorpus

# Encryption key for the encrypted index files, read from the environment;
# None when the variable is not set.
key = os.environ.get('excon_encryption_key')


In [6]:
# Build the corpus object and the three tables the rest of the notebook uses:
# the section index (`index`), the definitions index (`definitions`) and the
# workflow index (`workflow`).
corpus = CEMADCorpus("./cemad_rag/documents/")
index_folder = "./inputs/index/"

# Section index: loaded through load_parquet_data with `key`.
# Frames are collected first and concatenated once, instead of re-copying the
# accumulated frame on every loop iteration.
list_of_index_files = ["ad_index.parquet", "ad_index_plus.parquet"]
# To load every file in the folder instead, iterate os.listdir(index_folder).
index_frames = [
    load_parquet_data(os.path.join(index_folder, filename), key)
    for filename in list_of_index_files
    if filename.endswith(".parquet")
]
# pd.concat raises on an empty list, so fall back to an empty frame.
index_df = pd.concat(index_frames, ignore_index=True) if index_frames else pd.DataFrame()

# Definitions index: read directly with pandas because the file is not encrypted.
list_of_definitions_index_files = ["ad_definitions.parquet"]
definitions_frames = [
    pd.read_parquet(os.path.join(index_folder, filename), engine="pyarrow")
    for filename in list_of_definitions_index_files
    if filename.endswith(".parquet")
]
df_index_df = (
    pd.concat(definitions_frames, ignore_index=True)
    if definitions_frames
    else pd.DataFrame()
)

user_type = "an Authorised Dealer (AD)" 
corpus_description = "South African \'Currency and Exchange Manual for Authorised Dealers\' (CEMAD)"

# `definitions` is the same object as `df_index_df`; the definition text is
# copied into a "text" column so both indexes expose a column of that name.
definitions = df_index_df
definitions["text"] = definitions["definition"]
index = index_df
workflow = pd.read_parquet(os.path.join(index_folder, "workflow.parquet"), engine="pyarrow")


In [5]:
# Show the column names currently present on the definitions frame.
list(definitions.columns)


['definition', 'source', 'embedding', 'document', 'section_reference']

In [7]:
# Print every expected column that is missing from the definitions frame;
# no output means all of them are present.
columns_in_dfns = ["embedding", "document", "section_reference", "text"]
present_dfn_columns = definitions.columns.to_list()
for column in columns_in_dfns:
    if column in present_dfn_columns:
        continue
    print(column)


In [4]:
# Print every expected column that is missing from the section index frame;
# no output means all of them are present.
columns_in_sections = ["embedding", "document", "section_reference", "source", "text"]
present_index_columns = index.columns.to_list()
for column in columns_in_sections:
    if column in present_index_columns:
        continue
    print(column)


In [8]:
# Display the extra index rows held in `df_plus`.
# NOTE(review): `df_plus` is not assigned in any cell visible in this notebook,
# so this cell presumably depends on state from a deleted or unsaved cell —
# confirm where it is created before relying on Restart & Run All.
#df_plus['document'] = "CEMAD_User_Queries"
df_plus

Unnamed: 0,section_reference,text,source,embedding,document
0,Z.1(A)(i),What does cemad say about commodities?,document,"[-0.05561697483062744, 0.001053479383699596, -...",CEMAD_User_Queries
1,Z.1(B)(i),What rules apply to residents travelling abroad?,document,"[-0.015550563111901283, 0.020780662074685097, ...",CEMAD_User_Queries
2,Z.2(A)(i),What version of the manual are you using?,document,"[-0.003231549635529518, -0.005595962051302195,...",CEMAD_User_Queries
3,Z.2(A)(i),What is the date CEMAD?,document,"[-0.06731810420751572, 0.007594701834022999, -...",CEMAD_User_Queries
4,Z.2(A)(i),What is the current version date of the manual?,document,"[-0.01723528653383255, 0.010368955321609974, -...",CEMAD_User_Queries


In [9]:
# Write `df_plus` to the ad_index_plus parquet file, passing `key` to the saver.
# NOTE(review): `df_plus` is not assigned in any visible cell, and this path
# starts with "../inputs" while the loading cell reads from "./inputs/index/" —
# confirm the intended working directory.
#save_parquet_data(df, "../inputs/index/ad_index.parquet", key)
ad_index_plus_path = "../inputs/index/ad_index_plus.parquet"
save_parquet_data(df_plus, ad_index_plus_path, key)


In [5]:
# Load the definitions index for editing; no key is passed to the loader here.
definitions_path = "../inputs/index/ad_definitions.parquet"
df_dfns = load_parquet_data(definitions_path)
#df_dfns["document"] = "CEMAD"
df_dfns


Unnamed: 0,definition,source,embedding,document,section_reference
0,"Treasury means, in relation to any matter cont...",all,"[0.012373767793178558, 0.03887529298663139, -0...",CEMAD,A.1
1,ADLA Manual means the Currency and Exchanges M...,all,"[0.018092811107635498, 0.010141296312212944, -...",CEMAD,A.1
2,"Affected person means a body corporate, founda...",all,"[-0.037813764065504074, 0.07012207806110382, -...",CEMAD,A.1
3,Africa means any country forming part of the A...,all,"[0.015041325241327286, 0.10257036238908768, -0...",CEMAD,A.1
4,"Authorised Dealer means, in relation to any tr...",all,"[0.02330564334988594, 0.032766155898571014, -0...",CEMAD,A.1
...,...,...,...,...,...
97,Local outsourcing is the subcontracting of bus...,cloud,"[-0.0006997798918746412, 0.012086263857781887,...",CEMAD,J.D(i)
98,International outsourcing is the subcontractin...,cloud,"[-0.00498924357816577, 0.01612837053835392, -0...",CEMAD,J.D(i)
99,Regulatory access to data refers to FinSurv's ...,cloud,"[-0.032946694642305374, 0.11769585311412811, -...",CEMAD,J.D(i)
100,System replication refers to duplication of tr...,cloud,"[0.0027270971331745386, 0.013970836065709591, ...",CEMAD,J.D(i)


In [6]:
# Section reference to assign for each value of the 'source' column.
section_reference_by_source = {
    'cloud': 'J.(D)(i)',
    'securities': 'G.(A)(ii)',
    'insurance': 'B.10.(B)',
    'all': 'A.1',
}

# Reset 'section_reference' to an empty string for every row (creating the
# column if it does not exist), then fill it in per source. Rows whose source
# is not listed above keep the empty string.
df_dfns['section_reference'] = ''
for source_name, reference in section_reference_by_source.items():
    df_dfns.loc[df_dfns['source'] == source_name, 'section_reference'] = reference



In [7]:
# Write the edited definitions frame to the ad_definitions parquet file.
# No key is passed to the saver here.
updated_definitions_path = "../inputs/index/ad_definitions.parquet"
save_parquet_data(df_dfns, updated_definitions_path)