In [2]:
import importlib

import src.valid_index
importlib.reload(src.valid_index)
from src.valid_index import get_excon_manual_index

import src.file_tools
importlib.reload(src.file_tools)
from src.file_tools import read_processed_regs_into_dataframe, get_regulation_detail

import src.embeddings
importlib.reload(src.embeddings)
from src.embeddings import get_ada_embedding

import src.tree_tools
importlib.reload(src.tree_tools)
from src.tree_tools import build_tree_for_regulation, split_tree

import src.summarise_and_question
importlib.reload(src.summarise_and_question)
from src.summarise_and_question import get_summary_and_questions_for


# Processed manual text files to load, in the order they are passed to the reader.
file_list = [
    './manual/excon_manual_A_F.txt',
    './manual/excon_manual_G_K.txt',
]

# Labels handed to the reader as `non_text_labels`.
non_text_labels = ['Table', 'Formula', 'Example', 'Definition']

# Build the index checker, then read both files into a dataframe plus the
# second value the reader returns (bound here as `non_text`).
index_adla = get_excon_manual_index()
df_adla, non_text = read_processed_regs_into_dataframe(
    file_list=file_list,
    valid_index_checker=index_adla,
    non_text_labels=non_text_labels,
)


In [3]:
import re

def format_text_line(line):
    """Normalise one table row to the form ``"NNN | NN | text"``.

    The line must begin with a three-digit number. It may be followed by an
    optional ``|`` separator, an optional two-digit number, another optional
    ``|`` separator and then free text. Whitespace around the separators is
    ignored.

    Args:
        line: The raw row text.

    Returns:
        The three-digit number, the two-digit number (an empty string when it
        is absent) and the remaining text, joined by ``" | "``.

    Raises:
        ValueError: If the line does not start with three digits.
    """
    parsed = re.match(r'^(\d{3})\s*(\|\s*)?(\d{2})?\s*\|?\s*(.*)$', line)
    if parsed is None:
        raise ValueError(f"Input line does not conform to the standard: {line}")

    # The second group only consumes the optional first separator; it is not used.
    category, _separator, sub_category, description = parsed.groups()

    # An unmatched optional group comes back as None; render it as an empty field.
    return " | ".join([category, sub_category or '', description])


In [4]:
import re

# Markers that open and close a table block; only table numbers 9 and 10-99 are
# captured. Both patterns are applied with `match`, so they are anchored at the
# start of the line only. An end marker therefore also satisfies `start_pattern`,
# which is why the loop below rules out the end marker before treating a line
# as a start.
start_pattern = re.compile(r"\s*#Table ([1-9][0-9]|[9])\s*")
end_pattern = re.compile(r"\s*#Table ([1-9][0-9]|[9]) - end\s*")

# Array to hold the extracted blocks. Each block is a list of lines: the line
# preceding the start marker (when there is one), the start marker, the
# formatted rows, and the end marker.
blocks = []
current_block = []
previous_line = None
inside_block = False
# Table number (as text) of the block being collected. Initialised so that an
# end marker seen before any start marker is reported rather than raising
# NameError.
current_block_number = None

# Table number expected at the next start marker; advanced once per start marker.
counter = 9

with open('./manual/excon_manual_G_K.txt', 'r', encoding='utf-8') as file:
    for line in file:
        start_match = start_pattern.match(line)
        # Check if the line is a start of a block
        if start_match and not end_pattern.match(line):
            # The line just before the marker is kept as the block's heading.
            if previous_line is not None:
                current_block.append(previous_line.strip())
            current_block.append(line.strip())
            inside_block = True
            current_block_number = start_match.group(1)
            # The captured number is text: convert before comparing, and flag
            # the block when it does NOT equal the expected sequence number.
            mismatch = int(current_block_number) != counter
            if mismatch:
                print(f'Block {current_block_number}')
            counter += 1
        # Check if the line is an end of a block
        elif end_match := end_pattern.match(line):
            if end_match.group(1) == current_block_number:
                current_block.append(line.strip())
                blocks.append(current_block)
                current_block = []
                inside_block = False
            else:
                # End marker does not close the block that is currently open.
                print(f"Huston we have a problem: {line}")
        elif inside_block:
            # Add line to current block if we are inside a block; rows that do
            # not fit the expected layout are reported and left out.
            try:
                current_block.append(format_text_line(line.strip()))
            except ValueError:
                print(f'{current_block_number}: {line.strip()}')
        # Store the line as previous for the next iteration
        previous_line = line

# blocks now contains the extracted text blocks for tables numbered 9 to 99


In [5]:
# Sanity-check the layout of every extracted block: element 0 must be a heading
# ending in "see table N", element 1 the start marker and the last element the
# end marker. Anything that does not fit is printed with a label.
heading_pattern = re.compile(r" see table \d+$", re.IGNORECASE)
start_pattern = re.compile(r"\s*#Table ([1-9][0-9]|[9])\s*")
end_pattern = re.compile(r"\s*#Table ([1-9][0-9]|[9]) - end\s*")

for current_block in blocks:
    # Heading line
    if heading_pattern.search(current_block[0]) is None:
        print(f"Heading {current_block[0]}")

    # Start marker
    if start_pattern.search(current_block[1]) is None:
        print(f"Start {current_block[1]}")

    # End marker
    if end_pattern.search(current_block[-1]) is None:
        print(f"End {current_block[-1]}")


In [6]:
# Flatten the extracted blocks into rows of
# [category, sub-category, description, section, direction].
all_codes = []

# Heading line of a block: "<section description> see table <number>".
pattern = re.compile(r"(.*) see table (\d+)$", re.IGNORECASE)

# Tables numbered up to and including this value are labelled "Inward";
# higher-numbered tables are labelled "Outward".
LAST_INWARD_TABLE = 54

for current_block in blocks:
    match = pattern.search(current_block[0])
    if not match:
        # Without a heading there is no section or direction for these rows.
        # Skip the block instead of silently reusing the values left over from
        # the previous block (or failing with NameError on the first block).
        print("Pattern not found:", current_block[0])
        continue

    section_description = match.group(1)
    table_number = int(match.group(2))
    current_code_action = "Inward" if table_number <= LAST_INWARD_TABLE else "Outward"

    # Elements 0 and 1 are the heading and the start marker, the last element
    # is the end marker; everything in between is a "NNN | NN | text" row.
    for row in current_block[2:-1]:
        current_code = [item.strip() for item in row.split("|")]
        if len(current_code) != 3:
            print("we have a problem")
        current_code.append(section_description)
        current_code.append(current_code_action)
        all_codes.append(current_code)


In [7]:
# Number of code rows collected across all table blocks.
len(all_codes)

636

In [8]:
import pandas as pd

# Column names, in the same order as the five fields of each row in all_codes.
code_columns = [
    "Category",
    "Sub-category",
    "Category Description",
    "Section",
    "Inward or Outward",
]
df = pd.DataFrame(all_codes, columns=code_columns)


In [39]:
# Show only the category, sub-category and description columns.
df[["Category", "Sub-category", "Category Description"]]

Unnamed: 0,Category,Sub-category,Category Description
0,100,,Adjustments / Reversals / Refunds applicable t...
1,101,01,Export advance payment (excluding capital good...
2,101,02,Export advance payment - capital goods
3,101,03,Export advance payment – gold
4,101,04,Export advance payment – platinum
...,...,...,...
631,832,,Not allocated
632,833,,Credit/Debit card company settlement as well a...
633,834,,Not allocated
634,835,,Not allocated


In [211]:
# Compute an embedding for every category description in one pass by applying
# get_ada_embedding (imported from src.embeddings) to each row.
# NOTE(review): presumably each call goes to a remote embedding service, so this
# one-shot form gives no progress feedback — confirm. A batched version of the
# same computation, with progress output, appears further down the notebook.
df["Embedding"] = df["Category Description"].apply(get_ada_embedding)

: 

In [10]:
# Display the assembled frame (the recorded output includes the Embedding column).
df

Unnamed: 0,Category,Sub-category,Category Description,Section,Inward or Outward,Embedding
0,100,,Adjustments / Reversals / Refunds applicable t...,Transaction adjustments,Inward,"[-0.006666785106062889, 0.001338382251560688, ..."
1,101,01,Export advance payment (excluding capital good...,Exports : Advance payments,Inward,"[-0.004169201944023371, -0.01017824001610279, ..."
2,101,02,Export advance payment - capital goods,Exports : Advance payments,Inward,"[-0.005698998924344778, -0.014980997890233994,..."
3,101,03,Export advance payment – gold,Exports : Advance payments,Inward,"[0.0017920573009178042, -0.001992132980376482,..."
4,101,04,Export advance payment – platinum,Exports : Advance payments,Inward,"[-0.004406240303069353, -0.00664734560996294, ..."
...,...,...,...,...,...,...
631,832,,Not allocated,Miscellaneous payments,Outward,"[-0.01834322139620781, -0.021168287843465805, ..."
632,833,,Credit/Debit card company settlement as well a...,Miscellaneous payments,Outward,"[-0.012952337972819805, -0.0034863881301134825..."
633,834,,Not allocated,Miscellaneous payments,Outward,"[-0.018370389938354492, -0.021169312298297882,..."
634,835,,Not allocated,Miscellaneous payments,Outward,"[-0.01834322139620781, -0.021168287843465805, ..."


In [9]:
# Embed the category descriptions in batches of `increment` rows, writing each
# batch back into df["Embedding"] and printing progress after every batch.
increment = 10
total_rows = len(df)
for i in range(0, total_rows, increment):
    chunk = df.iloc[i:i+increment].copy()
    chunk["Embedding"] = chunk["Category Description"].apply(get_ada_embedding)
    # Assignment aligns on the chunk's index labels.
    df.loc[chunk.index, "Embedding"] = chunk["Embedding"]
    # The last batch can be shorter than `increment`; report the number of rows
    # actually processed rather than overshooting the length of the frame.
    print(f"Completed {min(i + increment, total_rows)} lines")


Completed 10 lines
Completed 20 lines
Completed 30 lines
Completed 40 lines
Completed 50 lines
Completed 60 lines
Completed 70 lines
Completed 80 lines
Completed 90 lines
Completed 100 lines
Completed 110 lines
Completed 120 lines
Completed 130 lines
Completed 140 lines
Completed 150 lines
Completed 160 lines
Completed 170 lines
Completed 180 lines
Completed 190 lines
Completed 200 lines
Completed 210 lines
Completed 220 lines
Completed 230 lines
Completed 240 lines
Completed 250 lines
Completed 260 lines
Completed 270 lines
Completed 280 lines
Completed 290 lines
Completed 300 lines
Completed 310 lines
Completed 320 lines
Completed 330 lines
Completed 340 lines
Completed 350 lines
Completed 360 lines
Completed 370 lines
Completed 380 lines
Completed 390 lines
Completed 400 lines
Completed 410 lines
Completed 420 lines
Completed 430 lines
Completed 440 lines
Completed 450 lines
Completed 460 lines
Completed 470 lines
Completed 480 lines
Completed 490 lines
Completed 500 lines
Completed

In [11]:
# Persist the frame to a Parquet file using the pyarrow engine.
df.to_parquet("./inputs/bopcodes.parquet", engine="pyarrow")

In [12]:
import pandas as pd
# Load the saved Parquet file into df_codes using the pyarrow engine.
df_codes = pd.read_parquet("./inputs/bopcodes.parquet", engine="pyarrow")

In [37]:
from src.embeddings import get_closest_nodes
# Embed a free-text query and ask get_closest_nodes for the rows of df_codes
# whose "Embedding" column is closest to it.
# NOTE(review): `threshold` looks like a maximum cosine distance (the recorded
# result has a cosine_distance column with values below 0.25) — confirm against
# get_closest_nodes.
question = "shares"
question_embedding = get_ada_embedding(question)
closest_nodes = get_closest_nodes(df_codes, "Embedding", question_embedding, threshold = 0.25)

In [38]:
# Display the rows returned by get_closest_nodes for the query.
closest_nodes

Unnamed: 0,Category,Sub-category,Category Description,Section,Inward or Outward,Embedding,cosine_distance
457,301,,Dividends,Income payments,Outward,"[-0.016767853870987892, -0.014169476926326752,...",0.172162
138,301,,Dividends,Income receipts,Inward,"[-0.016767853870987892, -0.014169476926326752,...",0.172162
198,511,01,Disinvestment of capital by a resident individ...,Disinvestment of capital,Inward,"[-0.0011835031909868121, -0.02320314757525921,...",0.183136
257,605,08,Disinvestment of shares by resident - Financia...,Disinvestment by a resident corporate entity,Inward,"[-0.0016097904881462455, -0.02383747324347496,...",0.195130
145,308,,Rental,Income receipts,Inward,"[-0.014055502600967884, -0.013633969239890575,...",0.195489
...,...,...,...,...,...,...,...
629,830,,Details of payments not classified,Miscellaneous payments,Outward,"[-0.0042205145582556725, 0.011468203738331795,...",0.249661
90,241,,Repairs and maintenance on machinery and equip...,Technical related services,Inward,"[-0.013513672165572643, -0.02152956835925579, ...",0.249721
409,241,,Repairs and maintenance on machinery and equip...,Technical related services,Outward,"[-0.013514605350792408, -0.021558206528425217,...",0.249786
444,288,,Payment for accounting services,Other business services obtained,Outward,"[0.012934829108417034, -0.01869451254606247, -...",0.249870


In [17]:
# Inspect the Embedding column of the frame reloaded from Parquet.
df_codes["Embedding"]


0      [-0.006666785106062889, 0.001338382251560688, ...
1      [-0.004169201944023371, -0.01017824001610279, ...
2      [-0.005698998924344778, -0.014980997890233994,...
3      [0.0017920573009178042, -0.001992132980376482,...
4      [-0.004406240303069353, -0.00664734560996294, ...
                             ...                        
631    [-0.01834322139620781, -0.021168287843465805, ...
632    [-0.012952337972819805, -0.0034863881301134825...
633    [-0.018370389938354492, -0.021169312298297882,...
634    [-0.01834322139620781, -0.021168287843465805, ...
635    [-0.01834322139620781, -0.021168287843465805, ...
Name: Embedding, Length: 636, dtype: object