In [None]:
import re
import json



def extract_text_from_md(md_path):
    """Read a Markdown (.md) file and return its entire contents as one string.

    Args:
        md_path: Path of the file to read.

    Returns:
        The file's text, decoded as UTF-8.
    """
    # Return straight from inside the context manager; the handle is still
    # closed when the `with` block exits.
    with open(md_path, mode="r", encoding="utf-8") as handle:
        return handle.read()

def split_into_chunks(text, min_length=500, max_length=1000):
    """Group numbered sections ("1. ...", "2. ...") into chunks of bounded size.

    The text is split wherever a line starts with a number followed by a dot
    and whitespace. Consecutive sections are then packed, in order, into
    chunks whose length does not exceed ``max_length`` characters. Sections
    inside a chunk are separated by a single newline.

    Args:
        text: The document to chunk.
        min_length: Accepted for interface compatibility; it is not enforced
            (short chunks are never merged or padded).
        max_length: Upper bound on chunk length in characters. A single
            section longer than this is emitted as its own oversized chunk
            rather than being cut in the middle.

    Returns:
        A list of non-empty chunk strings. Any text that precedes the first
        numbered section is kept as the leading entry of the first chunk.
        Empty or whitespace-only input yields an empty list.
    """
    # MULTILINE + '^' recognises a header on the very first line as well as
    # after any newline; the capturing group keeps the section numbers.
    parts = re.split(r'^(\d+\.\s+)', text, flags=re.MULTILINE)

    entries = []
    preamble = parts[0].strip()
    if preamble:
        # Text before the first numbered header is content too; keep it.
        entries.append(preamble)

    # After a split with one capturing group, odd indices hold the section
    # numbers and the following even indices hold the matching section text.
    for idx in range(1, len(parts), 2):
        section_number = parts[idx].strip()
        section_text = parts[idx + 1].strip() if idx + 1 < len(parts) else ""
        entries.append(f"{section_number} {section_text}".strip())

    chunks = []
    current_entries = []
    current_length = 0

    for entry in entries:
        # +1 accounts for the newline that joins this entry to the previous one.
        added = len(entry) + (1 if current_entries else 0)
        if current_entries and current_length + added > max_length:
            # Flush only when something has been collected, so no empty
            # chunk is ever emitted.
            chunks.append("\n".join(current_entries))
            current_entries = []
            current_length = 0
            added = len(entry)
        current_entries.append(entry)
        current_length += added

    if current_entries:
        chunks.append("\n".join(current_entries))

    return chunks

def chunk_legal_document(text):
    """Split a document into chunks that each begin at a section header line.

    A line counts as a header when it starts with ``## Section <number>:`` or
    with ``# Offences `` followed by at least one more character. Every header
    line opens a new chunk; whatever precedes the first header forms the
    first chunk. Lines inside a chunk stay joined by newlines.

    Args:
        text: Full document text.

    Returns:
        A list of chunk strings in document order.
    """
    header_re = re.compile(r'## Section \d+:|# Offences .+')
    rows = text.split('\n')

    # A chunk always starts at row 0; every later header row starts another.
    # (A header on row 0 adds no extra boundary because nothing precedes it.)
    starts = [0]
    starts.extend(
        pos for pos, row in enumerate(rows)
        if pos > 0 and header_re.match(row)
    )
    ends = starts[1:] + [len(rows)]

    return ['\n'.join(rows[begin:end]) for begin, end in zip(starts, ends)]


In [None]:
# Example usage
# Load the source Markdown through the notebook's own helper instead of
# repeating the open/read logic inline.
text = extract_text_from_md("../data/bns_instructions.md")

# Split on section-header lines; `chunks` is reused when saving the result.
chunks = chunk_legal_document(text)

# Print the first few chunks to verify
for i, chunk in enumerate(chunks[:5]):
    print(f"Chunk {i+1}:\n{chunk}\n{'-'*40}\n")

In [None]:
# Wrap the chunk list under a single top-level key so the JSON file holds
# one object rather than a bare array.
data = {"BNS": chunks}

# Serialise to disk, pretty-printed with a 4-space indent.
with open("legal_chunks.json", "w", encoding="utf-8") as f:
    json.dump(data, f, indent=4)

# Plain string literal: the message has no placeholders to format.
print("Chunking completed!")