In [1]:
# Page boundaries handed to ProcessService.chunk as its `boundaries` argument;
# every value marks where a new topic starts (before the TOC offset is added).
b = [
    3, 24, 26, 29, 32, 35, 41, 44, 47, 50, 63, 66,
    69, 72, 74, 79, 82, 85, 87, 88, 96, 101, 104, 113,
    132, 136, 140, 143, 147, 149, 151, 158, 170, 177, 184, 189,
]

In [2]:
from tqdm import tqdm
from pathlib import Path
from src.store.nlp import NLPInterface, NLPFactory
from src.core.schemas.guide import Chunks
from src.core.enums import OpenAIRolesEnum

# NLP client created by the project factory for the "openai" provider; it is
# the object later passed to ProcessService.
nlp_openai = NLPFactory.create(provider="openai")

# System prompt sent with every topic in ProcessService.chunk. It is runtime
# text: any edit changes what the model is asked to do.
PROMPT = """# Task

You are an expert AI assistant specializing in document analysis and information architecture for Retrieval-Augmented Generation (RAG) systems. Your task is to process a chapter from a software user guide and decompose it into distinct, self-contained, semantically complete chunks.

The goal is to create chunks that each represent a single, complete topic, allowing a vector database to retrieve the full context needed to answer a user's question about that specific topic. Additionally, since retrieval will leverage lexical search on a sparse index (e.g., BM25 or TF-IDF), prioritize chunks that maximize term matching by ensuring key terminologies, definitions, and exact phrases are cohesively grouped and preserved intact within relevant chunks.

## Core Principles for Chunking

1. **Topic-Centric:** Each chunk must be centered around one single concept, feature, procedure, or policy. For example, a valid chunk could be "How to reset your password," "Explanation of the 'User Role' field," or "The company's data retention policy."
2. **Self-Contained:** A chunk must contain all the necessary information to be understood on its own. If a procedure has 5 steps, all 5 steps must be in the same chunk. Do not split them.
3. **Ignore Physical Boundaries:** You are not limited by the document's original structure. Combine multiple sentences or even adjacent short paragraphs if they all contribute to the same single topic. Conversely, split a long paragraph if it discusses multiple distinct topics.
4. **Comprehensive but Concise:** Ensure the entire topic is covered within the chunk, but do not include extraneous information or sentences that belong to a different topic.
5. **Maintain Original Text:** Do not summarize, rephrase, or alter the original text. Your output chunks should be direct extractions from the source text.
6. **Page Number Awareness:** On each chunk include the page number.
7. **Lexical Optimization:** Focus on terminologies and definitions to enhance sparse index retrieval. Keep all instances of a specific term, its definition, synonyms, and usage examples together in one chunk. Avoid splitting sections where exact phrases or technical jargon appear, as this ensures high lexical overlap with user queries containing those terms.
"""


class ProcessService:
    """Turn a page-separated markdown guide into LLM-generated chunks.

    The pipeline is: read the file and split it into pages (`split_md_file`),
    group the pages into topics at the given page boundaries
    (`split_at_boundaries`), then send every topic to the NLP client and
    collect one structured response per topic (`chunk`).
    """

    # Model used by `chunk` when the caller does not pass `model_name`.
    DEFAULT_MODEL_NAME = "gpt-4.1"

    def __init__(self, nlp):
        """Store the NLP client whose `structured_chat` method `chunk` calls."""
        self.nlp: NLPInterface = nlp

    def split_md_file(self, file_path, separator="---#---"):
        """Read a UTF-8 text file and split it into non-empty pages.

        Args:
            file_path: Path (str or path-like) of the file to read.
            separator: Literal string that separates pages in the file.

        Returns:
            The pages in file order, each stripped of surrounding whitespace;
            pages that are empty after stripping are dropped.
        """
        content = Path(file_path).read_text(encoding="utf-8")
        stripped_pages = (page.strip() for page in content.split(separator))
        return [page for page in stripped_pages if page]

    def split_at_boundaries(self, pages, boundaries, num_toc_pages):
        """Group pages into topics that start at the given boundaries.

        Args:
            pages: Sequence of pages (joined with `str()` applied to each).
            boundaries: Iterable of page positions; duplicates are ignored and
                order does not matter.
            num_toc_pages: Offset added to every boundary before slicing.

        Returns:
            One string per boundary, in ascending boundary order. Each string
            is the pages from that boundary up to the next one (or the end of
            the document) joined with a markdown rule. Pages located before
            the first boundary are discarded; no boundaries yields `[]`.
        """
        cut_points = [b + num_toc_pages for b in sorted(set(boundaries))]
        # Each topic runs from its own cut point to the next one; the final
        # topic runs to the end of the document (slice end of None).
        slice_ends = cut_points[1:] + [None]
        return [
            "\n\n---\n\n".join(map(str, pages[start:end]))
            for start, end in zip(cut_points, slice_ends)
        ]

    def chunk(
        self,
        file_path,
        separator,
        boundaries,
        num_toc_pages,
        model_name=None,
        system_prompt=None,
    ):
        """Chunk every topic of a guide with the NLP client.

        Args:
            file_path: Path of the page-separated markdown file.
            separator: Page separator passed to `split_md_file`.
            boundaries: Topic boundaries passed to `split_at_boundaries`.
            num_toc_pages: Offset passed to `split_at_boundaries`.
            model_name: Model to request; defaults to `DEFAULT_MODEL_NAME`.
            system_prompt: System message content; defaults to the
                module-level `PROMPT`.

        Returns:
            A list with one entry per topic, each being the value returned by
            `self.nlp.structured_chat` (requested with `response_model=Chunks`).
        """
        if model_name is None:
            model_name = self.DEFAULT_MODEL_NAME
        if system_prompt is None:
            # Resolved at call time so the class can be defined before PROMPT.
            system_prompt = PROMPT

        pages = self.split_md_file(file_path, separator)
        topics = self.split_at_boundaries(pages, boundaries, num_toc_pages)

        many_chunks = []
        for topic in tqdm(topics, total=len(topics), desc="chunking"):
            response = self.nlp.structured_chat(
                response_model=Chunks,
                model_name=model_name,
                messages=[
                    {
                        "role": OpenAIRolesEnum.SYSTEM.value,
                        "content": system_prompt,
                    },
                    {"role": OpenAIRolesEnum.USER.value, "content": topic},
                ],
            )
            many_chunks.append(response)
        return many_chunks


chunk_service = ProcessService(nlp_openai)
# Forward slashes resolve on every OS. The earlier backslash form relied on the
# invalid escape sequences "\d" and "\c" in a non-raw string (a warning today,
# slated to become an error) and only worked as a path on Windows.
chunks = chunk_service.chunk("assets/data/customers.md", "---#---", b, 1)

chunking: 100%|██████████| 36/36 [43:52<00:00, 73.11s/it]  


In [3]:
# Number of per-topic responses returned by the chunking run.
print(len(chunks))
# Collapse the per-topic `.chunks` lists into one flat list, keeping topic order.
flattened = [
    piece
    for response in chunks
    for piece in response.chunks
]
# Total number of individual chunks across all topics.
print(len(flattened))

36
420


In [None]:
import json

# Load the JSON file. The path uses forward slashes so it resolves on every OS;
# a backslash separator is only understood as a directory separator on Windows.
with open("output/customers_v2.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Flatten all chunks into a single list of texts
texts = [chunk for item in data for chunk in item["chunks"]]

# Now `texts` is a list of all the chunk strings
print(f"Total chunks: {len(texts)}")
print(texts[:3])  # Show first few

FileNotFoundError: [Errno 2] No such file or directory: 'output\\customers_v2.json.json'

In [4]:
# Preview only the first few chunks; displaying the whole list floods the
# notebook with hundreds of long strings and bloats the saved file.
flattened[:3]

['## متغيرات إدارة العملاء\n\n**الاستخدام:**  \nلكل نظام في إصدارات أونكس أي إكس مجموعة من المتغيرات الخاصة به، والتي تستخدم للتحكم وضبط التفضيلات وضبط البدائل الافتراضية الجاهزة مع النظام، وذلك للتيسير والتكيف مع احتياجات المنشأة ومجالها، وتعتبر شاشة المتغيرات من أهم شاشات النظام كونها تتضمن العديد من المتغيرات التي تعكس سياسات ومتطلبات المنشأة.\n\n**تنبيه:**  \nبعض المتغيرات لا يسمح النظام بتعديلها بعد استخدامها في الشاشات والأنظمة المرتبطة بها، ويصبح لون حقل المتغير باهتاً " لا يمكن تعديله"، لذا ينصح دائماً وبشدة بتحري الدقة أثناء ضبط تلك المتغيرات.\n\n**تنبيه:**  \nيمكن للمستخدم الاطلاع على المتغيرات وكذلك الصلاحيات المرتبطة والمؤثرة في آلية عمل الشاشة المستخدمة على مستوى كل شاشة بكل أنظمة أونكس أي إكس من خلال قائمة عمليات الشاشة ثم اختيار متغيرات الشاشة - أو باستخدام الاختصار (Alt+M) حيث تظهر شاشة أخرى تحتوي على المتغيرات والصلاحيات المؤثرة في الشاشة وعرض إعداداتها على مستوى المستخدم.\n\nرقم الصفحة 3',
 '## طريقة استخدام الشاشة\n\nتستخدم الشاشة بعد النقر على زر "تعديل" على النحو ا

In [10]:
import os

import nltk
import pyarabic.araby as araby
from nltk.stem.isri import ISRIStemmer
from pinecone import Pinecone
from pinecone_text.sparse import BM25Encoder

# Fetch the NLTK stop-word corpus used by the Arabic tokenizer.
nltk.download("stopwords")

# The API key is read from the environment so it never lives in the notebook.
# A missing variable raises KeyError immediately instead of failing later with
# an authentication error. The key that used to be hard-coded here must be
# treated as leaked and rotated.
PINECONE_API_KEY = os.environ["PINECONE_API_KEY"]
PINECONE_HOST = "https://onyx-sparse-bxkmeye.svc.aped-4627-b74a.pinecone.io"
# NOTE(review): the same index host is passed both to the client and to
# `Index`; confirm the client-level `host` argument is really intended.
pc = Pinecone(api_key=PINECONE_API_KEY, host=PINECONE_HOST)
index = pc.Index(host=PINECONE_HOST)

# All built-in text processing flags are switched off here; the notebook
# installs its own Arabic tokenizer on this encoder afterwards.
bm25 = BM25Encoder(
    lower_case=False,
    remove_punctuation=False,
    remove_stopwords=False,
    stem=False,
    language="arabic",
)


# Filled on first use so the stemmer and stop-word set are built once instead
# of on every tokenizer call (the tokenizer runs once per document and query).
_ARABIC_TOKENIZER_RESOURCES = {}


def _arabic_tokenizer_resources():
    """Return the shared `(stemmer, stopwords)` pair, creating it on first use."""
    if not _ARABIC_TOKENIZER_RESOURCES:
        _ARABIC_TOKENIZER_RESOURCES["stemmer"] = ISRIStemmer()
        _ARABIC_TOKENIZER_RESOURCES["stopwords"] = frozenset(
            nltk.corpus.stopwords.words("arabic")
        )
    return (
        _ARABIC_TOKENIZER_RESOURCES["stemmer"],
        _ARABIC_TOKENIZER_RESOURCES["stopwords"],
    )


def arabic_tokenizer(text):
    """Tokenize Arabic text into stemmed tokens.

    Tokens come from `araby.tokenize`; tokens found in the NLTK Arabic
    stop-word list and tokens of length 1 are dropped, and the rest are
    stemmed with `ISRIStemmer`.

    Args:
        text: The string to tokenize.

    Returns:
        List of stemmed tokens in their original order.
    """
    # NOTE(review): stop words are compared as listed by NLTK. If the text was
    # normalised first (e.g. by preprocess_arabic), stop words spelled with
    # hamza/alef variants may no longer match - confirm this is acceptable.
    stemmer, arabic_stopwords = _arabic_tokenizer_resources()
    return [
        stemmer.stem(token)
        for token in araby.tokenize(text)
        if token not in arabic_stopwords and len(token) > 1
    ]


# Replace the encoder's private `_tokenizer` attribute with the custom Arabic
# tokenizer. NOTE(review): presumably this is the callable BM25Encoder uses when
# fitting and encoding - confirm against the installed pinecone_text version,
# since private attributes can change without notice.
bm25._tokenizer = arabic_tokenizer


def preprocess_arabic(text):
    """Normalise Arabic text by running it through four pyarabic steps.

    The steps are applied in this order: strip tashkeel, normalise alef,
    normalise hamza, strip tatweel. Returns the transformed string.
    """
    normalisation_steps = (
        araby.strip_tashkeel,
        araby.normalize_alef,
        araby.normalize_hamza,
        araby.strip_tatweel,
    )
    for apply_step in normalisation_steps:
        text = apply_step(text)
    return text


# Normalise every chunk, then fit the BM25 encoder on the normalised corpus.
processed_docs = list(map(preprocess_arabic, flattened))
bm25.fit(processed_docs)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ramyu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


  0%|          | 0/420 [00:00<?, ?it/s]

<pinecone_text.sparse.bm25_encoder.BM25Encoder at 0x1efc1dbccd0>

In [11]:
import json

# Collect the fitted BM25 statistics by hand; tokenizer configuration is
# deliberately left out of the saved file.
params = dict(
    avgdl=bm25.avgdl,
    n_docs=bm25.n_docs,
    # Document frequencies are stored as two parallel lists of plain
    # ints/floats so they serialise to JSON.
    doc_freq=dict(
        indices=list(map(int, bm25.doc_freq.keys())),
        values=list(map(float, bm25.doc_freq.values())),
    ),
    b=bm25.b,
    k1=bm25.k1,
)

with open("bm25_values_v2.json", "w") as f:
    json.dump(params, f)

In [6]:
# Encode every normalised chunk as a sparse BM25 vector. The record id is the
# chunk's position in `processed_docs`, and the original (un-normalised) chunk
# text is stored as metadata.
vectors_to_upsert = [
    {
        "id": str(i),
        "sparse_values": bm25.encode_documents(doc),
        "metadata": {"text": flattened[i]},
    }
    for i, doc in enumerate(tqdm(processed_docs))
]

# Upload in batches: one request carrying every record (each with its full
# text as metadata) can exceed Pinecone's per-request upsert limits as the
# corpus grows.
index.upsert(
    vectors=vectors_to_upsert,
    namespace="customers-sparse-v2",
    batch_size=100,
)

100%|██████████| 420/420 [00:00<00:00, 619.10it/s]


{'upserted_count': 420}