In [None]:
from src.graph.workflow import init_workflow
from src.store.nlp import NLPFactory
from src.store.vectordb import VectorDBFactory
from src.core.config import get_settings

# Project settings; passed to the vector-store factory below.
SETTINGS = get_settings()

# One NLP client per provider; all three are handed to init_workflow.
nlp_openai = NLPFactory.create(provider="openai")
nlp_gemini = NLPFactory.create(provider="gemini")
nlp_cohere = NLPFactory.create(provider="cohere")
vectordb_factory = VectorDBFactory()
vectordb = vectordb_factory.create(provider="pinecone", settings=SETTINGS)
# connect() is called before the workflow is built.
vectordb.connect()
workflow = init_workflow(nlp_openai, nlp_gemini, nlp_cohere, vectordb)

# Test question (Egyptian Arabic), roughly: "How do I calculate the total profit margin?"
query = "ازاي احسب اجمالي هامش الربح؟"
# Previously used namespace, left here for switching back (presumably for comparison runs):
# namespace = "customers_chunking_rewrite"
namespace = "customers_chunk_fixed"
# Holds the response of the most recent "chat" event; stays "" if the stream never yields one.
res = ""
for event in workflow.stream({"user_message": query, "namespace": namespace}):
    print(event)
    # Only events with a truthy "chat" entry are read for a response.
    if event.get("chat"):
        res = event["chat"]["response"]

In [None]:
from IPython.display import display, Markdown

# Render `res` as Markdown inside a right-to-left <div> so RTL text (e.g. an Arabic answer) is laid out correctly.
display(Markdown(f"<div dir='rtl'>{res}</div>"))

In [None]:
from pinecone import Pinecone
import pyarabic.araby as araby
from tqdm.auto import tqdm
from pinecone_text.sparse import BM25Encoder
from nltk.stem.isri import ISRIStemmer
import nltk

import os  # needed in this cell only to read the Pinecone credentials from the environment

# Fetch the NLTK stopword corpus (it includes the Arabic list).
nltk.download("stopwords")

# --- Pinecone connection ---
# SECURITY: the API key is read from the environment instead of being hardcoded.
# A key that was ever committed in this notebook must be treated as leaked and
# rotated in the Pinecone console.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
if not PINECONE_API_KEY:
    raise RuntimeError(
        "Set the PINECONE_API_KEY environment variable before running this cell."
    )
# The index host is not a secret; it can still be overridden per environment.
PINECONE_HOST = os.environ.get(
    "PINECONE_HOST", "https://onyx-sparse-bxkmeye.svc.aped-4627-b74a.pinecone.io"
)
# The index host belongs on the index handle only; the client constructor's
# `host` argument is for the control-plane endpoint, so it is not passed here.
pc = Pinecone(api_key=PINECONE_API_KEY)
index = pc.Index(host=PINECONE_HOST)

# Initialize BM25 with its defaults; the tokenizer is replaced further down.
bm25 = BM25Encoder()


# Custom tokenizer for Arabic: tokenize, drop stopwords and one-character tokens, then stem.
# The stemmer and the stopword set are built on first use and reused afterwards,
# instead of being re-created (and the stopword list re-read) for every call.
_ARABIC_TOKENIZER_RESOURCES = {}


def arabic_tokenizer(text):
    """Split text into stemmed tokens.

    Args:
        text: The string to tokenize (passed to ``araby.tokenize``).

    Returns:
        A list containing ``ISRIStemmer().stem(token)`` for every token that
        is not in NLTK's "arabic" stopword list and is longer than one
        character, in the order the tokens appear.
    """
    if "stopwords" not in _ARABIC_TOKENIZER_RESOURCES:
        # "stopwords" is stored last, so its presence implies the stemmer is cached too.
        _ARABIC_TOKENIZER_RESOURCES["stemmer"] = ISRIStemmer()
        _ARABIC_TOKENIZER_RESOURCES["stopwords"] = frozenset(
            nltk.corpus.stopwords.words("arabic")
        )
    stemmer = _ARABIC_TOKENIZER_RESOURCES["stemmer"]
    arabic_stopwords = _ARABIC_TOKENIZER_RESOURCES["stopwords"]
    return [
        stemmer.stem(token)
        for token in araby.tokenize(text)
        if token not in arabic_stopwords and len(token) > 1
    ]


# Replace the encoder's tokenizer with the Arabic-aware one.
# NOTE(review): `_tokenizer` is underscore-prefixed, i.e. not a public attribute of
# BM25Encoder — confirm it still exists whenever pinecone-text is upgraded.
bm25._tokenizer = arabic_tokenizer


def preprocess_arabic(text):
    """Return ``text`` after the pyarabic cleanup helpers have been applied.

    The helpers run in this order: ``strip_tashkeel``, ``normalize_alef``,
    ``normalize_hamza``, ``strip_tatweel``.
    """
    without_tashkeel = araby.strip_tashkeel(text)
    unified = araby.normalize_hamza(araby.normalize_alef(without_tashkeel))
    return araby.strip_tatweel(unified)


# Small demo corpus; the original (un-normalized) text is what gets stored as metadata.
documents = [
    "القاهرة هي عاصمة جمهورية مصر العربية وأكبر مدنها.",
    "الذكاء الاصطناعي هو فرع من علوم الحاسوب يهدف إلى إنشاء آلات ذكية.",
    "تعتبر الأهرامات في الجيزة من عجائب الدنيا السبع القديمة.",
    "يعمل التعلم الآلي على تحليل البيانات وبناء النماذج التنبؤية.",
    "النيل هو أطول نهر في العالم ويمر عبر العديد من الدول الأفريقية.",
    "تستخدم الشبكات العصبية في تطبيقات التعرف على الصور ومعالجة اللغات الطبيعية.",
]
processed_docs = list(map(preprocess_arabic, documents))

# The encoder has to be fitted on the corpus before it can encode anything.
print("Fitting BM25 encoder...")
bm25.fit(processed_docs)
print("Fit complete.")

# One record per document: the id is the list position, the sparse vector comes
# from the normalized text, and the metadata keeps the original text.
vectors_to_upsert = [
    {
        "id": str(position),
        "sparse_values": bm25.encode_documents(cleaned),
        "metadata": {"text": documents[position]},
    }
    for position, cleaned in enumerate(tqdm(processed_docs))
]

# A single upsert call is enough for a corpus this small; larger corpora
# would be sent in batches.
index.upsert(vectors=vectors_to_upsert, namespace="customers-sparse")
print("Upsert complete.")


def search(query, top_k=3, namespace="customers-sparse"):
    """Query the Pinecone index with a BM25 sparse vector and print the matches.

    Args:
        query: Raw query text; it is passed through ``preprocess_arabic``
            before being encoded with ``bm25.encode_queries``.
        top_k: Maximum number of matches to request (default 3).
        namespace: Pinecone namespace to search (default "customers-sparse").

    Returns:
        None. Each match is printed with its score and its ``text`` metadata.
    """
    processed_query = preprocess_arabic(query)
    sparse_qv = bm25.encode_queries(processed_query)
    result = index.query(
        sparse_vector=sparse_qv,
        top_k=top_k,
        include_metadata=True,
        namespace=namespace,
    )
    print(f"\nSearch results for: '{query}'")
    for match in result["matches"]:
        print(f"  Score: {match['score']:.4f}, Text: {match['metadata']['text']}")


# --- Example queries; each call prints its matches ---
search("الشبكات العصبية والتعرف على الصور")
search("ما هي المدينة الرئيسية في مصر؟")

In [9]:
from pinecone import Pinecone, ServerlessSpec
import pyarabic.araby as araby
from pinecone_text.sparse import BM25Encoder
from nltk.stem.isri import ISRIStemmer
import nltk
import os
import json

# Make sure the NLTK stopword corpus is available, downloading it only when it
# is missing. nltk.data.find signals a missing resource with LookupError.
try:
    nltk.data.find("corpora/stopwords")
except LookupError:
    nltk.download("stopwords")

# SECURITY: the API key is read from the environment instead of being hardcoded.
# A key that was ever committed in this notebook must be treated as leaked and
# rotated in the Pinecone console.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
if not PINECONE_API_KEY:
    raise RuntimeError(
        "Set the PINECONE_API_KEY environment variable before running this cell."
    )
# The index host is not a secret; it can still be overridden per environment.
PINECONE_HOST = os.environ.get(
    "PINECONE_HOST", "https://onyx-sparse-bxkmeye.svc.aped-4627-b74a.pinecone.io"
)


# Connect to Pinecone
try:
    pc = Pinecone(api_key=PINECONE_API_KEY)
    index = pc.Index(host=PINECONE_HOST)
    print("Successfully connected to Pinecone.")
except Exception as e:
    print(f"Error connecting to Pinecone: {e}")
    # Re-raise rather than calling exit(): the cell still stops here, but the
    # traceback is kept and the interpreter session is not terminated.
    raise

# --- Arabic Text Preprocessing Functions ---


def preprocess_arabic(text):
    """Run ``text`` through the pyarabic normalization steps and return the result.

    Steps, in order: ``strip_tashkeel``, ``normalize_alef``,
    ``normalize_hamza``, ``strip_tatweel``.
    """
    pipeline = (
        araby.strip_tashkeel,
        araby.normalize_alef,
        araby.normalize_hamza,
        araby.strip_tatweel,
    )
    for step in pipeline:
        text = step(text)
    return text


# Stemmer and stopword set shared by every arabic_tokenizer call; filled on first use
# so they are not re-created (and the stopword list re-read) for each query.
_ARABIC_TOKENIZER_RESOURCES = {}


def arabic_tokenizer(text):
    """
    Tokenizes, stems, and removes stopwords from Arabic text.
    Uses ISRIStemmer for stemming.

    Args:
        text: The string to tokenize (passed to ``araby.tokenize``).

    Returns:
        A list with the stem of every token that is not in NLTK's "arabic"
        stopword list and is longer than one character, in token order.
    """
    if "stopwords" not in _ARABIC_TOKENIZER_RESOURCES:
        # "stopwords" is stored last, so its presence implies the stemmer is cached too.
        _ARABIC_TOKENIZER_RESOURCES["stemmer"] = ISRIStemmer()
        _ARABIC_TOKENIZER_RESOURCES["stopwords"] = frozenset(
            nltk.corpus.stopwords.words("arabic")
        )
    stemmer = _ARABIC_TOKENIZER_RESOURCES["stemmer"]
    arabic_stopwords = _ARABIC_TOKENIZER_RESOURCES["stopwords"]
    return [
        stemmer.stem(token)
        for token in araby.tokenize(text)
        if token not in arabic_stopwords and len(token) > 1
    ]


# --- Load the BM25 Encoder ---

print("Loading BM25 model from file...")
# 'bm25_values.json' may be malformed or written by an older version, so it is
# read manually and any missing required parameters are filled in before the
# values are applied to a fresh encoder.
try:
    with open("bm25_values.json", "r", encoding="utf-8") as f:
        params = json.load(f)

    # Ensure required keys exist to prevent a TypeError from set_params.
    params.setdefault("remove_punctuation", False)
    params.setdefault("remove_stopwords", False)
    params.setdefault("stem", False)

    # Create the encoder and apply the stored parameters.
    bm25 = BM25Encoder()
    bm25.set_params(**params)

    # The JSON file stores parameters only, not the tokenizer function, so the
    # custom tokenizer has to be re-assigned after every load.
    bm25._tokenizer = arabic_tokenizer
    print("BM25 model loaded successfully.")
except FileNotFoundError:
    print(
        "Error: 'bm25_values.json' not found. Please run 'create_bm25_model.py' first."
    )
    # Re-raise rather than calling exit(): execution still stops here, but the
    # traceback is kept and the interpreter session is not terminated.
    raise
except Exception as e:
    print(f"An error occurred while loading the BM25 model: {e}")
    raise


# --- Search Function ---


def search_keywords(query, top_k=5, namespace="customers-sparse-v1"):
    """
    Takes a query, processes it, encodes it with BM25, and queries Pinecone.

    Args:
        query: Raw query text. Empty or whitespace-only input returns ``[]``
            without encoding or contacting Pinecone.
        top_k: Maximum number of matches to request (default 5).
        namespace: Pinecone namespace to search (default "customers-sparse-v1").

    Returns:
        A list with the ``text`` metadata value of each match, or ``[]`` when
        the query is blank or the Pinecone query raises.
    """
    # A blank query has nothing to encode, so skip the round trip entirely.
    if not query or not query.strip():
        return []

    print(f"\nSearching for: '{query}'")

    # 1. Preprocess the query text
    processed_query = preprocess_arabic(query)

    # 2. Encode the query into a sparse vector using the loaded BM25 model
    print("Encoding query...")
    sparse_qv = bm25.encode_queries(processed_query)

    # 3. Query Pinecone
    print("Querying Pinecone index...")
    try:
        result = index.query(
            sparse_vector=sparse_qv,
            top_k=top_k,
            include_metadata=True,
            namespace=namespace,
        )
    except Exception as e:
        # Best-effort search: report the failure and return no results.
        print(f"An error occurred during Pinecone query: {e}")
        return []

    # 4. Extract keywords from metadata
    keywords = [match["metadata"]["text"] for match in result.get("matches", [])]

    print(f"Found keywords: {keywords}")
    return keywords


# --- Example Usage (similar to how a FastAPI endpoint would call it) ---

# The guard is also true when this cell runs in the notebook, so both searches
# execute here; the recorded cell output shows the first one running.
if __name__ == "__main__":
    # Two sample queries, each limited to the top 3 matches.
    test_query_1 = "محركات البحث"
    search_results_1 = search_keywords(test_query_1, top_k=3)

    test_query_2 = "الذكاء الاصطناعي"
    search_results_2 = search_keywords(test_query_2, top_k=3)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ramyu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Successfully connected to Pinecone.
Loading BM25 model from file...
BM25 model loaded successfully.

Searching for: 'محركات البحث'
Encoding query...
Querying Pinecone index...
Found keywords: ['{"page_start": 122, "page_end": 122, "chapter": "نظام العملاء (أونكس أي إكس) ERP"}\n\n### [Chapter: نظام العملاء - Section: البيانات التفصيلية]\n\n#### ثامنا: البيانات التفصيلية\n\n**ملاحظة**: حيث أن البيانات الأساسية تمثل الطرف المدين للقيد سواء طرف واحد أو متعدد الأطراف، بينما تمثل البيانات التفصيلية الجانب الدائن.\n\n##### - رقم/اسم الحساب\n\nيستخدم هذا الحقل لاختيار رقم الحساب "الدائن" بواسطة الضغط على زر (F9) أو إدخاله يدوياً، وبمجرد اختيار الحساب يظهر الاسم في الحقل المخصص له، ويمكن البحث عن الحساب في قائمة الحسابات برمز الحساب أو جزء منه أو اسمه أو جزء من الاسم في حقل البحث.\n\n###### ملاحظة عامة\n\nتظهر في الحقول الخاصة بإدخال رقم الحساب - سواء رئيسي أو تحليلي أو تحليلي فرعي - في جميع شاشات النظام بعض المفاتيح المساعدة التي تقوم بمهام ووظائف معينة، وتظهر هذه المفاتيح أسفل يسار الشاشة، كم