In [1]:
from pinecone import Pinecone
import pyarabic.araby as araby
from tqdm.auto import tqdm

In [2]:
PINECONE_API_KEY = "pcsk_5Ho56W_T3c3KLAQZBEVoqBueWma8j2C7MjfWrgzUrT3mHmGgxKAihEX4kGgtbp9RErcqot"
PINECONE_HOST = "https://onyx-sparse-bxkmeye.svc.aped-4627-b74a.pinecone.io"
pc = Pinecone(api_key=PINECONE_API_KEY, host=PINECONE_HOST)
index = pc.Index(host=PINECONE_HOST)

In [None]:
from pinecone_text.sparse import BM25Encoder, SpladeEncoder

BM25Encoder(language="arabic")
splade = SpladeEncoder(model_name_or_path="naver/splade-cocondenser-ensembledistil")

TypeError: SpladeEncoder.__init__() got an unexpected keyword argument 'model_name_or_path'

In [6]:
def preprocess_arabic(text):
    text = araby.strip_tashkeel(text)
    text = araby.normalize_alef(text)
    text = araby.normalize_hamza(text)
    text = araby.strip_tatweel(text)
    return text

In [7]:
documents = [
    "القاهرة هي عاصمة جمهورية مصر العربية وأكبر مدنها.",
    "الذكاء الاصطناعي هو فرع من علوم الحاسوب يهدف إلى إنشاء آلات ذكية.",
    "تعتبر الأهرامات في الجيزة من عجائب الدنيا السبع القديمة.",
    "يعمل التعلم الآلي على تحليل البيانات وبناء النماذج التنبؤية.",
    "النيل هو أطول نهر في العالم ويمر عبر العديد من الدول الأفريقية.",
    "تستخدم الشبكات العصبية في تطبيقات التعرف على الصور ومعالجة اللغات الطبيعية.",
]

In [8]:
processed_docs = [preprocess_arabic(doc) for doc in documents]

vectors_to_upsert = []

for i, doc in enumerate(tqdm(processed_docs)):
    sparse_vector = splade.encode_documents(doc)
    vectors_to_upsert.append(
        {
            "id": str(i),
            "sparse_values": sparse_vector,
            "metadata": {"text": documents[i]},
        }
    )

index.upsert(vectors=vectors_to_upsert)

  0%|          | 0/6 [00:00<?, ?it/s]

{'upserted_count': 6}

In [9]:
print(index.describe_index_stats())

{'index_fullness': 0.0,
 'metric': 'dotproduct',
 'namespaces': {'': {'vector_count': 6}},
 'total_vector_count': 6,
 'vector_type': 'sparse'}


In [10]:
def search(query):
    processed_query = preprocess_arabic(query)
    sparse_qv = splade.encode_queries(processed_query)
    result = index.query(sparse_vector=sparse_qv, top_k=3, include_metadata=True)
    print(f"\nSearch results for: '{query}'")
    for match in result["matches"]:
        print(f"  Score: {match['score']:.4f}, Text: {match['metadata']['text']}")

In [11]:
search("الشبكات العصبية والتعرف على الصور")


Search results for: 'الشبكات العصبية والتعرف على الصور'
  Score: 72.3546, Text: القاهرة هي عاصمة جمهورية مصر العربية وأكبر مدنها.
  Score: 68.6098, Text: تعتبر الأهرامات في الجيزة من عجائب الدنيا السبع القديمة.
  Score: 68.3164, Text: الذكاء الاصطناعي هو فرع من علوم الحاسوب يهدف إلى إنشاء آلات ذكية.


In [12]:
search("ما هي المدينة الرئيسية في مصر؟")


Search results for: 'ما هي المدينة الرئيسية في مصر؟'
  Score: 58.1857, Text: القاهرة هي عاصمة جمهورية مصر العربية وأكبر مدنها.
  Score: 55.9118, Text: الذكاء الاصطناعي هو فرع من علوم الحاسوب يهدف إلى إنشاء آلات ذكية.
  Score: 54.9159, Text: تعتبر الأهرامات في الجيزة من عجائب الدنيا السبع القديمة.


In [16]:
PINECONE_API_KEY = (
    "pcsk_5Ho56W_T3c3KLAQZBEVoqBueWma8j2C7MjfWrgzUrT3mHmGgxKAihEX4kGgtbp9RErcqot"
)
PINECONE_HOST = "https://onyx-sparse-bxkmeye.svc.aped-4627-b74a.pinecone.io"
pc = Pinecone(api_key=PINECONE_API_KEY, host=PINECONE_HOST)
index = pc.Index(host=PINECONE_HOST)

from FlagEmbedding import BGEM3FlagModel
import pyarabic.araby as araby  # Assuming this is the 'araby' you meant; if it's a different package, adjust accordingly
from tqdm import tqdm

model = BGEM3FlagModel("BAAI/bge-m3", use_fp16=True)  # use_fp16 for faster computation


def preprocess_arabic(text):
    text = araby.strip_tashkeel(text)
    text = araby.normalize_alef(text)
    text = araby.normalize_hamza(text)
    text = araby.strip_tatweel(text)
    return text


documents = [
    "القاهرة هي عاصمة جمهورية مصر العربية وأكبر مدنها.",
    "الذكاء الاصطناعي هو فرع من علوم الحاسوب يهدف إلى إنشاء آلات ذكية.",
    "تعتبر الأهرامات في الجيزة من عجائب الدنيا السبع القديمة.",
    "يعمل التعلم الآلي على تحليل البيانات وبناء النماذج التنبؤية.",
    "النيل هو أطول نهر في العالم ويمر عبر العديد من الدول الأفريقية.",
    "تستخدم الشبكات العصبية في تطبيقات التعرف على الصور ومعالجة اللغات الطبيعية.",
]
processed_docs = [preprocess_arabic(doc) for doc in documents]

vectors_to_upsert = []

for i, doc in enumerate(tqdm(processed_docs)):
    # Encode document for sparse (lexical_weights is a dict of {token_id: weight})
    output = model.encode(
        doc, return_dense=False, return_sparse=True, return_colbert_vecs=False
    )
    lexical_weights = output["lexical_weights"]
    sparse_vector = {
        "indices": list(lexical_weights.keys()),
        "values": list(lexical_weights.values()),
    }
    vectors_to_upsert.append(
        {
            "id": str(i),
            "sparse_values": sparse_vector,
            "metadata": {"text": documents[i]},
        }
    )

index.upsert(vectors=vectors_to_upsert)


def search(query):
    processed_query = preprocess_arabic(query)
    # Encode query for sparse
    output = model.encode(
        processed_query,
        return_dense=False,
        return_sparse=True,
        return_colbert_vecs=False,
    )
    lexical_weights = output["lexical_weights"]
    sparse_qv = {
        "indices": list(lexical_weights.keys()),
        "values": list(lexical_weights.values()),
    }
    result = index.query(sparse_vector=sparse_qv, top_k=3, include_metadata=True)
    print(f"\nSearch results for: '{query}'")
    for match in result["matches"]:
        print(f"  Score: {match['score']:.4f}, Text: {match['metadata']['text']}")


# Test queries
search("الشبكات العصبية والتعرف على الصور")
search("ما هي المدينة الرئيسية في مصر؟")

tokenizer_config.json:   0%|          | 0.00/444 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

Fetching 30 files:   0%|          | 0/30 [00:00<?, ?it/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

bm25.jpg:   0%|          | 0.00/132k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/687 [00:00<?, ?B/s]

.DS_Store:   0%|          | 0.00/6.15k [00:00<?, ?B/s]

.gitattributes: 0.00B [00:00, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/123 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

colbert_linear.pt:   0%|          | 0.00/2.10M [00:00<?, ?B/s]

miracl.jpg:   0%|          | 0.00/576k [00:00<?, ?B/s]

nqa.jpg:   0%|          | 0.00/158k [00:00<?, ?B/s]

mkqa.jpg:   0%|          | 0.00/608k [00:00<?, ?B/s]

long.jpg:   0%|          | 0.00/485k [00:00<?, ?B/s]

others.webp:   0%|          | 0.00/21.0k [00:00<?, ?B/s]

long.jpg:   0%|          | 0.00/127k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


Constant_7_attr__value:   0%|          | 0.00/65.6k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/698 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.onnx:   0%|          | 0.00/725k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.onnx_data:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


sentence_bert_config.json:   0%|          | 0.00/54.0 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

sparse_linear.pt:   0%|          | 0.00/3.52k [00:00<?, ?B/s]

Error while downloading from https://huggingface.co/BAAI/bge-m3/resolve/5617a9f61b028005a4858fdac845db406aefb181/pytorch_model.bin: HTTPSConnectionPool(host='cas-bridge.xethub.hf.co', port=443): Read timed out.
Trying to resume download...
Error while downloading from https://huggingface.co/BAAI/bge-m3/resolve/5617a9f61b028005a4858fdac845db406aefb181/onnx/model.onnx_data: HTTPSConnectionPool(host='cas-bridge.xethub.hf.co', port=443): Read timed out.
Trying to resume download...
'(MaxRetryError('HTTPSConnectionPool(host=\'huggingface.co\', port=443): Max retries exceeded with url: /BAAI/bge-m3/resolve/5617a9f61b028005a4858fdac845db406aefb181/onnx/model.onnx_data (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x0000020734586950>: Failed to resolve \'huggingface.co\' ([Errno 11001] getaddrinfo failed)"))'), '(Request ID: a321b432-b90d-426f-897f-f02e0b3f630d)')' thrown while requesting GET https://huggingface.co/BAAI/bge-m3/resolve/5617a9f61b028005a4858fdac84

pytorch_model.bin:  47%|####7     | 1.07G/2.27G [00:00<?, ?B/s]

model.onnx_data:  48%|####8     | 1.09G/2.27G [00:00<?, ?B/s]

  0%|          | 0/6 [00:00<?, ?it/s]You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100%|██████████| 6/6 [00:05<00:00,  1.08it/s]


SparseValuesTypeError: Found unexpected data in column `sparse_values`. Expected format is `'sparse_values': {'indices': List[int], 'values': List[float]}`.

In [1]:
l = [1, 3, 2, 5, 5]
lc = [1, 3, 255555]
c = lc + l
c

[1, 3, 255555, 1, 3, 2, 5, 5]