In [1]:
import json

# Load the chunked customer records. The relative path uses forward slashes so it
# resolves on Windows, Linux and macOS alike (a backslash separator only works on
# Windows).
with open("output/customers_v3.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Flatten every item's "chunks" entries into one list, preserving file order.
# Each chunk is presumably a text string — confirm against the producer of the file.
texts = [chunk for item in data for chunk in item["chunks"]]

In [2]:
import os

import nltk
import pyarabic.araby as araby
from nltk.stem.isri import ISRIStemmer
from pinecone import Pinecone
from pinecone_text.sparse import BM25Encoder

# Fetch the NLTK stopword corpus that the Arabic tokenizer reads.
nltk.download("stopwords")

# The API key is read from the environment so the secret never lands in the
# notebook or its version history. Any key that was previously committed in this
# cell should be treated as compromised and rotated.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
if not PINECONE_API_KEY:
    raise RuntimeError(
        "Set the PINECONE_API_KEY environment variable before running this cell."
    )

# The index host is not a secret; it can be overridden via the environment but
# defaults to the index this notebook has always targeted.
PINECONE_HOST = os.environ.get(
    "PINECONE_HOST", "https://onyx-sparse-bxkmeye.svc.aped-4627-b74a.pinecone.io"
)

# NOTE(review): the same URL is passed as the client-level host and as the index
# host — confirm the client-level `host` argument is actually intended.
pc = Pinecone(api_key=PINECONE_API_KEY, host=PINECONE_HOST)
index = pc.Index(host=PINECONE_HOST)

# Every built-in text transformation of the encoder is switched off here; the
# encoder's tokenizer is replaced with `arabic_tokenizer` later in this cell.
bm25 = BM25Encoder(
    lower_case=False,
    remove_punctuation=False,
    remove_stopwords=False,
    stem=False,
    language="arabic",
)


# Lazily-filled cache so the stemmer and the stopword set are built once instead
# of on every tokenizer call (the tokenizer runs once per document and per query).
_ARABIC_TOKENIZER_CACHE = {}


def _arabic_tokenizer_resources():
    """Return the shared ``(stemmer, stopwords)`` pair, building it on first use."""
    if not _ARABIC_TOKENIZER_CACHE:
        # Build both objects before storing either, so a failure while loading
        # the corpus never leaves a half-populated cache behind.
        stemmer = ISRIStemmer()
        stopwords = frozenset(nltk.corpus.stopwords.words("arabic"))
        _ARABIC_TOKENIZER_CACHE.update(stemmer=stemmer, stopwords=stopwords)
    return _ARABIC_TOKENIZER_CACHE["stemmer"], _ARABIC_TOKENIZER_CACHE["stopwords"]


def arabic_tokenizer(text):
    """Tokenize Arabic text into stemmed tokens.

    The text is split with ``araby.tokenize``; tokens that appear in NLTK's
    Arabic stopword list or that are a single character long are dropped, and
    each remaining token is stemmed with ``ISRIStemmer``.

    Args:
        text: The string to tokenize.

    Returns:
        A list of stemmed tokens, in their original order.
    """
    stemmer, arabic_stopwords = _arabic_tokenizer_resources()
    # NOTE(review): documents are run through `preprocess_arabic` (alef/hamza
    # normalization) before they reach this tokenizer, while the stopword list is
    # used as shipped; stopwords spelled with alef/hamza variants may therefore
    # not match — confirm whether the list should be normalized as well.
    return [
        stemmer.stem(token)
        for token in araby.tokenize(text)
        if token not in arabic_stopwords and len(token) > 1
    ]


# Swap the encoder's tokenizer for the custom Arabic pipeline defined above.
# NOTE(review): `_tokenizer` is underscore-prefixed (private by convention), so this
# override may stop working with other pinecone_text versions — confirm on upgrade.
bm25._tokenizer = arabic_tokenizer


def preprocess_arabic(text):
    """Normalize Arabic text by chaining four pyarabic helpers.

    The helpers run in this fixed order, each one receiving the previous
    one's output: ``strip_tashkeel`` (diacritics), ``normalize_alef``,
    ``normalize_hamza`` and ``strip_tatweel`` (elongation character).

    Args:
        text: The raw string to normalize.

    Returns:
        The normalized string.
    """
    pipeline = (
        araby.strip_tashkeel,
        araby.normalize_alef,
        araby.normalize_hamza,
        araby.strip_tatweel,
    )
    normalized = text
    for normalize_step in pipeline:
        normalized = normalize_step(normalized)
    return normalized


# Normalize every chunk first, then fit the BM25 encoder on the normalized corpus.
processed_docs = list(map(preprocess_arabic, texts))
bm25.fit(processed_docs)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ramyu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


  0%|          | 0/105 [00:00<?, ?it/s]

<pinecone_text.sparse.bm25_encoder.BM25Encoder at 0x1bb158b27d0>

In [3]:
import json

# Cast the fitted document-frequency table to plain Python numbers before it is
# serialized (the stored keys/values are presumably not JSON-native — TODO confirm).
doc_freq_indices = [int(term_id) for term_id in bm25.doc_freq]
doc_freq_values = [float(frequency) for frequency in bm25.doc_freq.values()]

# Snapshot of the fitted BM25 statistics and hyperparameters.
params = {
    "avgdl": bm25.avgdl,
    "n_docs": bm25.n_docs,
    "doc_freq": {"indices": doc_freq_indices, "values": doc_freq_values},
    "b": bm25.b,
    "k1": bm25.k1,
}

# Persist the snapshot next to the notebook.
with open("bm25_values_v3.json", "w") as f:
    json.dump(params, f)

In [4]:
from tqdm import tqdm

# Build one record per chunk: the sparse vector is encoded from the normalized
# text, while the metadata keeps the original (un-normalized) chunk text. The
# record id is the chunk's position in `texts`.
vectors_to_upsert = [
    {
        "id": str(position),
        "sparse_values": bm25.encode_documents(normalized_text),
        "metadata": {"text": original_text},
    }
    for position, (normalized_text, original_text) in enumerate(
        zip(tqdm(processed_docs), texts)
    )
]

# NOTE(review): the target namespace is "customers_v2" although the input and the
# exported BM25 file are both suffixed v3 — confirm this is the intended namespace.
index.upsert(vectors=vectors_to_upsert, namespace="customers_v2")

100%|██████████| 105/105 [00:00<00:00, 538.97it/s]


{'upserted_count': 105}