This is the notebook where the examples were generated. Running it as-is will fail because it requires a Postgres database containing the Wikipedia embeddings, an HNSW index on those embeddings, and a Cohere API key.

In [1]:
import pandas as pd


def read_json(path: str) -> pd.DataFrame:
    """Load the JSON file at ``path`` into a DataFrame.

    Args:
        path: Location of the JSON file, passed straight to ``pd.read_json``.

    Returns:
        The parsed DataFrame, using pandas' default parsing options.
    """
    return pd.read_json(path)


def read_csv(path: str, index: str = None) -> pd.DataFrame:
    """Load a CSV file and discard columns that contain no data at all.

    Args:
        path: Location of the CSV file.
        index: Name of the column to use as the row index; ``None`` keeps
            the default positional index.

    Returns:
        The parsed DataFrame without the columns whose values are all missing.
    """
    loaded = pd.read_csv(path, index_col=index)
    # how="all" only removes a column when every one of its values is NaN.
    return loaded.dropna(axis=1, how="all")

In [2]:
# Arabic dictionary CSV, loaded through the read_csv helper (all-empty columns dropped).
ar_dict = read_csv("../data/arabic-dictionary/riyadh_dict.csv")
# Shared-task training set; read with pandas directly rather than the read_json helper.
train_df = pd.read_json("../data/shared-task/train.json", encoding="utf-8")

## Automatic feature engineering: adding a new feature to the dataset

Each word in the dictionary has a definition: a sentence that describes the meaning of the word. However, a word can have multiple meanings. For example, the word "أبدا" appears twice: the first time it means "never" and the second time it means "forever". So, we will add a new feature `examples` to the dataset that shows the word used with the intended meaning in real sentences.

## Using Wikipedia embeddings

In the first method, we will use a dataset containing text embeddings of the Arabic Wikipedia. The dataset is available at [this link](https://huggingface.co/datasets/Cohere/wikipedia-22-12-ar-embeddings). These embeddings are encoded using the [cohere.ai multilingual-22-12](https://txt.cohere.ai/multilingual/) embedding model.

In [4]:
import os
from sqlalchemy import Index, Text, create_engine, select, text
from sqlalchemy import Column, Integer
from sqlalchemy.orm import declarative_base, sessionmaker
from pgvector.sqlalchemy import Vector

# The database URL comes from the CONNECTION_STRING environment variable;
# os.environ.get returns None when the variable is unset.
engine = create_engine(os.environ.get("CONNECTION_STRING"))
# Session factory bound to the engine above.
Session = sessionmaker(bind=engine)
# Single module-level session used by the search calls further down.
# NOTE(review): a SQLAlchemy Session is not meant to be shared across threads —
# confirm before using this object from worker threads.
session = Session()

# Declarative base class for the ORM models defined below.
Base = declarative_base()


class MixinSearch:
    """Search helpers shared by mapped classes.

    ``vector_search`` requires the mapped class to expose an ``emb`` vector
    column. ``fulltext_search`` filters on a column named ``search``.
    NOTE(review): the visible ``Wiki`` model does not declare ``search``;
    presumably it is a tsvector column that exists only in the database —
    TODO confirm.
    """

    @classmethod
    def fulltext_search(cls, session, search_string, limit=20):
        """Return rows whose ``search`` column matches a web-style text query.

        Args:
            session: SQLAlchemy session used to run the query.
            search_string: Raw user query, interpreted by PostgreSQL's
                ``websearch_to_tsquery`` with the ``'arabic'`` configuration.
            limit: Maximum number of rows to return (default 20).

        Returns:
            A list of ``cls`` instances.
        """
        # Bind the query text as a parameter instead of formatting it into the
        # SQL string: interpolation broke on any quote character in the input
        # and allowed SQL injection.
        match_clause = text(
            "search @@ websearch_to_tsquery('arabic', :search_string)"
        ).bindparams(search_string=search_string)
        return session.query(cls).filter(match_clause).limit(limit).all()

    @classmethod
    def vector_search(cls, session, vector, limit=5):
        """Return the rows closest to ``vector`` by cosine distance.

        Args:
            session: SQLAlchemy session used to run the query.
            vector: Query embedding compared against ``cls.emb``.
            limit: Maximum number of rows to return (default 5).

        Returns:
            A list of ``cls`` instances ordered by increasing cosine distance.
        """
        nearest = select(cls).order_by(cls.emb.cosine_distance(vector)).limit(limit)
        return session.scalars(nearest).all()


class Wiki(MixinSearch, Base):
    """Wiki model.

    ORM mapping for the ``wiki_embeds`` table. Inherits the full-text and
    vector search class methods from ``MixinSearch``.
    """

    __tablename__ = "wiki_embeds"

    # Integer primary key.
    id = Column(Integer, primary_key=True)
    # Text content of the row (returned to callers as the example sentence).
    text = Column(Text)
    # 768-dimensional pgvector column; presumably the embedding of `text` — TODO confirm.
    emb = Column(Vector(768))

In [5]:
from datasets import load_dataset

# Open the Arabic Wikipedia embeddings dataset in streaming mode (rows are
# iterated lazily rather than loaded up front). Only the "train" split is
# requested, and an existing local cache is reused when present.
ar_wiki_dataset = load_dataset(
    "Cohere/wikipedia-22-12-ar-embeddings",
    streaming=True,
    trust_remote_code=True,
    download_mode="reuse_cache_if_exists",
    split="train",
)

Resolving data files:   0%|          | 0/24 [00:00<?, ?it/s]

In [6]:
import re
from typing import List
import cohere

# Shared Cohere client used by embed_text. No API key is passed explicitly;
# presumably it is picked up from the environment — TODO confirm.
co = cohere.Client()


def embed_text(texts: List[str]) -> List[float]:
    """Embed ``texts`` with Cohere and return the vector of the first text.

    Args:
        texts: Texts sent to the ``multilingual-22-12`` embedding model.

    Returns:
        The embedding of ``texts[0]``; embeddings of any further texts are
        computed by the service but discarded here.
    """
    return co.embed(texts=texts, model="multilingual-22-12").embeddings[0]


def find_matching_sentences(word: str, table, window_size=100) -> List[str]:
    """Find context snippets around ``word`` in the nearest rows of ``table``.

    The word is embedded, the closest rows are fetched through
    ``table.vector_search`` (using the module-level ``session``), and every
    whole-word occurrence of ``word`` in a row's text yields one snippet.

    Args:
        word: Word to look for. Empty or non-string values return ``[]``.
        table: Object exposing ``vector_search(session, vector)`` whose
            results have a ``text`` attribute (e.g. the ``Wiki`` model).
        window_size: Number of characters kept on each side of a match.

    Returns:
        A list of snippets, one per match, in result order. Empty when the
        search returns no rows or no row contains the word.
    """
    matching_sentences: List[str] = []

    if not word or not isinstance(word, str):
        return matching_sentences

    # embed_text takes a list of texts; passing the bare string was a bug.
    results = table.vector_search(session, embed_text([word]))

    # Compile once: the pattern is the same for every result row.
    word_pattern = re.compile(r"\b" + re.escape(word) + r"\b")

    for text_obj in results:
        sentence = text_obj.text
        if not sentence:
            # The text column is nullable; nothing to scan in an empty row.
            continue
        for match in word_pattern.finditer(sentence):
            match_start, match_end = match.span()
            # Clamp the window to the bounds of the sentence.
            context_start = max(0, match_start - window_size)
            context_end = min(len(sentence), match_end + window_size)
            matching_sentences.append(sentence[context_start:context_end])

    return matching_sentences

In [7]:
from concurrent.futures import ThreadPoolExecutor
import concurrent
from tqdm import tqdm


def fetch_and_process_row(index):
    """Build the record for one training row, with example sentences attached.

    The query ``"(<word>) <gloss>"`` is embedded and the nearest ``Wiki``
    rows are fetched by vector search.

    Args:
        index: Positional index of the row in ``train_df``.

    Returns:
        A dict with all of the row's fields plus an ``"examples"`` key holding
        the text of each retrieved row.
    """
    row = train_df.iloc[index]
    query = f"({row['word']}) {row['gloss']}"
    query_vector = embed_text([query])

    # This function runs on worker threads. A SQLAlchemy Session is not
    # thread-safe, so open a short-lived session per call from the shared
    # factory instead of using the module-level `session`.
    with Session() as thread_session:
        hits = Wiki.vector_search(thread_session, query_vector)
        # Read the text while the session is still open.
        matching_sentences = [hit.text for hit in hits]

    return {
        **row.to_dict(),
        "examples": matching_sentences,
    }


dict_range = range(train_df.shape[0])

# Results are keyed by row index so the final list can be emitted in row
# order; as_completed yields futures in completion order, which varies from
# run to run.
results_by_index = {}
# Row indices whose processing raised, kept so they can be retried or inspected.
failed_rows = []

# Fan the rows out over a small pool of worker threads.
with ThreadPoolExecutor(max_workers=10) as executor:
    # Remember which row each future belongs to.
    future_to_index = {
        executor.submit(fetch_and_process_row, i): i for i in dict_range
    }

    # Iterate over futures as they complete so the progress bar advances smoothly.
    for future in tqdm(
        concurrent.futures.as_completed(future_to_index),
        total=len(future_to_index),
        desc="Processing Rows",
    ):
        row_index = future_to_index[future]
        try:
            results_by_index[row_index] = future.result()
        except Exception as e:
            # Best effort: one failing row must not abort the whole run,
            # but record it instead of dropping it silently.
            failed_rows.append(row_index)
            print(f"Row processing generated an exception: {e}")

# Successful records, in the same order as the rows of train_df.
matching_sentences_dict = [results_by_index[i] for i in sorted(results_by_index)]

if failed_rows:
    print(f"{len(failed_rows)} row(s) failed; first indices: {sorted(failed_rows)[:10]}")

Processing Rows:   0%|          | 94/31372 [00:05<31:48, 16.39it/s] 


In [None]:
# Write one JSON object per row. The previous call passed index=False with the
# default orient ("columns"), which pandas rejects with a ValueError;
# orient="records" never includes the index, so the flag is not needed.
# force_ascii=False keeps the Arabic text readable instead of \u-escaped.
pd.DataFrame(matching_sentences_dict).to_json(
    "../data/shared-task/train_with_examples.json",
    orient="records",
    force_ascii=False,
)

In [None]:
# Display the first collected record as a quick sanity check.
matching_sentences_dict[0]

{'word_id': 1,
 'Id': 9698,
 'Word': 'تَأْبِيد',
 'UnDiacWord': 'تأبيد',
 'mainPOS': 'اسم',
 'PoS': 'اسم معنى',
 'Definition': 'تَأْبِيد الأمرِ: تخليده وإبقاؤه مدى الدهر.',
 'examples': ['التَّودُّد لغةً: من الوُدِّ، والوُدُّ مصدر الموَدَّة. والوُدُّ هو الحُبُّ ويكون في جميع مداخل الخير، والتواد التحاب.',
  '- نص - (إذا سرق الحاكم ثيران الشعب وأخفاها أو عاث بحقولهم وزرعهم أو أعطاها إلى الأجنبي فإن أدّاد سيكون له بالمرصاد، وإذا استولى على غنمهم فإن أدد ساقي الأرض والسماء سيبيد ماشيته في مراعيها وسيجعلها طعامًا للشمس).',
  'قال أبو حاتم: (الواجب على العاقل أنْ يتحبَّب إلى النَّاس بلزوم حسن الخلق، وترك سوء الخلق؛ لأنَّ الخلق الحسن يذيب الخطايا كما تذيب الشَّمس الجليد، وإن الخلق السَّيئ ليفسد العمل كما يفسد الخلُّ العسل، وقد تكون في الرَّجل أخلاق كثيرة صالحة كلُّها، وخلق سيئ، فيفسد الخلق السَّيئ الأخلاق الصَّالحة كلَّها).',
  'فأيَّده النَّبي صلى الله عليه وسلم على تودُّدِه إليهم، وإن لم يجد منهم مقابلًا لما يقوم به، إلَّا الإساءة إليه.',
  'ولم يكن استعمال هذا المصطلح القرآني في التصوف إل