In [2]:
import pandas as pd

# utils


def read_json(path: str) -> pd.DataFrame:
    """Load the JSON document at ``path`` into a DataFrame using ``pd.read_json``."""
    return pd.read_json(path)


def read_csv(path: str, index: str = None) -> pd.DataFrame:
    """Read a CSV file and discard every column that holds no data at all.

    Args:
        path: Location of the CSV file.
        index: Forwarded to ``pd.read_csv`` as ``index_col``; ``None`` keeps
            the default positional index.

    Returns:
        The parsed frame without its all-NaN columns.
    """
    frame = pd.read_csv(path, index_col=index)
    return frame.dropna(axis="columns", how="all")

In [3]:
# Load the dictionary CSV (riyadh_dict.csv); read_csv drops columns that are entirely empty.
ar_dict = read_csv("../data/arabic-dictionary/riyadh_dict.csv")

## Automatic feature engineering: adding a new feature to the dataset

Each word in the dictionary has a definition: a sentence that describes the meaning of the word. However, a word can have multiple meanings. For example, the word "أبدا" appears twice: the first time it means "never" and the second time it means "forever". So we will add a new feature, `example`, to the dataset to show the meaning of each word in a real sentence.

## Using Wikipedia embeddings

In the first method, we will use a dataset containing precomputed embeddings of the Arabic Wikipedia. The dataset is available at [this link](https://huggingface.co/datasets/Cohere/wikipedia-22-12-ar-embeddings). The embeddings were produced with the [cohere.ai multilingual-22-12](https://txt.cohere.ai/multilingual/) embedding model.

In [6]:
import os
from sqlalchemy import Index, Text, create_engine, select, text
from sqlalchemy import Column, Integer
from sqlalchemy.orm import declarative_base, sessionmaker
from pgvector.sqlalchemy import Vector

# The database URL is read from the CONNECTION_STRING environment variable.
# os.environ.get returns None when the variable is unset, and that None is
# passed straight to create_engine.
engine = create_engine(os.environ.get("CONNECTION_STRING"))
# Session factory bound to the engine, plus one module-level session instance
# that later cells pass to the search helpers.
Session = sessionmaker(bind=engine)
session = Session()

# Declarative base class for the ORM models defined in this notebook.
Base = declarative_base()


class MixinSearch:
    """Search classmethods shared by mapped classes.

    ``fulltext_search`` emits raw SQL against a column named ``search`` and
    ``vector_search`` orders by ``cls.emb``, so the class mixing this in must
    provide both.
    NOTE(review): ``search`` is not declared on the ``Wiki`` model in this
    notebook — presumably it is a tsvector column that exists only in the
    database table; confirm.
    """

    @classmethod
    def fulltext_search(cls, session, search_string, limit=20):
        """Return up to ``limit`` rows whose ``search`` column matches the query.

        Args:
            session: SQLAlchemy session used to run the query.
            search_string: User-style search text handed to
                ``websearch_to_tsquery('arabic', ...)``.
            limit: Maximum number of rows to return (defaults to 20).

        Returns:
            A list of ``cls`` instances.
        """
        # Pass the search text as a bound parameter rather than formatting it
        # into the SQL: interpolation allowed SQL injection and broke on any
        # query containing a single quote.
        match_clause = text(
            "search @@ websearch_to_tsquery('arabic', :search_string)"
        ).bindparams(search_string=search_string)
        return session.query(cls).filter(match_clause).limit(limit).all()

    @classmethod
    def vector_search(cls, session, vector, limit=5):
        """Return the ``limit`` rows whose ``emb`` is closest to ``vector``.

        Args:
            session: SQLAlchemy session used to run the query.
            vector: Query embedding compared against ``cls.emb`` by cosine
                distance.
            limit: Maximum number of rows to return (defaults to 5).

        Returns:
            A list of ``cls`` instances ordered by ascending cosine distance.
        """
        nearest = select(cls).order_by(cls.emb.cosine_distance(vector)).limit(limit)
        return session.scalars(nearest).all()


class Wiki(MixinSearch, Base):
    """ORM model mapped to the ``wiki_embeds`` table.

    Inherits the ``fulltext_search`` / ``vector_search`` classmethods from
    ``MixinSearch``.
    """

    __tablename__ = "wiki_embeds"

    # Integer primary key.
    id = Column(Integer, primary_key=True)
    # Stored text; later cells read this attribute from the search results.
    text = Column(Text)
    # pgvector column declared with 768 dimensions; vector_search orders by it.
    emb = Column(Vector(768))

In [None]:
from datasets import load_dataset

# Load the "train" split of the Cohere Arabic Wikipedia embeddings dataset
# with streaming enabled.
# NOTE(review): ar_wiki_dataset is not referenced by any other cell shown in
# this notebook — presumably it was used to populate the wiki_embeds table;
# confirm.
ar_wiki_dataset = load_dataset(
    "Cohere/wikipedia-22-12-ar-embeddings",
    streaming=True,
    trust_remote_code=True,
    download_mode="reuse_cache_if_exists",
    split="train",
)

In [7]:
import re
from typing import List
import cohere

# Cohere client used by embed_text; it is created without an explicit API key.
# NOTE(review): presumably the key is picked up from the environment — confirm.
co = cohere.Client()


def embed_text(texts: List[str]) -> List[float]:
    """Embed text with Cohere's ``multilingual-22-12`` model and return one vector.

    Args:
        texts: Texts to send to ``co.embed``. A single bare string is also
            accepted and treated as a one-element list, so it is never passed
            to the API in place of a list.

    Returns:
        The embedding of the FIRST text only; embeddings of any further
        texts in the request are discarded.
    """
    if isinstance(texts, str):
        texts = [texts]

    response = co.embed(texts=texts, model="multilingual-22-12")
    return response.embeddings[0]


def find_matching_sentences(word: str, table, window_size=100) -> List[str]:
    """Collect text windows around whole-word occurrences of ``word``.

    The word is embedded, the nearest rows are fetched through
    ``table.vector_search`` (using the module-level ``session``), and every
    whole-word match inside each row's ``text`` is returned with up to
    ``window_size`` characters of context on either side.

    Args:
        word: Word to look for. Empty or non-string values yield ``[]``.
        table: Class exposing ``vector_search(session, vector)`` whose results
            have a ``text`` attribute.
        window_size: Number of characters kept before and after each match.

    Returns:
        One context string per match; empty when nothing matches.
    """
    matching_sentences: List[str] = []

    if not word or not isinstance(word, str):
        return matching_sentences

    # embed_text is annotated to take a list of texts, so wrap the single word.
    results = table.vector_search(session, embed_text([word]))

    # Compile once; the pattern does not change between rows.
    word_pattern = re.compile(r"\b" + re.escape(word) + r"\b")

    for text_obj in results:
        sentence = text_obj.text
        if not sentence:
            # Rows without text cannot contain a match.
            continue
        for match in word_pattern.finditer(sentence):
            start, end = match.span()
            context_start = max(0, start - window_size)
            context_end = min(len(sentence), end + window_size)
            matching_sentences.append(sentence[context_start:context_end])

    return matching_sentences

In [8]:
from concurrent.futures import ThreadPoolExecutor
import concurrent
from tqdm import tqdm


def fetch_and_process_row(index):
    """Attach the nearest Wikipedia texts to one dictionary row.

    Args:
        index: Positional index of the row in ``ar_dict``.

    Returns:
        The row as a dict with an extra ``"examples"`` key holding the ``text``
        of each row returned by ``Wiki.vector_search`` for the query
        ``"(<UnDiacWord>) <Definition>"``.
    """
    row = ar_dict.iloc[index]
    query = f"({row['UnDiacWord']}) {row['Definition']}"
    query_vector = embed_text([query])

    # This function is submitted to a ThreadPoolExecutor. A SQLAlchemy Session
    # must not be shared between threads, so each call opens its own
    # short-lived session from the factory instead of using the module-level one.
    with Session() as thread_session:
        hits = Wiki.vector_search(thread_session, query_vector)
        # Read the text while the session is still open.
        matching_sentences = [hit.text for hit in hits]

    return {
        **row.to_dict(),
        "examples": matching_sentences,
    }


matching_sentences_dict = []
# Positional indices of rows whose processing raised, so they can be retried
# or inspected instead of disappearing silently.
failed_rows = []
dict_range = range(ar_dict.shape[0])

# Run the per-row lookups on a pool of worker threads.
with ThreadPoolExecutor(max_workers=10) as executor:
    # Submit every row and remember which row each future belongs to.
    futures = [executor.submit(fetch_and_process_row, i) for i in dict_range]
    future_to_index = dict(zip(futures, dict_range))

    completed_rows = {}

    # as_completed yields futures in completion order, which drives the
    # progress bar; results are keyed by row index so order can be restored.
    for future in tqdm(
        concurrent.futures.as_completed(futures),
        total=len(futures),
        desc="Processing Rows",
    ):
        row_index = future_to_index[future]
        try:
            completed_rows[row_index] = future.result()
        except Exception as e:
            failed_rows.append(row_index)
            print(f"Row {row_index} processing generated an exception: {e}")

# Store results in dictionary order rather than in whatever order the worker
# threads happened to finish.
matching_sentences_dict = [completed_rows[i] for i in sorted(completed_rows)]

Processing Rows: 100%|██████████| 95207/95207 [1:08:20<00:00, 23.22it/s]


In [9]:
# Write the collected records (row columns plus "examples") to CSV without the index column.
pd.DataFrame(matching_sentences_dict).to_csv("examples_cohere_embeds.csv", index=False)

In [11]:
# Inspect the first collected record.
matching_sentences_dict[0]

{'word_id': 1,
 'Id': 9698,
 'Word': 'تَأْبِيد',
 'UnDiacWord': 'تأبيد',
 'mainPOS': 'اسم',
 'PoS': 'اسم معنى',
 'Definition': 'تَأْبِيد الأمرِ: تخليده وإبقاؤه مدى الدهر.',
 'examples': ['التَّودُّد لغةً: من الوُدِّ، والوُدُّ مصدر الموَدَّة. والوُدُّ هو الحُبُّ ويكون في جميع مداخل الخير، والتواد التحاب.',
  '- نص - (إذا سرق الحاكم ثيران الشعب وأخفاها أو عاث بحقولهم وزرعهم أو أعطاها إلى الأجنبي فإن أدّاد سيكون له بالمرصاد، وإذا استولى على غنمهم فإن أدد ساقي الأرض والسماء سيبيد ماشيته في مراعيها وسيجعلها طعامًا للشمس).',
  'قال أبو حاتم: (الواجب على العاقل أنْ يتحبَّب إلى النَّاس بلزوم حسن الخلق، وترك سوء الخلق؛ لأنَّ الخلق الحسن يذيب الخطايا كما تذيب الشَّمس الجليد، وإن الخلق السَّيئ ليفسد العمل كما يفسد الخلُّ العسل، وقد تكون في الرَّجل أخلاق كثيرة صالحة كلُّها، وخلق سيئ، فيفسد الخلق السَّيئ الأخلاق الصَّالحة كلَّها).',
  'فأيَّده النَّبي صلى الله عليه وسلم على تودُّدِه إليهم، وإن لم يجد منهم مقابلًا لما يقوم به، إلَّا الإساءة إليه.',
  'ولم يكن استعمال هذا المصطلح القرآني في التصوف إل

In [None]:
from typing import List
import ast


def generate_context(entries: List[dict], max_examples: int = 7) -> List[str]:
    """Build one LLM prompt per dictionary entry.

    Each prompt is a fixed instruction header followed by the entry's word,
    definition, POS, main POS and its first ``max_examples`` candidate examples.

    Args:
        entries: Dicts with the keys ``word``, ``definition``, ``pos``,
            ``mainـpos`` and ``examples``. ``examples`` may be a list/tuple or
            the string form of a Python list literal.
        max_examples: Maximum number of candidate examples included per prompt.

    Returns:
        A list of prompt strings, one per entry, in input order.

    Raises:
        KeyError: If an entry lacks one of the keys above.
        ValueError, SyntaxError: If a string ``examples`` value is not a valid
            Python literal.
    """
    # The instruction header is the same for every entry, so build it once.
    # Its text, including the indentation inside the literal, is part of the
    # prompt and is kept exactly as before.
    instructions = """
        You are a linguist working on a project to enrich a dictionary of Arabic words with examples.

        You've been given a list of words and their definitions, each word has a list of generated examples from a large corpus of Arabic text. Look for semantically close examples for each word based on its definition that helps the reader understand the meaning of the word. 
        Differentiate between the different POS and meanings of the word.

        Your output must match the following format, do not change it:
        word: example1, example2, ...

        Do not put any other titles or explanations. Your output number of lines must match the number of words. Commit to the words given to you, don't go beyond or provide any other words.

        Example:
        إبرة: إبرة الحقن هي إبرة مجوفة تستخدم عادة مع المحقن لحقن مادة في الجسم

        Input starts here:

        """

    contexts = []
    for entry in entries:
        raw_examples = entry["examples"]
        if isinstance(raw_examples, (list, tuple)):
            # Already parsed (e.g. records that never went through CSV).
            examples = raw_examples
        else:
            # String form of a list as stored in a file; literal_eval parses it
            # without executing code.
            examples = ast.literal_eval(raw_examples)
        examples = examples[:max_examples]

        # NOTE(review): the 'mainـpos' key contains U+0640 (Arabic tatweel), not
        # an ASCII underscore. It is left untouched because the input column
        # name is not visible here — confirm it matches the data.
        details = f"word: {entry['word']}\ndefinition: {entry['definition']}\nPOS: {entry['pos']}\nMain POS: {entry['mainـpos']}\nexamples: {examples}\n\n"

        contexts.append(instructions + details)

    return contexts

In [None]:
# Connect to the Google Generative AI (Gemini) API
import os
import google.generativeai as genai

# The API key is read from the GOOGLE_API_KEY environment variable
# (os.environ.get yields None when it is unset).
genai.configure(api_key=os.environ.get("GOOGLE_API_KEY"))

# Print the name of every listed model that supports "generateContent".
for m in genai.list_models():
    if "generateContent" in m.supported_generation_methods:
        print(m.name)

In [None]:
# Model handle used for the generate_content calls in the generation loop below.
model = genai.GenerativeModel("gemini-1.0-pro-latest")

In [None]:
# Convert `examples` to a list with one dict per row.
# NOTE(review): `examples` is not defined in any cell shown in this notebook,
# so this cell fails on a fresh kernel. Presumably it is a DataFrame loaded
# from the saved examples file with columns word / definition / pos /
# mainـpos / examples — confirm and add the loading cell.
examples_dict = examples.to_dict(orient="records")

In [None]:
def chunker(seq, size):
    """Lazily yield consecutive slices of ``seq`` holding at most ``size`` items.

    The last slice is shorter when ``len(seq)`` is not a multiple of ``size``.
    """
    # Computing the offsets here (not inside the generator) keeps argument
    # errors surfacing at call time.
    offsets = range(0, len(seq), size)
    return (seq[start : start + size] for start in offsets)

In [None]:
# Process the entries in chunks of 100 and write one JSON file per chunk, so
# partial progress is kept on disk if a later chunk fails.
for i, group in enumerate(chunker(examples_dict, 100)):
    examples_dict_modified = []

    for entry in group:
        try:
            # ast.literal_eval parses the stored list literal without executing
            # arbitrary code, which eval() on file contents would do.
            if ast.literal_eval(entry["examples"]) != []:
                # Build the prompt only when there is something to send, and
                # inside the try so a malformed entry is marked, not fatal.
                prompt = generate_context([entry])[0]
                response = model.generate_content(
                    prompt, generation_config={"temperature": 0.1}
                )
                # Keep whatever follows the leading "word:" label.
                entry["gemini_example"] = " ".join(response.text.split(":")[1:])
                print(f"{entry['word']} :: {entry['gemini_example']}")
            else:
                entry["gemini_example"] = "empty"
        except Exception as e:
            # Mark the entry and carry on. The request is NOT re-sent here:
            # a second call could raise again outside any handler and abort
            # the whole run, and its result was never used.
            print(f"rejected: {e}")
            entry["gemini_example"] = "rejected"

        examples_dict_modified.append(entry)

        # No delay is applied between requests; add a sleep here if the API
        # starts rate limiting.

    # orient="records" never writes the index, so index=False is unnecessary;
    # older pandas releases reject that combination.
    pd.DataFrame(examples_dict_modified).to_json(
        f"examples_modified_{i}.json", orient="records", force_ascii=False
    )