In [1]:
import pandas as pd

# utils


def read_json(path: str) -> pd.DataFrame:
    """Load a JSON file into a DataFrame.

    Args:
        path: Location of the JSON file, passed straight to ``pd.read_json``.

    Returns:
        The parsed DataFrame.
    """
    return pd.read_json(path)


def read_csv(path: str, index: str = None) -> pd.DataFrame:
    """Load a CSV file and discard columns that contain no data at all.

    Args:
        path: Location of the CSV file.
        index: Column to use as the DataFrame index; forwarded as
            ``index_col``. ``None`` keeps the default integer index.

    Returns:
        The parsed DataFrame without any column whose values are all missing.
    """
    raw = pd.read_csv(path, index_col=index)
    # how="all": a column is removed only when every one of its values is NaN.
    return raw.dropna(axis=1, how="all")

In [2]:
# Load the dictionary CSV through the read_csv helper, which also drops
# columns that are entirely empty.
ar_dict = read_csv("../data/arabic-dictionary/riyadh_dict.csv")

## Automatic feature engineering: adding a new feature to the dataset

Each word in the dictionary has a definition: a sentence that describes the meaning of the word. However, a word can have multiple meanings. For example, the word "أبدا" appears twice: in its first entry it means "forever" (always), and in its second entry it means "never". So, we will add a new feature, `example`, to the dataset to illustrate the meaning of each word in a real sentence.

In [3]:
from datasets import load_dataset

# Load the wikipedia dataset for Arabic, no splits required.
# "20231101.ar" is the configuration name handed to load_dataset
# (presumably the 2023-11-01 Arabic dump — confirm against the dataset card).
# NOTE(review): trust_remote_code=True is passed through as-is; confirm it is
# actually required for this dataset before keeping it.
ar_wiki_dataset = load_dataset(
    "wikimedia/wikipedia",
    "20231101.ar",
    trust_remote_code=True,
    download_mode="reuse_cache_if_exists",
)

# Only the "train" split is read; it is converted to a pandas DataFrame that
# the later cells insert into the database.
ar_wiki_dataset_pd = ar_wiki_dataset["train"].to_pandas()

Generating train split:   0%|          | 0/1219201 [00:00<?, ? examples/s]

In [14]:
from sqlalchemy import DDL, Text, create_engine, func, text
from sqlalchemy import Column, Integer
from sqlalchemy.orm import declarative_base, sessionmaker
from sqlalchemy.dialects.postgresql import TSVECTOR

# Connection URL for a PostgreSQL server on localhost:5432
# (user "mais", database "postgres", no password in the URL).
CON_STRING = "postgresql://mais@localhost:5432/postgres"
engine = create_engine(CON_STRING)

# Declarative base class that the ORM model below inherits from.
Base = declarative_base()

class MixinSearch:
    """Mixin that adds a PostgreSQL full-text search helper to a mapped class.

    The generated SQL filters on a column named ``search``, so the class that
    uses this mixin must map to a table exposing such a tsvector column.
    """

    @classmethod
    def fulltext_search(cls, session, search_string, limit=5):
        """Return rows of ``cls`` whose ``search`` vector matches the query.

        Args:
            session: SQLAlchemy session used to run the query.
            search_string: Free-text query, interpreted by PostgreSQL's
                ``websearch_to_tsquery`` with the ``'arabic'`` configuration.
            limit: Maximum number of rows to return (defaults to 5).

        Returns:
            A list of at most ``limit`` instances of ``cls``.
        """
        # Pass the query text as a bound parameter rather than formatting it
        # into the SQL: a quote character in the word can then neither break
        # the statement nor inject SQL.
        condition = text(
            "search @@ websearch_to_tsquery('arabic', :search_string)"
        ).bindparams(search_string=search_string)
        return session.query(cls).filter(condition).limit(limit).all()


class Wiki(MixinSearch, Base):
    """ORM model for the ``wiki`` table.

    Inherits ``fulltext_search`` from ``MixinSearch``, which filters on the
    ``search`` column declared below.
    """

    __tablename__ = "wiki"
    id = Column(Integer, primary_key=True)
    text = Column(Text)

    # The tsvector column and its GIN index are not created by this model;
    # they should be added manually once the table exists, with:
    # ALTER TABLE wiki ADD search tsvector GENERATED ALWAYS AS (to_tsvector('arabic', text)) STORED;
    # CREATE INDEX search_idx ON wiki USING GIN (search);
    search = Column(TSVECTOR)

# Session factory bound to the engine, plus one module-level session that the
# lookup code in a later cell reuses.
Session = sessionmaker(bind=engine)
session = Session()

In [16]:
# (rows, columns) of the Wikipedia DataFrame.
ar_wiki_dataset_pd.shape

(1219201, 4)

In [10]:
# Insert the data into the database
from tqdm import tqdm

# Slice the frame into batches of 100000 rows so tqdm can report progress
# once per batch; each batch is then appended to the table.
rows_per_batch = 100000
batch_starts = range(0, ar_wiki_dataset_pd.shape[0], rows_per_batch)
chunks = [ar_wiki_dataset_pd[start:start + rows_per_batch] for start in batch_starts]

to_sql_options = dict(con=engine, if_exists="append", index=False, chunksize=10000)
for chunk in tqdm(chunks):
    chunk.to_sql(Wiki.__tablename__, **to_sql_options)

100%|██████████| 13/13 [01:43<00:00,  7.97s/it]


In [5]:
# Preview the first rows of the dictionary.
ar_dict.head()

Unnamed: 0,word_id,Id,Word,UnDiacWord,mainPOS,PoS,Definition
0,1,9698,تَأْبِيد,تأبيد,اسم,اسم معنى,تَأْبِيد الأمرِ: تخليده وإبقاؤه مدى الدهر.
1,2,9699,مُؤَبَّد,مؤبد,صفة,صفة مفعول,غير منهي، دائم ومستمر.
2,3,9700,أَبَد,أبد,اسم,اسم معنى,زمن طويل غير محدود.
3,4,9701,أَبَدًا,أبدا,حرف (أداة),اسم مبهم,دومًا، طوال الزمن.
4,4,9702,أَبَدًا,أبدا,حرف (أداة),اسم مبهم,مُطلقًا‏، تستخدم للنفي المطلق.


In [21]:
import re
from typing import List
from tqdm import tqdm

# Close the module-level session before the lookup loop; find_matching_sentences
# below still queries through this same `session` object.
# NOTE(review): presumably meant to release a connection/transaction left over
# from earlier cells — confirm.
session.close()

def find_matching_sentences(word: str, table, window_size=100) -> List[str]:
    """Collect text snippets surrounding each whole-word hit of ``word``.

    Rows are fetched with ``table.fulltext_search`` using the module-level
    ``session``; every row's ``text`` is then scanned for ``word`` delimited
    by regex word boundaries.

    Args:
        word: Word to look for. Empty or non-string values yield ``[]``.
        table: Class exposing ``fulltext_search(session, word)`` whose results
            carry a ``text`` attribute.
        window_size: Number of characters kept on each side of a hit.

    Returns:
        One snippet per occurrence, in the order rows and hits are found.
    """
    # Nothing to search for: missing values and empty strings end up here.
    if not (word and isinstance(word, str)):
        return []

    whole_word = re.compile(r"\b" + re.escape(word) + r"\b")
    snippets: List[str] = []

    for row in table.fulltext_search(session, word):
        body = row.text
        # Clamp each window to the bounds of the text.
        snippets.extend(
            body[max(0, hit.start() - window_size):min(len(body), hit.end() + window_size)]
            for hit in whole_word.finditer(body)
        )

    return snippets


# Build one record per dictionary row, attaching the Wikipedia contexts found
# for that row's undiacritized word.
matching_sentences_dict = []
dict_range = range(ar_dict.shape[0])

for i in tqdm(dict_range):
    row = ar_dict.iloc[i]
    # The search uses the undiacritized spelling of the word.
    word = row["UnDiacWord"]
    matching_sentences = find_matching_sentences(word, Wiki, 1000)
    matching_sentences_dict.append(
        {
            "id": row["Id"],
            "word_id": row["word_id"],
            # Diacritized form; previously this repeated the undiacritized word.
            "word": row["Word"],
            "undiac_word": row["UnDiacWord"],
            "pos": row["PoS"],
            # Plain ASCII underscore; the key used to contain an Arabic tatweel (U+0640).
            "main_pos": row["mainPOS"],
            "definition": row["Definition"],
            "examples": matching_sentences,
        }
    )

 29%|██▊       | 27299/95207 [02:58<05:41, 198.82it/s]

In [None]:
# Persist the collected records to examples.csv, without the DataFrame index.
pd.DataFrame(matching_sentences_dict).to_csv("examples.csv", index=False)

In [None]:
from typing import List


def generate_context(entries: List[dict]) -> str:
    """Build the example-extraction prompt for a batch of dictionary entries.

    Args:
        entries: Dicts that each provide the keys ``'word'``,
            ``'definition'`` and ``'paragraph'``.

    Returns:
        The fixed instruction header followed by one
        ``word / definition / paragraph`` block per entry.

    Raises:
        KeyError: If an entry lacks one of the three required keys.
    """
    # Written line by line so the trailing spaces that end some instruction
    # lines stay visible; the resulting text is unchanged.
    header = (
        "\n"
        "    The following is a list of Arabic paragraphs from Wikipedia, \n"
        "    each paragraph with an accompaying word and definition from the dictionary. \n"
        "    Extract a sentence to use as an example for each word.\n"
        "\n"
        "    Your output must match the following format:\n"
        "    word: sentence\n"
        "\n"
        "    Do not put any other titles or explanations. \n"
        "    Your output number of lines must match the number of words.\n"
        "    Commit to the words given below, don't go beyond or provide any other words.\n"
        "\n"
        "    Example:\n"
        "    إبرة: إبرة الحقن هي إبرة مجوفة تستخدم عادة مع المحقن لحقن مادة في الجسم\n"
        "\n"
        "    Input starts here:\n"
        "    "
    )

    # Each label now prints its own field; the definition and paragraph
    # values used to be swapped.
    blocks = [
        f"word: {entry['word']}\n"
        f"definition: {entry['definition']}\n"
        f"paragraph: {entry['paragraph']}\n\n"
        for entry in entries
    ]

    return header + "".join(blocks)


# Try the prompt builder on a single hand-written entry.
result = generate_context(
    [
        {
            "word": "إبرة",
            "definition": "إبرة الحقن هي إبرة مجوفة تستخدم عادة مع المحقن لحقن مادة في الجسم",
            # generate_context reads the 'paragraph' key; without it the call
            # raises KeyError.
            "paragraph": "إبرة الحقن هي إبرة مجوفة تستخدم عادة مع المحقن لحقن مادة في الجسم",
            "examples": ["1", "2"],
        }
    ]
)

result

"\n    The following is a list of Arabic paragraphs from Wikipedia, \n    each paragraph with an accompaying word and definition from the dictionary. \n    Extract a sentence to use as an example for each word.\n\n    Your output must match the following format:\n    word: sentence\n\n    Do not put any other titles or explanations. \n    Your output number of lines must match the number of words.\n    Commit to the words given below, don't go beyond or provide any other words.\n\n    Example:\n    إبرة: إبرة الحقن هي إبرة مجوفة تستخدم عادة مع المحقن لحقن مادة في الجسم\n\n    Input starts here:\n    word: إبرة\ndefinition: إبرة الحقن هي إبرة مجوفة تستخدم عادة مع المحقن لحقن مادة في الجسم\nparagraph: إبرة الحقن هي إبرة مجوفة تستخدم عادة مع المحقن لحقن مادة في الجسم\n\nword: ماء\ndefinition: الماء هو سائل شفاف عديم اللون والطعم والرائحة\nparagraph: الماء هو سائل شفاف عديم اللون والطعم والرائحة\n\nword: مدرسة\ndefinition: المدرسة هي مؤسسة تعليمية تقوم بتعليم الطلاب\nparagraph: المدرسة هي 