In [34]:
import re
import unicodedata
import pandas as pd
from pathlib import Path

# Text Cleaning and Normalization

In [35]:
# Regular expressions and lookup tables for the cleaning step.
#
# Every Arabic code point is written as an explicit \u escape. Literal combining
# marks inside a character class render on top of their neighbours and may be
# reordered by Unicode normalization (editors, exporters, copy/paste), which
# would silently change the ranges being matched.

# Arabic marks to strip: U+0610-U+061A, U+064B-U+065F and U+06D6-U+06ED.
DIACRITICS_RE = re.compile(r"[\u0610-\u061A\u064B-\u065F\u06D6-\u06ED]")

# Tatweel (kashida), the elongation character U+0640.
TATWEEL = "\u0640"

# Matches every character that is NOT kept: Arabic letters U+0621-U+063A and
# U+0641-U+064A, ASCII digits, ASCII letters, "@" and the space are allowed.
ALLOWED_CHARS_RE = re.compile(r"[^0-9A-Za-z@ \u0621-\u063A\u0641-\u064A]")

# Translation table mapping Arabic-Indic digits U+0660-U+0669 to "0"-"9".
ARAB2WEST_DIGITS = str.maketrans(
    "\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669",
    "0123456789",
)

In [36]:
# Hard-coded raw input messages used to exercise the pipeline below.
# They contain diacritics, tatweel, emoji, decorative symbols, Arabic-Indic
# digits, letters spread over separate lines, phone numbers and @handles.
RAW_SAMPLES = [
    """
_َ ُِٕ(~عذٌّٕاٍرًَ ٕٕطٌَّبًَية~)« Sَkَََ«lifَ  مٕ.عٕٕتٕمدَ)»٠꧂
0549837023
""",
    """ٓٓســ.ـكـِلـ.ــيـ.ـف
ٓٓ مــعـ.ـتــمد صــحـ.ـتــي ✅
ٓتاريـخ قـديـم تاريـخ جديد ✅
ٓاشـ ـعـار مــ ـرافق ✅
ٓتـ.ـقـ.ـريـــر طـبـي ✅
ٓسعر مميزز0560516730""",
    """♻️*سـ░ـكــ ░ـلَـ يـ░ــ ـفـ░ـ*♻️
✨ آجـ ـ░ـآزٍآتـ ـ░ـ ⭐مـ░ــ ـرٍضــ ░ـيـ░ـةّ
✨ رٍسـ░ـ⭐ـ ـمـ░ـيـ░ﻲ معتمد
✅ࡎ✔️ߺܒߺࡅ𐫥ߺߺى  لدفع بعد الانجاز 0562451912""",
    """س
         ك
         ل
      ي
ف
        ا
           ع
        ذ
     ا
 ر
ط
     ب
     ي
ة
خ
ا
 ص""",
    """🔴 للفائدة 🔴
أي طالب يبحث عن شرح لأي مادة طبية
حرفيًا أي مادة طبية ويبي شرح للمادة سواء يبي للمادة بالكامل أو بعض المحاضرات
أنصحكم بأكاديمية دوكاديمي
@DOCADEMYKSA
@DOCADEMY""",
]

Goal 1: Cleaning and normalizing Arabic text.

In [37]:
def clean_text(text: str) -> str:
    """Normalize a raw message into plain Arabic/ASCII text.

    Steps, in order:
      1. Unicode compatibility normalization (NFKC).
      2. Removal of Arabic diacritics (``DIACRITICS_RE``).
      3. Removal of the tatweel elongation character.
      4. Conversion of Arabic-Indic digits to ASCII digits.
      5. Replacement of every character outside the allow-list
         (``ALLOWED_CHARS_RE``) with a space.
      6. Collapsing of whitespace runs to one space and trimming the ends.

    Args:
        text: The raw message.

    Returns:
        The cleaned, single-line message (may be empty).
    """
    # NFKC rather than NFC: NFC leaves Arabic presentation forms (e.g. U+FEF2,
    # the final form of yeh) untouched, so the allow-list filter below would
    # delete those letters. NFKC folds them to the base letters first.
    text = unicodedata.normalize("NFKC", text)
    text = DIACRITICS_RE.sub("", text)  # Remove diacritics
    text = text.replace(TATWEEL, "")  # Remove Tatweel
    text = text.translate(ARAB2WEST_DIGITS)  # Convert Arabic digits to Western
    # Replace with a space (not "") so separators between words are preserved.
    text = ALLOWED_CHARS_RE.sub(" ", text)
    text = re.sub(r"\s+", " ", text).strip()  # Remove extra spaces
    return text

Goal 2: Reconstructing fragmented Arabic words.

In [38]:
def reconstruct_words(text: str) -> str:
    """Glue runs of single-letter Arabic tokens back into one word.

    The text is split on whitespace. Consecutive tokens that consist of exactly
    one Arabic letter are concatenated; any other token ends the current run
    and is kept unchanged. Tokens are re-joined with single spaces.

    Args:
        text: Whitespace-separated text.

    Returns:
        The text with each run of isolated letters merged into a single token.
    """
    lone_letter = re.compile(r"[ء-ي]")
    words = []
    pending = ""  # letters collected for the run currently being merged

    for piece in text.split():
        if lone_letter.fullmatch(piece):
            pending += piece
            continue
        # A regular token ends the run: emit the merged letters first.
        if pending:
            words.append(pending)
            pending = ""
        words.append(piece)

    # Emit a run that reaches the end of the text.
    if pending:
        words.append(pending)

    return " ".join(words)

Goal 3: Processing and generating the output file

In [39]:
def process_messages(messages):
    """Clean every message and rebuild its fragmented words.

    Args:
        messages: Iterable of raw message strings.

    Returns:
        A list with one processed string per input message, in input order.
    """
    results = []
    for raw in messages:
        normalized = clean_text(raw)
        results.append(reconstruct_words(normalized))
    return results

def save_output(messages, filename="cleaned_messages.txt"):
    """Write the messages to a UTF-8 text file, one per line.

    Args:
        messages: Iterable of strings; joined with newlines (no trailing one).
        filename: Destination path. Defaults to ``cleaned_messages.txt``.

    Prints the resolved path of the written file.
    """
    target = Path(filename)
    payload = "\n".join(messages)
    target.write_text(payload, encoding="utf-8")
    print(f"✅ Saved → {target.resolve()}")

# Run the cleaning pipeline on the bundled samples and write the result to
# the default output file.
if __name__ == "__main__":
    cleaned = process_messages(RAW_SAMPLES)
    save_output(cleaned)

✅ Saved → /content/cleaned_messages.txt


# Feature Engineering File

Goal 1: Define the regular expressions for feature extraction

In [40]:
# A run of nine or more digits that is not part of a longer digit run.
# Lookarounds are used instead of \b because Arabic letters are word
# characters: a number glued to a word (e.g. "مميزز0560516730") has no word
# boundary in front of its first digit, so a \b-based pattern misses it.
PHONE_NUMBER_RE = re.compile(r"(?<!\d)\d{9,}(?!\d)")

# "@" followed by one or more word characters.
TELEGRAM_USERNAME_RE = re.compile(r"@\w+")

# Keywords whose presence in a message counts towards the "ad" label.
# NOTE(review): "صحتى" ends with alef maqsura (ى) while the sample messages
# spell the word with yeh (ي) — confirm which spelling is intended.
AD_KEYWORDS = {
    "سكليف", "اعذار", "إجازات", "مرضية", "تقرير", "طبية", "معتمد", "صحتى",
    "تاريخ", "قديم", "جديد", "اشعار", "مرافق", "سعر", "مميز", "لدفع", "بعد", "الانجاز", "فوري", "خاص"
}

Goal 2: Feature Extraction Functions

In [41]:
def extract_phone_numbers(text: str) -> list:
    """Return all matches of ``PHONE_NUMBER_RE`` in ``text``, in order of appearance."""
    numbers = PHONE_NUMBER_RE.findall(text)
    return numbers

def extract_telegram_accounts(text: str) -> list:
    """Return all matches of ``TELEGRAM_USERNAME_RE`` in ``text``, in order of appearance."""
    handles = TELEGRAM_USERNAME_RE.findall(text)
    return handles

def count_ad_keywords(text: str) -> int:
    """Count occurrences of the ``AD_KEYWORDS`` entries in ``text``.

    Each keyword is counted as a substring (``str.count``), so it is also
    found inside longer words; the per-keyword counts are summed.
    """
    total = 0
    for keyword in AD_KEYWORDS:
        total += text.count(keyword)
    return total

def is_ad(text: str) -> bool:
    """Return True if ``text`` has a phone number, an @handle, or any ad keyword."""
    # Checked in the same order as a short-circuiting ``or`` chain.
    if extract_phone_numbers(text):
        return True
    if extract_telegram_accounts(text):
        return True
    return count_ad_keywords(text) > 0

Goal 3: Process the list of cleaned messages and build the DataFrame

In [42]:
def process_features(messages):
    """Build a feature table with one row per cleaned message.

    Args:
        messages: Iterable of cleaned message strings.

    Returns:
        A ``pandas.DataFrame`` with the columns ``cleaned_message``,
        ``phone_numbers`` and ``telegram_accounts`` (matches joined with
        ", "), ``num_ad_keywords`` and ``is_ad`` ("ad" or "not_ad").
        The columns are present even when ``messages`` is empty.
    """
    columns = [
        "cleaned_message",
        "phone_numbers",
        "telegram_accounts",
        "num_ad_keywords",
        "is_ad",
    ]
    data = []

    for msg in messages:
        phone_numbers = extract_phone_numbers(msg)
        telegram_accounts = extract_telegram_accounts(msg)
        num_keywords = count_ad_keywords(msg)
        ad_label = "ad" if is_ad(msg) else "not_ad"

        data.append({
            "cleaned_message": msg,
            "phone_numbers": ", ".join(phone_numbers),
            "telegram_accounts": ", ".join(telegram_accounts),
            "num_ad_keywords": num_keywords,
            "is_ad": ad_label
        })

    # Pin the schema explicitly: without it an empty input yields a frame with
    # no columns at all, and the exported CSV would have no header row.
    return pd.DataFrame(data, columns=columns)

Goal 4: Save the DataFrame to a CSV file

In [43]:
def save_dataframe(df, filename="processed_features.csv"):
    """Write ``df`` to a UTF-8 CSV file without the index column.

    Args:
        df: The DataFrame to export.
        filename: Destination path. Defaults to ``processed_features.csv``.

    Prints the resolved path of the written file.
    """
    df.to_csv(filename, encoding="utf-8", index=False)
    resolved = Path(filename).resolve()
    print(f"✅ Saved → {resolved}")

# Re-run the cleaning step on the bundled samples, extract the features and
# export them to the default CSV file.
if __name__ == "__main__":
    cleaned_messages = process_messages(RAW_SAMPLES)  # Get cleaned messages
    df = process_features(cleaned_messages)
    save_dataframe(df)

✅ Saved → /content/processed_features.csv
