In [None]:
# Mount Google Drive at /content/drive so later cells can read files stored there.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install datasets



In [None]:
!pip install pyarabic

Collecting pyarabic
  Downloading PyArabic-0.6.15-py3-none-any.whl.metadata (10 kB)
Downloading PyArabic-0.6.15-py3-none-any.whl (126 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/126.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m122.9/126.4 kB[0m [31m3.6 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.4/126.4 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyarabic
Successfully installed pyarabic-0.6.15


In [None]:
!pip install python-Levenshtein

Collecting python-Levenshtein
  Downloading python_levenshtein-0.27.1-py3-none-any.whl.metadata (3.7 kB)
Collecting Levenshtein==0.27.1 (from python-Levenshtein)
  Downloading levenshtein-0.27.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.6 kB)
Collecting rapidfuzz<4.0.0,>=3.9.0 (from Levenshtein==0.27.1->python-Levenshtein)
  Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading python_levenshtein-0.27.1-py3-none-any.whl (9.4 kB)
Downloading levenshtein-0.27.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (161 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m161.7/161.7 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m41.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages:

In [None]:
import torch
import re
import pandas as pd
from collections import Counter
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForMaskedLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from torch.optim import AdamW
from torch.utils.data import DataLoader
from tqdm import tqdm

In [None]:
# Initialize model and tokenizer.
# Prefer a checkpoint previously saved under /content/my_model; when it cannot
# be loaded, fall back to the public MARBERT weights.
try:
    tokenizer = AutoTokenizer.from_pretrained("/content/my_model")
    model = AutoModelForMaskedLM.from_pretrained("/content/my_model")
    print("Loaded fine-tuned model from /content/my_model")
except Exception as load_error:
    # `except Exception` keeps the best-effort fallback but, unlike a bare
    # `except:`, does not swallow KeyboardInterrupt / SystemExit. The reason is
    # printed so a broken checkpoint is not silently ignored.
    print(f"Using pretrained model (could not load /content/my_model: {load_error})")
    model_name = "UBC-NLP/MARBERT"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForMaskedLM.from_pretrained(model_name)

In [None]:
# Select the GPU when CUDA is available (CPU otherwise) and place the model on it.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model = model.to(device)

# **Data preprocessing functions**

In [None]:
def preprocess(sentence: str) -> str:
    """Normalize raw text into space-separated tokens from the Arabic Unicode block.

    Steps, in order:
      1. Replace the alef variants (أ, إ, آ) with a bare alef (ا).
      2. Delete every character that is neither whitespace nor inside the
         range U+0600–U+06FF.
      3. Collapse each whitespace run to one space and trim both ends.
    """
    text = sentence
    for alef_variant in ('أ', 'إ', 'آ'):
        text = text.replace(alef_variant, 'ا')
    arabic_only = re.sub(r'[^\u0600-\u06FF\s]', '', text)
    return re.sub(r'\s+', ' ', arabic_only).strip()


def data_vocab(dataframe, min_freq=3):
    """Count whitespace-separated tokens in ``dataframe['text']`` and keep the frequent ones.

    Returns a dict mapping every token that occurs at least ``min_freq`` times
    to its number of occurrences.
    """
    counts = Counter(
        token
        for text in dataframe['text']
        for token in text.split()
    )
    vocab = {}
    for token, count in counts.items():
        if count >= min_freq:
            vocab[token] = count
    return vocab


def tokenize_function(examples):
    """Tokenize the ``"text"`` field of a batch with the notebook-level ``tokenizer``.

    Sequences are truncated and padded to exactly 128 tokens, and the encoding
    is requested as PyTorch tensors.
    """
    encode_options = dict(
        truncation=True,
        padding="max_length",
        max_length=128,
        return_tensors="pt",
    )
    return tokenizer(examples["text"], **encode_options)

# **Prediction functions**

In [None]:
def normalize_hamza(word: str) -> str:
    """Map hamza-carrying letters to bare forms (أ/إ→ا, ؤ→و, ئ→ي) and drop a standalone ء."""
    substitutions = (("أ", "ا"), ("إ", "ا"), ("ؤ", "و"), ("ئ", "ي"), ("ء", ""))
    for hamza_form, plain_form in substitutions:
        word = word.replace(hamza_form, plain_form)
    return word

def find_misspellings(text: str, vocab: dict, threshold: float = 0.28) -> list:
    """Return the positions of the words in ``text`` that look misspelled.

    A word is examined only when neither it nor its hamza-normalized form is
    in ``vocab``. Each such word is replaced by the mask token, the masked
    sentence is scored with the notebook-level masked-LM ``model``, and the
    probabilities assigned at the mask position to the word's own token ids
    are averaged. Positions whose average is below ``threshold`` are returned
    in sentence order.
    """
    tokens = text.split()
    flagged = []

    for position, candidate in enumerate(tokens):
        # Words known verbatim or after hamza normalization are trusted as-is.
        if candidate in vocab or normalize_hamza(candidate) in vocab:
            continue

        masked_sentence = " ".join(
            tokens[:position] + [tokenizer.mask_token] + tokens[position + 1:]
        )
        encoded = tokenizer(masked_sentence, return_tensors="pt").to(device)
        mask_positions = torch.where(encoded["input_ids"] == tokenizer.mask_token_id)[1]

        with torch.no_grad():
            mask_logits = model(**encoded).logits[0, mask_positions]
            distribution = torch.softmax(mask_logits, dim=-1).squeeze()
            piece_ids = tokenizer.encode(candidate, add_special_tokens=False)
            # A word that produces no token ids is scored 0.
            score = torch.mean(distribution[piece_ids]) if piece_ids else 0

        if score < threshold:
            flagged.append(position)

    return flagged


def generate_masked_sentences(text: str, misspelled_indices: list) -> list:
    """Build one copy of ``text`` per index, with the word at that index replaced by the mask token."""
    tokens = text.split()
    masked_variants = []
    for position in misspelled_indices:
        pieces = tokens[:position]
        pieces.append(tokenizer.mask_token)
        pieces.extend(tokens[position + 1:])
        masked_variants.append(" ".join(pieces))
    return masked_variants


from Levenshtein import distance as levenshtein_distance

def predict(masked_sentence: str, top_k=25) -> list:
    """Return Arabic candidates for the mask token in ``masked_sentence``.

    The notebook-level masked-LM scores the sentence and the ``top_k`` highest
    probability vocabulary entries at the mask position are decoded. Only
    entries made solely of two or more characters from U+0600–U+06FF are kept,
    in the order ``torch.topk`` returned them.
    """
    encoded = tokenizer(masked_sentence, return_tensors="pt").to(device)
    mask_positions = torch.where(encoded["input_ids"] == tokenizer.mask_token_id)[1]

    with torch.no_grad():
        mask_logits = model(**encoded).logits[0, mask_positions]

    distribution = torch.softmax(mask_logits, dim=-1).squeeze()
    best = torch.topk(distribution, top_k)

    # Make sure the candidate is Arabic and has a reasonable length.
    arabic_word = re.compile(r'^[\u0600-\u06FF]{2,}$')
    decoded = (tokenizer.decode([token_id]).strip() for token_id in best.indices)
    return [token for token in decoded if arabic_word.match(token)]

In [None]:
# Smoke test: each pair is (sentence with one [MASK], word expected among the predictions).
test_cases = [
    ("وززارة [MASK] والتعليم", "التربية"),
    ("يوم [MASK]", "السبت"),
    ("الطقس اليوم [MASK]", "حر"),
    ("اللغة [MASK] صعبة", "العربية"),
    ("ذهب محمد إلى [MASK]", "المدرسة"),
    ("أنا أحب [MASK] في المساء", "اللعب"),
    ("السيارة [MASK] في الطريق", "سريعة"),
    ("كرة [MASK] هي المفضلة لدي", "القدم")
]

for masked_text, target_word in test_cases:
    suggestions = predict(masked_text)
    found = target_word in suggestions
    print(f"Model {'✅' if found else '❌'} {masked_text} → {suggestions} (Expected: {target_word})")


Model ✅ وززارة [MASK] والتعليم → ['التربية', 'التربيه', 'للتربية', 'الصحه', 'الصحة', 'والتربية', 'النقل', 'بالتربية', 'طيب', 'بالصحه', 'تربيه', 'والصحه', 'تربية', 'انا', 'طب', 'المعلم', 'الانتساب', 'المدرسه', 'المعلمين', 'التعليم', 'التعلم', 'المناهج', 'قياس', 'بالصحة'] (Expected: التربية)
Model ✅ يوم [MASK] → ['جميل', 'الجمعه', 'ميلادي', 'الخميس', 'مميز', 'الجمعة', 'حلو', 'لطيف', 'الاحد', 'سعيد', 'جمييل', 'جديد', 'السبت', 'التلات', 'عظيم', 'المعلم', 'الثلاثاء', 'الاربعاء', 'تاريخي', 'العظماء', 'حافل', 'عالمي', 'خميس', 'العلم'] (Expected: السبت)
Model ✅ الطقس اليوم [MASK] → ['جميل', 'حلو', 'بارد', 'رايع', 'لطيف', 'حار', 'خرافي', 'غايم', 'مختلف', 'امطار', 'رووعه', 'هه', 'ربيعي', 'حر', 'غبار', 'حلوو', 'جمييل', 'ممتاز', 'روعه', 'ضباب', 'رووعة', 'ماطر'] (Expected: حر)
Model ✅ اللغة [MASK] صعبة → ['العربية', 'الانجليزية', 'الفرنسية', 'الالمانية', 'صارت', 'الاسبانية', 'الصينية', 'الفارسية', 'التركية', 'العربيه', 'الايطالية', 'جدا', 'الروسية', 'بقت', 'مرة', 'الفصحى', 'عندك', 'العالمية', 'دي',

# **Pipeline function**

In [None]:
def pipeline(input_text: str, vocab: dict, verbose: bool = True) -> str:
    """Detect and correct misspelled words in ``input_text``.

    Args:
        input_text: Raw sentence; it is normalized with ``preprocess`` first.
        vocab: Known-word lookup (word -> frequency) as built by ``data_vocab``.
            For backward compatibility a pandas DataFrame with a ``text``
            column is also accepted and converted with
            ``data_vocab(vocab, min_freq=3)``.
        verbose: When True, print the corrections that were applied, or a
            notice that nothing needed correcting.

    Returns:
        The preprocessed sentence in which every flagged word is replaced by
        the model candidate with the smallest Levenshtein distance to it.
    """
    processed_text = preprocess(input_text)

    # Honour the vocabulary supplied by the caller. The previous version
    # ignored the argument and rebuilt the vocabulary from the global `df` on
    # every call, which was slow and tied the function to hidden notebook state.
    if isinstance(vocab, pd.DataFrame):
        vocab = data_vocab(vocab, min_freq=3)

    misspelled_indices = find_misspellings(processed_text, vocab)

    if not misspelled_indices:
        if verbose:
            print("✅ لا توجد أخطاء إملائية واضحة.")
        return processed_text

    masked_sentences = generate_masked_sentences(processed_text, misspelled_indices)
    words = processed_text.split()
    corrections = {}

    for idx, masked in zip(misspelled_indices, masked_sentences):
        original_word = words[idx]
        candidates = predict(masked)
        if not candidates:
            continue  # no usable prediction: leave the word untouched
        best_candidate = min(candidates, key=lambda c: levenshtein_distance(c, original_word))
        # Only record real changes so the report never lists "X ➤ X".
        if best_candidate != original_word:
            corrections[original_word] = best_candidate
            words[idx] = best_candidate

    corrected_sentence = " ".join(words)

    if verbose:
        if corrections:
            print("🔍 الكلمات التي تم تصحيحها:")
            for original, corrected in corrections.items():
                print(f" - {original} ➤ {corrected}")
        else:
            print("✅ لا توجد أخطاء إملائية واضحة.")

    return corrected_sentence

In [None]:
import pandas as pd

# Load the corpus from Google Drive. The file has a .txt extension but is
# parsed with pandas' CSV reader.
df = pd.read_csv('/content/drive/MyDrive/arabic_dataset_classifiction.txt', encoding='utf-8')

# Rename the columns to the names the rest of the notebook uses.
# NOTE(review): this assumes the file has exactly two columns (text, label) — confirm.
df.columns = ['text', 'target']

print(df.head())


                                                text  target
0  بين أستوديوهات ورزازات وصحراء مرزوكة وآثار ولي...       0
1  قررت النجمة الأمريكية أوبرا وينفري ألا يقتصر ع...       0
2  أخبارنا المغربية الوزاني تصوير الشملالي ألهب ا...       0
3  اخبارنا المغربية قال ابراهيم الراشدي محامي سعد...       0
4  تزال صناعة الجلود في المغرب تتبع الطريقة التقل...       0


In [None]:
# Clean dataset
# Drop the label column before removing missing/duplicate rows. The column name
# was previously misspelled ('targe'), and with errors='ignore' the drop did
# nothing, so rows with the same text but different labels survived.
# errors='ignore' is kept so the cell can be re-run safely.
df = df.drop(columns=['target'], errors='ignore').dropna().drop_duplicates()
df['text'] = df['text'].apply(preprocess)
# Keep only sentences with more than 5 words; shorter ones become None and are dropped.
df['text'] = df['text'].apply(lambda x: x if len(x.split()) > 5 else None)
df = df.dropna().reset_index(drop=True)

In [None]:
# Create vocabulary: word -> count, using data_vocab's default frequency cut-off
words_freq = data_vocab(df)

In [None]:
# Prepare Hugging Face dataset
dataset = Dataset.from_pandas(df[:10000])  # Use smaller subset for training
dataset = dataset.map(tokenize_function, batched=True, remove_columns=['text'])
# A fixed seed makes the shuffled 80/20 split identical on every Restart & Run All.
dataset = dataset.train_test_split(test_size=0.2, shuffle=True, seed=42)

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [None]:
# Training configuration
# NOTE(review): evaluation and checkpointing happen every 500 steps; with a
# small training subset the run may end before the first one — confirm intended.
training_args = TrainingArguments(
    output_dir="./model",
    eval_strategy="steps",
    eval_steps=500,
    save_steps=500,
    save_strategy="steps",
    learning_rate=5e-5,
    num_train_epochs=3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=2,
    # Mixed precision needs a CUDA device. The notebook falls back to CPU when
    # none is present, so fp16 is enabled only when CUDA is available instead
    # of failing on a CPU runtime.
    fp16=torch.cuda.is_available(),
    logging_steps=100,
    optim="adamw_torch"
)

# Collator that masks 15% of the tokens for the masked-language-modelling objective.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm_probability=0.15
)

In [None]:
# Optional: Use a smaller subset for faster testing
# The subset sizes are capped at the number of rows available, so a small
# corpus no longer makes select() fail with an out-of-range index.
train_subset = dataset["train"].select(range(min(1000, len(dataset["train"]))))
test_subset = dataset["test"].select(range(min(200, len(dataset["test"]))))

# NOTE(review): newer transformers releases appear to warn about `tokenizer=`
# in favour of `processing_class=` — confirm against the installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_subset,
    eval_dataset=test_subset,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

  trainer = Trainer(


In [None]:
# Train and save model
trainer.train()
# Trainer puts the model in training mode (dropout active). Switch back to
# inference mode so the prediction cells that follow are deterministic.
model.eval()
trainer.save_model("/content/my_model")
tokenizer.save_pretrained("/content/my_model")

[34m[1mwandb[0m: Currently logged in as: [33myahiahanii45[0m ([33myahiahanii45-helwan-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
# Sentences with deliberate spelling mistakes, used to exercise the pipeline.
input_texts = [
    "وززارة النربية والتعليم تعلن عن تعطبل الدراسة رسميا يوم السسبت نظرا للظروف الجوية الحالييه وحفاظا على سلامة المعلمون و الططلاب",
    "بعد أن قرر القضاء الفنسي الإبقاء على المغني المغربي سعد لمجرد الذي قبض عليه يوم الحميس بتهمتي القتتل و التحرش",
    "قال الفنان المغربي محمد الخياري نحن احرار الامه الاسلاميه و لن نسمح بحدوث حربب في الوطن",
    "الطلاب يذههبون الى المدرسه في الصباح الباككر لتلقي الدروس",
    "وصل الريس الى القاهر صباح اليوم لعقد اجتماع مهم مع الوزراء",
    "يعاني المزارعون من مشاكل في توفر المايه للري في فصل الصيف",
    "اصدر القاضي حكمه النهائي بعد مداوله طويله بين اعضاء المحكمه",
    "المنتخب الوطني يخض مباراة مهمه في التصفيات المؤهله لكأس العاللم",
]

separator = '-' * 20
for raw_sentence in input_texts:
    # words_freq is the vocabulary handed to the pipeline.
    fixed_sentence = pipeline(raw_sentence, words_freq)
    print('Incorrect Sentence:', raw_sentence)
    print('Corrected Sentence:', fixed_sentence)
    print(separator)

🔍 الكلمات التي تم تصحيحها:
 - وززارة ➤ وزارة
 - النربية ➤ التربية
 - تعطبل ➤ تعطيل
 - السسبت ➤ السبت
 - الحالييه ➤ الحاليه
 - المعلمون ➤ المعلمين
 - الططلاب ➤ الطلاب
Incorrect Sentence: وززارة النربية والتعليم تعلن عن تعطبل الدراسة رسميا يوم السسبت نظرا للظروف الجوية الحالييه وحفاظا على سلامة المعلمون و الططلاب
Corrected Sentence: وزارة التربية والتعليم تعلن عن تعطيل الدراسة رسميا يوم السبت نظرا للظروف الجوية الحاليه وحفاظا على سلامة المعلمين و الطلاب
--------------------
🔍 الكلمات التي تم تصحيحها:
 - الفنسي ➤ الفرنسي
 - الحميس ➤ الخميس
 - القتتل ➤ القتل
Incorrect Sentence: بعد أن قرر القضاء الفنسي الإبقاء على المغني المغربي سعد لمجرد الذي قبض عليه يوم الحميس بتهمتي القتتل و التحرش
Corrected Sentence: بعد ان قرر القضاء الفرنسي الابقاء على المغني المغربي سعد لمجرد الذي قبض عليه يوم الخميس بتهمتي القتل و التحرش
--------------------
🔍 الكلمات التي تم تصحيحها:
 - الاسلاميه ➤ الاسلاميه
 - حربب ➤ خراب
Incorrect Sentence: قال الفنان المغربي محمد الخياري نحن احرار الامه الاسلاميه و لن نسمح بحد

In [None]:
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
import numpy as np

# Define test dataset with ground-truth corrections.
# Row i pairs the noisy sentence input_texts[i] with its reference correction.
# NOTE(review): the first reference keeps the word form "المعلمون" unchanged
# from the input — confirm that this is the intended target.
test_df = pd.DataFrame({
    'incorrect': input_texts,
    'correct': [
        "وزارة التربية والتعليم تعلن عن تعطيل الدراسة رسميا يوم السبت نظرا للظروف الجوية الحالية وحفاظا على سلامة المعلمون و الطلاب",
        "بعد ان قرر القضاء الفرنسي الابقاء على المغني المغربي سعد لمجرد الذي قبض عليه يوم الخميس بتهمتي القتل و التحرش",
        "قال الفنان المغربي محمد الخياري نحن احرار الامه الاسلامية و لن نسمح بحدوث حرب في الوطن",
        "الطلاب يذهبون الى المدرسة في الصباح الباكر لتلقي الدروس",
        "وصل الريس الى القاهرة صباح اليوم لعقد اجتماع مهم مع الوزراء",
        "يعاني المزارعون من مشاكل في توفر الماء للري في فصل الصيف",
        "اصدر القاضي حكمه النهائي بعد مداولة طويلة بين اعضاء المحكمة",
        "المنتخب الوطني يخوض مباراة مهمة في التصفيات المؤهلة لكاس العالم"
    ]
})

def evaluate_pipeline(test_df, pipeline_func, df):
    """Score a correction pipeline against reference sentences.

    Args:
        test_df: DataFrame with an ``incorrect`` column (noisy input) and a
            ``correct`` column (reference sentence), one pair per row.
        pipeline_func: Callable invoked as
            ``pipeline_func(text, vocab, verbose=False)``; it must return the
            corrected sentence as a string.
        df: Either the cleaned corpus DataFrame (its vocabulary is built once
            with ``data_vocab``) or an already-built vocabulary dict.

    Returns:
        dict with ``word_accuracy``, ``word_f1``, ``confusion_matrix``,
        ``sentence_accuracy`` and the ``y_true_words`` / ``y_pred_words``
        lists the word-level metrics were computed on.
    """
    # Build the vocabulary once and pass that to the pipeline. Previously the
    # DataFrame itself was passed where a word -> frequency dict is expected.
    vocab = df if isinstance(df, dict) else data_vocab(df)

    y_true_words = []
    y_pred_words = []
    sentence_correct = []

    for _, row in test_df.iterrows():
        incorrect = row['incorrect']
        true_sentence = row['correct']

        # Run pipeline
        pred_sentence = pipeline_func(incorrect, vocab, verbose=False)

        true_words = true_sentence.split()
        pred_words = pred_sentence.split()
        # The pipeline output is derived from the preprocessed input, so the
        # input is normalized the same way before aligning words. Otherwise a
        # pure normalization difference (e.g. "أن" vs "ان") is counted as a
        # misspelling and inflates the word-level scores.
        incorrect_words = preprocess(incorrect).split()

        # zip() stops at the shortest list, matching the earlier min-length truncation.
        for true_word, pred_word, noisy_word in zip(true_words, pred_words, incorrect_words):
            if noisy_word != true_word:  # Misspelled word
                y_true_words.append(true_word)
                y_pred_words.append(pred_word)

        # Sentence-level accuracy
        sentence_correct.append(true_sentence == pred_sentence)

    # Calculate metrics
    word_accuracy = accuracy_score(y_true_words, y_pred_words)
    word_f1 = f1_score(y_true_words, y_pred_words, average='weighted', zero_division=0)
    conf_matrix = confusion_matrix(y_true_words, y_pred_words)
    sentence_accuracy = np.mean(sentence_correct)

    return {
        'word_accuracy': word_accuracy,
        'word_f1': word_f1,
        'confusion_matrix': conf_matrix,
        'sentence_accuracy': sentence_accuracy,
        'y_true_words': y_true_words,
        'y_pred_words': y_pred_words
    }

In [None]:
# Evaluation Section
print("\n=== Evaluation Section ===")

# Score the pipeline on the hand-written test sentences.
results = evaluate_pipeline(test_df, pipeline, df)

# Aggregate metrics first, then every reference/prediction pair that disagrees.
print("Evaluation Results:")
print(f"Word-Level Accuracy: {results['word_accuracy']:.4f}")
print(f"Word-Level F1 Score: {results['word_f1']:.4f}")
print(f"Sentence-Level Accuracy: {results['sentence_accuracy']:.4f}")
print("Confusion Matrix (Word-Level):")
print(results['confusion_matrix'])
print("\nDetailed Errors:")
word_pairs = zip(results['y_true_words'], results['y_pred_words'])
for expected_word, predicted_word in word_pairs:
    if expected_word == predicted_word:
        continue
    print(f"True: {expected_word}, Predicted: {predicted_word}")


=== Evaluation Section ===
Evaluation Results:
Word-Level Accuracy: 0.5769
Word-Level F1 Score: 0.5769
Sentence-Level Accuracy: 0.2500
Confusion Matrix (Word-Level):
[[1 0 0 ... 0 0 0]
 [0 0 1 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 1 0 0]
 [0 0 0 ... 0 0 1]]

Detailed Errors:
True: الحالية, Predicted: الحاليه
True: الاسلامية, Predicted: الاسلاميه
True: حرب, Predicted: خراب
True: المدرسة, Predicted: المدرسه
True: الماء, Predicted: المياه
True: مداولة, Predicted: منافسه
True: طويلة, Predicted: جلسه
True: المحكمة, Predicted: المحكمه
True: يخوض, Predicted: يخض
True: مهمة, Predicted: مهمه
True: المؤهلة, Predicted: الموهلة


In [None]:
from google.colab import files

# Zip the saved checkpoint directory and download the archive through the browser.
!zip -r my_model.zip /content/my_model
files.download('my_model.zip')

  adding: content/my_model/ (stored 0%)
  adding: content/my_model/special_tokens_map.json (deflated 14%)
  adding: content/my_model/vocab.txt (deflated 61%)
  adding: content/my_model/tokenizer_config.json (deflated 14%)
  adding: content/my_model/model.safetensors (deflated 14%)
  adding: content/my_model/training_args.bin (deflated 15%)
  adding: content/my_model/config.json (deflated 13%)
  adding: content/my_model/tokenizer.json (deflated 14%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
!pip install gradio transformers torch pandas python-Levenshtein



In [None]:
import gradio as gr
from transformers import AutoTokenizer, AutoModelForMaskedLM
import torch
import pandas as pd
import re
from collections import Counter
from Levenshtein import distance as levenshtein_distance

# ==== Load model ====
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# NOTE(review): model_path is assigned but never used below — the demo loads
# "asafaya/bert-base-arabic" rather than the checkpoint saved under
# /content/my_model. Confirm which model the demo is meant to serve.
model_path = "/content/my_model"

tokenizer = AutoTokenizer.from_pretrained("asafaya/bert-base-arabic")
model = AutoModelForMaskedLM.from_pretrained("asafaya/bert-base-arabic").to(device)
model.eval()  # inference mode

# Read the .txt file as if it's a CSV
df = pd.read_csv('/content/drive/MyDrive/arabic_dataset_classifiction.txt', encoding='utf-8')

# Rename the columns to the names the rest of the cell uses.
# NOTE(review): this assumes the file has exactly two columns — confirm.
df.columns = ['text', 'target']


def preprocess(sentence: str) -> str:
    """Normalize raw text into space-separated tokens from the Arabic Unicode block.

    Alef variants (أ, إ, آ) become a bare alef, every character that is neither
    whitespace nor inside U+0600–U+06FF is removed, and whitespace runs are
    collapsed to single spaces with both ends trimmed.
    """
    text = sentence
    for alef_variant in ("أ", "إ", "آ"):
        text = text.replace(alef_variant, "ا")
    arabic_only = re.sub(r"[^\u0600-\u06FF\s]", "", text)
    return re.sub(r"\s+", " ", arabic_only).strip()

def data_vocab(dataframe, min_freq=3):
    """Return {token: count} for tokens of ``dataframe["text"]`` seen at least ``min_freq`` times."""
    counts = Counter(
        token
        for text in dataframe["text"]
        for token in text.split()
    )
    vocab = {}
    for token, count in counts.items():
        if count >= min_freq:
            vocab[token] = count
    return vocab

def normalize_hamza(word: str) -> str:
    """Map hamza-carrying letters to bare forms (أ/إ→ا, ؤ→و, ئ→ي) and drop a standalone ء."""
    substitutions = (("أ", "ا"), ("إ", "ا"), ("ؤ", "و"), ("ئ", "ي"), ("ء", ""))
    normalized = word
    for hamza_form, plain_form in substitutions:
        normalized = normalized.replace(hamza_form, plain_form)
    return normalized

def find_misspellings(text: str, vocab: dict, threshold: float = 0.28) -> list:
    """Return the positions of the words in ``text`` that look misspelled.

    Only words absent from ``vocab`` (both verbatim and after hamza
    normalization) are examined. Each one is masked, the sentence is scored by
    the masked-LM, and the mean probability of the word's own token ids at the
    mask position is compared with ``threshold``; lower means flagged.
    """
    tokens = text.split()
    flagged = []
    for position, candidate in enumerate(tokens):
        # Words known verbatim or after hamza normalization are trusted as-is.
        if candidate in vocab or normalize_hamza(candidate) in vocab:
            continue
        masked_sentence = " ".join(
            tokens[:position] + [tokenizer.mask_token] + tokens[position + 1:]
        )
        encoded = tokenizer(masked_sentence, return_tensors="pt").to(device)
        mask_positions = torch.where(encoded["input_ids"] == tokenizer.mask_token_id)[1]
        with torch.no_grad():
            mask_logits = model(**encoded).logits[0, mask_positions]
            distribution = torch.softmax(mask_logits, dim=-1).squeeze()
            piece_ids = tokenizer.encode(candidate, add_special_tokens=False)
            # A word that produces no token ids is scored 0.
            score = torch.mean(distribution[piece_ids]) if piece_ids else 0
        if score < threshold:
            flagged.append(position)
    return flagged

def generate_masked_sentences(text: str, misspelled_indices: list) -> list:
    """Build one copy of ``text`` per index, with the word at that index replaced by the mask token."""
    tokens = text.split()
    masked_variants = []
    for position in misspelled_indices:
        pieces = tokens[:position]
        pieces.append(tokenizer.mask_token)
        pieces.extend(tokens[position + 1:])
        masked_variants.append(" ".join(pieces))
    return masked_variants

def predict(masked_sentence: str, top_k=25) -> list:
    """Return Arabic candidates for the mask token in ``masked_sentence``.

    The ``top_k`` highest-probability vocabulary entries at the mask position
    are decoded. Only those consisting solely of two or more characters from
    U+0600–U+06FF are returned, in the order ``torch.topk`` produced them.
    """
    encoded = tokenizer(masked_sentence, return_tensors="pt").to(device)
    mask_positions = torch.where(encoded["input_ids"] == tokenizer.mask_token_id)[1]
    with torch.no_grad():
        mask_logits = model(**encoded).logits[0, mask_positions]
    distribution = torch.softmax(mask_logits, dim=-1).squeeze()
    best = torch.topk(distribution, top_k)

    arabic_word = re.compile(r"^[\u0600-\u06FF]{2,}$")
    decoded = (tokenizer.decode([token_id]).strip() for token_id in best.indices)
    return [token for token in decoded if arabic_word.match(token)]

def pipeline(input_text: str, vocab: dict) -> str:
    """Correct ``input_text`` and return a human-readable report for the UI.

    Args:
        input_text: Raw text from the user; it is normalized with ``preprocess``.
        vocab: Known-word lookup (word -> frequency) built by ``data_vocab``.

    Returns:
        A message stating that no errors were found (with the preprocessed
        text), or a before/after report followed by the corrections applied.
    """
    processed_text = preprocess(input_text)
    misspelled_indices = find_misspellings(processed_text, vocab)

    no_errors_message = f"✅ لا توجد أخطاء إملائية:\n\n{processed_text}"
    if not misspelled_indices:
        return no_errors_message

    masked_sentences = generate_masked_sentences(processed_text, misspelled_indices)
    words = processed_text.split()
    corrections = {}

    for idx, masked in zip(misspelled_indices, masked_sentences):
        original_word = words[idx]
        candidates = predict(masked)
        if not candidates:
            continue  # no usable prediction: leave the word untouched
        best_candidate = min(candidates, key=lambda c: levenshtein_distance(c, original_word))
        # Only record real changes so the report never lists "X ➤ X".
        if best_candidate != original_word:
            corrections[original_word] = best_candidate
            words[idx] = best_candidate

    # Every flagged word mapped back to itself: nothing was corrected.
    if not corrections:
        return no_errors_message

    corrected_sentence = " ".join(words)
    corrections_text = "🔍 الكلمات التي تم تصحيحها:\n"
    for original, corrected_word in corrections.items():
        corrections_text += f" - {original} ➤ {corrected_word}\n"

    return f"❌ قبل التصحيح:\n{input_text}\n\n✅ بعد التصحيح:\n{corrected_sentence}\n\n{corrections_text}"

# Prepare vocab
# Drop the label column before removing missing/duplicate rows. The column name
# was previously misspelled ("targe"), and with errors="ignore" the drop did
# nothing. errors="ignore" is kept so the cell can be re-run safely.
df = df.drop(columns=["target"], errors="ignore").dropna().drop_duplicates()
df["text"] = df["text"].apply(preprocess)
# Keep only sentences with more than 5 words; shorter ones become None and are dropped.
df["text"] = df["text"].apply(lambda x: x if len(x.split()) > 5 else None)
df = df.dropna().reset_index(drop=True)
words_freq = data_vocab(df)

# ==== Gradio Interface ====
# One text box in, one text box out; each submission goes through pipeline()
# with the vocabulary built above.
demo = gr.Interface(
    fn=lambda x: pipeline(x, words_freq),
    inputs=gr.Textbox(lines=5, label="أدخل النص"),
    outputs=gr.Textbox(label="النص المصحح"),
    title="تصحيح الأخطاء الإملائية للنصوص العربية",
    description="أدخل جملة باللغة العربية وسنقوم بمحاولة تصحيح الأخطاء الإملائية."
)
demo.launch()


Some weights of the model checkpoint at asafaya/bert-base-arabic were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://10a41491113f6caf97.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


